Skip to content

Commit

Permalink
Merge pull request #1 from drammock/stevecantmerge
Browse files Browse the repository at this point in the history
Stevecantmerge
  • Loading branch information
bambooforest committed Sep 11, 2014
2 parents 0241059 + a4efaa6 commit 513cd21
Show file tree
Hide file tree
Showing 4 changed files with 301 additions and 100 deletions.
30 changes: 14 additions & 16 deletions code/aggregation/aggregate-raw-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ uw.path <- paste(datadir, "UW", "phoible_inventories.tsv", sep="/")
aa.path <- paste(datadir, "AA", "AA_inventories.tsv", sep="/")
spa.path <- paste(datadir, "SPA", "SPA_Phones.tsv", sep="/")
spa.ipa.path <- paste(datadir, "SPA", "SPA_IPA_correspondences.tsv", sep="/")
spa.iso.path <- paste(datadir, "SPA", "SPA_LangNamesCodes.tsv", sep="/")
upsid.segments.path <- paste(datadir, "UPSID", "UPSID_Segments.tsv", sep="/")
upsid.character.codes.path <- paste(datadir, "UPSID", "UPSID_CharCodes.tsv", sep="/")
upsid.languages.path <- paste(datadir, "UPSID", "UPSID_Languages.tsv", sep="/")
Expand Down Expand Up @@ -113,6 +114,7 @@ aa.data$source <- "aa"
# NOTE: SPA data itself has no ISO codes; they are merged in below from SPA_LangNamesCodes.tsv (spa.iso). TODO: there is still no mapping keyed on SPA language numbers.
# TODO: the SPA "Notes" column just has numeric codes in it. where is the key?
spa.ipa <- read.delim(spa.ipa.path, na.strings="", stringsAsFactors=FALSE, quote="")
spa.iso <- read.delim(spa.iso.path, na.strings="", stringsAsFactors=FALSE, quote="")
spa.raw <- read.delim(spa.path, na.strings="", stringsAsFactors=FALSE, quote="")
spa.raw$spaLangNum <- na.locf(spa.raw$spaLangNum)
spa.split <- split(spa.raw, spa.raw$spaLangNum)
Expand All @@ -125,8 +127,9 @@ spa.data$spaAllophoneDescription <- gsub("[", "", gsub("]", "",
spa.data$Allophones <- spa.ipa$Phoneme[match(spa.data$spaAllophoneDescription,
spa.ipa$spaDescription)]
spa.data <- collapseAllophones(spa.data, "LanguageName")
spa.data <- merge(spa.data, spa.iso)
spa.data$source <- "spa"
rm(spa.raw, spa.split, spa.ipa)
rm(spa.raw, spa.split, spa.ipa, spa.iso)

# UPSID
upsid.ipa <- read.delim(upsid.ipa.path, na.strings="",
Expand Down Expand Up @@ -175,22 +178,16 @@ all.data$Phoneme <- gsub("ç", "ç", all.data$Phoneme, fixed=TRUE)
# REMOVE ALL TIEBARS
all.data$Phoneme <- gsub("͡", "", all.data$Phoneme, fixed=TRUE)
all.data$Phoneme <- gsub("͜", "", all.data$Phoneme, fixed=TRUE)
# FIX SOME NORMALIZATION ORDER ISSUES
all.data$Phoneme <- gsub("æ̞̃", "æ̞̃", all.data$Phoneme, fixed=TRUE)
all.data$Phoneme <- gsub("̰̃", "̰̃", all.data$Phoneme, fixed=TRUE)
#all.data$Phoneme <- gsub("ḭ̃", "ḭ̃", all.data$Phoneme, fixed=TRUE)
#all.data$Phoneme <- gsub("ṵ̃", "ṵ̃", all.data$Phoneme, fixed=TRUE)


# CTOR AFTER SUBSTITUTIONS
# FACTOR AFTER SUBSTITUTIONS
all.data$Phoneme <- factor(all.data$Phoneme)


foo TE ISO CODES
# VALIDATE ISO CODES



# LOAD THE FEATURE AND IMPLEMENT THE RULES
# LOAD THE FEATURES AND IMPLEMENT THE RULES
feats <- read.delim(features.path, sep='\t', stringsAsFactors=TRUE)
feat.columns <- c("tone", "stress", "syllabic", "short", "long",
"consonantal", "sonorant", "continuant",
Expand Down Expand Up @@ -218,12 +215,13 @@ upsid.feats <- do.call(rbind, lapply(upsid.disjuncts, function(i) {
}))
all.data[upsid.disjunct.indices, feat.columns] <- upsid.feats[feat.columns]

# TODO: still a couple dozen unique phonemes without features; many are c-cedillas
foo <- all.data[is.na(all.data$syllabic),]
sink("/media/dan/data/Desktop/featurelessPhonemes.tsv")
cat(paste(unique(foo$Phoneme), collapse="\n"))
sink()

# TEMPORARY CODE FOR DEBUGGING
# still a few phonemes without features
missing.feats <- all.data[is.na(all.data$syllabic),]
#sink("~/Desktop/featurelessPhonemes.tsv")
cat(paste(c("phonemes without feature vectors:",
unique(as.character(missing.feats$Phoneme))), collapse="\n"))
#sink()

# TRUMP ORDERING: more preferred data sources come earlier in the list
trump.order <- c("uw", "spa", "aa", "upsid", "ramaswami") # "casl", "saphon"
Expand Down
5 changes: 5 additions & 0 deletions data/FEATURES/phoible-segments-features.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2176,3 +2176,8 @@ ẽ̞ 0 - + - - - + + 0 + - - + - - 0 0 - 0 0 0 + - - + - + - - + - - - 0 - - 0
ẽ̞ĩ 0 - + - - - + + 0 + - - + - - 0 0 - 0 0 0 + -,+ - + - + - - + - - - 0 - - 0
ɨ̞̃ 0 - + - - - + + 0 + - - + - - 0 0 - 0 0 0 + + - - - + - - + - - - 0 - - 0
ãõ̞ˤ 0 - + - - - + + 0 + - - + - -,+ + - - 0 0 0 + - +,- - -,+ + + - + - - - 0 - - 0
ḭ̃ 0 - + - - - + + 0 + - - + - - 0 0 - 0 0 0 + + - + - + - - + - - + 0 - - 0
ṵ̃ 0 - + - - - + + 0 + - - + - + + - - 0 0 0 + + - - + + - - + - - + 0 - - 0
ẽ̞ũ 0 - + - - - + + 0 + - - + - -,+ + - - 0 0 0 + -,+ - +,- -,+ + - - + - - - 0 - - 0
ĩẽ̞ 0 - + - - - + + 0 + - - + - - 0 0 - 0 0 0 + +,- - + - + - - + - - - 0 - - 0
ã̰ 0 - + - - - + + 0 + - - + - - 0 0 - 0 0 0 + - + - - 0 - - + - - + 0 - - 0
198 changes: 198 additions & 0 deletions data/SPA/SPA_LangNamesCodes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
LanguageCode LanguageName
ain Ainu
aka Akan
akz Alabama
ale Aleut
alh Alawa
amc Amahuaca
ame Amuesha
amh Amharic
anc Angas
ant Western Desert
apn Apinaye
arn Araucanian
ary Moroccan Arabic
arz Egyptian Arabic
auy Auyana
awn Awiya
jqr Aymara
azj Azerbaijani
adz Adzera
ben Bengali
beq Beembe
bre Breton
bsk Burushaski
bao Barasano
bul Bulgarian
bbc Batak
cao Chacobo
car Carib
cbi Cayapa
cha Chamorro
chf Chontal
chp Chipewyan
chv Chuvash
cja Cham
ckt Chukchi
cmn Mandarin Chinese
cni Campa
crb Island Carib
dag Dagbani
lkt Dakota
dap Dafla
deu German
dih Digueno
ell Modern Greek
eng English
eus Basque
eve Even
ewe Ewe
fia Mahas-Fiyadikka
fin Finnish
fra French
gaa Ga
gaj Gadsup
gbp Gbeya
gle Irish Gaelic
grt Garo
guc Goajiro
gug Guarani
hak Hakka
hau Hausa
haw Hawaiian
hdn Haida
heb Modern Hebrew
hin Hindi-Urdu
hop Hopi
jiv Jivaro
hun Hungarian
hup Hupa
hye Armenian
iai Iai
ibo Igbo
ign Moxo
irk Iraqw
isl Icelandic
ito Itonama
ium Yao
jav Javanese
jpn Japanese
kal Inuit
kas Kashimiri
kat Georgian
kbd Kabardian
kca Ostyak
ket Ket
kfe Kota
kha Khasi
khk Khalkha
khl Kaliai
khm Cambodian
khr Kharia
kir Kirghiz
kjn Kunjen
kmo Washkuk
knc Kanuri
kpv Komi
kor Korean
kru Kurukh
ksw Karen
kup Kunimaipa
kwk Kwakiutl
kyh Karok
lbc Lakkia
lbe Lak
lgg Logbara
lhu Lahu
lit Lithuanian
lue Luvale
lui Luiseno
luo Luo
maq Mazateco
mas Maasai
maz Mazahua
mhr Cheremis
mig Mixtec
mlt Maltese
mph Maung
mri Maori
mrt Margi
mya Burmese
naq Nama
nas Nasioi
nav Navaho
nez Nez Perce
niv Gilyak
nmu Maidu
nna Nyangumata
nob Norwegian
noo Nootka
nuy Nunggubuyu
oca Ocaina
ojg Ojibwa
one Oneida
ood Pima
ote Otomi
pan Punjabi
pbb Paez
pcc Yay
pes Persian
plt Malagasy
pom Pomo
por Portuguese
pst Pashto
quh Quechua
rif Shilha
ron Rumanian
rus Russian
sah Yakut
sed Sedang
see Seneca
ses Songhai
set Sentani
sin Sinhalese
snv 'Sa''ban'
som Somali
spa Spanish
spl Selepet
als Albanian
squ Squamish
srq Siriono
str Salish
sun Sundanese
swh Swahili
tay Atayal
tca Ticuna
tel Telugu
tew Tewa
tgl Tagalog
tha Thai
tig Tigre
tix Tiwa
tlf Telefol
tml Asmat
tol Chasta Costa
top Totonac
tsz Tarascan
tun Tunica
tur Turkish
tzh Tzeltal
unm Delaware
unr Mundari
vie Vietnamese
wap Wapishana
wic Wichita
wim Wik-Munkan
wol Wolof
wuu Wu
xpe Kpelle
xtc Katcha
ykg Yukaghir
yrk Yurak
yuc Yuchi
yue Cantonese
zmr Maranungku
zoc Zoque
zsm Malay
zul Zulu
zun Zuni
Loading

0 comments on commit 513cd21

Please sign in to comment.