Skip to content

Commit

Permalink
Lines can be retrieved by addressee. Added MLU.
Browse files Browse the repository at this point in the history
  • Loading branch information
macramole committed Dec 18, 2024
1 parent 5aa472e commit 0a34572
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 44 deletions.
100 changes: 60 additions & 40 deletions ChaFile.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@
LEXICAL_DIVERSITY_MTLD = "mtld"
###############################################

# Linguistic productivity constants.
LINGUISTIC_PRODUCTIVITY_MLU = "mlu"
###############################################

# Language constants
LANGUAGE_SPANISH = "spa"
LANGUAGE_ENGLISH = "eng"
Expand Down Expand Up @@ -275,13 +279,31 @@ def processLines(self):
if TIER_MOR not in l:
l[TIER_MOR] = []

def getLines(self, addressee=ADDRESSEE_ALL):
    """Get the parsed utterances, optionally filtered by addressee.

    Args:
        addressee (str, optional): One of the ADDRESSEE_* constants.
            Defaults to ADDRESSEE_ALL.

    Returns:
        list: A new list of utterances. Access data using the LINE
            constants. The list itself is a fresh object on every call,
            but its elements are the same line objects held in
            self.lines. An unrecognized addressee yields an empty list.
    """
    if addressee == ADDRESSEE_ALL:
        # Shallow copy so callers can sort/slice the result without
        # disturbing the transcript order stored on the instance.
        return self.lines[:]

    if addressee == ADDRESSEE_CHILD_DIRECTED:
        return [
            line for line in self.lines
            if line[LINE_ADDRESSEE] == SPEAKER_TARGET_CHILD
        ]

    if addressee == ADDRESSEE_CHILD_PRODUCED:
        return [
            line for line in self.lines
            if line[LINE_SPEAKER] == SPEAKER_TARGET_CHILD
        ]

    if addressee == ADDRESSEE_OVER_HEARD:
        # Neither addressed to nor produced by the target child.
        return [
            line for line in self.lines
            if line[LINE_ADDRESSEE] != SPEAKER_TARGET_CHILD
            and line[LINE_SPEAKER] != SPEAKER_TARGET_CHILD
        ]

    if addressee == ADDRESSEE_ADULT:
        # Adult-directed speech not produced by the target child. Callers
        # that count by addressee rely on this filter being available here.
        return [
            line for line in self.lines
            if line[LINE_ADDRESSEE] == SPEAKER_ADULT
            and line[LINE_SPEAKER] != SPEAKER_TARGET_CHILD
        ]

    return []

def getLine(self, lineNumber):
"""Get a line by its number
Expand Down Expand Up @@ -500,15 +522,15 @@ def morUnitToWord(self, line, morUnitIndex):

return line[LINE_MOR_TO_WORDS][morUnitIndex]

def countUtterances(self, ignoreEmptyUtterances = True):
def countUtterances(self, addressee=ADDRESSEE_ALL, ignoreEmptyUtterances = True):
"""Returns number of utterances ignoring empty ones based on a word criteria
Returns:
int: Number of utterances in the current transcript
"""

uttCount = 0
for l in self.getLines():
for l in self.getLines(addressee):
if ignoreEmptyUtterances and self.isUtteranceEmpty(l):
continue

Expand Down Expand Up @@ -985,28 +1007,10 @@ def add(l):
else:
c[v] = 1

if addressee == ADDRESSEE_ALL:
for l in self.getLines():
add( l )
elif addressee == ADDRESSEE_CHILD_DIRECTED:
for l in self.getLines():
if l[LINE_ADDRESSEE] == SPEAKER_TARGET_CHILD:
add( l )
elif addressee == ADDRESSEE_CHILD_PRODUCED:
for l in self.getLines():
if l[LINE_SPEAKER] == SPEAKER_TARGET_CHILD:
add( l )
elif addressee == ADDRESSEE_OVER_HEARD:
for l in self.getLines():
if l[LINE_ADDRESSEE] != SPEAKER_TARGET_CHILD and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
add( l )
elif addressee == ADDRESSEE_ADULT:
for l in self.getLines():
if l[LINE_ADDRESSEE] == SPEAKER_ADULT and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
add( l )
lines = self.getLines(addressee)
for l in lines:
add( l )

# print(c)

if countType == COUNT_TYPE_TOKENS:
return sum(c.values())
else:
Expand Down Expand Up @@ -1057,22 +1061,17 @@ def applyMorCriteriaInLine(self, line, criteria, criteriaType = MOR_UNIT_CATEGOR
return self._checkCriteria( line["mor"], criteria, criteriaType )

def getLexicalDiversity(self, addressee=ADDRESSEE_ALL, metric=LEXICAL_DIVERSITY_HDD, extraParam = None):
lines = []
"""Calculate lexical diversity metric
if addressee == ADDRESSEE_ALL:
lines = self.getLines()
elif addressee == ADDRESSEE_CHILD_DIRECTED:
for l in self.getLines():
if l[LINE_ADDRESSEE] == SPEAKER_TARGET_CHILD:
lines.append(l)
elif addressee == ADDRESSEE_CHILD_PRODUCED:
for l in self.getLines():
if l[LINE_SPEAKER] == SPEAKER_TARGET_CHILD:
lines.append(l)
elif addressee == ADDRESSEE_OVER_HEARD:
for l in self.getLines():
if l[LINE_ADDRESSEE] != SPEAKER_TARGET_CHILD and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
lines.append(l)
Args:
addressee (str, optional): Defaults to ADDRESSEE_ALL.
metric (str, optional): LEXICAL_DIVERSITY_HDD, LEXICAL_DIVERSITY_MAAS, LEXICAL_DIVERSITY_MTLD, LEXICAL_DIVERSITY_MATTR, LEXICAL_DIVERSITY_TTR . Defaults to LEXICAL_DIVERSITY_HDD.
extraParam (_type_, optional): LEXICAL_DIVERSITY_MATTR has extra param window_size that defaults to 50. Defaults to None.
Returns:
_type_: _description_
"""
lines = self.getLines(addressee)

tokens = []
for l in lines:
Expand All @@ -1097,6 +1096,27 @@ def getLexicalDiversity(self, addressee=ADDRESSEE_ALL, metric=LEXICAL_DIVERSITY_

return result

def getLinguisticProductivity(self, addressee=ADDRESSEE_ALL, metric=LINGUISTIC_PRODUCTIVITY_MLU):
    """Calculate the MLU (mean length of utterance) over MOR-tier units.

    Note that results can be very different from CLAN.

    Args:
        addressee (str, optional): Which utterances to include, as accepted
            by getLines(). Defaults to ADDRESSEE_ALL.
        metric (str, optional): Defaults to LINGUISTIC_PRODUCTIVITY_MLU.
            MLU is the only metric computed; this value is not inspected.

    Returns:
        tuple: (utterance count, MOR unit count, MLU). Only utterances with
            a non-empty MOR tier are counted. When there are none, the
            result is (0, 0, 0.0) instead of raising ZeroDivisionError.
    """
    # Length of the MOR tier for every utterance that has at least one unit.
    # NOTE: utterances containing "xxx" are not excluded here.
    mor_lengths = [
        len(line[TIER_MOR])
        for line in self.getLines(addressee)
        if len(line[TIER_MOR]) > 0
    ]

    count_utts = len(mor_lengths)
    count_mor = sum(mor_lengths)

    if count_utts == 0:
        # Nothing to average (empty transcript or filter matched nothing).
        return (0, 0, 0.0)

    return (count_utts, count_mor, count_mor / count_utts)

def isUtteranceEmpty(self, line):
"""Returns True if the utterance is empty based on a word criteria
Expand Down
56 changes: 52 additions & 4 deletions test.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,64 @@
from ChaFile import *
from glob import glob
import os

chaPaths = glob("/home/macramole/Code/ciipme/aclew/highvolResults/listos_cha/*.cha")
chaPaths = glob("/home/macramole/Code/isolci/corpus_bebes/longi_audio1/codificados/codificados bullets elan/*.cha")
# chaPaths = glob("/home/macramole/Code/ciipme/aclew/highvolResults/listos_cha/*.cha")
# chaPaths = glob("/home/macramole/Code/ciipme/aclew/highVolFromGit/allCHA/eng/*.cha")

# chaPath = "/home/macramole/Code/ciipme/aclew/highvolResults/listos_cha/donatow-a1-nsm.elan.cha"
# chaPath = "/home/macramole/Code/ciipme/corpusESPON/longi_audio1/codificados/codificados bullets elan/alma-a1-nsb.cha"
# cha = ChaFile( chaPath, language=LANGUAGE_SPANISH )

for c in chaPaths:
cha = ChaFile( c, language=LANGUAGE_SPANISH )
#for c in chaPaths:
#cha = ChaFile( c, language=LANGUAGE_SPANISH )
# cha = ChaFile( c, language=LANGUAGE_ENGLISH )
print(cha.countUtterances(False), " / ", cha.countUtterances(True))
#print(cha.countUtterances(False), " / ", cha.countUtterances(True))

path = "/home/macramole/Code/isolci/corpus_bebes/longi_audio1/codificados/codificados bullets elan/alma-a1-nsb.cha"
cha = ChaFile( path, language=LANGUAGE_SPANISH, onlyCDS=True )
# lines = cha.getLinesBySpeakers()

# for speaker in lines:
# count = 0
# for line in lines[speaker]:
# if len(line[TIER_MOR]) > 0 and not " xxx" in line[LINE_UTTERANCE]:
# # print(line[LINE_UTTERANCE])
# count += 1

# print(f"{speaker}: {count}")

print( cha.getLinguisticProductivity(ADDRESSEE_CHILD_DIRECTED) )

# print()

# for line in cha.getLines():
# if line[LINE_SPEAKER] == "NIN":
# print(line[LINE_UTTERANCE])
# print(line[TIER_MOR])
# print()


# for path in chaPaths:
# cha = ChaFile( path, language=LANGUAGE_SPANISH )
# # print(cha.countUtterances(ADDRESSEE_CHILD_DIRECTED))
# # print(len(cha.getLines(ADDRESSEE_CHILD_DIRECTED)))
# cant_utt = 0
# cant_mor = 0

# for l in cha.getLines(ADDRESSEE_CHILD_DIRECTED):
# # if len(l[TIER_MOR]) > 0:
# # if len(l[TIER_MOR]) == 1:
# # if l[TIER_MOR][0][MOR_UNIT_CATEGORIA] == "co":
# # continue

# cant_utt += 1
# cant_mor += len(l[TIER_MOR])

# # if len(l[TIER_MOR]) == 1:
# # print( l[LINE_UTTERANCE] )
# # print( l[TIER_MOR] )
# # print()

# print(os.path.basename(path))
# print(f"{cant_utt} | {cant_mor} = {cant_mor/cant_utt}")

0 comments on commit 0a34572

Please sign in to comment.