Lines can now be retrieved by addressee. Added MLU.

macramole · Dec 18, 2024 · 0a34572 · 0a34572
1 parent 5aa472e
commit 0a34572
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 44 deletions.
diff --git a/ChaFile.py b/ChaFile.py
@@ -59,6 +59,10 @@
 LEXICAL_DIVERSITY_MTLD = "mtld"
 ###############################################
 
+# Linguistic productivity constants.
+LINGUISTIC_PRODUCTIVITY_MLU = "mlu"
+###############################################
+
 # Language constants
 LANGUAGE_SPANISH = "spa"
 LANGUAGE_ENGLISH = "eng"
@@ -275,13 +279,31 @@ def processLines(self):
 				if TIER_MOR not in l:
 					l[TIER_MOR] = []
 
-	def getLines(self):
+	def getLines(self, addressee=ADDRESSEE_ALL):
 		"""Get an array of parsed utterances
 
+		Args:
+			addressee (str, optional): ADDRESSEE_ALL, ADDRESSEE_CHILD_DIRECTED, ADDRESSEE_CHILD_PRODUCED, ADDRESSEE_OVER_HEARD or ADDRESSEE_ADULT. Defaults to ADDRESSEE_ALL.
+
 		Returns:
-			list: Utterances. Access data using the LINE constants
+			list: New list with the matching utterances (empty if addressee is not one of the values above). Access data using the LINE constants
 		"""
-		return self.lines
+
+		lines = []
+
+		if addressee == ADDRESSEE_ALL:
+			# Shallow copy: the returned list can be modified without touching self.lines
+			lines = self.lines[:]
+		elif addressee == ADDRESSEE_CHILD_DIRECTED:
+			for l in self.lines:
+				if l[LINE_ADDRESSEE] == SPEAKER_TARGET_CHILD:
+					lines.append(l)
+		elif addressee == ADDRESSEE_CHILD_PRODUCED:
+			for l in self.lines:
+				if l[LINE_SPEAKER] == SPEAKER_TARGET_CHILD:
+					lines.append(l)
+		elif addressee == ADDRESSEE_OVER_HEARD:
+			for l in self.lines:
+				if l[LINE_ADDRESSEE] != SPEAKER_TARGET_CHILD and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
+					lines.append(l)
+		elif addressee == ADDRESSEE_ADULT:
+			# Same filter countWords applied on its own before it started
+			# delegating to getLines; without it ADDRESSEE_ADULT selects nothing.
+			for l in self.lines:
+				if l[LINE_ADDRESSEE] == SPEAKER_ADULT and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
+					lines.append(l)
+
+		return lines
 
 	def getLine(self, lineNumber):
 		"""Get a line by its number
@@ -500,15 +522,15 @@ def morUnitToWord(self, line, morUnitIndex):
 
 		return line[LINE_MOR_TO_WORDS][morUnitIndex]
 
-	def countUtterances(self, ignoreEmptyUtterances = True):
+	def countUtterances(self, addressee=ADDRESSEE_ALL, ignoreEmptyUtterances = True):
 		"""Returns number of utterances ignoring empty ones based on a word criteria
 
 		Returns:
 			int: Number of utterances in the current transcript
 		"""
 
 		uttCount = 0
-		for l in self.getLines():
+		for l in self.getLines(addressee):
 			if ignoreEmptyUtterances and self.isUtteranceEmpty(l):
 				continue
 
@@ -985,28 +1007,10 @@ def add(l):
 					else:
 						c[v] = 1
 
-		if addressee == ADDRESSEE_ALL:
-			for l in self.getLines():
-				add( l )
-		elif addressee == ADDRESSEE_CHILD_DIRECTED:
-			for l in self.getLines():
-				if l[LINE_ADDRESSEE] == SPEAKER_TARGET_CHILD:
-					add( l )
-		elif addressee == ADDRESSEE_CHILD_PRODUCED:
-			for l in self.getLines():
-				if l[LINE_SPEAKER] == SPEAKER_TARGET_CHILD:
-					add( l )
-		elif addressee == ADDRESSEE_OVER_HEARD:
-			for l in self.getLines():
-				if l[LINE_ADDRESSEE] != SPEAKER_TARGET_CHILD and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
-					add( l )
-		elif addressee == ADDRESSEE_ADULT:
-			for l in self.getLines():
-				if l[LINE_ADDRESSEE] == SPEAKER_ADULT and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
-					add( l )
+		lines = self.getLines(addressee)
+		for l in lines:
+			add( l )
 
-		# print(c)
-
 		if countType == COUNT_TYPE_TOKENS:
 			return sum(c.values())
 		else:
@@ -1057,22 +1061,17 @@ def applyMorCriteriaInLine(self, line, criteria, criteriaType = MOR_UNIT_CATEGOR
 		return self._checkCriteria( line["mor"], criteria, criteriaType )
 
 	def getLexicalDiversity(self, addressee=ADDRESSEE_ALL, metric=LEXICAL_DIVERSITY_HDD, extraParam = None):
-		lines = []
+		"""Calculate lexical diversity metric
 
-		if addressee == ADDRESSEE_ALL:
-			lines = self.getLines()
-		elif addressee == ADDRESSEE_CHILD_DIRECTED:
-			for l in self.getLines():
-				if l[LINE_ADDRESSEE] == SPEAKER_TARGET_CHILD:
-					lines.append(l)
-		elif addressee == ADDRESSEE_CHILD_PRODUCED:
-			for l in self.getLines():
-				if l[LINE_SPEAKER] == SPEAKER_TARGET_CHILD:
-					lines.append(l)
-		elif addressee == ADDRESSEE_OVER_HEARD:
-			for l in self.getLines():
-				if l[LINE_ADDRESSEE] != SPEAKER_TARGET_CHILD and l[LINE_SPEAKER] != SPEAKER_TARGET_CHILD :
-					lines.append(l)
+		Args:
+			addressee (str, optional): Defaults to ADDRESSEE_ALL.
+			metric (str, optional): LEXICAL_DIVERSITY_HDD, LEXICAL_DIVERSITY_MAAS, LEXICAL_DIVERSITY_MTLD, LEXICAL_DIVERSITY_MATTR, LEXICAL_DIVERSITY_TTR . Defaults to LEXICAL_DIVERSITY_HDD.
+			extraParam (_type_, optional): LEXICAL_DIVERSITY_MATTR has extra param window_size that defaults to 50. Defaults to None.
+
+		Returns:
+			_type_: _description_
+		"""
+		lines = self.getLines(addressee)
 
 		tokens = []
 		for l in lines:
@@ -1097,6 +1096,27 @@ def getLexicalDiversity(self, addressee=ADDRESSEE_ALL, metric=LEXICAL_DIVERSITY_
 
 		return result
 
+	def getLinguisticProductivity(self, addressee=ADDRESSEE_ALL, metric=LINGUISTIC_PRODUCTIVITY_MLU):
+		"""Calculates MLU metric. Note that results can be very different from CLAN
+
+		Only utterances whose TIER_MOR is not empty are counted. The morpheme
+		count is the sum of len(line[TIER_MOR]) over those utterances.
+
+		Args:
+			addressee (str, optional): Passed to getLines to select the utterances. Defaults to ADDRESSEE_ALL.
+			metric (str, optional): Currently not read; MLU is the only metric computed. Defaults to LINGUISTIC_PRODUCTIVITY_MLU.
+
+		Returns:
+			tuple: (utterance count, morpheme count, MLU). MLU is 0.0 when no utterance was counted.
+		"""
+		count_utts = 0
+		count_mor = 0
+
+		# Utterances containing " xxx" are currently NOT excluded.
+		for line in self.getLines(addressee):
+			units = len(line[TIER_MOR])
+			if units > 0:
+				count_utts += 1
+				count_mor += units
+
+		# Guard the division: an empty selection used to raise ZeroDivisionError.
+		mlu = count_mor / count_utts if count_utts > 0 else 0.0
+
+		return (count_utts, count_mor, mlu)
+
 	def isUtteranceEmpty(self, line):
 		"""Returns True if the utterance is empty based on a word criteria
 

diff --git a/test.py b/test.py
@@ -1,16 +1,64 @@
 from ChaFile import *
 from glob import glob
+import os
 
-chaPaths = glob("/home/macramole/Code/ciipme/aclew/highvolResults/listos_cha/*.cha")
+chaPaths = glob("/home/macramole/Code/isolci/corpus_bebes/longi_audio1/codificados/codificados bullets elan/*.cha")
+# chaPaths = glob("/home/macramole/Code/ciipme/aclew/highvolResults/listos_cha/*.cha")
 # chaPaths = glob("/home/macramole/Code/ciipme/aclew/highVolFromGit/allCHA/eng/*.cha")
 
 # chaPath = "/home/macramole/Code/ciipme/aclew/highvolResults/listos_cha/donatow-a1-nsm.elan.cha"
 # chaPath = "/home/macramole/Code/ciipme/corpusESPON/longi_audio1/codificados/codificados bullets elan/alma-a1-nsb.cha"
 # cha = ChaFile( chaPath, language=LANGUAGE_SPANISH )
 
-for c in chaPaths:
-    cha = ChaFile( c, language=LANGUAGE_SPANISH )
+#for c in chaPaths:
+    #cha = ChaFile( c, language=LANGUAGE_SPANISH )
     # cha = ChaFile( c, language=LANGUAGE_ENGLISH )
-    print(cha.countUtterances(False), " / ", cha.countUtterances(True))
+    #print(cha.countUtterances(False), " / ", cha.countUtterances(True))
 
+path = "/home/macramole/Code/isolci/corpus_bebes/longi_audio1/codificados/codificados bullets elan/alma-a1-nsb.cha"
+cha = ChaFile( path, language=LANGUAGE_SPANISH, onlyCDS=True )
+# lines = cha.getLinesBySpeakers()
 
+# for speaker in lines:
+#     count = 0
+#     for line in lines[speaker]:
+#         if len(line[TIER_MOR]) > 0 and not " xxx" in line[LINE_UTTERANCE]:
+#             # print(line[LINE_UTTERANCE])
+#             count += 1
+
+#     print(f"{speaker}: {count}")
+
+print( cha.getLinguisticProductivity(ADDRESSEE_CHILD_DIRECTED) )
+
+# print()
+
+# for line in cha.getLines():
+#     if line[LINE_SPEAKER] == "NIN":
+#         print(line[LINE_UTTERANCE])
+#         print(line[TIER_MOR])
+#         print()
+
+
+# for path in chaPaths:
+#     cha = ChaFile( path, language=LANGUAGE_SPANISH )
+#     # print(cha.countUtterances(ADDRESSEE_CHILD_DIRECTED))
+#     # print(len(cha.getLines(ADDRESSEE_CHILD_DIRECTED)))
+#     cant_utt = 0
+#     cant_mor = 0
+
+#     for l in cha.getLines(ADDRESSEE_CHILD_DIRECTED):
+#         # if len(l[TIER_MOR]) > 0:
+#             # if len(l[TIER_MOR]) == 1:
+#             #     if l[TIER_MOR][0][MOR_UNIT_CATEGORIA] == "co":
+#             #         continue
+
+#         cant_utt += 1
+#         cant_mor += len(l[TIER_MOR])
+
+#             # if len(l[TIER_MOR]) == 1:
+#             #     print( l[LINE_UTTERANCE] )
+#             #     print( l[TIER_MOR] ) 
+#             #     print()
+
+#     print(os.path.basename(path))
+#     print(f"{cant_utt} | {cant_mor} = {cant_mor/cant_utt}")