Measures: Add effect size - conditional probability

BLKSerene · Nov 2, 2024 · 436257e · 436257e
1 parent 97b0f15
commit 436257e
Show file tree

Hide file tree

Showing 25 changed files with 355 additions and 232 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,7 +20,7 @@
 
 ## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
 ### 🎉 New Features
-- Measures: Add effect size - squared association ratio
+- Measures: Add effect size - conditional probability / squared association ratio
 - Utils: Add Stanza's Sindhi dependency parser
 
 ### 📌 Bugfixes

diff --git a/doc/doc.md b/doc/doc.md
diff --git a/doc/measures/effect_size/conditional_probability.svg b/doc/measures/effect_size/conditional_probability.svg
diff --git a/tests/test_colligation_extractor.py b/tests/test_colligation_extractor.py
@@ -34,12 +34,12 @@ def test_colligation_extractor():
     tests_statistical_significance = [
         test_statistical_significance
         for test_statistical_significance, vals in main.settings_global['tests_statistical_significance'].items()
-        if vals['collocation_extractor']
+        if vals['collocation']
     ]
     measures_bayes_factor = [
         measure_bayes_factor
         for measure_bayes_factor, vals in main.settings_global['measures_bayes_factor'].items()
-        if vals['collocation_extractor']
+        if vals['collocation']
     ]
     measures_effect_size = list(main.settings_global['measures_effect_size'].keys())
 

diff --git a/tests/test_collocation_extractor.py b/tests/test_collocation_extractor.py
@@ -34,12 +34,12 @@ def test_collocation_extractor():
     tests_statistical_significance = [
         test_statistical_significance
         for test_statistical_significance, vals in main.settings_global['tests_statistical_significance'].items()
-        if vals['collocation_extractor']
+        if vals['collocation']
     ]
     measures_bayes_factor = [
         measure_bayes_factor
         for measure_bayes_factor, vals in main.settings_global['measures_bayes_factor'].items()
-        if vals['collocation_extractor']
+        if vals['collocation']
     ]
     measures_effect_size = list(main.settings_global['measures_effect_size'].keys())
 

diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py
@@ -31,12 +31,12 @@ def test_keyword_extractor():
     tests_statistical_significance = [
         test_statistical_significance
         for test_statistical_significance, vals in main.settings_global['tests_statistical_significance'].items()
-        if vals['keyword_extractor']
+        if vals['keyword']
     ]
     measures_bayes_factor = [
         measure_bayes_factor
         for measure_bayes_factor, vals in main.settings_global['measures_bayes_factor'].items()
-        if vals['keyword_extractor']
+        if vals['keyword']
     ]
     measures_effect_size = list(main.settings_global['measures_effect_size'].keys())
 

diff --git a/tests/tests_measures/test_measures_adjusted_freq.py b/tests/tests_measures/test_measures_adjusted_freq.py
@@ -22,7 +22,7 @@
 
 main = wl_test_init.Wl_Test_Main()
 
-# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 410)
+# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 410
 def test_fald():
     assert round(wl_measures_adjusted_freq.fald(main, test_measures_dispersion.TOKENS, 'a'), 3) == 11.764
     assert wl_measures_adjusted_freq.fald(main, test_measures_dispersion.TOKENS, 'aa') == 0
@@ -36,44 +36,44 @@ def test_fawt():
     assert wl_measures_adjusted_freq.fawt(main, test_measures_dispersion.TOKENS, 'aa') == 0
 
 # References:
-#     Carroll, J. B. (1970). An alternative to Juilland’s usage coefficient for lexical frequencies and a proposal for a standard frequency index. Computer Studies in the Humanities and Verbal Behaviour, 3(2), 61–65. https://doi.org/10.1002/
-#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
-#     Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
+#     Carroll, J. B. (1970). An alternative to Juilland's usage coefficient for lexical frequencies. ETS Research Bulletin Series, 1970(2), i–15. https://doi.org/10.1002/j.2333-8504.1970.tb00778.x | p. 13
+#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
+#     Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
 def test_carrolls_um():
     assert round(wl_measures_adjusted_freq.carrolls_um(main, [2, 1, 1, 1, 0]), 2) == 4.31
     assert round(wl_measures_adjusted_freq.carrolls_um(main, [4, 2, 1, 1, 0]), 3) == 6.424
     assert round(wl_measures_adjusted_freq.carrolls_um(main, [1, 2, 3, 4, 5]), 3) == 14.108
     assert wl_measures_adjusted_freq.carrolls_um(main, [0, 0, 0, 0, 0]) == 0
 
 # References
-#     Carroll, J. B. (1970). An alternative to Juilland’s usage coefficient for lexical frequencies and a proposal for a standard frequency index. Computer Studies in the Humanities and Verbal Behaviour, 3(2), 61–65. https://doi.org/10.1002/j.2333-8504.1970.tb00778.x
-#     Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. (p. 115)
-#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
+#     Carroll, J. B. (1970). An alternative to Juilland's usage coefficient for lexical frequencies. ETS Research Bulletin Series, 1970(2), i–15. https://doi.org/10.1002/j.2333-8504.1970.tb00778.x | p. 14
+#     Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. | p. 115
+#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
 def test_juillands_u():
     assert round(wl_measures_adjusted_freq.juillands_u(main, [0, 4, 3, 2, 1]), 2) == 6.46
     assert round(wl_measures_adjusted_freq.juillands_u(main, [2, 2, 2, 2, 2]), 0) == 10
     assert round(wl_measures_adjusted_freq.juillands_u(main, [4, 2, 1, 1, 0]), 3) == 4.609
     assert wl_measures_adjusted_freq.juillands_u(main, [0, 0, 0, 0, 0]) == 0
 
 # References:
-#     Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. (p. 117)
-#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
-#     Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
+#     Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. | p. 117
+#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
+#     Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
 def test_rosengres_kf():
     assert round(wl_measures_adjusted_freq.rosengrens_kf(main, [2, 2, 2, 2, 1]), 2) == 8.86
     assert round(wl_measures_adjusted_freq.rosengrens_kf(main, [4, 2, 1, 1, 0]), 3) == 5.863
     assert round(wl_measures_adjusted_freq.rosengrens_kf(main, [1, 2, 3, 4, 5]), 3) == 14.053
     assert wl_measures_adjusted_freq.rosengrens_kf(main, [0, 0, 0, 0, 0]) == 0
 
 # References:
-#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
-#     Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
+#     Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
+#     Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
 def test_engwalls_fm():
     assert round(wl_measures_adjusted_freq.engwalls_fm(main, [4, 2, 1, 1, 0]), 1) == 6.4
     assert round(wl_measures_adjusted_freq.engwalls_fm(main, [1, 2, 3, 4, 5]), 0) == 15
     assert wl_measures_adjusted_freq.engwalls_fm(main, [0, 0, 0, 0, 0]) == 0
 
-# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
+# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
 def test_kromers_ur():
     assert round(wl_measures_adjusted_freq.kromers_ur(main, [2, 1, 1, 1, 0]), 1) == 4.5
     assert wl_measures_adjusted_freq.kromers_ur(main, [0, 0, 0, 0, 0]) == 0