Skip to content

Commit

Permalink
Measures: Add effect size - conditional probability
Browse files Browse the repository at this point in the history
  • Loading branch information
BLKSerene committed Nov 2, 2024
1 parent 97b0f15 commit 436257e
Show file tree
Hide file tree
Showing 25 changed files with 355 additions and 232 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
### 🎉 New Features
- Measures: Add effect size - squared association ratio
- Measures: Add effect size - conditional probability / squared association ratio
- Utils: Add Stanza's Sindhi dependency parser

### 📌 Bugfixes
Expand Down
97 changes: 51 additions & 46 deletions doc/doc.md

Large diffs are not rendered by default.

29 changes: 29 additions & 0 deletions doc/measures/effect_size/conditional_probability.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions tests/test_colligation_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ def test_colligation_extractor():
tests_statistical_significance = [
test_statistical_significance
for test_statistical_significance, vals in main.settings_global['tests_statistical_significance'].items()
if vals['collocation_extractor']
if vals['collocation']
]
measures_bayes_factor = [
measure_bayes_factor
for measure_bayes_factor, vals in main.settings_global['measures_bayes_factor'].items()
if vals['collocation_extractor']
if vals['collocation']
]
measures_effect_size = list(main.settings_global['measures_effect_size'].keys())

Expand Down
4 changes: 2 additions & 2 deletions tests/test_collocation_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,12 @@ def test_collocation_extractor():
tests_statistical_significance = [
test_statistical_significance
for test_statistical_significance, vals in main.settings_global['tests_statistical_significance'].items()
if vals['collocation_extractor']
if vals['collocation']
]
measures_bayes_factor = [
measure_bayes_factor
for measure_bayes_factor, vals in main.settings_global['measures_bayes_factor'].items()
if vals['collocation_extractor']
if vals['collocation']
]
measures_effect_size = list(main.settings_global['measures_effect_size'].keys())

Expand Down
4 changes: 2 additions & 2 deletions tests/test_keyword_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ def test_keyword_extractor():
tests_statistical_significance = [
test_statistical_significance
for test_statistical_significance, vals in main.settings_global['tests_statistical_significance'].items()
if vals['keyword_extractor']
if vals['keyword']
]
measures_bayes_factor = [
measure_bayes_factor
for measure_bayes_factor, vals in main.settings_global['measures_bayes_factor'].items()
if vals['keyword_extractor']
if vals['keyword']
]
measures_effect_size = list(main.settings_global['measures_effect_size'].keys())

Expand Down
26 changes: 13 additions & 13 deletions tests/tests_measures/test_measures_adjusted_freq.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

main = wl_test_init.Wl_Test_Main()

# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 410)
# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 410
def test_fald():
assert round(wl_measures_adjusted_freq.fald(main, test_measures_dispersion.TOKENS, 'a'), 3) == 11.764
assert wl_measures_adjusted_freq.fald(main, test_measures_dispersion.TOKENS, 'aa') == 0
Expand All @@ -36,44 +36,44 @@ def test_fawt():
assert wl_measures_adjusted_freq.fawt(main, test_measures_dispersion.TOKENS, 'aa') == 0

# References:
# Carroll, J. B. (1970). An alternative to Juilland’s usage coefficient for lexical frequencies and a proposal for a standard frequency index. Computer Studies in the Humanities and Verbal Behaviour, 3(2), 61–65. https://doi.org/10.1002/
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
# Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
# Carroll, J. B. (1970). An alternative to Juilland's usage coefficient for lexical frequencies. ETS Research Bulletin Series, 1970(2), i–15. https://doi.org/10.1002/j.2333-8504.1970.tb00778.x | p. 13
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
# Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
def test_carrolls_um():
    """Check ``carrolls_um`` against rounded reference values and an all-zero input."""
    # Each case: (input list, digits to round the result to, expected rounded value)
    rounded_cases = (
        ([2, 1, 1, 1, 0], 2, 4.31),
        ([4, 2, 1, 1, 0], 3, 6.424),
        ([1, 2, 3, 4, 5], 3, 14.108),
    )

    for values, ndigits, expected in rounded_cases:
        result = wl_measures_adjusted_freq.carrolls_um(main, values)

        assert round(result, ndigits) == expected

    # An all-zero input is compared without rounding
    result_zeros = wl_measures_adjusted_freq.carrolls_um(main, [0, 0, 0, 0, 0])

    assert result_zeros == 0

# References:
# Carroll, J. B. (1970). An alternative to Juilland’s usage coefficient for lexical frequencies and a proposal for a standard frequency index. Computer Studies in the Humanities and Verbal Behaviour, 3(2), 61–65. https://doi.org/10.1002/j.2333-8504.1970.tb00778.x
# Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. (p. 115)
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
# Carroll, J. B. (1970). An alternative to Juilland's usage coefficient for lexical frequencies. ETS Research Bulletin Series, 1970(2), i–15. https://doi.org/10.1002/j.2333-8504.1970.tb00778.x | p. 14
# Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. | p. 115
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
def test_juillands_u():
    """Check ``juillands_u`` against rounded reference values and an all-zero input."""
    # Each case: (input list, digits to round the result to, expected rounded value)
    rounded_cases = (
        ([0, 4, 3, 2, 1], 2, 6.46),
        ([2, 2, 2, 2, 2], 0, 10),
        ([4, 2, 1, 1, 0], 3, 4.609),
    )

    for values, ndigits, expected in rounded_cases:
        result = wl_measures_adjusted_freq.juillands_u(main, values)

        assert round(result, ndigits) == expected

    # An all-zero input is compared without rounding
    result_zeros = wl_measures_adjusted_freq.juillands_u(main, [0, 0, 0, 0, 0])

    assert result_zeros == 0

# References:
# Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. (p. 117)
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
# Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
# Rosengren, I. (1971). The quantitative concept of language and its relation to the structure of frequency dictionaries. Études de linguistique appliquée, 1, 103–127. | p. 117
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
# Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
# NOTE(review): the test name reads "rosengres" while the function under test is
# "rosengrens_kf"; the name is kept unchanged here so the collected test ID stays the same.
def test_rosengres_kf():
    """Check ``rosengrens_kf`` against rounded reference values and an all-zero input."""
    # Each case: (input list, digits to round the result to, expected rounded value)
    rounded_cases = (
        ([2, 2, 2, 2, 1], 2, 8.86),
        ([4, 2, 1, 1, 0], 3, 5.863),
        ([1, 2, 3, 4, 5], 3, 14.053),
    )

    for values, ndigits, expected in rounded_cases:
        result = wl_measures_adjusted_freq.rosengrens_kf(main, values)

        assert round(result, ndigits) == expected

    # An all-zero input is compared without rounding
    result_zeros = wl_measures_adjusted_freq.rosengrens_kf(main, [0, 0, 0, 0, 0])

    assert result_zeros == 0

# References:
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. (p. 122)
# Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
# Engwall, G. (1974). Fréquence et distribution du vocabulaire dans un choix de romans français [Unpublished doctoral dissertation]. Stockholm University. | p. 122
# Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
def test_engwalls_fm():
    """Check ``engwalls_fm`` against rounded reference values and an all-zero input."""
    # Each case: (input list, digits to round the result to, expected rounded value)
    rounded_cases = (
        ([4, 2, 1, 1, 0], 1, 6.4),
        ([1, 2, 3, 4, 5], 0, 15),
    )

    for values, ndigits, expected in rounded_cases:
        result = wl_measures_adjusted_freq.engwalls_fm(main, values)

        assert round(result, ndigits) == expected

    # An all-zero input is compared without rounding
    result_zeros = wl_measures_adjusted_freq.engwalls_fm(main, [0, 0, 0, 0, 0])

    assert result_zeros == 0

# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri (p. 409)
# Reference: Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. International Journal of Corpus Linguistics, 13(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri | p. 409
def test_kromers_ur():
assert round(wl_measures_adjusted_freq.kromers_ur(main, [2, 1, 1, 1, 0]), 1) == 4.5
assert wl_measures_adjusted_freq.kromers_ur(main, [0, 0, 0, 0, 0]) == 0
Expand Down
Loading

0 comments on commit 436257e

Please sign in to comment.