diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef5c161f0..b86e18e8d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,7 +20,7 @@
## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024
### 🎉 New Features
-- Measures: Add effect size - conditional probability / squared association ratio
+- Measures: Add effect size - conditional probability / ΔP / squared association ratio
- Utils: Add Stanza's Sindhi dependency parser
### 📌 Bugfixes
diff --git a/doc/doc.md b/doc/doc.md
index 1235dfa2d..64fd3a11f 100644
--- a/doc/doc.md
+++ b/doc/doc.md
@@ -1471,15 +1471,15 @@ Test of Statistical Significance|Measure of Bayes Factor|Formula|Collocation Ext
Z-test (Berry-Rogghe)
([Berry-Rogghe, 1973](#ref-berry-rogghe-1973))||![Formula](/doc/measures/statistical_significance/z_test_berry_rogghe.svg)
where **S** is the average span size on both sides of the node word.|✔|✖️
+
\ No newline at end of file
diff --git a/tests/tests_measures/test_measures_effect_size.py b/tests/tests_measures/test_measures_effect_size.py
index fb0d9aed4..41026e6e2 100644
--- a/tests/tests_measures/test_measures_effect_size.py
+++ b/tests/tests_measures/test_measures_effect_size.py
@@ -35,30 +35,6 @@ def assert_zeros(func, result = 0):
numpy.array([result] * 10)
)
-# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf | p. 18
-def test_pct_diff():
- numpy.testing.assert_array_equal(
- numpy.round(wl_measures_effect_size.pct_diff(
- main,
- numpy.array([206523] * 2),
- numpy.array([178174] * 2),
- numpy.array([959641 - 206523] * 2),
- numpy.array([1562358 - 178174] * 2)
- ), 1),
- numpy.array([88.7] * 2)
- )
-
- numpy.testing.assert_array_equal(
- wl_measures_effect_size.pct_diff(
- main,
- numpy.array([0, 1, 0]),
- numpy.array([1, 0, 0]),
- numpy.array([0, 0, 0]),
- numpy.array([1, 1, 0])
- ),
- numpy.array([float('-inf'), float('inf'), 0])
- )
-
# Reference: Durrant, P. (2008). High frequency collocations and second language learning [Doctoral dissertation, University of Nottingham]. Nottingham eTheses. https://eprints.nottingham.ac.uk/10622/1/final_thesis.pdf | pp. 80, 84
def test_conditional_probability():
numpy.testing.assert_array_equal(
@@ -72,9 +48,26 @@ def test_conditional_probability():
numpy.array([0.178, 0.349])
)
+ assert_zeros(wl_measures_effect_size.conditional_probability)
+
def test_im3():
assert_zeros(wl_measures_effect_size.im3)
+# Reference: Gries, S. T. (2013). 50-something years of work on collocations: What is or should be next …. International Journal of Corpus Linguistics, 18(1), 137–165. https://doi.org/10.1075/ijcl.18.1.09gri | p. 144
+def test_delta_p():
+ numpy.testing.assert_array_equal(
+ numpy.round(wl_measures_effect_size.delta_p(
+ main,
+ numpy.array([5610, 5610]),
+ numpy.array([2257, 168938]),
+ numpy.array([168938, 2257]),
+ numpy.array([10233063, 10233063])
+ ), 3),
+ numpy.array([0.032, 0.697])
+ )
+
+ assert_zeros(wl_measures_effect_size.delta_p)
+
# Reference: Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. Computational Linguistics, 22(1), 1–38. | p. 13
def test_dice_sorensen_coeff():
numpy.testing.assert_array_equal(
@@ -216,6 +209,30 @@ def test_odds_ratio():
numpy.array([float('-inf'), float('inf'), 0])
)
+# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf | p. 18
+def test_pct_diff():
+ numpy.testing.assert_array_equal(
+ numpy.round(wl_measures_effect_size.pct_diff(
+ main,
+ numpy.array([206523] * 2),
+ numpy.array([178174] * 2),
+ numpy.array([959641 - 206523] * 2),
+ numpy.array([1562358 - 178174] * 2)
+ ), 1),
+ numpy.array([88.7] * 2)
+ )
+
+ numpy.testing.assert_array_equal(
+ wl_measures_effect_size.pct_diff(
+ main,
+ numpy.array([0, 1, 0]),
+ numpy.array([1, 0, 0]),
+ numpy.array([0, 0, 0]),
+ numpy.array([1, 1, 0])
+ ),
+ numpy.array([float('-inf'), float('inf'), 0])
+ )
+
# Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29. | p. 24
def test_pmi():
numpy.testing.assert_array_equal(
@@ -256,6 +273,7 @@ def test_squared_phi_coeff():
test_pct_diff()
test_conditional_probability()
test_im3()
+ test_delta_p()
test_dice_sorensen_coeff()
test_diff_coeff()
test_jaccard_index()
diff --git a/wordless/wl_measures/wl_measures_effect_size.py b/wordless/wl_measures/wl_measures_effect_size.py
index 348566467..922bc5b7a 100644
--- a/wordless/wl_measures/wl_measures_effect_size.py
+++ b/wordless/wl_measures/wl_measures_effect_size.py
@@ -22,24 +22,6 @@
from wordless.wl_measures import wl_measures_statistical_significance, wl_measure_utils
-# %DIFF
-# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf
-def pct_diff(main, o11s, o12s, o21s, o22s):
- _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s)
-
- return numpy.where(
- (o11s == 0) & (o12s > 0),
- -numpy.inf,
- numpy.where(
- (o11s > 0) & (o12s == 0),
- numpy.inf,
- wl_measure_utils.numpy_divide(
- (wl_measure_utils.numpy_divide(o11s, ox1s) - wl_measure_utils.numpy_divide(o12s, ox2s)) * 100,
- wl_measure_utils.numpy_divide(o12s, ox2s)
- )
- )
- )
-
# Conditional probability
# Reference: Durrant, P. (2008). High frequency collocations and second language learning [Doctoral dissertation, University of Nottingham]. Nottingham eTheses. https://eprints.nottingham.ac.uk/10622/1/final_thesis.pdf | p. 84
def conditional_probability(main, o11s, o12s, o21s, o22s):
@@ -54,6 +36,13 @@ def im3(main, o11s, o12s, o21s, o22s):
return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 3, e11s))
+# ΔP
+# Reference: Gries, S. T. (2013). 50-something years of work on collocations: What is or should be next …. International Journal of Corpus Linguistics, 18(1), 137–165. https://doi.org/10.1075/ijcl.18.1.09gri
+def delta_p(main, o11s, o12s, o21s, o22s):
+ _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s)
+
+ return wl_measure_utils.numpy_divide(o11s, ox1s) - wl_measure_utils.numpy_divide(o12s, ox2s)
+
# Dice-Sørensen coefficient
# Reference: Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. Computational Linguistics, 22(1), 1–38. | p. 8
def dice_sorensen_coeff(main, o11s, o12s, o21s, o22s):
@@ -188,6 +177,24 @@ def odds_ratio(main, o11s, o12s, o21s, o22s):
)
)
+# %DIFF
+# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf
+def pct_diff(main, o11s, o12s, o21s, o22s):
+ _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s)
+
+ return numpy.where(
+ (o11s == 0) & (o12s > 0),
+ -numpy.inf,
+ numpy.where(
+ (o11s > 0) & (o12s == 0),
+ numpy.inf,
+ wl_measure_utils.numpy_divide(
+ (wl_measure_utils.numpy_divide(o11s, ox1s) - wl_measure_utils.numpy_divide(o12s, ox2s)) * 100,
+ wl_measure_utils.numpy_divide(o12s, ox2s)
+ )
+ )
+ )
+
# Pointwise mutual information
# Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29.
def pmi(main, o11s, o12s, o21s, o22s):
diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py
index 329417dad..8ebcfe2f1 100644
--- a/wordless/wl_settings/wl_settings_global.py
+++ b/wordless/wl_settings/wl_settings_global.py
@@ -3593,10 +3593,10 @@ def init_settings_global():
'effect_size': {
_tr('wl_settings_global', 'None'): 'none',
- '%DIFF': 'pct_diff',
_tr('wl_settings_global', 'Conditional probability'): 'conditional_probability',
_tr('wl_settings_global', 'Cubic association ratio'): 'im3',
- _tr('wl_settings_global', "Dice-Sørensen coefficient"): 'dice_sorensen_coeff',
+ 'ΔP': 'delta_p',
+ _tr('wl_settings_global', 'Dice-Sørensen coefficient'): 'dice_sorensen_coeff',
_tr('wl_settings_global', 'Difference coefficient'): 'diff_coeff',
_tr('wl_settings_global', 'Jaccard index'): 'jaccard_index',
_tr('wl_settings_global', "Kilgarriff's ratio"): 'kilgarriffs_ratio',
@@ -3609,6 +3609,7 @@ def init_settings_global():
_tr('wl_settings_global', 'Mutual Expectation'): 'me',
_tr('wl_settings_global', 'Mutual information'): 'mi',
_tr('wl_settings_global', 'Odds ratio'): 'or',
+ '%DIFF': 'pct_diff',
_tr('wl_settings_global', 'Pointwise mutual information'): 'pmi',
_tr('wl_settings_global', 'Poisson collocation measure'): 'poisson_collocation_measure',
_tr('wl_settings_global', 'Squared association ratio'): 'im2',
@@ -3616,11 +3617,12 @@ def init_settings_global():
}
},
+ # Column headers are capitalized
'measures_dispersion': {
'none': {
'col_text': None,
'func': None,
- 'type': ''
+ 'type': None
},
'ald': {
@@ -3682,7 +3684,7 @@ def init_settings_global():
'none': {
'col_text': None,
'func': None,
- 'type': ''
+ 'type': None
},
'fald': {
@@ -3840,13 +3842,6 @@ def init_settings_global():
'keyword': True
},
- 'pct_diff': {
- 'col_text': '%DIFF',
- 'func': wl_measures_effect_size.pct_diff,
- 'collocation': False,
- 'keyword': True
- },
-
'conditional_probability': {
'col_text': 'P',
'func': wl_measures_effect_size.conditional_probability,
@@ -3861,6 +3856,13 @@ def init_settings_global():
'keyword': True
},
+ 'delta_p': {
+ 'col_text': 'ΔP',
+ 'func': wl_measures_effect_size.delta_p,
+ 'collocation': True,
+ 'keyword': False
+ },
+
'dice_sorensen_coeff': {
'col_text': _tr('wl_settings_global', 'Dice-Sørensen Coefficient'),
'func': wl_measures_effect_size.dice_sorensen_coeff,
@@ -3952,6 +3954,13 @@ def init_settings_global():
'keyword': True
},
+ 'pct_diff': {
+ 'col_text': '%DIFF',
+ 'func': wl_measures_effect_size.pct_diff,
+ 'collocation': False,
+ 'keyword': True
+ },
+
'pmi': {
'col_text': 'PMI',
'func': wl_measures_effect_size.pmi,