From c24322c418c86c81ca90c26ec102b4c5a988a49a Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Sun, 3 Nov 2024 23:57:04 +0800 Subject: [PATCH] =?UTF-8?q?Measures:=20Add=20effect=20size=20-=20=CE=94P?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- doc/doc.md | 14 ++-- doc/measures/effect_size/delta_p.svg | 34 ++++++++++ .../test_measures_effect_size.py | 66 ++++++++++++------- .../wl_measures/wl_measures_effect_size.py | 43 +++++++----- wordless/wl_settings/wl_settings_global.py | 31 +++++---- 6 files changed, 132 insertions(+), 58 deletions(-) create mode 100644 doc/measures/effect_size/delta_p.svg diff --git a/CHANGELOG.md b/CHANGELOG.md index ef5c161f0..b86e18e8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ ## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024 ### 🎉 New Features -- Measures: Add effect size - conditional probability / squared association ratio +- Measures: Add effect size - conditional probability / ΔP / squared association ratio - Utils: Add Stanza's Sindhi dependency parser ### 📌 Bugfixes diff --git a/doc/doc.md b/doc/doc.md index 1235dfa2d..64fd3a11f 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -1471,15 +1471,15 @@ Test of Statistical Significance|Measure of Bayes Factor|Formula|Collocation Ext Z-test (Berry-Rogghe)
([Berry-Rogghe, 1973](#ref-berry-rogghe-1973))||![Formula](/doc/measures/statistical_significance/z_test_berry_rogghe.svg)
where **S** is the average span size on both sides of the node word.|✔|✖️ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/tests_measures/test_measures_effect_size.py b/tests/tests_measures/test_measures_effect_size.py index fb0d9aed4..41026e6e2 100644 --- a/tests/tests_measures/test_measures_effect_size.py +++ b/tests/tests_measures/test_measures_effect_size.py @@ -35,30 +35,6 @@ def assert_zeros(func, result = 0): numpy.array([result] * 10) ) -# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf | p. 18 -def test_pct_diff(): - numpy.testing.assert_array_equal( - numpy.round(wl_measures_effect_size.pct_diff( - main, - numpy.array([206523] * 2), - numpy.array([178174] * 2), - numpy.array([959641 - 206523] * 2), - numpy.array([1562358 - 178174] * 2) - ), 1), - numpy.array([88.7] * 2) - ) - - numpy.testing.assert_array_equal( - wl_measures_effect_size.pct_diff( - main, - numpy.array([0, 1, 0]), - numpy.array([1, 0, 0]), - numpy.array([0, 0, 0]), - numpy.array([1, 1, 0]) - ), - numpy.array([float('-inf'), float('inf'), 0]) - ) - # Reference: Durrant, P. (2008). High frequency collocations and second language learning [Doctoral dissertation, University of Nottingham]. Nottingham eTheses. https://eprints.nottingham.ac.uk/10622/1/final_thesis.pdf | pp. 80, 84 def test_conditional_probability(): numpy.testing.assert_array_equal( @@ -72,9 +48,26 @@ def test_conditional_probability(): numpy.array([0.178, 0.349]) ) + assert_zeros(wl_measures_effect_size.conditional_probability) + def test_im3(): assert_zeros(wl_measures_effect_size.im3) +# Reference: Gries, S. T. (2013). 50-something years of work on collocations: What is or should be next …. International Journal of Corpus Linguistics, 18(1), 137–165. https://doi.org/10.1075/ijcl.18.1.09gri | p. 144 +def test_delta_p(): + numpy.testing.assert_array_equal( + numpy.round(wl_measures_effect_size.delta_p( + main, + numpy.array([5610, 5610]), + numpy.array([2257, 168938]), + numpy.array([168938, 2257]), + numpy.array([10233063, 10233063]) + ), 3), + numpy.array([0.032, 0.697]) + ) + + assert_zeros(wl_measures_effect_size.delta_p) + # Reference: Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. Computational Linguistics, 22(1), pp. 1–38. | p. 13 def test_dice_sorensen_coeff(): numpy.testing.assert_array_equal( @@ -216,6 +209,30 @@ def test_odds_ratio(): numpy.array([float('-inf'), float('inf'), 0]) ) +# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf | p. 18 +def test_pct_diff(): + numpy.testing.assert_array_equal( + numpy.round(wl_measures_effect_size.pct_diff( + main, + numpy.array([206523] * 2), + numpy.array([178174] * 2), + numpy.array([959641 - 206523] * 2), + numpy.array([1562358 - 178174] * 2) + ), 1), + numpy.array([88.7] * 2) + ) + + numpy.testing.assert_array_equal( + wl_measures_effect_size.pct_diff( + main, + numpy.array([0, 1, 0]), + numpy.array([1, 0, 0]), + numpy.array([0, 0, 0]), + numpy.array([1, 1, 0]) + ), + numpy.array([float('-inf'), float('inf'), 0]) + ) + # Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29. | p. 24 def test_pmi(): numpy.testing.assert_array_equal( @@ -256,6 +273,7 @@ def test_squared_phi_coeff(): test_pct_diff() test_conditional_probability() test_im3() + test_delta_p() test_dice_sorensen_coeff() test_diff_coeff() test_jaccard_index() diff --git a/wordless/wl_measures/wl_measures_effect_size.py b/wordless/wl_measures/wl_measures_effect_size.py index 348566467..922bc5b7a 100644 --- a/wordless/wl_measures/wl_measures_effect_size.py +++ b/wordless/wl_measures/wl_measures_effect_size.py @@ -22,24 +22,6 @@ from wordless.wl_measures import wl_measures_statistical_significance, wl_measure_utils -# %DIFF -# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf -def pct_diff(main, o11s, o12s, o21s, o22s): - _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) - - return numpy.where( - (o11s == 0) & (o12s > 0), - -numpy.inf, - numpy.where( - (o11s > 0) & (o12s == 0), - numpy.inf, - wl_measure_utils.numpy_divide( - (wl_measure_utils.numpy_divide(o11s, ox1s) - wl_measure_utils.numpy_divide(o12s, ox2s)) * 100, - wl_measure_utils.numpy_divide(o12s, ox2s) - ) - ) - ) - # Conditional probability # Reference: Durrant, P. (2008). High frequency collocations and second language learning [Doctoral dissertation, University of Nottingham]. Nottingham eTheses. https://eprints.nottingham.ac.uk/10622/1/final_thesis.pdf | p. 84 def conditional_probability(main, o11s, o12s, o21s, o22s): @@ -54,6 +36,13 @@ def im3(main, o11s, o12s, o21s, o22s): return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 3, e11s)) +# ΔP +# Reference: Gries, S. T. (2013). 50-something years of work on collocations: What is or should be next …. International Journal of Corpus Linguistics, 18(1), 137–165. https://doi.org/10.1075/ijcl.18.1.09gri +def delta_p(main, o11s, o12s, o21s, o22s): + _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) + + return wl_measure_utils.numpy_divide(o11s, ox1s) - wl_measure_utils.numpy_divide(o12s, ox2s) + # Dice-Sørensen coefficient # Reference: Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. Computational Linguistics, 22(1), 1–38. | p. 8 def dice_sorensen_coeff(main, o11s, o12s, o21s, o22s): @@ -188,6 +177,24 @@ def odds_ratio(main, o11s, o12s, o21s, o22s): ) ) +# %DIFF +# Reference: Gabrielatos, C., & Marchi, A. (2011, November 5). Keyness: Matching metrics to definitions [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf +def pct_diff(main, o11s, o12s, o21s, o22s): + _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) + + return numpy.where( + (o11s == 0) & (o12s > 0), + -numpy.inf, + numpy.where( + (o11s > 0) & (o12s == 0), + numpy.inf, + wl_measure_utils.numpy_divide( + (wl_measure_utils.numpy_divide(o11s, ox1s) - wl_measure_utils.numpy_divide(o12s, ox2s)) * 100, + wl_measure_utils.numpy_divide(o12s, ox2s) + ) + ) + ) + # Pointwise mutual information # Reference: Church, K. W., & Hanks, P. (1990). Word association norms, mutual information, and lexicography. Computational Linguistics, 16(1), 22–29. def pmi(main, o11s, o12s, o21s, o22s): diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index 329417dad..8ebcfe2f1 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -3593,10 +3593,10 @@ def init_settings_global(): 'effect_size': { _tr('wl_settings_global', 'None'): 'none', - '%DIFF': 'pct_diff', _tr('wl_settings_global', 'Conditional probability'): 'conditional_probability', _tr('wl_settings_global', 'Cubic association ratio'): 'im3', - _tr('wl_settings_global', "Dice-Sørensen coefficient"): 'dice_sorensen_coeff', + 'ΔP': 'delta_p', + _tr('wl_settings_global', 'Dice-Sørensen coefficient'): 'dice_sorensen_coeff', _tr('wl_settings_global', 'Difference coefficient'): 'diff_coeff', _tr('wl_settings_global', 'Jaccard index'): 'jaccard_index', _tr('wl_settings_global', "Kilgarriff's ratio"): 'kilgarriffs_ratio', @@ -3609,6 +3609,7 @@ def init_settings_global(): _tr('wl_settings_global', 'Mutual Expectation'): 'me', _tr('wl_settings_global', 'Mutual information'): 'mi', _tr('wl_settings_global', 'Odds ratio'): 'or', + '%DIFF': 'pct_diff', _tr('wl_settings_global', 'Pointwise mutual information'): 'pmi', _tr('wl_settings_global', 'Poisson collocation measure'): 'poisson_collocation_measure', _tr('wl_settings_global', 'Squared association ratio'): 'im2', @@ -3616,11 +3617,12 @@ def init_settings_global(): } }, + # Column headers are capitalized 'measures_dispersion': { 'none': { 'col_text': None, 'func': None, - 'type': '' + 'type': None }, 'ald': { @@ -3682,7 +3684,7 @@ def init_settings_global(): 'none': { 'col_text': None, 'func': None, - 'type': '' + 'type': None }, 'fald': { @@ -3840,13 +3842,6 @@ def init_settings_global(): 'keyword': True }, - 'pct_diff': { - 'col_text': '%DIFF', - 'func': wl_measures_effect_size.pct_diff, - 'collocation': False, - 'keyword': True - }, - 'conditional_probability': { 'col_text': 'P', 'func': wl_measures_effect_size.conditional_probability, @@ -3861,6 +3856,13 @@ def init_settings_global(): 'keyword': True }, + 'delta_p': { + 'col_text': 'ΔP', + 'func': wl_measures_effect_size.delta_p, + 'collocation': True, + 'keyword': False + }, + 'dice_sorensen_coeff': { 'col_text': _tr('wl_settings_global', 'Dice-Sørensen Coefficient'), 'func': wl_measures_effect_size.dice_sorensen_coeff, @@ -3952,6 +3954,13 @@ def init_settings_global(): 'keyword': True }, + 'pct_diff': { + 'col_text': '%DIFF', + 'func': wl_measures_effect_size.pct_diff, + 'collocation': False, + 'keyword': True + }, + 'pmi': { 'col_text': 'PMI', 'func': wl_measures_effect_size.pmi,