From 07ad76106f49189257571e07e50c4acf8e4b7349 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Tue, 5 Nov 2024 15:21:19 +0800 Subject: [PATCH] =?UTF-8?q?Measures:=20Add=20effect=20size=20-=20=CE=BC-va?= =?UTF-8?q?lue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- doc/doc.md | 12 ++++++-- doc/measures/effect_size/mu_val.svg | 22 ++++++++++++++ .../test_measures_effect_size.py | 30 ++++++++++++++----- .../wl_measures/wl_measures_effect_size.py | 9 +++++- wordless/wl_settings/wl_settings_global.py | 8 +++++ 6 files changed, 71 insertions(+), 12 deletions(-) create mode 100644 doc/measures/effect_size/mu_val.svg diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b6a5fefa..80f99acb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ ## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024 ### 🎉 New Features -- Measures: Add effect size - conditional probability / ΔP / mutual information (normalized) / pointwise mutual information (normalized) / squared association ratio +- Measures: Add effect size - conditional probability / ΔP / mutual information (normalized) / μ-value / pointwise mutual information (normalized) / squared association ratio - Settings: Add Settings - Measures - Effect Size - Mutual Information / Pointwise Mutual Information / Pointwise Mutual Information (Cubic) / Pointwise Mutual Information (Squared) - Utils: Add Stanza's Sindhi dependency parser diff --git a/doc/doc.md b/doc/doc.md index a8b96912c..a99e299fd 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -1510,6 +1510,9 @@ Mutual information: Mutual information (normalized): \text{NMI} = \frac{\sum_{i = 1}^2 \sum_{j = 1}^2 \left(\frac{O_{ij}}{O_{xx}} \times \log_{base} \frac{O_{ij}}{E_{ij}}\right)}{-\sum_{i = 1}^2 \sum_{j = 1}^2 \left(\frac{O_{ij}}{O_{xx}} \times \log_{base} \frac{O_{ij}}{O_{xx}}\right)} +μ-value: + \mu = \frac{O_{11}}{E_{11}} + Odds ratio: \text{Odds ratio} = \frac{O_{11} \times O_{22}}{O_{12} \times O_{21}} @@ -1546,10 +1549,11 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction logDice
([Rychlý, 2008, p. 9](#ref-rychly-2008))|![Formula](/doc/measures/effect_size/log_dice.svg)|✔|✖️ Log Ratio
([Hardie, 2014](#ref-hardie-2014))|![Formula](/doc/measures/effect_size/log_ratio.svg)|✔|✔
MI.log-f
([Kilgarriff & Tugwell, 2002](#ref-kilgarriff-tugwell-2002); [Lexical Computing Ltd., 2015, p. 4](#ref-lexical-computing-ltd-2015))|![Formula](/doc/measures/effect_size/mi_log_f.svg)|✔|✖️ -Minimum sensitivity
([Pedersen, 1998](#ref-pedersen-1998))|![Formula](/doc/measures/effect_size/min_sensitivity.svg)|✔|✖️ +Minimum sensitivity
([Pedersen & Bruce, 1996](#ref-pedersen-bruce-1996))|![Formula](/doc/measures/effect_size/min_sensitivity.svg)|✔|✖️ Mutual Expectation
([Dias et al., 1999](#ref-dias-et-al-1999))|![Formula](/doc/measures/effect_size/me.svg)|✔|✖️ Mutual information
([Dunning, 1998, pp. 49–52](#ref-dunning-1998); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/mi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Mutual Information → Base of logarithm**.|✔|✔ Mutual information (normalized)
([Bouma, 2009](#ref-bouma-2009); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/nmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Mutual Information (Normalized) → Base of logarithm**.|✔|✔ +μ-value
([Evert, 2005, p. 54](#ref-evert-2005))|![Formula](/doc/measures/effect_size/mu_val.svg)|✔|✖️ Odds ratio
([Pecina, 2005, p. 15](#ref-pecina-2005); [Pojanapunya & Todd, 2016](#ref-pojanapunya-todd-2016))|![Formula](/doc/measures/effect_size/odds_ratio.svg)|✔|✔
%DIFF
([Gabrielatos & Marchi, 2011](#ref-gabrielatos-marchi-2011))|![Formula](/doc/measures/effect_size/pct_diff.svg)|✖️|✔ Pointwise mutual information
([Church & Hanks, 1990](#ref-church-hanks-1990); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/pmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information → Base of logarithm**.|✔|✔ @@ -1644,6 +1648,8 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction 1. [**^**](#ref-osman) El-Haj, M., & Rayson, P. (2016). OSMAN: A novel Arabic readability metric. In N. Calzolari, K. Choukri, T. Declerck, S. Goggi, M. Grobelnik, B. Maegaard, J. Mariani, H. Mazo, A. Moreno, J. Odijk, & S. Piperidis (Eds.), *Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)* (pp. 250–255). European Language Resources Association. http://www.lrec-conf.org/proceedings/lrec2016/index.html 1. [**^**](#ref-engwalls-fm) Engwall, G. (1974). *Fréquence et distribution du vocabulaire dans un choix de romans français* [Unpublished doctoral dissertation]. Stockholm University. + +1. [**^**](#ref-mu-val) Evert, S. (2005). *The statistics of word cooccurrences: Word pairs and collocations* [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der Universität Stuttgart. https://doi.org/10.18419/opus-2556 1. [**^**](#ref-elf) Fang, I. E. (1966). The easy listening formula. *Journal of Broadcasting*, *11*(1), 63–68. https://doi.org/10.1080/08838156609363529 @@ -1743,8 +1749,8 @@ Linguistic Computing Bulletin*, *7*(2), 172–177. 1. [**^**](#ref-re) Partiko, Z. V. (2001). *Zagal’ne redaguvannja. Normativni osnovi.* Afiša. 1. [**^**](#ref-fishers-exact-test) Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), *Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference* (pp. 188–200). The South–Central Regional SAS Users' Group. - -1. [**^**](#ref-min-sensitivity) Pedersen, T. (1998). Dependent bigram identification. In *Proceedings of the Fifteenth National Conference on Artificial Intelligence* (p. 1197). AAAI Press. + +1. [**^**](#ref-min-sensitivity) Pedersen, T., & Bruce, R. (1996). What to infer from a description. In *Technical report 96-CSE-04*. Southern Methodist University. 1. [**^**](#ref-odds-ratio) Pecina, P. (2005). An extensive empirical study of collocation extraction methods. In C. Callison-Burch & S. Wan (Eds.), *Proceedings of the Student Research Workshop* (pp. 13–18). Association for Computational Linguistics. diff --git a/doc/measures/effect_size/mu_val.svg b/doc/measures/effect_size/mu_val.svg new file mode 100644 index 000000000..bb2a9a730 --- /dev/null +++ b/doc/measures/effect_size/mu_val.svg @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/tests_measures/test_measures_effect_size.py b/tests/tests_measures/test_measures_effect_size.py index e899ee6c9..ada4ad1a1 100644 --- a/tests/tests_measures/test_measures_effect_size.py +++ b/tests/tests_measures/test_measures_effect_size.py @@ -143,17 +143,17 @@ def test_log_ratio(): def test_mi_log_f(): assert_zeros(wl_measures_effect_size.mi_log_f) -# Reference: Pedersen, T. (1998). Dependent bigram identification. In Proceedings of the Fifteenth National Conference on Artificial Intelligence (p. 1197). AAAI Press. +# Reference: Pedersen, T., & Bruce, R. (1996). What to infer from a description. In Technical report 96-CSE-04. Southern Methodist University. | p. 
12 def test_min_sensitivity(): numpy.testing.assert_array_equal( numpy.round(wl_measures_effect_size.min_sensitivity( main, - numpy.array([17] * 2), - numpy.array([240] * 2), - numpy.array([1001] * 2), - numpy.array([1298742] * 2) - ), 3), - numpy.array([0.017] * 2) + numpy.array([17, 10, 0]), + numpy.array([240, 0, 10]), + numpy.array([1001, 0, 10]), + numpy.array([1298742, 90, 80]) + ), 6), + numpy.array([0.016699, 1, 0]) ) assert_zeros(wl_measures_effect_size.min_sensitivity) @@ -191,6 +191,21 @@ def test_nmi(): assert_zeros(wl_measures_effect_size.nmi) +# Reference: Evert, S. (2005). The statistics of word cooccurrences: Word pairs and collocations [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der Universität Stuttgart. https://doi.org/10.18419/opus-2556 | p. 54 +def test_mu_val(): + numpy.testing.assert_array_equal( + wl_measures_effect_size.mu_val( + main, + numpy.array([1] * 2), + numpy.array([9] * 2), + numpy.array([9] * 2), + numpy.array([81] * 2) + ), + numpy.array([1] * 2) + ) + + assert_zeros(wl_measures_effect_size.mu_val) + # Reference: Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. Corpus Linguistics and Linguistic Theory, 15(1), pp. 133–167. https://doi.org/10.1515/cllt-2015-0030 | p. 154 def test_odds_ratio(): numpy.testing.assert_array_equal( @@ -307,6 +322,7 @@ def test_squared_phi_coeff(): test_me() test_mi() test_nmi() + test_mu_val() test_odds_ratio() test_pct_diff() test_pmi() diff --git a/wordless/wl_measures/wl_measures_effect_size.py b/wordless/wl_measures/wl_measures_effect_size.py index d0e657f8b..d11b09ebb 100644 --- a/wordless/wl_measures/wl_measures_effect_size.py +++ b/wordless/wl_measures/wl_measures_effect_size.py @@ -122,7 +122,7 @@ def mi_log_f(main, o11s, o12s, o21s, o22s): return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 2, e11s)) * wl_measure_utils.numpy_log(o11s + 1) # Minimum sensitivity -# Reference: Pedersen, T. (1998). Dependent bigram identification. In Proceedings of the Fifteenth National Conference on Artificial Intelligence (p. 1197). AAAI Press. +# Reference: Pedersen, T., & Bruce, R. (1996). What to infer from a description. In Technical report 96-CSE-04. Southern Methodist University. def min_sensitivity(main, o11s, o12s, o21s, o22s): o1xs, _, ox1s, _ = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) @@ -176,6 +176,13 @@ def nmi(main, o11s, o12s, o21s, o22s): -(joint_entropy_11 + joint_entropy_12 + joint_entropy_21 + joint_entropy_22) ) +# μ-value +# Reference: Evert, S. (2005). The statistics of word cooccurrences: Word pairs and collocations [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der Universität Stuttgart. https://doi.org/10.18419/opus-2556 | p. 54 +def mu_val(main, o11s, o12s, o21s, o22s): + e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s) + + return wl_measure_utils.numpy_divide(o11s, e11s) + # Odds ratio # Reference: Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. Corpus Linguistics and Linguistic Theory, 15(1), 133–167. 
https://doi.org/10.1515/cllt-2015-0030 def odds_ratio(main, o11s, o12s, o21s, o22s): diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index da58379e5..5292ec58a 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -3606,6 +3606,7 @@ def init_settings_global(): _tr('wl_settings_global', 'Mutual Expectation'): 'me', _tr('wl_settings_global', 'Mutual information'): 'mi', _tr('wl_settings_global', 'Mutual information (normalized)'): 'nmi', + _tr('wl_settings_global', 'μ-value'): 'mu_val', _tr('wl_settings_global', 'Odds ratio'): 'or', '%DIFF': 'pct_diff', _tr('wl_settings_global', 'Pointwise mutual information'): 'pmi', @@ -3933,6 +3934,13 @@ def init_settings_global(): 'keyword': True }, + 'mu_val': { + 'col_text': 'μ-value', + 'func': wl_measures_effect_size.mu_val, + 'collocation': True, + 'keyword': False + }, + 'or': { 'col_text': 'OR', 'func': wl_measures_effect_size.odds_ratio,
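For reference, the μ-value this patch adds is simply the observed frequency of the word pair divided by its expected frequency (O11 / E11). The sketch below is a minimal, standalone illustration of that computation; it assumes the usual contingency-table expectation E11 = (O11 + O12) × (O11 + O21) / N and plain NumPy instead of Wordless's get_freqs_expected and wl_measure_utils helpers, so it only mirrors the mu_val function added above rather than reproducing it. The helper name mu_value_sketch is hypothetical.

```python
# Minimal sketch of the μ-value (Evert, 2005, p. 54): μ = O11 / E11.
# Illustration only: the patch's actual mu_val uses
# wl_measures_statistical_significance.get_freqs_expected and
# wl_measure_utils.numpy_divide, which handle the zero cases centrally.
import numpy

def mu_value_sketch(o11s, o12s, o21s, o22s):
    # Sample sizes and expected frequencies of the (word 1, word 2) cell,
    # assuming the usual E11 = (O11 + O12) * (O11 + O21) / N.
    oxxs = o11s + o12s + o21s + o22s
    e11s = (o11s + o12s) * (o11s + o21s) / oxxs

    # Return 0 where E11 is 0 instead of dividing by zero (assumed convention).
    return numpy.divide(o11s, e11s, out=numpy.zeros_like(o11s, dtype=float), where=e11s > 0)

# Same figures as test_mu_val above: E11 = 10 * 10 / 100 = 1, so μ = 1.
print(mu_value_sketch(
    numpy.array([1.0]),
    numpy.array([9.0]),
    numpy.array([9.0]),
    numpy.array([81.0])
))  # [1.]
```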