From 07ad76106f49189257571e07e50c4acf8e4b7349 Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Tue, 5 Nov 2024 15:21:19 +0800 Subject: [PATCH] =?UTF-8?q?Measures:=20Add=20effect=20size=20-=20=CE=BC-va?= =?UTF-8?q?lue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 +- doc/doc.md | 12 ++++++-- doc/measures/effect_size/mu_val.svg | 22 ++++++++++++++ .../test_measures_effect_size.py | 30 ++++++++++++++----- .../wl_measures/wl_measures_effect_size.py | 9 +++++- wordless/wl_settings/wl_settings_global.py | 8 +++++ 6 files changed, 71 insertions(+), 12 deletions(-) create mode 100644 doc/measures/effect_size/mu_val.svg diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b6a5fefa..80f99acb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ ## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024 ### 🎉 New Features -- Measures: Add effect size - conditional probability / ΔP / mutual information (normalized) / pointwise mutual information (normalized) / squared association ratio +- Measures: Add effect size - conditional probability / ΔP / mutual information (normalized) / μ-value / pointwise mutual information (normalized) / squared association ratio - Settings: Add Settings - Measures - Effect Size - Mutual Information / Pointwise Mutual Information / Pointwise Mutual Information (Cubic) / Pointwise Mutual Information (Squared) - Utils: Add Stanza's Sindhi dependency parser diff --git a/doc/doc.md b/doc/doc.md index a8b96912c..a99e299fd 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -1510,6 +1510,9 @@ Mutual information: Mutual information (normalized): \text{NMI} = \frac{\sum_{i = 1}^2 \sum_{j = 1}^2 \left(\frac{O_{ij}}{O_{xx}} \times \log_{base} \frac{O_{ij}}{E_{ij}}\right)}{-\sum_{i = 1}^2 \sum_{j = 1}^2 \left(\frac{O_{ij}}{O_{xx}} \times \log_{base} \frac{O_{ij}}{O_{xx}}\right)} +μ-value: + \mu = \frac{O_{11}}{E_{11}} + Odds ratio: \text{Odds ratio} = \frac{O_{11} \times O_{22}}{O_{12} \times O_{21}} @@ -1546,10 +1549,11 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction logDice
([Rychlý, 2008, p. 9](#ref-rychly-2008))|![Formula](/doc/measures/effect_size/log_dice.svg)|✔|✖️ Log Ratio
([Hardie, 2014](#ref-hardie-2014))|![Formula](/doc/measures/effect_size/log_ratio.svg)|✔|✔
MI.log-f
([Kilgarriff & Tugwell, 2002](#ref-kilgarriff-tugwell-2002); [Lexical Computing Ltd., 2015, p. 4](#ref-lexical-computing-ltd-2015))|![Formula](/doc/measures/effect_size/mi_log_f.svg)|✔|✖️ -Minimum sensitivity
([Pedersen, 1998](#ref-pedersen-1998))|![Formula](/doc/measures/effect_size/min_sensitivity.svg)|✔|✖️ +Minimum sensitivity
([Pedersen & Bruce, 1996](#ref-pedersen-bruce-1996))|![Formula](/doc/measures/effect_size/min_sensitivity.svg)|✔|✖️ Mutual Expectation
([Dias et al., 1999](#ref-dias-et-al-1999))|![Formula](/doc/measures/effect_size/me.svg)|✔|✖️ Mutual information
([Dunning, 1998, pp. 49–52](#ref-dunning-1998); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/mi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Mutual Information → Base of logarithm**.|✔|✔ Mutual information (normalized)
([Bouma, 2009](#ref-bouma-2009); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/nmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Mutual Information (Normalized) → Base of logarithm**.|✔|✔ +μ-value
([Evert, 2005, p. 54](#ref-evert-2005))|![Formula](/doc/measures/effect_size/mu_val.svg)|✔|✖️ Odds ratio
([Pecina, 2005, p. 15](#ref-pecina-2005); [Pojanapunya & Todd, 2016](#ref-pojanapunya-todd-2016))|![Formula](/doc/measures/effect_size/odds_ratio.svg)|✔|✔
%DIFF
([Gabrielatos & Marchi, 2011](#ref-gabrielatos-marchi-2011))|![Formula](/doc/measures/effect_size/pct_diff.svg)|✖️|✔ Pointwise mutual information
([Church & Hanks, 1990](#ref-church-hanks-1990); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/pmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information → Base of logarithm**.|✔|✔ @@ -1644,6 +1648,8 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction 1. [**^**](#ref-osman) El-Haj, M., & Rayson, P. (2016). OSMAN: A novel Arabic readability metric. In N. Calzolari, K. Choukri, T. Declerck, S. Goggi, M. Grobelnik, B. Maegaard, J. Mariani, H. Mazo, A. Moreno, J. Odijk, & S. Piperidis (Eds.), *Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)* (pp. 250–255). European Language Resources Association. http://www.lrec-conf.org/proceedings/lrec2016/index.html 1. [**^**](#ref-engwalls-fm) Engwall, G. (1974). *Fréquence et distribution du vocabulaire dans un choix de romans français* [Unpublished doctoral dissertation]. Stockholm University. + +1. [**^**](#ref-mu-val) Evert, S. (2005). *The statistics of word cooccurrences: Word pairs and collocations* [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der Universität Stuttgart. https://doi.org/10.18419/opus-2556 1. [**^**](#ref-elf) Fang, I. E. (1966). The easy listening formula. *Journal of Broadcasting*, *11*(1), 63–68. https://doi.org/10.1080/08838156609363529 @@ -1743,8 +1749,8 @@ Linguistic Computing Bulletin*, *7*(2), 172–177. 1. [**^**](#ref-re) Partiko, Z. V. (2001). *Zagal’ne redaguvannja. Normativni osnovi.* Afiša. 1. [**^**](#ref-fishers-exact-test) Pedersen, T. (1996). Fishing for exactness. In T. Winn (Ed.), *Proceedings of the Sixth Annual South-Central Regional SAS Users' Group Conference* (pp. 188–200). The South–Central Regional SAS Users' Group. - -1. [**^**](#ref-min-sensitivity) Pedersen, T. (1998). Dependent bigram identification. In *Proceedings of the Fifteenth National Conference on Artificial Intelligence* (p. 1197). AAAI Press. + +1. [**^**](#ref-min-sensitivity) Pedersen, T., & Bruce, R. (1996). What to infer from a description. In *Technical report 96-CSE-04*. Southern Methodist University. 1. [**^**](#ref-odds-ratio) Pecina, P. (2005). An extensive empirical study of collocation extraction methods. In C. Callison-Burch & S. Wan (Eds.), *Proceedings of the Student Research Workshop* (pp. 13–18). Association for Computational Linguistics. diff --git a/doc/measures/effect_size/mu_val.svg b/doc/measures/effect_size/mu_val.svg new file mode 100644 index 000000000..bb2a9a730 --- /dev/null +++ b/doc/measures/effect_size/mu_val.svg @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/tests_measures/test_measures_effect_size.py b/tests/tests_measures/test_measures_effect_size.py index e899ee6c9..ada4ad1a1 100644 --- a/tests/tests_measures/test_measures_effect_size.py +++ b/tests/tests_measures/test_measures_effect_size.py @@ -143,17 +143,17 @@ def test_log_ratio(): def test_mi_log_f(): assert_zeros(wl_measures_effect_size.mi_log_f) -# Reference: Pedersen, T. (1998). Dependent bigram identification. In Proceedings of the Fifteenth National Conference on Artificial Intelligence (p. 1197). AAAI Press. +# Reference: Pedersen, T., & Bruce, R. (1996). What to infer from a description. In Technical report 96-CSE-04. Southern Methodist University. | p. 
12 def test_min_sensitivity(): numpy.testing.assert_array_equal( numpy.round(wl_measures_effect_size.min_sensitivity( main, - numpy.array([17] * 2), - numpy.array([240] * 2), - numpy.array([1001] * 2), - numpy.array([1298742] * 2) - ), 3), - numpy.array([0.017] * 2) + numpy.array([17, 10, 0]), + numpy.array([240, 0, 10]), + numpy.array([1001, 0, 10]), + numpy.array([1298742, 90, 80]) + ), 6), + numpy.array([0.016699, 1, 0]) ) assert_zeros(wl_measures_effect_size.min_sensitivity) @@ -191,6 +191,21 @@ def test_nmi(): assert_zeros(wl_measures_effect_size.nmi) +# Reference: Evert, S. (2005). The statistics of word cooccurrences: Word pairs and collocations [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der Universität Stuttgart. https://doi.org/10.18419/opus-2556 | p. 54 +def test_mu_val(): + numpy.testing.assert_array_equal( + wl_measures_effect_size.mu_val( + main, + numpy.array([1] * 2), + numpy.array([9] * 2), + numpy.array([9] * 2), + numpy.array([81] * 2) + ), + numpy.array([1] * 2) + ) + + assert_zeros(wl_measures_effect_size.mu_val) + # Reference: Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. Corpus Linguistics and Linguistic Theory, 15(1), pp. 133–167. https://doi.org/10.1515/cllt-2015-0030 | p. 154 def test_odds_ratio(): numpy.testing.assert_array_equal( @@ -307,6 +322,7 @@ def test_squared_phi_coeff(): test_me() test_mi() test_nmi() + test_mu_val() test_odds_ratio() test_pct_diff() test_pmi() diff --git a/wordless/wl_measures/wl_measures_effect_size.py b/wordless/wl_measures/wl_measures_effect_size.py index d0e657f8b..d11b09ebb 100644 --- a/wordless/wl_measures/wl_measures_effect_size.py +++ b/wordless/wl_measures/wl_measures_effect_size.py @@ -122,7 +122,7 @@ def mi_log_f(main, o11s, o12s, o21s, o22s): return wl_measure_utils.numpy_log2(wl_measure_utils.numpy_divide(o11s ** 2, e11s)) * wl_measure_utils.numpy_log(o11s + 1) # Minimum sensitivity -# Reference: Pedersen, T. (1998). Dependent bigram identification. In Proceedings of the Fifteenth National Conference on Artificial Intelligence (p. 1197). AAAI Press. +# Reference: Pedersen, T., & Bruce, R. (1996). What to infer from a description. In Technical report 96-CSE-04. Southern Methodist University. def min_sensitivity(main, o11s, o12s, o21s, o22s): o1xs, _, ox1s, _ = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) @@ -176,6 +176,13 @@ def nmi(main, o11s, o12s, o21s, o22s): -(joint_entropy_11 + joint_entropy_12 + joint_entropy_21 + joint_entropy_22) ) +# μ-value +# Reference: Evert, S. (2005). The statistics of word cooccurrences: Word pairs and collocations [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der Universität Stuttgart. https://doi.org/10.18419/opus-2556 | p. 54 +def mu_val(main, o11s, o12s, o21s, o22s): + e11s, _, _, _ = wl_measures_statistical_significance.get_freqs_expected(o11s, o12s, o21s, o22s) + + return wl_measure_utils.numpy_divide(o11s, e11s) + # Odds ratio # Reference: Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. Corpus Linguistics and Linguistic Theory, 15(1), 133–167. 
https://doi.org/10.1515/cllt-2015-0030 def odds_ratio(main, o11s, o12s, o21s, o22s): diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index da58379e5..5292ec58a 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -3606,6 +3606,7 @@ def init_settings_global(): _tr('wl_settings_global', 'Mutual Expectation'): 'me', _tr('wl_settings_global', 'Mutual information'): 'mi', _tr('wl_settings_global', 'Mutual information (normalized)'): 'nmi', + _tr('wl_settings_global', 'μ-value'): 'mu_val', _tr('wl_settings_global', 'Odds ratio'): 'or', '%DIFF': 'pct_diff', _tr('wl_settings_global', 'Pointwise mutual information'): 'pmi', @@ -3933,6 +3934,13 @@ def init_settings_global(): 'keyword': True }, + 'mu_val': { + 'col_text': 'μ-value', + 'func': wl_measures_effect_size.mu_val, + 'collocation': True, + 'keyword': False + }, + 'or': { 'col_text': 'OR', 'func': wl_measures_effect_size.odds_ratio,
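For reference, the μ-value this patch adds is simply the observed frequency of the word pair divided by its expected frequency (O11 / E11). The sketch below is a minimal, standalone illustration of that computation; it assumes the usual contingency-table expectation E11 = (O11 + O12) × (O11 + O21) / N and plain NumPy instead of Wordless's get_freqs_expected and wl_measure_utils helpers, so it only mirrors the mu_val function added above rather than reproducing it. The helper name mu_value_sketch is hypothetical.

```python
# Minimal sketch of the μ-value (Evert, 2005, p. 54): μ = O11 / E11.
# Illustration only: the patch's actual mu_val uses
# wl_measures_statistical_significance.get_freqs_expected and
# wl_measure_utils.numpy_divide, which handle the zero cases centrally.
import numpy

def mu_value_sketch(o11s, o12s, o21s, o22s):
    # Sample sizes and expected frequencies of the (word 1, word 2) cell,
    # assuming the usual E11 = (O11 + O12) * (O11 + O21) / N.
    oxxs = o11s + o12s + o21s + o22s
    e11s = (o11s + o12s) * (o11s + o21s) / oxxs

    # Return 0 where E11 is 0 instead of dividing by zero (assumed convention).
    return numpy.divide(o11s, e11s, out=numpy.zeros_like(o11s, dtype=float), where=e11s > 0)

# Same figures as test_mu_val above: E11 = 10 * 10 / 100 = 1, so μ = 1.
print(mu_value_sketch(
    numpy.array([1.0]),
    numpy.array([9.0]),
    numpy.array([9.0]),
    numpy.array([81.0])
))  # [1.]
```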