From ea4e9589894baee9409c5ffb6b7735aa3b16c9ff Mon Sep 17 00:00:00 2001 From: BLKSerene Date: Tue, 5 Nov 2024 18:50:46 +0800 Subject: [PATCH] Measures: Add effect size - relative risk --- CHANGELOG.md | 2 +- doc/doc.md | 65 +++++++++++-------- .../effect_size/conditional_probability.svg | 8 +-- doc/measures/effect_size/delta_p.svg | 16 ++--- doc/measures/effect_size/odds_ratio.svg | 46 ------------- doc/measures/effect_size/or.svg | 33 ++++++++++ doc/measures/effect_size/rr.svg | 33 ++++++++++ .../test_measures_effect_size.py | 10 ++- wordless/wl_colligation_extractor.py | 6 +- wordless/wl_collocation_extractor.py | 6 +- .../wl_measures/wl_measures_effect_size.py | 15 +++-- wordless/wl_settings/wl_settings_global.py | 9 +++ 12 files changed, 151 insertions(+), 98 deletions(-) delete mode 100644 doc/measures/effect_size/odds_ratio.svg create mode 100644 doc/measures/effect_size/or.svg create mode 100644 doc/measures/effect_size/rr.svg diff --git a/CHANGELOG.md b/CHANGELOG.md index 80f99acb6..753c014a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,7 +20,7 @@ ## [3.6.0](https://github.com/BLKSerene/Wordless/releases/tag/3.6.0) - ??/??/2024 ### πŸŽ‰ New Features -- Measures: Add effect size - conditional probability / Ξ”P / mutual information (normalized) / ΞΌ-value / pointwise mutual information (normalized) / squared association ratio +- Measures: Add effect size - conditional probability / Ξ”P / mutual information (normalized) / mutual information (squared) / ΞΌ-value / pointwise mutual information (normalized) / relative risk - Settings: Add Settings - Measures - Effect Size - Mutual Information / Pointwise Mutual Information / Pointwise Mutual Information (Cubic) / Pointwise Mutual Information (Squared) - Utils: Add Stanza's Sindhi dependency parser diff --git a/doc/doc.md b/doc/doc.md index 1be159a4f..69786f273 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -1383,49 +1383,52 @@ Measure of Dispersion (Distance-based)|Measure of Adjusted Frequency (Distance-b 
#### [12.4.4 Tests of Statistical Significance, Measures of Bayes Factor, and Measures of Effect Size](#doc) -In order to calculate the statistical significance, Bayes factor, and effect size (except **Mann-Whitney U test**, **Student's t-test (2-sample)**, and **Welch's t-test**) for two words in the same file (collocates) or for one specific word in two different files (keywords), two contingency tables must be constructed first, one for observed values, the other for expected values. +In order to calculate the test statistics of tests of statistical significance, Bayes factors, and effect sizes (except **Mann-Whitney U test** and **Student's t-test (2-sample)**) for potential collocations in *Collocation Extractor* and *Colligation Extractor* and potential keywords in *Keyword Extractor*, two contingency tables must be constructed first. One for observed values and the other for expected values. -As for collocates (in *Collocation Extractor* and *Colligation Extractor*): +As for potential collocations in *Collocation Extractor* and *Colligation Extractor*: -Observed Values|*Word 1* |Not *Word 1* |Row Total +Observed Values|*Word 2* |Not *Word 2* |Row Total --------------:|:-----------------:|:-----------------:|:---------------------------------: -*Word 2* |O₁₁ |O₁₂ |O₁ₓ = *O₁₁* + *O₁₂* -Not *Word 2* |O₂₁ |Oβ‚‚β‚‚ |Oβ‚‚β‚“ = *O₂₁* + *Oβ‚‚β‚‚* +*Word 1* |O₁₁ |O₁₂ |O₁ₓ = *O₁₁* + *O₁₂* +Not *Word 1* |O₂₁ |Oβ‚‚β‚‚ |Oβ‚‚β‚“ = *O₂₁* + *Oβ‚‚β‚‚* Column Total |Oₓ₁ = *O₁₁* + *O₂₁*|Oβ‚“β‚‚ = *O₁₂* + *Oβ‚‚β‚‚*|Oβ‚“β‚“ = *O₁₁* + *O₁₂* + *O₂₁* + *Oβ‚‚β‚‚* -Expected Values|*Word 1* |Not *Word 1* +Expected Values|*Word 2* |Not *Word 2* --------------:|:-------------------:|:-------------------: -*Word 2* |![E₁₁](/doc/e_11.svg)|![E₁₂](/doc/e_12.svg) -Not *Word 2* |![E₂₁](/doc/e_21.svg)|![Eβ‚‚β‚‚](/doc/e_22.svg) +*Word 1* |![E₁₁](/doc/e_11.svg)|![E₁₂](/doc/e_12.svg) +Not *Word 1* |![E₂₁](/doc/e_21.svg)|![Eβ‚‚β‚‚](/doc/e_22.svg) O₁₁: Number of occurrences of *Word 1* followed by 
*Word 2*.
O₁₂: Number of occurrences of *Word 1* followed by any word except *Word 2*.
O₂₁: Number of occurrences of any word except *Word 1* followed by *Word 2*.
-O₂₂: Number of occurrences of any word except *Word 1* followed by any word except *Word 2*. +O₂₂: Number of occurrences of any word except *Word 1* followed by any word except *Word 2*.
+O₁ₓ: Total frequency of *Word 1* in the corpus.
+Oₓ₁: Total frequency of *Word 2* in the corpus.
+Oβ‚“β‚“: Size of the corpus -As for keywords (in *Keyword Extractor*): +As for potential keywords in *Keyword Extractor*: -Observed Values|Observed File |Reference File |Row Total +Observed Values|Observed Corpus |Reference Corpus |Row Total --------------:|:-----------------:|:-----------------:|:---------------------------------: *Word w* |O₁₁ |O₁₂ |O₁ₓ = *O₁₁* + *O₁₂* *Not Word w* |O₂₁ |Oβ‚‚β‚‚ |Oβ‚‚β‚“ = *O₂₁* + *Oβ‚‚β‚‚* Column Total |Oₓ₁ = *O₁₁* + *O₂₁*|Oβ‚“β‚‚ = *O₁₂* + *Oβ‚‚β‚‚*|Oβ‚“β‚“ = *O₁₁* + *O₁₂* + *O₂₁* + *Oβ‚‚β‚‚* -Expected Values|Observed File |Reference File +Expected Values|Observed Corpus |Reference Corpus --------------:|:-------------------:|:-------------------: *Word w* |![E₁₁](/doc/e_11.svg)|![E₁₂](/doc/e_12.svg) *Not Word w* |![E₂₁](/doc/e_21.svg)|![Eβ‚‚β‚‚](/doc/e_22.svg) -O₁₁: Number of occurrences of *Word w* in the observed file.
-O₁₂: Number of occurrences of *Word w* in the reference file.
-O₂₁: Number of occurrences of all words except *Word w* in the observed file.
-O₂₂: Number of occurrences of all words except *Word w* in the reference file. +O₁₁: Number of occurrences of *Word w* in the observed corpus.
+O₁₂: Number of occurrences of *Word w* in the reference corpus.
+O₂₁: Number of occurrences of all words except *Word w* in the observed corpus.
+O₂₂: Number of occurrences of all words except *Word w* in the reference corpus.
+Oₓ₁: Size of the observed corpus.
+Oₓ₂: Size of the reference corpus. -To conduct **Mann-Whitney U test**, **Student's t-test (2-sample)**, and **Welch's t-test** on a specific word, each column total is first divided into **n** (5 by default) sub-sections respectively. To be more specific, in *Collocation Extractor* and *Colligation Extractor*, all collocates where Word 1 appears as node and the other collocates where Word 1 does not appear as node are divided into **n** parts respectively. And in *Keyword Extractor*, all tokens in the observed file and all tokens in the reference files are equally divided into **n** parts respectively. +To conduct **Mann-Whitney U test** and **Student's t-test (2-sample)** on a potential keyword, all tokens in the observed corpus and reference corpus are equally divided into **n** parts respectively. The frequencies of *Word w* in **n** sub-sections in the observed corpus and reference corpus are counted and denoted by **F₁₁**, **F₂₁**, **F₃₁**, ..., **Fₙ₁** and **F₁₂**, **F₂₂**, **F₃₂**, ..., **Fₙ₂** respectively. The total frequencies of *Word w* in the observed corpus and reference corpus are denoted by **Fₓ₁** and **Fₓ₂** respectively. The mean values of the frequencies of *Word w* over **n** sub-sections in the observed corpus and reference corpus are denoted by ![f_x1_bar](/doc/measures/f_x1_bar.svg) and ![f_x2_bar](/doc/measures/f_x2_bar.svg) respectively. -The frequencies of *Word 2* (in *Collocation Extractor* and *Colligation Extractor*) or *Word w* (in *Keyword Extractor*) in each sub-section of the 2 column totals are counted and denoted by **F₁₁**, **F₂₁**, **F₃₁**, ..., **Fₙ₁**, and **F₁₂**, **F₂₂**, **F₃₂**, ..., **Fₙ₂** respectively. The total frequency of *Word 2* (in *Collocation Extractor* and *Colligation Extractor*) or *Word w* (in *Keyword Extractor*) in the 2 column totals are denoted by **Fₓ₁** and **Fₓ₂** respectively.
The mean value of the frequencies over all sub-sections in the 2 column totals are denoted by ![f_x1_bar](/doc/measures/f_x1_bar.svg) and ![f_x2_bar](/doc/measures/f_x2_bar.svg) respectively. - -Then the test statistic, Bayes factor, and effect size are calculated as follows: +Then the test statistics, Bayes factors, and effect sizes are calculated as follows: @@ -1554,13 +1560,14 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction Mutual information
([Dunning, 1998, pp. 49–52](#ref-dunning-1998); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/mi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Mutual Information → Base of logarithm**.|✔|✔ Mutual information (normalized)
([Bouma, 2009](#ref-bouma-2009); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/nmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Mutual Information (Normalized) → Base of logarithm**.|✔|✔ μ-value
([Evert, 2005, p. 54](#ref-evert-2005))|![Formula](/doc/measures/effect_size/mu_val.svg)|✔|✖️ -Odds ratio
([Pecina, 2005, p. 15](#ref-pecina-2005), [Pojanapunya & Todd, 2016](#ref-pojanapunya-todd-2016))|![Formula](/doc/measures/effect_size/odds_ratio.svg)|✔|✔ +Odds ratio
([Pecina, 2005, p. 15](#ref-pecina-2005); [Pojanapunya & Todd, 2016](#ref-pojanapunya-todd-2016))|![Formula](/doc/measures/effect_size/or.svg)|✔|✔ %DIFF
([Gabrielatos & Marchi, 2011](#ref-gabrielatos-marchi-2011))|![Formula](/doc/measures/effect_size/pct_diff.svg)|✖️|✔ Pointwise mutual information
([Church & Hanks, 1990](#ref-church-hanks-1990); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/pmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information → Base of logarithm**.|✔|✔ Pointwise mutual information (cubic)¹
([Daille, 1994, p. 139](#ref-daille-1994); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/im3.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information (Cubic) → Base of logarithm**.|✔|✔ Pointwise mutual information (normalized)
([Bouma, 2009](#ref-bouma-2009); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/npmi.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information (Normalized) → Base of logarithm**.|✔|✔ Pointwise mutual information (squared)¹
([Daille, 1995, p. 21](#ref-daille-1995); [Kilgarriff, 2001, pp. 104–105](#ref-kilgarriff-2001))|![Formula](/doc/measures/effect_size/im2.svg)
where **base** is the base of the logarithm, whose value could be modified via **Menu Bar → Preferences → Settings → Measures → Effect Size → Pointwise Mutual Information (Squared) → Base of logarithm**.|✔|✔ Poisson collocation measure
([Quasthoff & Wolff, 2002](#ref-quasthoff-wolff-2002))|![Formula](/doc/measures/effect_size/poisson_collocation_measure.svg)|✔|✖️ +Relative risk
([Evert, 2005, p. 55](#ref-evert-2005); [Gries, 2010, p. 276](#ref-gries-2010))|![Formula](/doc/measures/effect_size/rr.svg)|✔|✔ Squared phi coefficient
([Church & Gale, 1991](#ref-church-gale-1991))|![Formula](/doc/measures/effect_size/squared_phi_coeff.svg)|βœ”|βœ–οΈ > [!NOTE] @@ -1649,7 +1656,7 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction 1. [**^**](#ref-engwalls-fm) Engwall, G. (1974). *FrΓ©quence et distribution du vocabulaire dans un choix de romans franΓ§ais* [Unpublished doctoral dissertation]. Stockholm University. -1. [**^**](#ref-mu-val) Evert, S. (2005). *The statistics of word cooccurrences: Word pairs and collocations* [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der UniversitΓ€t Stuttgart. https://doi.org/10.18419/opus-2556 +1. [**^**](#ref-mu-val) [**^**](#ref-rr) Evert, S. (2005). *The statistics of word cooccurrences: Word pairs and collocations* [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der UniversitΓ€t Stuttgart. https://doi.org/10.18419/opus-2556 1. [**^**](#ref-elf) Fang, I. E. (1966). The easy listening formula. *Journal of Broadcasting*, *11*(1), 63–68. https://doi.org/10.1080/08838156609363529 @@ -1670,6 +1677,8 @@ Measure of Effect Size|Formula|Collocation Extraction|Keyword Extraction 1. [**^**](#ref-pct-diff) Gabrielatos, C., & Marchi, A. (2011, November 5). *Keyness: Matching metrics to definitions* [Conference session]. Corpus Linguistics in the South 1, University of Portsmouth, United Kingdom. https://eprints.lancs.ac.uk/id/eprint/51449/4/Gabrielatos_Marchi_Keyness.pdf 1. [**^**](#ref-griess-dp) Gries, S. T. (2008). Dispersions and adjusted frequencies in corpora. *International Journal of Corpus Linguistics*, *13*(4), 403–437. https://doi.org/10.1075/ijcl.13.4.02gri + +1. [**^**](#ref-rr) Gries, S. T. (2010). Useful statistics for corpus linguistics. In A. SΓ‘nchez PΓ©rez & M. Almela SΓ‘nchez (Eds.), A mosaic of corpus linguistics: Selected papers (pp. 269–291). Peter Lang. 1. [**^**](#ref-delta-p) Gries, S. T. (2013). 
50-something years of work on collocations: What is or should be next …. *International Journal of Corpus Linguistics*, *18*(1), 137–165. https://doi.org/10.1075/ijcl.18.1.09gri @@ -1752,11 +1761,11 @@ Linguistic Computing Bulletin*, *7*(2), 172–177. 1. [**^**](#ref-min-sensitivity) Pedersen, T., & Bruce, R. (1996). What to infer from a description. In *Technical report 96-CSE-04*. Southern Methodist University. -1. [**^**](#ref-odds-ratio) Pecina, P. (2005). An extensive empirical study of collocation extraction methods. In C. Callison-Burch & S. Wan (Eds.), *Proceedings of the Student Research Workshop* (pp. 13–18). Association for Computational Linguistics. +1. [**^**](#ref-or) Pecina, P. (2005). An extensive empirical study of collocation extraction methods. In C. Callison-Burch & S. Wan (Eds.), *Proceedings of the Student Research Workshop* (pp. 13–18). Association for Computational Linguistics. 1. [**^**](#ref-fog-index) Pisarek, W. (1969). Jak mierzyΔ‡ zrozumiaΕ‚oΕ›Δ‡ tekstu? *Zeszyty Prasoznawcze*, *4*(42), 35–48. -1. [**^**](#ref-odds-ratio) Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. *Corpus Linguistics and Linguistic Theory*, *15*(1), 133–167. https://doi.org/10.1515/cllt-2015-0030 +1. [**^**](#ref-or) Pojanapunya, P., & Todd, R. W. (2016). Log-likelihood and odds ratio keyness statistics for different purposes of keyword analysis. *Corpus Linguistics and Linguistic Theory*, *15*(1), 133–167. https://doi.org/10.1515/cllt-2015-0030 1. [**^**](#ref-popescu-macutek-altmanns-b1-b2-b3-b4-b5) Popescu I.-I., Mačutek, J, & Altmann, G. (2008). Word frequency and arc length. *Glottometrics*, *17*, 18–42. 
diff --git a/doc/measures/effect_size/conditional_probability.svg b/doc/measures/effect_size/conditional_probability.svg index 547f7ed3e..f2e11e883 100644 --- a/doc/measures/effect_size/conditional_probability.svg +++ b/doc/measures/effect_size/conditional_probability.svg @@ -2,14 +2,14 @@ - + - + @@ -19,8 +19,8 @@ - - + + diff --git a/doc/measures/effect_size/delta_p.svg b/doc/measures/effect_size/delta_p.svg index b8333e28f..5875a645a 100644 --- a/doc/measures/effect_size/delta_p.svg +++ b/doc/measures/effect_size/delta_p.svg @@ -2,14 +2,14 @@ + - - + @@ -20,15 +20,15 @@ - - + + - - + + - - + + \ No newline at end of file diff --git a/doc/measures/effect_size/odds_ratio.svg b/doc/measures/effect_size/odds_ratio.svg deleted file mode 100644 index 768aa7423..000000000 --- a/doc/measures/effect_size/odds_ratio.svg +++ /dev/null @@ -1,46 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/doc/measures/effect_size/or.svg b/doc/measures/effect_size/or.svg new file mode 100644 index 000000000..fccf2d930 --- /dev/null +++ b/doc/measures/effect_size/or.svg @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/effect_size/rr.svg b/doc/measures/effect_size/rr.svg new file mode 100644 index 000000000..d3d1b935e --- /dev/null +++ b/doc/measures/effect_size/rr.svg @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/tests_measures/test_measures_effect_size.py b/tests/tests_measures/test_measures_effect_size.py index ada4ad1a1..c54f6fbd0 100644 --- a/tests/tests_measures/test_measures_effect_size.py +++ b/tests/tests_measures/test_measures_effect_size.py @@ -41,8 +41,8 @@ def test_conditional_probability(): numpy.round(wl_measures_effect_size.conditional_probability( main, numpy.array([28, 28]), - numpy.array([8002, 15740]), 
numpy.array([15740, 8002]), + numpy.array([8002, 15740]), numpy.array([97596164, 97596164]) ), 3), numpy.array([0.178, 0.349]) @@ -56,8 +56,8 @@ def test_delta_p(): numpy.round(wl_measures_effect_size.delta_p( main, numpy.array([5610, 5610]), - numpy.array([2257, 168938]), numpy.array([168938, 2257]), + numpy.array([2257, 168938]), numpy.array([10233063, 10233063]) ), 3), numpy.array([0.032, 0.697]) @@ -293,7 +293,10 @@ def test_im2(): def test_poisson_collocation_measure(): assert_zeros(wl_measures_effect_size.poisson_collocation_measure) -# Reference: Church, K. W., & Gale, W. A. (1991, September 29–October 1). Concordances for parallel text [Paper presentation]. Using Corpora: Seventh Annual Conference of the UW Centre for the New OED and Text Research, St. Catherine's College, Oxford, United Kingdom. +def test_rr(): + assert_zeros(wl_measures_effect_size.rr) + +# Reference: Church, K. W., & Gale, W. A. (1991, September 29–October 1). Concordances for parallel text [Paper presentation]. Using Corpora: Seventh Annual Conference of the UW Centre for the New OED and Text Research, St. Catherine's College, Oxford, United Kingdom. | p. 
12 def test_squared_phi_coeff(): numpy.testing.assert_array_equal( numpy.round(wl_measures_effect_size.squared_phi_coeff( @@ -330,4 +333,5 @@ def test_squared_phi_coeff(): test_npmi() test_im2() test_poisson_collocation_measure() + test_rr() test_squared_phi_coeff() diff --git a/wordless/wl_colligation_extractor.py b/wordless/wl_colligation_extractor.py index 6d31aaf82..5f17d51ff 100644 --- a/wordless/wl_colligation_extractor.py +++ b/wordless/wl_colligation_extractor.py @@ -1151,16 +1151,18 @@ def run(self): ox1s = collections.Counter() oxxs = {} + # Total frequencies of the node and collocate for ngram_size, colligations_freqs in colligations_freqs_file_all.items(): o1xs[ngram_size] = collections.Counter() ox1s[ngram_size] = collections.Counter() for (node, collocate), freq in colligations_freqs.items(): - o1xs[ngram_size][collocate] += freq - ox1s[ngram_size][node] += freq + o1xs[ngram_size][node] += freq + ox1s[ngram_size][collocate] += freq oxxs[ngram_size] = sum(colligations_freqs.values()) + # Observed values o11s = numpy.empty(shape = num_colligations_all, dtype = float) o12s = numpy.empty(shape = num_colligations_all, dtype = float) o21s = numpy.empty(shape = num_colligations_all, dtype = float) diff --git a/wordless/wl_collocation_extractor.py b/wordless/wl_collocation_extractor.py index 65ae8e281..2e25a3ad0 100644 --- a/wordless/wl_collocation_extractor.py +++ b/wordless/wl_collocation_extractor.py @@ -1149,16 +1149,18 @@ def run(self): ox1s = {} oxxs = {} + # Total frequencies of the node and collocate for ngram_size, collocations_freqs in collocations_freqs_file_all.items(): o1xs[ngram_size] = collections.Counter() ox1s[ngram_size] = collections.Counter() for (node, collocate), freq in collocations_freqs.items(): - o1xs[ngram_size][collocate] += freq - ox1s[ngram_size][node] += freq + o1xs[ngram_size][node] += freq + ox1s[ngram_size][collocate] += freq oxxs[ngram_size] = sum(collocations_freqs.values()) + # Observed values o11s = numpy.empty(shape = 
num_collocations_all, dtype = float) o12s = numpy.empty(shape = num_collocations_all, dtype = float) o21s = numpy.empty(shape = num_collocations_all, dtype = float) diff --git a/wordless/wl_measures/wl_measures_effect_size.py b/wordless/wl_measures/wl_measures_effect_size.py index d11b09ebb..be1b105bc 100644 --- a/wordless/wl_measures/wl_measures_effect_size.py +++ b/wordless/wl_measures/wl_measures_effect_size.py @@ -36,16 +36,16 @@ def get_numpy_log(main, measure_code): # Conditional probability # Reference: Durrant, P. (2008). High frequency collocations and second language learning [Doctoral dissertation, University of Nottingham]. Nottingham eTheses. https://eprints.nottingham.ac.uk/10622/1/final_thesis.pdf | p. 84 def conditional_probability(main, o11s, o12s, o21s, o22s): - _, _, ox1s, _ = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) + o1xs, _, _, _ = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) - return wl_measure_utils.numpy_divide(o11s, ox1s) * 100 + return wl_measure_utils.numpy_divide(o11s, o1xs) * 100 # Ξ”P # Reference: Gries, S. T. (2013). 50-something years of work on collocations: What is or should be next …. International Journal of Corpus Linguistics, 18(1), 137–165. https://doi.org/10.1075/ijcl.18.1.09gri def delta_p(main, o11s, o12s, o21s, o22s): - _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) + o1xs, o2xs, _, _ = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) - return wl_measure_utils.numpy_divide(o11s, ox1s) - wl_measure_utils.numpy_divide(o12s, ox2s) + return wl_measure_utils.numpy_divide(o11s, o1xs) - wl_measure_utils.numpy_divide(o21s, o2xs) # Dice-SΓΈrensen coefficient # Reference: Smadja, F., McKeown, K. R., & Hatzivassiloglou, V. (1996). Translating collocations for bilingual lexicons: A statistical approach. Computational Linguistics, 22(1), 1–38. | p. 
8 @@ -272,6 +272,13 @@ def poisson_collocation_measure(main, o11s, o12s, o21s, o22s): wl_measure_utils.numpy_log(oxxs) ) +# Relative risk +# Reference: Evert, S. (2005). The statistics of word cooccurrences: Word pairs and collocations [Doctoral dissertation, University of Stuttgart]. OPUS - Online Publikationen der UniversitΓ€t Stuttgart. https://doi.org/10.18419/opus-2556 | p. 55 +def rr(main, o11s, o12s, o21s, o22s): + _, _, ox1s, ox2s = wl_measures_statistical_significance.get_freqs_marginal(o11s, o12s, o21s, o22s) + + return wl_measure_utils.numpy_divide(o11s * ox2s, o12s * ox1s) + # Squared phi coefficient # Reference: Church, K. W., & Gale, W. A. (1991, September 29–October 1). Concordances for parallel text [Paper presentation]. Using Corpora: Seventh Annual Conference of the UW Centre for the New OED and Text Research, St. Catherine's College, Oxford, United Kingdom. def squared_phi_coeff(main, o11s, o12s, o21s, o22s): diff --git a/wordless/wl_settings/wl_settings_global.py b/wordless/wl_settings/wl_settings_global.py index 5292ec58a..55be2f4f5 100644 --- a/wordless/wl_settings/wl_settings_global.py +++ b/wordless/wl_settings/wl_settings_global.py @@ -3614,6 +3614,7 @@ def init_settings_global(): _tr('wl_settings_global', 'Pointwise mutual information (normalized)'): 'npmi', _tr('wl_settings_global', 'Pointwise mutual information (squared)'): 'im2', _tr('wl_settings_global', 'Poisson collocation measure'): 'poisson_collocation_measure', + _tr('wl_settings_global', 'Relative risk'): 'rr', _tr('wl_settings_global', 'Squared phi coefficient'): 'squared_phi_coeff' } }, @@ -3943,6 +3944,7 @@ def init_settings_global(): 'or': { 'col_text': 'OR', + # "or" is a Python keyword 'func': wl_measures_effect_size.odds_ratio, 'collocation': True, 'keyword': True @@ -3990,6 +3992,13 @@ def init_settings_global(): 'keyword': False }, + 'rr': { + 'col_text': 'RR', + 'func': wl_measures_effect_size.rr, + 'collocation': True, + 'keyword': True + }, + 'squared_phi_coeff': { 
'col_text': 'Ο†2', 'func': wl_measures_effect_size.squared_phi_coeff,