diff --git a/CHANGELOG.md b/CHANGELOG.md index d46f9bb49..1c4992b18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ - Work Area: Add Collocation Extractor - Filter results - Node/Collocation length - Work Area: Add Dependency Parser - Filter results - Work Area: Add Dependency Parser - Search Settings - Match dependency relations -- Work Area: Add Profiler - Lexical Diversity - Brunét's Index / Honoré's statistic +- Work Area: Add Profiler - Lexical Density/Diversity - Brunét's Index / Honoré's statistic / Lexical Density ### ✨ Improvements - Settings: Settings - Part-of-speeach Tagging - Tagsets - Mapping Settings - Allow editing of tagset mapping of spaCy's Catalan, Danish, French, Greek (Modern), Macedonian, Norwegian (Bokmål), Portuguese, Russian, Spanish, and Ukrainian part-of-speech taggers diff --git a/doc/doc.md b/doc/doc.md index ed494b28c..6abc872e3 100644 --- a/doc/doc.md +++ b/doc/doc.md @@ -38,7 +38,7 @@ - [4.3 Supported File Encodings](#doc-4-3) - [4.4 Supported Measures](#doc-4-4) - [4.4.1 Readability Formulas](#doc-4-4-1) - - [4.4.2 Indicators of Lexical Diversity](#doc-4-4-2) + - [4.4.2 Indicators of Lexical Density/Diversity](#doc-4-4-2) - [4.4.3 Measures of Dispersion and Adjusted Frequency](#doc-4-4-3) - [4.4.4 Tests of Statistical Significance, Measures of Bayes Factor, and Measures of Effect Size](#doc-4-4-4) - [5 References](#doc-5) @@ -113,7 +113,7 @@ By default, *Wordless* tries to detect the encoding and language settings of all In *Profiler*, you can check and compare general linguistic features of different files. -All statistics are grouped into 5 tables for better readability: Readability, Counts, Lexical Diversity, Lengths, Length Breakdown. +All statistics are grouped into 5 tables for better readability: Readability, Counts, Lexical Density/Diversity, Lengths, Length Breakdown. - **3.1.1 Readability**
Readability statistics of each file calculated according to the different readability tests used. See section [4.4.1 Readability Formulas](#doc-4-4-1) for more details. @@ -163,8 +163,8 @@ All statistics are grouped into 5 tables for better readability: Readability, Co - **3.1.2.14 Count of Characters %**
The percentage of the number of characters in each file out of the total number of characters in all files. -- **3.1.3 Lexical Diversity**
- Statistics of lexical diversity which reflect the the extend to which the vocabulary used in each file varies. See section [4.4.2 Indicators of Lexical Diversity](#doc-4-4-2) for more details. +- **3.1.3 Lexical Density/Diversity**
+ Statistics of lexical density/diversity which reflect the extent to which the vocabulary used in each file varies. See section [4.4.2 Indicators of Lexical Density/Diversity](#doc-4-4-2) for more details. - **3.1.4 Lengths**
- **3.1.4.1 Paragraph Length in Sentences / Sentence Segments / Tokens (Mean)**
@@ -908,10 +908,10 @@ It should be noted that some readability measures are **language-specific**, or The following variables would be used in formulas:
**NumSentences**: Number of sentences
**NumWords**: Number of words
-**NumWords1Syl**: Number of monosyllabic words
-**NumWordsn+Syls**: Number of words with n or more syllables
-**NumWordsn+Ltrs**: Number of words with n or more letters
-**NumWordsn-Ltrs**: Number of words with n or less letters
+**NumWordsSyl₁**: Number of monosyllabic words
+**NumWordsSylsₙ₊**: Number of words with n or more syllables
+**NumWordsLtrsₙ₊**: Number of words with n or more letters
+**NumWordsLtrsₙ₋**: Number of words with n or fewer letters
**NumConjs**: Number of conjunctions
**NumPreps**: Number of prepositions
**NumProns**: Number of pronouns
@@ -959,10 +959,10 @@ Coleman-Liau Index: Coleman's Readability Formula: \begin{align*} - \text{Cloze \; %}_1 &= 1.29 \times \left(\frac{\text{NumWords1Syl}}{\text{NumWords}} \times 100\right) - 38.45 \\ - \text{Cloze \; %}_2 &= 1.16 \times \left(\frac{\text{NumWords1Syl}}{\text{NumWords}} \times 100\right) + 1.48 \times \left(\frac{\text{NumSentences}}{\text{NumWords}} \times 100\right) - 37.95 \\ - \text{Cloze \; %}_3 &= 1.07 \times \left(\frac{\text{NumWords1Syl}}{\text{NumWords}} \times 100\right) + 1.18 \times \left(\frac{\text{NumSentences}}{\text{NumWords}} \times 100\right) + 0.76 \times \left(\frac{\text{NumProns}}{\text{NumWords}} \times 100\right) - 34.02 \\ - \text{Cloze \; %}_4 &= 1.04 \times \left(\frac{\text{NumWords1Syl}}{\text{NumWords}} \times 100\right) + 1.06 \times \left(\frac{\text{NumSentences}}{\text{NumWords}} \times 100\right) + 0.56 \times \left(\frac{\text{NumProns}}{\text{NumWords}} \times 100\right) - 0.36 \times \left(\frac{\text{NumPreps}}{\text{NumWords}} \times 100\right) - 26.01 + \text{Cloze \; %}_1 &= 1.29 \times \left(\frac{\text{NumWordsSyl}_1}{\text{NumWords}} \times 100\right) - 38.45 \\ + \text{Cloze \; %}_2 &= 1.16 \times \left(\frac{\text{NumWordsSyl}_1}{\text{NumWords}} \times 100\right) + 1.48 \times \left(\frac{\text{NumSentences}}{\text{NumWords}} \times 100\right) - 37.95 \\ + \text{Cloze \; %}_3 &= 1.07 \times \left(\frac{\text{NumWordsSyl}_1}{\text{NumWords}} \times 100\right) + 1.18 \times \left(\frac{\text{NumSentences}}{\text{NumWords}} \times 100\right) + 0.76 \times \left(\frac{\text{NumProns}}{\text{NumWords}} \times 100\right) - 34.02 \\ + \text{Cloze \; %}_4 &= 1.04 \times \left(\frac{\text{NumWordsSyl}_1}{\text{NumWords}} \times 100\right) + 1.06 \times \left(\frac{\text{NumSentences}}{\text{NumWords}} \times 100\right) + 0.56 \times \left(\frac{\text{NumProns}}{\text{NumWords}} \times 100\right) - 0.36 \times \left(\frac{\text{NumPreps}}{\text{NumWords}} \times 100\right) - 26.01 \end{align*} Dale-Chall 
Readability Formula: @@ -1014,12 +1014,12 @@ Flesch Reading Ease: Flesch Reading Ease (Farr-Jenkins-Paterson): \begin{align*} - \text{RE} &= 1.599 \times \left(\frac{\text{NumWords1Syl}}{\text{NumWords}} \times 100\right) - 1.015 \times \frac{\text{NumWords}}{\text{NumSentences}} - 31.517 \\ - \text{RE}_\text{Farr-Jenkins-Paterson} &= 8.4335 - 0.0648 \times \left(\frac{\text{NumWords1Syl}}{\text{NumWords}} \times 100\right) + 0.0923 \times \frac{\text{NumWords}}{\text{NumSentences}} + \text{RE} &= 1.599 \times \left(\frac{\text{NumWordsSyl}_1}{\text{NumWords}} \times 100\right) - 1.015 \times \frac{\text{NumWords}}{\text{NumSentences}} - 31.517 \\ + \text{RE}_\text{Farr-Jenkins-Paterson} &= 8.4335 - 0.0648 \times \left(\frac{\text{NumWordsSyl}_1}{\text{NumWords}} \times 100\right) + 0.0923 \times \frac{\text{NumWords}}{\text{NumSentences}} \end{align*} FORCAST Grade Level: - \text{RGL} = 20.43 - 0.11 \times \text{NumWords1Syl} + \text{RGL} = 20.43 - 0.11 \times \text{NumWordsSyl}_1 Fórmula de comprensibilidad de Gutiérrez de Polini: \text{CP} = 95.2 - 9.7 \times \frac{\text{NumCharsAlpha}}{\text{NumWords}} - 0.35 \times \frac{\text{NumWords}}{\text{NumSentences}} @@ -1037,7 +1037,7 @@ Gunning Fog Index: \begin{align*} \text{Fog Index} &= 0.4 \times \left(\frac{\text{NumWords}}{\text{NumSentences}} + \frac{\text{NumHardWords}}{\text{NumWords}} \times 100\right) \\ \text{Fog Index}_\text{Powers-Sumner-Kearl} &= 3.0680 + 0.0877 \times \frac{\text{NumWords}}{\text{NumSentences}} + 0.0984 \times \left(\frac{\text{NumHardWords}}{\text{NumWords}} \times 100\right) \\ - \text{Fog Index}_\text{Navy} &= \frac{\frac{\text{NumWords} + 2 \times \text{NumWords3+Syls}}{\text{NumSentences}} - 3}{2} \\ + \text{Fog Index}_\text{Navy} &= \frac{\frac{\text{NumWords} + 2 \times \text{NumWordsSyls}_{3+}}{\text{NumSentences}} - 3}{2} \\ \text{Fog Index}_\text{Polish} &= \frac{\sqrt{\left(\frac{\text{NumWords}}{\text{NumSentences}}\right)^2 + 
\left(\frac{\text{NumHardWords}}{\text{NumWords}} \times 100\right)^2}}{2} \end{align*} @@ -1045,10 +1045,10 @@ Legibilidad µ: \mu = \frac{\text{NumWords}}{\text{NumWords} - 1} \times \frac{\text{LenWordsAvg}}{\text{LenWordsVar}} \times 100 Lensear Write: - \text{Score} = \text{NumWords1Syl} + 3 \times \text{NumSentences} + \text{Score} = \text{NumWordsSyl}_1 + 3 \times \text{NumSentences} Lix: - \text{Lix} = \frac{\text{NumWords}}{\text{NumSentences}} + 100 \times \frac{\text{NumWords7+Ltrs}}{\text{NumWords}} + \text{Lix} = \frac{\text{NumWords}}{\text{NumSentences}} + 100 \times \frac{\text{NumWordsLtrs}_{7+}}{\text{NumWords}} Lorge Readability Index: \begin{align*} @@ -1060,13 +1060,13 @@ Luong-Nguyen-Dinh's Readability Formula: {\text{Readability} = 0.004 \times \frac{\text{NumCharsAlnum}}{\text{NumSentences}} + 0.1905 \times \frac{\text{NumCharsAlnum}}{\text{NumWords}} + 2.7147 \times \frac{\text{NumSylsLuongNguyenDinh}_\text{1000}}{\text{NumSyls}} - 0.7295} McAlpine EFLAW Readability Score: - \text{EFLAW} = \frac{\text{NumWords} + \text{NumWords3-Ltrs}}{\text{NumSentences}} + \text{EFLAW} = \frac{\text{NumWords} + \text{NumWordsLtrs}_{3-}}{\text{NumSentences}} neue Wiener Literaturformeln: \begin{align*} \text{sW} &= \frac{\text{NumWordTypesBambergerVanecek}}{\text{NumWordTypes}} \times 100 \\ - \text{S/100} &= \frac{\text{NumSentences}}{\text{NumWords}} \times 100 \qquad \text{MS} = \frac{\text{NumWords3+Syls}}{\text{NumWords}} \times 100 \\ - \text{SL} &= \frac{\text{NumWords}}{\text{NumSentences}} \qquad \qquad \; \; \; \text{IW} = \frac{\text{NumWords7+Ltrs}}{\text{NumWords}} \times 100 \\ + \text{S/100} &= \frac{\text{NumSentences}}{\text{NumWords}} \times 100 \qquad \text{MS} = \frac{\text{NumWordsSyls}_{3+}}{\text{NumWords}} \times 100 \\ + \text{SL} &= \frac{\text{NumWords}}{\text{NumSentences}} \qquad \qquad \; \; \; \text{IW} = \frac{\text{NumWordsLtrs}_{7+}}{\text{NumWords}} \times 100 \\ \text{nWL}_1 &= 0.2032 \times \text{sW} - 0.1715 \times 
\text{S/100} + 0.1594 \times \text{MS} - 0.0746 \times \text{SL} - 0.145 \\ \text{nWL}_2 &= 0.2081 \times \text{sW} - 0.207 \times \text{S/100} + 0.1772 \times \text{MS} + 0.7498 \\ \text{nWL}_3 &= 0.2373 \times \text{MS} + 0.2433 \times \text{SL} + 0.1508 \times \text{IW} - 3.9203 @@ -1074,23 +1074,23 @@ neue Wiener Literaturformeln: neue Wiener Sachtextformel: \begin{align*} - \text{MS} &= \frac{\text{NumWords3+Syls}}{\text{NumWords}} \times 100 \qquad \text{SL} = \frac{\text{NumWords}}{\text{NumSentences}} \\ - \text{IW} &= \frac{\text{NumWords7+Ltrs}}{\text{NumWords}} \times 100 \qquad \text{ES} = \frac{\text{NumWords1Syl}}{\text{NumWords}} \times 100 \\ + \text{MS} &= \frac{\text{NumWordsSyls}_{3+}}{\text{NumWords}} \times 100 \qquad \text{SL} = \frac{\text{NumWords}}{\text{NumSentences}} \\ + \text{IW} &= \frac{\text{NumWordsLtrs}_{7+}}{\text{NumWords}} \times 100 \qquad \text{ES} = \frac{\text{NumWordsSyl}_1}{\text{NumWords}} \times 100 \\ \text{nWS}_1 &= 0.1935 \times \text{MS} + 0.1672 \times \text{SL} + 0.1297 \times \text{IW} - 0.0327 \times \text{ES} - 0.875 \\ \text{nWS}_2 &= 0.2007 \times \text{MS} + 0.1682 \times \text{SL} + 0.1373 \times \text{IW} - 2.779 \\ \text{nWS}_3 &= 0.2963 \times \text{MS} + 0.1905 \times \text{SL} - 1.1144 \end{align*} OSMAN: - \text{OSMAN} = 200.791 - 1.015 \times \frac{\text{NumWords}}{\text{NumSentences}} - 24.181 \times \frac{\text{NumWords6+Ltrs} + \text{NumSyls} + \text{NumWords5+Syls} + \text{NumFaseehWords}}{\text{NumWords}} + \text{OSMAN} = 200.791 - 1.015 \times \frac{\text{NumWords}}{\text{NumSentences}} - 24.181 \times \frac{\text{NumWordsLtrs}_{6+} + \text{NumSyls} + \text{NumWordsSyls}_{5+} + \text{NumFaseehWords}}{\text{NumWords}} Rix: - \text{Rix} = \frac{\text{NumWords7+Ltrs}}{\text{NumSentences}} + \text{Rix} = \frac{\text{NumWordsLtrs}_{7+}}{\text{NumSentences}} SMOG Grade: \begin{align*} - \text{g} &= 3.1291 + 1.043 \times \sqrt{\text{NumWords3+Syls}} \\ - \text{g}_\text{German} &= 
\sqrt{\frac{\text{NumWords3+Syl}}{\text{NumSentences}} \times 30} - 2 + \text{g} &= 3.1291 + 1.043 \times \sqrt{\text{NumWordsSyls}_{3+}} \\ + \text{g}_\text{German} &= \sqrt{\frac{\text{NumWordsSyls}_{3+}}{\text{NumSentences}} \times 30} - 2 \end{align*} Spache Grade Level: @@ -1112,7 +1112,7 @@ Tuldava's Text Difficulty: \text{TD} = \frac{\text{NumSyls}}{\text{NumWords}} \times \ln \frac{\text{NumWords}}{\text{NumSentences}} Wheeler & Smith's Readability Formula: - \text{Wheeler-Smith} = \frac{\text{NumWords}}{\text{NumUnits}} \times \frac{\text{NumWords2+Syls}}{\text{NumWords}} \times 10 + \text{Wheeler-Smith} = \frac{\text{NumWords}}{\text{NumUnits}} \times \frac{\text{NumWordsSyls}_{2+}}{\text{NumWords}} \times 10 --> Readability Formula|Formula|Supported Languages @@ -1162,8 +1162,8 @@ Readability Formula|Formula|Supported Languages > 1. Requires **built-in part-of-speech tagging support** -#### [4.4.2 Indicators of Lexical Diversity](#doc) -Lexical diversity is the measurement of the extent to which the vocabulary used in the text varies. +#### [4.4.2 Indicators of Lexical Density/Diversity](#doc) +Lexical density/diversity is the measurement of the extent to which the vocabulary used in the text varies. The following variables would be used in formulas:
**fᵢ**: Frequency of the i-th token type ranked descendingly by frequencies
@@ -1185,9 +1185,12 @@ Fisher's Index of Diversity: Herdan's Vₘ: \text{V}_\text{m} = \frac{\sum_{f = 1}^{\text{f}_\text{max}}(\text{NumTypes}_f \times f^2)}{\text{NumTokens}^2} - \frac{1}{\text{NumTypes}} -Honoré's statistic: +Honoré's Statistic: \text{R} = 100 \times \ln\frac{\text{NumTokens}}{1 - \frac{\text{NumTypes}_1}{\text{NumTypes}} +Lexical Density: + \text{Lexical Density} = \frac{\text{NumContentWords}}{\text{NumTokens}} + LogTTR: \begin{align*} \text{LogTTR}_\text{Herdan} &= \frac{\ln{\text{NumTypes}}}{\ln{\text{NumTokens}}} \\ @@ -1243,34 +1246,35 @@ Yule's Index of Diversity: \text{Index of Diversity} = \frac{\text{NumTokens}^2}{\sum_{f = 1}^{\text{f}_\text{max}}(\text{NumTypes}_f \times f^2) - \text{NumTokens}} --> -Indicator of Lexical Diversity|Formula -------------------------------|------- -Brunét's Index
([Brunét, 1978](#ref-brunet-1978))|![Formula](/doc/measures/lexical_diversity/brunets_index.svg) -Corrected TTR
([Carroll, 1964](#ref-carroll-1964))|![Formula](/doc/measures/lexical_diversity/cttr.svg) -Fisher's Index of Diversity
([Fisher et al., 1943](#ref-fisher-et-al-1943))|![Formula](/doc/measures/lexical_diversity/fishers_index_of_diversity.svg)
where W₋₁ is the -1 branch of the [Lambert W function](https://en.wikipedia.org/wiki/Lambert_W_function) -Herdan's Vₘ
([Herdan, 1955](#ref-herdan-1955))|![Formula](/doc/measures/lexical_diversity/herdans_vm.svg) -HD-D
([McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see reference.
The sample size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Diversity → HD-D → Sample size**. -Honoré's statistic
([Honoré, 1979](#ref-honore-1979))|![Formula](/doc/measures/lexical_diversity/honores_stat.svg) -LogTTR¹
(Herdan: [Herdan, 1960, p. 28](#ref-herdan-1960)
Somers: [Somers, 1966](#ref-somers-1966)
Rubet: [Dugast, 1979](#ref-dugast-1979)
Maas: [Maas, 1972](#ref-maas-1972)
Dugast: [Dugast, 1978](#ref-dugast-1978); [Dugast, 1979](#ref-dugast-1979))|![Formula](/doc/measures/lexical_diversity/logttr.svg) -Mean Segmental TTR
([Johnson, 1944](#ref-johnson-1944))|![Formula](/doc/measures/lexical_diversity/msttr.svg)
where **n** is the number of equal-sized segment, the length of which could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Diversity → Mean Segmental TTR → Number of tokens in each segment**, **NumTypesSegᵢ** is the number of token types in the **i**-th segment, and **NumTokensSegᵢ** is the number of tokens in the **i**-th segment. -Measure of Textual Lexical Diversity
([McCarthy, 2005, pp. 95–96, 99–100](#ref-mccarthy-2005); [McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see references.
The factor size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Diversity → Measure of Textual Lexical Diversity → Factor size**. -Moving-average TTR
([Covington & McFall, 2010](#ref-covington-mcfall-2010))|![Formula](/doc/measures/lexical_diversity/mattr.svg)
where **w** is the window size which could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Diversity → Moving-average TTR → Window size**, **NumTypesWindowₚ** is the number of token types within the moving window starting at position **p**, and **NumTokensWindowₚ** is the number of tokens within the moving window starting at position **p**. -Popescu-Mačutek-Altmann's B₁/B₂/B₃/B₄/B₅
([Popescu et al., 2008](#ref-popescu-et-al-2008))|![Formula](/doc/measures/lexical_diversity/popescu_macutek_altmanns_b1_b2_b3_b4_b5.svg) +Indicator of Lexical Density/Diversity|Formula +--------------------------------------|------- +Brunét's Index
([Brunét, 1978](#ref-brunet-1978))|![Formula](/doc/measures/lexical_density_diversity/brunets_index.svg) +Corrected TTR
([Carroll, 1964](#ref-carroll-1964))|![Formula](/doc/measures/lexical_density_diversity/cttr.svg) +Fisher's Index of Diversity
([Fisher et al., 1943](#ref-fisher-et-al-1943))|![Formula](/doc/measures/lexical_density_diversity/fishers_index_of_diversity.svg)
where W₋₁ is the -1 branch of the [Lambert W function](https://en.wikipedia.org/wiki/Lambert_W_function) +Herdan's Vₘ
([Herdan, 1955](#ref-herdan-1955))|![Formula](/doc/measures/lexical_density_diversity/herdans_vm.svg) +HD-D
([McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see reference.
The sample size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → HD-D → Sample size**. +Honoré's Statistic
([Honoré, 1979](#ref-honore-1979))|![Formula](/doc/measures/lexical_density_diversity/honores_stat.svg) +Lexical Density
([Ure, 1971](#ref-ure-1971))|![Formula](/doc/measures/lexical_density_diversity/lexical_density.svg)
where **NumContentWords** is the number of content words. By default, all tokens whose universal part-of-speech tags assigned by built-in part-of-speech taggers are ADJ (adjectives), ADV (adverbs), INTJ (interjections), NOUN (nouns), PROPN (proper nouns), NUM (numerals), VERB (verbs), SYM (symbols), or X (others) are categorized as content words. For some built-in part-of-speech taggers, this behavior could be changed via **Menu Bar → Preferences → Settings → Part-of-speech Tagging → Tagsets → Mapping Settings → Content/Function Words**. +LogTTR¹
(Herdan: [Herdan, 1960, p. 28](#ref-herdan-1960)
Somers: [Somers, 1966](#ref-somers-1966)
Rubet: [Dugast, 1979](#ref-dugast-1979)
Maas: [Maas, 1972](#ref-maas-1972)
Dugast: [Dugast, 1978](#ref-dugast-1978); [Dugast, 1979](#ref-dugast-1979))|![Formula](/doc/measures/lexical_density_diversity/logttr.svg) +Mean Segmental TTR
([Johnson, 1944](#ref-johnson-1944))|![Formula](/doc/measures/lexical_density_diversity/msttr.svg)
where **n** is the number of equal-sized segments, the length of which could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → Mean Segmental TTR → Number of tokens in each segment**, **NumTypesSegᵢ** is the number of token types in the **i**-th segment, and **NumTokensSegᵢ** is the number of tokens in the **i**-th segment. +Measure of Textual Lexical Diversity
([McCarthy, 2005, pp. 95–96, 99–100](#ref-mccarthy-2005); [McCarthy & Jarvis, 2010](#ref-mccarthy-jarvis-2010))|For detailed calculation procedures, see references.
The factor size could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → Measure of Textual Lexical Diversity → Factor size**. +Moving-average TTR
([Covington & McFall, 2010](#ref-covington-mcfall-2010))|![Formula](/doc/measures/lexical_density_diversity/mattr.svg)
where **w** is the window size which could be modified via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity → Moving-average TTR → Window size**, **NumTypesWindowₚ** is the number of token types within the moving window starting at position **p**, and **NumTokensWindowₚ** is the number of tokens within the moving window starting at position **p**. +Popescu-Mačutek-Altmann's B₁/B₂/B₃/B₄/B₅
([Popescu et al., 2008](#ref-popescu-et-al-2008))|![Formula](/doc/measures/lexical_density_diversity/popescu_macutek_altmanns_b1_b2_b3_b4_b5.svg) Popescu's R₁
([Popescu, 2009, pp. 18, 30, 33](#ref-popescu-2009))|For detailed calculation procedures, see reference. Popescu's R₂
([Popescu, 2009, pp. 35–36, 38](#ref-popescu-2009))|For detailed calculation procedures, see reference. Popescu's R₃
([Popescu, 2009, pp. 48–49, 53](#ref-popescu-2009))|For detailed calculation procedures, see reference. Popescu's R₄
([Popescu, 2009, p. 57](#ref-popescu-2009))|For detailed calculation procedures, see reference. -Repeat Rate¹
([Popescu, 2009, p. 166](#ref-popescu-2009))|![Formula](/doc/measures/lexical_diversity/repeat_rate.svg) -Root TTR
([Guiraud, 1954](#ref-guiraud-1954))|![Formula](/doc/measures/lexical_diversity/rttr.svg) -Shannon Entropy¹
([Popescu, 2009, p. 173](#ref-popescu-2009))|![Formula](/doc/measures/lexical_diversity/shannon_entropy.svg) -Simpson's l
([Simpson, 1949](#ref-simpson-1949))|![Formula](/doc/measures/lexical_diversity/simpsons_l.svg) -Type-token Ratio
([Johnson, 1944](#ref-johnson-1944))|![Formula](/doc/measures/lexical_diversity/ttr.svg) +Repeat Rate¹
([Popescu, 2009, p. 166](#ref-popescu-2009))|![Formula](/doc/measures/lexical_density_diversity/repeat_rate.svg) +Root TTR
([Guiraud, 1954](#ref-guiraud-1954))|![Formula](/doc/measures/lexical_density_diversity/rttr.svg) +Shannon Entropy¹
([Popescu, 2009, p. 173](#ref-popescu-2009))|![Formula](/doc/measures/lexical_density_diversity/shannon_entropy.svg) +Simpson's l
([Simpson, 1949](#ref-simpson-1949))|![Formula](/doc/measures/lexical_density_diversity/simpsons_l.svg) +Type-token Ratio
([Johnson, 1944](#ref-johnson-1944))|![Formula](/doc/measures/lexical_density_diversity/ttr.svg) vocd-D
([Malvern et al., 2004, pp. 51, 56–57](#ref-malvern-et-al-2004))|For detailed calculation procedures, see reference. -Yule's Characteristic K
([Yule, 1944, pp. 52–53](#ref-yule-1944))|![Formula](/doc/measures/lexical_diversity/yules_characteristic_k.svg) -Yule's Index of Diversity
([Williams, 1970, p. 100](#ref-williams-1970))|![Formula](/doc/measures/lexical_diversity/yules_index_of_diversity.svg) +Yule's Characteristic K
([Yule, 1944, pp. 52–53](#ref-yule-1944))|![Formula](/doc/measures/lexical_density_diversity/yules_characteristic_k.svg) +Yule's Index of Diversity
([Williams, 1970, p. 100](#ref-williams-1970))|![Formula](/doc/measures/lexical_density_diversity/yules_index_of_diversity.svg) > [!NOTE] -> 1. Variants available and can be selected via **Menu Bar → Preferences → Settings → Measures → Lexical Diversity** +> 1. Variants are available and can be selected via **Menu Bar → Preferences → Settings → Measures → Lexical Density/Diversity** #### [4.4.3 Measures of Dispersion and Adjusted Frequency](#doc) @@ -1741,6 +1745,8 @@ Linguistic Computing Bulletin*, *7*(2), 172–177. 1. [**^**](#ref-trankle-bailers-readability-formula) Tränkle, U., & Bailer, H. (1984). *Kreuzvalidierung und Neuberechnung von Lesbarkeitsformeln für die Deutsche Sprache* [Cross-validation and recalculation of the readability formulas for the German language]. Zeitschrift für Entwicklungspsychologie und Pädagogische Psychologie, *16*(3), 231–244. 1. [**^**](#ref-td) Tuldava, J. (1975). Ob izmerenii trudnosti tekstov [On measuring the complexity of the text]. *Uchenye zapiski Tartuskogo universiteta. Trudy po metodike prepodavaniya inostrannykh yazykov*, *345*, 102–120. + +1. [**^**](#ref-lexical-density) Ure, J. (1971). Lexical density and register differentiation. In G. E. Perren & J. L. M. Trim (Eds.), *Applications of Linguistics* (pp. 443–452). Cambridge University Press. 1. [**^**](#ref-wheeler-smiths-readability-formula) Wheeler, L. R., & Smith, E. H. (1954). A practical readability formula for the classroom teacher in the primary grades. *Elementary English*, *31*(7), 397–399. 
diff --git a/doc/measures/lexical_diversity/brunets_index.svg b/doc/measures/lexical_density_diversity/brunets_index.svg similarity index 100% rename from doc/measures/lexical_diversity/brunets_index.svg rename to doc/measures/lexical_density_diversity/brunets_index.svg diff --git a/doc/measures/lexical_diversity/cttr.svg b/doc/measures/lexical_density_diversity/cttr.svg similarity index 100% rename from doc/measures/lexical_diversity/cttr.svg rename to doc/measures/lexical_density_diversity/cttr.svg diff --git a/doc/measures/lexical_diversity/fishers_index_of_diversity.svg b/doc/measures/lexical_density_diversity/fishers_index_of_diversity.svg similarity index 100% rename from doc/measures/lexical_diversity/fishers_index_of_diversity.svg rename to doc/measures/lexical_density_diversity/fishers_index_of_diversity.svg diff --git a/doc/measures/lexical_diversity/herdans_vm.svg b/doc/measures/lexical_density_diversity/herdans_vm.svg similarity index 100% rename from doc/measures/lexical_diversity/herdans_vm.svg rename to doc/measures/lexical_density_diversity/herdans_vm.svg diff --git a/doc/measures/lexical_diversity/honores_stat.svg b/doc/measures/lexical_density_diversity/honores_stat.svg similarity index 100% rename from doc/measures/lexical_diversity/honores_stat.svg rename to doc/measures/lexical_density_diversity/honores_stat.svg diff --git a/doc/measures/lexical_density_diversity/lexical_density.svg b/doc/measures/lexical_density_diversity/lexical_density.svg new file mode 100644 index 000000000..0247ac81b --- /dev/null +++ b/doc/measures/lexical_density_diversity/lexical_density.svg @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/lexical_diversity/logttr.svg b/doc/measures/lexical_density_diversity/logttr.svg similarity index 100% rename from doc/measures/lexical_diversity/logttr.svg rename to 
doc/measures/lexical_density_diversity/logttr.svg diff --git a/doc/measures/lexical_diversity/mattr.svg b/doc/measures/lexical_density_diversity/mattr.svg similarity index 100% rename from doc/measures/lexical_diversity/mattr.svg rename to doc/measures/lexical_density_diversity/mattr.svg diff --git a/doc/measures/lexical_diversity/msttr.svg b/doc/measures/lexical_density_diversity/msttr.svg similarity index 100% rename from doc/measures/lexical_diversity/msttr.svg rename to doc/measures/lexical_density_diversity/msttr.svg diff --git a/doc/measures/lexical_diversity/popescu_macutek_altmanns_b1_b2_b3_b4_b5.svg b/doc/measures/lexical_density_diversity/popescu_macutek_altmanns_b1_b2_b3_b4_b5.svg similarity index 100% rename from doc/measures/lexical_diversity/popescu_macutek_altmanns_b1_b2_b3_b4_b5.svg rename to doc/measures/lexical_density_diversity/popescu_macutek_altmanns_b1_b2_b3_b4_b5.svg diff --git a/doc/measures/lexical_diversity/repeat_rate.svg b/doc/measures/lexical_density_diversity/repeat_rate.svg similarity index 100% rename from doc/measures/lexical_diversity/repeat_rate.svg rename to doc/measures/lexical_density_diversity/repeat_rate.svg diff --git a/doc/measures/lexical_diversity/rttr.svg b/doc/measures/lexical_density_diversity/rttr.svg similarity index 100% rename from doc/measures/lexical_diversity/rttr.svg rename to doc/measures/lexical_density_diversity/rttr.svg diff --git a/doc/measures/lexical_diversity/shannon_entropy.svg b/doc/measures/lexical_density_diversity/shannon_entropy.svg similarity index 100% rename from doc/measures/lexical_diversity/shannon_entropy.svg rename to doc/measures/lexical_density_diversity/shannon_entropy.svg diff --git a/doc/measures/lexical_diversity/simpsons_l.svg b/doc/measures/lexical_density_diversity/simpsons_l.svg similarity index 100% rename from doc/measures/lexical_diversity/simpsons_l.svg rename to doc/measures/lexical_density_diversity/simpsons_l.svg diff --git a/doc/measures/lexical_diversity/ttr.svg 
b/doc/measures/lexical_density_diversity/ttr.svg similarity index 100% rename from doc/measures/lexical_diversity/ttr.svg rename to doc/measures/lexical_density_diversity/ttr.svg diff --git a/doc/measures/lexical_diversity/yules_characteristic_k.svg b/doc/measures/lexical_density_diversity/yules_characteristic_k.svg similarity index 100% rename from doc/measures/lexical_diversity/yules_characteristic_k.svg rename to doc/measures/lexical_density_diversity/yules_characteristic_k.svg diff --git a/doc/measures/lexical_diversity/yules_index_of_diversity.svg b/doc/measures/lexical_density_diversity/yules_index_of_diversity.svg similarity index 100% rename from doc/measures/lexical_diversity/yules_index_of_diversity.svg rename to doc/measures/lexical_density_diversity/yules_index_of_diversity.svg diff --git a/doc/measures/readability/colemans_readability_formula.svg b/doc/measures/readability/colemans_readability_formula.svg index 95fbd670c..1f12626a7 100644 --- a/doc/measures/readability/colemans_readability_formula.svg +++ b/doc/measures/readability/colemans_readability_formula.svg @@ -1,6 +1,6 @@ - + @@ -67,30 +67,30 @@ - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + @@ -113,63 +113,63 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -192,92 +192,92 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -300,120 +300,120 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/eflaw.svg b/doc/measures/readability/eflaw.svg index b2573f4c8..18343bf4f 100644 --- a/doc/measures/readability/eflaw.svg +++ b/doc/measures/readability/eflaw.svg @@ -1,71 +1,71 @@ - + - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/fog_index.svg b/doc/measures/readability/fog_index.svg index 0414e079a..aaff50660 100644 --- a/doc/measures/readability/fog_index.svg +++ b/doc/measures/readability/fog_index.svg @@ -1,361 +1,362 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - + + + + + + + + + + - + - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - + + + + + + + + - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - + + + + + + - - - - + + + + - - - - - - - - + + + + + + + + - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - + + + + - - - - - - - - - - - - + + + + + + + + + + + + - - - - - - - - + + + + + + + + - - - + + + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/lensear_write.svg b/doc/measures/readability/lensear_write.svg index 50218bddf..c325e8adf 100644 --- a/doc/measures/readability/lensear_write.svg +++ b/doc/measures/readability/lensear_write.svg @@ -1,61 +1,61 @@ - + - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/lix.svg b/doc/measures/readability/lix.svg index 2303c92ef..eff7ff9e0 100644 --- a/doc/measures/readability/lix.svg +++ b/doc/measures/readability/lix.svg @@ -1,83 +1,84 @@ - + + + - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - + + + + + + + + + + + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/nwl.svg b/doc/measures/readability/nwl.svg index 0e8a38b54..4c695ad5c 100644 --- a/doc/measures/readability/nwl.svg +++ b/doc/measures/readability/nwl.svg @@ -1,11 +1,13 @@ - + + + @@ -97,40 +99,40 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -139,215 +141,215 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/nws.svg b/doc/measures/readability/nws.svg index 50f4bd707..165642c57 100644 --- a/doc/measures/readability/nws.svg +++ b/doc/measures/readability/nws.svg @@ -1,13 +1,15 @@ - + + + + - - + @@ -41,9 +43,9 @@ - - - + + + @@ -52,227 +54,227 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/osman.svg 
b/doc/measures/readability/osman.svg index 69ff7c262..16b70ded8 100644 --- a/doc/measures/readability/osman.svg +++ b/doc/measures/readability/osman.svg @@ -1,154 +1,156 @@ - + - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/re_farr_jenkins_paterson.svg b/doc/measures/readability/re_farr_jenkins_paterson.svg index dcfe492da..b57d4991e 100644 --- a/doc/measures/readability/re_farr_jenkins_paterson.svg +++ b/doc/measures/readability/re_farr_jenkins_paterson.svg @@ -1,8 +1,9 @@ - + + @@ -69,59 +70,59 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -169,52 +170,52 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/rgl.svg b/doc/measures/readability/rgl.svg index 2a6506640..3412fc966 100644 --- a/doc/measures/readability/rgl.svg +++ b/doc/measures/readability/rgl.svg @@ -1,58 +1,59 @@ - + - + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - - - - - + + + + + + - - + + - + - - + + - - - - - - - - - - - - + + + + + + + + + + + + \ No newline at end of file diff --git 
a/doc/measures/readability/rix.svg b/doc/measures/readability/rix.svg index 85a2f7832..4f57258bc 100644 --- a/doc/measures/readability/rix.svg +++ b/doc/measures/readability/rix.svg @@ -1,59 +1,59 @@ - + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/smog_grade.svg b/doc/measures/readability/smog_grade.svg index b65abd146..0e5811716 100644 --- a/doc/measures/readability/smog_grade.svg +++ b/doc/measures/readability/smog_grade.svg @@ -1,7 +1,9 @@ - + + + @@ -38,7 +40,7 @@ - + @@ -54,8 +56,8 @@ - - + + @@ -64,52 +66,52 @@ - - - - - - - - - - - - - - + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/doc/measures/readability/wheeler_smiths_readability_formula.svg b/doc/measures/readability/wheeler_smiths_readability_formula.svg index 78bd95583..87a8fb108 100644 --- a/doc/measures/readability/wheeler_smiths_readability_formula.svg +++ b/doc/measures/readability/wheeler_smiths_readability_formula.svg @@ -1,90 +1,90 @@ - + + + - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tests/files/file_area/misc/[amh] No language support.txt b/tests/files/file_area/misc/[amh] No language support.txt new file mode 100644 index 000000000..00aba4d44 --- /dev/null +++ b/tests/files/file_area/misc/[amh] No language support.txt @@ -0,0 +1 @@ +አማርኛ[1] ፡ የኢትዮጵያ ፡ መደበኛ ፡ ቋንቋ ፡ ነው ። ከሴማዊ ፡ ቋንቋዎች ፡ እንደ ፡ ዕብራይስጥ ፡ ወይም ፡ ዓረብኛ 
፡ አንዱ ፡ ነው። በአፍሪካ ፡ ውስጥ ፡ ደግሞ ፡ ከምዕራብ ፡ አፍሪካው ፡ ሐውሳና ፡ ከምሥራቅ ፡ አፍሪካው ፡ ስዋሂሊ ፡ ቀጥሎ ፡ 3ኛውን ፡ ቦታ ፡ የያዘ ፡ ነው።[1] እንዲያውም ፡ 85.6 ፡ ሚሊዮን ፡ ያህል ፡ ተናጋሪዎች ፡ እያሉት ፣ አማርኛ ፡ ከአረብኛ ፡ ቀጥሎ ፡ ትልቁ ፡ ሴማዊ ፡ ቋንቋ ፡ ነው። የሚጻፈውም ፡ በአማርኛ ፡ ፊደል ፡ ነው። አማርኛ ፡ ከዓረብኛና ፡ ከዕብራይስጥ ፡ ያለው ፡ መሰረታዊ ፡ ልዩነት ፡ እንደ ፡ ላቲን ፡ ከግራ ፡ ወደ ፡ ቀኝ ፡ መጻፉ ፡ ነው። diff --git a/tests/files/file_area/misc/[eng_us] 1st_token_is_punc_mark.txt b/tests/files/file_area/misc/[eng_us] First token is a punctuation mark.txt similarity index 100% rename from tests/files/file_area/misc/[eng_us] 1st_token_is_punc_mark.txt rename to tests/files/file_area/misc/[eng_us] First token is a punctuation mark.txt diff --git a/tests/test_colligation_extractor.py b/tests/test_colligation_extractor.py index 9ac6ba4d4..7bc75e198 100644 --- a/tests/test_colligation_extractor.py +++ b/tests/test_colligation_extractor.py @@ -16,6 +16,7 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import glob import random from tests import wl_test_init @@ -42,7 +43,7 @@ def test_colligation_extractor(): ] measures_effect_size = list(main.settings_global['measures_effect_size'].keys()) - for i in range(4): + for i in range(2 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single file case 0: @@ -52,7 +53,11 @@ def test_colligation_extractor(): wl_test_init.select_test_files(main, no_files = [1, 2]) # Miscellaneous case _: - wl_test_init.select_test_files(main, no_files = [i + 1]) + # Excluding files without POS tagging support + if main.settings_custom['file_area']['files_open'][i + 1]['lang'] == 'eng_us': + wl_test_init.select_test_files(main, no_files = [i + 1]) + else: + continue settings['generation_settings']['test_statistical_significance'] = random.choice(tests_statistical_significance) settings['generation_settings']['measure_bayes_factor'] = random.choice(measures_bayes_factor) diff --git a/tests/test_collocation_extractor.py b/tests/test_collocation_extractor.py index 
fbf7dd0b6..90bc79fb5 100644 --- a/tests/test_collocation_extractor.py +++ b/tests/test_collocation_extractor.py @@ -16,6 +16,7 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import glob import random from tests import wl_test_init @@ -42,7 +43,7 @@ def test_collocation_extractor(): ] measures_effect_size = list(main.settings_global['measures_effect_size'].keys()) - for i in range(4): + for i in range(2 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single file case 0: diff --git a/tests/test_concordancer.py b/tests/test_concordancer.py index b0ed01fcf..0220239f4 100644 --- a/tests/test_concordancer.py +++ b/tests/test_concordancer.py @@ -16,6 +16,8 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import glob + from tests import wl_test_init from wordless import wl_concordancer from wordless.wl_dialogs import wl_dialogs_misc @@ -30,7 +32,7 @@ def test_concordancer(): settings['search_settings']['multi_search_mode'] = True settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS - for i in range(4): + for i in range(2 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single file case 0: diff --git a/tests/test_concordancer_parallel.py b/tests/test_concordancer_parallel.py index 10f23e696..ba25d7228 100644 --- a/tests/test_concordancer_parallel.py +++ b/tests/test_concordancer_parallel.py @@ -16,6 +16,8 @@ # along with this program. If not, see . 
# ---------------------------------------------------------------------- +import glob + from tests import wl_test_init from wordless import wl_concordancer_parallel from wordless.wl_dialogs import wl_dialogs_misc @@ -33,7 +35,10 @@ def test_concordancer_parallel(): case 0: wl_test_init.select_test_files(main, no_files = [0, 1, 2]) case 1: - wl_test_init.select_test_files(main, no_files = [1, 2, 3, 4]) + wl_test_init.select_test_files( + main, + no_files = list(range(1, 3 + len(glob.glob('tests/files/file_area/misc/*.txt')))) + ) print(f"Files: {' | '.join(wl_test_init.get_test_file_names(main))}") diff --git a/tests/test_dependency_parser.py b/tests/test_dependency_parser.py index 4da0b757e..c971070af 100644 --- a/tests/test_dependency_parser.py +++ b/tests/test_dependency_parser.py @@ -16,6 +16,8 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import glob + from tests import wl_test_init from wordless import wl_dependency_parser from wordless.wl_dialogs import wl_dialogs_misc @@ -30,7 +32,7 @@ def test_dependency_parser(): settings['search_settings']['multi_search_mode'] = True settings['search_settings']['search_terms'] = wl_test_init.SEARCH_TERMS - for i in range(4): + for i in range(2 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single file case 0: @@ -40,7 +42,11 @@ def test_dependency_parser(): wl_test_init.select_test_files(main, no_files = [1, 2]) # Miscellaneous case _: - wl_test_init.select_test_files(main, no_files = [i + 1]) + # Excluding files without dependency parsing support + if main.settings_custom['file_area']['files_open'][i + 1]['lang'] == 'eng_us': + wl_test_init.select_test_files(main, no_files = [i + 1]) + else: + continue global main_global main_global = main diff --git a/tests/test_keyword_extractor.py b/tests/test_keyword_extractor.py index 7d431bbc1..802512301 100644 --- a/tests/test_keyword_extractor.py +++ b/tests/test_keyword_extractor.py @@ 
-16,6 +16,7 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import glob import random from tests import wl_test_init @@ -39,7 +40,7 @@ def test_keyword_extractor(): ] measures_effect_size = list(main.settings_global['measures_effect_size'].keys()) - for i in range(6): + for i in range(4 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single observed file & single reference file case 0: diff --git a/tests/test_ngram_generator.py b/tests/test_ngram_generator.py index f84d85f8b..349b4e6c5 100644 --- a/tests/test_ngram_generator.py +++ b/tests/test_ngram_generator.py @@ -16,6 +16,7 @@ # along with this program. If not, see . # ---------------------------------------------------------------------- +import glob import random from tests import wl_test_init @@ -35,7 +36,7 @@ def test_ngram_generator(): measures_dispersion = list(main.settings_global['measures_dispersion']) measures_adjusted_freq = list(main.settings_global['measures_adjusted_freq']) - for i in range(4): + for i in range(2 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single file case 0: diff --git a/tests/test_profiler.py b/tests/test_profiler.py index cca98ad2b..370fa2acc 100644 --- a/tests/test_profiler.py +++ b/tests/test_profiler.py @@ -17,7 +17,7 @@ # ---------------------------------------------------------------------- import collections -import re +import glob import numpy import scipy @@ -27,12 +27,10 @@ from wordless.wl_dialogs import wl_dialogs_misc from wordless.wl_utils import wl_misc -main_global = None - def test_profiler(): main = wl_test_init.Wl_Test_Main(switch_lang_utils = 'fast') - for i in range(4): + for i in range(2 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single file case 0: @@ -44,9 +42,6 @@ def test_profiler(): case _: wl_test_init.select_test_files(main, no_files = [i + 1]) - global main_global - main_global = main - print(f"Files: {' | 
'.join(wl_test_init.get_test_file_names(main))}") wl_profiler.Wl_Worker_Profiler_Table( @@ -67,8 +62,6 @@ def update_gui(err_msg, texts_stats_files): count_tokens_lens_syls = [] count_tokens_lens_chars = [] - files = main_global.settings_custom['file_area']['files_open'] - for i, stats in enumerate(texts_stats_files): stats_readability = stats[0] len_paras_sentences = numpy.array(stats[1]) @@ -76,35 +69,37 @@ def update_gui(err_msg, texts_stats_files): len_paras_tokens = numpy.array(stats[3]) len_sentences = numpy.array(stats[4]) len_sentence_segs = numpy.array(stats[5]) - len_tokens_syls = numpy.array(stats[6]) + len_tokens_syls = numpy.array(stats[6]) if stats[6] is not None else None len_tokens_chars = numpy.array(stats[7]) - len_types_syls = numpy.array(stats[8]) + len_types_syls = numpy.array(stats[8]) if stats[8] is not None else None len_types_chars = numpy.array(stats[9]) - len_syls = numpy.array(stats[10]) - stats_lexical_diversity = stats[11] + len_syls = numpy.array(stats[10]) if stats[10] is not None else None + stats_lexical_density_diversity = stats[11] count_paras = len(len_paras_sentences) count_sentences = len(len_sentences) count_sentence_segs = len(len_sentence_segs) count_tokens = len(len_tokens_chars) count_types = len(len_types_chars) - count_syls = len(len_syls) + count_syls = len(len_syls) if len_syls is not None else None count_chars = numpy.sum(len_tokens_chars) count_sentences_lens.append(collections.Counter(len_sentences)) count_sentence_segs_lens.append(collections.Counter(len_sentence_segs)) - count_tokens_lens_syls.append(collections.Counter(len_tokens_syls)) + count_tokens_lens_syls.append( + collections.Counter(len_tokens_syls) if len_tokens_syls is not None else None + ) count_tokens_lens_chars.append(collections.Counter(len_tokens_chars)) assert len(stats_readability) == 39 - for i, readability in enumerate(stats_readability): + for stat in stats_readability: assert ( ( - type(readability) in [int, float, numpy.float64] - and not 
numpy.isnan(readability) + type(stat) in [int, float, numpy.float64] + and not numpy.isnan(stat) ) - or readability in ['text_too_short', 'no_support'] + or stat in ['text_too_short', 'no_support'] ) # Counts @@ -113,35 +108,35 @@ def update_gui(err_msg, texts_stats_files): assert count_sentence_segs assert count_tokens assert count_types - assert count_syls assert count_chars + if count_syls is not None: + assert count_syls + # Lengths assert len_paras_sentences.size assert len_paras_sentence_segs.size assert len_paras_tokens.size assert len_sentences.size assert len_sentence_segs.size - assert len_tokens_syls.size assert len_tokens_chars.size - assert len_types_syls.size assert len_types_chars.size - assert len_syls.size - if i < len(files): - lang = re.search(r'(?<=\[)[a-z_]+(?=\])', files[i]['name']).group() - - if lang not in main_global.settings_global['syl_tokenizers']: - assert all((len_syls == 1 for len_syls in len_tokens_syls)) - assert all((len_syls == 1 for len_syls in len_types_syls)) + if len_syls is not None: + assert len_tokens_syls.size + assert len_types_syls.size + assert len_syls.size # Lexical Diversity - assert len(stats_lexical_diversity) == 27 + assert len(stats_lexical_density_diversity) == 28 - for i, lexical_diversity in enumerate(stats_lexical_diversity): + for stat in stats_lexical_density_diversity: assert ( - not numpy.isnan(lexical_diversity) - and type(lexical_diversity) in [int, float, numpy.float64] + ( + type(stat) in [int, float, numpy.float64] + and not numpy.isnan(stat) + ) + or stat == 'no_support' ) # Mean @@ -150,9 +145,11 @@ def update_gui(err_msg, texts_stats_files): assert numpy.mean(len_paras_tokens) == count_tokens / count_paras assert numpy.mean(len_sentences) == count_tokens / count_sentences assert numpy.mean(len_sentence_segs) == count_tokens / count_sentence_segs - assert numpy.mean(len_tokens_syls) == count_syls / count_tokens assert numpy.mean(len_tokens_chars) == count_chars / count_tokens + if count_syls is 
not None: + assert numpy.mean(len_tokens_syls) == count_syls / count_tokens + # Range and interquartile range for lens in [ len_paras_sentences, @@ -163,8 +160,9 @@ def update_gui(err_msg, texts_stats_files): len_tokens_syls, len_tokens_chars ]: - assert numpy.ptp(lens) == max(lens) - min(lens) - assert scipy.stats.iqr(lens) == numpy.percentile(lens, 75) - numpy.percentile(lens, 25) + if lens is not None: + assert numpy.ptp(lens) == max(lens) - min(lens) + assert scipy.stats.iqr(lens) == numpy.percentile(lens, 75) - numpy.percentile(lens, 25) # Count of n-token-long Sentences if any(count_sentences_lens): @@ -201,7 +199,7 @@ def update_gui(err_msg, texts_stats_files): assert 0 not in count_sentence_segs_lens # Count of n-syllable-long Tokens - if any(count_tokens_lens_syls): + if len_tokens_syls is not None: count_tokens_lens_files = wl_misc.merge_dicts(count_tokens_lens_syls) count_tokens_lens_syls = sorted(count_tokens_lens_files.keys()) diff --git a/tests/test_wordlist_generator.py b/tests/test_wordlist_generator.py index 58aa0055b..6b6b38a4e 100644 --- a/tests/test_wordlist_generator.py +++ b/tests/test_wordlist_generator.py @@ -16,6 +16,7 @@ # along with this program. If not, see . 
# ---------------------------------------------------------------------- +import glob import random from tests import wl_test_init @@ -32,7 +33,7 @@ def test_wordlist_generator(): measures_dispersion = list(main.settings_global['measures_dispersion'].keys()) measures_adjusted_freq = list(main.settings_global['measures_adjusted_freq'].keys()) - for i in range(4): + for i in range(2 + len(glob.glob('tests/files/file_area/misc/*.txt'))): match i: # Single file case 0: diff --git a/tests/tests_measures/test_measures_lexical_diversity.py b/tests/tests_measures/test_measures_lexical_density_diversity.py similarity index 61% rename from tests/tests_measures/test_measures_lexical_diversity.py rename to tests/tests_measures/test_measures_lexical_density_diversity.py index b2d63d1f3..a64edf280 100644 --- a/tests/tests_measures/test_measures_lexical_diversity.py +++ b/tests/tests_measures/test_measures_lexical_density_diversity.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------- -# Wordless: Tests - Measures - Lexical diversity +# Wordless: Tests - Measures - Lexical density/diversity # Copyright (C) 2018-2024 Ye Lei (叶磊) # # This program is free software: you can redistribute it and/or modify @@ -20,10 +20,10 @@ import scipy from tests import wl_test_init -from wordless.wl_measures import wl_measures_lexical_diversity +from wordless.wl_measures import wl_measures_lexical_density_diversity main = wl_test_init.Wl_Test_Main() -settings = main.settings_custom['measures']['lexical_diversity'] +settings = main.settings_custom['measures']['lexical_density_diversity'] TOKENS_10 = ['This', 'is', 'a', 'sentence', '.'] * 2 TOKENS_100 = ['This', 'is', 'a', 'sentence', '.'] * 20 @@ -33,49 +33,63 @@ # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 26). Mouton de Gruyter. 
TOKENS_225 = [1] * 11 + [2, 3] * 9 + [4] * 7 + [5, 6] * 6 + [7, 8] * 5 + list(range(9, 16)) * 4 + list(range(16, 22)) * 3 + list(range(22, 40)) * 2 + list(range(40, 125)) +def get_test_text(tokens): + return wl_test_init.Wl_Test_Text(main, [[[tokens]]]) + +text_tokens_10 = get_test_text(TOKENS_10) +text_tokens_100 = get_test_text(TOKENS_100) +text_tokens_101 = get_test_text(TOKENS_101) +text_tokens_1000 = get_test_text(TOKENS_1000) +text_tokens_225 = get_test_text(TOKENS_225) + def test_brunets_index(): - w = wl_measures_lexical_diversity.brunets_index(main, TOKENS_100) + w = wl_measures_lexical_density_diversity.brunets_index(main, text_tokens_100) assert w == numpy.power(100, numpy.power(5, -0.165)) def test_cttr(): - cttr = wl_measures_lexical_diversity.cttr(main, TOKENS_100) + cttr = wl_measures_lexical_density_diversity.cttr(main, text_tokens_100) assert cttr == 5 / (2 * 100) ** 0.5 # Reference: Fisher, R. A., Steven, A. C., & Williams, C. B. (1943). The relation between the number of species and the number of individuals in a random sample of an animal population. Journal of Animal Ecology, 12(1), 56. 
https://doi.org/10.2307/1411 def test_fishers_index_of_diversity(): tokens = [str(i) for i in range(240)] + ['0'] * (15609 - 240) - alpha = wl_measures_lexical_diversity.fishers_index_of_diversity(main, tokens) + alpha = wl_measures_lexical_density_diversity.fishers_index_of_diversity(main, get_test_text(tokens)) assert round(alpha, 3) == 40.247 def test_herdans_vm(): - vm = wl_measures_lexical_diversity.herdans_vm(main, TOKENS_100) + vm = wl_measures_lexical_density_diversity.herdans_vm(main, text_tokens_100) assert vm == (5 * 20 ** 2) / (100 ** 2) - 1 / 5 def test_hdd(): - hdd_100 = wl_measures_lexical_diversity.hdd(main, TOKENS_100) + hdd_100 = wl_measures_lexical_density_diversity.hdd(main, text_tokens_100) assert hdd_100 == (1 - scipy.stats.hypergeom.pmf(k = 0, M = 100, n = 20, N = 42)) * (1 / 42) * 5 def test_honores_stat(): - r = wl_measures_lexical_diversity.honores_stat(main, TOKENS_100) + r = wl_measures_lexical_density_diversity.honores_stat(main, text_tokens_100) assert r == 100 * numpy.log(100 / (1 - 0 / 5)) +def test_lexical_density(): + lexical_density = wl_measures_lexical_density_diversity.lexical_density(main, text_tokens_100) + + assert lexical_density == 20 / 100 + def test_logttr(): settings['logttr']['variant'] = 'Herdan' - logttr_herdan = wl_measures_lexical_diversity.logttr(main, TOKENS_100) + logttr_herdan = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100) settings['logttr']['variant'] = 'Somers' - logttr_somers = wl_measures_lexical_diversity.logttr(main, TOKENS_100) + logttr_somers = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100) settings['logttr']['variant'] = 'Rubet' - logttr_rubet = wl_measures_lexical_diversity.logttr(main, TOKENS_100) + logttr_rubet = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100) settings['logttr']['variant'] = 'Maas' - logttr_maas = wl_measures_lexical_diversity.logttr(main, TOKENS_100) + logttr_maas = wl_measures_lexical_density_diversity.logttr(main, 
text_tokens_100) settings['logttr']['variant'] = 'Dugast' - logttr_dugast = wl_measures_lexical_diversity.logttr(main, TOKENS_100) + logttr_dugast = wl_measures_lexical_density_diversity.logttr(main, text_tokens_100) num_types = 5 num_tokens = 100 @@ -87,28 +101,28 @@ def test_logttr(): assert logttr_dugast == (numpy.log(num_tokens) ** 2) / (numpy.log(num_tokens) - numpy.log(num_types)) def test_msttr(): - msttr_100 = wl_measures_lexical_diversity.msttr(main, TOKENS_101) + msttr_100 = wl_measures_lexical_density_diversity.msttr(main, text_tokens_101) settings['msttr']['num_tokens_in_each_seg'] = 1000 - msttr_1000 = wl_measures_lexical_diversity.msttr(main, TOKENS_101) + msttr_1000 = wl_measures_lexical_density_diversity.msttr(main, text_tokens_101) assert msttr_100 == 5 / 100 assert msttr_1000 == 0 def test_mtld(): - mtld_100 = wl_measures_lexical_diversity.mtld(main, TOKENS_100) + mtld_100 = wl_measures_lexical_density_diversity.mtld(main, text_tokens_100) assert mtld_100 == 100 / (14 + 0 / 0.28) def test_mattr(): - mattr_100 = wl_measures_lexical_diversity.mattr(main, TOKENS_100) - mattr_1000 = wl_measures_lexical_diversity.mattr(main, TOKENS_1000) + mattr_100 = wl_measures_lexical_density_diversity.mattr(main, text_tokens_100) + mattr_1000 = wl_measures_lexical_density_diversity.mattr(main, text_tokens_1000) - assert mattr_100 == wl_measures_lexical_diversity.ttr(main, TOKENS_100) + assert mattr_100 == wl_measures_lexical_density_diversity.ttr(main, text_tokens_100) assert mattr_1000 == 5 / 500 # Reference: Popescu I.-I., Mačutek, J, & Altmann, G. (2008). Word frequency and arc length. Glottometrics, 17, 21, 33. 
def test_popescu_macutek_altmanns_b1_b2_b3_b4_b5(): - b1, b2, b3, b4, b5 = wl_measures_lexical_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(main, TOKENS_225) + b1, b2, b3, b4, b5 = wl_measures_lexical_density_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(main, text_tokens_225) assert round(b1, 3) == 0.969 assert round(b2, 3) == 0.527 @@ -118,79 +132,79 @@ def test_popescu_macutek_altmanns_b1_b2_b3_b4_b5(): # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 30). Mouton de Gruyter. def test_popescus_r1(): - r1 = wl_measures_lexical_diversity.popescus_r1(main, TOKENS_225) + r1 = wl_measures_lexical_density_diversity.popescus_r1(main, text_tokens_225) assert round(r1, 4) == 0.8667 # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 39). Mouton de Gruyter. def test_popescus_r2(): - r2 = wl_measures_lexical_diversity.popescus_r2(main, TOKENS_225) + r2 = wl_measures_lexical_density_diversity.popescus_r2(main, text_tokens_225) assert round(r2, 3) == 0.871 # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 51). Mouton de Gruyter. def test_popescus_r3(): - r3 = wl_measures_lexical_diversity.popescus_r3(main, TOKENS_225) + r3 = wl_measures_lexical_density_diversity.popescus_r3(main, text_tokens_225) assert round(r3, 4) == 0.3778 # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 59). Mouton de Gruyter. def test_popescus_r4(): - r4 = wl_measures_lexical_diversity.popescus_r4(main, TOKENS_225) + r4 = wl_measures_lexical_density_diversity.popescus_r4(main, text_tokens_225) assert round(r4, 4) == 0.6344 # Reference: Popescu, I.-I. (2009). Word frequency studies (pp. 170, 172). Mouton de Gruyter. 
def test_repeat_rate(): settings['repeat_rate']['use_data'] = 'Rank-frequency distribution' - rr_distribution = wl_measures_lexical_diversity.repeat_rate(main, TOKENS_225) + rr_distribution = wl_measures_lexical_density_diversity.repeat_rate(main, text_tokens_225) settings['repeat_rate']['use_data'] = 'Frequency spectrum' - rr_spectrum = wl_measures_lexical_diversity.repeat_rate(main, TOKENS_225) + rr_spectrum = wl_measures_lexical_density_diversity.repeat_rate(main, text_tokens_225) assert round(rr_distribution, 4) == 0.0153 assert round(rr_spectrum, 4) == 0.4974 def test_rttr(): - rttr = wl_measures_lexical_diversity.rttr(main, TOKENS_100) + rttr = wl_measures_lexical_density_diversity.rttr(main, text_tokens_100) assert rttr == 5 / 100 ** 0.5 # Reference: Popescu, I.-I. (2009). Word frequency studies (pp. 176, 178). Mouton de Gruyter. def test_shannon_entropy(): settings['shannon_entropy']['use_data'] = 'Rank-frequency distribution' - h_distribution = wl_measures_lexical_diversity.shannon_entropy(main, TOKENS_225) + h_distribution = wl_measures_lexical_density_diversity.shannon_entropy(main, text_tokens_225) settings['shannon_entropy']['use_data'] = 'Frequency spectrum' - h_spectrum = wl_measures_lexical_diversity.shannon_entropy(main, TOKENS_225) + h_spectrum = wl_measures_lexical_density_diversity.shannon_entropy(main, text_tokens_225) assert round(h_distribution, 4) == 6.5270 assert round(h_spectrum, 4) == 1.6234 def test_simpsons_l(): - l = wl_measures_lexical_diversity.simpsons_l(main, TOKENS_100) + l = wl_measures_lexical_density_diversity.simpsons_l(main, text_tokens_100) assert l == (5 * 20 ** 2 - 100) / (100 * (100 - 1)) def test_ttr(): - ttr = wl_measures_lexical_diversity.ttr(main, TOKENS_100) + ttr = wl_measures_lexical_density_diversity.ttr(main, text_tokens_100) assert ttr == 5 / 100 def test_vocdd(): - vocdd_10 = wl_measures_lexical_diversity.vocdd(main, TOKENS_10) - vocdd_100 = wl_measures_lexical_diversity.vocdd(main, TOKENS_100) - vocdd_1000 = 
wl_measures_lexical_diversity.vocdd(main, TOKENS_1000) + vocdd_10 = wl_measures_lexical_density_diversity.vocdd(main, text_tokens_10) + vocdd_100 = wl_measures_lexical_density_diversity.vocdd(main, text_tokens_100) + vocdd_1000 = wl_measures_lexical_density_diversity.vocdd(main, text_tokens_1000) assert vocdd_10 > 0 assert vocdd_100 > 0 assert vocdd_1000 > 0 def test_yules_characteristic_k(): - k = wl_measures_lexical_diversity.yules_characteristic_k(main, TOKENS_100) + k = wl_measures_lexical_density_diversity.yules_characteristic_k(main, text_tokens_100) assert k == 10000 * ((5 * 20 ** 2 - 100) / (100 ** 2)) def test_yules_index_of_diversity(): - index_of_diversity = wl_measures_lexical_diversity.yules_index_of_diversity(main, TOKENS_100) + index_of_diversity = wl_measures_lexical_density_diversity.yules_index_of_diversity(main, text_tokens_100) assert index_of_diversity == (100 ** 2) / (5 * 20 ** 2 - 100) @@ -201,6 +215,7 @@ def test_yules_index_of_diversity(): test_herdans_vm() test_hdd() test_honores_stat() + test_lexical_density() test_logttr() test_msttr() test_mtld() diff --git a/tests/tests_measures/test_measures_readability.py b/tests/tests_measures/test_measures_readability.py index 557bc8340..b77fff36f 100644 --- a/tests/tests_measures/test_measures_readability.py +++ b/tests/tests_measures/test_measures_readability.py @@ -16,7 +16,6 @@ # along with this program. If not, see . 
# ---------------------------------------------------------------------- -import copy import math import numpy @@ -24,15 +23,6 @@ from tests import wl_test_init from wordless.wl_measures import wl_measures_readability -class Wl_Test_Text(): - def __init__(self, tokens_multilevel, lang = 'eng_us'): - super().__init__() - - self.main = main - self.lang = lang - self.tokens_multilevel = tokens_multilevel - self.tokens_multilevel_with_puncs = copy.deepcopy(tokens_multilevel) - main = wl_test_init.Wl_Test_Main() settings = main.settings_custom['measures']['readability'] @@ -40,54 +30,56 @@ def __init__(self, tokens_multilevel, lang = 'eng_us'): TOKENS_MULTILEVEL_12 = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['This', 'is', 'a', 'sen-tence0', '.']]]] TOKENS_MULTILEVEL_12_PREP = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['From', 'beginning', 'to', 'end', '.']]]] TOKENS_MULTILEVEL_12_PROPN = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['Louisiana', 'readability', 'boxes', 'created', '.']]]] +TOKENS_MULTILEVEL_12_HYPHEN = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]], [[['This', 'is', 'a-', 'sen-tence0', '.']]]] TOKENS_MULTILEVEL_100 = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]]] * 12 + [[[['This', 'is', 'a', 'sen-tence0', '.']]]] TOKENS_MULTILEVEL_100_PREP = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]]] * 12 + [[[['I', 'am', 'behind', 'you', '.']]]] TOKENS_MULTILEVEL_100_CONJ = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]]] * 12 + [[[['Go', 'ahead', 'and', 'turn', '.']]]] TOKENS_MULTILEVEL_120 = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'metropolis', '.']]]] * 15 TOKENS_MULTILEVEL_150 = [[[['This', 'is', 'a', 'sentence', '.']], [['This', 'is', 'a', 'sentence', '.']]]] * 18 + 
[[[['This', 'is', 'a', 'sen-tence0', 'for', 'testing', '.']]]] -test_text_eng_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0) -test_text_eng_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12) -test_text_eng_12_prep = Wl_Test_Text(TOKENS_MULTILEVEL_12_PREP) -test_text_eng_12_propn = Wl_Test_Text(TOKENS_MULTILEVEL_12_PROPN) -test_text_eng_100 = Wl_Test_Text(TOKENS_MULTILEVEL_100) -test_text_eng_100_prep = Wl_Test_Text(TOKENS_MULTILEVEL_100_PREP) -test_text_eng_100_conj = Wl_Test_Text(TOKENS_MULTILEVEL_100_CONJ) -test_text_eng_120 = Wl_Test_Text(TOKENS_MULTILEVEL_120) -test_text_eng_150 = Wl_Test_Text(TOKENS_MULTILEVEL_150) - -test_text_ara_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ara') -test_text_ara_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ara') -test_text_ara_faseeh = Wl_Test_Text([[[['\u064B\u064B\u0621']]]], lang = 'ara') - -test_text_deu_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'deu_de') -test_text_deu_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'deu_de') -test_text_deu_120 = Wl_Test_Text(TOKENS_MULTILEVEL_120, lang = 'deu_de') - -test_text_ita_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'ita') -test_text_ita_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ita') - -test_text_spa_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'spa') -test_text_spa_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'spa') -test_text_spa_100 = Wl_Test_Text(TOKENS_MULTILEVEL_100, lang = 'spa') -test_text_spa_120 = Wl_Test_Text(TOKENS_MULTILEVEL_120, lang = 'spa') -test_text_spa_150 = Wl_Test_Text(TOKENS_MULTILEVEL_150, lang = 'spa') - -test_text_tha_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'tha') -test_text_tha_100 = Wl_Test_Text(TOKENS_MULTILEVEL_100, lang = 'tha') - -test_text_vie_0 = Wl_Test_Text(TOKENS_MULTILEVEL_0, lang = 'vie') -test_text_vie_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'vie') - -test_text_afr_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'afr') -test_text_nld_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'nld') -test_text_fra_12 = 
Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'fra') -test_text_pol_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'pol') -test_text_rus_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'rus') -test_text_ukr_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'ukr') - -test_text_other_12 = Wl_Test_Text(TOKENS_MULTILEVEL_12, lang = 'other') -test_text_other_100 = Wl_Test_Text(TOKENS_MULTILEVEL_100, lang = 'other') +test_text_eng_0 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_0) +test_text_eng_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12) +test_text_eng_12_prep = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12_PREP) +test_text_eng_12_propn = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12_PROPN) +test_text_eng_12_hyphen = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12_HYPHEN) +test_text_eng_100 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_100) +test_text_eng_100_prep = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_100_PREP) +test_text_eng_100_conj = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_100_CONJ) +test_text_eng_120 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_120) +test_text_eng_150 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_150) + +test_text_ara_0 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_0, lang = 'ara') +test_text_ara_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'ara') +test_text_ara_faseeh = wl_test_init.Wl_Test_Text(main, [[[['\u064B\u064B\u0621']]]], lang = 'ara') + +test_text_deu_0 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_0, lang = 'deu_de') +test_text_deu_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'deu_de') +test_text_deu_120 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_120, lang = 'deu_de') + +test_text_ita_0 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_0, lang = 'ita') +test_text_ita_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'ita') + +test_text_spa_0 = wl_test_init.Wl_Test_Text(main, 
TOKENS_MULTILEVEL_0, lang = 'spa') +test_text_spa_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'spa') +test_text_spa_100 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_100, lang = 'spa') +test_text_spa_120 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_120, lang = 'spa') +test_text_spa_150 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_150, lang = 'spa') + +test_text_tha_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'tha') +test_text_tha_100 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_100, lang = 'tha') + +test_text_vie_0 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_0, lang = 'vie') +test_text_vie_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'vie') + +test_text_afr_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'afr') +test_text_nld_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'nld') +test_text_fra_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'fra') +test_text_pol_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'pol') +test_text_rus_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'rus') +test_text_ukr_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'ukr') + +test_text_other_12 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_12, lang = 'other') +test_text_other_100 = wl_test_init.Wl_Test_Text(main, TOKENS_MULTILEVEL_100, lang = 'other') def test_rd(): rd_ara_0 = wl_measures_readability.rd(main, test_text_ara_0) @@ -821,7 +813,7 @@ def test_td(): def test_wheeler_smiths_readability_formula(): wheeler_smith_eng_0 = wl_measures_readability.wheeler_smiths_readability_formula(main, test_text_eng_0) - wheeler_smith_eng_12 = wl_measures_readability.wheeler_smiths_readability_formula(main, test_text_eng_12) + wheeler_smith_eng_12 = wl_measures_readability.wheeler_smiths_readability_formula(main, test_text_eng_12_hyphen) wheeler_smith_spa_12 = 
wl_measures_readability.wheeler_smiths_readability_formula(main, test_text_spa_12) wheeler_smith_other_12 = wl_measures_readability.wheeler_smiths_readability_formula(main, test_text_other_12) diff --git a/tests/tests_nlp/test_lemmatization.py b/tests/tests_nlp/test_lemmatization.py index 1e2536167..d173674ec 100644 --- a/tests/tests_nlp/test_lemmatization.py +++ b/tests/tests_nlp/test_lemmatization.py @@ -94,7 +94,7 @@ def test_lemmatize(lang, lemmatizer): case 'eng_gb' | 'eng_us': match lemmatizer: case 'nltk_wordnet': - results = ['English', 'be', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] + results = ['English', 'be', 'a', 'West', 'Germanic', 'language', 'in', 'the', 'Indo', '-', 'European', 'language', 'family', '.'] case 'simplemma_eng': results = ['English', 'be', 'a', 'west', 'germanic', 'language', 'in', 'the', 'Indo-European', 'language', 'family', '.'] case _: diff --git a/tests/tests_nlp/test_pos_tagging.py b/tests/tests_nlp/test_pos_tagging.py index 9170eb506..0f693035a 100644 --- a/tests/tests_nlp/test_pos_tagging.py +++ b/tests/tests_nlp/test_pos_tagging.py @@ -42,6 +42,11 @@ test_pos_taggers.append((lang, pos_tagger)) test_pos_taggers_local.append((lang, pos_tagger)) +def test_to_content_function(): + assert wl_pos_tagging.to_content_function('ADJ') == 'Content words' + assert wl_pos_tagging.to_content_function('ADP') == 'Function words' + assert wl_pos_tagging.to_content_function('None') is None + @pytest.mark.parametrize('lang, pos_tagger', test_pos_taggers) def test_pos_tag(lang, pos_tagger): tests_lang_util_skipped = False @@ -214,6 +219,8 @@ def test_pos_tag_misc(): main.settings_custom['pos_tagging']['pos_tagger_settings']['to_universal_pos_tags'] = False if __name__ == '__main__': + test_to_content_function() + for lang, pos_tagger in test_pos_taggers_local: test_pos_tag(lang, pos_tagger) diff --git a/tests/tests_nlp/test_sentence_tokenization.py 
b/tests/tests_nlp/test_sentence_tokenization.py index d5b914e3f..903cc49c2 100644 --- a/tests/tests_nlp/test_sentence_tokenization.py +++ b/tests/tests_nlp/test_sentence_tokenization.py @@ -20,7 +20,7 @@ import pytest from tests import wl_test_init, wl_test_lang_examples -from wordless.wl_nlp import wl_sentence_tokenization, wl_texts, wl_word_tokenization +from wordless.wl_nlp import wl_sentence_tokenization from wordless.wl_utils import wl_misc _, is_macos, _ = wl_misc.check_os() @@ -140,7 +140,10 @@ def test_sentence_split(lang): text = ''.join(getattr(wl_test_lang_examples, f'TEXT_{lang.upper()}')) ) - if lang not in ['chu', 'cop', 'hbo', 'orv', 'tha', 'bod']: + if lang not in [ + 'lzh', 'zho_cn', 'zho_tw', 'chu', 'cop', 'hbo', 'isl', 'jpn', 'orv', 'srp_latn', + 'tha', 'bod' + ]: assert len(sentences_split) > 1 @pytest.mark.parametrize('lang', test_langs_split) @@ -332,32 +335,23 @@ def test_sentence_seg_tokenize(lang): case _: raise wl_test_init.Wl_Exception_Tests_Lang_Skipped(lang) -@pytest.mark.parametrize('lang', test_langs_split) -def test_sentence_seg_split(lang): - print(f'Testing {lang} / Sentence Segment Splitter...') - - sentence_segs = wl_sentence_tokenization.wl_sentence_seg_split( - main, - text = ''.join(getattr(wl_test_lang_examples, f'TEXT_{lang.upper()}')) - ) - - if lang not in ['chu', 'cop', 'orv', 'tha']: - assert len(sentence_segs) > 1 - @pytest.mark.parametrize('lang', test_langs) def test_sentence_seg_tokenize_tokens(lang): print(f'Testing {lang} / Sentence Segment Tokenizer with tokens...') - tokens = wl_word_tokenization.wl_word_tokenize_flat( - main, - text = ''.join(getattr(wl_test_lang_examples, f'TEXT_{lang.upper()}')), - lang = lang - ) - sentence_segs = wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, wl_texts.to_display_texts(tokens)) + tokens = ''.join(getattr(wl_test_lang_examples, f'TEXT_{lang.upper()}')).split() + sentence_segs = wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, tokens) - if lang not 
in ['chu', 'cop', 'orv', 'tha']: + if lang not in [ + 'lzh', 'zho_cn', 'zho_tw', 'chu', 'cop', 'jpn', 'orv', 'tha' + ]: assert len(sentence_segs) > 1 +def test_sentence_tokenize_misc(): + # Sentences and sentence segments should not be split within pre-tokenized tokens + assert wl_sentence_tokenization.wl_sentence_split(main, text = 'a.b c') == ['a.b c'] + assert wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens(main, tokens = ['a,b', 'c']) == [['a,b', 'c']] + if __name__ == '__main__': for lang, sentence_tokenizer in test_sentence_tokenizers_local: test_sentence_tokenize(lang, sentence_tokenizer) @@ -368,8 +362,7 @@ def test_sentence_seg_tokenize_tokens(lang): for lang in test_langs_split: test_sentence_seg_tokenize(lang) - for lang in test_langs_split: - test_sentence_seg_split(lang) - for lang in test_langs_split: test_sentence_seg_tokenize_tokens(lang) + + test_sentence_tokenize_misc() diff --git a/tests/tests_nlp/test_texts.py b/tests/tests_nlp/test_texts.py index ceb515b22..35b2357ca 100644 --- a/tests/tests_nlp/test_texts.py +++ b/tests/tests_nlp/test_texts.py @@ -48,6 +48,8 @@ def test_split_texts_properties(): 'lang': 'eng_us', 'syls': None, 'tag': '_NN', + 'tag_universal': None, + 'content_function': None, 'lemma': None, 'head': None, 'dependency_relation': None, @@ -97,20 +99,17 @@ def test_update_token_properties(): def test_clean_texts(): assert wl_texts.clean_texts([' test ', ' ']) == ['test'] -def test_wl_text_blank(): - wl_texts.Wl_Text_Blank() - def test_wl_text_total(): - text_1 = wl_texts.Wl_Text_Blank() - text_1.lang = 'eng_us' - text_1.tokens_multilevel = [] - text_1.tokens_multilevel_with_puncs = [] + text_1 = wl_test_init.Wl_Test_Text(main, tokens_multilevel = [], lang = 'eng_us', tagged = False) + text_2 = wl_test_init.Wl_Test_Text(main, tokens_multilevel = [], lang = 'eng_gb', tagged = True) - text_2 = copy.deepcopy(text_1) - text_2.lang = 'other' + text_total_1 = wl_texts.Wl_Text_Total(texts = [text_1, text_1]) + text_total_2 = 
wl_texts.Wl_Text_Total(texts = [text_1, text_2]) - wl_texts.Wl_Text_Total(texts = [text_1, text_1]) - wl_texts.Wl_Text_Total(texts = [text_1, text_2]) + assert text_total_1.lang == 'eng_us' + assert not text_total_1.tagged + assert text_total_2.lang == 'other' + assert text_total_2.tagged if __name__ == '__main__': test_wl_token() @@ -128,5 +127,4 @@ def test_wl_text_total(): test_update_token_properties() test_clean_texts() - test_wl_text_blank() test_wl_text_total() diff --git a/tests/tests_settings/test_settings_default.py b/tests/tests_settings/test_settings_default.py index fb85d8cc7..c5f1ffe9f 100644 --- a/tests/tests_settings/test_settings_default.py +++ b/tests/tests_settings/test_settings_default.py @@ -24,5 +24,10 @@ def test_settings_default(): assert wl_settings_default.init_settings_default(main) + # Check for invalid conversion of universal POS tags into content/function words + for mappings in main.settings_default['pos_tagging']['tagsets']['mapping_settings'].values(): + for mapping in mappings.values(): + assert all(len(pos_mapping) == 5 for pos_mapping in mapping) + if __name__ == '__main__': test_settings_default() diff --git a/tests/tests_settings/test_settings_measures.py b/tests/tests_settings/test_settings_measures.py index 8d16c8260..2d30b2225 100644 --- a/tests/tests_settings/test_settings_measures.py +++ b/tests/tests_settings/test_settings_measures.py @@ -28,11 +28,11 @@ def test_wl_settings_measures_readability(): settings_measures_readability.load_settings(defaults = True) settings_measures_readability.apply_settings() -def test_wl_settings_measures_lexical_diversity(): - settings_measures_lexical_diversity = wl_settings_measures.Wl_Settings_Measures_Lexical_Diversity(main) - settings_measures_lexical_diversity.load_settings() - settings_measures_lexical_diversity.load_settings(defaults = True) - settings_measures_lexical_diversity.apply_settings() +def test_wl_settings_measures_lexical_density_diversity(): + 
settings_measures_lexical_density_diversity = wl_settings_measures.Wl_Settings_Measures_Lexical_Density_Diversity(main) + settings_measures_lexical_density_diversity.load_settings() + settings_measures_lexical_density_diversity.load_settings(defaults = True) + settings_measures_lexical_density_diversity.apply_settings() def test_wl_settings_measures_dispersion(): settings_measures_dispersion = wl_settings_measures.Wl_Settings_Measures_Dispersion(main) @@ -66,7 +66,7 @@ def test_wl_settings_measures_effect_size(): if __name__ == '__main__': test_wl_settings_measures_readability() - test_wl_settings_measures_lexical_diversity() + test_wl_settings_measures_lexical_density_diversity() test_wl_settings_measures_dispersion() test_wl_settings_measures_adjusted_freq() test_wl_settings_measures_statistical_significance() diff --git a/tests/tests_settings/test_settings_pos_tagging.py b/tests/tests_settings/test_settings_pos_tagging.py index 82dd1eb3b..9992dfb30 100644 --- a/tests/tests_settings/test_settings_pos_tagging.py +++ b/tests/tests_settings/test_settings_pos_tagging.py @@ -42,7 +42,7 @@ def test_wl_settings_pos_tagging_tagsets(): settings_pos_tagging_tagsets.preview_lang_changed() settings_pos_tagging_tagsets.preview_pos_tagger_changed() - settings_pos_tagging_tagsets.update_gui(['test', 'test', 'test', 'test']) + settings_pos_tagging_tagsets.update_gui([['test', 'test', 'test', 'test', 'test']]) main.settings_custom['pos_tagging']['tagsets']['preview_settings']['preview_pos_tagger']['eng_us'] = 'nltk_perceptron_eng' settings_pos_tagging_tagsets.reset_currently_shown_table() diff --git a/tests/wl_test_file_area.py b/tests/wl_test_file_area.py index 1b5e06bfa..b4dd226fd 100644 --- a/tests/wl_test_file_area.py +++ b/tests/wl_test_file_area.py @@ -118,17 +118,24 @@ def update_gui_ref(err_msg, new_files): assert new_file['path_original'] == wl_paths.get_normalized_path(file_path) - if i < NUM_FILES_ALL or new_file['name'] == '[eng_gb] Tagged': + if i < NUM_FILES_ALL or 
new_file['name'] in ['[amh] No language support', '[eng_gb] Tagged']: assert new_file['encoding'] == 'utf_8' else: assert new_file['encoding'] == 'ascii' - assert new_file['lang'] == 'eng_us' + if new_file['name'] == '[amh] No language support': + assert new_file['lang'] == 'other' + else: + assert new_file['lang'] == 'eng_us' + assert not new_file['tokenized'] assert not new_file['tagged'] + if new_file['name'] == '[amh] No language support': + new_file['lang'] = new_file['text'].lang = 'amh' + if new_file['name'] == '[eng_gb] Tagged': - new_file['tagged'] = True + new_file['tagged'] = new_file['text'].tagged = True print(f'done! (In {round(time.time() - time_start, 2)} seconds)') diff --git a/tests/wl_test_init.py b/tests/wl_test_init.py index 56f4471c9..f63fc35dd 100644 --- a/tests/wl_test_init.py +++ b/tests/wl_test_init.py @@ -33,11 +33,13 @@ from tests import wl_test_file_area from wordless import wl_file_area from wordless.wl_checks import wl_checks_misc +from wordless.wl_nlp import wl_texts from wordless.wl_settings import wl_settings, wl_settings_default, wl_settings_global from wordless.wl_utils import wl_misc from wordless.wl_widgets import wl_tables -SEARCH_TERMS = ['take'] +# English, Amharic +SEARCH_TERMS = ['take', 'አማርኛ'] # An instance of QApplication must be created before any instance of QWidget wl_app = QApplication(sys.argv) @@ -220,14 +222,6 @@ def switch_lang_utils_stanza(self): break -class Wl_Exception_Tests_Lang_Skipped(Exception): - def __init__(self, lang): - super().__init__(f'Tests for language "{lang}" is skipped!') - -class Wl_Exception_Tests_Lang_Util_Skipped(Exception): - def __init__(self, lang_util): - super().__init__(f'Tests for language utility "{lang_util}" is skipped!') - class Wl_Test_Table(QTableView): def __init__(self, parent, tab = ''): super().__init__(parent) @@ -252,6 +246,38 @@ def set_label(self, row, col, text): self.setIndexWidget(self.model().index(row, col), QLabel(text)) self.indexWidget(self.model().index(row, 
col)).tokens_raw = [text] +class Wl_Test_Text: + def __init__(self, main, tokens_multilevel, lang = 'eng_us', tagged = False): + self.main = main + self.lang = lang + self.tagged = tagged + + self.tokens_multilevel = [] + + for para in tokens_multilevel: + self.tokens_multilevel.append([]) + + for sentence in para: + self.tokens_multilevel[-1].append([]) + + for sentence_seg in sentence: + self.tokens_multilevel[-1][-1].append(wl_texts.to_tokens(sentence_seg, lang = lang)) + + self.tokens_multilevel_with_puncs = copy.deepcopy(tokens_multilevel) + + self.get_tokens_flat = lambda: wl_texts.Wl_Text.get_tokens_flat(self) + self.update_num_tokens = lambda: wl_texts.Wl_Text.update_num_tokens(self) + + self.update_num_tokens() + +class Wl_Exception_Tests_Lang_Skipped(Exception): + def __init__(self, lang): + super().__init__(f'Tests for language "{lang}" is skipped!') + +class Wl_Exception_Tests_Lang_Util_Skipped(Exception): + def __init__(self, lang_util): + super().__init__(f'Tests for language utility "{lang_util}" is skipped!') + def wl_test_index(row, col): return QStandardItemModel().createIndex(row, col) diff --git a/wordless/wl_colligation_extractor.py b/wordless/wl_colligation_extractor.py index fd03fa676..1e96bbbf8 100644 --- a/wordless/wl_colligation_extractor.py +++ b/wordless/wl_colligation_extractor.py @@ -891,7 +891,6 @@ def __init__(self, main, dialog_progress, update_gui): def run(self): try: - texts = [] colligations_freqs_files_all = [] settings = self.main.settings_custom['colligation_extractor'] @@ -1091,15 +1090,11 @@ def run(self): # Frequency (All) colligations_freqs_files_all.append(colligations_freqs_file_all) - texts.append(text) - # Total if len(files) > 1: colligations_freqs_total = {} colligations_freqs_total_all = {} - texts.append(wl_texts.Wl_Text_Blank()) - # Frequency for colligations_freqs_file in self.colligations_freqs_files: for colligation, freqs in colligations_freqs_file.items(): @@ -1132,8 +1127,7 @@ def run(self): # Used for 
z-score (Berry-Rogghe) span = (abs(window_left) + abs(window_right)) / 2 - for text, colligations_freqs_file, colligations_freqs_file_all in zip( - texts, + for colligations_freqs_file, colligations_freqs_file_all in zip( self.colligations_freqs_files, colligations_freqs_files_all ): diff --git a/wordless/wl_collocation_extractor.py b/wordless/wl_collocation_extractor.py index 6ca9dd4fe..5173d7c56 100644 --- a/wordless/wl_collocation_extractor.py +++ b/wordless/wl_collocation_extractor.py @@ -888,7 +888,6 @@ def __init__(self, main, dialog_progress, update_gui): def run(self): try: - texts = [] collocations_freqs_files_all = [] settings = self.main.settings_custom['collocation_extractor'] @@ -1088,12 +1087,8 @@ def run(self): # Frequency (All) collocations_freqs_files_all.append(collocations_freqs_file_all) - texts.append(text) - # Total if len(files) > 1: - texts.append(wl_texts.Wl_Text_Blank()) - collocations_freqs_total = {} collocations_freqs_total_all = {} @@ -1129,8 +1124,7 @@ def run(self): # Used for z-score (Berry-Rogghe) span = (abs(window_left) + abs(window_right)) / 2 - for text, collocations_freqs_file, collocations_freqs_file_all in zip( - texts, + for collocations_freqs_file, collocations_freqs_file_all in zip( self.collocations_freqs_files, collocations_freqs_files_all ): diff --git a/wordless/wl_measures/wl_measures_lexical_diversity.py b/wordless/wl_measures/wl_measures_lexical_density_diversity.py similarity index 69% rename from wordless/wl_measures/wl_measures_lexical_diversity.py rename to wordless/wl_measures/wl_measures_lexical_density_diversity.py index 0a53437ea..099a59b65 100644 --- a/wordless/wl_measures/wl_measures_lexical_diversity.py +++ b/wordless/wl_measures/wl_measures_lexical_density_diversity.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------- -# Wordless: Measures - Lexical diversity +# Wordless: Measures - Lexical density/diversity # Copyright (C) 2018-2024 Ye Lei (叶磊) # # This program is 
free software: you can redistribute it and/or modify @@ -25,7 +25,7 @@ from PyQt5.QtCore import QCoreApplication import scipy -from wordless.wl_nlp import wl_nlp_utils +from wordless.wl_nlp import wl_nlp_utils, wl_pos_tagging _tr = QCoreApplication.translate @@ -33,35 +33,32 @@ # References: # Brunét, E. (1978). Le vocabulaire de Jean Giraudoux: Structure et evolution. Slatkine. # Bucks, R. S., Singh, S., Cuerden, J. M., & Wilcock, G. K. (2000). Analysis of spontaneous, conversational speech in dementia of Alzheimer type: Evaluation of an objective technique for analysing lexical performance. Aphasiology, 14(1), 71–91. https://doi.org/10.1080/026870300401603 -def brunets_index(main, tokens): - return numpy.power(len(tokens), numpy.power(len(set(tokens)), -0.165)) +def brunets_index(main, text): + return numpy.power(text.num_tokens, numpy.power(text.num_types, -0.165)) # Corrected TTR # References: # Carroll, J. B. (1964). Language and thought. Prentice-Hall. # Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004). Lexical diversity and language development: Quantification and assessment (p. 26). Palgrave Macmillan. -def cttr(main, tokens): - return len(set(tokens)) / numpy.sqrt(2 * len(tokens)) +def cttr(main, text): + return text.num_types / numpy.sqrt(2 * text.num_tokens) # Fisher's Index of Diversity # Reference: Fisher, R. A., Steven, A. C., & Williams, C. B. (1943). The relation between the number of species and the number of individuals in a random sample of an animal population. Journal of Animal Ecology, 12(1), 42–58. 
https://doi.org/10.2307/1411 -def fishers_index_of_diversity(main, tokens): - num_tokens = len(tokens) - num_types = len(set(tokens)) - +def fishers_index_of_diversity(main, text): lambertw_x = -( - numpy.exp(-(num_types / num_tokens)) - * num_types - / num_tokens + numpy.exp(-(text.num_types / text.num_tokens)) + * text.num_types + / text.num_tokens ) if lambertw_x > -numpy.exp(-1): alpha = -( - (num_tokens * num_types) + (text.num_tokens * text.num_types) / ( - num_tokens + text.num_tokens * scipy.special.lambertw(lambertw_x, -1).real - + num_types + + text.num_types ) ) else: @@ -71,33 +68,30 @@ def fishers_index_of_diversity(main, tokens): # Herdan's Vₘ # Reference: Herdan, G. (1955). A new derivation and interpretation of Yule's ‘Characteristic’ K. Zeitschrift für Angewandte Mathematik und Physik (ZAMP), 6(4), 332–339. https://doi.org/10.1007/BF01587632 -def herdans_vm(main, tokens): - num_tokens = len(tokens) - types_freqs = collections.Counter(tokens) - num_types = len(types_freqs) +def herdans_vm(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) freqs_nums_types = collections.Counter(types_freqs.values()) freqs = numpy.array(list(freqs_nums_types)) nums_types = numpy.array(list(freqs_nums_types.values())) s2 = numpy.sum(nums_types * numpy.square(freqs)) - vm = s2 / (num_tokens ** 2) - 1 / num_types + vm = s2 / (text.num_tokens ** 2) - 1 / text.num_types return vm # HD-D # Reference: McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation study of sophisticated approaches to lexical diversity assessment. Behavior Research Methods, 42(2), 381–392. 
https://doi.org/10.3758/BRM.42.2.381 -def hdd(main, tokens): - sample_size = main.settings_custom['measures']['lexical_diversity']['hdd']['sample_size'] +def hdd(main, text): + sample_size = main.settings_custom['measures']['lexical_density_diversity']['hdd']['sample_size'] - num_tokens = len(tokens) - tokens_freqs = collections.Counter(tokens) + tokens_freqs = collections.Counter(text.get_tokens_flat()) ttrs = numpy.empty(len(list(tokens_freqs))) # Short texts - sample_size = min(sample_size, num_tokens) + sample_size = min(sample_size, text.num_tokens) for i, freq in enumerate(tokens_freqs.values()): - ttrs[i] = scipy.stats.hypergeom.pmf(k = 0, M = num_tokens, n = freq, N = sample_size) + ttrs[i] = scipy.stats.hypergeom.pmf(k = 0, M = text.num_tokens, n = freq, N = sample_size) # The probability that each type appears at least once in the sample ttrs = 1 - ttrs @@ -105,23 +99,36 @@ def hdd(main, tokens): return sum(ttrs) -# Honoré's statistic +# Honoré's Statistic # References: # Honoré, A. (1979). Some simple measures of richness of vocabulary. Association of Literary and Linguistic Computing Bulletin, 7(2), 172–177. # Bucks, R. S., Singh, S., Cuerden, J. M., & Wilcock, G. K. (2000). Analysis of spontaneous, conversational speech in dementia of Alzheimer type: Evaluation of an objective technique for analysing lexical performance. Aphasiology, 14(1), 71–91. 
https://doi.org/10.1080/026870300401603 -def honores_stat(main, tokens): - num_tokens = len(tokens) - types_freqs = collections.Counter(tokens) - num_types = len(types_freqs) +def honores_stat(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) freqs_nums_types = collections.Counter(types_freqs.values()) - if (denominator := 1 - freqs_nums_types[1] / num_types): - r = 100 * numpy.log(num_tokens / denominator) + if (denominator := 1 - freqs_nums_types[1] / text.num_types): + r = 100 * numpy.log(text.num_tokens / denominator) else: r = 0 return r +# Lexical Density +# Reference: Halliday, M. A. K. (1989). Spoken and written language (2nd ed., p. 64). +def lexical_density(main, text): + if text.lang in main.settings_global['pos_taggers']: + wl_pos_tagging.wl_pos_tag_universal(main, text.get_tokens_flat(), lang = text.lang, tagged = text.tagged) + + num_content_words = sum((1 for token in text.get_tokens_flat() if token.content_function == _tr('wl_measures_lexical_density_diversity', 'Content words'))) + num_tokens = text.num_tokens + + lexical_density = num_content_words / num_tokens if num_tokens else 0 + else: + lexical_density = 'no_support' + + return lexical_density + # LogTTR # Herdan: # Herdan, G. (1960). Type-token mathematics: A textbook of mathematical linguistics (p. 28). Mouton. @@ -137,22 +144,19 @@ def honores_stat(main, tokens): # Dugast, D. (1978). Sur quoi se fonde la notion d’étendue théoretique du vocabulaire?. Le Français Moderne, 46, 25–32. # Dugast, D. (1979). Vocabulaire et stylistique: I théâtre et dialogue, travaux de linguistique quantitative. Slatkine. # Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004). Lexical diversity and language development: Quantification and assessment (p. 28). Palgrave Macmillan. 
-def logttr(main, tokens): - variant = main.settings_custom['measures']['lexical_diversity']['logttr']['variant'] - - num_types = len(set(tokens)) - num_tokens = len(tokens) +def logttr(main, text): + variant = main.settings_custom['measures']['lexical_density_diversity']['logttr']['variant'] if variant == 'Herdan': - logttr = numpy.log(num_types) / numpy.log(num_tokens) + logttr = numpy.log(text.num_types) / numpy.log(text.num_tokens) elif variant == 'Somers': - logttr = numpy.log(numpy.log(num_types)) / numpy.log(numpy.log(num_tokens)) + logttr = numpy.log(numpy.log(text.num_types)) / numpy.log(numpy.log(text.num_tokens)) elif variant == 'Rubet': - logttr = numpy.log(num_types) / numpy.log(numpy.log(num_tokens)) + logttr = numpy.log(text.num_types) / numpy.log(numpy.log(text.num_tokens)) elif variant == 'Maas': - logttr = (numpy.log(num_tokens) - numpy.log(num_types)) / (numpy.log(num_tokens) ** 2) + logttr = (numpy.log(text.num_tokens) - numpy.log(text.num_types)) / (numpy.log(text.num_tokens) ** 2) elif variant == 'Dugast': - logttr = (numpy.log(num_tokens) ** 2) / (numpy.log(num_tokens) - numpy.log(num_types)) + logttr = (numpy.log(text.num_tokens) ** 2) / (numpy.log(text.num_tokens) - numpy.log(text.num_types)) return logttr @@ -160,12 +164,12 @@ def logttr(main, tokens): # References: # Johnson, W. (1944). Studies in language behavior: I. a program of research. Psychological Monographs, 56(2), 1–15. https://doi.org/10.1037/h0093508 # McCarthy, P. M. (2005). An assessment of the range and usefulness of lexical diversity measures and the potential of the measure of textual, lexical diversity (MTLD) [Doctoral dissertation, The University of Memphis] (p. 37). ProQuest Dissertations and Theses Global. 
-def msttr(main, tokens): - num_tokens_seg = main.settings_custom['measures']['lexical_diversity']['msttr']['num_tokens_in_each_seg'] +def msttr(main, text): + num_tokens_seg = main.settings_custom['measures']['lexical_density_diversity']['msttr']['num_tokens_in_each_seg'] ttrs = [ len(set(tokens_seg)) / num_tokens_seg - for tokens_seg in wl_nlp_utils.to_sections_unequal(tokens, num_tokens_seg) + for tokens_seg in wl_nlp_utils.to_sections_unequal(text.get_tokens_flat(), num_tokens_seg) # Discard the last segment of text if its length is shorter than other segments if len(tokens_seg) == num_tokens_seg ] @@ -181,10 +185,10 @@ def msttr(main, tokens): # References: # McCarthy, P. M. (2005). An assessment of the range and usefulness of lexical diversity measures and the potential of the measure of textual, lexical diversity (MTLD) [Doctoral dissertation, The University of Memphis] (pp. 95–96, 99–100). ProQuest Dissertations and Theses Global. # McCarthy, P. M., & Jarvis, S. (2010). MTLD, vocd-D, and HD-D: A validation study of sophisticated approaches to lexical diversity assessment. Behavior Research Methods, 42(2), 381–392. https://doi.org/10.3758/BRM.42.2.381 -def mtld(main, tokens): +def mtld(main, text): mtlds = numpy.empty(shape = 2) - factor_size = main.settings_custom['measures']['lexical_diversity']['mtld']['factor_size'] - num_tokens = len(tokens) + factor_size = main.settings_custom['measures']['lexical_density_diversity']['mtld']['factor_size'] + tokens = text.get_tokens_flat() for i in range(2): num_factors = 0 @@ -204,12 +208,12 @@ def mtld(main, tokens): counter.clear() # The last incomplete factor - elif j == num_tokens - 1: + elif j == text.num_tokens - 1: if factor_size < 1: num_factors += (1 - ttr) / (1 - factor_size) if num_factors: - mtlds[i] = num_tokens / num_factors + mtlds[i] = text.num_tokens / num_factors else: mtlds[i] = 0 @@ -217,12 +221,12 @@ def mtld(main, tokens): # Moving-average TTR # Reference: Covington, M. A., & McFall, J. D. 
(2010). Cutting the Gordian knot: The moving-average type-token ratio (MATTR). Journal of Quantitative Linguistics, 17(2), 94–100. https://doi.org/10.1080/09296171003643098 -def mattr(main, tokens): - window_size = main.settings_custom['measures']['lexical_diversity']['mattr']['window_size'] +def mattr(main, text): + window_size = main.settings_custom['measures']['lexical_density_diversity']['mattr']['window_size'] - num_tokens = len(tokens) - num_windows = max(1, num_tokens - window_size + 1) + num_windows = max(1, text.num_tokens - window_size + 1) ttrs = numpy.empty(shape = num_windows) + tokens = text.get_tokens_flat() counter = collections.Counter(tokens[:window_size]) @@ -244,15 +248,14 @@ def mattr(main, tokens): # Popescu-Mačutek-Altmann's B₁/B₂/B₃/B₄/B₅ # Reference: Popescu I.-I., Mačutek, J, & Altmann, G. (2008). Word frequency and arc length. Glottometrics, 17, 18–42. -def popescu_macutek_altmanns_b1_b2_b3_b4_b5(main, tokens): - types_freqs = collections.Counter(tokens) - num_types = len(types_freqs) +def popescu_macutek_altmanns_b1_b2_b3_b4_b5(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) freqs = numpy.array(sorted(types_freqs.values(), reverse = True)) freqs_nums_types = collections.Counter(types_freqs.values()) l = numpy.sum(numpy.sqrt(numpy.square(freqs[:-1] - freqs[1:]) + 1)) - l_min = numpy.sqrt(numpy.square(num_types - 1) + numpy.square(freqs[0] - 1)) - l_max = numpy.sqrt(numpy.square(freqs[0] - 1) + 1) + num_types - 2 + l_min = numpy.sqrt(numpy.square(text.num_types - 1) + numpy.square(freqs[0] - 1)) + l_max = numpy.sqrt(numpy.square(freqs[0] - 1) + 1) + text.num_types - 2 b1 = l / l_max @@ -261,7 +264,7 @@ def popescu_macutek_altmanns_b1_b2_b3_b4_b5(main, tokens): else: b2 = 0 - b3 = (num_types - 1) / l + b3 = (text.num_types - 1) / l b4 = (freqs[0] - 1) / l b5 = freqs_nums_types[1] / l @@ -269,12 +272,10 @@ def popescu_macutek_altmanns_b1_b2_b3_b4_b5(main, tokens): # Popescu's R₁ # Reference: Popescu, I.-I. (2009). 
Word frequency studies (pp. 18, 30, 33). Mouton de Gruyter. -def popescus_r1(main, tokens): - num_tokens = len(tokens) - types_freqs = collections.Counter(tokens) - num_types = len(types_freqs) - ranks = numpy.empty(shape = num_types) - freqs = numpy.empty(shape = num_types) +def popescus_r1(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) + ranks = numpy.empty(shape = text.num_types) + freqs = numpy.empty(shape = text.num_types) for i, freq in enumerate(sorted(types_freqs.values(), reverse = True)): ranks[i] = i + 1 @@ -298,16 +299,15 @@ def popescus_r1(main, tokens): r_min = ranks[i_min] h = (c_min * r_max - c_max * r_min) / (c_min - c_max) - f_h = numpy.sum(freqs[:int(numpy.floor(h))]) / num_tokens - r1 = 1 - (f_h - numpy.square(h) / (2 * num_tokens)) + f_h = numpy.sum(freqs[:int(numpy.floor(h))]) / text.num_tokens + r1 = 1 - (f_h - numpy.square(h) / (2 * text.num_tokens)) return r1 # Popescu's R₂ # Reference: Popescu, I.-I. (2009). Word frequency studies (pp. 35–36, 38). Mouton de Gruyter. -def popescus_r2(main, tokens): - num_types_all = len(set(tokens)) - types_freqs = collections.Counter(tokens) +def popescus_r2(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) freqs_nums_types = sorted(collections.Counter(types_freqs.values()).items()) freqs = numpy.array([freq for freq, _ in freqs_nums_types]) nums_types = numpy.array([num_types for _, num_types in freqs_nums_types]) @@ -334,24 +334,22 @@ def popescus_r2(main, tokens): else: k = 0 - g_k = numpy.sum([num_types for freq, num_types in freqs_nums_types if freq <= numpy.floor(k)]) / num_types_all - r2 = g_k - numpy.square(k) / (2 * num_types_all) + g_k = numpy.sum([num_types for freq, num_types in freqs_nums_types if freq <= numpy.floor(k)]) / text.num_types + r2 = g_k - numpy.square(k) / (2 * text.num_types) return r2 # Popescu's R₃ # Reference: Popescu, I.-I. (2009). Word frequency studies (pp. 48–49, 53). Mouton de Gruyter. 
-def popescus_r3(main, tokens): - num_tokens = len(tokens) - num_types = len(set(tokens)) - types_freqs = collections.Counter(tokens) +def popescus_r3(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) ranks_freqs = [ (i + 1, freq) for i, freq in enumerate(sorted(types_freqs.values(), reverse = True)) ] - rs_rel = numpy.empty(shape = num_types) - fs_rel = numpy.empty(shape = num_types) + rs_rel = numpy.empty(shape = text.num_types) + fs_rel = numpy.empty(shape = text.num_types) freq_cum = 0 for i, (rank, freq) in enumerate(ranks_freqs): @@ -360,8 +358,8 @@ def popescus_r3(main, tokens): rs_rel[i] = rank fs_rel[i] = freq_cum - rs_rel /= num_types - fs_rel /= num_tokens + rs_rel /= text.num_types + fs_rel /= text.num_tokens drs = numpy.sqrt(numpy.square(rs_rel) + numpy.square(1 - fs_rel)) m = numpy.argmin(drs) + 1 # m refers to rank @@ -372,39 +370,35 @@ def popescus_r3(main, tokens): # Popescu's R₄ # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 57). Mouton de Gruyter. -def popescus_r4(main, tokens): - num_tokens = len(tokens) - num_types = len(set(tokens)) - types_freqs = collections.Counter(tokens) +def popescus_r4(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) - ranks = numpy.empty(shape = num_types) - freqs = numpy.empty(shape = num_types) + ranks = numpy.empty(shape = text.num_types) + freqs = numpy.empty(shape = text.num_types) for i, freq in enumerate(sorted(types_freqs.values(), reverse = True)): ranks[i] = i + 1 freqs[i] = freq - r4 = 1 - (num_types + 1 - 2 / num_tokens * numpy.sum(ranks * freqs)) / num_types + r4 = 1 - (text.num_types + 1 - 2 / text.num_tokens * numpy.sum(ranks * freqs)) / text.num_types return r4 # Repeat Rate # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 166). Mouton de Gruyter. 
-def repeat_rate(main, tokens): - use_data = main.settings_custom['measures']['lexical_diversity']['repeat_rate']['use_data'] +def repeat_rate(main, text): + use_data = main.settings_custom['measures']['lexical_density_diversity']['repeat_rate']['use_data'] - num_tokens = len(tokens) - num_types = len(set(tokens)) - types_freqs = collections.Counter(tokens) + types_freqs = collections.Counter(text.get_tokens_flat()) - if use_data == _tr('wl_measures_lexical_diversity', 'Rank-frequency distribution'): + if use_data == _tr('wl_measures_lexical_density_diversity', 'Rank-frequency distribution'): freqs = numpy.array(list(types_freqs.values())) - rr = numpy.sum(numpy.square(freqs)) / numpy.square(num_tokens) - elif use_data == _tr('wl_measures_lexical_diversity', 'Frequency spectrum'): + rr = numpy.sum(numpy.square(freqs)) / numpy.square(text.num_tokens) + elif use_data == _tr('wl_measures_lexical_density_diversity', 'Frequency spectrum'): nums_types = numpy.array(list(collections.Counter(types_freqs.values()).values())) - rr = numpy.sum(numpy.square(nums_types)) / numpy.square(num_types) + rr = numpy.sum(numpy.square(nums_types)) / numpy.square(text.num_types) return rr @@ -412,24 +406,22 @@ def repeat_rate(main, tokens): # References: # Guiraud, P. (1954). Les caractères statistiques du vocabulaire: Essai de méthodologie. Presses universitaires de France. # Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004). Lexical diversity and language development: Quantification and assessment (p. 26). Palgrave Macmillan. -def rttr(main, tokens): - return len(set(tokens)) / numpy.sqrt(len(tokens)) +def rttr(main, text): + return text.num_types / numpy.sqrt(text.num_tokens) # Shannon Entropy # Reference: Popescu, I.-I. (2009). Word frequency studies (p. 173). Mouton de Gruyter. 
-def shannon_entropy(main, tokens): - use_data = main.settings_custom['measures']['lexical_diversity']['shannon_entropy']['use_data'] +def shannon_entropy(main, text): + use_data = main.settings_custom['measures']['lexical_density_diversity']['shannon_entropy']['use_data'] - num_tokens = len(tokens) - num_types = len(set(tokens)) - types_freqs = collections.Counter(tokens) + types_freqs = collections.Counter(text.get_tokens_flat()) - if use_data == _tr('wl_measures_lexical_diversity', 'Rank-frequency distribution'): + if use_data == _tr('wl_measures_lexical_density_diversity', 'Rank-frequency distribution'): freqs = numpy.array(list(types_freqs.values())) - ps = freqs / num_tokens - elif use_data == _tr('wl_measures_lexical_diversity', 'Frequency spectrum'): + ps = freqs / text.num_tokens + elif use_data == _tr('wl_measures_lexical_density_diversity', 'Frequency spectrum'): nums_types = numpy.array(list(collections.Counter(types_freqs.values()).values())) - ps = nums_types / num_types + ps = nums_types / text.num_types h = -numpy.sum(ps * numpy.log2(ps)) @@ -437,37 +429,36 @@ def shannon_entropy(main, tokens): # Simpson's l # Reference: Simpson, E. H. (1949). Measurement of diversity. Nature, 163, p. 688. https://doi.org/10.1038/163688a0 -def simpsons_l(main, tokens): - num_tokens = len(tokens) - types_freqs = collections.Counter(tokens) +def simpsons_l(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) freqs_nums_types = collections.Counter(types_freqs.values()) freqs = numpy.array(list(freqs_nums_types)) nums_types = numpy.array(list(freqs_nums_types.values())) s2 = numpy.sum(nums_types * numpy.square(freqs)) - l = (s2 - num_tokens) / (num_tokens * (num_tokens - 1)) + l = (s2 - text.num_tokens) / (text.num_tokens * (text.num_tokens - 1)) return l # Type-token Ratio # Reference: Johnson, W. (1944). Studies in language behavior: I. a program of research. Psychological Monographs, 56(2), 1–15. 
https://doi.org/10.1037/h0093508 -def ttr(main, tokens): - return len(set(tokens)) / len(tokens) +def ttr(main, text): + return text.num_types / text.num_tokens # vocd-D # Reference: Malvern, D., Richards, B., Chipere, N., & Durán, P. (2004). Lexical diversity and language development: Quantification and assessment (pp. 51, 56–57). Palgrave Macmillan. -def vocdd(main, tokens): +def vocdd(main, text): def ttr(n, d): return (d / n) * (numpy.sqrt(1 + 2 * n / d) - 1) - num_tokens = len(tokens) + tokens = text.get_tokens_flat() ttr_ys = numpy.empty(shape = 16) for i, n in enumerate(range(35, 51)): ttrs = numpy.empty(shape = 100) for j in range(100): - if n <= num_tokens: + if n <= text.num_tokens: sample = random.sample(tokens, k = n) else: sample = tokens @@ -486,31 +477,29 @@ def ttr(n, d): # Yule's Characteristic K # Reference: Yule, G. U. (1944). The statistical study of literary vocabulary (pp. 52–53). Cambridge University Press. -def yules_characteristic_k(main, tokens): - num_tokens = len(tokens) - types_freqs = collections.Counter(tokens) +def yules_characteristic_k(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) freqs_nums_types = collections.Counter(types_freqs.values()) freqs = numpy.array(list(freqs_nums_types)) nums_types = numpy.array(list(freqs_nums_types.values())) s2 = numpy.sum(nums_types * numpy.square(freqs)) - k = 10000 * ((s2 - num_tokens) / (num_tokens ** 2)) + k = 10000 * ((s2 - text.num_tokens) / (text.num_tokens ** 2)) return k # Yule's Index of Diversity # Reference: Williams, C. B. (1970). Style and vocabulary: Numerical studies (p. 100). Griffin. 
-def yules_index_of_diversity(main, tokens): - num_tokens = len(tokens) - types_freqs = collections.Counter(tokens) +def yules_index_of_diversity(main, text): + types_freqs = collections.Counter(text.get_tokens_flat()) freqs_nums_types = collections.Counter(types_freqs.values()) freqs = numpy.array(list(freqs_nums_types)) nums_types = numpy.array(list(freqs_nums_types.values())) s2 = numpy.sum(nums_types * numpy.square(freqs)) - if (divisor := s2 - num_tokens): - index_of_diversity = (num_tokens ** 2) / divisor + if (divisor := s2 - text.num_tokens): + index_of_diversity = (text.num_tokens ** 2) / divisor else: index_of_diversity = 0 diff --git a/wordless/wl_measures/wl_measures_readability.py b/wordless/wl_measures/wl_measures_readability.py index 7a7be94a8..b546f565c 100644 --- a/wordless/wl_measures/wl_measures_readability.py +++ b/wordless/wl_measures/wl_measures_readability.py @@ -17,6 +17,7 @@ # ---------------------------------------------------------------------- import bisect +import copy import math import random import re @@ -42,11 +43,11 @@ def get_nums(main, text): text.words_multilevel[-1].append([]) for sentence_seg in sentence: - text.words_multilevel[-1][-1].append(wl_texts.to_tokens([ + text.words_multilevel[-1][-1].append(copy.deepcopy([ token for token in sentence_seg if wl_checks_tokens.is_word_alphanumeric(token) - ], lang = text.lang)) + ])) text.sentences = [ list(wl_misc.flatten_list(sentence)) @@ -63,9 +64,8 @@ def get_nums(main, text): # Number of syllables if 'num_syls' not in text.__dict__ and text.lang in main.settings_global['syl_tokenizers']: - text.words_flat = wl_syl_tokenization.wl_syl_tokenize(main, text.words_flat, lang = text.lang) - text.syls_words = wl_texts.get_token_properties(text.words_flat, 'syls') - text.num_syls = sum((len(syls) for syls in text.syls_words)) + wl_syl_tokenization.wl_syl_tokenize(main, text.words_flat, lang = text.lang) + text.num_syls = sum((len(word.syls) for word in text.words_flat)) # Number of 
characters if 'num_chars_all' not in text.__dict__: @@ -99,34 +99,31 @@ def get_num_words_ltrs(words, len_min = 1, len_max = None): if len([char for char in word if char.isalpha()]) >= len_min ]) -def get_num_words_syls(syls_words, len_min = 1, len_max = None): +def get_num_words_syls(words, len_min = 1, len_max = None): if len_max: return sum(( 1 - for syls in syls_words - if len_min <= len(syls) <= len_max + for word in words + if len_min <= len(word.syls) <= len_max )) else: return sum(( 1 - for syls in syls_words - if len(syls) >= len_min + for word in words + if len(word.syls) >= len_min )) -def get_num_words_pos_tags(main, words, lang, pos_tag): - words = wl_pos_tagging.wl_pos_tag(main, words, lang = lang, tagset = 'universal', force = True) - - return sum((1 for word in words if pos_tag in word.tag)) - -def get_nums_words_pos_tags(main, words, lang, pos_tags): - nums = [] +def pos_tag_words(main, text): + text.words_flat = wl_pos_tagging.wl_pos_tag_universal(main, text.words_flat, lang = text.lang, tagged = text.tagged) - words = wl_pos_tagging.wl_pos_tag(main, words, lang = lang, tagset = 'universal', force = True) +def get_num_words_pos_tag(words, pos_tag): + return sum((1 for word in words if pos_tag in word.tag_universal.split('/'))) - for pos_tag in pos_tags: - nums.append(sum((1 for word in words if pos_tag in word.tag))) - - return nums +def get_nums_words_pos_tags(words, pos_tags): + return [ + get_num_words_pos_tag(words, pos_tag) + for pos_tag in pos_tags + ] def get_num_words_outside_list(words, wordlist, use_word_types = False): words_inside_wordlist = set() @@ -189,6 +186,7 @@ def rd(main, text): variant = main.settings_custom['measures']['readability']['rd']['variant'] if variant == _tr('wl_measures_readability', 'Policy one'): + rd = ( 4.41434307 * (text.num_chars_alpha / text.num_words) - 13.46873475 @@ -310,40 +308,52 @@ def coleman_liau_index(main, text): # Coleman's Readability Formula # Reference: Liau, T. L., Bassin, C. B., Martin, C. 
J., & Coleman, E. B. (1976). Modification of the Coleman readability formulas. Journal of Reading Behavior, 8(4), 381–386. https://journals.sagepub.com/doi/pdf/10.1080/10862967609547193 def colemans_readability_formula(main, text): - if text.lang in main.settings_global['syl_tokenizers'] and text.lang in main.settings_global['pos_taggers']: + variant = main.settings_custom['measures']['readability']['colemans_readability_formula']['variant'] + + if ( + text.lang in main.settings_global['syl_tokenizers'] + and ( + variant in ['1', '2'] + or (variant in ['3', '4'] and text.lang in main.settings_global['pos_taggers']) + ) + ): text = get_nums(main, text) if text.num_words: - variant = main.settings_custom['measures']['readability']['colemans_readability_formula']['variant'] - num_words_1_syl = get_num_words_syls(text.syls_words, len_min = 1, len_max = 1) + num_words_1_syl = get_num_words_syls(text.words_flat, len_min = 1, len_max = 1) - if variant == '1': - cloze_pct = ( - 1.29 * (num_words_1_syl / text.num_words * 100) - - 38.45 - ) - elif variant == '2': - cloze_pct = ( - 1.16 * (num_words_1_syl / text.num_words * 100) - + 1.48 * (text.num_sentences / text.num_words * 100) - - 37.95 - ) - elif variant in ['3', '4']: - num_prons, num_preps = get_nums_words_pos_tags( # pylint: disable=unbalanced-tuple-unpacking - main, - words = text.words_flat, - lang = text.lang, - pos_tags = ['PRON', 'ADP'] - ) + match variant: + case '1': + cloze_pct = ( + 1.29 * (num_words_1_syl / text.num_words * 100) + - 38.45 + ) + case '2': + cloze_pct = ( + 1.16 * (num_words_1_syl / text.num_words * 100) + + 1.48 * (text.num_sentences / text.num_words * 100) + - 37.95 + ) + case '3': + pos_tag_words(main, text) + num_prons = get_num_words_pos_tag( + words = text.words_flat, + pos_tag = 'PRON' + ) - if variant == '3': cloze_pct = ( 1.07 * (num_words_1_syl / text.num_words * 100) + 1.18 * (text.num_sentences / text.num_words * 100) + 0.76 * (num_prons / text.num_words * 100) - 34.02 ) - elif 
variant == '4': + case '4': + pos_tag_words(main, text) + num_prons, num_preps = get_nums_words_pos_tags( + words = text.words_flat, + pos_tags = ['PRON', 'ADP'] + ) + cloze_pct = ( 1.04 * (num_words_1_syl / text.num_words * 100) + 1.06 * (text.num_sentences / text.num_words * 100) @@ -650,7 +660,7 @@ def re_farr_jenkins_paterson(main, text): text = get_nums(main, text) if text.num_words and text.num_sentences: - num_words_1_syl = get_num_words_syls(text.syls_words, len_min = 1, len_max = 1) + num_words_1_syl = get_num_words_syls(text.words_flat, len_min = 1, len_max = 1) if main.settings_custom['measures']['readability']['re_farr_jenkins_paterson']['use_powers_sumner_kearl_variant']: re = ( @@ -679,7 +689,7 @@ def rgl(main, text): if text.num_words >= 150: sample_start = random.randint(0, text.num_words - 150) - sample = text.syls_words[sample_start : sample_start + 150] + sample = text.words_flat[sample_start : sample_start + 150] num_words_1_syl = get_num_words_syls(sample, len_min = 1, len_max = 1) rgl = 20.43 - 0.11 * num_words_1_syl @@ -789,21 +799,14 @@ def fog_index(main, text): _tr('wl_measures_readability', 'Original'), 'Powers-Sumner-Kearl' ]: - words_tagged = wl_pos_tagging.wl_pos_tag( - main, text.words_flat, - lang = text.lang, - tagset = 'universal', - force = True - ) - - for syls, word in zip(text.syls_words, words_tagged): - tag = word.tag + pos_tag_words(main, text) + for word in text.words_flat: if ( - 'PROPN' not in tag + 'PROPN' not in word.tag_universal.split('/') and ( - (len(syls) == 3 and not word.endswith('ed') and not word.endswith('es')) - or len(syls) > 3 + (len(word.syls) == 3 and not word.endswith('ed') and not word.endswith('es')) + or len(word.syls) > 3 ) ): num_hard_words += 1 @@ -820,19 +823,24 @@ def fog_index(main, text): + 0.0984 * (num_hard_words / text.num_words * 100) ) elif variant_eng == _tr('wl_measures_readability', 'Navy'): - num_words_3_plus_syls = get_num_words_syls(text.syls_words, len_min = 3) + 
num_words_3_plus_syls = get_num_words_syls(text.words_flat, len_min = 3) fog_index = ( ((text.num_words + 2 * num_words_3_plus_syls) / text.num_sentences - 3) / 2 ) elif text.lang == 'pol': - words_tagged = wl_pos_tagging.wl_pos_tag(main, text.words_flat, lang = 'pol', tagset = 'universal') - lemmas = wl_lemmatization.wl_lemmatize(main, text.words_flat, lang = 'pol') - syls_words = wl_syl_tokenization.wl_syl_tokenize(main, lemmas, lang = 'pol') + pos_tag_words(main, text) + # Count number of syllables of word lemmas instead of original words + wl_lemmatization.wl_lemmatize(main, text.words_flat, lang = 'pol') + lemmas_syls = wl_syl_tokenization.wl_syl_tokenize( + main, + wl_texts.to_tokens(wl_texts.get_token_properties(text.words_flat, 'lemma'), lang = 'pol'), + lang = 'pol' + ) - for syls, (word, tag) in zip(syls_words, words_tagged): - if len(syls) > 4 and 'PROPN' not in tag: + for word, syls in zip(text.words_flat, lemmas_syls): + if len(syls) > 4 and 'PROPN' not in word.tag_universal.split('/'): num_hard_words += 1 fog_index = ( @@ -940,10 +948,9 @@ def lorge_readability_index(main, text): text = get_nums(main, text) if text.num_sentences and text.num_words: - num_preps = get_num_words_pos_tags( - main, + pos_tag_words(main, text) + num_preps = get_num_words_pos_tag( words = text.words_flat, - lang = text.lang, pos_tag = 'ADP' ) num_hard_words = get_num_words_outside_list( @@ -1023,7 +1030,7 @@ def nwl(main, text): sw = get_num_words_outside_list(text.words_flat, wordlist = 'bamberger_vanecek', use_word_types = True) / text.num_word_types * 100 s_100 = text.num_sentences / text.num_words * 100 - ms = get_num_words_syls(text.syls_words, len_min = 3) / text.num_words * 100 + ms = get_num_words_syls(text.words_flat, len_min = 3) / text.num_words * 100 sl = text.num_words / text.num_sentences iw = get_num_words_ltrs(text.words_flat, len_min = 7) / text.num_words * 100 @@ -1049,10 +1056,10 @@ def nws(main, text): if text.num_words and text.num_sentences: variant = 
main.settings_custom['measures']['readability']['nws']['variant'] - ms = get_num_words_syls(text.syls_words, len_min = 3) / text.num_words * 100 + ms = get_num_words_syls(text.words_flat, len_min = 3) / text.num_words * 100 sl = text.num_words / text.num_sentences iw = get_num_words_ltrs(text.words_flat, len_min = 7) / text.num_words * 100 - es = get_num_words_syls(text.syls_words, len_min = 1, len_max = 1) / text.num_words * 100 + es = get_num_words_syls(text.words_flat, len_min = 1, len_max = 1) / text.num_words * 100 if variant == '1': nws = 0.1935 * ms + 0.1672 * sl + 0.1297 * iw - 0.0327 * es - 0.875 @@ -1178,12 +1185,9 @@ def smog_grade(main, text): num_words_3_plus_syls = 0 for sentence in sample: - syls_words = wl_texts.get_token_properties( - wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang), - 'syls' - ) + sentence = wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang) - num_words_3_plus_syls += get_num_words_syls(syls_words, len_min = 3) + num_words_3_plus_syls += get_num_words_syls(sentence, len_min = 3) if text.lang.startswith('deu_'): g = numpy.sqrt(num_words_3_plus_syls / text.num_sentences * 30) - 2 @@ -1252,12 +1256,12 @@ def strain_index(main, text): num_syls = 0 for sentence in text.sentences[:3]: - syls_words = wl_texts.get_token_properties( + words_syls = wl_texts.get_token_properties( wl_syl_tokenization.wl_syl_tokenize(main, sentence, lang = text.lang), 'syls' ) - num_syls += sum((len(syls) for syls in syls_words)) + num_syls += sum((len(syls) for syls in words_syls)) strain_index = num_syls / 10 else: @@ -1276,17 +1280,17 @@ def trankle_bailers_readability_formula(main, text): text = get_nums(main, text) if text.num_words >= 100: + pos_tag_words(main, text) + sample_start = random.randint(0, text.num_words - 100) sample = text.words_flat[sample_start : sample_start + 100] num_chars_alnum = sum((1 for token in sample for char in token if char.isalnum())) num_sentences = get_num_sentences_sample(text, 
sample, sample_start) - num_preps, num_conjs = get_nums_words_pos_tags( # pylint: disable=unbalanced-tuple-unpacking - main, + num_preps, num_cconjs, num_sconjs = get_nums_words_pos_tags( # pylint: disable=unbalanced-tuple-unpacking words = sample, - lang = text.lang, - pos_tags = ['ADP', 'CONJ'] + pos_tags = ['ADP', 'CCONJ', 'SCONJ'] ) variant = main.settings_custom['measures']['readability']['trankle_bailers_readability_formula']['variant'] @@ -1303,7 +1307,7 @@ def trankle_bailers_readability_formula(main, text): 234.1063 - numpy.log(num_chars_alnum / 100 + 1) * 96.11069 - num_preps * 2.05444 - - num_conjs * 1.02805 + - (num_cconjs + num_sconjs) * 1.02805 ) else: trankle_bailers = 'text_too_short' @@ -1373,12 +1377,12 @@ def wheeler_smiths_readability_formula(main, text): text = get_nums(main, text) if text.num_words: - num_units = len(wl_sentence_tokenization.wl_sentence_seg_split( + num_units = len(wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens( main, - text = ' '.join(wl_misc.flatten_list(text.tokens_multilevel_with_puncs)), + tokens = wl_misc.flatten_list(text.tokens_multilevel_with_puncs), terminators = UNIT_TERMINATORS )) - num_words_2_syls = get_num_words_syls(text.syls_words, len_min = 2) + num_words_2_syls = get_num_words_syls(text.words_flat, len_min = 2) wheeler_smith = ( (text.num_words / num_units) diff --git a/wordless/wl_nlp/wl_lemmatization.py b/wordless/wl_nlp/wl_lemmatization.py index 467f0da9d..002b8a903 100644 --- a/wordless/wl_nlp/wl_lemmatization.py +++ b/wordless/wl_nlp/wl_lemmatization.py @@ -132,15 +132,13 @@ def wl_lemmatize_text(main, inputs, lang, lemmatizer): elif lemmatizer == 'nltk_wordnet': word_net_lemmatizer = nltk.WordNetLemmatizer() - for token in wl_pos_tagging.wl_pos_tag( + for token in wl_pos_tagging.wl_pos_tag_universal( main, line, - lang = 'eng_us', - pos_tagger = 'nltk_perceptron_eng', - tagset = 'universal' + lang = 'eng_us' ): tokens.append(str(token)) - match token.tag[1:]: + match token.tag_universal: case 
'ADJ': lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.ADJ)) case 'NOUN' | 'PROPN': @@ -255,13 +253,12 @@ def wl_lemmatize_tokens(main, inputs, lang, lemmatizer): elif lemmatizer == 'nltk_wordnet': word_net_lemmatizer = nltk.WordNetLemmatizer() - for token in wl_pos_tagging.wl_pos_tag( - main, wl_texts.to_tokens(tokens, lang = lang), - lang = 'eng_us', - pos_tagger = 'nltk_perceptron_eng', - tagset = 'universal' + for token in wl_pos_tagging.wl_pos_tag_universal( + main, + inputs = wl_texts.to_tokens(tokens, lang = 'eng_us'), + lang = 'eng_us' ): - match token.tag[1:]: + match token.tag_universal: case 'ADJ': lemmas.append(word_net_lemmatizer.lemmatize(str(token), pos = nltk.corpus.wordnet.ADJ)) case 'NOUN' | 'PROPN': diff --git a/wordless/wl_nlp/wl_pos_tagging.py b/wordless/wl_nlp/wl_pos_tagging.py index 977fea6ad..32b45b4df 100644 --- a/wordless/wl_nlp/wl_pos_tagging.py +++ b/wordless/wl_nlp/wl_pos_tagging.py @@ -16,9 +16,12 @@ # along with this program. If not, see . 
# ---------------------------------------------------------------------- +import copy + import khmernltk import laonlp import nltk +from PyQt5.QtCore import QCoreApplication import pythainlp import spacy import underthesea @@ -26,16 +29,32 @@ from wordless.wl_nlp import wl_nlp_utils, wl_texts, wl_word_tokenization from wordless.wl_utils import wl_conversion -UNIVERSAL_TAGSETS_SPACY = [ +_tr = QCoreApplication.translate + +UNIVERSAL_TAGSETS_SPACY = { 'spacy_cat', 'spacy_dan', 'spacy_fra', 'spacy_ell', 'spacy_mkd', 'spacy_nob', 'spacy_por', 'spacy_rus', 'spacy_spa', 'spacy_ukr' -] -UNIVERSAL_TAGSETS_STANZA = [ +} +UNIVERSAL_TAGSETS_STANZA = { 'stanza_hye', 'stanza_hyw', 'stanza_eus', 'stanza_bxr', 'stanza_dan', 'stanza_fra', 'stanza_ell', 'stanza_heb', 'stanza_hun', 'stanza_lij', 'stanza_glv', 'stanza_mar', 'stanza_pcm', 'stanza_qpm', 'stanza_por', 'stanza_rus', 'stanza_san', 'stanza_snd', 'stanza_hsb', 'stanza_tel' -] +} + +def to_content_function(universal_pos_tag): + if universal_pos_tag in [ + 'ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN', 'NUM', 'VERB', 'SYM', 'X', + 'NOUN/NUM', 'SYM/X' + ]: + return _tr('wl_pos_tagging', 'Content words') + elif universal_pos_tag in [ + 'ADP', 'AUX', 'CONJ', 'CCONJ', 'SCONJ', 'DET', 'PART', 'PRON', 'PUNCT', + 'ADP/SCONJ', 'PUNCT/SYM' + ]: + return _tr('wl_pos_tagging', 'Function words') + else: + return None def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', force = False): if ( @@ -69,6 +88,8 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', f tokenized = not isinstance(inputs, str) ) + tags_universal = [] + if isinstance(inputs, str): # spaCy if pos_tagger.startswith('spacy_'): @@ -89,6 +110,9 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', f tags.append(token.tag_) elif tagset == 'universal': tags.append(token.pos_) + + if pos_tagger not in UNIVERSAL_TAGSETS_SPACY: + tags_universal.append(token.pos_) # Stanza elif 
pos_tagger.startswith('stanza_'): if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: @@ -108,6 +132,9 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', f tags.append(token.xpos if token.xpos else token.upos) elif tagset == 'universal': tags.append(token.upos) + + if pos_tagger not in UNIVERSAL_TAGSETS_STANZA: + tags_universal.append(token.upos) else: for line in inputs.splitlines(): tokens_tagged_line, tags_line = wl_pos_tag_text(main, line, lang, pos_tagger) @@ -145,6 +172,9 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', f tags.append(token.tag_) elif tagset == 'universal': tags.append(token.pos_) + + if pos_tagger not in UNIVERSAL_TAGSETS_SPACY: + tags_universal.append(token.pos_) # Stanza elif pos_tagger.startswith('stanza_'): if lang not in ['zho_cn', 'zho_tw', 'srp_latn']: @@ -166,6 +196,9 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', f tags.append(token.xpos if token.xpos else token.upos) elif tagset == 'universal': tags.append(token.upos) + + if pos_tagger not in UNIVERSAL_TAGSETS_STANZA: + tags_universal.append(token.upos) else: for tokens in wl_nlp_utils.split_token_list(main, texts, pos_tagger): results = wl_pos_tag_tokens(main, tokens, lang, pos_tagger) @@ -173,51 +206,95 @@ def wl_pos_tag(main, inputs, lang, pos_tagger = 'default', tagset = 'default', f texts_tagged.extend(results[0]) tags.extend(results[1]) - # Remove empty tokens (e.g. 
SudachiPy) and strip whitespace around tokens and tags - tokens_tags = zip(texts_tagged.copy(), tags.copy()) - texts_tagged.clear() - tags.clear() + if ( + not pos_tagger.startswith(('spacy_', 'stanza_')) + or pos_tagger in UNIVERSAL_TAGSETS_SPACY | UNIVERSAL_TAGSETS_STANZA + ): + mappings = { + tag: tag_universal + for tag, tag_universal, _, _, _ in main.settings_custom['pos_tagging']['tagsets']['mapping_settings'][lang][pos_tagger] + } - for token, tag in tokens_tags: + # Convert empty tags (to be removed later) to X + tags_universal = [(mappings[tag.strip()] if tag.strip() else 'X') for tag in tags] + + # Remove empty tokens (e.g. SudachiPy) and strip whitespace around tokens and tags + for i, token in reversed(list(enumerate(texts_tagged))): if (token_clean := token.strip()): - texts_tagged.append(token_clean) - tags.append(tag.strip()) + texts_tagged[i] = token_clean + else: + del texts_tagged[i] + del tags[i] + del tags_universal[i] if not isinstance(inputs, str): tags = wl_nlp_utils.align_tokens(texts, texts_tagged, tags) + tags_universal = wl_nlp_utils.align_tokens(texts, texts_tagged, tags_universal) + + # Convert to content/function words + if ( + pos_tagger.startswith(('spacy_', 'stanza_')) + and pos_tagger not in UNIVERSAL_TAGSETS_SPACY | UNIVERSAL_TAGSETS_STANZA + ): + content_functions = [to_content_function(tag) for tag in tags_universal] + else: + mappings = { + tag: content_function + for tag, _, content_function, _, _ in main.settings_custom['pos_tagging']['tagsets']['mapping_settings'][lang][pos_tagger] + } + + content_functions = [mappings[tag] for tag in tags] # Convert to universal POS tags if ( tagset == 'universal' and ( - ( - not pos_tagger.startswith('spacy_') - and not pos_tagger.startswith('stanza_') - ) - or pos_tagger in UNIVERSAL_TAGSETS_SPACY - or pos_tagger in UNIVERSAL_TAGSETS_STANZA + not pos_tagger.startswith(('spacy_', 'stanza_')) + or pos_tagger in UNIVERSAL_TAGSETS_SPACY | UNIVERSAL_TAGSETS_STANZA ) ): - mappings = { - tag: 
tag_universal - for tag, tag_universal, _, _ in main.settings_custom['pos_tagging']['tagsets']['mapping_settings'][lang][pos_tagger] - } - - tags = [mappings.get(tag, 'X') for tag in tags] + tags = tags_universal.copy() # Add separators between tokens and tags tags = [f'_{tag}' for tag in tags] if isinstance(inputs, str): - return wl_texts.to_tokens(texts_tagged, lang = lang, tags = tags) + return wl_texts.to_tokens( + texts_tagged, + lang = lang, + tags = tags, + tags_universal = tags_universal, + content_functions = content_functions + ) else: tokens = wl_texts.combine_texts_properties(texts, token_properties) wl_texts.set_token_properties(tokens, 'tag', tags) + wl_texts.set_token_properties(tokens, 'tag_universal', tags_universal) + wl_texts.set_token_properties(tokens, 'content_function', content_functions) wl_texts.update_token_properties(inputs, tokens) return inputs +def wl_pos_tag_universal(main, inputs, lang, pos_tagger = 'default', tagged = False): + if ( + isinstance(inputs, str) + or ( + not isinstance(inputs, str) + and inputs + and inputs[0].tag_universal is None + ) + ): + # Assign universal POS tags to tagged files without modifying original tags + if tagged: + tokens = wl_pos_tag(main, copy.deepcopy(inputs), lang, pos_tagger, force = True) + wl_texts.set_token_properties(inputs, 'tag_universal', wl_texts.get_token_properties(tokens, 'tag_universal')) + wl_texts.set_token_properties(inputs, 'content_function', wl_texts.get_token_properties(tokens, 'content_function')) + else: + inputs = wl_pos_tag(main, inputs, lang, pos_tagger) + + return inputs + def wl_pos_tag_text(main, text, lang, pos_tagger): tokens_tagged = [] tags = [] diff --git a/wordless/wl_nlp/wl_sentence_tokenization.py b/wordless/wl_nlp/wl_sentence_tokenization.py index 3c2e10da3..870864298 100644 --- a/wordless/wl_nlp/wl_sentence_tokenization.py +++ b/wordless/wl_nlp/wl_sentence_tokenization.py @@ -209,7 +209,7 @@ def wl_sentence_tokenize(main, text, lang, sentence_tokenizer = 
'default'): def wl_sentence_split(main, text, terminators = SENTENCE_TERMINATORS): return [ sentence.strip() - for sentence in re.findall(fr'.+?[{terminators}][{terminators}\s]*|.+?$', text.strip()) + for sentence in re.findall(fr'.+?[{terminators}]+\s|.+?$', text.strip()) ] # Reference: https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Terminal_Punctuation=Yes:] @@ -295,21 +295,13 @@ def wl_sentence_seg_tokenize(main, text, terminators = SENTENCE_SEG_TERMINATORS) for sentence_seg in re.findall(fr'.+?[{terminators}]+|.+?$', text.strip()) ] -def wl_sentence_seg_split(main, text, terminators = SENTENCE_SEG_TERMINATORS): - return [ - sentence_seg.strip() - for sentence_seg in re.findall(fr'.+?[{terminators}][{terminators}\s]*|.+?$', text.strip()) - ] - REPLACEMENT_CHAR = '\uFFFF' def wl_sentence_seg_tokenize_tokens(main, tokens, terminators = SENTENCE_SEG_TERMINATORS): - sentence_segs = [] - # Insert a replacement character between tokens to prevent text from being split within tokens text = REPLACEMENT_CHAR.join(tokens) - for sentence_seg in re.findall(fr'.+?[{terminators}]+{REPLACEMENT_CHAR}|.+?$', text.strip()): - sentence_segs.append(wl_texts.clean_texts(sentence_seg.split(REPLACEMENT_CHAR))) - - return sentence_segs + return [ + wl_texts.clean_texts(sentence_seg.split(REPLACEMENT_CHAR)) + for sentence_seg in re.findall(fr'.+?[{terminators}]+{REPLACEMENT_CHAR}|.+?$', text.strip()) + ] diff --git a/wordless/wl_nlp/wl_texts.py b/wordless/wl_nlp/wl_texts.py index 3c237e697..782edaea6 100644 --- a/wordless/wl_nlp/wl_texts.py +++ b/wordless/wl_nlp/wl_texts.py @@ -38,7 +38,7 @@ def __new__(cls, text, **kwargs): def __init__( self, text, lang = 'eng_us', syls = None, - tag = None, + tag = None, tag_universal = None, content_function = None, lemma = None, head = None, dependency_relation = None, dependency_len = None, punc_mark = None @@ -46,6 +46,8 @@ def __init__( self.lang = lang self.syls = syls self.tag = tag + self.tag_universal = tag_universal + 
self.content_function = content_function self.lemma = lemma self.head = head self.dependency_relation = dependency_relation @@ -60,14 +62,16 @@ def __eq__(self, other): def display_text(self, punc_mark = False): if punc_mark: - return str(self) + (self.punc_mark or '') + (self.tag or '') + return f"{self}{(self.punc_mark or '')}{self.tag or ''}" else: - return str(self) + (self.tag or '') + return f"{self}{self.tag or ''}" def update_properties(self, token): self.lang = token.lang self.syls = token.syls self.tag = token.tag + self.tag_universal = token.tag_universal + self.content_function = token.content_function self.lemma = token.lemma self.head = token.head self.dependency_relation = token.dependency_relation @@ -77,7 +81,7 @@ def update_properties(self, token): def to_tokens( texts, lang = 'eng_us', syls_tokens = None, - tags = None, + tags = None, tags_universal = None, content_functions = None, lemmas = None, heads = None, dependency_relations = None, dependency_lens = None, punc_marks = None @@ -86,6 +90,8 @@ def to_tokens( syls_tokens = syls_tokens or [None] * num_tokens tags = tags or [None] * num_tokens + tags_universal = tags_universal or [None] * num_tokens + content_functions = content_functions or [None] * num_tokens lemmas = lemmas or [None] * num_tokens heads = heads or [None] * num_tokens dependency_relations = dependency_relations or [None] * num_tokens @@ -96,7 +102,7 @@ def to_tokens( Wl_Token( text, lang = lang, syls = syls_tokens[i], - tag = tags[i], + tag = tags[i], tag_universal = tags_universal[i], content_function = content_functions[i], lemma = lemmas[i], head = heads[i], dependency_relation = dependency_relations[i], dependency_len = dependency_lens[i], punc_mark = punc_marks[i] @@ -122,6 +128,8 @@ def split_texts_properties(tokens): 'lang': token.lang, 'syls': token.syls, 'tag': token.tag, + 'tag_universal': token.tag_universal, + 'content_function': token.content_function, 'lemma': token.lemma, 'head': token.head, 
'dependency_relation': token.dependency_relation, @@ -248,8 +256,11 @@ def __init__(self, main, file): for sentence in wl_sentence_tokenization.wl_sentence_split(self.main, para): self.tokens_multilevel[-1].append([]) - for sentence_seg in wl_sentence_tokenization.wl_sentence_seg_split(self.main, sentence): - self.tokens_multilevel[-1][-1].append(sentence_seg.split()) + for sentence_seg in wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens( + self.main, + sentence.split() + ): + self.tokens_multilevel[-1][-1].append(sentence_seg) # Tokenized & Tagged elif self.tokenized and self.tagged: for i, para in enumerate(text.splitlines()): @@ -262,8 +273,11 @@ def __init__(self, main, file): for sentence in wl_sentence_tokenization.wl_sentence_split(self.main, text_no_tags): self.tokens_multilevel[-1].append([]) - for sentence_seg in wl_sentence_tokenization.wl_sentence_seg_split(self.main, sentence): - self.tokens_multilevel[-1][-1].append(sentence_seg.split()) + for sentence_seg in wl_sentence_tokenization.wl_sentence_seg_tokenize_tokens( + self.main, + sentence.split() + ): + self.tokens_multilevel[-1][-1].append(sentence_seg) # Check if the first token in the text is a tag if i == 0 and re.match(re_tags_start, para): @@ -276,16 +290,16 @@ def __init__(self, main, file): tags_tokens.append([]) # Extract tags - tag_end = 0 + tag_last_end = 0 for tag in re.finditer(re_tags, para): - tags_tokens = self.add_tags_splitting(para[tag_end:tag.start()], tags_tokens) + tags_tokens = self.add_tags_splitting(para[tag_last_end:tag.start()], tags_tokens) tags_tokens[-1].append(tag.group()) - tag_end = tag.end() + tag_last_end = tag.end() # The last part of the text - if (para := para[tag_end:]): + if (para := para[tag_last_end:]): tags_tokens = self.add_tags_splitting(para, tags_tokens) # Add empty tags for untagged files @@ -369,8 +383,8 @@ def __init__(self, main, file): i_tag += len_sentence_seg - # Record number of tokens - self.num_tokens = len(self.get_tokens_flat()) + # 
Record number of tokens and types + self.update_num_tokens() # Remove Wl_Main object from the text since it cannot be pickled del self.main @@ -396,6 +410,7 @@ def add_tags_splitting(self, text, tags): def update_num_tokens(self): self.num_tokens = len(self.get_tokens_flat()) + self.num_types = len(set(self.get_tokens_flat())) def get_tokens_flat(self): return list(wl_misc.flatten_list(self.tokens_multilevel)) @@ -598,10 +613,6 @@ def __init__(self, main, file): # pylint: disable=super-init-not-called # Remove Wl_Main object from the text since it cannot be pickled del self.main -class Wl_Text_Blank(Wl_Text): - def __init__(self): # pylint: disable=super-init-not-called - pass - class Wl_Text_Total(Wl_Text): def __init__(self, texts): # pylint: disable=super-init-not-called # Set language for the combined text only if all texts are in the same language @@ -610,6 +621,8 @@ def __init__(self, texts): # pylint: disable=super-init-not-called else: self.lang = 'other' + self.tagged = any((text.tagged for text in texts)) + self.tokens_multilevel = [ copy.deepcopy(para) for text in texts diff --git a/wordless/wl_nlp/wl_token_processing.py b/wordless/wl_nlp/wl_token_processing.py index 12e090dfb..b0adc0ccd 100644 --- a/wordless/wl_nlp/wl_token_processing.py +++ b/wordless/wl_nlp/wl_token_processing.py @@ -28,14 +28,12 @@ # Assign part-of-speech tags def text_pos_tag(main, text, settings): if settings['assign_pos_tags'] and not text.tagged: - tokens = wl_pos_tagging.wl_pos_tag( + wl_pos_tagging.wl_pos_tag( main, inputs = text.get_tokens_flat(), lang = text.lang ) - text.update_token_properties(tokens) - # Apply lemmatization / Match inflected forms def text_lemmatize(main, text, token_settings, search_settings = None): search_settings = search_settings or { @@ -55,24 +53,20 @@ def text_lemmatize(main, text, token_settings, search_settings = None): and search_settings['context_settings']['excl']['match_inflected_forms'] ) ): - tokens = wl_lemmatization.wl_lemmatize( + 
wl_lemmatization.wl_lemmatize( main, inputs = text.get_tokens_flat(), lang = text.lang ) - text.update_token_properties(tokens) - # Syllable tokenization def text_syl_tokenize(main, text): - tokens = wl_syl_tokenization.wl_syl_tokenize( + wl_syl_tokenization.wl_syl_tokenize( main, inputs = text.get_tokens_flat(), lang = text.lang, ) - text.update_token_properties(tokens) - # Ignore tags def text_ignore_tags(text, settings): if settings['ignore_tags']: @@ -133,7 +127,7 @@ def wl_process_tokens(main, text, token_settings, search_settings = None): # Remove tags temporarily if text is untagged and users do not choose to assign POS tags on the fly if not settings['assign_pos_tags'] and not text.tagged: - text_modified.set_token_properties('tag', '') + text_modified.set_token_properties('tag', None) # Punctuation marks if not settings['punc_marks']: @@ -272,12 +266,25 @@ def remove_empty_tokens_multilevel(tokens_multilevel, empty_tokens = True): return tokens_multilevel -def wl_process_tokens_profiler(main, text, token_settings): +def wl_process_tokens_profiler(main, text, token_settings, profiler_tab): # Punctuation marks must be preserved for some readability measures (e.g. 
Wheeler & Smith's Readability Formula) text.tokens_multilevel_with_puncs = copy.deepcopy(text.tokens_multilevel) text_syl_tokenize(main, text) + if profiler_tab in ['readability', 'all']: + if text.lang in main.settings_global['pos_taggers']: + wl_pos_tagging.wl_pos_tag_universal(main, text.get_tokens_flat(), lang = text.lang, tagged = text.tagged) + + # Polish variant of Gunning Fog Index + if text.lang == 'pol': + wl_lemmatization.wl_lemmatize(main, text.get_tokens_flat(), lang = text.lang) + + if profiler_tab in ['lexical_density_diversity', 'all']: + # Lexical density + if text.lang in main.settings_global['pos_taggers']: + wl_pos_tagging.wl_pos_tag_universal(main, text.get_tokens_flat(), lang = text.lang, tagged = text.tagged) + text_modified = wl_process_tokens_ngram_generator(main, text, token_settings) text_modified.tokens_multilevel = remove_empty_tokens_multilevel(text_modified.tokens_multilevel) text_modified.update_num_tokens() @@ -285,19 +292,17 @@ def wl_process_tokens_profiler(main, text, token_settings): return text_modified def wl_process_tokens_concordancer(main, text, token_settings, search_settings, preserve_blank_lines = False): - settings = copy.deepcopy(token_settings) - text_pos_tag(main, text, token_settings) text_lemmatize(main, text, token_settings, search_settings) text_modified = copy.deepcopy(text) # Remove tags temporarily if text is untagged and users do not choose to assign POS tags on the fly - if not settings['assign_pos_tags'] and not text.tagged: + if not token_settings['assign_pos_tags'] and not text.tagged: text_modified.set_token_properties('tag', '') # Punctuation marks - if not settings['punc_marks']: + if not token_settings['punc_marks']: tokens_flat_punc_marks = [] for i, token in enumerate(text_modified.get_tokens_flat()): @@ -358,18 +363,14 @@ def wl_process_tokens_concordancer(main, text, token_settings, search_settings, return text_modified def wl_process_tokens_dependency_parser(main, text, token_settings, 
search_settings): - # Dependency parsing - tokens_modified = [] - + # Do not modify original sentence tokenization during dependency parsing for para in text.tokens_multilevel: for sentence in para: - tokens_modified.extend(wl_dependency_parsing.wl_dependency_parse( + wl_dependency_parsing.wl_dependency_parse( main, inputs = list(wl_misc.flatten_list(sentence)), lang = text.lang, - )) - - text.update_token_properties(tokens_modified) + ) return wl_process_tokens_concordancer(main, text, token_settings, search_settings) diff --git a/wordless/wl_profiler.py b/wordless/wl_profiler.py index 1935009b8..1b72ce903 100644 --- a/wordless/wl_profiler.py +++ b/wordless/wl_profiler.py @@ -29,7 +29,7 @@ from wordless.wl_checks import wl_checks_tokens, wl_checks_work_area from wordless.wl_dialogs import wl_dialogs_misc -from wordless.wl_measures import wl_measures_lexical_diversity, wl_measures_misc, wl_measures_readability +from wordless.wl_measures import wl_measures_lexical_density_diversity, wl_measures_misc, wl_measures_readability from wordless.wl_nlp import wl_texts, wl_token_processing from wordless.wl_utils import wl_misc, wl_threading from wordless.wl_widgets import wl_layouts, wl_tables, wl_widgets @@ -43,14 +43,14 @@ def __init__(self, main): # Table self.table_profiler_readability = Wl_Table_Profiler_Readability(self) self.table_profiler_counts = Wl_Table_Profiler_Counts(self) - self.table_profiler_lexical_diversity = Wl_Table_Profiler_Lexical_Diversity(self) + self.table_profiler_lexical_density_diversity = Wl_Table_Profiler_Lexical_Density_Diversity(self) self.table_profiler_lens = Wl_Table_Profiler_Lens(self) self.table_profiler_len_breakdown = Wl_Table_Profiler_Len_Breakdown(self) self.tables = [ self.table_profiler_readability, self.table_profiler_counts, - self.table_profiler_lexical_diversity, + self.table_profiler_lexical_density_diversity, self.table_profiler_lens, self.table_profiler_len_breakdown ] @@ -76,7 +76,7 @@ def __init__(self, main): 
self.tabs_profiler = QTabWidget(self) self.tabs_profiler.addTab(self.table_profiler_readability, self.tr('Readability')) self.tabs_profiler.addTab(self.table_profiler_counts, self.tr('Counts')) - self.tabs_profiler.addTab(self.table_profiler_lexical_diversity, self.tr('Lexical Diversity')) + self.tabs_profiler.addTab(self.table_profiler_lexical_density_diversity, self.tr('Lexical Density/Diversity')) self.tabs_profiler.addTab(self.table_profiler_lens, self.tr('Lengths')) self.tabs_profiler.addTab(self.table_profiler_len_breakdown, self.tr('Length Breakdown')) @@ -426,16 +426,13 @@ def update_gui_table(self, err_msg, text_stats_files): self.disable_updates() for i, stats in enumerate(text_stats_files): - readability_stats = stats[0] - - # Readability - for j, statistic in enumerate(readability_stats): - if statistic == 'no_support': + for j, stat in enumerate(stats[0]): + if stat == 'no_support': self.set_item_err(j, i, self.tr('No language support'), alignment_hor = 'right') - elif statistic == 'text_too_short': + elif stat == 'text_too_short': self.set_item_err(j, i, self.tr('Text is too short'), alignment_hor = 'right') else: - self.set_item_num(j, i, statistic) + self.set_item_num(j, i, stat) self.enable_updates() @@ -508,34 +505,25 @@ def update_gui_table(self, err_msg, text_stats_files): count_sentence_segs_total = len(text_stats_files[-1][5]) count_tokens_total = len(text_stats_files[-1][7]) count_types_total = len(text_stats_files[-1][9]) - count_syls_total = len(text_stats_files[-1][10]) + count_syls_total = len(text_stats_files[-1][10]) if text_stats_files[-1][10] is not None else None count_chars_total = sum(text_stats_files[-1][7]) self.disable_updates() for i, stats in enumerate(text_stats_files): - if i < len(files): - file_lang = files[i]['lang'] - # Total - else: - if len({file['lang'] for file in files}) == 1: - file_lang = files[0]['lang'] - else: - file_lang = 'other' - len_paras_sentences = numpy.array(stats[1]) len_sentences = 
numpy.array(stats[4]) len_sentence_segs = numpy.array(stats[5]) len_tokens_chars = numpy.array(stats[7]) len_types_chars = numpy.array(stats[9]) - len_syls = numpy.array(stats[10]) + len_syls = numpy.array(stats[10]) if stats[10] is not None else None count_paras = len(len_paras_sentences) count_sentences = len(len_sentences) count_sentence_segs = len(len_sentence_segs) count_tokens = len(len_tokens_chars) count_types = len(len_types_chars) - count_syls = len(len_syls) + count_syls = len(len_syls) if len_syls is not None else None count_chars = numpy.sum(len_tokens_chars) # Count of Paragraphs @@ -559,7 +547,7 @@ def update_gui_table(self, err_msg, text_stats_files): self.set_item_num(9, i, count_types, count_types_total) # Count of Syllables - if file_lang in self.main.settings_global['syl_tokenizers']: + if count_syls is not None: self.set_item_num(10, i, count_syls) self.set_item_num(11, i, count_syls, count_syls_total) else: @@ -582,16 +570,17 @@ def update_gui_table(self, err_msg, text_stats_files): return err_msg -class Wl_Table_Profiler_Lexical_Diversity(Wl_Table_Profiler): +class Wl_Table_Profiler_Lexical_Density_Diversity(Wl_Table_Profiler): def __init__(self, parent): - HEADERS_LEXICAL_DIVERSITY = [ + HEADERS_LEXICAL_DENSITY_DIVERSITY = [ _tr('wl_profiler', "Brunét's Index"), _tr('wl_profiler', 'Corrected TTR'), _tr('wl_profiler', "Fisher's Index of Diversity"), _tr('wl_profiler', "Herdan's Vₘ"), - _tr('wl_profiler', 'HD-D'), - _tr('wl_profiler', "Honoré's statistic"), - _tr('wl_profiler', 'LogTTR'), + 'HD-D', + _tr('wl_profiler', "Honoré's Statistic"), + _tr('wl_profiler', 'Lexical Density'), + 'LogTTR', _tr('wl_profiler', 'Mean Segmental TTR'), _tr('wl_profiler', 'Measure of Textual Lexical Diversity'), _tr('wl_profiler', 'Moving-average TTR'), @@ -609,16 +598,16 @@ def __init__(self, parent): _tr('wl_profiler', 'Shannon Entropy'), _tr('wl_profiler', "Simpson's l"), _tr('wl_profiler', 'Type-token Ratio'), - _tr('wl_profiler', 'vocd-D'), + 'vocd-D', 
_tr('wl_profiler', "Yule's Characteristic K"), _tr('wl_profiler', "Yule's Index of Diversity") ] super().__init__( parent, - headers = HEADERS_LEXICAL_DIVERSITY, - headers_float = HEADERS_LEXICAL_DIVERSITY, - profiler_tab = 'lexical_diversity' + headers = HEADERS_LEXICAL_DENSITY_DIVERSITY, + headers_float = HEADERS_LEXICAL_DENSITY_DIVERSITY, + profiler_tab = 'lexical_density_diversity' ) def update_gui_table(self, err_msg, text_stats_files): @@ -643,8 +632,11 @@ def update_gui_table(self, err_msg, text_stats_files): self.disable_updates() for i, stats in enumerate(text_stats_files): - for j, lexical_diversity in enumerate(stats[11]): - self.set_item_num(j, i, lexical_diversity) + for j, stat in enumerate(stats[11]): + if stat == 'no_support': + self.set_item_err(j, i, self.tr('No language support'), alignment_hor = 'right') + else: + self.set_item_num(j, i, stat) self.enable_updates() @@ -846,25 +838,16 @@ def update_gui_table(self, err_msg, text_stats_files): self.disable_updates() for i, stats in enumerate(text_stats_files): - if i < len(files): - file_lang = files[i]['lang'] - # Total - else: - if len({file['lang'] for file in files}) == 1: - file_lang = files[0]['lang'] - else: - file_lang = 'other' - len_paras_sentences = numpy.array(stats[1]) len_paras_sentence_segs = numpy.array(stats[2]) len_paras_tokens = numpy.array(stats[3]) len_sentences = numpy.array(stats[4]) len_sentence_segs = numpy.array(stats[5]) - len_tokens_syls = numpy.array(stats[6]) + len_tokens_syls = numpy.array(stats[6]) if stats[6] is not None else None len_tokens_chars = numpy.array(stats[7]) - len_types_syls = numpy.array(stats[8]) + len_types_syls = numpy.array(stats[8]) if stats[8] is not None else None len_types_chars = numpy.array(stats[9]) - len_syls = numpy.array(stats[10]) + len_syls = numpy.array(stats[10]) if stats[10] is not None else None # Paragraph Length in Sentences / Sentence Segments / Tokens # Sentence / Sentence Segment Length in Tokens @@ -908,7 +891,7 @@ def 
update_gui_table(self, err_msg, text_stats_files): [55, 77, 99], [len_tokens_syls, len_types_syls, len_syls] ): - if file_lang in self.main.settings_global['syl_tokenizers']: + if lens is not None: if lens.any(): self.set_item_num(row, i, numpy.mean(lens)) self.set_item_num(row + 1, i, numpy.std(lens)) @@ -983,12 +966,14 @@ def update_gui_table(self, err_msg, text_stats_files): for i, stats in enumerate(text_stats_files): len_sentences = numpy.array(stats[4]) len_sentence_segs = numpy.array(stats[5]) - len_tokens_syls = numpy.array(stats[6]) + len_tokens_syls = numpy.array(stats[6]) if stats[6] is not None else None len_tokens_chars = numpy.array(stats[7]) count_sentences_lens.append(collections.Counter(len_sentences)) count_sentence_segs_lens.append(collections.Counter(len_sentence_segs)) - count_tokens_lens_syls.append(collections.Counter(len_tokens_syls)) + count_tokens_lens_syls.append( + collections.Counter(len_tokens_syls) if len_tokens_syls is not None else None + ) count_tokens_lens_chars.append(collections.Counter(len_tokens_chars)) # Count of n-token-long Sentences @@ -1064,7 +1049,7 @@ def update_gui_table(self, err_msg, text_stats_files): ) # Count of n-syllable-long Tokens - if any(count_tokens_lens_syls): + if len_tokens_syls is not None: count_tokens_lens_files = wl_misc.merge_dicts(count_tokens_lens_syls) count_tokens_lens_total = { len_token: count_tokens_files[-1] @@ -1166,7 +1151,8 @@ def run(self): for file in files: text = wl_token_processing.wl_process_tokens_profiler( self.main, file['text'], - token_settings = settings['token_settings'] + token_settings = settings['token_settings'], + profiler_tab = self.profiler_tab ) texts.append(text) @@ -1224,7 +1210,7 @@ def run(self): else: stats_readability = None - if self.profiler_tab in ['lexical_diversity', 'counts', 'lens', 'len_breakdown', 'all']: + if self.profiler_tab in ['lexical_density_diversity', 'counts', 'lens', 'len_breakdown', 'all']: # Paragraph length len_paras_sentences = [ 
len(para) @@ -1252,22 +1238,33 @@ def run(self): for sentence_seg in sentence ] - syls_tokens = text.get_token_properties('syls', flat = True) + if text.lang in self.main.settings_global['syl_tokenizers']: + syls_tokens = text.get_token_properties('syls', flat = True) + + # Remove punctuation marks + for i, syls in enumerate(syls_tokens): + syls_tokens[i] = tuple(syl for syl in syls if not wl_checks_tokens.is_punc(syl)) - # Remove punctuation marks - for i, syls in enumerate(syls_tokens): - syls_tokens[i] = tuple(syl for syl in syls if not wl_checks_tokens.is_punc(syl)) + syls_tokens = [syls for syls in syls_tokens if syls] - syls_tokens = [syls for syls in syls_tokens if syls] + # Token length + len_tokens_syls = [len(syls) for syls in syls_tokens] + # Type length + len_types_syls = [len(syls) for syls in set(syls_tokens)] + # Syllable length + len_syls = [len(syl) for syls in syls_tokens for syl in syls] + else: + # Token length + len_tokens_syls = None + # Type length + len_types_syls = None + # Syllable length + len_syls = None # Token length - len_tokens_syls = [len(syls) for syls in syls_tokens] len_tokens_chars = [len(token) for token in tokens] # Type length - len_types_syls = [len(syls) for syls in set(syls_tokens)] len_types_chars = [len(token_type) for token_type in set(tokens)] - # Syllable length - len_syls = [len(syl) for syls in syls_tokens for syl in syls] else: len_paras_sentences = len_paras_sentence_segs = len_paras_tokens = None len_sentences = len_sentence_segs = None @@ -1275,38 +1272,39 @@ def run(self): len_types_syls = len_types_chars = None len_syls = None - # Lexical Diversity - if self.profiler_tab in ['lexical_diversity', 'all']: + # Lexical Density/Diversity + if self.profiler_tab in ['lexical_density_diversity', 'all']: if tokens: - stats_lexical_diversity = [ - wl_measures_lexical_diversity.brunets_index(self.main, tokens), - wl_measures_lexical_diversity.cttr(self.main, tokens), - 
wl_measures_lexical_diversity.fishers_index_of_diversity(self.main, tokens), - wl_measures_lexical_diversity.herdans_vm(self.main, tokens), - wl_measures_lexical_diversity.hdd(self.main, tokens), - wl_measures_lexical_diversity.honores_stat(self.main, tokens), - wl_measures_lexical_diversity.logttr(self.main, tokens), - wl_measures_lexical_diversity.msttr(self.main, tokens), - wl_measures_lexical_diversity.mtld(self.main, tokens), - wl_measures_lexical_diversity.mattr(self.main, tokens), - *wl_measures_lexical_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(self.main, tokens), - wl_measures_lexical_diversity.popescus_r1(self.main, tokens), - wl_measures_lexical_diversity.popescus_r2(self.main, tokens), - wl_measures_lexical_diversity.popescus_r3(self.main, tokens), - wl_measures_lexical_diversity.popescus_r4(self.main, tokens), - wl_measures_lexical_diversity.repeat_rate(self.main, tokens), - wl_measures_lexical_diversity.rttr(self.main, tokens), - wl_measures_lexical_diversity.shannon_entropy(self.main, tokens), - wl_measures_lexical_diversity.simpsons_l(self.main, tokens), - wl_measures_lexical_diversity.ttr(self.main, tokens), - wl_measures_lexical_diversity.vocdd(self.main, tokens), - wl_measures_lexical_diversity.yules_characteristic_k(self.main, tokens), - wl_measures_lexical_diversity.yules_index_of_diversity(self.main, tokens) + stats_lexical_density_diversity = [ + wl_measures_lexical_density_diversity.brunets_index(self.main, text), + wl_measures_lexical_density_diversity.cttr(self.main, text), + wl_measures_lexical_density_diversity.fishers_index_of_diversity(self.main, text), + wl_measures_lexical_density_diversity.herdans_vm(self.main, text), + wl_measures_lexical_density_diversity.hdd(self.main, text), + wl_measures_lexical_density_diversity.honores_stat(self.main, text), + wl_measures_lexical_density_diversity.lexical_density(self.main, text), + wl_measures_lexical_density_diversity.logttr(self.main, text), + 
wl_measures_lexical_density_diversity.msttr(self.main, text), + wl_measures_lexical_density_diversity.mtld(self.main, text), + wl_measures_lexical_density_diversity.mattr(self.main, text), + *wl_measures_lexical_density_diversity.popescu_macutek_altmanns_b1_b2_b3_b4_b5(self.main, text), + wl_measures_lexical_density_diversity.popescus_r1(self.main, text), + wl_measures_lexical_density_diversity.popescus_r2(self.main, text), + wl_measures_lexical_density_diversity.popescus_r3(self.main, text), + wl_measures_lexical_density_diversity.popescus_r4(self.main, text), + wl_measures_lexical_density_diversity.repeat_rate(self.main, text), + wl_measures_lexical_density_diversity.rttr(self.main, text), + wl_measures_lexical_density_diversity.shannon_entropy(self.main, text), + wl_measures_lexical_density_diversity.simpsons_l(self.main, text), + wl_measures_lexical_density_diversity.ttr(self.main, text), + wl_measures_lexical_density_diversity.vocdd(self.main, text), + wl_measures_lexical_density_diversity.yules_characteristic_k(self.main, text), + wl_measures_lexical_density_diversity.yules_index_of_diversity(self.main, text) ] else: - stats_lexical_diversity = [0] * 27 + stats_lexical_density_diversity = [0] * 28 else: - stats_lexical_diversity = None + stats_lexical_density_diversity = None self.text_stats_files.append([ stats_readability, @@ -1320,7 +1318,7 @@ def run(self): len_types_syls, len_types_chars, len_syls, - stats_lexical_diversity + stats_lexical_density_diversity ]) if len(files) == 1: diff --git a/wordless/wl_settings/wl_settings.py b/wordless/wl_settings/wl_settings.py index 7e6d6390e..001a349c8 100644 --- a/wordless/wl_settings/wl_settings.py +++ b/wordless/wl_settings/wl_settings.py @@ -90,7 +90,7 @@ def __init__(self, main): self.tree_settings.model().appendRow(QStandardItem(self.tr('Measures'))) self.tree_settings.model().item(10).appendRow(QStandardItem(self.tr('Readability'))) - 
self.tree_settings.model().item(10).appendRow(QStandardItem(self.tr('Lexical Diversity'))) + self.tree_settings.model().item(10).appendRow(QStandardItem(self.tr('Lexical Density/Diversity'))) self.tree_settings.model().item(10).appendRow(QStandardItem(self.tr('Dispersion'))) self.tree_settings.model().item(10).appendRow(QStandardItem(self.tr('Adjusted Frequency'))) self.tree_settings.model().item(10).appendRow(QStandardItem(self.tr('Statistical Significance'))) @@ -144,7 +144,7 @@ def __init__(self, main): # Measures self.settings_measures_readability = wl_settings_measures.Wl_Settings_Measures_Readability(self.main) - self.settings_measures_lexical_diversity = wl_settings_measures.Wl_Settings_Measures_Lexical_Diversity(self.main) + self.settings_measures_lexical_density_diversity = wl_settings_measures.Wl_Settings_Measures_Lexical_Density_Diversity(self.main) self.settings_measures_dispersion = wl_settings_measures.Wl_Settings_Measures_Dispersion(self.main) self.settings_measures_adjusted_freq = wl_settings_measures.Wl_Settings_Measures_Adjusted_Freq(self.main) self.settings_measures_statistical_significance = wl_settings_measures.Wl_Settings_Measures_Statistical_Significance(self.main) @@ -183,7 +183,7 @@ def __init__(self, main): self.tr('Sentiment Analysis'): self.settings_sentiment_analysis, self.tr('Readability'): self.settings_measures_readability, - self.tr('Lexical Diversity'): self.settings_measures_lexical_diversity, + self.tr('Lexical Density/Diversity'): self.settings_measures_lexical_density_diversity, self.tr('Dispersion'): self.settings_measures_dispersion, self.tr('Adjusted Frequency'): self.settings_measures_adjusted_freq, self.tr('Statistical Significance'): self.settings_measures_statistical_significance, diff --git a/wordless/wl_settings/wl_settings_default.py b/wordless/wl_settings/wl_settings_default.py index 46f6cde90..33b23753a 100644 --- a/wordless/wl_settings/wl_settings_default.py +++ b/wordless/wl_settings/wl_settings_default.py @@ 
-22,6 +22,7 @@ from PyQt5.QtCore import QCoreApplication from PyQt5.QtWidgets import QDesktopWidget +from wordless.wl_nlp import wl_pos_tagging from wordless.wl_settings import wl_settings_global from wordless.wl_tagsets import ( wl_tagset_cat_universal, @@ -2310,8 +2311,8 @@ def init_settings_default(main): } }, - # Settings - Measures - Lexical Diversity - 'lexical_diversity': { + # Settings - Measures - Lexical Density/Diversity + 'lexical_density_diversity': { 'hdd': { 'sample_size': 42 }, @@ -2509,6 +2510,12 @@ def init_settings_default(main): } # Tagsets + for mappings in settings_default['pos_tagging']['tagsets']['mapping_settings'].values(): + for mapping in mappings.values(): + if len(mapping[0]) == 4: + for i, (_, universal_pos_tag, _, _) in enumerate(mapping): + mapping[i].insert(2, wl_pos_tagging.to_content_function(universal_pos_tag)) + settings_default['pos_tagging']['tagsets']['preview_settings']['preview_pos_tagger'] = settings_default['pos_tagging']['pos_tagger_settings']['pos_taggers'].copy() # Custom stop word lists diff --git a/wordless/wl_settings/wl_settings_measures.py b/wordless/wl_settings/wl_settings_measures.py index 527fa9c87..a25f80859 100644 --- a/wordless/wl_settings/wl_settings_measures.py +++ b/wordless/wl_settings/wl_settings_measures.py @@ -353,13 +353,13 @@ def apply_settings(self): return True -# Measures - Lexical Diversity -class Wl_Settings_Measures_Lexical_Diversity(wl_settings.Wl_Settings_Node): +# Measures - Lexical Density/Diversity +class Wl_Settings_Measures_Lexical_Density_Diversity(wl_settings.Wl_Settings_Node): def __init__(self, main): super().__init__(main) - self.settings_default = self.main.settings_default['measures']['lexical_diversity'] - self.settings_custom = self.main.settings_custom['measures']['lexical_diversity'] + self.settings_default = self.main.settings_default['measures']['lexical_density_diversity'] + self.settings_custom = self.main.settings_custom['measures']['lexical_density_diversity'] # HD-D 
self.group_box_hdd = QGroupBox(self.tr('HD-D'), self) diff --git a/wordless/wl_settings/wl_settings_pos_tagging.py b/wordless/wl_settings/wl_settings_pos_tagging.py index 17bed7e00..c2930f833 100644 --- a/wordless/wl_settings/wl_settings_pos_tagging.py +++ b/wordless/wl_settings/wl_settings_pos_tagging.py @@ -305,6 +305,7 @@ def __init__(self, main): headers = [ self.tr('Part-of-speech Tag'), self.tr('Universal Part-of-speech Tag'), + self.tr('Content/Function Words'), self.tr('Description'), self.tr('Examples') ], @@ -339,8 +340,15 @@ def __init__(self, main): ], editable = True )) - self.table_mappings.setItemDelegateForColumn(2, wl_item_delegates.Wl_Item_Delegate(self.table_mappings, QPlainTextEdit)) + self.table_mappings.setItemDelegateForColumn(2, wl_item_delegates.Wl_Item_Delegate_Combo_Box( + parent = self.table_mappings, + items = [ + 'Content words', + 'Function words' + ] + )) self.table_mappings.setItemDelegateForColumn(3, wl_item_delegates.Wl_Item_Delegate(self.table_mappings, QPlainTextEdit)) + self.table_mappings.setItemDelegateForColumn(4, wl_item_delegates.Wl_Item_Delegate(self.table_mappings, QPlainTextEdit)) self.button_tagsets_reset.setMinimumWidth(100) self.button_tagsets_reset_all.setMinimumWidth(100) @@ -446,11 +454,12 @@ def update_gui(self, mappings): self.table_mappings.disable_updates() - for i, (tag, tag_universal, description, examples) in enumerate(mappings): + for i, (tag, tag_universal, content_function_words, description, examples) in enumerate(mappings): self.table_mappings.model().setItem(i, 0, QStandardItem(tag)) self.table_mappings.model().setItem(i, 1, QStandardItem(tag_universal)) - self.table_mappings.model().setItem(i, 2, QStandardItem(description)) - self.table_mappings.model().setItem(i, 3, QStandardItem(examples)) + self.table_mappings.model().setItem(i, 2, QStandardItem(content_function_words)) + self.table_mappings.model().setItem(i, 3, QStandardItem(description)) + self.table_mappings.model().setItem(i, 4, 
QStandardItem(examples)) self.table_mappings.enable_updates() diff --git a/wordless/wl_tagsets/wl_tagset_lao_seqlabeling.py b/wordless/wl_tagsets/wl_tagset_lao_seqlabeling.py index 2d27a7bd2..fefada8d7 100644 --- a/wordless/wl_tagsets/wl_tagset_lao_seqlabeling.py +++ b/wordless/wl_tagsets/wl_tagset_lao_seqlabeling.py @@ -52,6 +52,6 @@ ['FIX', 'PART', '前置词', ''], ['NEG', 'PART', '否定词', ''], - ['INT', 'INT', '语气词', ''], + ['INT', 'INTJ', '语气词', ''], ['PUNCT', 'PUNCT', '标点符号', ''] ] diff --git a/wordless/wl_tagsets/wl_tagset_lao_yunshan_cup_2020.py b/wordless/wl_tagsets/wl_tagset_lao_yunshan_cup_2020.py index 69b465775..eb55d416a 100644 --- a/wordless/wl_tagsets/wl_tagset_lao_yunshan_cup_2020.py +++ b/wordless/wl_tagsets/wl_tagset_lao_yunshan_cup_2020.py @@ -52,6 +52,6 @@ ['FIX', 'PART', '前置词', ''], ['NEG', 'PART', '否定词', ''], - ['INT', 'INT', '语气词', ''], + ['INT', 'INTJ', '语气词', ''], ['PUNCT', 'PUNCT', '标点符号', ''] ]