
Commit

Add files via upload
jinhangjiang authored Dec 23, 2022
1 parent fc474eb commit 6a9104f
Showing 12 changed files with 641 additions and 170 deletions.
7 changes: 5 additions & 2 deletions CHANGELOG.md
@@ -1,9 +1,12 @@
# CHANGELOG
## Version 0.2.1, 2022-12-22
- Fixed the counting bug in Specificity
- Added max_doc parameter to Boilerplate

## Version 0.2.0, 2022-10-2

- Added the "get_ngram" feature to the Boilerplate function
- Added the percentage as an option for "min_doc" in Boilerplate: when the given value is between 0 and 1, it is treated as a percentage of the number of documents

## Version 0.1.3, 2022-06-10

209 changes: 105 additions & 104 deletions README.md
@@ -1,104 +1,105 @@
[![DOI](https://zenodo.org/badge/490040941.svg)](https://zenodo.org/badge/latestdoi/490040941)
[![License](https://img.shields.io/badge/License-BSD_3--Clause-green.svg)](https://opensource.org/licenses/BSD-3-Clause)
[![PyPI](https://img.shields.io/pypi/v/morethansentiments)](https://pypi.org/project/morethansentiments/)
[![Code Ocean](https://codeocean.com/codeocean-assets/badge/open-in-code-ocean.svg)](https://codeocean.com/capsule/7670045/tree)

# MoreThanSentiments
Besides sentiment scores, this Python package offers several ways of quantifying a text corpus, based on measures drawn from multiple works of literature. Currently, we support the calculation of the following measures:

- Boilerplate (Lang and Stice-Lawrence, 2015)
- Redundancy (Cazier and Pfeiffer, 2015)
- Specificity (Hope et al., 2016)
- Relative_prevalence (Blankespoor, 2016)

A Medium blog post introducing the package is available here: [MoreThanSentiments: A Python Library for Text Quantification](https://towardsdatascience.com/morethansentiments-a-python-library-for-text-quantification-e57ff9d51cd5)

## Citation

If this package was helpful in your work, feel free to cite it as

- Jiang, J., & Srinivasan, K. (2022). MoreThanSentiments: A text analysis package. Software Impacts, 100456. https://doi.org/10.1016/J.SIMPA.2022.100456

## Installation

The easiest way to install the toolbox is via pip (pip3 in some
distributions):

pip install MoreThanSentiments


## Usage

#### Import the Package

import MoreThanSentiments as mts

#### Read data from txt files

my_dir_path = "D:/YourDataFolder"
df = mts.read_txt_files(PATH = my_dir_path)

#### Sentence Token

df['sent_tok'] = df.text.apply(mts.sent_tok)

#### Clean Data

If you want to clean on the sentence level:

df['cleaned_data'] = pd.Series()
for i in range(len(df['sent_tok'])):
    df['cleaned_data'][i] = [mts.clean_data(x,
                                            lower = True,
                                            punctuations = True,
                                            number = False,
                                            unicode = True,
                                            stop_words = False) for x in df['sent_tok'][i]]
If you want to clean on the document level:

df['cleaned_data'] = df.text.apply(mts.clean_data, args=(True, True, False, True, False))

For the data cleaning function, we offer the following options:
- lower: convert all words to lowercase
- punctuations: remove all punctuation from the corpus
- number: remove all digits from the corpus
- unicode: remove unicode (non-ASCII) characters from the corpus
- stop_words: remove stopwords from the corpus
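As an illustration only, the flags roughly correspond to ordinary string transformations. The sketch below is hypothetical (`clean_text_sketch` and its toy stopword list are not part of the package) and may differ from `mts.clean_data`'s actual behavior:

```python
import re
import string

def clean_text_sketch(text, lower=True, punctuations=True,
                      number=False, unicode=True, stop_words=False):
    """Illustrative stand-in for mts.clean_data; the real function may differ."""
    stopword_list = {"the", "a", "an", "and", "or", "of", "to", "in"}  # toy list
    if lower:
        text = text.lower()
    if unicode:
        text = text.encode("ascii", "ignore").decode()  # drop non-ASCII chars
    if punctuations:
        text = text.translate(str.maketrans("", "", string.punctuation))
    if number:
        text = re.sub(r"\d+", "", text)  # strip digit runs
    if stop_words:
        text = " ".join(w for w in text.split() if w not in stopword_list)
    return text
```

For example, `clean_text_sketch("Hello, World 123!")` lowercases and strips punctuation while keeping the digits, matching the README's default of `number = False`.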

#### Boilerplate

df['Boilerplate'] = mts.Boilerplate(df.sent_tok, n = 4, min_doc = 5, get_ngram = False)

Parameters:
- input_data: this function requires tokenized documents.
- n: the n-gram size to use. The default is 4.
- min_doc: when building the n-gram list, ignore n-grams with a document frequency strictly lower than the given threshold. The default is 5 documents. A value between 0 and 1 is treated as a percentage of the number of documents; about 30% is recommended.
- get_ngram: if this parameter is set to "True", the function returns a dataframe with all the n-grams and their corresponding frequencies, and the "min_doc" parameter has no effect.
- max_doc: when building the n-gram list, ignore n-grams with a document frequency strictly higher than the given threshold. The default is 75% of the documents. It can be a percentage (between 0 and 1) or an integer.
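Conceptually, Boilerplate flags sentences that contain n-grams shared by many documents and reports the fraction of each document's words that fall in flagged sentences. The sketch below is a simplified illustration of that logic (the helper name and details are hypothetical, not the package's internal code, and it omits the max_doc cutoff):

```python
from collections import Counter

def boilerplate_sketch(docs, n=4, min_doc=2):
    """docs: list of documents, each a list of sentence strings.
    Returns the share of each document's words that sit in sentences
    containing an n-gram found in at least min_doc documents."""
    def ngrams(sentence):
        words = sentence.split()
        return {tuple(words[i:i + n]) for i in range(len(words) - n + 1)}

    # document frequency of every n-gram
    doc_freq = Counter()
    for doc in docs:
        seen = set()
        for sentence in doc:
            seen |= ngrams(sentence)
        doc_freq.update(seen)
    common = {g for g, c in doc_freq.items() if c >= min_doc}

    scores = []
    for doc in docs:
        total = sum(len(s.split()) for s in doc)
        # a sentence is "boilerplate" if it shares any common n-gram
        flagged = sum(len(s.split()) for s in doc if ngrams(s) & common)
        scores.append(flagged / total if total else 0.0)
    return scores
```

Two documents that share one five-word sentence would each score 5 flagged words over their total word count.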

#### Redundancy

df['Redundancy'] = mts.Redundancy(df.cleaned_data, n = 10)

Parameters:
- input_data: this function requires tokenized documents.
- n: the n-gram size to use. The default is 10.
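Redundancy measures the share of a document's 10-grams that occur more than once within that document. A rough, hypothetical sketch of the idea (not the package's implementation):

```python
from collections import Counter

def redundancy_sketch(words, n=10):
    """words: list of tokens for one document.
    Fraction of n-gram occurrences whose n-gram repeats in the document."""
    grams = [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]
    if not grams:
        return 0.0
    counts = Counter(grams)
    # count every occurrence belonging to a gram that appears more than once
    repeated = sum(c for c in counts.values() if c > 1)
    return repeated / len(grams)
```

A document that repeats the same ten-word run twice scores those two occurrences against all of its 10-grams.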

#### Specificity

df['Specificity'] = mts.Specificity(df.text)

Parameters:
- input_data: this function requires raw documents, without tokenization.
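Specificity is the number of named entities (people, places, quantities, dates, and so on) per word; the package itself uses spaCy's en_core_web_sm recognizer. The sketch below injects the recognizer as a callable so the ratio is visible without a model download (the function and parameter names are hypothetical):

```python
def specificity_sketch(doc, count_entities):
    """doc: raw document string.
    count_entities: callable returning the number of named entities in the
    string (e.g., lambda d: len(nlp(d).ents) with a loaded spaCy pipeline)."""
    n_words = len(doc.split())
    return count_entities(doc) / n_words if n_words else 0.0
```

With spaCy, `count_entities` would be `lambda d: len(nlp(d).ents)` after `nlp = spacy.load("en_core_web_sm")`.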

#### Relative_prevalence

df['Relative_prevalence'] = mts.Relative_prevalence(df.text)

Parameters:
- input_data: this function requires raw documents, without tokenization.
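Judging from the (truncated) source snippet in this commit, Relative_prevalence appears to estimate the share of numeric tokens by deleting digit characters and recounting words. This is my reading of the code, not a verbatim copy, and the final formula is an assumption:

```python
def relative_prevalence_sketch(doc):
    """Estimated share of a document's words that are pure numbers:
    delete digit characters, recount words, and compare the counts."""
    a = len(doc.split())                                   # words before
    no_digits = "".join(ch for ch in doc if not ch.isdigit())
    b = len(no_digits.split())                             # words after
    return (a - b) / a if a else 0.0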


For the full example script, see:
- [Script](https://github.com/jinhangjiang/morethansentiments/blob/main/tests/test_code.py)
- [Jupyter Notebook](https://github.com/jinhangjiang/morethansentiments/blob/main/Boilerplate.ipynb)
59 changes: 41 additions & 18 deletions build/lib/MoreThanSentiments.py
@@ -71,7 +71,7 @@ def sent_tok(doc:str):
return nltk.sent_tokenize(doc)


def Boilerplate(input_data: pd.Series, n: int = 4, min_doc: float = 5, get_ngram: bool = False, **kwargs):
'''
#### LOGIC (Lang and Stice-Lawrence, 2015):
@@ -87,28 +87,48 @@ def Boilerplate(input_data: pd.Series, n: int = 4, min_doc: float = 5, get_ngram
# If get_ngram is set to True, it will return a dataframe with ngrams and corresponding frequency.
'''

doc_length = len(input_data)
# Test 1: check ngram

assert 3 <= n <= 6, "Invalid Value for n (int) [3,6]"

    # Test 2: check min_doc
    if min_doc >= 1:
        assert min_doc <= doc_length/2, "Invalid value for min_doc (int): it cannot exceed half of the total number of documents"
    elif 0 < min_doc < 1:
        assert min_doc <= 0.50, "Invalid value for min_doc: it cannot exceed half of the total number of documents"
        min_doc = round(min_doc * doc_length)
    else:
        assert min_doc > 0, "Invalid value for min_doc."

    # Test 3: check max_doc (the upper limit)
    max_doc = kwargs.get("max_doc", 0.75)

    if max_doc >= 1:
        assert max_doc >= doc_length*0.5, "Invalid value for max_doc (int): it cannot be less than half of the total number of documents"
        upper_ratio = max_doc/doc_length
    elif 0 < max_doc < 1:
        assert max_doc >= 0.50, "Invalid value for max_doc: the ratio cannot be less than 0.5"
        upper_ratio = max_doc
    else:
        assert max_doc > 0, "Invalid value for max_doc."

# capture the 4-grams for each sentence for all the documents
ngram = [0]*doc_length
for i in tqdm(range(doc_length), desc = 'Get the Boilerplate'):

ngram[i] = [0]*len(input_data[i])
for j in range(len(input_data[i])):
@@ -131,16 +151,16 @@ def Boilerplate(input_data: pd.Series, n: int = 4, min_doc: float = 5, get_ngram

else:
# Remove tetragrams that occur more than 75% and less than the specified min_doc across documents
upper_limit = upper_ratio * doc_length
fndf = fndf_all.query(f'counts >= {min_doc} and counts <= {upper_limit}')

# NWoS, calculate the number of the words in each sentence per document, and store them
temp_nwos = [0]*doc_length
for i in tqdm(range(len(temp_nwos)), desc = "Get the Length of Sentence"):
temp_nwos[i] = [len(j.split()) for j in input_data[i]]

# Flag the sentence
sent_flag = [[]]*doc_length
for i in tqdm(range(len(sent_flag)), desc = 'Flag the Sentence'):

sent_flag[i] = [0]*len(ngram[i])
@@ -157,7 +177,7 @@ def Boilerplate(input_data: pd.Series, n: int = 4, min_doc: float = 5, get_ngram

#Final Calculation of Boilerplate
display("======================== Boilerplate Calculation Started =========================")
boilerplate = [sum(sent_flag[i])/sum(temp_nwos[i]) for i in range(doc_length)]
display("======================== Boilerplate Calculation Finished ========================")

return boilerplate
@@ -170,11 +190,12 @@ def Redundancy(input_data: pd.Series, n: int = 10):
# % of 10-grams that occur more than once in each document (Cazier and Pfeiffer, 2015)
'''

doc_length = len(input_data)
assert 5 <= n <= 15, "Invalid Value for n (int) [5,15]"

# capture the 10-grams for each sentence for all the documents
ngram = [0]*doc_length
for i in tqdm(range(doc_length), desc = 'Get the Redundancy'):

ngram[i] = [0]*len(input_data[i])
for j in range(len(input_data[i])):
@@ -206,15 +227,15 @@ def Specificity(input_data: pd.Series):
# All scaled by the total number of words in document.
'''

doc_length = len(input_data)

ner = spacy.load('en_core_web_sm')

specificity = [0]*doc_length

    for i in tqdm(range(doc_length), desc = 'Get the Specificity'):
        specificity[i] = len(ner(input_data[i]).ents)/len(input_data[i].split())


return specificity

@@ -240,9 +261,11 @@ def Relative_prevalence(input_data:pd.Series):
# if ent.label_ == "GPE":
# ents.append(ent)

doc_length = len(input_data)

relative_prevalence = [0]*doc_length

for i in tqdm(range(doc_length), desc = 'Get the Relative_prevalence'):

a = len(input_data[i].split())
doc = ''.join([j for j in input_data[i] if not j.isdigit()])
Expand Down
Binary file added dist/MoreThanSentiments-0.2.1-py3-none-any.whl
Binary file added dist/MoreThanSentiments-0.2.1.tar.gz
