
Commit

Bringing over files from the more modularized variant of the Bias pipeline. New version!
SarthakJShetty committed Sep 15, 2020
1 parent 1a4bcf7 commit 1f7c8d8
Showing 4 changed files with 14 additions and 16 deletions.
12 changes: 6 additions & 6 deletions pyResearchInsights/Bias.py
@@ -8,17 +8,17 @@
 12/09/2018'''
 
 '''Imports scraper_main() from Scraper.py'''
-from Scraper import scraper_main
+from pyResearchInsights.Scraper import scraper_main
 '''Importing the analyzer code here as well'''
-from Analyzer import analyzer_main
+from pyResearchInsights.Analyzer import analyzer_main
 '''Importing the Cleaner functions here that removes special characters from the corpus'''
-from Cleaner import cleaner_main
+from pyResearchInsights.Cleaner import cleaner_main
 '''Importing the visualizer and gensim code here'''
-from NLP_Engine import nlp_engine_main
+from pyResearchInsights.NLP_Engine import nlp_engine_main
 '''Imports some of the functions required by different scripts here.'''
-from common_functions import pre_processing, arguments_parser, end_process
+from pyResearchInsights.common_functions import pre_processing, arguments_parser, end_process
 '''Declaring tarballer here from system_functions() to tarball the LOG directory, & rm_original_folder to delete the directory and save space.'''
-from system_functions import tarballer, rm_original_folder
+from pyResearchInsights.system_functions import tarballer, rm_original_folder
 
 '''Keywords from the user are extracted here'''
 keywords_to_search, trend_keywords = arguments_parser()
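The substantive change above is the switch from bare module imports to package-qualified absolute imports, which is what lets Bias.py run when pyResearchInsights is installed as a package rather than executed from inside the repository checkout. A minimal sketch of the difference (assuming the package is installed, e.g. via pip):

# Bare import: only resolves when Scraper.py itself is on sys.path,
# i.e. the script is launched from inside the package directory.
# from Scraper import scraper_main

# Package-qualified import: resolves from any working directory once
# pyResearchInsights is installed as a package.
from pyResearchInsights.Scraper import scraper_main
from pyResearchInsights.common_functions import arguments_parser

keywords_to_search, trend_keywords = arguments_parser()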
12 changes: 5 additions & 7 deletions pyResearchInsights/NLP_Engine.py
@@ -42,13 +42,8 @@
 
 from nltk.corpus import stopwords
 stop_words = stopwords.words('english')
-stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'com', 'https', 'url', 'link', 'xe', 'abstract', 'author', 'chapter', 'springer', 'title', "the", "of", "and", "in", "to", "a", "is", "for", "from", "with", "that", "by", "are", "on", "was", "as",
-"were", "url:", "abstract:", "abstract", "author:", "title:", "at", "be", "an", "during", "have", "this", "which", "study", "been", "species", "not", "has", "between",
-"using", "its", "also", "these", "this", "used", "over", "can", "within", "into", "all","due", "use", "about", "a", 'it', 'their', "where", "we", "most", "may", "through",
-"though", "like", "or", "further", "e.g.", "along", "any", "those", "had", "toward", "due", "both", "some", "use", "even", "more", "but", "while", "pass",
-"well", "will", "when", "only", "after", "author", "title", "there", "our", "did", "much", "as", "if", "become", "still", "various", "very", "out",
-"they", "via", "available", "such", "than", "different", "many", "areas", "no", "one", "two", "small", "first", "other", "such", "-", "could", "studies", "high",
-"provide", "among", "highly", "no", "case", "across", "given", "need", "would", "under", "found", "low", "values", "xe2\\x80\\x89", "xa", "xc", "xb", "\xc2\xa0C\xc2\xa0ha\xe2\x88\x921", "suggest", "up", "'The", "area"])
+stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'com', 'https', 'url', 'link', 'abstract', 'author', 'chapter', 'springer', 'title', "the", "of", "and", "in", "to", "a", "is", "for", "from", "with", "that", "by", "are", "on", "was", "as",
+"were", "url:", "abstract:", "abstract", "author:", "title:", "at", "be", "an", "have", "this", "which", "study", "been", "not", "has", "its", "also", "these", "this", "can", "a", 'it', 'their', "e.g.", "those", "had", "but", "while", "will", "when", "only", "author", "title", "there", "our", "did", "as", "if", "they", "such", "than", "no", "-", "could",])
 
 def data_reader(abstracts_log_name, status_logger_name):
 	'''This is where the file is being parsed from to the model'''
@@ -178,6 +173,9 @@ def nlp_engine_main(abstracts_log_name, status_logger_name):
 	nlp_engine_main_end_status_key = "Idling the NLP Engine"
 	status_logger(status_logger_name, nlp_engine_main_end_status_key)
 
+	'''We can arrive at logs_folder_name from abstracts_log_name, instead of passing it to the NLP_Engine function each time'''
+	logs_folder_name = abstracts_log_name.split('Abstract')[0][:-1]
+
 	'''Importing the visualizer_main function to view the LDA Model built by the NLP_engine_main() function'''
 	visualizer_main(lda_model, corpus, id2word, logs_folder_name, status_logger_name)
 
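The added lines derive logs_folder_name from abstracts_log_name instead of threading it through as an extra parameter. The slicing assumes the abstracts log sits inside the run's LOG directory and that its file name begins with "Abstract"; a small sketch with a hypothetical path:

# Hypothetical path, following the "Abstract..." naming the slice assumes
abstracts_log_name = "LOGS/LOG_2020_09_15/Abstracts_Tiger.txt"

# split('Abstract')[0] keeps everything up to the file name;
# [:-1] trims the trailing path separator.
logs_folder_name = abstracts_log_name.split('Abstract')[0][:-1]
print(logs_folder_name)  # -> LOGS/LOG_2020_09_15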
4 changes: 2 additions & 2 deletions pyResearchInsights/Scraper.py
@@ -73,7 +73,7 @@ def url_generator(start_url, query_string, status_logger_name):
 	'''We generate the urls_to_scrape from the stripped down determiner element'''
 	urls_to_scrape = [(start_url+str(counter)+"?facet-content-type=\"Article\"&query="+query_string+"&facet-language=\"En\"") for counter in range(1, (int(determiner.replace(',', '')) + 1))]
 
-	url_generator_stop_status_key = determiner.replace(',', '') + " URLs have been obtained"
+	url_generator_stop_status_key = determiner.replace(',', '') + " page URLs have been obtained"
 	status_logger(status_logger_name, url_generator_stop_status_key)
 
 	return urls_to_scrape
@@ -365,7 +365,7 @@ def processor(abstract_url, urls_to_scrape, abstract_id_log_name, abstracts_log_
 	visualizer code to generate the trends histogram.'''
 	permanent_word_sorter_list = word_sorter_list_generator(status_logger_name)
 
-	for site_url_index in range(0, (len(urls_to_scrape)+1)):
+	for site_url_index in range(0, len(urls_to_scrape)):
 		print(urls_to_scrape[site_url_index])
 		if(site_url_index==0):
 			results_determiner(urls_to_scrape[site_url_index], status_logger_name)
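The loop change in processor() fixes an off-by-one: range(0, len(urls_to_scrape) + 1) walks one index past the end of the list, so the final iteration raised an IndexError. A minimal illustration with hypothetical page URLs:

urls_to_scrape = ["page_1", "page_2", "page_3"]  # hypothetical page URLs

# Old bound: range(0, len(urls_to_scrape) + 1) yields 0, 1, 2, 3 and
# urls_to_scrape[3] raises IndexError on the final pass.

# Fixed bound yields exactly the valid indices 0, 1, 2:
for site_url_index in range(0, len(urls_to_scrape)):
    print(urls_to_scrape[site_url_index])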
2 changes: 1 addition & 1 deletion pyResearchInsights/common_functions.py
@@ -104,7 +104,7 @@ def arguments_parser():
 
 	parser = argparse.ArgumentParser()
 	parser.add_argument("--keywords", help="Keyword to search on Springer", default="Tiger")
-	parser.add_argument("--trends", help="Keywords to generate the trends histogram for", default="Tiger")
+	parser.add_argument("--trends", help="Keywords to generate the trends histogram for", default="Conservation")
 	parser.add_argument("--paper", help="If papers have to be downloaded as well", default="No")
 
 	arguments = parser.parse_args()
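With the new default, running the pipeline without --trends builds the trends histogram for "Conservation" instead of silently reusing the "Tiger" search keyword. A quick sketch of how the defaults resolve (parsing an empty argument list stands in for a bare invocation):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--keywords", help="Keyword to search on Springer", default="Tiger")
parser.add_argument("--trends", help="Keywords to generate the trends histogram for", default="Conservation")
parser.add_argument("--paper", help="If papers have to be downloaded as well", default="No")

arguments = parser.parse_args([])  # no flags supplied, so the defaults kick in
print(arguments.keywords, arguments.trends, arguments.paper)  # Tiger Conservation No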
