
Commit

Bringing over files from the more modularized variant of the Bias pipeline. New version!
SarthakJShetty committed Sep 15, 2020
1 parent 1a4bcf7 commit 1f7c8d8
Showing 4 changed files with 14 additions and 16 deletions.
12 changes: 6 additions & 6 deletions pyResearchInsights/Bias.py
@@ -8,17 +8,17 @@
 12/09/2018'''
 
 '''Imports scraper_main() from Scraper.py'''
-from Scraper import scraper_main
+from pyResearchInsights.Scraper import scraper_main
 '''Importing the analyzer code here as well'''
-from Analyzer import analyzer_main
+from pyResearchInsights.Analyzer import analyzer_main
 '''Importing the Cleaner functions here that removes special characters from the corpus'''
-from Cleaner import cleaner_main
+from pyResearchInsights.Cleaner import cleaner_main
 '''Importing the visualizer and gensim code here'''
-from NLP_Engine import nlp_engine_main
+from pyResearchInsights.NLP_Engine import nlp_engine_main
 '''Imports some of the functions required by different scripts here.'''
-from common_functions import pre_processing, arguments_parser, end_process
+from pyResearchInsights.common_functions import pre_processing, arguments_parser, end_process
 '''Declaring tarballer here from system_functions() to tarball the LOG directory, & rm_original_folder to delete the directory and save space.'''
-from system_functions import tarballer, rm_original_folder
+from pyResearchInsights.system_functions import tarballer, rm_original_folder
 
 '''Keywords from the user are extracted here'''
 keywords_to_search, trend_keywords = arguments_parser()
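The substantive change above is the switch from bare module imports to package-qualified absolute imports, which is what lets Bias.py run when pyResearchInsights is installed as a package rather than executed from inside the repository checkout. A minimal sketch of the difference (assuming the package is installed, e.g. via pip):

# Bare import: only resolves when Scraper.py itself is on sys.path,
# i.e. the script is launched from inside the package directory.
# from Scraper import scraper_main

# Package-qualified import: resolves from any working directory once
# pyResearchInsights is installed as a package.
from pyResearchInsights.Scraper import scraper_main
from pyResearchInsights.common_functions import arguments_parser

keywords_to_search, trend_keywords = arguments_parser()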
12 changes: 5 additions & 7 deletions pyResearchInsights/NLP_Engine.py
@@ -42,13 +42,8 @@
 
 from nltk.corpus import stopwords
 stop_words = stopwords.words('english')
-stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'com', 'https', 'url', 'link', 'xe', 'abstract', 'author', 'chapter', 'springer', 'title', "the", "of", "and", "in", "to", "a", "is", "for", "from", "with", "that", "by", "are", "on", "was", "as",
-"were", "url:", "abstract:", "abstract", "author:", "title:", "at", "be", "an", "during", "have", "this", "which", "study", "been", "species", "not", "has", "between",
-"using", "its", "also", "these", "this", "used", "over", "can", "within", "into", "all","due", "use", "about", "a", 'it', 'their', "where", "we", "most", "may", "through",
-"though", "like", "or", "further", "e.g.", "along", "any", "those", "had", "toward", "due", "both", "some", "use", "even", "more", "but", "while", "pass",
-"well", "will", "when", "only", "after", "author", "title", "there", "our", "did", "much", "as", "if", "become", "still", "various", "very", "out",
-"they", "via", "available", "such", "than", "different", "many", "areas", "no", "one", "two", "small", "first", "other", "such", "-", "could", "studies", "high",
-"provide", "among", "highly", "no", "case", "across", "given", "need", "would", "under", "found", "low", "values", "xe2\\x80\\x89", "xa", "xc", "xb", "\xc2\xa0C\xc2\xa0ha\xe2\x88\x921", "suggest", "up", "'The", "area"])
+stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'com', 'https', 'url', 'link', 'abstract', 'author', 'chapter', 'springer', 'title', "the", "of", "and", "in", "to", "a", "is", "for", "from", "with", "that", "by", "are", "on", "was", "as",
+"were", "url:", "abstract:", "abstract", "author:", "title:", "at", "be", "an", "have", "this", "which", "study", "been", "not", "has", "its", "also", "these", "this", "can", "a", 'it', 'their', "e.g.", "those", "had", "but", "while", "will", "when", "only", "author", "title", "there", "our", "did", "as", "if", "they", "such", "than", "no", "-", "could",])
 
 def data_reader(abstracts_log_name, status_logger_name):
 	'''This is where the file is being parsed from to the model'''
@@ -178,6 +173,9 @@ def nlp_engine_main(abstracts_log_name, status_logger_name):
 	nlp_engine_main_end_status_key = "Idling the NLP Engine"
 	status_logger(status_logger_name, nlp_engine_main_end_status_key)
 
+	'''We can arrive at logs_folder_name from abstracts_log_name, instead of passing it to the NLP_Engine function each time'''
+	logs_folder_name = abstracts_log_name.split('Abstract')[0][:-1]
+
 	'''Importing the visualizer_main function to view the LDA Model built by the NLP_engine_main() function'''
 	visualizer_main(lda_model, corpus, id2word, logs_folder_name, status_logger_name)
 
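The added lines derive logs_folder_name from abstracts_log_name instead of threading it through as an extra parameter. The slicing assumes the abstracts log sits inside the run's LOG directory and that its file name begins with "Abstract"; a small sketch with a hypothetical path:

# Hypothetical path, following the "Abstract..." naming the slice assumes
abstracts_log_name = "LOGS/LOG_2020_09_15/Abstracts_Tiger.txt"

# split('Abstract')[0] keeps everything up to the file name;
# [:-1] trims the trailing path separator.
logs_folder_name = abstracts_log_name.split('Abstract')[0][:-1]
print(logs_folder_name)  # -> LOGS/LOG_2020_09_15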
4 changes: 2 additions & 2 deletions pyResearchInsights/Scraper.py
@@ -73,7 +73,7 @@ def url_generator(start_url, query_string, status_logger_name):
 	'''We generate the urls_to_scrape from the stripped down determiner element'''
 	urls_to_scrape = [(start_url+str(counter)+"?facet-content-type=\"Article\"&query="+query_string+"&facet-language=\"En\"") for counter in range(1, (int(determiner.replace(',', '')) + 1))]
 
-	url_generator_stop_status_key = determiner.replace(',', '') + " URLs have been obtained"
+	url_generator_stop_status_key = determiner.replace(',', '') + " page URLs have been obtained"
 	status_logger(status_logger_name, url_generator_stop_status_key)
 
 	return urls_to_scrape
@@ -365,7 +365,7 @@ def processor(abstract_url, urls_to_scrape, abstract_id_log_name, abstracts_log_
 	visualizer code to generate the trends histogram.'''
 	permanent_word_sorter_list = word_sorter_list_generator(status_logger_name)
 
-	for site_url_index in range(0, (len(urls_to_scrape)+1)):
+	for site_url_index in range(0, len(urls_to_scrape)):
 		print(urls_to_scrape[site_url_index])
 		if(site_url_index==0):
 			results_determiner(urls_to_scrape[site_url_index], status_logger_name)
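The loop change in processor() fixes an off-by-one: range(0, len(urls_to_scrape) + 1) walks one index past the end of the list, so the final iteration raised an IndexError. A minimal illustration with hypothetical page URLs:

urls_to_scrape = ["page_1", "page_2", "page_3"]  # hypothetical page URLs

# Old bound: range(0, len(urls_to_scrape) + 1) yields 0, 1, 2, 3 and
# urls_to_scrape[3] raises IndexError on the final pass.

# Fixed bound yields exactly the valid indices 0, 1, 2:
for site_url_index in range(0, len(urls_to_scrape)):
    print(urls_to_scrape[site_url_index])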
2 changes: 1 addition & 1 deletion pyResearchInsights/common_functions.py
@@ -104,7 +104,7 @@ def arguments_parser():
 
 	parser = argparse.ArgumentParser()
 	parser.add_argument("--keywords", help="Keyword to search on Springer", default="Tiger")
-	parser.add_argument("--trends", help="Keywords to generate the trends histogram for", default="Tiger")
+	parser.add_argument("--trends", help="Keywords to generate the trends histogram for", default="Conservation")
 	parser.add_argument("--paper", help="If papers have to be downloaded as well", default="No")
 
 	arguments = parser.parse_args()
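With the new default, running the pipeline without --trends builds the trends histogram for "Conservation" instead of silently reusing the "Tiger" search keyword. A quick sketch of how the defaults resolve (parsing an empty argument list stands in for a bare invocation):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--keywords", help="Keyword to search on Springer", default="Tiger")
parser.add_argument("--trends", help="Keywords to generate the trends histogram for", default="Conservation")
parser.add_argument("--paper", help="If papers have to be downloaded as well", default="No")

arguments = parser.parse_args([])  # no flags supplied, so the defaults kick in
print(arguments.keywords, arguments.trends, arguments.paper)  # Tiger Conservation No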
