diff --git a/.gitignore b/.gitignore
index 814b8e9..09d61ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 keyword-library.txt
 workbench.ipynb
 csv-Salmonella-set.csv
-escherichi-set.csv
\ No newline at end of file
+escherichi-set.csv
+SalmonellaSerovar-set.csv
\ No newline at end of file
diff --git a/litsearch.py b/litsearch.py
index 2bea3a0..1c94e95 100644
--- a/litsearch.py
+++ b/litsearch.py
@@ -118,6 +118,7 @@ def main(csv_file):
     lit["ST"] = lit["ST"].replace("", np.NaN) # Identify empty values in ST col
     lit = lit[lit["ST"].notna()] # Filter out rows without mention of ST
     lit = PubMed.add_times_cited(lit) # Fetch times each article has been cited
+    print("Literature search has finished, you can now search for sequence types.")
 
     return lit
 
diff --git a/serovar-search.py b/serovar-search.py
new file mode 100644
index 0000000..eb6c35d
--- /dev/null
+++ b/serovar-search.py
@@ -0,0 +1,93 @@
+"""Find Salmonella enterica serovar names in the titles of a literature CSV.
+
+Usage:
+    python serovar-search.py new [FILE.csv]
+    python serovar-search.py find [FILE.csv] TERM
+    python serovar-search.py summary [FILE.csv] PMID
+
+"new" filters FILE.csv down to titles naming a serovar and overwrites it,
+"find" prints search(df, TERM) for the table in FILE.csv, and "summary"
+calls PubMed.long_summary(PMID).  FILE.csv falls back to DEFAULT_CSV when
+the argument after the command is missing or does not end in ".csv".
+"""
+
+import re
+from collections import Counter
+from sys import argv
+
+import numpy as np
+import pandas as pd
+from Bio import Entrez
+
+from litsearch import PubMed, ask_email, search
+
+# NOTE(review): this default looks carried over from litsearch.py; confirm
+# whether "SalmonellaSerovar-set.csv" was intended for the serovar workflow.
+DEFAULT_CSV = "escherichi-set.csv"
+
+# Compiled once at import; group 1 is the word that follows "serovar".
+SEROVAR_PATTERN = re.compile(r"Salmonella enterica [Ss]erovar (\w+)")
+
+
+def find_terms(text):
+    """Return the serovar named in *text*, or NaN when none is found.
+
+    Non-string input, such as the NaN pandas yields for an empty cell, also
+    returns NaN instead of raising ``TypeError`` from the regex search.
+    """
+    if not isinstance(text, str):
+        return np.nan
+    match = SEROVAR_PATTERN.search(text)
+    return match.group(1) if match else np.nan
+
+
+def main(csv_file):
+    """Return the rows of *csv_file* whose title names a serovar.
+
+    Drops the "PMCID", "NIHMS ID" and "First Author" columns, adds a
+    "Serovar" column extracted from "Title", keeps only rows where it is
+    set, and passes the result through ``PubMed.add_times_cited``.
+    """
+    lit = pd.read_csv(csv_file, index_col=0)
+    lit = lit.drop(["PMCID", "NIHMS ID", "First Author"], axis=1)
+    lit["Serovar"] = lit["Title"].apply(find_terms)  # Serovar from title
+    lit = lit[lit["Serovar"].notna()]  # Drop rows with no serovar mention
+    lit = PubMed.add_times_cited(lit)
+
+    return lit
+
+
+if __name__ == "__main__":
+
+    if len(argv) < 2:
+        raise SystemExit(__doc__)
+    command = argv[1]
+
+    # Check the length before indexing so the default file stays reachable
+    # when only a command is given (argv[2] would raise IndexError).
+    if len(argv) > 2 and argv[2].endswith(".csv"):
+        fp = argv[2]
+        rest = argv[3:]
+    else:
+        fp = DEFAULT_CSV
+        rest = argv[2:]
+
+    if command == "new":
+        ask_email()
+        df = main(fp)
+        df.to_csv(fp)
+
+    elif command == "find":
+        if not rest:
+            raise SystemExit(__doc__)
+        df = pd.read_csv(fp, index_col=0)
+        print(search(df, rest[0]))
+
+    elif command == "summary":
+        if not rest:
+            raise SystemExit(__doc__)
+        ask_email()
+        PubMed.long_summary(rest[0])
+
+    else:
+        raise SystemExit(__doc__)