minor changes

CaileanCarter · May 11, 2021 · 7155b9e · 7155b9e
1 parent ad95ac0
commit 7155b9e
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 keyword-library.txt
 workbench.ipynb
 csv-Salmonella-set.csv
-escherichi-set.csv
+escherichi-set.csv
+SalmonellaSerovar-set.csv
diff --git a/litsearch.py b/litsearch.py
@@ -118,6 +118,7 @@ def main(csv_file):
     lit["ST"] = lit["ST"].replace("", np.NaN)                       # Identify empty values in ST col
     lit = lit[lit["ST"].notna()]                                    # Filter out rows without mention of ST
     lit = PubMed.add_times_cited(lit)                               # Fetch times each article has been cited
+    print("Literature search has finished, you can now search for sequence types.")
     return lit
 
 

diff --git a/serovar-search.py b/serovar-search.py
@@ -0,0 +1,65 @@
+
+
+import re
+from collections import Counter
+from sys import argv
+
+import numpy as np
+import pandas as pd
+from Bio import Entrez
+
+from litsearch import PubMed, ask_email, search
+
+
+
+# Compiled once at import time; find_terms is applied to every title in the table.
+_SEROVAR_PATTERN = re.compile(r"Salmonella enterica [Ss]erovar (\w+)")
+
+
+def find_terms(text):
+    """Extract the serovar name that follows "Salmonella enterica serovar".
+
+    Parameters
+    ----------
+    text : str
+        Text to scan (an article title when called from ``main``).
+
+    Returns
+    -------
+    str or float
+        The single word captured after "Salmonella enterica serovar"
+        (either "serovar" or "Serovar"), or ``np.nan`` when the phrase is
+        absent or *text* is not a string.
+    """
+    # pandas represents an empty cell as a float NaN; re.search would raise
+    # TypeError on it, so treat any non-string as "no serovar mentioned".
+    if not isinstance(text, str):
+        return np.nan
+    match = _SEROVAR_PATTERN.search(text)
+    if match is None:
+        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
+        return np.nan
+    return match.group(1)
+
+
+def main(csv_file):
+    """Build the serovar literature table from a CSV file.
+
+    Reads *csv_file* with its first column as the index, drops the
+    "PMCID", "NIHMS ID" and "First Author" columns when present, adds a
+    "Serovar" column by applying ``find_terms`` to each "Title", keeps only
+    the rows where a serovar was found, and passes the result through
+    ``PubMed.add_times_cited``.
+
+    Parameters
+    ----------
+    csv_file : str
+        Path of the CSV file to read.
+
+    Returns
+    -------
+    The value returned by ``PubMed.add_times_cited`` for the filtered frame
+    (presumably the same DataFrame with citation counts added -- confirm
+    against litsearch.py).
+    """
+
+    lit = pd.read_csv(csv_file, index_col=0)                        # make DataFrame
+    # errors="ignore": the "new" command writes this function's result back
+    # to the same path, so on a re-run these columns have already been removed.
+    lit = lit.drop(["PMCID", "NIHMS ID", "First Author"], axis=1, errors="ignore")
+    lit["Serovar"] = lit["Title"].apply(find_terms)                 # Identify serovar from title
+    lit = lit[lit["Serovar"].notna()]                               # Filter out rows without mention of a serovar
+    lit = PubMed.add_times_cited(lit)
+
+    return lit
+
+
+if __name__ == "__main__":
+
+    # Command line: <command> [file.csv] [term | pmid]
+    #   new      - rebuild the serovar table from the CSV and write it back
+    #   find     - search the table for a term
+    #   summary  - print the long PubMed summary for a PMID
+    usage = f"usage: {argv[0]} {{new|find|summary}} [file.csv] [term|pmid]"
+
+    if len(argv) < 2:
+        raise SystemExit(usage)
+    command = argv[1]
+
+    # The CSV path is optional; check the length first so that e.g. a bare
+    # "new" falls back to the default file instead of raising IndexError.
+    if len(argv) > 2 and argv[2].endswith(".csv"):
+        fp = argv[2]
+        extra_args = argv[3:]
+    else:
+        # NOTE(review): default kept as-is; presumably a Salmonella serovar
+        # data set was intended for this script -- confirm.
+        fp = "escherichi-set.csv"
+        extra_args = argv[2:]
+
+    if command == "new":
+        ask_email()
+        df = main(fp)
+        df.to_csv(fp)
+
+    elif command == "find":
+        if not extra_args:
+            raise SystemExit(usage)
+        df = pd.read_csv(fp, index_col=0)
+        term = extra_args[0]
+        print(search(df, term))
+
+    elif command == "summary":
+        # Validate before prompting for an e-mail address.
+        if not extra_args:
+            raise SystemExit(usage)
+        ask_email()
+        pmid = extra_args[0]
+        PubMed.long_summary(pmid)
+
+    else:
+        # Unknown commands used to be ignored silently.
+        raise SystemExit(usage)