Skip to content

Commit

Permalink
Removed last bit of perl stuff and simplified organization
Browse files Browse the repository at this point in the history
  • Loading branch information
spficklin committed Oct 17, 2021
1 parent ad30c38 commit a157df4
Show file tree
Hide file tree
Showing 19 changed files with 40 additions and 54 deletions.
File renamed without changes.
2 changes: 0 additions & 2 deletions docker/README.md

This file was deleted.

15 changes: 0 additions & 15 deletions docker/pl.Dockerfile

This file was deleted.

16 changes: 4 additions & 12 deletions func_e/FUNC_E.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,31 +144,23 @@ def importTerms2FeaturesFiles(self, files):
def importFiles(self, fdict):
    """Import all of the input files needed for a FUNC-E run.

    :param fdict: a dict that must contain the keys 'background',
        'query', 'terms' and 'terms2features'.  The values of 'terms'
        and 'terms2features' may each be a single filename or a list
        of filenames.
    :raises Exception: if any of the four required keys is missing.
    """
    # Membership test on the dict itself is equivalent to (and more
    # idiomatic than) testing fdict.keys().
    if 'background' in fdict:
        self.importBackgroundFile(fdict['background'])
    else:
        raise Exception("A background file is required.")

    if 'query' in fdict:
        self.importQueryFile(fdict['query'])
    else:
        raise Exception("A query file is required.")

    if 'terms' in fdict:
        # Accept either a single filename or a list of filenames.
        if isinstance(fdict['terms'], list):
            self.importTermsFiles(fdict['terms'])
        else:
            self.importTermsFiles([fdict['terms']])
    else:
        raise Exception("At least one term file is required.")

    if 'terms2features' in fdict:
        # Accept either a single filename or a list of filenames.
        if isinstance(fdict['terms2features'], list):
            self.importTerms2FeaturesFiles(fdict['terms2features'])
        else:
            self.importTerms2FeaturesFiles([fdict['terms2features']])
    else:
        raise Exception("At least one term2features file is required.")

def doCounts(self):
"""
Expand Down
40 changes: 25 additions & 15 deletions func_e/cmd.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
import os.path
from os import path
import argparse

from func_e.FUNC_E import FUNC_E
import func_e.vocabs.all as vocabs

def getTerms():
    """Entry point for the FUNC-E-terms console script.

    Parses the command-line arguments, retrieves the terms for the
    requested vocabularies and writes them to a tab-delimited
    '<outprefix.>terms.tsv' file suitable for the --terms argument of
    FUNC-E.
    """
    parser = argparse.ArgumentParser(description="This script generates files compatible with the --terms argument for FUNC-E.")

    # default=[] keeps this consistent with the FUNC-E parser's --vocab
    # argument and ensures vocabs.getTerms() receives an iterable (not
    # None) when --vocab is omitted.
    parser.add_argument("--vocab", dest="vocab", type=str, nargs='*', default=[],
        required=False, help="Optional. Specify the term vocabulary ID to perform enrichment and clustering. Provide as many vocabulary IDs as desired. Vocab IDs may include, for example, GO, IPR, KEGG, TOS, GNAME or whatever vocabularies are provided. Be sure that these vocabularies are present in the terms list or enrichment will not be performed.")
    parser.add_argument("--outprefix", dest="outprefix", type=str,
        default=None, required=False, help="Optional. Provide a prefix for the output file.")

    args = parser.parse_args()
    terms = vocabs.getTerms(args.vocab)

    # Prefix the output filename only when a prefix was given.
    outprefix = args.outprefix + '.' if args.outprefix else ''
    terms.to_csv(outprefix + 'terms.tsv', index=None, sep="\t", header=None)


def func_e():
"""
"""

# Retrieves the arguments provided on the command-line.
parser = argparse.ArgumentParser(description="This script will perform functional enrichment and enriched term clustering on a list of genes.You must provide a background file of gene or transcript names, a network or query file, a set of vocabularies (e.g. GO, InterPro, etc), and a file mapping genes in the network or query file to the terms in the vocabularies. For information on the format of these files see the argument section below.")

parser.add_argument("--background", dest="background", type=str,
Expand All @@ -27,10 +47,10 @@ def parseArgs():
parser.add_argument("--outprefix", dest="outprefix", type=str,
default=None, required=False, help="Optional. Provide a prefix for the output reports.")

parser.add_argument("--module", dest="module", type=str,
parser.add_argument("--module", dest="module", type=str, default=[],
required=False, help="Optional. Specify a module name to limit the counting by module.")

parser.add_argument("--vocab", dest="vocab", type=str, nargs='*',
parser.add_argument("--vocab", dest="vocab", type=str, nargs='*', default=[],
required=False, help="Optional. Specify the term vocabulary ID to perform enrichment and clustering. Provide as many vocabulary IDs as desired. Vocab IDs may include, for example, GO, IPR, KEGG, TOS, GNAME or whatever vocabularies are provided. Be sure that these vocabularies are present in the terms list or enrichment will be not be performed.")

parser.add_argument("--similarity_threshold", dest="similarity_threshold", type=str,
Expand All @@ -54,17 +74,7 @@ def parseArgs():
parser.add_argument("--verbose", dest="verbose", type=float, default="1",
required=False, help="Optional. Set to 1 to print to STDOUT default progress deteails. Setto 2 for debugging logs. Set to 0 to run quietly without anything printed to STDOUT. The default value is 1.")

# TODO: make sure that the either the network or query arguments are
# provided.

return parser.parse_args()


def func_e():
"""
The main subrouting of FUNC-E.
"""
args = parseArgs()
args = parser.parse_args()

fe = FUNC_E()
fe.setVerbosity(args.verbose)
Expand Down
6 changes: 3 additions & 3 deletions func_e/vocabs/GO.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def getTerms():
line = line.decode("utf-8")
if re.search(r'^id: GO', line):
if len(cols.keys()) == 3:
terms_list.append([cols['Vocab'], cols['Term'], cols['Name']])
terms_list.append([cols['Vocabulary'], cols['Term'], cols['Name']])
cols = {}
m = re.search(r'^id: (GO:\d+)', line)
cols['Term'] = m.group(1)
Expand All @@ -21,6 +21,6 @@ def getTerms():
cols['Name'] = m.group(1)
if re.search(r'^namespace: ', line):
m = re.search(r'^namespace: (.+)', line)
cols['Vocab'] = 'GO' #m.group(1)
terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
cols['Vocabulary'] = 'GO' #m.group(1)
terms = pd.DataFrame(terms_list, columns=['Vocabulary', 'Term', 'Name'])
return terms
2 changes: 1 addition & 1 deletion func_e/vocabs/IPR.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ def getTerms():
continue
cols = line.decode("utf-8").split("\t")
terms_list.append(['IPR', cols[0], cols[2]])
terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
terms = pd.DataFrame(terms_list, columns=['Vocabulary', 'Term', 'Name'])
return terms
6 changes: 3 additions & 3 deletions func_e/vocabs/KEGG.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def getOrthologs():
for line in r.content.splitlines():
cols = line.decode("utf-8").split("\t")
terms_list.append(['KEGG', cols[0], cols[1]])
terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
terms = pd.DataFrame(terms_list, columns=['Vocabulary', 'Term', 'Name'])
terms['Term'] = terms['Term'].str.replace(r'ko:','', regex=True)

return terms
Expand All @@ -26,7 +26,7 @@ def getPathways():
for line in r.content.splitlines():
cols = line.decode("utf-8").split("\t")
terms_list.append(['KEGG', cols[0], cols[1]])
terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
terms = pd.DataFrame(terms_list, columns=['Vocabulary', 'Term', 'Name'])
terms['Term'] = terms['Term'].str.replace(r'path:map','ko', regex=True)
return terms

Expand All @@ -40,7 +40,7 @@ def getModules():
for line in r.content.splitlines():
cols = line.decode("utf-8").split("\t")
terms_list.append(['KEGG', cols[0], cols[1]])
terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
terms = pd.DataFrame(terms_list, columns=['Vocabulary', 'Term', 'Name'])
terms['Term'] = terms['Term'].str.replace(r'md:','', regex=True)
return terms

Expand Down
4 changes: 2 additions & 2 deletions func_e/vocabs/Pfam.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

def getTerms():
return

url = 'http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz'
r = requests.get(url, allow_redirects=True)

Expand All @@ -15,5 +15,5 @@ def getTerms():
continue
cols = line.decode("utf-8").split("\t")
terms_list.append(['Pfam', cols[1], cols[4]])
terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
terms = pd.DataFrame(terms_list, columns=['Vocabulary', 'Term', 'Name'])
return terms
2 changes: 1 addition & 1 deletion func_e/vocabs/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def getTerms(vocabs = []):
"""
"""
terms = pd.DataFrame(columns=['Vocab', 'Term', 'Name'])
terms = pd.DataFrame(columns=['Vocabulary', 'Term', 'Name'])
if 'GO' in vocabs:
terms = pd.concat([terms, GO_getTerms()])
if 'IPR' in vocabs:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@
tests_require=['pytest'],
entry_points={'console_scripts': [
'FUNC-E = func_e.cmd:func_e',
'FUNC-E-terms = func_e.cmd:getTerms'
]},
)
Empty file added test/__init__.py
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit a157df4

Please sign in to comment.