Added functions to get terms

SystemsGenetics · Oct 17, 2021 · 30d29ba · 30d29ba
1 parent 8020690
commit 30d29ba
Show file tree

Hide file tree

Showing 22 changed files with 132 additions and 50 deletions.
diff --git a/bin/AraCyc.sh b/bin/AraCyc.sh
diff --git a/bin/GO.sh b/bin/GO.sh
diff --git a/bin/IPR.sh b/bin/IPR.sh
diff --git a/bin/KEGG.sh b/bin/KEGG.sh
diff --git a/bin/PO.sh b/bin/PO.sh
diff --git a/bin/Pfam.sh b/bin/Pfam.sh
diff --git a/bin/README.md b/bin/README.md
diff --git a/bin/RiceCyc.sh b/bin/RiceCyc.sh
diff --git a/func_e/FUNC_E.py b/func_e/FUNC_E.py
@@ -550,8 +550,6 @@ def _calculateClusterStats(self, clusters, module):
             }, ignore_index=True)
         return cluster_stats
 
-
-
     def doModuleClustering(self, module):
         """
         """

diff --git a/func_e/vocabs/GO.py b/func_e/vocabs/GO.py
@@ -0,0 +1,26 @@
+import requests
+import re
+import pandas as pd
+
+def getTerms():
+    """
+    Download the Gene Ontology OBO file and return its terms.
+
+    Only ``[Term]`` stanzas are read.  A term is kept when its stanza
+    provides a ``GO:<digits>`` id, a name and a namespace line.
+
+    Returns:
+        pandas.DataFrame: one row per term with the columns ``Vocab``
+        (always ``'GO'``), ``Term`` (the GO id) and ``Name``.
+
+    Raises:
+        requests.HTTPError: if the download returns an HTTP error status.
+    """
+    url = 'http://purl.obolibrary.org/obo/go.obo'
+    r = requests.get(url, allow_redirects=True)
+    # Fail loudly rather than trying to parse an error page as OBO text.
+    r.raise_for_status()
+
+    terms_list = []
+    cols = {}
+    in_term = False
+    for line in r.content.splitlines():
+        line = line.decode("utf-8")
+
+        # A stanza header such as "[Term]" closes the previous stanza.
+        # Flushing here (instead of on the next "id: GO" line) keeps
+        # name/namespace lines of a following non-term stanza from
+        # overwriting the fields of the term that is still pending.
+        if line.startswith('['):
+            if len(cols) == 3:
+                terms_list.append([cols['Vocab'], cols['Term'], cols['Name']])
+            cols = {}
+            in_term = line.strip() == '[Term]'
+            continue
+        if not in_term:
+            continue
+
+        m = re.search(r'^id: (GO:\d+)', line)
+        if m:
+            cols['Term'] = m.group(1)
+            continue
+        m = re.search(r'^name: (.+)', line)
+        if m:
+            cols['Name'] = m.group(1)
+            continue
+        if re.search(r'^namespace: ', line):
+            # The namespace value itself is not used; every term is
+            # reported under the single 'GO' vocabulary.
+            cols['Vocab'] = 'GO'
+
+    # The last stanza has no following header to trigger the flush above.
+    if len(cols) == 3:
+        terms_list.append([cols['Vocab'], cols['Term'], cols['Name']])
+
+    terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
+    return terms
diff --git a/func_e/vocabs/IPR.py b/func_e/vocabs/IPR.py
@@ -0,0 +1,17 @@
+import requests
+import pandas as pd
+
+def getTerms():
+    """
+    Download the InterPro entry list and return it as terms.
+
+    The first line of the file is skipped as a header.  Every other
+    line is split on tabs; the first column is used as the term id and
+    the third column as the term name.
+
+    Returns:
+        pandas.DataFrame: one row per entry with the columns ``Vocab``
+        (always ``'IPR'``), ``Term`` and ``Name``.
+
+    Raises:
+        requests.HTTPError: if the download returns an HTTP error status.
+    """
+    url = 'http://ftp.ebi.ac.uk/pub/databases/interpro/entry.list'
+    r = requests.get(url, allow_redirects=True)
+    # Fail loudly rather than trying to parse an error page as data.
+    r.raise_for_status()
+
+    terms_list = []
+    # [1:] drops the header row.
+    for line in r.content.splitlines()[1:]:
+        cols = line.decode("utf-8").split("\t")
+        # Skip blank or truncated lines instead of raising IndexError.
+        if len(cols) < 3:
+            continue
+        terms_list.append(['IPR', cols[0], cols[2]])
+    terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
+    return terms
diff --git a/func_e/vocabs/KEGG.py b/func_e/vocabs/KEGG.py
@@ -0,0 +1,52 @@
+import requests
+import pandas as pd
+
+def getOrthologs():
+    """
+    Download the KEGG 'ko' (ortholog) list and return it as terms.
+
+    Each line is split on tabs; the first column is the term id and the
+    second column the term name.  A leading ``ko:`` is removed from the
+    id.
+
+    Returns:
+        pandas.DataFrame: one row per entry with the columns ``Vocab``
+        (always ``'KEGG'``), ``Term`` and ``Name``.
+
+    Raises:
+        requests.HTTPError: if the download returns an HTTP error status.
+    """
+    url = 'http://rest.kegg.jp/list/ko'
+    r = requests.get(url, allow_redirects=True)
+    # Fail loudly rather than trying to parse an error page as data.
+    r.raise_for_status()
+
+    terms_list = []
+    for line in r.content.splitlines():
+        cols = line.decode("utf-8").split("\t")
+        # Skip blank or malformed lines that lack an id/name pair.
+        if len(cols) < 2:
+            continue
+        terms_list.append(['KEGG', cols[0], cols[1]])
+    terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
+    # Anchored so that only a leading prefix is stripped.
+    terms['Term'] = terms['Term'].str.replace(r'^ko:', '', regex=True)
+
+    return terms
+
+def getPathways():
+    """
+    Download the KEGG pathway list and return it as terms.
+
+    Each line is split on tabs; the first column is the term id and the
+    second column the term name.  Ids of the form ``path:map<rest>`` are
+    rewritten to ``ko<rest>``.
+
+    Returns:
+        pandas.DataFrame: one row per entry with the columns ``Vocab``
+        (always ``'KEGG'``), ``Term`` and ``Name``.
+
+    Raises:
+        requests.HTTPError: if the download returns an HTTP error status.
+    """
+    url = 'http://rest.kegg.jp/list/pathway'
+    r = requests.get(url, allow_redirects=True)
+    # Fail loudly rather than trying to parse an error page as data.
+    r.raise_for_status()
+
+    terms_list = []
+    for line in r.content.splitlines():
+        cols = line.decode("utf-8").split("\t")
+        # Skip blank or malformed lines that lack an id/name pair.
+        if len(cols) < 2:
+            continue
+        terms_list.append(['KEGG', cols[0], cols[1]])
+    terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
+    # Anchored to the start of the id.  The 'path:' part is optional so
+    # that ids listed without it (plain 'map<rest>') are converted too.
+    terms['Term'] = terms['Term'].str.replace(
+        r'^(?:path:)?map', 'ko', regex=True)
+    return terms
+
+def getModules():
+    """
+    Download the KEGG 'md' (module) list and return it as terms.
+
+    Each line is split on tabs; the first column is the term id and the
+    second column the term name.  A leading ``md:`` is removed from the
+    id.
+
+    Returns:
+        pandas.DataFrame: one row per entry with the columns ``Vocab``
+        (always ``'KEGG'``), ``Term`` and ``Name``.
+
+    Raises:
+        requests.HTTPError: if the download returns an HTTP error status.
+    """
+    url = 'http://rest.kegg.jp/list/md'
+    r = requests.get(url, allow_redirects=True)
+    # Fail loudly rather than trying to parse an error page as data.
+    r.raise_for_status()
+
+    terms_list = []
+    for line in r.content.splitlines():
+        cols = line.decode("utf-8").split("\t")
+        # Skip blank or malformed lines that lack an id/name pair.
+        if len(cols) < 2:
+            continue
+        terms_list.append(['KEGG', cols[0], cols[1]])
+    terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
+    # Anchored so that only a leading prefix is stripped.
+    terms['Term'] = terms['Term'].str.replace(r'^md:', '', regex=True)
+    return terms
+
+def getTerms():
+    """
+    Return all KEGG terms as a single DataFrame.
+
+    The module, ortholog and pathway lists are fetched (in that order)
+    and stacked as pathways, then orthologs, then modules.  The row
+    index of each individual frame is carried over unchanged.
+    """
+    module_terms = getModules()
+    ortholog_terms = getOrthologs()
+    pathway_terms = getPathways()
+
+    stacked = [pathway_terms, ortholog_terms, module_terms]
+    return pd.concat(stacked)
diff --git a/func_e/vocabs/Pfam.py b/func_e/vocabs/Pfam.py
@@ -0,0 +1,19 @@
+import requests
+import pandas as pd
+
+def getTerms():
+    """
+    Placeholder for retrieving Pfam terms.
+
+    The function currently returns ``None`` immediately; the download
+    and parsing code after the bare ``return`` is unreachable.
+    """
+    # Short-circuit: nothing below this line runs.
+    return
+
+    # NOTE(review): the URL points at a ``.gz`` file, but the code below
+    # splits the raw response bytes on lines and tabs without any
+    # decompression step -- confirm the file format before enabling.
+    url = 'http://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.full.gz'
+    r = requests.get(url, allow_redirects=True)
+
+    terms_list = []
+    # The first line is skipped as a header row.
+    in_header = True
+    for line in r.content.splitlines():
+        if in_header:
+            in_header = False
+            continue
+        cols = line.decode("utf-8").split("\t")
+        # Second column is used as the term id, fifth column as the name.
+        terms_list.append(['Pfam', cols[1], cols[4]])
+    terms = pd.DataFrame(terms_list, columns=['Vocab', 'Term', 'Name'])
+    return terms
diff --git a/func_e/vocabs/__init__.py b/func_e/vocabs/__init__.py
diff --git a/func_e/vocabs/__pycache__/GO.cpython-38.pyc b/func_e/vocabs/__pycache__/GO.cpython-38.pyc
diff --git a/func_e/vocabs/__pycache__/IPR.cpython-38.pyc b/func_e/vocabs/__pycache__/IPR.cpython-38.pyc
diff --git a/func_e/vocabs/__pycache__/KEGG.cpython-38.pyc b/func_e/vocabs/__pycache__/KEGG.cpython-38.pyc
diff --git a/func_e/vocabs/__pycache__/__init__.cpython-38.pyc b/func_e/vocabs/__pycache__/__init__.cpython-38.pyc
diff --git a/func_e/vocabs/__pycache__/all.cpython-38.pyc b/func_e/vocabs/__pycache__/all.cpython-38.pyc
diff --git a/func_e/vocabs/all.py b/func_e/vocabs/all.py
@@ -0,0 +1,16 @@
+from .GO import getTerms as GO_getTerms
+from .KEGG import getTerms as KEGG_getTerms
+from .IPR import getTerms as IPR_getTerms
+import pandas as pd
+
+def getTerms(vocabs=None):
+    """
+    Return the terms of the requested vocabularies as one DataFrame.
+
+    Args:
+        vocabs: collection of vocabulary names.  The names ``'GO'``,
+            ``'IPR'`` and ``'KEGG'`` are recognised; any other entry is
+            ignored.  ``None`` (the default) selects nothing.
+
+    Returns:
+        pandas.DataFrame: the selected vocabularies stacked in the fixed
+        order GO, IPR, KEGG with the columns ``Vocab``, ``Term`` and
+        ``Name``.  When nothing is selected an empty frame with those
+        columns is returned.
+    """
+    # None instead of a shared mutable list as the default value.
+    if vocabs is None:
+        vocabs = []
+
+    # Collect first and concatenate once, so the result is never built
+    # by concatenating onto an empty placeholder frame.
+    frames = []
+    if 'GO' in vocabs:
+        frames.append(GO_getTerms())
+    if 'IPR' in vocabs:
+        frames.append(IPR_getTerms())
+    if 'KEGG' in vocabs:
+        frames.append(KEGG_getTerms())
+
+    if not frames:
+        return pd.DataFrame(columns=['Vocab', 'Term', 'Name'])
+    return pd.concat(frames)
diff --git a/requirements.txt b/requirements.txt
@@ -5,3 +5,4 @@ statsmodels
 sklearn
 progressbar2
 scipy
+requests
diff --git a/setup.py b/setup.py
@@ -8,6 +8,7 @@
   sklearn
   progressbar2
   scipy
+  requests
 """.split()
 
 setup(
-Original file line number
+Diff line change
@@ Expand Up / @@ -550,8 +550,6 @@ def _calculateClusterStats(self, clusters, module): @@
                 }, ignore_index=True)
             return cluster_stats
         def doModuleClustering(self, module):
             """
             """
@@ Expand Down @@