-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathquery.py
executable file
·258 lines (232 loc) · 9.64 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#!/usr/bin/env python
""" Queries with MetaNetX data indexed with MongoDB or Elasticsearch """
import json
import re
import networkx as nx
from nosqlbiosets.dbutils import DBconnection
from nosqlbiosets.graphutils import remove_highly_connected_nodes
from nosqlbiosets.metanetx.index import TYPE_COMPOUND, TYPE_REACTION
from nosqlbiosets.qryutils import parseinputquery
def cobrababel_parse_metanetx_equation(equation):
""" Note: This function is a copy of the _parse_metanetx_equation() function
in cobrababel project metanetx.py file:
https://github.com/mmundy42/cobrababel
https://github.com/mmundy42/cobrababel/blob/master/LICENSE.txt
Copyright (c) 2017,2017, Mayo Foundation for Medical Education and Research
"""
metabolite_re = re.compile(
r'(\d*\.\d+|\d+) (MNXM\d+|BIOMASS)@(MNXD[\dX]|BOUNDARY)')
parts = equation.split(' = ')
if len(parts) != 2:
return None
# Build a dictionary keyed by metabolite ID with the information needed for
# setting the metabolites in a cobra.core.Reaction object.
metabolites = dict()
reactants = parts[0].split(' + ')
for r in reactants:
match = re.search(metabolite_re, r)
if match is None:
return None
met_id = '{0}_{1}'.format(match.group(2), match.group(3))
metabolites[met_id] = {
'mnx_id': match.group(2),
'coefficient': -1.0 * float(match.group(1)),
'compartment': match.group(3)
}
products = parts[1].split(' + ')
for p in products:
match = re.search(metabolite_re, p)
if match is None:
return None
met_id = '{0}_{1}'.format(match.group(2), match.group(3))
metabolites[met_id] = {
'mnx_id': match.group(2),
'coefficient': float(match.group(1)),
'compartment': match.group(3)
}
return metabolites
class QueryMetaNetX:
def __init__(self, db="MongoDB", index='biosets', version="", **kwargs):
self.dbc = DBconnection(db, index, **kwargs)
self.rcollection = TYPE_REACTION+version
self.ccollection = TYPE_COMPOUND+version
# Given MetaNetX compound id return its name
def getcompoundname(self, mid, limit=0):
if self.dbc.db == 'Elasticsearch':
index, doctype = TYPE_COMPOUND, "_doc"
qc = {"match": {"_id": mid}}
hits, n = self.esquery(self.dbc.es, index, qc, doctype)
else: # MongoDB
qc = {"_id": mid}
hits = list(self.dbc.mdbi[self.ccollection].find(qc, limit=limit))
n = len(hits)
assert 1 == n, "%s %s" % (self.dbc.db, mid)
c = hits[0]
desc = c['desc'] if 'desc' in c else c['_source']['desc']
return desc
def autocomplete_metabolitenames(self, qterm, **kwargs):
"""
Given query term find possible metabolite names that match query term
:param qterm: query term
"""
qc = {"desc": {"$regex": "^%s" % qterm, "$options": "i"}}
r = self.query_metabolites(qc, projection=['desc'], **kwargs)
return list(r)
# Text search metabolites with given query term
def textsearch_metabolites(self, queryterm):
assert "MongoDB" == self.dbc.db
qc = {'$text': {'$search': queryterm}}
hits = self.dbc.mdbi[self.ccollection].find(qc, projection={
'score': {'$meta': "textScore"}})
r = [c for c in hits]
r.sort(key=lambda x: x['score'])
return r
# Given KEGG compound ids find ids for other libraries
def keggcompoundids2otherids(self, cids, lib='MetanetX'):
if self.dbc.db == 'Elasticsearch':
index, doctype = TYPE_COMPOUND, "_doc"
qc = {"match": {"xrefs.id": ' '.join(cids)}}
hits, n = self.esquery(self.dbc.es, index, qc, doctype, len(cids))
else: # MongoDB
qc = {'xrefs.id': {'$in': cids}}
hits = list(self.dbc.mdbi[self.ccollection].find(qc))
n = len(hits)
assert len(cids) == n
mids = [None] * n
for c in hits:
i, libids = None, []
for xref in (c['xrefs'] if 'xrefs' in c else c['_source']['xrefs']):
if i is None and xref['lib'] == 'kegg':
for id_ in xref['id']:
if id_ in cids:
i = cids.index(id_)
break
if xref['lib'] == lib:
if libids is []:
libids = xref['id']
else:
libids += xref['id']
if i is not None:
assert len(libids) >= 1
mids[i] = libids[0] # TODO: check me
return mids
@staticmethod
def esquery(es, index, qc, doc_type=None, size=10):
print("Querying '%s' %s" % (doc_type, str(qc)))
r = es.search(index=index,
body={"query": qc}, size=size)
nhits = r['hits']['total']
return r['hits']['hits'], nhits
# Query metabolites with given query clause
def query_metabolites(self, qc, **kwargs):
assert "MongoDB" == self.dbc.db
if qc is None:
qc = {}
r = self.dbc.mdbi[self.ccollection].find(qc, **kwargs)
return r
# Query metabolites with given query clause
def aggregatequery_metabolites(self, qc, **kwargs):
assert "MongoDB" == self.dbc.db
assert qc is not None
r = self.dbc.mdbi[self.ccollection].aggregate(qc, **kwargs)
return r
# Query metabolites with given query clause
def aggregatequery_reactions(self, qc, **kwargs):
assert "MongoDB" == self.dbc.db
assert qc is not None
r = self.dbc.mdbi[self.rcollection].aggregate(qc, **kwargs)
return r
# Query compartments with given query clause
def query_compartments(self, qc=None):
assert "MongoDB" == self.dbc.db
if qc is None:
qc = {}
doctype = "metanetx_compartment"
hits = self.dbc.mdbi[doctype].find(qc)
r = [c for c in hits]
return r
# Query reactions with given query clause
def query_reactions(self, qc, **kwargs):
if "MongoDB" == self.dbc.db:
hits = self.dbc.mdbi[self.rcollection].find(qc, **kwargs)
r = [c for c in hits]
else:
index, doctype = self.rcollection, "_doc"
r, _ = self.esquery(self.dbc.es, index, qc, doctype, **kwargs)
return r
# Query reactions and return reactions together with their metabolites
def reactionswithmetabolites(self, qc, **kwargs):
cr = self.dbc.mdbi[self.rcollection].find(qc, **kwargs)
reacts = []
mids = set()
for r in cr:
eq = cobrababel_parse_metanetx_equation(r['equation'])
if eq is None:
continue
r['reactants'] = set()
r['products'] = set()
for m in eq.items():
mid = m[1]['mnx_id']
mids.add(mid)
if m[1]['coefficient'] < 0:
r['reactants'].add(mid)
else:
r['products'].add(mid)
reacts.append(r)
qc = {"_id": {"$in": list(mids)}}
cr = self.query_metabolites(qc, projection=['desc'])
# TODO: option to return metabolite names based on selected library
metabolites = {i['_id']: i['desc'] for i in cr}
return reacts, metabolites
def get_metabolite_network(self, qc, sidec=None, selfloops=False,
max_degree=40):
"""
Get network of metabolites,
edges refer to the unique set of reactions connecting two metabolites
:param qc: specify the subset of reactions in MetaNetX
:param sidec: metabolites not to include in the graph returned
:param selfloops: include self loops,
metabolites are often on both sides of reactions
:param max_degree: nodes with connections more than max_degree
are not included in the result graph
:return: metabolites graph as NetworkX object
"""
reacts, metabolites = self.reactionswithmetabolites(qc)
if sidec is not None:
sidec = " ".join(sidec)
mn = nx.DiGraph(name='MetaNetX',
query=json.dumps(qc).replace('"', '\''))
for r in reacts:
for u_ in r['reactants']:
if sidec is not None and u_ in sidec:
continue
u = metabolites[u_]
for v in r['products']:
if not selfloops and v == u_:
continue
if sidec is not None and v in sidec:
continue
v = metabolites[v]
if mn.has_edge(u, v):
er = mn.get_edge_data(u, v)['reactions']
if r['_id'] not in er:
er.append(r['_id'])
else:
er = list([r['_id']])
mn.add_edge(u, v, reactions=er,
sourcelib=r['source']['lib'], ec=r['ecno'])
remove_highly_connected_nodes(mn, max_degree=max_degree)
return mn
def cyview(query):
""" See metabolite networks with Cytoscape runing on your local machine """
from py2cytoscape.data.cyrest_client import CyRestClient
qc = parseinputquery(query)
qry = QueryMetaNetX()
mn = qry.get_metabolite_network(qc, max_degree=maxdegree)
client = CyRestClient()
client.network.create_from_networkx(mn)
if __name__ == '__main__':
import argh
argh.dispatch_commands([
cyview
])