forked from comocheng/RMG-database
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathqueryNIST.py
536 lines (453 loc) · 20.9 KB
/
queryNIST.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
#!/usr/bin/env python
# encoding: utf-8
"""
This script queries the NIST Chemical Kinetics Database
(<http://kinetics.nist.gov>) for a given gas-phase reaction, parses the HTML
output, and creates a new file in the RMG database for that reaction and all
the matching kinetics.
"""
import os
import os.path
import sys
import re
from rmgpy.data.base import Entry
from rmgpy.data.reference import *
from rmgpy.kinetics import Arrhenius
from rmgpy.species import Species
from rmgpy.reaction import Reaction
import urllib
import urllib2
import cookielib
from BeautifulSoup import BeautifulSoup
################################################################################
forwardReaction = None
reverseReaction = None
reactionIndex = None
reactionLabel = None
def reaction(index, label, reactant1, product1, forwardDegeneracy, reverseDegeneracy, reactant2=None, product2=None):
global forwardReaction, reverseReaction, reactionIndex, reactionLabel
reactants = []
reactants.append(Species().fromAdjacencyList(reactant1))
if reactant2: reactants.append(Species().fromAdjacencyList(reactant2))
for reactant in reactants:
if reactant.label == '':
reactant.label = reactant.molecule[0].toSMILES()
products = []
products.append(Species().fromAdjacencyList(product1))
if product2: products.append(Species().fromAdjacencyList(product2))
for product in products:
if product.label == '':
product.label = product.molecule[0].toSMILES()
reactionIndex = index
reactionLabel = label
forwardReaction = Reaction(reactants=reactants, products=products, degeneracy=forwardDegeneracy)
reverseReaction = Reaction(reactants=products, products=reactants, degeneracy=reverseDegeneracy)
def entry(forward, kinetics, reference, referenceType, shortDesc, longDesc):
pass
def loadReaction(path):
"""
"""
print 'Loading reaction from {0}...'.format(os.path.relpath(path))
# Set up global and local context
global_context = {
'__builtins__': None,
'True': True,
'False': False,
'reactionIndex': None,
'reactionLabel': None,
'forwardReaction': None,
'reverseReaction': None,
}
local_context = {
'__builtins__': None,
'reaction': reaction,
'entry': entry,
'Arrhenius': Arrhenius,
'Article': Article,
'Book': Book,
'Thesis': Thesis,
'Reference': Reference,
}
# Process the file
f = open(path, 'r')
exec f in global_context, local_context
f.close()
return reactionIndex, reactionLabel, forwardReaction, reverseReaction
################################################################################
def getNISTIdentifier(species):
"""
For a given `species`, return a string identifier that NIST is likely to
associate with the same species in its database. In general the lowest
available CAS number seems to work. We query the NIST Chemical Identifier
Resolver to convert an InChI to a CAS number; if this fails we return the
chemical formula.
"""
inchi = species.molecule[0].toInChI()
smiles = species.molecule[0].toSMILES()
formula = species.molecule[0].getFormula()
# When this works, it is more likely to give a CAS number that NIST knows about
url = "http://webbook.nist.gov/cgi/inchi/{0}".format(urllib.quote(inchi))
try:
f = urllib2.urlopen(url, timeout=5)
except urllib2.URLError, e:
pass
else:
searcher = re.compile('<li><a href="/cgi/inchi\?GetInChI=C(\d+)')
for line in f:
m = searcher.match(line)
if m:
return m.group(1)
# When the above does not work, the below sometimes gives at least one CAS number
# It could also give multiple CAS numbers, in which case we keep the lowest
url = "http://cactus.nci.nih.gov/chemical/structure/{0}/cas".format((inchi))
try:
f = urllib2.urlopen(url, timeout=5)
except urllib2.URLError, e:
pass
else:
searcher = re.compile('(\d+)-(\d+)-(\d+)')
caslist = []
for line in f:
m = searcher.match(line)
if m:
caslist.append(int(m.group(0).replace('-', '')))
if len(caslist) > 0:
return str(min(caslist))
# If nothing else works, just use the chemical formula
return formula
################################################################################
def queryNIST(reactants, products):
"""
Query the NIST website for the gas-phase reaction consisting of the given
lists of `reactants` and `products`, which should already be in a form that
NIST understands (e.g. CAS number).
"""
# Create a session cookie (so that the units set below persist)
cookiejar = cookielib.CookieJar()
# Set the units to use for this session
setNISTUnits(reactants, products, cookiejar)
# Now search for the reaction
entries = queryNISTKinetics(reactants, products, cookiejar)
# Also extract reference information for each reference
# (This information is also available within the kinetics search results,
# but is much easier to extract from its details page)
for entry in entries:
queryNISTReference(entry, entry.reference, cookiejar)
# Clear the cookies we used for these queries
cookiejar.clear_session_cookies()
# Return the resulting entries
return entries
def setNISTUnits(reactants, products, cookiejar):
"""
Query the NIST website for the gas-phase reaction consisting of the given
lists of `reactants` and `products`, which should already be in a form that
NIST understands (e.g. CAS number).
"""
# Set the POST data to contain the units to use for this session
post = {
'energyUnits': 'kJ',
'evaluationTemperature': '300.0',
'moleculeUnits': 'Mole',
'pressureUnits': 'bar',
'referenceTemperature': '1.0',
'temperatureUnits': 'K',
'volumeUnits': 'cm',
}
# Make an HTML query to NIST to POST the units we want to use
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
request = opener.open('http://kinetics.nist.gov/kinetics/SetUnitsBean.jsp', data=urllib.urlencode(post))
unitsHTML = request.read()
request.close()
def queryNISTKinetics(reactants, products, cookiejar):
"""
Query the NIST website for the gas-phase reaction consisting of the given
lists of `reactants` and `products`, which should already be in a form that
NIST understands (e.g. CAS number).
"""
entries = []
# Set the POST data to contain the parameters for our reaction kinetics query
post = {
'boolean1': '',
'boolean2': 'and',
'boolean3': 'and',
'boolean4': 'and',
'category': '0',
'database': 'kinetics',
'doc': 'SearchForm',
'lp1': ' ',
'lp2': ' ',
'lp3': ' ',
'lp4': ' ',
'relate1': '=',
'relate2': '=',
'relate3': '=',
'relate4': '=',
'rp1': ' ',
'rp2': ' ',
'rp3': ' ',
'rp4': ' ',
'type': 'java',
'Units': '',
}
count = 0
for reactant in reactants:
count += 1
post['field{0:d}'.format(count)] = 'reactants'
post['text{0:d}'.format(count)] = reactant
for product in products:
count += 1
post['field{0:d}'.format(count)] = 'products'
post['text{0:d}'.format(count)] = product
post['numberOfFields'] = str(count)
# Formally query the NIST kinetics database
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
request = opener.open('http://kinetics.nist.gov/kinetics/Search.jsp', data=urllib.urlencode(post))
searchHTML = request.read()
request.close()
# Some checks to ensure that the search was successful
if "No records found" in searchHTML:
return []
elif "There was no match in the database for the following input" in searchHTML:
return []
elif "Click on a link in the table below to see detail on the selected reaction" in searchHTML:
return []
# Parse the returned HTML to extract the kinetics information
# We use the BeautifulSoup package because the returned HTML is probably
# not valid XML, so DOM and SAX parsers would choke
soup = BeautifulSoup(searchHTML)
form = soup.findAll(name='form', attrs={'name': 'KineticsResults'})[0]
for tr in form.findAll(name='tr'):
tdlist = tr.findAll(name='td')
if len(tdlist) == 17:
# Some extra effort is needed to extract the squib from the BeautifulSoup DOM tree
squib = re.search('\d\d\d\d\w\w\w?(/\w\w\w?)?\w+(-\w+)?:\d+', unicode(tdlist[2].a))
squib = squib.group()
# Assume the result is of modified Arrhenius form
Trange = tdlist[6].text
A = tdlist[8].text
n = tdlist[10].text
Ea = tdlist[12].text
k300 = tdlist[14].text
order = tdlist[16].text
# Reject results that don't have a valid preexponential or activation energy
if A == ' ' or Ea == ' ':
continue
# Reject results whose reaction order does not match the number of reactants
if int(order) != len(reactants):
continue
# Many times n is not given, so set it to zero if that happens
if n == ' ':
n = 0.0
# Convert the temperature range to separate minimum and maximum valies
if '-' in Trange:
Tmin, Tmax = Trange.split('-')
else:
Tmin = Trange; Tmax = Trange
# Create an entry for this result and append it to the list of entries
entry = Entry(
data = Arrhenius(
A = (float(A),"cm^3/(mol*s)" if len(reactants) == 2 else "s^-1"),
n = float(n),
Ea = (float(Ea),"kJ/mol"),
T0 = (1.0,"K"),
Tmin = (float(Tmin),"K"),
Tmax = (float(Tmax),"K"),
),
reference = squib, # Just save the squib for now; we'll get the actual reference later
)
entries.append(entry)
return entries
def queryNISTReference(entry, squib0, cookiejar):
# The NIST database stores its authors as last name first (e.g.
# "Allen, J.W."), which leads to comma overload in many-author references
# I prefer having the last name last (e.g. "J. W. Allen") to make things
# easier to read
# These regular expressions handle various forms for authors
author_regexes = [
(r'([\w-]+), (\w+\.)(\w+\.)(\w+\.), Jr.', r'\2 \3 \4 \1, Jr.'),
(r'([\w-]+), (\w+\.)(\w+\.), Jr.', r'\2 \3 \1, Jr.'),
(r'([\w-]+), (\w+\.)-(\w+\.), Jr.', r'\2-\3 \1, Jr.'),
(r'([\w-]+), (\w+-\w+\.), Jr.', r'\2 \1, Jr.'),
(r'([\w-]+), (\w+\.), Jr.', r'\2 \1, Jr.'),
(r'([\w-]+), (\w+\.)(\w+\.)(\w+\.)', r'\2 \3 \4 \1'),
(r'([\w-]+), (\w+\.)(\w+\.)', r'\2 \3 \1'),
(r'([\w-]+), (\w+\.)-(\w+\.)', r'\2-\3 \1'),
(r'([\w-]+), (\w+-\w+\.)', r'\2 \1'),
(r'([\w-]+), (\w+\.)', r'\2 \1'),
]
# Fetch the HTML page for the reference and kinetics
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
request = opener.open('http://kinetics.nist.gov/kinetics/Detail?id={0}'.format(squib0))
referenceHTML = request.read()
request.close()
# Parse the returned HTML to extract the reference information
# We use the BeautifulSoup package because the returned HTML is probably
# not valid XML, so DOM and SAX parsers would choke
soup = BeautifulSoup(referenceHTML)
squib = soup.table.findAll(text='Squib:')[0].parent.nextSibling[13:]
category = soup.table.findAll(text='Category:')[0].parent.nextSibling[7:].lower()
datatype = soup.table.findAll(text='Data type:')[0].parent.nextSibling[13:]
reftype = soup.table.findAll(text='Reference type:')[0].parent.nextSibling[13:].lower()
if reftype == 'technical report': reftype = 'journal article'
assert reftype in ['journal article', 'book chapter'], 'Unexpected reference type "{0}" from squib "{1}".'.format(reftype, squib)
# Extract the authors and process each author name to the preferred format
if reftype in ['journal article', 'book chapter']:
authors0 = soup.table.findAll(text='Author(s):')[0].parent.nextSibling[13:]
authors = []
for author0 in authors0.split(';'):
author0 = author0.strip()
for pattern, repl in author_regexes:
author = re.sub(pattern, repl, author0)
if author != author0:
break
authors.append(author)
# Create a reference object describing the reference based on the type in reftype
if reftype == 'journal article':
title = soup.table.findAll(text='Title:')[0].parent.nextSibling[13:]
journal = soup.table.findAll(text='Journal:')[0].parent.nextSibling[13:]
try:
volume = soup.table.findAll(text='Volume:')[0].parent.nextSibling[13:]
except IndexError:
volume = ''
pages = soup.table.findAll(text='Page(s):')[0].parent.nextSibling[13:]
year = soup.table.findAll(text='Year:')[0].parent.nextSibling[13:]
entry.reference = Article(
authors = authors,
title = title,
journal = journal,
volume = volume,
pages = re.sub(' - ', '-', pages),
year = year,
)
elif reftype == 'book chapter':
title = soup.table.findAll(text='Title:')[0].parent.nextSibling[13:]
publisher = soup.table.findAll(text='Publisher address:')[0].parent.nextSibling[13:]
year = soup.table.findAll(text='Year:')[0].parent.nextSibling[13:]
entry.reference = Book(
authors = authors,
title = title,
publisher = publisher,
year = year,
)
# Reference metadata common to all reference types
entry.reference.url = 'http://kinetics.nist.gov/kinetics/Detail?id={0}'.format(squib)
entry.referenceType = category
entry.shortDesc = datatype
entry.longDesc = """Imported from http://kinetics.nist.gov/kinetics/Detail?id={0}""".format(squib0)
return entry
################################################################################
def saveEntry(f, entry, forward):
f.write('entry(\n')
f.write(' forward = {0},\n'.format(forward))
# Write kinetics
kinetics = entry.data.toPrettyRepr()
lines = kinetics.splitlines()
f.write(' kinetics = {0}\n'.format(lines[0]))
for line in lines[1:-1]:
f.write(' {0}\n'.format(line))
f.write(' ),\n'.format(lines[0]))
# Write reference
reference = entry.reference.toPrettyRepr()
lines = reference.splitlines()
f.write(' reference = {0}\n'.format(lines[0]))
for line in lines[1:-1]:
f.write(' {0}\n'.format(line))
f.write(' ),\n'.format(lines[0]))
f.write(' referenceType = "{0}",\n'.format(entry.referenceType))
f.write(' shortDesc = u"""{0}""",\n'.format(entry.shortDesc))
f.write(' longDesc = \n')
f.write('u"""\n')
f.write(entry.longDesc.strip() + "\n")
f.write('""",\n')
f.write(')\n\n')
def save(reactionIndex, reactionLabel, forwardReaction, reverseReaction, forwardEntries, reverseEntries, filename):
"""
Save the results of a query to the NIST kinetics database to `filename` on
disk.
"""
with open(filename, 'w') as f:
f.write('#!/usr/bin/env python\n')
f.write('# encoding: utf-8\n\n')
f.write('reaction(\n')
f.write(' index = {0:d},\n'.format(reactionIndex))
f.write(' label = "{0!s}",\n'.format(reactionLabel))
f.write(' reactant1 = \n')
f.write('"""\n')
f.write(forwardReaction.reactants[0].toAdjacencyList())
f.write('""",\n')
if len(forwardReaction.reactants) > 1:
f.write(' reactant2 = \n')
f.write('"""\n')
f.write(forwardReaction.reactants[1].toAdjacencyList())
f.write('""",\n')
f.write(' product1 = \n')
f.write('"""\n')
f.write(forwardReaction.products[0].toAdjacencyList())
f.write('""",\n')
if len(forwardReaction.products) > 1:
f.write(' product2 = \n')
f.write('"""\n')
f.write(forwardReaction.products[1].toAdjacencyList())
f.write('""",\n')
f.write(' forwardDegeneracy = {0:d},\n'.format(forwardReaction.degeneracy))
f.write(' reverseDegeneracy = {0:d},\n'.format(reverseReaction.degeneracy))
f.write(')\n\n')
f.write('################################################################################\n\n')
for entry in forwardEntries:
saveEntry(f, entry, True)
for entry in reverseEntries:
saveEntry(f, entry, False)
################################################################################
def searchReaction(reactionIndex, reactionLabel, forwardReaction, reverseReaction, filename):
"""
For a given chemical reaction defined by a list of one or two `reactant`
species and a list of one or two `product` species, search the literature
(via the NIST Chemical Kinetics Database at <http://kinetics.nist.gov/> for
kinetic models for that reaction. The `forwardDegeneracy` and
`reverseDegeneracy` parameters are used to specify the forward and reverse
reaction-path degeneracies, respectively. The results of the search are
saved to the given `filename` on disk.
"""
reactants = forwardReaction.reactants
products = forwardReaction.products
forwardDegeneracy = forwardReaction.degeneracy
reverseDegeneracy = reverseReaction.degeneracy
# For each species, determine an identifier that the NIST database is likely to understand
print 'Determining NIST species identifiers...'
reactantIdentifiers = []; productIdentifiers = []
for species in reactants:
identifier = getNISTIdentifier(species)
reactantIdentifiers.append(identifier)
print ' Identifier for {0!s} is "{1}"'.format(species, identifier)
for species in products:
identifier = getNISTIdentifier(species)
productIdentifiers.append(identifier)
print ' Identifier for {0!s} is "{1}"'.format(species, identifier)
# Query the NIST database for kinetics in both directions
print 'Searching NIST for reaction {0} -> {1}...'.format(' + '.join([str(r) for r in reactants]), ' + '.join([str(p) for p in products]))
forwardEntries = queryNIST(reactantIdentifiers, productIdentifiers)
print '{0} results found.'.format(len(forwardEntries))
print 'Searching NIST for reaction {0} -> {1}...'.format(' + '.join([str(p) for p in products]), ' + '.join([str(r) for r in reactants]))
reverseEntries = queryNIST(productIdentifiers, reactantIdentifiers)
print '{0} results found.'.format(len(reverseEntries))
# Swap the reaction direction if there are more matches in the reverse
# direction than in the forward direction
if len(reverseEntries) > len(forwardEntries):
forwardReaction, reverseReaction = reverseReaction, forwardReaction
reactantIdentifiers, productIdentifiers = productIdentifiers, reactantIdentifiers
forwardEntries, reverseEntries = reverseEntries, forwardEntries
forwardDegeneracy, reverseDegeneracy = reverseDegeneracy, forwardDegeneracy
# Save the matching results in both directions to the specified file
print 'Saving {0} results to {1}...'.format(len(forwardEntries) + len(reverseEntries), filename)
save(reactionIndex, reactionLabel, forwardReaction, reverseReaction, forwardEntries, reverseEntries, filename)
################################################################################
if __name__ == '__main__':
if len(sys.argv) < 3:
raise Exception('You must pass the family name and reaction index (or multiple indices) as command-line parameters.')
family = str(sys.argv[1])
for index in sys.argv[2:]:
index = int(index)
filename = 'input/kinetics/families/{0!s}/training/{1:d}.py'.format(family, index)
reactionIndex, reactionLabel, forwardReaction, reverseReaction = loadReaction(filename)
filename = 'input/kinetics/families/{0!s}/training/{1:d}.py'.format(family, index)
searchReaction(reactionIndex, reactionLabel, forwardReaction, reverseReaction, filename)