-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsolvation_predict.py
148 lines (119 loc) · 6.15 KB
/
solvation_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python
# encoding: utf-8
"""
solvation_predict.py to run RMG Abraham estimator for many solvent/solute combinations
Reads in three files:
adjList.txt containing RMG-style adjacency lists of all the solutes by name
Solvent Database.csv containing Abraham's parameters for the solvents (with a header row)
Solvent-Solute List.csv containing a list of Solvent/Solute pairs that should be estimated (with a header row)
Creates one file:
Solvent-Solute-Solvation_by_RMG.csv
Requires a working RMG Abraham.class and dependency RMG classes (which may be inside a .jar)
which in turn needs functioning GATPFit.exe in the appropriate path
(see RMG instructions at http://rmg.sourceforge.net/)
First created by Richard West on 2009-06-23.
Copyright (c) 2009 MIT. All rights reserved.
"""
# I think all these are standard libraries, so you should have them all (in Python 2.5+)
import sys
import os
import csv
import subprocess
import re
# you may need to change this to point to where your Abraham.class file is (which may be in an RMG.jar)
RMG_classpath = '$RMG/software/RMG'
import platform
if platform.system() in ('Windows', 'Microsoft'):
RMG_classpath = '%rmg%/classes'
# read in adjacency list data for all solutes
adjDict = dict()
adjList = open('adjList.txt','U') # the 'U' allows for universal line endings (ie. windows stuff)
for line in adjList:
line=line.rstrip()+'\n' # strip the line ending and replace with a proper one
if line.startswith('InChI='): # new solute; reset adj
inchi = line.strip()
adj = ''
adj += line
if line.isspace(): # reached end of adj; store it
adjDict[inchi] = adj
# should now have object like adjDict['InChI=1/C2H4O2/c1-4-2-3/h2H,1H3'] = 'multi-line adjacency list string'
# read in solvent data
solventReader = csv.DictReader(open('Solvent Database.csv'), dialect='excel') # assumes first line contains field names
solventDict = dict()
for solvent in solventReader:
assert solvent.has_key('a'), "Solvent Database should have 'a' column"
solventName = solvent['Solvents']
solventDict[solventName] = solvent
# should now have object like: solventDict['water']['a']
# read in Solvent/Solute list
solventSoluteList = list() # used to make sure the output is in the same order as the input
solventSoluteReader = csv.DictReader(open('Solvent-Solute list.csv'), dialect='excel') # assumes first line contains field names
solutesDict = dict() # a list of solutes for each solvent
dataDict = dict() # a dictionary (of dictionaries) that we will store values in later
for entry in solventSoluteReader:
solvent = entry['Solvent']
solute = entry['Solute']
if not solutesDict.has_key(solvent):
solutesDict[solvent]=list()
if not dataDict.has_key(solvent):
dataDict[solvent]=dict()
solutesDict[solvent].append(solute)
solventSoluteList.append((solvent,solute))
# should now have structure like: solutesDict['water'] = [list of solutes]
# and: dataDict['water'] = {} # empty dictionary to store values in later
# and: solventSoluteList = [list of (solvent,solute) tuples]
errors=[] # we'll store them all up to summarise at the end
# now iterate over solvents
for solvent,solutes in solutesDict.items(): # (warning: iterating over a dictionary, so order may be different each time)
sv=solventDict[solvent] # get solvent parameters
AbrahamFile=open('Abraham_input.txt','w')
AbrahamFile.write('SolventParameters: %s %s %s %s %s %s %s\n\n'%(sv['c'], sv['e'], sv['s'], sv['a'], sv['b'], sv['l'], sv['Solvents']))
for solute in solutes:
AbrahamFile.write(adjDict[solute]) # write the adjacency lists out to the file
AbrahamFile.close()
# run the java program
command = 'java -classpath %s Abraham'%RMG_classpath
os.path.isdir('GATPFit') or os.makedirs('GATPFit') # apparently this dir must exist for the java to run
print "Running java Abraham for ",solvent
process=subprocess.Popen(command,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True )
output,error=process.communicate()
if error:
errorMessage= "Error for something(s) dissolved in %s:\n%s"%(solvent,error)
print errorMessage
errors.append(errorMessage)
# raise StandardError(error) # if we want it to stop on error, uncomment this line
# ## interpreting the standard output
# # not sure this works as expected...
# searcher=re.compile('The free energy of solvation in (?P<solvent>\S+) at 298K without reference state corrections is = (?P<correction>\S+) J/mol')
# matches = searcher.findall(output)
# assert len(matches) == len(solutes), "%d answers from %d inputs to java Abraham code"%(len(matches),len(solutes))
# for i, (solvent, correction) in enumerate(matches):
# solute=solutes[i]
# print solvent,solute,correction
## reading the output file
# I think this works as intended
AbrahamFile=open('Abraham_output.txt','U') # U to allow universal line endings (for Windows)
line=AbrahamFile.readline()
match = re.search('Solvent:\s+(?P<solvent>.*)',line)
assert match, "Didn't find Solvent: line at top of output file"
assert solvent.startswith( match.groupdict()['solvent'].strip()), "Solvent in output is not solvent in input!"
# use startswith() because when solvent is 'dibutyl ether' the Abraham_output.txt will just say 'dibutyl'
searcher=re.compile('(?P<solute>InChI=\S+)\s+(?P<value>\S+)')
matches = searcher.findall(AbrahamFile.read()) # read the rest in one go
for (solute, value) in matches:
print solvent,solute,value
dataDict[solvent][solute]=value
# create the output file
outfile = open('Solvent-Solute-Solvation_by_RMG.csv','wb')
outputter = csv.DictWriter(outfile,['solvent','solute','logK'],dialect='excel')
for (solvent,solute) in solventSoluteList: # the order of the input file is preserved
answer={'solvent':solvent, 'solute':solute, 'logK':dataDict[solvent][solute]}
outputter.writerow(answer)
outfile.close()
print "\nSummary of errors:"
for error in errors:
print error
def main():
pass
if __name__ == '__main__':
main()