Commit c4e3e55

python 3 compatibility, flake8 setup, "quotes" -> 'quotes'

wojdyr committed Nov 19, 2018
1 parent 966ce7f
Showing 12 changed files with 689 additions and 654 deletions.
4 changes: 2 additions & 2 deletions __init__.py
@@ -1,5 +1,5 @@

 import sys
-if sys.version_info[:2] != (2, 7):
-    sys.stderr.write("Error. Python 2.7 is required.\n")
+if sys.version_info[:2] != (2, 7) and sys.version_info[:2] < (3, 5):
+    sys.stderr.write('Error. Python 2.7 or 3.5+ is required.\n')
     sys.exit(1)
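The widened guard relies on tuple comparison: sys.version_info[:2] is a pair such as (3, 4), and Python compares tuples element by element, so 2.7 and any 3.5+ release pass while 3.0-3.4 are rejected. A standalone sketch of the logic (illustrative, not part of the commit):

    # Tuple comparison is lexicographic, which is what the new check relies on.
    for ver in [(2, 7), (3, 4), (3, 5), (3, 7)]:
        rejected = ver != (2, 7) and ver < (3, 5)
        print('%s %s' % (ver, 'rejected' if rejected else 'accepted'))
    # -> (2, 7) accepted, (3, 4) rejected, (3, 5) accepted, (3, 7) accepted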
4 changes: 2 additions & 2 deletions ccp4i-dimple.py
@@ -36,9 +36,9 @@ def add_blob(n):
 opt = parse_dimple_commands(sys.argv[1:])
 jobdir = os.path.abspath(opt.output_dir)
 if not os.path.exists(jobdir):
-    os.makedirs(jobdir) # it would be created by dimple, but too late
+    os.makedirs(jobdir)  # it would be created by dimple, but too late

-print "_JOB_DIRECTORY:", jobdir
+print("_JOB_DIRECTORY: %s" % jobdir)
 sys.stdout.flush()

 input_files = qtrapi.Files()
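Two portable print styles appear in this commit: here the arguments are folded into one %-formatted string, which Python 2 still parses as a print statement (the parentheses wrap a single expression) and Python 3 parses as a call, while prepare.py below adds a __future__ import instead. A minimal sketch of the difference, with a hypothetical jobdir value:

    from __future__ import print_function  # must come first in a real module

    jobdir = '/path/to/job'  # hypothetical value for illustration

    # Single %-formatted argument: safe on 2 and 3 even without the import.
    print('_JOB_DIRECTORY: %s' % jobdir)

    # Multi-argument form: only safe once print is a function; without the
    # import, Python 2 prints the tuple ('_JOB_DIRECTORY:', '/path/to/job').
    print('_JOB_DIRECTORY:', jobdir)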
25 changes: 11 additions & 14 deletions cell.py
@@ -31,8 +31,8 @@ def angle(x):
             if x == 90.: return '90'
             else: return '%.2f' % x
         return '%.2f, %.2f, %.2f, %s, %s, %s' % (
-                self.a, self.b, self.c,
-                angle(self.alpha), angle(self.beta), angle(self.gamma))
+            self.a, self.b, self.c,
+            angle(self.alpha), angle(self.beta), angle(self.gamma))

     def __str__(self):
         return '%-12s (%s)' % (self.symmetry, self.parameters_as_str())
@@ -105,7 +105,7 @@ def unscrewed_symmetry(self):
         return ' '.join(a[0] for a in self.symmetry.split())

 class Mat3(object):
-    "Matrix 3x3"
+    'Matrix 3x3'
     def __init__(self, *args):
         if len(args) == 1:
             self.m = tuple(args[0])
@@ -116,16 +116,16 @@ def __init__(self, *args):
     def __getitem__(self, index):
         return self.m[index]
     def __str__(self):
-        return "[%g %g %g; %g %g %g; %g %g %g]" % self.m
+        return '[%g %g %g; %g %g %g; %g %g %g]' % self.m
     def __repr__(self):
-        return "Mat3" + str(self.m)
+        return 'Mat3' + str(self.m)

     def __add__(self, other):
         assert isinstance(other, Mat3)
-        return Mat3(a+b for a,b in zip(self.m, other.m))
+        return Mat3(a+b for a, b in zip(self.m, other.m))
     def __sub__(self, other):
         assert isinstance(other, Mat3)
-        return Mat3(a-b for a,b in zip(self.m, other.m))
+        return Mat3(a-b for a, b in zip(self.m, other.m))
     # scalar must be float
     def __mul__(self, scalar):
         assert isinstance(scalar, float)
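A quick usage sketch of the Mat3 class touched above; it assumes the nine-argument constructor path not shown in this hunk (the identity() factory in the next hunk calls it that way) and that __mul__ scales every element:

    from dimple.cell import Mat3  # import path as used by prepare.py below

    m = Mat3(1, 0, 0,
             0, 1, 0,
             0, 0, 1)
    print(m + m)         # -> [2 0 0; 0 2 0; 0 0 2]  (__add__, then __str__)
    print(repr(m - m))   # -> Mat3(0, 0, 0, 0, 0, 0, 0, 0, 0)
    print(m * 2.0)       # scalar must be float: m * 2 would trip the assert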
@@ -137,7 +137,6 @@ def identity():
                     0, 1, 0,
                     0, 0, 1)

-
     def transpose(self):
         m = self.m
         return Mat3(m[0], m[3], m[6],
@@ -164,7 +163,7 @@ def trace(self):
     def inverse(self):
         d = self.det()
         if d == 0:
-            raise ValueError("Matrix is not invertible")
+            raise ValueError('Matrix is not invertible')
         m = self.m
         return Mat3(( m[4] * m[8] - m[5] * m[7]) / d,
                     (-m[1] * m[8] + m[2] * m[7]) / d,
@@ -212,7 +211,6 @@ def calculate_difference(meta1, meta2):
         return sys.float_info.max
     if match is False:
         return sys.float_info.max / 2
-    #return sum(abs(a-b) for a,b in zip(meta1.cell, meta2.cell))
     return meta1.to_standard().max_shift_in_mapping(meta2.to_standard())

 # space group utils
@@ -223,7 +221,7 @@ def match_symmetry(meta1, meta2):
     def sig(sym):
         first_chars = [a[0] for a in sym.split()]
         s = first_chars[0] + ''.join(sorted(first_chars[1:]))
-        if s == 'I112': # I2 is equivalent to C2
+        if s == 'I112':  # I2 is equivalent to C2
             return 'C112'
         # note: I see names such as 'H 3' used in ccp4, never 'R 3'
         if s[0] == 'R':
@@ -246,7 +244,7 @@ def sig(sym):
             '622': 12,
             '23': 12,
             '432': 24,
-}
+            }


 # the list of space group names extracted from symop.lib with this script:
@@ -261,7 +259,7 @@ def sig(sym):
 # sg = sgtbx.space_group(sgs.hall())
 # if sg.is_chiral():
 #     print '"%s": "%s",' % (fields[3], spacegroups[0])
-_short_spg_names = {
+_short_spg_names = {  # noqa: E122
 "P1": "P 1", "P2": "P 1 2 1", "P21": "P 1 21 1", "C2": "C 1 2 1",
 "P222": "P 2 2 2", "P2221": "P 2 2 21", "P21212": "P 21 21 2",
 "P212121": "P 21 21 21", "C2221": "C 2 2 21", "C222": "C 2 2 2",
@@ -290,4 +288,3 @@ def sig(sym):
 def calculate_z_order(hm):
     pg = ''.join(a[0] for a in hm.split()[1:])
     return _centering_n[hm[0]] * _pg_symop[pg]
-
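For reference, calculate_z_order reduces the Hermann-Mauguin symbol to a point group (first character of each part after the lattice letter) and multiplies the centering multiplicity by the point-group order. A worked example; the 'P', 'C' and '222' entries below are assumed, since only '622', '23' and '432' are visible in the hunk above:

    _centering_n = {'P': 1, 'C': 2}  # assumed: primitive = 1, C-centred = 2
    _pg_symop = {'222': 4}           # assumed: point group 222 has 4 symops

    def calculate_z_order(hm):
        pg = ''.join(a[0] for a in hm.split()[1:])
        return _centering_n[hm[0]] * _pg_symop[pg]

    print(calculate_z_order('P 21 21 21'))  # '21 21 21' -> '222'; 1 * 4 = 4
    print(calculate_z_order('C 2 2 21'))    # also '222'; 2 * 4 = 8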
92 changes: 47 additions & 45 deletions contaminants/prepare.py
@@ -1,9 +1,10 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python
 # -*- coding: utf-8 -*-
 """
 Script that generates data.py - list of unit cells of contaminants
 """

+from __future__ import print_function
 import csv
 import hashlib
 import itertools
@@ -12,7 +13,10 @@
 import os
 import re
 import sys
-import urllib2
+try:
+    from urllib2 import urlopen  # Python 2
+except ImportError:
+    from urllib.request import urlopen  # Python 3
 from collections import OrderedDict
 import xml.etree.ElementTree as ET
 import numpy
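The try/except import is the dependency-free way to bind one urlopen name on both interpreters; every later call site, such as cached_urlopen below, stays unconditional, and writing the response with open(path, 'wb') round-trips the bytes that the Python 3 urlopen returns. The shim in isolation (example.com is a placeholder URL):

    try:
        from urllib2 import urlopen             # Python 2
    except ImportError:
        from urllib.request import urlopen      # Python 3

    data = urlopen('http://example.com').read()  # str on 2, bytes on 3
    with open('page.html', 'wb') as f:           # binary mode suits both
        f.write(data)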
@@ -25,8 +29,8 @@

 if __name__ == '__main__' and __package__ is None:
     assert os.path.basename(os.getcwd()) == 'contaminants'
-    sys.path.insert(1,
-        os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
+    sys.path.insert(1, os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                                    '..', '..')))
 import dimple.cell

 WIKI_URL = ('https://raw.githubusercontent.com/wiki/'
@@ -52,8 +56,8 @@ def cached_urlopen(url, cache_name):
     if not os.path.exists(path):
         if not os.path.isdir(CACHE_DIR):
             os.mkdir(CACHE_DIR)
-        print '--> %s' % path
-        response = urllib2.urlopen(url)
+        print('--> %s' % path)
+        response = urlopen(url)
         with open(path, 'wb') as f:
             f.write(response.read())
     return open(path)
@@ -113,7 +117,7 @@ def fetch_pdb_info_from_ebi(pdb_id):
     if not any(m in single_crystal_methods for m in exper_method):
         return None
     forms = set(a['form'] for a in summary['assemblies'])
-    assert forms.issubset({'homo', 'hetero'}) # some are both (2EKS)
+    assert forms.issubset({'homo', 'hetero'})  # some are both (2EKS)
     if forms != {'homo'}:
         return None
     entities = summary['number_of_entities']
@@ -132,11 +136,11 @@ def fetch_pdb_info_from_ebi(pdb_id):
     for alt_sum in summaries[1:]:
         assert [alt_sum['cell'][x] for x in par_names] == parameters
         assert alt_sum['spacegroup'] == sg
-    if sg == 'A 1': # pesky 1LKS
+    if sg == 'A 1':  # pesky 1LKS
         return None
     # Correct spacegroup, but in non-standard settings.
     # We could convert it to the reference/standard settings.
-    if sg in ['P 1 1 21']: # only this one for now
+    if sg in ['P 1 1 21']:  # only this one for now
         return None

     # Standarize unit cell settings (e.g. 3wai, 1y6e).
@@ -164,13 +168,12 @@ def dump_uniprot_tab(pdb_ids):
                  '?query=database:(type:pdb%%20%s)&sort=score&format=tab')
     pdb_ids = [x.strip(',') for x in sys.stdin.read().split()]
     assert all(len(x) == 4 for x in pdb_ids)
-    print >>sys.stderr, 'Read %d IDs' % len(pdb_ids)
+    print('Read %d IDs' % len(pdb_ids), file=sys.stderr)
     for pdb_id in pdb_ids:
         response = cached_urlopen(query_url % pdb_id, 'up-pdb-%s.tab' % pdb_id)
         text = response.read()
-        print pdb_id
-        print text
-        print
+        print(pdb_id)
+        print(text + '\n')

 def uniprot_names_to_acs(names):
     query_url = ('http://www.uniprot.org/uniprot/'
@@ -207,13 +210,13 @@ def fetch_uniref_clusters(acs, verbose=False):
                 continue
             #if key != 'UniRef100_P02931': continue
             if verbose:
-                print '%s -> %s (%s) - %d members, %d residues' % (
-                        ac, key, name, len(members), int(d['Length']))
+                print('%s -> %s (%s) - %d members, %d residues' % (
+                    ac, key, name, len(members), int(d['Length'])))
             if key in clusters:
                 sys.exit('Duplicated cluster: ' + key)
             clusters[key] = members
         if reader.line_num == 0:
-            print 'Not found in UniRef:', ac
+            print('Not found in UniRef:', ac)
             clusters['_' + ac] = [ac]  # we still want to keep this AC
     return clusters

@@ -222,7 +225,7 @@ def read_current_pdb_entries():
                         'current-rcsb.xml')
     root = ET.parse(f).getroot()
     entries = set(child.attrib['structureId'] for child in root)
-    print 'Current PDB entries: %d' % len(entries)
+    print('Current PDB entries:', len(entries))
     return entries

 def read_sifts_mapping():
@@ -264,8 +267,8 @@ def cluster_by_cell_size_scipy(cells):
     # matrix like from scipy.spatial.distance.pdist(..., metric=func)
     dist_matrix = numpy.zeros(n * (n - 1) // 2, dtype=numpy.double)
     k = 0
-    for i in xrange(0, n-1):
-        for j in xrange(i+1, n):
+    for i in range(0, n-1):
+        for j in range(i+1, n):
             dist_matrix[k] = cells[i].max_shift_in_mapping(cells[j])
             k += 1
     assert k == len(dist_matrix)
@@ -291,7 +294,6 @@ def get_representative_unit_cell(cells):
     def med_metric(c):
         return max(cell_distance(c, other) for other in cells)
     best = min(cells[:cutoff], key=med_metric)
-    #print '===>', med_metric(best)
     return best

 def write_output_file(representants):
@@ -319,7 +321,7 @@ def write_json_file(uniprot_acs, extra_info, representants):
         leaf = {'name': leaf_name}
         group.setdefault(ref.symmetry, []).append(leaf)
     data = {'name': '', 'children': []}
-    for ac, v in uniprot_acs.iteritems():
+    for ac, v in uniprot_acs.items():
         item = {'name': v, 'ac': ac, 'uniref': extra_info[v][0],
                 'desc': extra_info[v][1]}
         if v in groups:
@@ -331,48 +333,48 @@

 def main(verbose=False):
     page = cached_urlopen(WIKI_URL, -1).readlines()
-    parsed_page = parse_wiki_page(page) # { UniProt name: [some PDB IDs] }
-    uniprot_acs = uniprot_names_to_acs(parsed_page) # { AC: UniProt name }
+    parsed_page = parse_wiki_page(page)  # { UniProt name: [some PDB IDs] }
+    uniprot_acs = uniprot_names_to_acs(parsed_page)  # { AC: UniProt name }
     pdb2up = read_sifts_mapping()
     descriptions_from_wiki = {}

     # check for missing PDB IDs that were given explicitely in the wiki
-    for u, (desc, pp) in parsed_page.iteritems():
+    for u, (desc, pp) in parsed_page.items():
         descriptions_from_wiki[u] = desc
         for p in pp:
             if p not in pdb2up:
-                print 'Missing in SIFTS: %s' % p
-                pdb2up[p] = [k for k, v in uniprot_acs.iteritems() if v == u]
+                print('Missing in SIFTS:', p)
+                pdb2up[p] = [k for k, v in uniprot_acs.items() if v == u]

     # check for deprecated/removed PDB entries
     current_pdb_entries = read_current_pdb_entries()
     obsolete = [k for k in pdb2up if k not in current_pdb_entries]
-    print 'Obsolete PDB entries in SIFTS:', len(obsolete)
+    print('Obsolete PDB entries in SIFTS:', len(obsolete))
     for k in obsolete:
         del pdb2up[k]

     # reverse the PDB->UniProt mapping
     ac2pdb = {}
-    for p, acs in pdb2up.iteritems():
-        if len(acs) == 1: # don't care about heteromers
+    for p, acs in pdb2up.items():
+        if len(acs) == 1:  # don't care about heteromers
             ac2pdb.setdefault(acs[0], []).append(p)

     uniref_clusters = fetch_uniref_clusters(uniprot_acs, verbose)

     # mapping back uniref id to uniprot name that was used an input
     uniref_sources = {}
-    for uniref_name, uclust in uniref_clusters.iteritems():
+    for uniref_name, uclust in uniref_clusters.items():
         sources = [uniprot_acs[ac] for ac in uclust if ac in uniprot_acs]
         assert len(sources) == 1, sources
         uniref_sources[uniref_name] = sources[0]

     representants = []
     empty_cnt = 0
-    for uniref_name, uclust in uniref_clusters.iteritems():
+    for uniref_name, uclust in uniref_clusters.items():
         pdb_set = sum([ac2pdb[ac] for ac in uclust if ac in ac2pdb], [])
-        print '%s -> %s (%d entries) -> %d PDBs' % (
-                uniref_sources[uniref_name], uniref_name,
-                len(uclust), len(pdb_set))
+        print('%s -> %s (%d entries) -> %d PDBs' % (
+            uniref_sources[uniref_name], uniref_name,
+            len(uclust), len(pdb_set)))
         cells = []
         for pdb_id in pdb_set:
             cell = fetch_pdb_info_from_ebi(pdb_id)
@@ -391,35 +393,35 @@ def main(verbose=False):
             r.uniprot_name = uniref_sources[uniref_name]
             r.comment = uniref_name
             if verbose:
-                print ' '.join(p.pdb_id for p in pdb_cluster), '=>', r.pdb_id
+                print(' '.join(p.pdb_id for p in pdb_cluster), '=>', r.pdb_id)
                 print('\t%s %-9s (%6.2f %6.2f %6.2f %5.1f %5.1f %5.1f) %8.2f'
                       % ((r.pdb_id, r.symmetry) + r.cell + (r.quality,)))
             representants.append(r)
         if len(representants) == prev_len:
             empty_cnt += 1
         if not verbose:
-            print '\t-> %d' % (len(representants) - prev_len)
+            print('\t-> %d' % (len(representants) - prev_len))
     representants.sort(key=lambda x: x.a)
     write_output_file(representants)
-    print '%d (incl. %d absent) proteins -> %d unique unit cells -> %s' % (
-            len(uniprot_acs), empty_cnt, len(representants), OUTPUT_PY_FILE)
+    print('%d (incl. %d absent) proteins -> %d unique unit cells -> %s' % (
+        len(uniprot_acs), empty_cnt, len(representants), OUTPUT_PY_FILE))
     write_json_file(uniprot_acs,
                     dict((v, (k, descriptions_from_wiki[v]))
                          for k, v in uniref_sources.items()),
                     representants)
     a, b = min(itertools.combinations(representants, 2),
                key=lambda x: cell_distance(*x))
-    print 'Min. dist: %.2f%% between %s and %s' % (100*cell_distance(a, b),
-                                                   a.pdb_id, b.pdb_id)
+    print('Min. dist: %.2f%% between %s and %s' % (
+        100*cell_distance(a, b), a.pdb_id, b.pdb_id))
     contaminer_acs = fetch_contaminer_protein_acs()
     for ac in contaminer_acs:
         if not any(ac in uclust for uclust in uniref_clusters.values()):
-            print 'Only in ContaBase: %s %s' % (ac, contaminer_acs[ac])
-    for uniref_name, uclust in uniref_clusters.iteritems():
-        if len(set(uclust).intersection(contaminer_acs.viewkeys())) == 0:
+            print('Only in ContaBase:', ac, contaminer_acs[ac])
+    for uniref_name, uclust in uniref_clusters.items():
+        if len(set(uclust).intersection(contaminer_acs.keys())) == 0:
             src = uniref_sources[uniref_name]
-            if src not in NOT_IN_CONTABASE: # these are not news
-                print 'Not in ContaBase:', src
+            if src not in NOT_IN_CONTABASE:  # these are not news
+                print('Not in ContaBase:', src)

 if __name__ == '__main__':
     verbose = ('-v' in sys.argv[1:])
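Beyond print(), the substitutions recurring in this file are iteritems() -> items(), viewkeys() -> keys() and xrange() -> range(). The Python 3 spellings also exist on Python 2, where they return lists rather than views or lazy ranges, so call sites that only iterate keep working unchanged on both interpreters. A small sketch of why this is safe (values are illustrative):

    from __future__ import print_function

    d = {'P1': 1, 'P21': 2}
    for k, v in d.items():      # list of pairs on 2, a view on 3; both iterate
        print(k, v)
    # set operations accept keys() on both versions
    print(set(['P1', 'C2']).intersection(d.keys()))  # a set containing 'P1'
    for i in range(3):          # list on 2, lazy range on 3; same loop
        print(i)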