-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDB_plugin.py
1481 lines (1239 loc) · 53.6 KB
/
PDB_plugin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Metadata for Plugin Manager
#
# Authors: PDBe, [email protected]
# License: BSD-2-Clause
# Version: 2.0.0a
# Citation-Required: No
"""
Described at PyMOL wiki:
https://pymolwiki.org/index.php/PDB_plugin
--- original plugin ---
Authors : PDBe
Date : 2016
--- enhancements ---
Authors : [email protected]
Date : 2019
DESCRIPTION
Fetches the specified data set from PDB and breaks it into multiple
objects based on the criteria named by the function.
USAGE
PDB_Analysis_Assemblies pdb_id
PDB_Analysis_Molecules pdb_id
PDB_Analysis_Domains pdb_id
PDB_Analysis_Validation pdb_id
count_chains selection
ARGUMENTS
pdb_id = string: 4-character PDB entry ID
selection = string: selection-expression or name-pattern {default: (all)}.
EXAMPLES
PDB_Analysis_Molecules 3mzw
NOTES
For detailed descriptions see help on individual commands.
"""
# Documentation resources:
# mmCIF: (macromolecular Crystallographic Information Framework)
# http://mmcif.wwpdb.org/
# Naming Cheat Sheet:
# entity: synonymous with molecule or molecular entity
# within molecule:
# in_chains: corresponds to PyMOL chain name
# in_struct_asyms: corresponds to PyMOL segment name
# top level molecule name/key: corresponds to PyMOL object name
# asym_id: unique identifier within the crystallographic asymmetric unit
from __future__ import print_function
from collections import namedtuple
import datetime
import importlib # needs at least python 2.7
import json # parsing the input
import logging
import os
import random
import re
import socket
import sys
import time
import pymol
import pymol.plugins
from pymol import cmd
from pymol import stored
# ftp site
_EBI_FTP = ('ftp://ftp.ebi.ac.uk/pub/databases/pdb/data/structures/divided/'
'mmCIF/%s/%s.cif.gz')
# clean mmcif
_UPDATED_FTP = 'https://www.ebi.ac.uk/pdbe/static/entry/%s_updated.cif.gz'
_EMPTY_PDB_NUM = frozenset(['', '.', '?', None])
def clear_analysis_data():
# Note that pymol.stored is just a convenient global variable where this
# data is attached to. It is not necessary to have it there for interaction
# with pymol.
# --- inputs from PDB API
stored.molecules = {} # get_molecules()
Sequences.clear()
# --- results of analysis
stored.polymer_count = 0 # Molecules()._process_molecules()
stored.ca_p_only_segments = set() # Molecules()._process_molecules()
def extendaa(*arg, **kw):
"""Wrapper across cmd.extendaa that also adds new command to pymol.cmd."""
def wrapper(func):
_self = kw.get('_self', cmd)
name = func.__name__
setattr(_self, name, func)
return _self.extendaa(*arg, **kw)(func)
return wrapper
class PdbFetcher(object):
"""Downloads PDB json data from URLs.
This class tries to use one of several libraries to get the data, depending
on which are installed on the system.
"""
def __init__(self):
self._modules = {} # Modules loaded by this class - like sys.modules.
self._fetcher = None # Fetcher method to use for get_data.
# Figure out which library we can use to fetch data.
import_errors = []
for lib in ('requests', 'urllib2', 'urllib.request', 'urllib.error'):
try:
self._modules[lib] = importlib.import_module(lib)
except Exception as e:
import_errors.append(e)
continue
if 'requests' in self._modules:
# Module 'requests' works on python 2.x and 3.x.
logging.debug('using requests module')
self._fetcher = self._get_data_with_requests
break
elif 'urllib2' in self._modules:
# Module 'urllib2' works on python 2.x.
# However, the fetcher code uses the python 3.x split up naming
# conventions, so point those names to the right place.
logging.debug('using urllib2 module in python 2.x')
self._modules['urllib.request'] = self._modules['urllib2']
self._modules['urllib.error'] = self._modules['urllib2']
self._fetcher = self._get_data_with_urllib
break
elif ('urllib.request' in self._modules and
'urllib.error' in self._modules):
# Module 'urllib.[request,errors]' works on python 3.x.
logging.debug('using urllib module in python 3.x')
self._fetcher = self._get_data_with_urllib
break
if not self._fetcher:
raise Exception(
__name__.split('.')[-1] +
": PDB can't be accessed due to missing python libraries:\n" +
'\n'.join([' ' + str(error) for error in import_errors]) +
'\n==> Install one of them!')
def get_data(self, url, description, **kw):
"""Returns PDB data from the given URL."""
logging.debug(description)
url = self._quote(url)
return self._fetcher(url, description, **kw)
@staticmethod
def _quote(url):
"""Returns URL with space escaped."""
return url.replace(' ', '%20')
def _get_data_with_requests(self, url, description):
"""Uses requests module to fetch and return data from the PDB URL."""
requests = self._modules['requests']
response = requests.get(url=url, timeout=60)
if response.status_code == 200:
data = response.json()
elif response.status_code == 404:
data = {}
else:
logging.debug('%d %s' % (response.status_code, response.reason))
data = {}
return data
def _get_data_with_urllib(self,
url,
description,
sleep_min=1,
sleep_max=20):
"""Uses urllib module to fetch and return data from the PDB URL.
Argument sleep_(min,max)=0 is mostly interesting for testing.
"""
urllib2_request = self._modules['urllib.request']
urllib2_error = self._modules['urllib.error']
data = {}
data_response = False
limit = 5
logging.debug(url)
date = datetime.datetime.now().strftime('%Y,%m,%d %H:%M')
for tries in range(1, limit + 1):
try:
response = urllib2_request.urlopen(url, None, 60)
except urllib2_error.HTTPError as e:
logging.debug(
'%s HTTP API error - %s, error code - %s, try %d' %
(date, description, e.code, tries))
# logging.debug(e.code)
if e.code == 404:
data_response = True
break
else:
# logging.debug(entry_url)
time.sleep(random.randint(sleep_min, sleep_max))
logging.debug(e)
except urllib2_error.URLError as e:
logging.debug('%s URL API error - %s, try %d' %
(date, e, tries))
# logging.debug(entry_url)
time.sleep(random.randint(sleep_min, sleep_max))
logging.debug(e)
except socket.timeout as e:
logging.debug('%s Timeout API error - %s, try %d' %
(date, e, tries))
# logging.debug(entry_url)
time.sleep(random.randint(sleep_min, sleep_max))
logging.debug(e)
else:
logging.debug(
'received a response from the %s API after %d tries' %
(description, tries))
data = json.load(response)
data_response = True
break
if not data_response:
logging.error('No response from the %s API' % description)
return data
class PdbApi(object):
"""Handles getting data from the PDB API service.
Some documentation of the PDBe server's REST API is available at
https://www.ebi.ac.uk/pdbe/pdbe-rest-api
Args:
pretty: If True, returns well indented, human readable result.
"""
def __init__(self,
server_root='https://www.ebi.ac.uk/pdbe/api',
pretty=False):
self._server_root = server_root.rstrip('/')
self._url_suffix = '?pretty=true' if pretty else ''
self._fetcher = PdbFetcher()
def get_summary(self, pdbid):
"""Returns a summary dictionary of the PDB entry.
Documentation: https://www.ebi.ac.uk/pdbe/api/doc/pdb.html
"""
url = self._get_url('pdb/entry/summary', pdbid)
return self._fetcher.get_data(url, 'summary')
def get_molecules(self, pdbid):
"""Returns a dictionary of molecule data.
Documentation: https://www.ebi.ac.uk/pdbe/api/doc/pdb.html
The contents is a conglomerate of data found in the mmCIF entity group
http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Groups/entity_group.html
(Better reference would be good)
Format:
<molecules> = dict(<PDB_ID>: list(<entity>))
<entity> = dict(<key>: <value>)
some keys:
entity_id: unique numeric ID (key)
molecule_name: list(name of this molecule)
in_chains: list(<chain_name>)
in_struct_asyms: list(<segment_name>)
number_of_copies: number of duplicate entities
molecule_type: enum(polypeptide|Bound|Water|...)
for polypeptide type:
sequence: sequence string
length: sequence length
ca_p_only: true/false; only CA and/or P atom positions available
"""
url = self._get_url('pdb/entry/molecules', pdbid)
return self._fetcher.get_data(url, 'molecules')
def get_sequences(self, pdbid):
"""Returns a dictionary of sequence data.
Documentation: https://www.ebi.ac.uk/pdbe/api/doc/pdb.html
The contents is similar to mmCIF poly_seq_scheme.
http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/pdbx_poly_seq_scheme.html
Format:
<sequences> = dict(<PDB_ID>: dict("molecules": list(<molecule>)))
<molecule> = dict(<key>: <value)
available keys:
entity_id: molecule number (see get_molecules above)
chains: list(<chain>)
<chain> = dict(<key>: <value>)
available keys:
chain_id: PyMOL's chain name (e.g in selection)
struct_asym_id: PyMOL's segment name (e.g. in selection)
residues: list(<residue>)
<residue>: dict(<key>: <value>)
available keys:
residue_number: int; sequential within this dict
author_residue_number: int; PyMOL residue number
within chain
observer_ration: float; ?
residue_name: char; 3-letter residue code
"""
url = self._get_url('pdb/entry/residue_listing', pdbid)
return self._fetcher.get_data(url, 'sequences')
def get_protein_domains(self, pdbid):
"""Documentation: https://www.ebi.ac.uk/pdbe/api/doc/sifts.html"""
url = self._get_url('mappings', pdbid)
return self._fetcher.get_data(url, 'protein_domains')
def get_nucleic_domains(self, pdbid):
"""Documentation:
https://www.ebi.ac.uk/pdbe/api/doc/nucleic_mappings.html
"""
url = self._get_url('nucleic_mappings', pdbid)
return self._fetcher.get_data(url, 'nucleic_domains')
def get_validation(self, pdbid):
"""Documentation: https://www.ebi.ac.uk/pdbe/api/doc/validation.html"""
url = self._get_url('validation/global-percentiles/entry', pdbid)
return self._fetcher.get_data(url, 'validation')
def get_residue_validation(self, pdbid):
"""Documentation: https://www.ebi.ac.uk/pdbe/api/doc/validation.html"""
url = self._get_url(
'validation/protein-RNA-DNA-geometry-outlier-residues/entry', pdbid)
return self._fetcher.get_data(url, 'residue validation')
def get_ramachandran_validation(self, pdbid):
"""Documentation: https://www.ebi.ac.uk/pdbe/api/doc/validation.html"""
url = self._get_url(
'validation/protein-ramachandran-sidechain-outliers/entry', pdbid)
return self._fetcher.get_data(url, 'ramachandran validation')
def _get_url(self, api_url, pdbid):
url = '/'.join((self._server_root, api_url, pdbid))
url += self._url_suffix
logging.debug('url: %s' % url)
return url
# FIXME(r2r): don't leave this as global variable
pdb = PdbApi()
class Presentation(object):
"""Manage PyMOL presentation properties."""
_OBJECT_COLORS = [
'red',
'green',
'blue',
'yellow',
'magenta',
'cyan',
'tv_red',
'tv_green',
'tv_blue',
'tv_yellow',
'lightmagenta',
'palecyan',
'raspberry',
'chartreuse',
'marine',
'paleyellow',
'hotpink',
'aquamarine',
'darksalmon',
'splitpea',
'slate',
'yelloworange',
'pink',
'greencyan',
'salmon',
'smudge',
'lightblue',
'limon',
'lightpink',
'teal',
'deepsalmon',
'palegreen',
'skyblue',
'wheat',
'dirtyviolet',
'deepteal',
'warmpink',
'limegreen',
'purpleblue',
'sand',
'violet',
'lightteal',
'firebrick',
'lime',
'deepblue',
'violetpurple',
'ruby',
'limon',
'density',
'purple',
'chocolate',
'forest',
'deeppurple',
'brown',
]
_VALIDATION_COLORS = ['yellow', 'orange', 'red']
@classmethod
def set_object_color(cls, color_num, obj):
"""Colors object from predetermined set, reusing as necessary."""
color = cls._OBJECT_COLORS[color_num % len(cls._OBJECT_COLORS)]
cmd.color(color, obj)
@classmethod
def set_validation_color(cls, color_num, obj):
"""Colors object from predetermined set, clamping to max."""
color = cls._VALIDATION_COLORS[min(color_num,
len(cls._VALIDATION_COLORS) - 1)]
cmd.color(color, obj)
@classmethod
def set_validation_background_color(cls, obj):
"""Colors object with validation background color."""
color = 'validation_background_color'
# Define this as a new color name.
cmd.set_color(color, [0.4, 1.0, 0.4])
cmd.color(color, obj)
@staticmethod
def set_transparency(selection, transparency):
cmd.set('cartoon_transparency', transparency, selection)
cmd.set('ribbon_transparency', transparency, selection)
cmd.set('stick_transparency', transparency, selection)
cmd.set('sphere_transparency', transparency, selection)
def get_polymer_display_type(segment_id, molecule_type, length):
"""Returns display type depending on molecule complexity."""
# Start with it being cartoon - then change as needed.
display_type = 'cartoon'
if stored.polymer_count > 50:
display_type = 'ribbon'
elif segment_id in stored.ca_p_only_segments:
logging.debug('set ribbon trace on')
cmd.set('ribbon_trace_atoms', 1)
display_type = 'ribbon'
elif 'polypeptide' in molecule_type and length < 20:
display_type = 'sticks'
elif 'nucleotide' in molecule_type and length < 3:
display_type = 'sticks'
elif 'saccharide' in molecule_type:
display_type = 'sticks'
# logging.debug(
# 'segment_id: %s, molecule_type: %s, length: %s, display_type: %s' %
# (segment_id, molecule_type, length, display_type))
return display_type
class Sequences(object):
"""Polymer sequences."""
Residue = namedtuple('Residue',
'chain_id pdb_num pdb_residue_num is_observed')
Range = namedtuple('Range', 'chain_id start_residue_num end_residue_num')
# TODO(r2r): For refactoring purposes temporarily moved stored.sequences
# into a Sequences class-level object. This should really be instance-level.
_sequences = {} # get_sequences() reformatted by Sequences()._build()
"""Stores polymer sequence data.
Format:
<sequences> = dict(<segment_id>: <residues>)
<residues> = dict(<residue_num>: <residue>)
<residue_num> = sequential numeric residue id within this sequence
<residue> = namedtuple
chain_id: PyMOL chain name
pdb_num: PDB residue number (excluding insertion codes)
pdb_residue_num: PyMOL residue number (including insertion codes),
i.e. 'resi' in selections
is_observed: bool TODO(r2r): exact meaning unclear
"""
def __init__(self, pdbid):
self._pdbid = pdbid
if not self._sequences:
self._build()
@classmethod
def clear(cls):
cls._sequences = {}
@staticmethod
def get_pdb_residue_num(pdb_num, pdb_insertion_code):
"""Returns a residue number as used in PDB data."""
if not pdb_insertion_code or pdb_insertion_code == ' ':
pdb_residue_num = str(pdb_num)
else:
pdb_residue_num = '%s%s' % (pdb_num, pdb_insertion_code)
# TODO(r2r): not clear why this substitution would be necessary.
pdb_residue_num = pdb_residue_num.replace('-', '\\-')
return pdb_residue_num
def _build(self):
"""Builds a dictionary of sequence residues."""
data = pdb.get_sequences(self._pdbid)
if self._pdbid not in data:
return
for molecule in data[self._pdbid]['molecules']:
for chain in molecule['chains']:
chain_id = chain['chain_id']
segment_id = chain['struct_asym_id']
for residue in chain['residues']:
residue_num = residue['residue_number']
pdb_num = residue['author_residue_number']
pdb_ins_code = residue['author_insertion_code']
pdb_residue_num = self.get_pdb_residue_num(
pdb_num, pdb_ins_code)
if residue['observed_ratio'] != 0:
is_observed = True
else:
is_observed = False
self._sequences.setdefault(segment_id,
{})[residue_num] = self.Residue(
chain_id, pdb_num,
pdb_residue_num, is_observed)
# logging.debug(self._sequences)
@staticmethod
def _order_range(start, end, start_pdb_residue_num, end_pdb_residue_num):
"""Returns (start_pdb_residue_num, end_pdb_residue_num) ordered such
that start < end.
"""
# logging.debug('PRE: start: %s, end %s' % (start, end))
if type(start) is str:
start = start.split('\\')[-1]
if type(end) is str:
end = end.split('\\')[-1]
if int(start) > int(end):
result = (end_pdb_residue_num, start_pdb_residue_num)
else:
result = (start_pdb_residue_num, end_pdb_residue_num)
# logging.debug('POST: start: %s, end: %s, %s' % (start, end, result))
return result
@classmethod
def _append_range(cls, start_residue, end_residue, ranges):
if not start_residue:
return # Range was not started yet - ignore.
start_pdb_residue_num, end_pdb_residue_num = cls._order_range(
start_residue.pdb_num, end_residue.pdb_num,
start_residue.pdb_residue_num, end_residue.pdb_residue_num)
ranges.append(
cls.Range(start_residue.chain_id, start_pdb_residue_num,
end_pdb_residue_num))
@staticmethod
def _get_trimmed_range(sequence, start_residue_num, end_residue_num):
"""Trims the sequence range of unobserved residues."""
# Get the range of existing residue numbers.
residue_numbers = sequence.keys()
first_residue_num = min(residue_numbers)
last_residue_num = max(residue_numbers)
# Bound start/end residue number range to existing residue numbers.
start_residue_num = max(start_residue_num, first_residue_num)
end_residue_num = min(end_residue_num, last_residue_num)
# logging.debug('first_residue_num: %s, last_residue_num: %s' %
# (first_residue_num, last_residue_num))
# logging.debug('start_residue_num: %s, end_residue_num: %s' %
# (start_residue_num, end_residue_num))
# Trim unobserved ends of sequence.
while not sequence[start_residue_num].is_observed:
start_residue_num += 1
if start_residue_num > end_residue_num:
logging.debug('domain unobserved')
return (None, None)
while not sequence[end_residue_num].is_observed:
end_residue_num -= 1
if start_residue_num > end_residue_num:
logging.debug('domain unobserved')
return (None, None)
return (start_residue_num, end_residue_num)
@classmethod
def get_ranges(cls, segment_id, start_residue_num, end_residue_num):
"""Returns contiguous residue ranges present in the sequence.
Pymol doesn't cope with non-contiguous ranges.
This function finds all ranges where the residue number increases by 1.
If it jumps then a separate residue range is generated.
"""
ranges = []
if segment_id not in cls._sequences:
return ranges
sequence = cls._sequences[segment_id]
start_residue_num, end_residue_num = cls._get_trimmed_range(
sequence, start_residue_num, end_residue_num)
# logging.debug('start_residue_num: %s, end_residue_num: %s' %
# (start_residue_num, end_residue_num))
if not start_residue_num:
return ranges
# Look at difference of pdb_num between neighboring residues in
# start-end range:
# - If its 1 then its contiguous; continue increasing current range.
# - If its 0 then there must be insert codes; that's ok, continue
# increasing current range.
# - if its > 1 or < 0 then there is a discontinuity; store the previous
# residues as a range and start a new range on the next residue.
# Also, if we run into a residue that doesn't have a valid pdb_num then
# close off the previous range.
range_start_residue = sequence[start_residue_num]
for current_residue_num in range(start_residue_num, end_residue_num):
current_residue = sequence[current_residue_num]
current_pdb_num = current_residue.pdb_num
next_pdb_num = sequence[current_residue_num + 1].pdb_num
# Skip over residues that don't have valid PDB numbers.
if current_pdb_num in _EMPTY_PDB_NUM:
continue
# Start a new range if none is started yet.
if not range_start_residue:
range_start_residue = current_residue
if next_pdb_num in _EMPTY_PDB_NUM:
cls._append_range(range_start_residue, current_residue, ranges)
range_start_residue = None
continue
pdb_num_jump = int(next_pdb_num) - int(current_pdb_num)
if pdb_num_jump > 1 or pdb_num_jump < 0:
# logging.debug('numbering not contiguous, jump %d - '
# 'store as range' % pdb_num_jump)
cls._append_range(range_start_residue, current_residue, ranges)
range_start_residue = None
# Append the last open range (if any) until end of residues of interst.
cls._append_range(range_start_residue, sequence[end_residue_num],
ranges)
# logging.debug(ranges)
return ranges
def get_range_selection(self, rng):
"""Returns a PyMOL selection for the range."""
selection = 'chain %s and resi %s-%s and %s' % (
rng.chain_id, rng.start_residue_num, rng.end_residue_num,
self._pdbid)
# logging.debug(selection)
return selection
def append_residue_selections(self, segment_id, selections):
"""Adds PyMOL selections strings for the segment to selections list."""
for residue in self._sequences[segment_id].values():
# logging.debug(residue)
selection = 'chain %s and resi %s and %s' % (
residue.chain_id, residue.pdb_residue_num, self._pdbid)
selections.append(selection)
# logging.debug(selection)
class Validation(object):
"""Performs validation of all polymeric entries."""
def __init__(self, molecules):
self._molecules = molecules
self._pdbid = molecules.pdbid
# Fetch all validation data.
self._val_data = pdb.get_validation(self._pdbid)
if self._val_data:
self._residue_data = pdb.get_residue_validation(self._pdbid)
self._ramachandran_data = pdb.get_ramachandran_validation(
self._pdbid)
def show(self):
if self._val_data:
logging.debug('There is validation for this entry')
self._show_per_residue_validation()
else:
logging.warning('No validation for this entry')
def _check_geometric_validation_outliers(self):
"""Checks for geometric validation outliers."""
try:
molecules = self._residue_data[self._pdbid]['molecules']
except Exception:
logging.debug('no residue validation for this entry')
return
for molecule in molecules:
# logging.debug(molecule)
for chain in molecule['chains']:
chain_id = chain['chain_id']
# logging.debug(chain_id)
for model in chain['models']:
for outlier_type in model['outlier_types']:
outliers = model['outlier_types'][outlier_type]
# logging.debug(outlier_type)
# logging.debug(outliers)
for outlier in outliers:
pdb_residue_num = Sequences.get_pdb_residue_num(
outlier['author_residue_number'],
outlier['author_insertion_code'])
# logging.debug(pdb_residue_num)
selection = 'chain %s and resi %s' % (
chain_id, pdb_residue_num)
self._tally_outlier(selection)
def _check_ramachandran_validation_outliers(self):
"""Checks for ramachandran validation outliers."""
if self._pdbid not in self._ramachandran_data:
return
for key in self._ramachandran_data[self._pdbid]:
outliers = self._ramachandran_data[self._pdbid][key]
if not outliers:
logging.debug('no %s' % key)
continue
logging.debug('ramachandran %s for this entry' % key)
for outlier in outliers:
# logging.debug(outlier)
chain_id = outlier['chain_id']
pdb_residue_num = Sequences.get_pdb_residue_num(
outlier['author_residue_number'],
outlier['author_insertion_code'])
selection = 'chain %s and resi %s' % (chain_id, pdb_residue_num)
self._tally_outlier(selection)
def _show_per_residue_validation(self):
"""Shows validation of all outliers, colored by number of outliers."""
# Display only polymers.
# FIXME(r2r): Molecules._process_molecule() does almost exactly the same
# thing. Whe should refactor this to use a single function, and it
# should probably live in the Molecule class which owns the respective
# data.
for molecule in self._molecules.molecules:
# logging.debug(molecule)
molecule_type = molecule['molecule_type']
if molecule_type in ['Water', 'Bound']:
continue
length = molecule['length']
for segment_id in molecule['in_struct_asyms']:
# logging.debug(segment_id)
selections = []
ranges = self._molecules.sequences.get_ranges(
segment_id, 1, length)
for rng in ranges:
selection = self._molecules.sequences.get_range_selection(
rng)
selections.append(selection)
display_type = get_polymer_display_type(segment_id,
'polypeptide', length)
pymol_selection = ' or '.join(['(%s)' % x for x in selections])
cmd.show(display_type, pymol_selection)
self._clear_outlier_tally()
self._check_geometric_validation_outliers()
self._check_ramachandran_validation_outliers()
self._display_outlier_tally()
cmd.enable(self._pdbid)
def _clear_outlier_tally(self):
self._outlier_tally = {}
def _tally_outlier(self, selection):
# uses a list so 0 means that there is one outlier for this residue.
color_num = self._outlier_tally.get(selection, 0)
self._outlier_tally[selection] = color_num + 1
def _display_outlier_tally(self):
Presentation.set_validation_background_color(self._pdbid)
for selection, color_num in self._outlier_tally.items():
Presentation.set_validation_color(color_num, selection)
class Molecules(object):
"""Analyze and visualize molecules."""
def __init__(self, pdbid):
self._pdbid = pdbid
# Analysis depends on some globally available data; load it.
if not stored.molecules:
stored.molecules = pdb.get_molecules(pdbid)
self._process_molecules()
self._sequences = Sequences(pdbid)
@property
def pdbid(self):
return self._pdbid
@property
def sequences(self):
return self._sequences
@property
def molecules(self):
return stored.molecules.get(self._pdbid, [])
def _process_molecules(self):
"""Generate derivative data from raw molecules.
* Creates ca_p_only_segments set which contains all segment_ids that
only have CA and/or P atom positions available in location data.
* Creates a global count of polymers in the data.
"""
for molecule in stored.molecules.get(self._pdbid, []):
# add ca only list
if molecule['ca_p_only']:
for segment_id in molecule['in_struct_asyms']:
stored.ca_p_only_segments.add(segment_id)
if molecule['molecule_type'] not in ['Water', 'Bound']:
stored.polymer_count += 1
def _process_molecule(self, molecule):
"""Returns the display type and selection criteria for the molecule."""
display_type = ''
selections = []
molecule_type = molecule['molecule_type']
for segment_id in molecule['in_struct_asyms']:
if molecule_type == 'Bound':
# Use segment_id to find residue number and chain ID.
# logging.debug(entity_name)
display_type = 'spheres'
self._sequences.append_residue_selections(
segment_id, selections)
else:
length = molecule['length']
# TODO(r2r): Computing a display_type per segment is rather
# useless if we only return a single display_type for the whole
# molecule. The display_type of the last segment wins.
display_type = get_polymer_display_type(segment_id,
molecule_type, length)
# logging.debug(segment_id)
ranges = self._sequences.get_ranges(segment_id, 1, length)
for rng in ranges:
selection = self._sequences.get_range_selection(rng)
selections.append(selection)
return display_type, selections
def show(self):
logging.debug('Display molecules')
cmd.set('cartoon_transparency', 0.3, self._pdbid)
cmd.set('ribbon_transparency', 0.3, self._pdbid)
for molecule in stored.molecules.get(self._pdbid, []):
if molecule['molecule_type'] == 'Water':
continue # Don't show water.
molecule_name = molecule['molecule_name']
if molecule_name:
entity_name = '-'.join(molecule_name)
if len(molecule_name) > 1:
entity_name += '_chimera'
else:
logging.debug('No name from the API')
entity_name = 'entity_%s' % molecule['entity_id']
# logging.debug('molecule %s: %s' %
# (molecule['entity_id'], entity_name))
object_name = entity_name.replace(' ', '_')
object_name = re.sub(r'\W+', '', object_name)
object_name = object_name[:250]
# logging.debug(object_name)
display_type, selections = self._process_molecule(molecule)
pymol_selection = ' or '.join(['(%s)' % x for x in selections])
logging.debug(pymol_selection)
cmd.select('temp_select', pymol_selection)
cmd.create(object_name, 'temp_select')
# logging.debug(display_type)
cmd.show(display_type, object_name)
# Color by molecule.
Presentation.set_object_color(int(molecule['entity_id']),
object_name)
cmd.delete('temp_select')
def show_assemblies(pdbid, mm_cif_file):
"""iterate through the assemblies and output images"""
logging.info('Generating assemblies')
# cmd.hide('everything')
Presentation.set_transparency('all', 0.0)
try:
assemblies = cmd.get_assembly_ids(pdbid) # list or None
assemblies = assemblies if assemblies else []
logging.debug(assemblies)
for assembly_id in assemblies:
logging.debug('Assembly: %s' % assembly_id)
assembly_name = pdbid + '_assem_' + assembly_id
logging.debug(assembly_name)
cmd.set('assembly', assembly_id)
cmd.load(mm_cif_file, assembly_name, format='cif')
logging.debug('finished Assembly: %s' % assembly_id)
except Exception:
logging.debug('pymol version does not support assemblies')
class Domains(object):
"""Analyze and visualize domains."""
_SEGMENT_ID_NAME = {
# domain_type -> segment_id key string
'CATH': 'domain',
'SCOP': 'scop_id',
'Pfam': '',
'Rfam': ''
}
Segment = namedtuple('Segment', 'entity_id chain_id segment_id start end')
def __init__(self, molecules):
self._molecules = molecules
self._pdbid = molecules.pdbid
def _map_all(self):
"""Make all domains."""
# mapped_domains = dict(<domain_type>: <typed_domain>)
# <typed_domain> = dict(<domain_id>: <named_domain>)
# <named_domain> = dict(<domain_name>: list(Segment)
mapped_domains = {}
self._map_domains(pdb.get_protein_domains(self._pdbid), mapped_domains)
self._map_domains(pdb.get_nucleic_domains(self._pdbid), mapped_domains)
return mapped_domains
def _map_domains(self, domains, mapped_domains):
if not domains:
logging.debug('no domain information for this entry')
return
for domain_type in domains[self._pdbid]:
try:
segment_id_name = self._SEGMENT_ID_NAME[domain_type]
except Exception:
# Ignore all domain types not in _SEGMENT_ID_NAME.
continue
for domain_id, domain in domains[self._pdbid][domain_type].items():
# logging.debug(domain_type)
# logging.debug(domain_id)
for mapping in domain.get('mappings', []):
domain_name = str(mapping.get(segment_id_name, ''))
start_residue_num = mapping['start']['residue_number']
end_residue_num = mapping['end']['residue_number']
chain_id = mapping['chain_id']
entity_id = mapping['entity_id']
segment_id = mapping['struct_asym_id']
ranges = self._molecules.sequences.get_ranges(
segment_id, start_residue_num, end_residue_num)
for rng in ranges:
mapped_domains.setdefault(domain_type, {}).setdefault(
domain_id, {}).setdefault(domain_name, []).append(
self.Segment(entity_id, chain_id, segment_id,
rng.start_residue_num,
rng.end_residue_num))
def show(self):
mapped_domains = self._map_all()
if not mapped_domains:
return
# Precompute molecule lengths indexed by entity_id.
molecule_length = {} # entity_id -> length
for molecule in self._molecules.molecules:
entity_id = molecule['entity_id']
molecule_length[entity_id] = molecule.get('length', None)
Object = namedtuple('Object', 'name entity_ids segment_ids')
Chain = namedtuple('Chain', 'entity_id chain_id segment_id')
# logging.debug(mapped_domains)
for domain_type, typed_domain in mapped_domains.items():
objects = [] # list of Object
chains = [] # list of Chain
for domain_id, named_domain in typed_domain.items():
# logging.debug(domain_id)
for domain_name, segments in named_domain.items():
# logging.debug(domain_name)
segment_ids = []
entity_ids = []
object_selections = []