Skip to content

Commit

Permalink
DAOTHER-9158: Extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id…
Browse files Browse the repository at this point in the history
… to prevent sequence mismatch due to 5-letter CCD ID
  • Loading branch information
yokochi47 committed Mar 7, 2024
1 parent a51813a commit c62dad7
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 0 deletions.
46 changes: 46 additions & 0 deletions wwpdb/utils/nmr/NmrDpUtility.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@
# 17-Jan-2024 M. Yokochi - detect coordinate issue (DAOTHER-9084, type_symbol mismatches label_atom_id)
# 24-Jan-2024 M. Yokochi - reconstruct polymer/non-polymer sequence based on pdb_mon_id, instead of auth_mon_id (D_1300043061)
# 21-Feb-2024 M. Yokochi - add support for discontinuous model_id (NMR restraint remediation, 2n6j)
# 07-Mar-2024 M. Yokochi - extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID (DAOTHER-9158 vs D_1300043061)
##
""" Wrapper class for NMR data processing.
@author: Masashi Yokochi
Expand Down Expand Up @@ -40834,6 +40835,11 @@ def __extractCoordPolymerSequence(self):
_key_items.append({'name': 'pdb_mon_id', 'type': 'str', 'alt_name': 'auth_comp_id', 'default-from': 'mon_id'})
key_items = _key_items

if self.__cR.hasItem(lp_category, 'auth_mon_id'):
_key_items = copy.copy(key_items)
_key_items.append({'name': 'auth_mon_id', 'type': 'str', 'alt_name': 'alt_comp_id', 'default-from': 'mon_id'})
key_items = _key_items

if not self.__cR.hasCategory(lp_category):
alias = True
lp_category = self.lp_categories[file_type][content_subtype + '_alias']
Expand Down Expand Up @@ -42414,6 +42420,26 @@ def __assignCoordPolymerSequence(self):

_matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)

if conflict > 0 and any(len(c) > 3 for c in s2['comp_id']) and 'alt_comp_id' in s2:
self.__pA.addTestSequence(s2['alt_comp_id'], chain_id)
self.__pA.doAlign()

myAlign = self.__pA.getAlignment(chain_id)

length = len(myAlign)

_matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)

if conflict > 0:
self.__pA.addTestSequence(s2['comp_id'], chain_id)
self.__pA.doAlign()

myAlign = self.__pA.getAlignment(chain_id)

length = len(myAlign)

_matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)

_s1 = s1 if offset_1 == 0 else fillBlankCompIdWithOffset(s1, offset_1)
_s2 = s2 if offset_2 == 0 else fillBlankCompIdWithOffset(s2, offset_2)

Expand Down Expand Up @@ -42868,6 +42894,26 @@ def __assignCoordPolymerSequence(self):

_matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)

if conflict > 0 and any(len(c) > 3 for c in s1['comp_id']) and 'alt_comp_id' in s1:
self.__pA.setReferenceSequence(s1['alt_comp_id'], 'REF' + chain_id)
self.__pA.doAlign()

myAlign = self.__pA.getAlignment(chain_id)

length = len(myAlign)

_matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)

if conflict > 0:
self.__pA.setReferenceSequence(s1['comp_id'], 'REF' + chain_id)
self.__pA.doAlign()

myAlign = self.__pA.getAlignment(chain_id)

length = len(myAlign)

_matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)

_s1 = s1 if offset_1 == 0 else fillBlankCompIdWithOffset(s1, offset_1)
_s2 = s2 if offset_2 == 0 else fillBlankCompIdWithOffset(s2, offset_2)

Expand Down
13 changes: 13 additions & 0 deletions wwpdb/utils/nmr/io/CifReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
# 18-Dec-2023 my - add calculate_uninstanced_coord() (DAOTHER-8945)
# 24-Jan-2024 my - add 'default-from' attribute for key/data items (D_1300043061)
# 21-Feb-2024 my - add support for discontinuous model_id (NMR restraint remediation, 2n6j)
# 07-Mar-2024 my - extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID (DAOTHER-9158 vs D_1300043061)
##
""" A collection of classes for parsing CIF files.
"""
Expand Down Expand Up @@ -715,6 +716,7 @@ def getPolymerSequence(self, catName, keyItems, withStructConf=False, withRmsd=F
auth_chain_id_col = -1 if 'auth_chain_id' not in altDict else altDict['auth_chain_id']
auth_seq_id_col = -1 if 'auth_seq_id' not in altDict else altDict['auth_seq_id']
auth_comp_id_col = -1 if 'auth_comp_id' not in altDict else altDict['auth_comp_id']
alt_comp_id_col = -1 if 'alt_comp_id' not in altDict else altDict['alt_comp_id']

chainIds = sorted(set(row[chain_id_col] for row in rowList), key=lambda x: (len(x), x))

Expand Down Expand Up @@ -849,6 +851,17 @@ def getPolymerSequence(self, catName, keyItems, withStructConf=False, withRmsd=F
else:
ent['auth_comp_id'].append('.')

if alt_comp_id_col != -1:
ent['alt_comp_id'] = []
for s in seqDict[c]:
row = next((row for row in rowList if row[chain_id_col] == c and int(row[seq_id_col]) == s), None)
if row is not None:
comp_id = row[alt_comp_id_col]
if comp_id not in self.emptyValue:
ent['alt_comp_id'].append(comp_id)
else:
ent['alt_comp_id'].append('.')

if withStructConf and i < LEN_MAJOR_ASYM_ID: # to process large assembly avoiding forced timeout
ent['struct_conf'] = self.__extractStructConf(c, seqDict[c])

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -850,6 +850,13 @@
"type": "string"
}
},
"alt_comp_id": {
"description": "List of original chemical component ID of a chain.",
"type": "array",
"items": {
"type": "string"
}
},
"struct_conf": {
"description": "List of secondary structure of a chain.",
"type": "array",
Expand Down

0 comments on commit c62dad7

Please sign in to comment.