From c62dad7c3810eea53be32a155b96b1adf2a9d814 Mon Sep 17 00:00:00 2001 From: yokochi47 Date: Thu, 7 Mar 2024 23:27:18 +0900 Subject: [PATCH] DAOTHER-9158: Extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID --- wwpdb/utils/nmr/NmrDpUtility.py | 46 +++++++++++++++++++ wwpdb/utils/nmr/io/CifReader.py | 13 ++++++ .../nmr-restraints-report-schema-v3.json | 7 +++ 3 files changed, 66 insertions(+) diff --git a/wwpdb/utils/nmr/NmrDpUtility.py b/wwpdb/utils/nmr/NmrDpUtility.py index e5342f26a..bdfb35fa7 100644 --- a/wwpdb/utils/nmr/NmrDpUtility.py +++ b/wwpdb/utils/nmr/NmrDpUtility.py @@ -185,6 +185,7 @@ # 17-Jan-2024 M. Yokochi - detect coordinate issue (DAOTHER-9084, type_symbol mismatches label_atom_id) # 24-Jan-2024 M. Yokochi - reconstruct polymer/non-polymer sequence based on pdb_mon_id, instead of auth_mon_id (D_1300043061) # 21-Feb-2024 M. Yokochi - add support for discontinuous model_id (NMR restraint remediation, 2n6j) +# 07-Mar-2024 M. Yokochi - extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID (DAOTHER-9158 vs D_1300043061) ## """ Wrapper class for NMR data processing. 
@author: Masashi Yokochi @@ -40834,6 +40835,11 @@ def __extractCoordPolymerSequence(self): _key_items.append({'name': 'pdb_mon_id', 'type': 'str', 'alt_name': 'auth_comp_id', 'default-from': 'mon_id'}) key_items = _key_items + if self.__cR.hasItem(lp_category, 'auth_mon_id'): + _key_items = copy.copy(key_items) + _key_items.append({'name': 'auth_mon_id', 'type': 'str', 'alt_name': 'alt_comp_id', 'default-from': 'mon_id'}) + key_items = _key_items + if not self.__cR.hasCategory(lp_category): alias = True lp_category = self.lp_categories[file_type][content_subtype + '_alias'] @@ -42414,6 +42420,26 @@ def __assignCoordPolymerSequence(self): _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign) + if conflict > 0 and any(len(c) > 3 for c in s2['comp_id']) and 'alt_comp_id' in s2: + self.__pA.addTestSequence(s2['alt_comp_id'], chain_id) + self.__pA.doAlign() + + myAlign = self.__pA.getAlignment(chain_id) + + length = len(myAlign) + + _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign) + + if conflict > 0: + self.__pA.addTestSequence(s2['comp_id'], chain_id) + self.__pA.doAlign() + + myAlign = self.__pA.getAlignment(chain_id) + + length = len(myAlign) + + _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign) + _s1 = s1 if offset_1 == 0 else fillBlankCompIdWithOffset(s1, offset_1) _s2 = s2 if offset_2 == 0 else fillBlankCompIdWithOffset(s2, offset_2) @@ -42868,6 +42894,26 @@ def __assignCoordPolymerSequence(self): _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign) + if conflict > 0 and any(len(c) > 3 for c in s1['comp_id']) and 'alt_comp_id' in s1: + self.__pA.setReferenceSequence(s1['alt_comp_id'], 'REF' + chain_id) + self.__pA.doAlign() + + myAlign = self.__pA.getAlignment(chain_id) + + length = len(myAlign) + + _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign) + + if conflict > 0: + self.__pA.setReferenceSequence(s1['comp_id'], 'REF' + 
chain_id) + self.__pA.doAlign() + + myAlign = self.__pA.getAlignment(chain_id) + + length = len(myAlign) + + _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign) + _s1 = s1 if offset_1 == 0 else fillBlankCompIdWithOffset(s1, offset_1) _s2 = s2 if offset_2 == 0 else fillBlankCompIdWithOffset(s2, offset_2) diff --git a/wwpdb/utils/nmr/io/CifReader.py b/wwpdb/utils/nmr/io/CifReader.py index 7237ea9d7..d1f660dab 100644 --- a/wwpdb/utils/nmr/io/CifReader.py +++ b/wwpdb/utils/nmr/io/CifReader.py @@ -30,6 +30,7 @@ # 18-Dec-2023 my - add calculate_uninstanced_coord() (DAOTHER-8945) # 24-Jan-2024 my - add 'default-from' attribute for key/data items (D_1300043061) # 21-Feb-2024 my - add support for discontinuous model_id (NMR restraint remediation, 2n6j) +# 07-Mar-2024 my - extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID (DAOTHER-9158 vs D_1300043061) ## """ A collection of classes for parsing CIF files. """ @@ -715,6 +716,7 @@ def getPolymerSequence(self, catName, keyItems, withStructConf=False, withRmsd=F auth_chain_id_col = -1 if 'auth_chain_id' not in altDict else altDict['auth_chain_id'] auth_seq_id_col = -1 if 'auth_seq_id' not in altDict else altDict['auth_seq_id'] auth_comp_id_col = -1 if 'auth_comp_id' not in altDict else altDict['auth_comp_id'] + alt_comp_id_col = -1 if 'alt_comp_id' not in altDict else altDict['alt_comp_id'] chainIds = sorted(set(row[chain_id_col] for row in rowList), key=lambda x: (len(x), x)) @@ -849,6 +851,17 @@ def getPolymerSequence(self, catName, keyItems, withStructConf=False, withRmsd=F else: ent['auth_comp_id'].append('.') + if alt_comp_id_col != -1: + ent['alt_comp_id'] = [] + for s in seqDict[c]: + row = next((row for row in rowList if row[chain_id_col] == c and int(row[seq_id_col]) == s), None) + if row is not None: + comp_id = row[alt_comp_id_col] + if comp_id not in self.emptyValue: + ent['alt_comp_id'].append(comp_id) + else: + 
ent['alt_comp_id'].append('.') + if withStructConf and i < LEN_MAJOR_ASYM_ID: # to process large assembly avoiding forced timeout ent['struct_conf'] = self.__extractStructConf(c, seqDict[c]) diff --git a/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json b/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json index c765c2a44..f0ae65fa4 100644 --- a/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json +++ b/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json @@ -850,6 +850,13 @@ "type": "string" } }, + "alt_comp_id": { + "description": "List of original chemical component ID of a chain.", + "type": "array", + "items": { + "type": "string" + } + }, "struct_conf": { "description": "List of secondary structure of a chain.", "type": "array",