From c62dad7c3810eea53be32a155b96b1adf2a9d814 Mon Sep 17 00:00:00 2001
From: yokochi47 <yokochi47@gmail.com>
Date: Thu, 7 Mar 2024 23:27:18 +0900
Subject: [PATCH] DAOTHER-9158: Extract pdbx_poly_seq_scheme.auth_mon_id as
 alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID

---
 wwpdb/utils/nmr/NmrDpUtility.py               | 46 +++++++++++++++++++
 wwpdb/utils/nmr/io/CifReader.py               | 13 ++++++
 .../nmr-restraints-report-schema-v3.json      |  7 +++
 3 files changed, 66 insertions(+)

diff --git a/wwpdb/utils/nmr/NmrDpUtility.py b/wwpdb/utils/nmr/NmrDpUtility.py
index e5342f26a..bdfb35fa7 100644
--- a/wwpdb/utils/nmr/NmrDpUtility.py
+++ b/wwpdb/utils/nmr/NmrDpUtility.py
@@ -185,6 +185,7 @@
 # 17-Jan-2024  M. Yokochi - detect coordinate issue (DAOTHER-9084, type_symbol mismatches label_atom_id)
 # 24-Jan-2024  M. Yokochi - reconstruct polymer/non-polymer sequence based on pdb_mon_id, instead of auth_mon_id (D_1300043061)
 # 21-Feb-2024  M. Yokochi - add support for discontinuous model_id (NMR restraint remediation, 2n6j)
+# 07-Mar-2024  M. Yokochi - extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID (DAOTHER-9158 vs D_1300043061)
 ##
 """ Wrapper class for NMR data processing.
     @author: Masashi Yokochi
@@ -40834,6 +40835,11 @@ def __extractCoordPolymerSequence(self):
             _key_items.append({'name': 'pdb_mon_id', 'type': 'str', 'alt_name': 'auth_comp_id', 'default-from': 'mon_id'})
             key_items = _key_items
 
+        if self.__cR.hasItem(lp_category, 'auth_mon_id'):
+            _key_items = copy.copy(key_items)
+            _key_items.append({'name': 'auth_mon_id', 'type': 'str', 'alt_name': 'alt_comp_id', 'default-from': 'mon_id'})
+            key_items = _key_items
+
         if not self.__cR.hasCategory(lp_category):
             alias = True
             lp_category = self.lp_categories[file_type][content_subtype + '_alias']
@@ -42414,6 +42420,26 @@ def __assignCoordPolymerSequence(self):
 
                     _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)
 
+                    if conflict > 0 and any(len(c) > 3 for c in s2['comp_id']) and 'alt_comp_id' in s2:
+                        self.__pA.addTestSequence(s2['alt_comp_id'], chain_id)
+                        self.__pA.doAlign()
+
+                        myAlign = self.__pA.getAlignment(chain_id)
+
+                        length = len(myAlign)
+
+                        _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)
+
+                        if conflict > 0:
+                            self.__pA.addTestSequence(s2['comp_id'], chain_id)
+                            self.__pA.doAlign()
+
+                            myAlign = self.__pA.getAlignment(chain_id)
+
+                            length = len(myAlign)
+
+                            _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)
+
                     _s1 = s1 if offset_1 == 0 else fillBlankCompIdWithOffset(s1, offset_1)
                     _s2 = s2 if offset_2 == 0 else fillBlankCompIdWithOffset(s2, offset_2)
 
@@ -42868,6 +42894,26 @@ def __assignCoordPolymerSequence(self):
 
                     _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)
 
+                    if conflict > 0 and any(len(c) > 3 for c in s1['comp_id']) and 'alt_comp_id' in s1:
+                        self.__pA.setReferenceSequence(s1['alt_comp_id'], 'REF' + chain_id)
+                        self.__pA.doAlign()
+
+                        myAlign = self.__pA.getAlignment(chain_id)
+
+                        length = len(myAlign)
+
+                        _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)
+
+                        if conflict > 0:
+                            self.__pA.setReferenceSequence(s1['comp_id'], 'REF' + chain_id)
+                            self.__pA.doAlign()
+
+                            myAlign = self.__pA.getAlignment(chain_id)
+
+                            length = len(myAlign)
+
+                            _matched, unmapped, conflict, offset_1, offset_2 = getScoreOfSeqAlign(myAlign)
+
                     _s1 = s1 if offset_1 == 0 else fillBlankCompIdWithOffset(s1, offset_1)
                     _s2 = s2 if offset_2 == 0 else fillBlankCompIdWithOffset(s2, offset_2)
 
diff --git a/wwpdb/utils/nmr/io/CifReader.py b/wwpdb/utils/nmr/io/CifReader.py
index 7237ea9d7..d1f660dab 100644
--- a/wwpdb/utils/nmr/io/CifReader.py
+++ b/wwpdb/utils/nmr/io/CifReader.py
@@ -30,6 +30,7 @@
 # 18-Dec-2023   my  - add calculate_uninstanced_coord() (DAOTHER-8945)
 # 24-Jan-2024   my  - add 'default-from' attribute for key/data items (D_1300043061)
 # 21-Feb-2024   my  - add support for discontinuous model_id (NMR restraint remediation, 2n6j)
+# 07-Mar-2024   my  - extract pdbx_poly_seq_scheme.auth_mon_id as alt_comp_id to prevent sequence mismatch due to 5-letter CCD ID (DAOTHER-9158 vs D_1300043061)
 ##
 """ A collection of classes for parsing CIF files.
 """
@@ -715,6 +716,7 @@ def getPolymerSequence(self, catName, keyItems, withStructConf=False, withRmsd=F
             auth_chain_id_col = -1 if 'auth_chain_id' not in altDict else altDict['auth_chain_id']
             auth_seq_id_col = -1 if 'auth_seq_id' not in altDict else altDict['auth_seq_id']
             auth_comp_id_col = -1 if 'auth_comp_id' not in altDict else altDict['auth_comp_id']
+            alt_comp_id_col = -1 if 'alt_comp_id' not in altDict else altDict['alt_comp_id']
 
             chainIds = sorted(set(row[chain_id_col] for row in rowList), key=lambda x: (len(x), x))
 
@@ -849,6 +851,17 @@ def getPolymerSequence(self, catName, keyItems, withStructConf=False, withRmsd=F
                                 else:
                                     ent['auth_comp_id'].append('.')
 
+                    if alt_comp_id_col != -1:
+                        ent['alt_comp_id'] = []
+                        for s in seqDict[c]:
+                            row = next((row for row in rowList if row[chain_id_col] == c and int(row[seq_id_col]) == s), None)
+                            if row is not None:
+                                comp_id = row[alt_comp_id_col]
+                                if comp_id not in self.emptyValue:
+                                    ent['alt_comp_id'].append(comp_id)
+                                else:
+                                    ent['alt_comp_id'].append('.')
+
                     if withStructConf and i < LEN_MAJOR_ASYM_ID:  # to process large assembly avoiding forced timeout
                         ent['struct_conf'] = self.__extractStructConf(c, seqDict[c])
 
diff --git a/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json b/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json
index c765c2a44..f0ae65fa4 100644
--- a/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json
+++ b/wwpdb/utils/tests-nmr/json-schema/nmr-restraints-report-schema-v3.json
@@ -850,6 +850,13 @@
             "type": "string"
           }
         },
+        "alt_comp_id": {
+          "description": "List of original chemical component ID of a chain.",
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
         "struct_conf": {
           "description": "List of secondary structure of a chain.",
           "type": "array",