DAOTHER-9317: Rescued 300 atoms' chemical shift statistics of non-sta…

…ndard residues by mathcing CCD
wwPDB · Apr 19, 2024 · 59108d6 · 59108d6
1 parent 3f59a79
commit 59108d6
Show file tree

Hide file tree

Showing 4 changed files with 147 additions and 42 deletions.
diff --git a/wwpdb/utils/nmr/BMRBChemShiftStat.py b/wwpdb/utils/nmr/BMRBChemShiftStat.py
@@ -33,12 +33,16 @@
 
 try:
     from wwpdb.utils.nmr.AlignUtil import (emptyValue,
-                                           protonBeginCode)
+                                           protonBeginCode,
+                                           pseProBeginCode)
     from wwpdb.utils.nmr.ChemCompUtil import ChemCompUtil
+    from wwpdb.utils.nmr.mr.ParserListenerUtil import translateToStdAtomName
 except ImportError:
     from nmr.AlignUtil import (emptyValue,
-                               protonBeginCode)
+                               protonBeginCode,
+                               pseProBeginCode)
     from nmr.ChemCompUtil import ChemCompUtil
+    from nmr.mr.ParserListenerUtil import translateToStdAtomName
 
 
 class BMRBChemShiftStat:
@@ -800,9 +804,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['comp_id'] = comp_id
                         _row['atom_id'] = _atom_id + str(i)
 
-                        if not self.__checkAtomNomenclature(_row['atom_id']):
+                        __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id'])
+                        if not __status:
                             continue
 
+                        if _row['comp_id'] != __comp_id:
+                            _row['comp_id'] = __comp_id
+                        if _row['atom_id'] != __atom_id:
+                            _row['atom_id'] = __atom_id
+
                         _row['count'] = int(row['count'])
                         _row['avg'] = float(row['avg'])
                         try:
@@ -815,7 +825,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['primary'] = False
                         _row['norm_freq'] = None
 
-                        atm_list.append(_row)
+                        if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list):
+                            atm_list.append(_row)
 
                 elif comp_id == 'HEM' and re.match(r'^HM[A-D]$', _atom_id) is not None:  # others.csv dependent code
 
@@ -824,9 +835,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['comp_id'] = comp_id
                         _row['atom_id'] = _atom_id + i
 
-                        if not self.__checkAtomNomenclature(_row['atom_id']):
+                        __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id'])
+                        if not __status:
                             continue
 
+                        if _row['comp_id'] != __comp_id:
+                            _row['comp_id'] = __comp_id
+                        if _row['atom_id'] != __atom_id:
+                            _row['atom_id'] = __atom_id
+
                         _row['count'] = int(row['count'])
                         _row['avg'] = float(row['avg'])
                         try:
@@ -839,7 +856,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['primary'] = False
                         _row['norm_freq'] = None
 
-                        atm_list.append(_row)
+                        if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list):
+                            atm_list.append(_row)
 
                 elif comp_id == 'HEB' and (re.match(r'^HM[A-D]1$', _atom_id) is not None or _atom_id == 'HBB1'):  # others.csv dependent code
 
@@ -848,9 +866,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['comp_id'] = comp_id
                         _row['atom_id'] = _atom_id[:-1] + str(i)
 
-                        if not self.__checkAtomNomenclature(_row['atom_id']):
+                        __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id'])
+                        if not __status:
                             continue
 
+                        if _row['comp_id'] != __comp_id:
+                            _row['comp_id'] = __comp_id
+                        if _row['atom_id'] != __atom_id:
+                            _row['atom_id'] = __atom_id
+
                         _row['count'] = int(row['count'])
                         _row['avg'] = float(row['avg'])
                         try:
@@ -863,7 +887,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['primary'] = False
                         _row['norm_freq'] = None
 
-                        atm_list.append(_row)
+                        if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list):
+                            atm_list.append(_row)
 
                 elif comp_id == 'HEC' and (re.match(r'^HM[A-D]$', _atom_id) is not None or re.match(r'^HB[BC]$', _atom_id) is not None):  # others.csv dependent code
 
@@ -872,9 +897,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['comp_id'] = comp_id
                         _row['atom_id'] = _atom_id + str(i)
 
-                        if not self.__checkAtomNomenclature(_row['atom_id']):
+                        __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id'])
+                        if not __status:
                             continue
 
+                        if _row['comp_id'] != __comp_id:
+                            _row['comp_id'] = __comp_id
+                        if _row['atom_id'] != __atom_id:
+                            _row['atom_id'] = __atom_id
+
                         _row['count'] = int(row['count'])
                         _row['avg'] = float(row['avg'])
                         try:
@@ -887,7 +918,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['primary'] = False
                         _row['norm_freq'] = None
 
-                        atm_list.append(_row)
+                        if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list):
+                            atm_list.append(_row)
 
                 # geminal proton group
                 elif _atom_id.startswith('Q'):
@@ -898,9 +930,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['comp_id'] = comp_id
                         _row['atom_id'] = _atom_id + str(i)
 
-                        if not self.__checkAtomNomenclature(_row['atom_id']):
+                        __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id'])
+                        if not __status:
                             continue
 
+                        if _row['comp_id'] != __comp_id:
+                            _row['comp_id'] = __comp_id
+                        if _row['atom_id'] != __atom_id:
+                            _row['atom_id'] = __atom_id
+
                         _row['count'] = int(row['count'])
                         _row['avg'] = float(row['avg'])
                         try:
@@ -913,7 +951,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                         _row['primary'] = False
                         _row['norm_freq'] = None
 
-                        atm_list.append(_row)
+                        if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list):
+                            atm_list.append(_row)
 
                 elif not ((comp_id == 'HEM' and re.match(r'^HM[A-D][AB]$', _atom_id) is not None)
                           or (comp_id == 'HEB' and (re.match(r'^HM[A-D][23]$', _atom_id) is not None
@@ -924,9 +963,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                     _row['comp_id'] = comp_id
                     _row['atom_id'] = _atom_id
 
-                    if not self.__checkAtomNomenclature(_row['atom_id']):
+                    __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id'])
+                    if not __status:
                         continue
 
+                    if _row['comp_id'] != __comp_id:
+                        _row['comp_id'] = __comp_id
+                    if _row['atom_id'] != __atom_id:
+                        _row['atom_id'] = __atom_id
+
                     _row['count'] = int(row['count'])
                     _row['avg'] = float(row['avg'])
                     try:
@@ -939,7 +984,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                     _row['primary'] = False
                     _row['norm_freq'] = None
 
-                    atm_list.append(_row)
+                    if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list):
+                        atm_list.append(_row)
 
         comp_ids = set(item['comp_id'] for item in atm_list)
 
@@ -971,7 +1017,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_
                                 _row['norm_freq'] = None
                                 _row['count'] = 0
 
-                                atm_list.append(_row)
+                                if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list):
+                                    atm_list.append(_row)
 
         self.__detectMethylProtonFromAtomNomenclature(comp_ids, atm_list)
         self.__detectGeminalProtonFromAtomNomenclature(comp_ids, atm_list)
@@ -1031,60 +1078,99 @@ def __appendExtraFromCcd(self, comp_id):
 
         self.extras.extend(atm_list)
 
-    def __checkAtomNomenclature(self, atom_id):
+    def checkAtomNomenclature(self, atom_id, comp_id=None):
         """ Check atom nomenclature.
         """
 
+        if comp_id is not None:
+            self.__ccU.updateChemCompDict(comp_id)
+
+        comp_id = self.__ccU.lastCompId
         peptide_like = self.__ccU.peptideLike()
 
-        if any(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList
-               if a[self.__ccU.ccaAtomId] == atom_id
-               and (a[self.__ccU.ccaLeavingAtomFlag] != 'Y'
-                    or (peptide_like
-                        and a[self.__ccU.ccaNTerminalAtomFlag] == 'N'
-                        and a[self.__ccU.ccaCTerminalAtomFlag] == 'N'))):
-            return True
+        cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status']
+
+        if cc_rel_status == 'OBS' and '_chem_comp.pdbx_replaced_by' in self.__ccU.lastChemCompDict:
+            replaced_by = self.__ccU.lastChemCompDict['_chem_comp.pdbx_replaced_by']
+            if replaced_by not in emptyValue and self.__ccU.updateChemCompDict(replaced_by):
+                if self.__verbose:
+                    self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Warning  - {comp_id} is replaced by {replaced_by}\n")
+
+                comp_id = replaced_by
+                ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList]
+
+                cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status']
+
+        if cc_rel_status == 'REL':
+            if any(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList
+                   if a[self.__ccU.ccaAtomId] == atom_id
+                   and (a[self.__ccU.ccaLeavingAtomFlag] != 'Y'
+                        or (peptide_like
+                            and a[self.__ccU.ccaNTerminalAtomFlag] == 'N'
+                            and a[self.__ccU.ccaCTerminalAtomFlag] == 'N'))):
+                return True, comp_id, atom_id
 
-        comp_id = self.__ccU.lastCompId
         ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList]
-        ref_alt_atom_ids = [a[self.__ccU.ccaAltAtomId] for a in self.__ccU.lastAtomList]
 
-        if len(ref_atom_ids) == 0:
+        if len(ref_atom_ids) == 0 or cc_rel_status != 'REL':
             if self.__verbose:
-                cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status']
-                self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Error  - {comp_id} is not valid CCD ID, status code: {cc_rel_status}\n")
-            return False
+                self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Error  - {comp_id} is not valid CCD ID, status code: {cc_rel_status}\n")
+            return False, None, None
+
+        ref_alt_atom_ids = [a[self.__ccU.ccaAltAtomId] for a in self.__ccU.lastAtomList]
 
         if atom_id in ref_atom_ids and atom_id in ref_alt_atom_ids:
             _ref_atom_id = next(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList
                                 if a[self.__ccU.ccaAltAtomId] == atom_id)
 
             if atom_id == _ref_atom_id:
-                return True
+                return True, comp_id, atom_id
 
             if self.__verbose:
-                self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Warning  - {comp_id}:{atom_id} is valid, "
+                self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Warning  - {comp_id}:{atom_id} is valid, "
                                  f"but _chem_comp.alt_atom_id matched with different atom_id {_ref_atom_id}\n")
 
-            return True
+            return True, comp_id, atom_id
 
         if atom_id in ref_atom_ids and atom_id not in ref_alt_atom_ids:
-            return True
+            return True, comp_id, atom_id
 
-        if atom_id not in ref_alt_atom_ids and atom_id in ref_alt_atom_ids:
+        if atom_id not in ref_atom_ids and atom_id in ref_alt_atom_ids:
             _ref_atom_id = next(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList
                                 if a[self.__ccU.ccaAltAtomId] == atom_id)
 
             if self.__verbose:
-                self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Error  - {comp_id}:{atom_id} matched with _chem_comp.alt_atom_id only. "
+                self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Warning  - {comp_id}:{atom_id} matched with _chem_comp.alt_atom_id only. "
                                  f"It should be {_ref_atom_id}\n")
 
-            return False
+            # print(f'case 1. {_comp_id}:{atom_id} -> {comp_id}:{_ref_atom_id}')
+            return True, comp_id, _ref_atom_id
+
+        _ref_atom_ids = [a for a in ref_atom_ids if a[0] == atom_id[0] or a[0] == 'H' and atom_id[0] in pseProBeginCode]
+
+        if len(_ref_atom_ids) == 0:
+            if self.__verbose:
+                self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Error  - {comp_id}:{atom_id} did not match with any atom in CCD. No candidates foud\n")
+
+            return False, None, None
+
+        if len(atom_id) > 1 and atom_id[0] in ('1', '2', '3') and atom_id[1] == 'H' and atom_id[1:] + atom_id[0] in _ref_atom_ids:
+            # print(f'case 2. {_comp_id}:{atom_id} -> {comp_id}:{atom_id[1:] + atom_id[0]}')
+            return True, comp_id, atom_id[1:] + atom_id[0]
+
+        if atom_id[-1] == "'" and self.getTypeOfCompId(comp_id)[2] and atom_id[:-1] in _ref_atom_ids:
+            # print(f'case 3. {_comp_id}:{atom_id} -> {comp_id}:{atom_id[:-1]}')
+            return True, comp_id, atom_id[:-1]
+
+        _atom_id = translateToStdAtomName(atom_id, comp_id, refAtomIdList=_ref_atom_ids, ccU=self.__ccU, unambig=False)
+        if _atom_id in _ref_atom_ids:
+            # print(f'case 4. {_comp_id}:{atom_id} -> {comp_id}:{_atom_id}')
+            return True, comp_id, _atom_id
 
         if self.__verbose:
-            self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Error  - {comp_id}:{atom_id} did not match with any atom in CCD\n")
+            self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Error  - {comp_id}:{atom_id} did not match with any atom in CCD, {_ref_atom_ids}\n")
 
-        return False
+        return False, None, None
 
     def __detectMethylProtonFromAtomNomenclature(self, comp_ids, atm_list):
         """ Detect methyl proton from atom nomenclature.
@@ -1685,9 +1771,19 @@ def check_bmrb_cs_stat(atm_list):
                 _list = [a for a in atm_list if a['comp_id'] == comp_id]
 
                 ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList]
+                cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status']
+
+                if cc_rel_status == 'OBS' and '_chem_comp.pdbx_replaced_by' in self.__ccU.lastChemCompDict:
+                    replaced_by = self.__ccU.lastChemCompDict['_chem_comp.pdbx_replaced_by']
+                    if replaced_by not in emptyValue and self.__ccU.updateChemCompDict(replaced_by):
+                        print(f'[Warning] {comp_id} is replaced by {replaced_by}.')
+
+                        comp_id = replaced_by
+
+                        ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList]
+                        cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status']
 
-                if len(ref_atom_ids) == 0:
-                    cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status']
+                if len(ref_atom_ids) == 0 or cc_rel_status != 'REL':
                     print(f'[Error] {comp_id} is not valid CCD ID, status code: {cc_rel_status}.')
                     ret['error'] += 1
                     continue

diff --git a/wwpdb/utils/nmr/bmrb_cs_stat/others.pkl b/wwpdb/utils/nmr/bmrb_cs_stat/others.pkl
diff --git a/wwpdb/utils/nmr/mr/ParserListenerUtil.py b/wwpdb/utils/nmr/mr/ParserListenerUtil.py
@@ -1826,12 +1826,16 @@ def translateToStdAtomName(atomId, refCompId=None, refAtomIdList=None, ccU=None,
                         return "H2'1"
                     if atomId == "H2''" and "H2'2" in refAtomIdList:  # DCZ, THM
                         return "H2'2"
+                    if atomId == "H2''" and "HO2'" in refAtomIdList:  # 5MC (DAOTHER-9313)
+                        return "HO2'"
                     if atomId == "H2''" and "H2'" in refAtomIdList:
                         return "H2'"
                     if atomId == "H2''''" and "H2''" in refAtomIdList:
                         return "H2''"
                     if atomId == "H2''''" and "H2'2" in refAtomIdList:  # DCZ, THM
                         return "H2'2"
+                    if atomId == "H2''''" and "HO2'" in refAtomIdList:  # 5MC (DAOTHER-9313)
+                        return "HO2'"
                     if atomId == "H2''1" and "H2'1" in refAtomIdList:  # 4EN
                         return "H2'1"
                     if atomId == "H2''2" and "H2'2" in refAtomIdList:  # 4EN
@@ -1844,6 +1848,8 @@ def translateToStdAtomName(atomId, refCompId=None, refAtomIdList=None, ccU=None,
                     return "H2'1"
                 if atomId == "H2''" and "H2'2" in _refAtomIdList:  # DCZ, THM
                     return "H2'2"
+                if atomId == "H2''" and "HO2'" in _refAtomIdList:  # 5MC (DAOTHER-9313)
+                    return "HO2'"
                 if atomId == "H2''" and "H2'" in _refAtomIdList:
                     return "H2'"
                 if atomId == "H2''''" and "H2''" in _refAtomIdList: