diff --git a/wwpdb/utils/nmr/BMRBChemShiftStat.py b/wwpdb/utils/nmr/BMRBChemShiftStat.py index 737226307..f885b04b4 100644 --- a/wwpdb/utils/nmr/BMRBChemShiftStat.py +++ b/wwpdb/utils/nmr/BMRBChemShiftStat.py @@ -33,12 +33,16 @@ try: from wwpdb.utils.nmr.AlignUtil import (emptyValue, - protonBeginCode) + protonBeginCode, + pseProBeginCode) from wwpdb.utils.nmr.ChemCompUtil import ChemCompUtil + from wwpdb.utils.nmr.mr.ParserListenerUtil import translateToStdAtomName except ImportError: from nmr.AlignUtil import (emptyValue, - protonBeginCode) + protonBeginCode, + pseProBeginCode) from nmr.ChemCompUtil import ChemCompUtil + from nmr.mr.ParserListenerUtil import translateToStdAtomName class BMRBChemShiftStat: @@ -800,9 +804,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['comp_id'] = comp_id _row['atom_id'] = _atom_id + str(i) - if not self.__checkAtomNomenclature(_row['atom_id']): + __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id']) + if not __status: continue + if _row['comp_id'] != __comp_id: + _row['comp_id'] = __comp_id + if _row['atom_id'] != __atom_id: + _row['atom_id'] = __atom_id + _row['count'] = int(row['count']) _row['avg'] = float(row['avg']) try: @@ -815,7 +825,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['primary'] = False _row['norm_freq'] = None - atm_list.append(_row) + if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list): + atm_list.append(_row) elif comp_id == 'HEM' and re.match(r'^HM[A-D]$', _atom_id) is not None: # others.csv dependent code @@ -824,9 +835,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['comp_id'] = comp_id _row['atom_id'] = _atom_id + i - if not self.__checkAtomNomenclature(_row['atom_id']): + __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id']) + if not __status: continue + if _row['comp_id'] != __comp_id: + _row['comp_id'] = __comp_id + if _row['atom_id'] != __atom_id: + _row['atom_id'] = __atom_id + _row['count'] = int(row['count']) _row['avg'] = float(row['avg']) try: @@ -839,7 +856,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['primary'] = False _row['norm_freq'] = None - atm_list.append(_row) + if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list): + atm_list.append(_row) elif comp_id == 'HEB' and (re.match(r'^HM[A-D]1$', _atom_id) is not None or _atom_id == 'HBB1'): # others.csv dependent code @@ -848,9 +866,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['comp_id'] = comp_id _row['atom_id'] = _atom_id[:-1] + str(i) - if not self.__checkAtomNomenclature(_row['atom_id']): + __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id']) + if not __status: continue + if _row['comp_id'] != __comp_id: + _row['comp_id'] = __comp_id + if _row['atom_id'] != __atom_id: + _row['atom_id'] = __atom_id + _row['count'] = int(row['count']) _row['avg'] = float(row['avg']) try: @@ -863,7 +887,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['primary'] = False _row['norm_freq'] = None - atm_list.append(_row) + if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list): + atm_list.append(_row) elif comp_id == 'HEC' and (re.match(r'^HM[A-D]$', _atom_id) is not None or re.match(r'^HB[BC]$', _atom_id) is not None): # others.csv dependent code @@ -872,9 +897,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['comp_id'] = comp_id _row['atom_id'] = _atom_id + str(i) - if not self.__checkAtomNomenclature(_row['atom_id']): + __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id']) + if not __status: continue + if _row['comp_id'] != __comp_id: + _row['comp_id'] = __comp_id + if _row['atom_id'] != __atom_id: + _row['atom_id'] = __atom_id + _row['count'] = int(row['count']) _row['avg'] = float(row['avg']) try: @@ -887,7 +918,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['primary'] = False _row['norm_freq'] = None - atm_list.append(_row) + if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list): + atm_list.append(_row) # geminal proton group elif _atom_id.startswith('Q'): @@ -898,9 +930,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['comp_id'] = comp_id _row['atom_id'] = _atom_id + str(i) - if not self.__checkAtomNomenclature(_row['atom_id']): + __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id']) + if not __status: continue + if _row['comp_id'] != __comp_id: + _row['comp_id'] = __comp_id + if _row['atom_id'] != __atom_id: + _row['atom_id'] = __atom_id + _row['count'] = int(row['count']) _row['avg'] = float(row['avg']) try: @@ -913,7 +951,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['primary'] = False _row['norm_freq'] = None - atm_list.append(_row) + if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list): + atm_list.append(_row) elif not ((comp_id == 'HEM' and re.match(r'^HM[A-D][AB]$', _atom_id) is not None) or (comp_id == 'HEB' and (re.match(r'^HM[A-D][23]$', _atom_id) is not None @@ -924,9 +963,15 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['comp_id'] = comp_id _row['atom_id'] = _atom_id - if not self.__checkAtomNomenclature(_row['atom_id']): + __status, __comp_id, __atom_id = self.checkAtomNomenclature(_row['atom_id']) + if not __status: continue + if _row['comp_id'] != __comp_id: + _row['comp_id'] = __comp_id + if _row['atom_id'] != __atom_id: + _row['atom_id'] = __atom_id + _row['count'] = int(row['count']) _row['avg'] = float(row['avg']) try: @@ -939,7 +984,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['primary'] = False _row['norm_freq'] = None - atm_list.append(_row) + if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list): + atm_list.append(_row) comp_ids = set(item['comp_id'] for item in atm_list) @@ -971,7 +1017,8 @@ def loadStatFromCsvFile(self, file_name, primary_th, secondary_th=None, comp_id_ _row['norm_freq'] = None _row['count'] = 0 - atm_list.append(_row) + if not any(a['comp_id'] == _row['comp_id'] and a['atom_id'] == _row['atom_id'] for a in atm_list): + atm_list.append(_row) self.__detectMethylProtonFromAtomNomenclature(comp_ids, atm_list) self.__detectGeminalProtonFromAtomNomenclature(comp_ids, atm_list) @@ -1031,60 +1078,99 @@ def __appendExtraFromCcd(self, comp_id): self.extras.extend(atm_list) - def __checkAtomNomenclature(self, atom_id): + def checkAtomNomenclature(self, atom_id, comp_id=None): """ Check atom nomenclature. """ + if comp_id is not None: + self.__ccU.updateChemCompDict(comp_id) + + comp_id = self.__ccU.lastCompId peptide_like = self.__ccU.peptideLike() - if any(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList - if a[self.__ccU.ccaAtomId] == atom_id - and (a[self.__ccU.ccaLeavingAtomFlag] != 'Y' - or (peptide_like - and a[self.__ccU.ccaNTerminalAtomFlag] == 'N' - and a[self.__ccU.ccaCTerminalAtomFlag] == 'N'))): - return True + cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status'] + + if cc_rel_status == 'OBS' and '_chem_comp.pdbx_replaced_by' in self.__ccU.lastChemCompDict: + replaced_by = self.__ccU.lastChemCompDict['_chem_comp.pdbx_replaced_by'] + if replaced_by not in emptyValue and self.__ccU.updateChemCompDict(replaced_by): + if self.__verbose: + self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Warning - {comp_id} is replaced by {replaced_by}\n") + + comp_id = replaced_by + ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList] + + cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status'] + + if cc_rel_status == 'REL': + if any(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList + if a[self.__ccU.ccaAtomId] == atom_id + and (a[self.__ccU.ccaLeavingAtomFlag] != 'Y' + or (peptide_like + and a[self.__ccU.ccaNTerminalAtomFlag] == 'N' + and a[self.__ccU.ccaCTerminalAtomFlag] == 'N'))): + return True, comp_id, atom_id - comp_id = self.__ccU.lastCompId ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList] - ref_alt_atom_ids = [a[self.__ccU.ccaAltAtomId] for a in self.__ccU.lastAtomList] - if len(ref_atom_ids) == 0: + if len(ref_atom_ids) == 0 or cc_rel_status != 'REL': if self.__verbose: - cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status'] - self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Error - {comp_id} is not valid CCD ID, status code: {cc_rel_status}\n") - return False + self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Error - {comp_id} is not valid CCD ID, status code: {cc_rel_status}\n") + return False, None, None + + ref_alt_atom_ids = [a[self.__ccU.ccaAltAtomId] for a in self.__ccU.lastAtomList] if atom_id in ref_atom_ids and atom_id in ref_alt_atom_ids: _ref_atom_id = next(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList if a[self.__ccU.ccaAltAtomId] == atom_id) if atom_id == _ref_atom_id: - return True + return True, comp_id, atom_id if self.__verbose: - self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Warning - {comp_id}:{atom_id} is valid, " + self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Warning - {comp_id}:{atom_id} is valid, " f"but _chem_comp.alt_atom_id matched with different atom_id {_ref_atom_id}\n") - return True + return True, comp_id, atom_id if atom_id in ref_atom_ids and atom_id not in ref_alt_atom_ids: - return True + return True, comp_id, atom_id - if atom_id not in ref_alt_atom_ids and atom_id in ref_alt_atom_ids: + if atom_id not in ref_atom_ids and atom_id in ref_alt_atom_ids: _ref_atom_id = next(a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList if a[self.__ccU.ccaAltAtomId] == atom_id) if self.__verbose: - self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Error - {comp_id}:{atom_id} matched with _chem_comp.alt_atom_id only. " + self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Warning - {comp_id}:{atom_id} matched with _chem_comp.alt_atom_id only. " f"It should be {_ref_atom_id}\n") - return False + # print(f'case 1. {_comp_id}:{atom_id} -> {comp_id}:{_ref_atom_id}') + return True, comp_id, _ref_atom_id + + _ref_atom_ids = [a for a in ref_atom_ids if a[0] == atom_id[0] or a[0] == 'H' and atom_id[0] in pseProBeginCode] + + if len(_ref_atom_ids) == 0: + if self.__verbose: + self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Error - {comp_id}:{atom_id} did not match with any atom in CCD. No candidates foud\n") + + return False, None, None + + if len(atom_id) > 1 and atom_id[0] in ('1', '2', '3') and atom_id[1] == 'H' and atom_id[1:] + atom_id[0] in _ref_atom_ids: + # print(f'case 2. {_comp_id}:{atom_id} -> {comp_id}:{atom_id[1:] + atom_id[0]}') + return True, comp_id, atom_id[1:] + atom_id[0] + + if atom_id[-1] == "'" and self.getTypeOfCompId(comp_id)[2] and atom_id[:-1] in _ref_atom_ids: + # print(f'case 3. {_comp_id}:{atom_id} -> {comp_id}:{atom_id[:-1]}') + return True, comp_id, atom_id[:-1] + + _atom_id = translateToStdAtomName(atom_id, comp_id, refAtomIdList=_ref_atom_ids, ccU=self.__ccU, unambig=False) + if _atom_id in _ref_atom_ids: + # print(f'case 4. {_comp_id}:{atom_id} -> {comp_id}:{_atom_id}') + return True, comp_id, _atom_id if self.__verbose: - self.__lfh.write(f"+BMRBChemShiftStat.__checkAtomNomenclature() ++ Error - {comp_id}:{atom_id} did not match with any atom in CCD\n") + self.__lfh.write(f"+BMRBChemShiftStat.checkAtomNomenclature() ++ Error - {comp_id}:{atom_id} did not match with any atom in CCD, {_ref_atom_ids}\n") - return False + return False, None, None def __detectMethylProtonFromAtomNomenclature(self, comp_ids, atm_list): """ Detect methyl proton from atom nomenclature. @@ -1685,9 +1771,19 @@ def check_bmrb_cs_stat(atm_list): _list = [a for a in atm_list if a['comp_id'] == comp_id] ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList] + cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status'] + + if cc_rel_status == 'OBS' and '_chem_comp.pdbx_replaced_by' in self.__ccU.lastChemCompDict: + replaced_by = self.__ccU.lastChemCompDict['_chem_comp.pdbx_replaced_by'] + if replaced_by not in emptyValue and self.__ccU.updateChemCompDict(replaced_by): + print(f'[Warning] {comp_id} is replaced by {replaced_by}.') + + comp_id = replaced_by + + ref_atom_ids = [a[self.__ccU.ccaAtomId] for a in self.__ccU.lastAtomList] + cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status'] - if len(ref_atom_ids) == 0: - cc_rel_status = self.__ccU.lastChemCompDict['_chem_comp.pdbx_release_status'] + if len(ref_atom_ids) == 0 or cc_rel_status != 'REL': print(f'[Error] {comp_id} is not valid CCD ID, status code: {cc_rel_status}.') ret['error'] += 1 continue diff --git a/wwpdb/utils/nmr/bmrb_cs_stat/others.pkl b/wwpdb/utils/nmr/bmrb_cs_stat/others.pkl index 49d3dadab..b3141de79 100644 Binary files a/wwpdb/utils/nmr/bmrb_cs_stat/others.pkl and b/wwpdb/utils/nmr/bmrb_cs_stat/others.pkl differ diff --git a/wwpdb/utils/nmr/mr/ParserListenerUtil.py b/wwpdb/utils/nmr/mr/ParserListenerUtil.py index bd379f29d..88d583f31 100644 --- a/wwpdb/utils/nmr/mr/ParserListenerUtil.py +++ b/wwpdb/utils/nmr/mr/ParserListenerUtil.py @@ -1826,12 +1826,16 @@ def translateToStdAtomName(atomId, refCompId=None, refAtomIdList=None, ccU=None, return "H2'1" if atomId == "H2''" and "H2'2" in refAtomIdList: # DCZ, THM return "H2'2" + if atomId == "H2''" and "HO2'" in refAtomIdList: # 5MC (DAOTHER-9313) + return "HO2'" if atomId == "H2''" and "H2'" in refAtomIdList: return "H2'" if atomId == "H2''''" and "H2''" in refAtomIdList: return "H2''" if atomId == "H2''''" and "H2'2" in refAtomIdList: # DCZ, THM return "H2'2" + if atomId == "H2''''" and "HO2'" in refAtomIdList: # 5MC (DAOTHER-9313) + return "HO2'" if atomId == "H2''1" and "H2'1" in refAtomIdList: # 4EN return "H2'1" if atomId == "H2''2" and "H2'2" in refAtomIdList: # 4EN @@ -1844,6 +1848,8 @@ def translateToStdAtomName(atomId, refCompId=None, refAtomIdList=None, ccU=None, return "H2'1" if atomId == "H2''" and "H2'2" in _refAtomIdList: # DCZ, THM return "H2'2" + if atomId == "H2''" and "HO2'" in _refAtomIdList: # 5MC (DAOTHER-9313) + return "HO2'" if atomId == "H2''" and "H2'" in _refAtomIdList: return "H2'" if atomId == "H2''''" and "H2''" in _refAtomIdList: diff --git a/wwpdb/utils/tests-nmr/test_BMRBChemShiftStat.py b/wwpdb/utils/tests-nmr/test_BMRBChemShiftStat.py index be808f75f..6a3df8ac4 100644 --- a/wwpdb/utils/tests-nmr/test_BMRBChemShiftStat.py +++ b/wwpdb/utils/tests-nmr/test_BMRBChemShiftStat.py @@ -57,9 +57,9 @@ def test_bb_atoms(self): self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('DA', excl_minor_atom=True), ["H1'", "H2'", "H2''", "H3'", "H4'", "H5'", "H5''"]) self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('A'), ["C1'", "C2'", "C3'", "C4'", "C5'", "H1'", "H2'", "H3'", "H4'", "H5'", "H5''", "HO2'", 'P']) self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('A', excl_minor_atom=True), ["C1'", "C2'", "C3'", "C4'", "C5'", "H1'", "H2'", "H3'", "H4'", "H5'", "H5''"]) - self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('5MC'), ["H1'", "H2'", "H3'", "H4'", "H5'", "H5''", 'P', "C5'", "C4'", "C3'", "C2'", "C1'", "HO2'"]) + self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('5MC'), ["H1'", "H2'", "HO2'", "H3'", "H4'", "H5'", "H5''", 'P', "C5'", "C4'", "C3'", "C2'", "C1'"]) self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('5MC', polypeptide_like=True), []) - self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('5MC', polynucleotide_like=True), ["H1'", "H2'", "H3'", "H4'", "H5'", "H5''", 'P', "C5'", "C4'", "C3'", "C2'", "C1'", "HO2'"]) + self.assertEqual(self.bmrb_cs_stat.getBackBoneAtoms('5MC', polynucleotide_like=True), ["H1'", "H2'", "HO2'", "H3'", "H4'", "H5'", "H5''", 'P', "C5'", "C4'", "C3'", "C2'", "C1'"]) def test_arom_atoms(self): self.assertEqual(self.bmrb_cs_stat.getAromaticAtoms('ALA'), []) @@ -167,6 +167,9 @@ def test_peptide_line(self): def test_atom_nomenclature(self): self.assertEqual(self.bmrb_cs_stat.testAtomNomenclatureOfLibrary(), True) + def test_doh_hn1(self): + self.assertEqual(self.bmrb_cs_stat.checkAtomNomenclature('HN1', 'DOH'), (True, 'BHD', 'H')) + if __name__ == '__main__': unittest.main()