From 88aa0292827048d0ed40b38ad3f11594410159fa Mon Sep 17 00:00:00 2001 From: jyaacoub Date: Mon, 13 Nov 2023 12:20:48 -0500 Subject: [PATCH] fix(datasets): update sequence to include 3Di --- src/data_processing/datasets.py | 17 ++++++++++------- src/feature_extraction/protein.py | 7 +++++-- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/data_processing/datasets.py b/src/data_processing/datasets.py index fd2b20d3..ab4f3026 100644 --- a/src/data_processing/datasets.py +++ b/src/data_processing/datasets.py @@ -336,12 +336,15 @@ def process(self): # extra_feat is Lx54 or Lx34 (if shannon=True) try: pro_cmap = np.load(self.cmap_p(code)) - extra_feat, edge_idx = target_to_graph(target_sequence=pro_seq, contact_map=pro_cmap, - threshold=self.cmap_threshold, pro_feat=self.feature_opt, - aln_file=self.aln_p(code), - # for foldseek feats - pdb_fp=self.pdb_p(code), - pddlt_fp=self.pddlt_p(code)) + # updated_seq is for updated foldseek 3di combined seq + updated_seq, extra_feat, edge_idx = target_to_graph(target_sequence=pro_seq, + contact_map=pro_cmap, + threshold=self.cmap_threshold, + pro_feat=self.feature_opt, + aln_file=self.aln_p(code), + # for foldseek feats + pdb_fp=self.pdb_p(code), + pddlt_fp=self.pddlt_p(code)) except Exception as e: raise Exception(f"error on protein graph creation for code {code}") from e @@ -370,7 +373,7 @@ def process(self): pro = torchg.data.Data(x=torch.Tensor(pro_feat), edge_index=torch.LongTensor(edge_idx), - pro_seq=pro_seq, # protein sequence for downstream esm model + pro_seq=updated_seq, # Protein sequence for downstream esm model prot_id=prot_id, edge_weight=pro_edge_weight) processed_prots[prot_id] = pro diff --git a/src/feature_extraction/protein.py b/src/feature_extraction/protein.py index f2056da8..275ba373 100644 --- a/src/feature_extraction/protein.py +++ b/src/feature_extraction/protein.py @@ -44,7 +44,7 @@ def target_to_graph(target_sequence:str, contact_map:str or np.array, Returns ------- Tuple[np.array] - tuple of (target_feature, target_edge_index) + tuple of (target_sequence, target_feature, target_edge_index) """ assert pro_feat in cfg.PRO_FEAT_OPT, \ f'Invalid protein feature option: {pro_feat}, must be one of {cfg.PRO_FEAT_OPT}' @@ -97,10 +97,13 @@ def entropy(col): # input sequences should now include 3di tokens pro_hot_3di = get_foldseek_onehot(combined_seq) target_feature = np.concatenate((pro_hot, pro_hot_3di), axis=1) + + # updating target sequence to include 3di tokens + target_sequence = combined_seq else: raise NotImplementedError(f'Invalid protein feature option: {pro_feat}') - return target_feature, edge_index + return target_sequence, target_feature, edge_index ######################################################################