Skip to content

Commit

Permalink
fix(datasets): update sequence to include 3Di
Browse files (browse the repository at this point in the history)
  • Loading branch information
jyaacoub committed Nov 13, 2023
1 parent 2e2f924 commit 88aa029
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 9 deletions.
17 changes: 10 additions & 7 deletions src/data_processing/datasets.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -336,12 +336,15 @@ def process(self):
# extra_feat is Lx54 or Lx34 (if shannon=True)
try:
pro_cmap = np.load(self.cmap_p(code))
extra_feat, edge_idx = target_to_graph(target_sequence=pro_seq, contact_map=pro_cmap,
threshold=self.cmap_threshold, pro_feat=self.feature_opt,
aln_file=self.aln_p(code),
# for foldseek feats
pdb_fp=self.pdb_p(code),
pddlt_fp=self.pddlt_p(code))
# updated_seq is the sequence returned by target_to_graph; for foldseek features it is the combined sequence that includes 3Di tokens
updated_seq, extra_feat, edge_idx = target_to_graph(target_sequence=pro_seq,
contact_map=pro_cmap,
threshold=self.cmap_threshold,
pro_feat=self.feature_opt,
aln_file=self.aln_p(code),
# for foldseek feats
pdb_fp=self.pdb_p(code),
pddlt_fp=self.pddlt_p(code))
except Exception as e:
raise Exception(f"error on protein graph creation for code {code}") from e

Expand Down Expand Up @@ -370,7 +373,7 @@ def process(self):

pro = torchg.data.Data(x=torch.Tensor(pro_feat),
edge_index=torch.LongTensor(edge_idx),
pro_seq=pro_seq, # protein sequence for downstream esm model
pro_seq=updated_seq, # Protein sequence for downstream esm model
prot_id=prot_id,
edge_weight=pro_edge_weight)
processed_prots[prot_id] = pro
Expand Down
7 changes: 5 additions & 2 deletions src/feature_extraction/protein.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -44,7 +44,7 @@ def target_to_graph(target_sequence:str, contact_map:str or np.array,
Returns
-------
Tuple[np.array]
tuple of (target_feature, target_edge_index)
tuple of (target_sequence, target_feature, target_edge_index)
"""
assert pro_feat in cfg.PRO_FEAT_OPT, \
f'Invalid protein feature option: {pro_feat}, must be one of {cfg.PRO_FEAT_OPT}'
Expand Down Expand Up @@ -97,10 +97,13 @@ def entropy(col):
# input sequences should now include 3di tokens
pro_hot_3di = get_foldseek_onehot(combined_seq)
target_feature = np.concatenate((pro_hot, pro_hot_3di), axis=1)

# updating target sequence to include 3di tokens
target_sequence = combined_seq
else:
raise NotImplementedError(f'Invalid protein feature option: {pro_feat}')

return target_feature, edge_index
return target_sequence, target_feature, edge_index


######################################################################
Expand Down

0 comments on commit 88aa029

Please sign in to comment.