Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hits sorting testbed #1018

Merged
merged 12 commits into from
Feb 3, 2025
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion metatlas/plots/dill2plots.py
Original file line number Diff line number Diff line change
@@ -306,7 +306,7 @@ def __init__(self,
"""
logger.debug("Initializing new instance of %s.", self.__class__.__name__)
self.data = data
self.msms_hits = msms_hits.sort_values('score', ascending=False)
self.msms_hits = sp.sort_msms_hits(msms_hits)
self.color_me = or_default(color_me, [('black', '')])
self.compound_idx = compound_idx
self.width = width
@@ -2278,6 +2278,8 @@ def create_nonmatched_msms_hits(msms_data: pd.DataFrame, inchi_key: str, compoun
compound_msms_hits['score'] = compound_msms_hits['measured_precursor_intensity']
compound_msms_hits['num_matches'] = 0
compound_msms_hits['spectrum'] = [np.array([np.array([]), np.array([])])] * len(compound_msms_hits)
compound_msms_hits['ref_frags'] = 0
compound_msms_hits['data_frags'] = 0

return compound_msms_hits

@@ -2348,6 +2350,9 @@ def get_hits_per_compound(cos: Type[CosineHungarian], compound_idx: int,
inchi_msms_hits['score'] = scores_matches['score'].flatten()
inchi_msms_hits['num_matches'] = scores_matches['matches'].flatten()

inchi_msms_hits['ref_frags'] = inchi_msms_hits['spectrum'].apply(lambda x: len(x[0]) if x.any() else 0)
inchi_msms_hits['data_frags'] = inchi_msms_hits['query_spectrum'].apply(lambda x: len(x[0]) if x.any() else 0)

inchi_msms_hits['precursor_ppm_error'] = (abs(inchi_msms_hits['measured_precursor_mz'] - inchi_msms_hits['precursor_mz']) / inchi_msms_hits['precursor_mz']) * 1000000
inchi_msms_hits = inchi_msms_hits[inchi_msms_hits['precursor_ppm_error']<=inchi_msms_hits['cid_pmz_tolerance']]

@@ -2385,6 +2390,7 @@ def get_msms_hits(metatlas_dataset: MetatlasDataset, extra_time: bool | float =
"""

msms_hits_cols = ['database', 'id', 'file_name', 'msms_scan', 'score', 'num_matches',
'ref_frags', 'data_frags',
'msv_query_aligned', 'msv_ref_aligned', 'name', 'adduct', 'inchi_key',
'precursor_mz', 'measured_precursor_mz',
'measured_precursor_intensity']
11 changes: 10 additions & 1 deletion metatlas/tools/fastanalysis.py
Original file line number Diff line number Diff line change
@@ -110,6 +110,7 @@ def make_stats_table(workflow_name: str = "JGI-HILIC", input_fname: Optional[Pat
msms_hits_df = msms_hits.copy()
msms_hits_df.reset_index(inplace=True)

#msms_hits_sorted_list = []
for compound_idx, compound_name in enumerate(compound_names):
ref_rt_peak = dataset[0][compound_idx]['identification'].rt_references[0].rt_peak
ref_mz = dataset[0][compound_idx]['identification'].mz_references[0].mz
@@ -135,7 +136,8 @@ def make_stats_table(workflow_name: str = "JGI-HILIC", input_fname: Optional[Pat
& ((abs(msms_hits_df['measured_precursor_mz'].values.astype(float) - mz_theoretical)/mz_theoretical) \
<= cid.mz_references[0].mz_tolerance*1e-6)]

comp_msms_hits = comp_msms_hits.sort_values('score', ascending=False)
comp_msms_hits = sp.sort_msms_hits(comp_msms_hits)
#msms_hits_sorted_list.append(comp_msms_hits)
file_idxs, scores, msv_sample_list, msv_ref_list, rt_list = [], [], [], [], []
if len(comp_msms_hits) > 0 and not np.isnan(np.concatenate(comp_msms_hits['msv_ref_aligned'].values, axis=1)).all():
file_idxs = [file_names.index(f) for f in comp_msms_hits['file_name'] if f in file_names]
@@ -426,6 +428,13 @@ def make_stats_table(workflow_name: str = "JGI-HILIC", input_fname: Optional[Pat
dfs['msms_score'].iat[compound_idx, file_idx] = rows.loc[rows['score'].astype(float).idxmax()]['score']
dfs['num_frag_matches'].iat[compound_idx, file_idx] = rows.loc[rows['score'].astype(float).idxmax()]['num_matches']

# method = 'numeric_hierarchy'
# msms_hits_sorted_df = pd.concat(msms_hits_sorted_list)
# msms_hits_sorted_df.to_csv(f"/out/msms_hits_sorted_{method}.csv")
# best_hits = msms_hits_sorted_df.groupby(['inchi_key', 'adduct']).first().reset_index()
# best_hits.to_csv(f"/out/best_hits_{method}.csv")
# best_hits.to_json(f"/out/best_hits_{method}.json")

passing['msms_score'] = (np.nan_to_num(dfs['msms_score'].values) >= min_msms_score).astype(float)
passing['num_frag_matches'] = (np.nan_to_num(dfs['num_frag_matches'].values) >= min_num_frag_matches).astype(float)

55 changes: 55 additions & 0 deletions metatlas/tools/spectralprocessing.py
Original file line number Diff line number Diff line change
@@ -1577,3 +1577,58 @@ def next2pow(x):
contributions[i, j, c] = np.round((ifft((fft_ptA / (tAs[i])) * (itAs[i, j])).real[cutoff_idx][c] / ptA[cutoff_idx][c]) * element_vector[i])

return np.array([MA[cutoff_idx], ptA[cutoff_idx]]), contributions


def sort_msms_hits(msms_hits: pd.DataFrame, sorting_method: str = 'score') -> pd.DataFrame:
    """
    Takes an msms hits dataframe and returns a sorted version of it based on the sorting method. Typically
    this function is called while iterating over compounds, so each dataframe input will likely be for a single compound.

    Args:
        msms_hits: MSMS hits dataframe. Must contain a 'score' column; the 'sums' and
            'weighted' methods additionally require 'num_matches', 'ref_frags' and
            'data_frags' columns, and the two hierarchy methods require 'num_matches'.
        sorting_method: one of 'score', 'sums', 'weighted', 'numeric_hierarchy' or
            'quantile_hierarchy'. Defaults to 'score'.

    Returns:
        A new dataframe sorted best-hit-first; the input dataframe is not modified.
        The non-'score' methods leave their intermediate ranking column
        ('summed_ratios_and_score', 'weighted_score' or 'score_bin') in the output.

    Raises:
        ValueError: if sorting_method is not one of the supported methods.
    """
    if sorting_method == "score":
        # Plain descending sort on the raw similarity score.
        sorted_msms_hits = msms_hits.sort_values('score', ascending=False)

    elif sorting_method == "sums":
        sorted_msms_hits = msms_hits.copy()
        # Score plus the fraction of reference and of data fragments that matched.
        # For non-matched hits ref_frags/data_frags are 0, so 0/0 yields NaN and
        # those rows sort last (sort_values defaults to na_position='last').
        sorted_msms_hits.loc[:, 'summed_ratios_and_score'] = (
            (sorted_msms_hits['num_matches'] / sorted_msms_hits['ref_frags']) +
            (sorted_msms_hits['num_matches'] / sorted_msms_hits['data_frags']) +
            (sorted_msms_hits['score'])
        )
        sorted_msms_hits = sorted_msms_hits.sort_values('summed_ratios_and_score', ascending=False)

    elif sorting_method == "weighted":
        sorted_msms_hits = msms_hits.copy()
        # Weighted combination: the raw score counts for half, the two match
        # ratios for a quarter each. Same NaN behavior as 'sums' when the
        # fragment counts are 0.
        weights = {'score': 0.5,
                   'match_to_data_frag_ratio': 0.25,
                   'match_to_ref_frag_ratio': 0.25
                   }
        sorted_msms_hits.loc[:, 'weighted_score'] = (
            (sorted_msms_hits['score'] * weights['score']) +
            ((sorted_msms_hits['num_matches'] / sorted_msms_hits['data_frags']) * weights['match_to_data_frag_ratio']) +
            ((sorted_msms_hits['num_matches'] / sorted_msms_hits['ref_frags']) * weights['match_to_ref_frag_ratio'])
        )
        sorted_msms_hits = sorted_msms_hits.sort_values('weighted_score', ascending=False)

    elif sorting_method == "numeric_hierarchy":
        sorted_msms_hits = msms_hits.copy()
        # Bin scores into fixed intervals (finer-grained near 1.0), then rank
        # within a bin by number of fragment matches before the raw score.
        bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 0.99, 1]
        labels = ['0-0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7',
                  '0.7-0.8', '0.8-0.9', '0.9-0.95', '0.95-0.97', '0.97-0.99', '0.99-1']
        sorted_msms_hits['score_bin'] = pd.cut(sorted_msms_hits['score'], bins=bins, labels=labels, right=False)
        sorted_msms_hits = sorted_msms_hits.sort_values(by=['score_bin', 'num_matches', 'score'],
                                                        ascending=[False, False, False])

    elif sorting_method == "quantile_hierarchy":
        sorted_msms_hits = msms_hits.copy()
        sorted_msms_hits = sorted_msms_hits.dropna(subset=['score'])
        # Tiny random jitter breaks score ties so qcut can form distinct quantile
        # edges. NOTE(review): unseeded, so bin assignment is non-deterministic
        # across runs — confirm this is acceptable for this method.
        sorted_msms_hits['score'] += np.random.normal(0, 1e-8, size=len(sorted_msms_hits))
        sorted_msms_hits['score_bin'] = pd.qcut(sorted_msms_hits['score'], duplicates='drop', q=5)
        sorted_msms_hits = sorted_msms_hits.sort_values(by=['score_bin', 'num_matches', 'score'],
                                                        ascending=[False, False, False])

    else:
        # Previously an unknown method fell through every branch and raised
        # UnboundLocalError at the return; fail fast with a clear message instead.
        raise ValueError(f"Unknown sorting_method: {sorting_method!r}")

    return sorted_msms_hits
13 changes: 7 additions & 6 deletions tests/unit/test_dill2plot.py

Large diffs are not rendered by default.