Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hits sorting testbed #1018

Merged
merged 12 commits into from
Feb 3, 2025
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion metatlas/plots/dill2plots.py
Original file line number Diff line number Diff line change
@@ -306,7 +306,7 @@ def __init__(self,
"""
logger.debug("Initializing new instance of %s.", self.__class__.__name__)
self.data = data
self.msms_hits = msms_hits.sort_values('score', ascending=False)
self.msms_hits = sp.sort_msms_hits(msms_hits)
self.color_me = or_default(color_me, [('black', '')])
self.compound_idx = compound_idx
self.width = width
@@ -2278,6 +2278,8 @@ def create_nonmatched_msms_hits(msms_data: pd.DataFrame, inchi_key: str, compoun
compound_msms_hits['score'] = compound_msms_hits['measured_precursor_intensity']
compound_msms_hits['num_matches'] = 0
compound_msms_hits['spectrum'] = [np.array([np.array([]), np.array([])])] * len(compound_msms_hits)
compound_msms_hits['ref_frags'] = 0
compound_msms_hits['data_frags'] = 0

return compound_msms_hits

@@ -2348,6 +2350,9 @@ def get_hits_per_compound(cos: Type[CosineHungarian], compound_idx: int,
inchi_msms_hits['score'] = scores_matches['score'].flatten()
inchi_msms_hits['num_matches'] = scores_matches['matches'].flatten()

inchi_msms_hits['ref_frags'] = inchi_msms_hits['spectrum'].apply(lambda x: len(x[0]) if x.any() else 0)
inchi_msms_hits['data_frags'] = inchi_msms_hits['query_spectrum'].apply(lambda x: len(x[0]) if x.any() else 0)

inchi_msms_hits['precursor_ppm_error'] = (abs(inchi_msms_hits['measured_precursor_mz'] - inchi_msms_hits['precursor_mz']) / inchi_msms_hits['precursor_mz']) * 1000000
inchi_msms_hits = inchi_msms_hits[inchi_msms_hits['precursor_ppm_error']<=inchi_msms_hits['cid_pmz_tolerance']]

@@ -2385,6 +2390,7 @@ def get_msms_hits(metatlas_dataset: MetatlasDataset, extra_time: bool | float =
"""

msms_hits_cols = ['database', 'id', 'file_name', 'msms_scan', 'score', 'num_matches',
'ref_frags', 'data_frags',
'msv_query_aligned', 'msv_ref_aligned', 'name', 'adduct', 'inchi_key',
'precursor_mz', 'measured_precursor_mz',
'measured_precursor_intensity']
11 changes: 10 additions & 1 deletion metatlas/tools/fastanalysis.py
Original file line number Diff line number Diff line change
@@ -110,6 +110,7 @@ def make_stats_table(workflow_name: str = "JGI-HILIC", input_fname: Optional[Pat
msms_hits_df = msms_hits.copy()
msms_hits_df.reset_index(inplace=True)

#msms_hits_sorted_list = []
for compound_idx, compound_name in enumerate(compound_names):
ref_rt_peak = dataset[0][compound_idx]['identification'].rt_references[0].rt_peak
ref_mz = dataset[0][compound_idx]['identification'].mz_references[0].mz
@@ -135,7 +136,8 @@ def make_stats_table(workflow_name: str = "JGI-HILIC", input_fname: Optional[Pat
& ((abs(msms_hits_df['measured_precursor_mz'].values.astype(float) - mz_theoretical)/mz_theoretical) \
<= cid.mz_references[0].mz_tolerance*1e-6)]

comp_msms_hits = comp_msms_hits.sort_values('score', ascending=False)
comp_msms_hits = sp.sort_msms_hits(comp_msms_hits)
#msms_hits_sorted_list.append(comp_msms_hits)
file_idxs, scores, msv_sample_list, msv_ref_list, rt_list = [], [], [], [], []
if len(comp_msms_hits) > 0 and not np.isnan(np.concatenate(comp_msms_hits['msv_ref_aligned'].values, axis=1)).all():
file_idxs = [file_names.index(f) for f in comp_msms_hits['file_name'] if f in file_names]
@@ -426,6 +428,13 @@ def make_stats_table(workflow_name: str = "JGI-HILIC", input_fname: Optional[Pat
dfs['msms_score'].iat[compound_idx, file_idx] = rows.loc[rows['score'].astype(float).idxmax()]['score']
dfs['num_frag_matches'].iat[compound_idx, file_idx] = rows.loc[rows['score'].astype(float).idxmax()]['num_matches']

# method = 'numeric_hierarchy'
# msms_hits_sorted_df = pd.concat(msms_hits_sorted_list)
# msms_hits_sorted_df.to_csv(f"/out/msms_hits_sorted_{method}.csv")
# best_hits = msms_hits_sorted_df.groupby(['inchi_key', 'adduct']).first().reset_index()
# best_hits.to_csv(f"/out/best_hits_{method}.csv")
# best_hits.to_json(f"/out/best_hits_{method}.json")

passing['msms_score'] = (np.nan_to_num(dfs['msms_score'].values) >= min_msms_score).astype(float)
passing['num_frag_matches'] = (np.nan_to_num(dfs['num_frag_matches'].values) >= min_num_frag_matches).astype(float)

55 changes: 55 additions & 0 deletions metatlas/tools/spectralprocessing.py
Original file line number Diff line number Diff line change
@@ -1577,3 +1577,58 @@ def next2pow(x):
contributions[i, j, c] = np.round((ifft((fft_ptA / (tAs[i])) * (itAs[i, j])).real[cutoff_idx][c] / ptA[cutoff_idx][c]) * element_vector[i])

return np.array([MA[cutoff_idx], ptA[cutoff_idx]]), contributions


def sort_msms_hits(msms_hits: pd.DataFrame, sorting_method: str = 'score') -> pd.DataFrame:
    """
    Takes an msms hits dataframe and returns a sorted version of it based on the sorting method. Typically
    this function is called while iterating over compounds, so each dataframe input will likely be for a single compound.

    Args:
        msms_hits: MSMS hits dataframe. Must contain a 'score' column; the 'sums' and
            'weighted' methods additionally require 'num_matches', 'ref_frags' and
            'data_frags' columns, and the two hierarchy methods require 'num_matches'.
        sorting_method: one of 'score', 'sums', 'weighted', 'numeric_hierarchy' or
            'quantile_hierarchy'. Defaults to 'score'.

    Returns:
        A new dataframe sorted best-hit-first; the input dataframe is not modified.
        The non-'score' methods leave their intermediate ranking column
        ('summed_ratios_and_score', 'weighted_score' or 'score_bin') in the output.

    Raises:
        ValueError: if sorting_method is not one of the supported methods.
    """
    if sorting_method == "score":
        # Plain descending sort on the raw similarity score.
        sorted_msms_hits = msms_hits.sort_values('score', ascending=False)

    elif sorting_method == "sums":
        sorted_msms_hits = msms_hits.copy()
        # Score plus the fraction of reference and of data fragments that matched.
        # For non-matched hits ref_frags/data_frags are 0, so 0/0 yields NaN and
        # those rows sort last (sort_values defaults to na_position='last').
        sorted_msms_hits.loc[:, 'summed_ratios_and_score'] = (
            (sorted_msms_hits['num_matches'] / sorted_msms_hits['ref_frags']) +
            (sorted_msms_hits['num_matches'] / sorted_msms_hits['data_frags']) +
            (sorted_msms_hits['score'])
        )
        sorted_msms_hits = sorted_msms_hits.sort_values('summed_ratios_and_score', ascending=False)

    elif sorting_method == "weighted":
        sorted_msms_hits = msms_hits.copy()
        # Weighted combination: the raw score counts for half, the two match
        # ratios for a quarter each. Same NaN behavior as 'sums' when the
        # fragment counts are 0.
        weights = {'score': 0.5,
                   'match_to_data_frag_ratio': 0.25,
                   'match_to_ref_frag_ratio': 0.25
                   }
        sorted_msms_hits.loc[:, 'weighted_score'] = (
            (sorted_msms_hits['score'] * weights['score']) +
            ((sorted_msms_hits['num_matches'] / sorted_msms_hits['data_frags']) * weights['match_to_data_frag_ratio']) +
            ((sorted_msms_hits['num_matches'] / sorted_msms_hits['ref_frags']) * weights['match_to_ref_frag_ratio'])
        )
        sorted_msms_hits = sorted_msms_hits.sort_values('weighted_score', ascending=False)

    elif sorting_method == "numeric_hierarchy":
        sorted_msms_hits = msms_hits.copy()
        # Bin scores into fixed intervals (finer-grained near 1.0), then rank
        # within a bin by number of fragment matches before the raw score.
        bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.97, 0.99, 1]
        labels = ['0-0.1', '0.1-0.2', '0.2-0.3', '0.3-0.4', '0.4-0.5', '0.5-0.6', '0.6-0.7',
                  '0.7-0.8', '0.8-0.9', '0.9-0.95', '0.95-0.97', '0.97-0.99', '0.99-1']
        sorted_msms_hits['score_bin'] = pd.cut(sorted_msms_hits['score'], bins=bins, labels=labels, right=False)
        sorted_msms_hits = sorted_msms_hits.sort_values(by=['score_bin', 'num_matches', 'score'],
                                                        ascending=[False, False, False])

    elif sorting_method == "quantile_hierarchy":
        sorted_msms_hits = msms_hits.copy()
        sorted_msms_hits = sorted_msms_hits.dropna(subset=['score'])
        # Tiny random jitter breaks score ties so qcut can form distinct quantile
        # edges. NOTE(review): unseeded, so bin assignment is non-deterministic
        # across runs — confirm this is acceptable for this method.
        sorted_msms_hits['score'] += np.random.normal(0, 1e-8, size=len(sorted_msms_hits))
        sorted_msms_hits['score_bin'] = pd.qcut(sorted_msms_hits['score'], duplicates='drop', q=5)
        sorted_msms_hits = sorted_msms_hits.sort_values(by=['score_bin', 'num_matches', 'score'],
                                                        ascending=[False, False, False])

    else:
        # Previously an unknown method fell through every branch and raised
        # UnboundLocalError at the return; fail fast with a clear message instead.
        raise ValueError(f"Unknown sorting_method: {sorting_method!r}")

    return sorted_msms_hits
13 changes: 7 additions & 6 deletions tests/unit/test_dill2plot.py

Large diffs are not rendered by default.