Merge branch 'feature/single_input_file' into 'develop'
single file input

Closes #32

See merge request msaid/chimerys/mokapot!30
gessulat committed Jan 12, 2023
2 parents 02bb7a3 + f650e72 commit 13c6669
Showing 7 changed files with 111 additions and 171 deletions.
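The substance of the change is visible in the two diffs below: brew() and assign_confidence() previously accepted a list of PSM collections (wrapping a lone collection via the iter()/TypeError idiom and checking that every collection shared one feature set), and now operate on a single psms_info dictionary whose "file" entry is one path instead of a list. A minimal before/after sketch of the calling convention; the dictionary keys follow the diff, but the path and column names are hypothetical:

    import mokapot

    # Hypothetical metadata for one input file, keyed as in the diff.
    psms_info = {
        "file": "sample.pin",                # a single path, no longer a list
        "target_column": "Label",
        "spectrum_columns": ["ScanNr"],
        "feature_columns": ["lnExpect", "deltaCn"],
    }

    # Before: brew([psms_info_a, psms_info_b]) returned a list of score arrays.
    # After: one dict in, one numpy array of scores out (note `scores`, not
    # `scores[0]`, in the new return statement).
    # psms_info, models, scores, desc = mokapot.brew(psms_info)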
61 changes: 18 additions & 43 deletions mokapot/brew.py
@@ -86,27 +86,11 @@ def brew(
     if model is None:
         model = PercolatorModel()
 
-    try:
-        iter(psms_info)
-    except TypeError:
-        psms_info = [psms_info]
-
-    # Check that all of the datasets have the same features:
-    feat_set = set(psms_info[0]["feature_columns"])
-    if not all([set(p["feature_columns"]) == feat_set for p in psms_info]):
-        raise ValueError("All collections of PSMs must use the same features.")
-
-    target_column = psms_info[0]["target_column"]
-    spectrum_columns = psms_info[0]["spectrum_columns"]
-    df_spectra = pd.concat(
-        [
-            read_file(
-                file_name=p["file"],
-                use_cols=spectrum_columns + [target_column],
-            )
-            for p in psms_info
-        ],
-        ignore_index=True,
+    target_column = psms_info["target_column"]
+    spectrum_columns = psms_info["spectrum_columns"]
+    df_spectra = read_file(
+        file_name=psms_info["file"],
+        use_cols=spectrum_columns + [target_column],
     ).apply(pd.to_numeric, errors="ignore")
     df_spectra = convert_targets_column(df_spectra, target_column)
     data_size = len(df_spectra)
@@ -120,9 +104,6 @@ def brew(
         num_targets,
         num_decoys,
     )
-    # once we check all the datasets have the same features we can use only one metedata information
-    psms_info[0]["file"] = [psm["file"] for psm in psms_info]
-    psms_info = psms_info[0]
     df_spectra = df_spectra[spectrum_columns]
     LOGGER.info("Splitting PSMs into %i folds...", folds)
     test_idx = _split(df_spectra, folds)
@@ -163,10 +144,10 @@ def brew(
     elif all([m[0].is_trained for m in models]):
         # If we don't reset, assign scores to each fold:
         models = [m for m, _ in models]
-        scores = [_predict(test_idx, psms_info, models, test_fdr)]
+        scores = _predict(test_idx, psms_info, models, test_fdr)
     else:
         # If model training has failed
-        scores = [np.zeros(len(p.data)) for p in psms_info]
+        scores = np.zeros(data_size)
     # Find which is best: the learned model, the best feature, or
     # a pretrained model.
     if type(model) is not list and not model.override:
@@ -177,27 +158,21 @@ def brew(
     else:
         feat_total = 0
 
-    preds = [
-        update_labels(p, s, test_fdr) for p, s in zip([psms_info], scores)
-    ]
-    pred_total = sum([(pred == 1).sum() for pred in preds])
+    preds = update_labels(psms_info, scores, test_fdr)
+
+    pred_total = sum([(preds == 1).sum()])
 
     # Here, f[0] is the name of the best feature, and f[3] is a boolean
     if feat_total > pred_total:
         using_best_feat = True
-        scores = []
-        descs = []
         feat, _, desc = best_feats[best_feat_idx]
-        for dat in psms_info["file"]:
-            df = pd.read_csv(
-                dat,
-                sep="\t",
-                usecols=[feat],
-                index_col=False,
-                on_bad_lines="skip",
-            )
-            scores.append(df.values)
-            descs.append(desc)
+        scores = pd.read_csv(
+            psms_info["file"],
+            sep="\t",
+            usecols=[feat],
+            index_col=False,
+            on_bad_lines="skip",
+        ).values
 
     else:
         using_best_feat = False
@@ -216,7 +191,7 @@ def brew(
             "using the original model."
         )
 
-    return psms_info, models, scores[0], desc
+    return psms_info, models, scores, desc
 
 
 # Utility Functions -----------------------------------------------------------
102 changes: 53 additions & 49 deletions mokapot/confidence.py
@@ -82,11 +82,12 @@ def __init__(
         dest_dir=None,
         file_root=None,
         sep="\t",
+        proteins=None,
         combine=False,
     ):
         """Initialize a GroupedConfidence object"""
         psms = read_file(
-            psms_info["file"][0],
+            psms_info["file"],
             use_cols=list(psms_info["feature_columns"])
             + list(psms_info["metadata_columns"]),
         )
@@ -111,8 +112,8 @@ def __init__(
             tdc_winners = group_df.index.intersection(idx)
             group_psms = group_df.loc[tdc_winners, :]
             group_scores = scores.loc[group_psms.index].values + 1
-            psms_info["file"] = ["group_psms.csv"]
-            group_psms.to_csv(psms_info["file"][0], sep="\t", index=False)
+            psms_info["file"] = "group_psms.csv"
+            group_psms.to_csv(psms_info["file"], sep="\t", index=False)
             assign_confidence(
                 psms_info,
                 group_scores * (2 * desc - 1),
@@ -122,6 +123,7 @@
                 file_root=file_root,
                 sep=sep,
                 decoys=decoys,
+                proteins=proteins,
                 group_column=group,
                 combine=combine,
             )
@@ -233,23 +235,21 @@ class Confidence:
         "peptide_pairs": "Peptide Pairs",
     }
 
-    def __init__(self, psms_info):
+    def __init__(self, psms_info, proteins=None):
         """Initialize a PsmConfidence object."""
         self._score_column = "score"
         self._target_column = psms_info["target_column"]
-        self._protein_column = psms_info["protein_column"]
+        self._protein_column = "proteinIds"
         self._group_column = psms_info["group_column"]
         self._metadata_column = psms_info["metadata_columns"]
-        self._has_proteins = psms_info["has_proteins"]
 
         self.scores = None
         self.targets = None
         self.qvals = None
         self.peps = None
 
-        if self._has_proteins:
-            self._proteins = self._has_proteins
-        else:
-            self._proteins = None
+        self._proteins = proteins
 
         # This attribute holds the results as DataFrames:
         self.confidence_estimates = {}
         self.decoy_confidence_estimates = {}
@@ -267,7 +267,9 @@ def levels(self):
         """
         return list(self.confidence_estimates.keys())
 
-    def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
+    def to_txt(
+        self, data_path, columns, level, decoys, file_root, dest_dir, sep
+    ):
         """Save confidence estimates to delimited text files.
         Parameters
         ----------
@@ -294,11 +296,10 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
             The paths to the saved files.
         """
 
         reader = read_file_in_chunks(
             file=data_path,
             chunk_size=CONFIDENCE_CHUNK_SIZE,
-            use_cols=["ScanNr", "Peptide", "Proteins"],
+            use_cols=[i for i in columns if i != self._target_column],
         )
 
         self.scores = utils.create_chunks(
@@ -314,27 +315,22 @@
             self.targets, chunk_size=CONFIDENCE_CHUNK_SIZE
         )
 
-        output_columns = sep.join(
-            [
-                "PSMId",
-                "Peptide",
-                "score",
-                "q-value",
-                "posterior_error_prob",
-                "proteinIds",
-                "\n",
-            ]
-        )
         if file_root is not None:
             dest_dir = Path(dest_dir, file_root)
         outfile_t = str(dest_dir) + f"targets.{level}"
         outfile_d = str(dest_dir) + f"decoys.{level}"
 
+        columns.remove(self._target_column)
+        output_columns = columns + ["q-value", "posterior_error_prob"]
+        if level != "proteins" and self._protein_column is not None:
+            output_columns.remove(self._protein_column)
+            output_columns.append(self._protein_column)
         if not os.path.exists(outfile_t):
             with open(outfile_t, "w") as fp:
-                fp.write(output_columns)
+                fp.write(f"{sep.join(output_columns)}\n")
         if decoys and not os.path.exists(outfile_d):
             with open(outfile_d, "w") as fp:
-                fp.write(output_columns)
+                fp.write(f"{sep.join(output_columns)}\n")
 
         for data_in, score_in, qvals_in, pep_in, target_in in zip(
             reader, self.scores, self.qvals, self.peps, self.targets
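One detail worth flagging in the hunk above: the output header is now derived from the data's own columns instead of a hard-coded list, the remove/append pair simply moves the protein column to the end, and the header write drops the old trick of joining "\n" in as if it were an extra column, which left a stray separator before the newline. A quick illustration with example values (sep and cols here are not from the commit):

    sep = "\t"
    cols = ["PSMId", "score"]
    old_header = sep.join(cols + ["\n"])  # 'PSMId\tscore\t\n' -- trailing tab
    new_header = f"{sep.join(cols)}\n"    # 'PSMId\tscore\n'  -- clean header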
@@ -453,16 +449,17 @@ def __init__(
         dest_dir=None,
         file_root=None,
         decoys=None,
+        proteins=None,
         sep="\t",
         group_column=None,
         combine=False,
     ):
         """Initialize a a LinearPsmConfidence object"""
-        super().__init__(psms_info)
+        super().__init__(psms_info, proteins)
         self._target_column = psms_info["target_column"]
         self._psm_columns = psms_info["spectrum_columns"]
         self._peptide_column = psms_info["peptide_column"]
-        self._protein_column = psms_info["protein_column"]
+        self._protein_column = "proteinIds"
         self._eval_fdr = eval_fdr
 
         LOGGER.info("Performing target-decoy competition...")
@@ -496,7 +493,7 @@ def __repr__(self):
             f"{self.accepted['peptides']}\n"
         )
 
-        if self._has_proteins:
+        if self._proteins:
             base += (
                 f"\t- Protein groups at q<={self._eval_fdr:g}: "
                 f"{self.accepted['proteins']}\n"
@@ -555,10 +552,8 @@ def _assign_confidence(
         """
         levels = ["PSMs", "peptides"]
         level_data_path = [psms_path, peptides_path]
-        if self._has_proteins:
-            data = read_file(
-                peptides_path, use_cols=self._metadata_column + ["score"]
-            )
+        if self._proteins:
+            data = read_file(peptides_path)
             data = data.apply(pd.to_numeric, errors="ignore")
             convert_targets_column(
                 data=data, target_column=self._target_column
@@ -577,15 +572,15 @@
             LOGGER.info("\t- Found %i unique protein groups.", len(proteins))
 
         for level, data_path in zip(levels, level_data_path):
-            data = read_file(
-                data_path, use_cols=self._metadata_column + ["score"]
-            )
+            data = read_file(data_path)
             data = data.apply(pd.to_numeric, errors="ignore")
+            data_columns = list(data.columns)
             convert_targets_column(
                 data=data, target_column=self._target_column
             )
             self.scores = data.loc[:, self._score_column].values
             self.targets = data.loc[:, self._target_column].astype(bool).values
+            del data
             if all(self.targets):
                 LOGGER.warning(
                     "No decoy PSMs remain for confidence estimation. "
@@ -624,11 +619,23 @@
             if group_column and not combine:
                 file_root = f"{group_column}."
                 self.to_txt(
-                    data_path, level.lower(), decoys, file_root, dest_dir, sep
+                    data_path,
+                    data_columns,
+                    level.lower(),
+                    decoys,
+                    file_root,
+                    dest_dir,
+                    sep,
                 )
             else:
                 self.to_txt(
-                    data_path, level.lower(), decoys, file_root, dest_dir, sep
+                    data_path,
+                    data_columns,
+                    level.lower(),
+                    decoys,
+                    file_root,
+                    dest_dir,
+                    sep,
                 )
 
     def to_flashlfq(self, out_file="mokapot.flashlfq.txt"):
@@ -800,6 +807,7 @@ def assign_confidence(
     file_root=None,
     sep="\t",
     decoys=False,
+    proteins=None,
     group_column=None,
     combine=False,
 ):
@@ -847,17 +855,11 @@
     if scores is None:
         feat, _, _, desc = find_best_feature(psms_info, eval_fdr)
         LOGGER.info("Selected %s as the best feature.", feat)
-        scores = pd.concat(
-            [
-                read_file(file_name=file, use_cols=[feat])
-                for file in psms_info["file"]
-            ],
-            ignore_index=True,
-        ).values
+        scores = read_file(file_name=psms_info["file"], use_cols=[feat]).values
 
     if psms_info["group_column"] is None:
         reader = read_file_in_chunks(
-            file=psms_info["file"][0],
+            file=psms_info["file"],
             chunk_size=CONFIDENCE_CHUNK_SIZE,
             use_cols=psms_info["metadata_columns"],
         )
@@ -874,7 +876,6 @@
                 mode="a",
                 header=None,
             )
-        metadata_columns = list(chunk_metadata.columns)
         psms_path = "psms.csv"
         peptides_path = "peptides.csv"
         iterable_sorted = utils.sort_file_on_disk(
@@ -886,10 +887,11 @@
             "Keeping the best match per %s columns...",
             "+".join(psms_info["spectrum_columns"]),
         )
+        metadata_columns = ["PSMId", "Label", "Peptide", "proteinIds", "score"]
         with open(psms_path, "w") as f_psm:
-            f_psm.write(sep.join(metadata_columns + ["score", "\n"]))
+            f_psm.write(f"{sep.join(metadata_columns)}\n")
         with open(peptides_path, "w") as f_peptide:
-            f_peptide.write(sep.join(metadata_columns + ["score", "\n"]))
+            f_peptide.write(f"{sep.join(metadata_columns)}\n")
 
         unique_psms, unique_peptides = utils.get_unique_psms_and_peptides(
             iterable=iterable_sorted,
@@ -911,6 +913,7 @@
             file_root=file_root,
             sep=sep,
             decoys=decoys,
+            proteins=proteins,
             group_column=group_column,
             combine=combine,
         )
@@ -925,6 +928,7 @@
             file_root=file_root,
             sep=sep,
             decoys=decoys,
+            proteins=proteins,
             combine=combine,
         )
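Taken together, the confidence.py changes retire the psms_info["has_proteins"] flag in favor of an explicit proteins object threaded through assign_confidence, GroupedConfidence, and LinearPsmConfidence. A sketch of how a caller might now supply it, assuming the upstream mokapot read_fasta helper is available in this fork (the FASTA path is hypothetical):

    import mokapot

    # Protein-level inference now rides on an explicit argument rather than
    # a flag tucked inside psms_info.
    proteins = mokapot.read_fasta("human_target_decoy.fasta")  # hypothetical path
    # assign_confidence(psms_info, scores=scores, proteins=proteins, decoys=True)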
