From 177dc7a4e8575e34b184db71b01b5d5de9373cb1 Mon Sep 17 00:00:00 2001
From: samia <samia.benfredj@msaid.de>
Date: Tue, 10 Jan 2023 16:39:47 +0100
Subject: [PATCH 1/4] fix bug when max_workers is set to 1: convert train_sets
 generator to list

---
 mokapot/brew.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/mokapot/brew.py b/mokapot/brew.py
index 33ad5993..0d12184d 100644
--- a/mokapot/brew.py
+++ b/mokapot/brew.py
@@ -127,14 +127,14 @@ def brew(
     LOGGER.info("Splitting PSMs into %i folds...", folds)
     test_idx = _split(df_spectra, folds)
     del df_spectra
-    train_sets = make_train_sets(
-        test_idx=test_idx,
-        subset_max_train=subset_max_train,
-        data_size=data_size,
+    train_sets = list(
+        make_train_sets(
+            test_idx=test_idx,
+            subset_max_train=subset_max_train,
+            data_size=data_size,
+        )
     )
-    if max_workers != 1:
-        # train_sets can't be a generator for joblib :(
-        train_sets = list(train_sets)
+
     train_psms = parse_in_chunks(
         psms_info=psms_info,
         idx=train_sets,

From b7fb3d627082aea1af1d189edd5f6e4479067392 Mon Sep 17 00:00:00 2001
From: samia <samia.benfredj@msaid.de>
Date: Tue, 10 Jan 2023 11:04:37 +0100
Subject: [PATCH 2/4] single file input: - remove aggregate option from config
 - refactor parse input file - refactor functions in brew, confidence and
 dataset files to read only one input file

---
 mokapot/brew.py        | 61 +++++++++++++-----------------------------
 mokapot/confidence.py  | 14 +++-------
 mokapot/config.py      | 15 -----------
 mokapot/dataset.py     | 33 +++++++----------------
 mokapot/mokapot.py     | 15 +++--------
 mokapot/parsers/pin.py | 44 +++++++++++++++---------------
 6 files changed, 56 insertions(+), 126 deletions(-)

diff --git a/mokapot/brew.py b/mokapot/brew.py
index 0d12184d..98b01f36 100644
--- a/mokapot/brew.py
+++ b/mokapot/brew.py
@@ -86,27 +86,11 @@ def brew(
     if model is None:
         model = PercolatorModel()
 
-    try:
-        iter(psms_info)
-    except TypeError:
-        psms_info = [psms_info]
-
-    # Check that all of the datasets have the same features:
-    feat_set = set(psms_info[0]["feature_columns"])
-    if not all([set(p["feature_columns"]) == feat_set for p in psms_info]):
-        raise ValueError("All collections of PSMs must use the same features.")
-
-    target_column = psms_info[0]["target_column"]
-    spectrum_columns = psms_info[0]["spectrum_columns"]
-    df_spectra = pd.concat(
-        [
-            read_file(
-                file_name=p["file"],
-                use_cols=spectrum_columns + [target_column],
-            )
-            for p in psms_info
-        ],
-        ignore_index=True,
+    target_column = psms_info["target_column"]
+    spectrum_columns = psms_info["spectrum_columns"]
+    df_spectra = read_file(
+        file_name=psms_info["file"],
+        use_cols=spectrum_columns + [target_column],
     ).apply(pd.to_numeric, errors="ignore")
     df_spectra = convert_targets_column(df_spectra, target_column)
     data_size = len(df_spectra)
@@ -120,9 +104,6 @@ def brew(
             num_targets,
             num_decoys,
         )
-    # once we check all the datasets have the same features we can use only one metedata information
-    psms_info[0]["file"] = [psm["file"] for psm in psms_info]
-    psms_info = psms_info[0]
     df_spectra = df_spectra[spectrum_columns]
     LOGGER.info("Splitting PSMs into %i folds...", folds)
     test_idx = _split(df_spectra, folds)
@@ -163,10 +144,10 @@ def brew(
     elif all([m[0].is_trained for m in models]):
         # If we don't reset, assign scores to each fold:
         models = [m for m, _ in models]
-        scores = [_predict(test_idx, psms_info, models, test_fdr)]
+        scores = _predict(test_idx, psms_info, models, test_fdr)
     else:
         # If model training has failed
-        scores = [np.zeros(len(p.data)) for p in psms_info]
+        scores = np.zeros(data_size)
     # Find which is best: the learned model, the best feature, or
     # a pretrained model.
     if type(model) is not list and not model.override:
@@ -177,27 +158,21 @@ def brew(
     else:
         feat_total = 0
 
-    preds = [
-        update_labels(p, s, test_fdr) for p, s in zip([psms_info], scores)
-    ]
-    pred_total = sum([(pred == 1).sum() for pred in preds])
+    preds = update_labels(psms_info, scores, test_fdr)
+
+    pred_total = sum([(preds == 1).sum()])
 
     # Here, f[0] is the name of the best feature, and f[3] is a boolean
     if feat_total > pred_total:
         using_best_feat = True
-        scores = []
-        descs = []
         feat, _, desc = best_feats[best_feat_idx]
-        for dat in psms_info["file"]:
-            df = pd.read_csv(
-                dat,
-                sep="\t",
-                usecols=[feat],
-                index_col=False,
-                on_bad_lines="skip",
-            )
-            scores.append(df.values)
-            descs.append(desc)
+        scores = pd.read_csv(
+            psms_info["file"],
+            sep="\t",
+            usecols=[feat],
+            index_col=False,
+            on_bad_lines="skip",
+        ).values
 
     else:
         using_best_feat = False
@@ -216,7 +191,7 @@ def brew(
             "using the original model."
         )
 
-    return psms_info, models, scores[0], desc
+    return psms_info, models, scores, desc
 
 
 # Utility Functions -----------------------------------------------------------
diff --git a/mokapot/confidence.py b/mokapot/confidence.py
index 5cf3ac19..eefa6a75 100644
--- a/mokapot/confidence.py
+++ b/mokapot/confidence.py
@@ -86,7 +86,7 @@ def __init__(
     ):
         """Initialize a GroupedConfidence object"""
         psms = read_file(
-            psms_info["file"][0],
+            psms_info["file"],
             use_cols=list(psms_info["feature_columns"])
             + list(psms_info["metadata_columns"]),
         )
@@ -112,7 +112,7 @@ def __init__(
             group_psms = group_df.loc[tdc_winners, :]
             group_scores = scores.loc[group_psms.index].values + 1
             psms_info["file"] = ["group_psms.csv"]
-            group_psms.to_csv(psms_info["file"][0], sep="\t", index=False)
+            group_psms.to_csv(psms_info["file"], sep="\t", index=False)
             assign_confidence(
                 psms_info,
                 group_scores * (2 * desc - 1),
@@ -847,17 +847,11 @@ def assign_confidence(
     if scores is None:
         feat, _, _, desc = find_best_feature(psms_info, eval_fdr)
         LOGGER.info("Selected %s as the best feature.", feat)
-        scores = pd.concat(
-            [
-                read_file(file_name=file, use_cols=[feat])
-                for file in psms_info["file"]
-            ],
-            ignore_index=True,
-        ).values
+        scores = read_file(file_name=psms_info["file"], use_cols=[feat]).values
 
     if psms_info["group_column"] is None:
         reader = read_file_in_chunks(
-            file=psms_info["file"][0],
+            file=psms_info["file"],
             chunk_size=CONFIDENCE_CHUNK_SIZE,
             use_cols=psms_info["metadata_columns"],
         )
diff --git a/mokapot/config.py b/mokapot/config.py
index 0171f26b..fcc27169 100644
--- a/mokapot/config.py
+++ b/mokapot/config.py
@@ -48,7 +48,6 @@ def _parser():
     parser.add_argument(
         "psm_files",
         type=str,
-        nargs="+",
         help=(
             "A collection of PSMs in the Percolator tab-delimited or PepXML "
             "format."
@@ -209,20 +208,6 @@ def _parser():
         ),
     )
 
-    parser.add_argument(
-        "--aggregate",
-        default=False,
-        action="store_true",
-        help=(
-            "If used, PSMs from multiple PIN files will be "
-            "aggregated and analyzed together. Otherwise, "
-            "a joint model will be trained, but confidence "
-            "estimates will be calculated separately for "
-            "each PIN file. This flag only has an effect "
-            "when multiple PIN files are provided."
-        ),
-    )
-
     parser.add_argument(
         "--subset_max_train",
         type=int,
diff --git a/mokapot/dataset.py b/mokapot/dataset.py
index 0c46bdf8..87b0124f 100644
--- a/mokapot/dataset.py
+++ b/mokapot/dataset.py
@@ -624,16 +624,10 @@ def calibrate_scores(scores, targets, eval_fdr, desc=True):
 
 
 def targets_count_by_feature(psms_info, eval_fdr, columns, desc):
-    df = pd.concat(
-        [
-            read_file(
-                file_name=file, use_cols=columns + [psms_info["target_column"]]
-            )
-            for file in psms_info["file"]
-        ],
-        ignore_index=True,
+    df = read_file(
+        file_name=psms_info["file"],
+        use_cols=columns + [psms_info["target_column"]],
     )
-
     return (
         df.loc[:, columns].apply(
             _update_labels,
@@ -668,16 +662,11 @@ def find_best_feature(psms_info, eval_fdr):
         if num_passing > best_positives:
             best_positives = num_passing
             best_feat = feat_idx
-            df = pd.concat(
-                [
-                    read_file(
-                        file_name=file,
-                        use_cols=[best_feat, psms_info["target_column"]],
-                    )
-                    for file in psms_info["file"]
-                ],
-                ignore_index=True,
+            df = read_file(
+                file_name=psms_info["file"],
+                use_cols=[best_feat, psms_info["target_column"]],
             )
+
             new_labels = _update_labels(
                 scores=df.loc[:, best_feat],
                 targets=df[psms_info["target_column"]],
@@ -693,12 +682,8 @@ def find_best_feature(psms_info, eval_fdr):
 
 
 def update_labels(psms_info, scores, eval_fdr=0.01, desc=True):
-    df = pd.concat(
-        [
-            read_file(file_name=_file, use_cols=[psms_info["target_column"]])
-            for _file in psms_info["file"]
-        ],
-        ignore_index=True,
+    df = read_file(
+        file_name=psms_info["file"], use_cols=[psms_info["target_column"]]
     )
     return _update_labels(
         scores=scores,
diff --git a/mokapot/mokapot.py b/mokapot/mokapot.py
index 22511f14..12cbf5fe 100644
--- a/mokapot/mokapot.py
+++ b/mokapot/mokapot.py
@@ -57,11 +57,7 @@ def main():
 
     # Parse Datasets
     parse = get_parser(config)
-    if config.aggregate or len(config.psm_files) == 1:
-        datasets = parse(config.psm_files)
-    else:
-        datasets = [parse(f) for f in config.psm_files]
-        prefixes = [Path(f).stem for f in config.psm_files]
+    dataset = parse(config.psm_files)
 
     # Parse FASTA, if required:
     if config.proteins is not None:
@@ -77,11 +73,7 @@ def main():
             decoy_prefix=config.decoy_prefix,
         )
 
-        if config.aggregate or len(config.psm_files) == 1:
-            datasets.add_proteins(proteins)
-        else:
-            for dataset in datasets:
-                dataset.add_proteins(proteins)
+        dataset.add_proteins(proteins)
 
     # Define a model:
     if config.init_weights:
@@ -101,10 +93,11 @@ def main():
             direction=config.direction,
             override=config.override,
         )
+    print(dataset)
 
     # Fit the models:
     psms_info, models, scores, desc = brew(
-        datasets,
+        dataset,
         model=model,
         test_fdr=config.test_fdr,
         folds=config.folds,
diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py
index 95b575d2..aa7c619b 100644
--- a/mokapot/parsers/pin.py
+++ b/mokapot/parsers/pin.py
@@ -18,7 +18,7 @@
 
 # Functions -------------------------------------------------------------------
 def read_pin(
-    pin_files,
+    pin_file,
     has_proteins=None,
     group_column=None,
     filename_column=None,
@@ -98,20 +98,17 @@ def read_pin(
         PSMs from all of the PIN files.
     """
     logging.info("Parsing PSMs...")
-    return [
-        read_percolator(
-            file,
-            has_proteins=has_proteins,
-            group_column=group_column,
-            filename_column=filename_column,
-            calcmass_column=calcmass_column,
-            expmass_column=expmass_column,
-            rt_column=rt_column,
-            charge_column=charge_column,
-            copy_data=copy_data,
-        )
-        for file in utils.tuplize(pin_files)
-    ]
+    return read_percolator(
+        pin_file,
+        has_proteins=has_proteins,
+        group_column=group_column,
+        filename_column=filename_column,
+        calcmass_column=calcmass_column,
+        expmass_column=expmass_column,
+        rt_column=rt_column,
+        charge_column=charge_column,
+        copy_data=copy_data,
+    )
 
 
 def read_percolator(
@@ -334,14 +331,15 @@ def parse_in_chunks(psms_info, idx, chunk_size):
         list of dataframes
     """
     train_psms = [[] for _ in range(len(idx))]
-    for file in psms_info["file"]:
-        reader = read_file_in_chunks(
-            file=file, chunk_size=chunk_size, use_cols=psms_info["columns"]
-        )
-        Parallel(n_jobs=-1, require="sharedmem")(
-            delayed(get_rows_from_dataframe)(idx, chunk, train_psms)
-            for chunk in reader
-        )
+    reader = read_file_in_chunks(
+        file=psms_info["file"],
+        chunk_size=chunk_size,
+        use_cols=psms_info["columns"],
+    )
+    Parallel(n_jobs=-1, require="sharedmem")(
+        delayed(get_rows_from_dataframe)(idx, chunk, train_psms)
+        for chunk in reader
+    )
 
     return Parallel(n_jobs=-1, require="sharedmem")(
         delayed(concat_chunks)(df=df, orig_idx=orig_idx)

From 0baf57081c18a7c800750e230a84aebab949f1a2 Mon Sep 17 00:00:00 2001
From: samia <samia.benfredj@msaid.de>
Date: Tue, 10 Jan 2023 16:25:41 +0100
Subject: [PATCH 3/4] fix protein level confidence

---
 mokapot/confidence.py  | 82 +++++++++++++++++++++++++++++-------------
 mokapot/mokapot.py     |  6 ++--
 mokapot/parsers/pin.py |  4 ---
 3 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/mokapot/confidence.py b/mokapot/confidence.py
index eefa6a75..deea6f2f 100644
--- a/mokapot/confidence.py
+++ b/mokapot/confidence.py
@@ -82,6 +82,7 @@ def __init__(
         dest_dir=None,
         file_root=None,
         sep="\t",
+        proteins=None,
         combine=False,
     ):
         """Initialize a GroupedConfidence object"""
@@ -122,6 +123,7 @@ def __init__(
                 file_root=file_root,
                 sep=sep,
                 decoys=decoys,
+                proteins=proteins,
                 group_column=group,
                 combine=combine,
             )
@@ -233,23 +235,20 @@ class Confidence:
         "peptide_pairs": "Peptide Pairs",
     }
 
-    def __init__(self, psms_info):
+    def __init__(self, psms_info, proteins=None):
         """Initialize a PsmConfidence object."""
         self._score_column = "score"
         self._target_column = psms_info["target_column"]
         self._protein_column = psms_info["protein_column"]
         self._metadata_column = psms_info["metadata_columns"]
-        self._has_proteins = psms_info["has_proteins"]
 
         self.scores = None
         self.targets = None
         self.qvals = None
         self.peps = None
 
-        if self._has_proteins:
-            self._proteins = self._has_proteins
-        else:
-            self._proteins = None
+        self._proteins = proteins
+
         # This attribute holds the results as DataFrames:
         self.confidence_estimates = {}
         self.decoy_confidence_estimates = {}
@@ -294,11 +293,43 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
             The paths to the saved files.
 
         """
-
+        if level == "proteins":
+            intput_columns = [
+                "mokapot protein group",
+                "best peptide",
+                "stripped sequence",
+                "score",
+                "Label",
+            ]
+            output_columns = sep.join(
+                [
+                    "mokapot protein group",
+                    "best peptide",
+                    "stripped sequence",
+                    "score",
+                    "q-value",
+                    "posterior_error_prob",
+                    "proteinIds",
+                    "\n",
+                ]
+            )
+        else:
+            intput_columns = ["ScanNr", "Peptide", "Proteins"]
+            output_columns = sep.join(
+                [
+                    "PSMId",
+                    "Peptide",
+                    "score",
+                    "q-value",
+                    "posterior_error_prob",
+                    "proteinIds",
+                    "\n",
+                ]
+            )
         reader = read_file_in_chunks(
             file=data_path,
             chunk_size=CONFIDENCE_CHUNK_SIZE,
-            use_cols=["ScanNr", "Peptide", "Proteins"],
+            use_cols=intput_columns,
         )
 
         self.scores = utils.create_chunks(
@@ -314,17 +345,6 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
             self.targets, chunk_size=CONFIDENCE_CHUNK_SIZE
         )
 
-        output_columns = sep.join(
-            [
-                "PSMId",
-                "Peptide",
-                "score",
-                "q-value",
-                "posterior_error_prob",
-                "proteinIds",
-                "\n",
-            ]
-        )
         if file_root is not None:
             dest_dir = Path(dest_dir, file_root)
         outfile_t = str(dest_dir) + f"targets.{level}"
@@ -453,12 +473,13 @@ def __init__(
         dest_dir=None,
         file_root=None,
         decoys=None,
+        proteins=None,
         sep="\t",
         group_column=None,
         combine=False,
     ):
         """Initialize a a LinearPsmConfidence object"""
-        super().__init__(psms_info)
+        super().__init__(psms_info, proteins)
         self._target_column = psms_info["target_column"]
         self._psm_columns = psms_info["spectrum_columns"]
         self._peptide_column = psms_info["peptide_column"]
@@ -496,7 +517,7 @@ def __repr__(self):
             f"{self.accepted['peptides']}\n"
         )
 
-        if self._has_proteins:
+        if self._proteins:
             base += (
                 f"\t- Protein groups at q<={self._eval_fdr:g}: "
                 f"{self.accepted['proteins']}\n"
@@ -555,7 +576,7 @@ def _assign_confidence(
         """
         levels = ["PSMs", "peptides"]
         level_data_path = [psms_path, peptides_path]
-        if self._has_proteins:
+        if self._proteins:
             data = read_file(
                 peptides_path, use_cols=self._metadata_column + ["score"]
             )
@@ -577,9 +598,17 @@ def _assign_confidence(
             LOGGER.info("\t- Found %i unique protein groups.", len(proteins))
 
         for level, data_path in zip(levels, level_data_path):
-            data = read_file(
-                data_path, use_cols=self._metadata_column + ["score"]
-            )
+            if level == "proteins":
+                usecols = [
+                    "mokapot protein group",
+                    "best peptide",
+                    "stripped sequence",
+                    "score",
+                    "Label",
+                ]
+            else:
+                usecols = self._metadata_column + ["score"]
+            data = read_file(data_path, use_cols=usecols)
             data = data.apply(pd.to_numeric, errors="ignore")
             convert_targets_column(
                 data=data, target_column=self._target_column
@@ -800,6 +829,7 @@ def assign_confidence(
     file_root=None,
     sep="\t",
     decoys=False,
+    proteins=None,
     group_column=None,
     combine=False,
 ):
@@ -905,6 +935,7 @@ def assign_confidence(
             file_root=file_root,
             sep=sep,
             decoys=decoys,
+            proteins=proteins,
             group_column=group_column,
             combine=combine,
         )
@@ -919,6 +950,7 @@ def assign_confidence(
             file_root=file_root,
             sep=sep,
             decoys=decoys,
+            proteins=proteins,
             combine=combine,
         )
 
diff --git a/mokapot/mokapot.py b/mokapot/mokapot.py
index 12cbf5fe..5b66fe03 100644
--- a/mokapot/mokapot.py
+++ b/mokapot/mokapot.py
@@ -72,8 +72,8 @@ def main():
             semi=config.semi,
             decoy_prefix=config.decoy_prefix,
         )
-
-        dataset.add_proteins(proteins)
+    else:
+        proteins = None
 
     # Define a model:
     if config.init_weights:
@@ -93,7 +93,6 @@ def main():
             direction=config.direction,
             override=config.override,
         )
-    print(dataset)
 
     # Fit the models:
     psms_info, models, scores, desc = brew(
@@ -114,6 +113,7 @@ def main():
         dest_dir=config.dest_dir,
         file_root=config.file_root,
         decoys=config.keep_decoys,
+        proteins=proteins,
     )
 
     if config.dest_dir is not None:
diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py
index aa7c619b..97459311 100644
--- a/mokapot/parsers/pin.py
+++ b/mokapot/parsers/pin.py
@@ -19,7 +19,6 @@
 # Functions -------------------------------------------------------------------
 def read_pin(
     pin_file,
-    has_proteins=None,
     group_column=None,
     filename_column=None,
     calcmass_column=None,
@@ -100,7 +99,6 @@ def read_pin(
     logging.info("Parsing PSMs...")
     return read_percolator(
         pin_file,
-        has_proteins=has_proteins,
         group_column=group_column,
         filename_column=filename_column,
         calcmass_column=calcmass_column,
@@ -113,7 +111,6 @@ def read_pin(
 
 def read_percolator(
     perc_file,
-    has_proteins=None,
     group_column=None,
     filename_column=None,
     calcmass_column=None,
@@ -229,7 +226,6 @@ def read_percolator(
         "expmass_column": expmass,
         "rt_column": ret_time,
         "charge_column": charge,
-        "has_proteins": has_proteins,
     }
 
 

From f650e72ae87e2ab3e4f214035b125d5ca7002f24 Mon Sep 17 00:00:00 2001
From: samia <samia.benfredj@msaid.de>
Date: Thu, 12 Jan 2023 09:10:28 +0100
Subject: [PATCH 4/4] fix column names for intermediate files

---
 mokapot/confidence.py | 96 +++++++++++++++++--------------------------
 mokapot/utils.py      |  8 +++-
 2 files changed, 43 insertions(+), 61 deletions(-)

diff --git a/mokapot/confidence.py b/mokapot/confidence.py
index deea6f2f..3a565378 100644
--- a/mokapot/confidence.py
+++ b/mokapot/confidence.py
@@ -112,7 +112,7 @@ def __init__(
             tdc_winners = group_df.index.intersection(idx)
             group_psms = group_df.loc[tdc_winners, :]
             group_scores = scores.loc[group_psms.index].values + 1
-            psms_info["file"] = ["group_psms.csv"]
+            psms_info["file"] = "group_psms.csv"
             group_psms.to_csv(psms_info["file"], sep="\t", index=False)
             assign_confidence(
                 psms_info,
@@ -239,7 +239,8 @@ def __init__(self, psms_info, proteins=None):
         """Initialize a PsmConfidence object."""
         self._score_column = "score"
         self._target_column = psms_info["target_column"]
-        self._protein_column = psms_info["protein_column"]
+        self._protein_column = "proteinIds"
+        self._group_column = psms_info["group_column"]
         self._metadata_column = psms_info["metadata_columns"]
 
         self.scores = None
@@ -266,7 +267,9 @@ def levels(self):
         """
         return list(self.confidence_estimates.keys())
 
-    def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
+    def to_txt(
+        self, data_path, columns, level, decoys, file_root, dest_dir, sep
+    ):
         """Save confidence estimates to delimited text files.
         Parameters
         ----------
@@ -293,43 +296,10 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
             The paths to the saved files.
 
         """
-        if level == "proteins":
-            intput_columns = [
-                "mokapot protein group",
-                "best peptide",
-                "stripped sequence",
-                "score",
-                "Label",
-            ]
-            output_columns = sep.join(
-                [
-                    "mokapot protein group",
-                    "best peptide",
-                    "stripped sequence",
-                    "score",
-                    "q-value",
-                    "posterior_error_prob",
-                    "proteinIds",
-                    "\n",
-                ]
-            )
-        else:
-            intput_columns = ["ScanNr", "Peptide", "Proteins"]
-            output_columns = sep.join(
-                [
-                    "PSMId",
-                    "Peptide",
-                    "score",
-                    "q-value",
-                    "posterior_error_prob",
-                    "proteinIds",
-                    "\n",
-                ]
-            )
         reader = read_file_in_chunks(
             file=data_path,
             chunk_size=CONFIDENCE_CHUNK_SIZE,
-            use_cols=intput_columns,
+            use_cols=[i for i in columns if i != self._target_column],
         )
 
         self.scores = utils.create_chunks(
@@ -349,12 +319,18 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
             dest_dir = Path(dest_dir, file_root)
         outfile_t = str(dest_dir) + f"targets.{level}"
         outfile_d = str(dest_dir) + f"decoys.{level}"
+
+        columns.remove(self._target_column)
+        output_columns = columns + ["q-value", "posterior_error_prob"]
+        if level != "proteins" and self._protein_column is not None:
+            output_columns.remove(self._protein_column)
+            output_columns.append(self._protein_column)
         if not os.path.exists(outfile_t):
             with open(outfile_t, "w") as fp:
-                fp.write(output_columns)
+                fp.write(f"{sep.join(output_columns)}\n")
         if decoys and not os.path.exists(outfile_d):
             with open(outfile_d, "w") as fp:
-                fp.write(output_columns)
+                fp.write(f"{sep.join(output_columns)}\n")
 
         for data_in, score_in, qvals_in, pep_in, target_in in zip(
             reader, self.scores, self.qvals, self.peps, self.targets
@@ -483,7 +459,7 @@ def __init__(
         self._target_column = psms_info["target_column"]
         self._psm_columns = psms_info["spectrum_columns"]
         self._peptide_column = psms_info["peptide_column"]
-        self._protein_column = psms_info["protein_column"]
+        self._protein_column = "proteinIds"
         self._eval_fdr = eval_fdr
 
         LOGGER.info("Performing target-decoy competition...")
@@ -577,9 +553,7 @@ def _assign_confidence(
         levels = ["PSMs", "peptides"]
         level_data_path = [psms_path, peptides_path]
         if self._proteins:
-            data = read_file(
-                peptides_path, use_cols=self._metadata_column + ["score"]
-            )
+            data = read_file(peptides_path)
             data = data.apply(pd.to_numeric, errors="ignore")
             convert_targets_column(
                 data=data, target_column=self._target_column
@@ -598,23 +572,15 @@ def _assign_confidence(
             LOGGER.info("\t- Found %i unique protein groups.", len(proteins))
 
         for level, data_path in zip(levels, level_data_path):
-            if level == "proteins":
-                usecols = [
-                    "mokapot protein group",
-                    "best peptide",
-                    "stripped sequence",
-                    "score",
-                    "Label",
-                ]
-            else:
-                usecols = self._metadata_column + ["score"]
-            data = read_file(data_path, use_cols=usecols)
+            data = read_file(data_path)
             data = data.apply(pd.to_numeric, errors="ignore")
+            data_columns = list(data.columns)
             convert_targets_column(
                 data=data, target_column=self._target_column
             )
             self.scores = data.loc[:, self._score_column].values
             self.targets = data.loc[:, self._target_column].astype(bool).values
+            del data
             if all(self.targets):
                 LOGGER.warning(
                     "No decoy PSMs remain for confidence estimation. "
@@ -653,11 +619,23 @@ def _assign_confidence(
             if group_column and not combine:
                 file_root = f"{group_column}."
                 self.to_txt(
-                    data_path, level.lower(), decoys, file_root, dest_dir, sep
+                    data_path,
+                    data_columns,
+                    level.lower(),
+                    decoys,
+                    file_root,
+                    dest_dir,
+                    sep,
                 )
             else:
                 self.to_txt(
-                    data_path, level.lower(), decoys, file_root, dest_dir, sep
+                    data_path,
+                    data_columns,
+                    level.lower(),
+                    decoys,
+                    file_root,
+                    dest_dir,
+                    sep,
                 )
 
     def to_flashlfq(self, out_file="mokapot.flashlfq.txt"):
@@ -898,7 +876,6 @@ def assign_confidence(
                 mode="a",
                 header=None,
             )
-            metadata_columns = list(chunk_metadata.columns)
         psms_path = "psms.csv"
         peptides_path = "peptides.csv"
         iterable_sorted = utils.sort_file_on_disk(
@@ -910,10 +887,11 @@ def assign_confidence(
             "Keeping the best match per %s columns...",
             "+".join(psms_info["spectrum_columns"]),
         )
+        metadata_columns = ["PSMId", "Label", "Peptide", "proteinIds", "score"]
         with open(psms_path, "w") as f_psm:
-            f_psm.write(sep.join(metadata_columns + ["score", "\n"]))
+            f_psm.write(f"{sep.join(metadata_columns)}\n")
         with open(peptides_path, "w") as f_peptide:
-            f_peptide.write(sep.join(metadata_columns + ["score", "\n"]))
+            f_peptide.write(f"{sep.join(metadata_columns)}\n")
 
         unique_psms, unique_peptides = utils.get_unique_psms_and_peptides(
             iterable=iterable_sorted,
diff --git a/mokapot/utils.py b/mokapot/utils.py
index 5bea7b6b..933aa0c8 100644
--- a/mokapot/utils.py
+++ b/mokapot/utils.py
@@ -83,10 +83,14 @@ def get_unique_psms_and_peptides(iterable, out_psms, out_peptides, sep):
         line_hash_peptide = line_list[4]
         if line_hash_psm not in seen_psm:
             seen_psm.add(line_hash_psm)
-            f_psm.write(f"{line}\n")
+            f_psm.write(
+                f"{sep.join([line_list[0], line_list[1], line_list[-3], line_list[-2], line_list[-1]])}\n"
+            )
             if line_hash_peptide not in seen_peptide:
                 seen_peptide.add(line_hash_peptide)
-                f_peptide.write(f"{line}\n")
+                f_peptide.write(
+                    f"{sep.join([line_list[0], line_list[1], line_list[-3], line_list[-2], line_list[-1]])}\n"
+                )
     f_psm.close()
     f_peptide.close()
     return [len(seen_psm), len(seen_peptide)]