From 177dc7a4e8575e34b184db71b01b5d5de9373cb1 Mon Sep 17 00:00:00 2001 From: samia Date: Tue, 10 Jan 2023 16:39:47 +0100 Subject: [PATCH 1/4] fix bug max workers st to 1: convert train_sets generator to list --- mokapot/brew.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mokapot/brew.py b/mokapot/brew.py index 33ad5993..0d12184d 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -127,14 +127,14 @@ def brew( LOGGER.info("Splitting PSMs into %i folds...", folds) test_idx = _split(df_spectra, folds) del df_spectra - train_sets = make_train_sets( - test_idx=test_idx, - subset_max_train=subset_max_train, - data_size=data_size, + train_sets = list( + make_train_sets( + test_idx=test_idx, + subset_max_train=subset_max_train, + data_size=data_size, + ) ) - if max_workers != 1: - # train_sets can't be a generator for joblib :( - train_sets = list(train_sets) + train_psms = parse_in_chunks( psms_info=psms_info, idx=train_sets, From b7fb3d627082aea1af1d189edd5f6e4479067392 Mon Sep 17 00:00:00 2001 From: samia Date: Tue, 10 Jan 2023 11:04:37 +0100 Subject: [PATCH 2/4] single file input: - remove aggregate option from config - refactor parse input file - refactor functions in brew, confidence and dataset files to read only one input file --- mokapot/brew.py | 61 +++++++++++++----------------------------- mokapot/confidence.py | 14 +++------- mokapot/config.py | 15 ----------- mokapot/dataset.py | 33 +++++++---------------- mokapot/mokapot.py | 15 +++-------- mokapot/parsers/pin.py | 44 +++++++++++++++--------------- 6 files changed, 56 insertions(+), 126 deletions(-) diff --git a/mokapot/brew.py b/mokapot/brew.py index 0d12184d..98b01f36 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -86,27 +86,11 @@ def brew( if model is None: model = PercolatorModel() - try: - iter(psms_info) - except TypeError: - psms_info = [psms_info] - - # Check that all of the datasets have the same features: - feat_set = set(psms_info[0]["feature_columns"]) - if not all([set(p["feature_columns"]) == feat_set for p in psms_info]): - raise ValueError("All collections of PSMs must use the same features.") - - target_column = psms_info[0]["target_column"] - spectrum_columns = psms_info[0]["spectrum_columns"] - df_spectra = pd.concat( - [ - read_file( - file_name=p["file"], - use_cols=spectrum_columns + [target_column], - ) - for p in psms_info - ], - ignore_index=True, + target_column = psms_info["target_column"] + spectrum_columns = psms_info["spectrum_columns"] + df_spectra = read_file( + file_name=psms_info["file"], + use_cols=spectrum_columns + [target_column], ).apply(pd.to_numeric, errors="ignore") df_spectra = convert_targets_column(df_spectra, target_column) data_size = len(df_spectra) @@ -120,9 +104,6 @@ def brew( num_targets, num_decoys, ) - # once we check all the datasets have the same features we can use only one metedata information - psms_info[0]["file"] = [psm["file"] for psm in psms_info] - psms_info = psms_info[0] df_spectra = df_spectra[spectrum_columns] LOGGER.info("Splitting PSMs into %i folds...", folds) test_idx = _split(df_spectra, folds) @@ -163,10 +144,10 @@ def brew( elif all([m[0].is_trained for m in models]): # If we don't reset, assign scores to each fold: models = [m for m, _ in models] - scores = [_predict(test_idx, psms_info, models, test_fdr)] + scores = _predict(test_idx, psms_info, models, test_fdr) else: # If model training has failed - scores = [np.zeros(len(p.data)) for p in psms_info] + scores = np.zeros(data_size) # Find which is best: the learned model, the best feature, or # a pretrained model. if type(model) is not list and not model.override: @@ -177,27 +158,21 @@ def brew( else: feat_total = 0 - preds = [ - update_labels(p, s, test_fdr) for p, s in zip([psms_info], scores) - ] - pred_total = sum([(pred == 1).sum() for pred in preds]) + preds = update_labels(psms_info, scores, test_fdr) + + pred_total = sum([(preds == 1).sum()]) # Here, f[0] is the name of the best feature, and f[3] is a boolean if feat_total > pred_total: using_best_feat = True - scores = [] - descs = [] feat, _, desc = best_feats[best_feat_idx] - for dat in psms_info["file"]: - df = pd.read_csv( - dat, - sep="\t", - usecols=[feat], - index_col=False, - on_bad_lines="skip", - ) - scores.append(df.values) - descs.append(desc) + scores = pd.read_csv( + psms_info["file"], + sep="\t", + usecols=[feat], + index_col=False, + on_bad_lines="skip", + ).values else: using_best_feat = False @@ -216,7 +191,7 @@ def brew( "using the original model." ) - return psms_info, models, scores[0], desc + return psms_info, models, scores, desc # Utility Functions ----------------------------------------------------------- diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 5cf3ac19..eefa6a75 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -86,7 +86,7 @@ def __init__( ): """Initialize a GroupedConfidence object""" psms = read_file( - psms_info["file"][0], + psms_info["file"], use_cols=list(psms_info["feature_columns"]) + list(psms_info["metadata_columns"]), ) @@ -112,7 +112,7 @@ def __init__( group_psms = group_df.loc[tdc_winners, :] group_scores = scores.loc[group_psms.index].values + 1 psms_info["file"] = ["group_psms.csv"] - group_psms.to_csv(psms_info["file"][0], sep="\t", index=False) + group_psms.to_csv(psms_info["file"], sep="\t", index=False) assign_confidence( psms_info, group_scores * (2 * desc - 1), @@ -847,17 +847,11 @@ def assign_confidence( if scores is None: feat, _, _, desc = find_best_feature(psms_info, eval_fdr) LOGGER.info("Selected %s as the best feature.", feat) - scores = pd.concat( - [ - read_file(file_name=file, use_cols=[feat]) - for file in psms_info["file"] - ], - ignore_index=True, - ).values + scores = read_file(file_name=psms_info["file"], use_cols=[feat]).values if psms_info["group_column"] is None: reader = read_file_in_chunks( - file=psms_info["file"][0], + file=psms_info["file"], chunk_size=CONFIDENCE_CHUNK_SIZE, use_cols=psms_info["metadata_columns"], ) diff --git a/mokapot/config.py b/mokapot/config.py index 0171f26b..fcc27169 100644 --- a/mokapot/config.py +++ b/mokapot/config.py @@ -48,7 +48,6 @@ def _parser(): parser.add_argument( "psm_files", type=str, - nargs="+", help=( "A collection of PSMs in the Percolator tab-delimited or PepXML " "format." @@ -209,20 +208,6 @@ def _parser(): ), ) - parser.add_argument( - "--aggregate", - default=False, - action="store_true", - help=( - "If used, PSMs from multiple PIN files will be " - "aggregated and analyzed together. Otherwise, " - "a joint model will be trained, but confidence " - "estimates will be calculated separately for " - "each PIN file. This flag only has an effect " - "when multiple PIN files are provided." - ), - ) - parser.add_argument( "--subset_max_train", type=int, diff --git a/mokapot/dataset.py b/mokapot/dataset.py index 0c46bdf8..87b0124f 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -624,16 +624,10 @@ def calibrate_scores(scores, targets, eval_fdr, desc=True): def targets_count_by_feature(psms_info, eval_fdr, columns, desc): - df = pd.concat( - [ - read_file( - file_name=file, use_cols=columns + [psms_info["target_column"]] - ) - for file in psms_info["file"] - ], - ignore_index=True, + df = read_file( + file_name=psms_info["file"], + use_cols=columns + [psms_info["target_column"]], ) - return ( df.loc[:, columns].apply( _update_labels, @@ -668,16 +662,11 @@ def find_best_feature(psms_info, eval_fdr): if num_passing > best_positives: best_positives = num_passing best_feat = feat_idx - df = pd.concat( - [ - read_file( - file_name=file, - use_cols=[best_feat, psms_info["target_column"]], - ) - for file in psms_info["file"] - ], - ignore_index=True, + df = read_file( + file_name=psms_info["file"], + use_cols=[best_feat, psms_info["target_column"]], ) + new_labels = _update_labels( scores=df.loc[:, best_feat], targets=df[psms_info["target_column"]], @@ -693,12 +682,8 @@ def find_best_feature(psms_info, eval_fdr): def update_labels(psms_info, scores, eval_fdr=0.01, desc=True): - df = pd.concat( - [ - read_file(file_name=_file, use_cols=[psms_info["target_column"]]) - for _file in psms_info["file"] - ], - ignore_index=True, + df = read_file( + file_name=psms_info["file"], use_cols=[psms_info["target_column"]] ) return _update_labels( scores=scores, diff --git a/mokapot/mokapot.py b/mokapot/mokapot.py index 22511f14..12cbf5fe 100644 --- a/mokapot/mokapot.py +++ b/mokapot/mokapot.py @@ -57,11 +57,7 @@ def main(): # Parse Datasets parse = get_parser(config) - if config.aggregate or len(config.psm_files) == 1: - datasets = parse(config.psm_files) - else: - datasets = [parse(f) for f in config.psm_files] - prefixes = [Path(f).stem for f in config.psm_files] + dataset = parse(config.psm_files) # Parse FASTA, if required: if config.proteins is not None: @@ -77,11 +73,7 @@ def main(): decoy_prefix=config.decoy_prefix, ) - if config.aggregate or len(config.psm_files) == 1: - datasets.add_proteins(proteins) - else: - for dataset in datasets: - dataset.add_proteins(proteins) + dataset.add_proteins(proteins) # Define a model: if config.init_weights: @@ -101,10 +93,11 @@ def main(): direction=config.direction, override=config.override, ) + print(dataset) # Fit the models: psms_info, models, scores, desc = brew( - datasets, + dataset, model=model, test_fdr=config.test_fdr, folds=config.folds, diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py index 95b575d2..aa7c619b 100644 --- a/mokapot/parsers/pin.py +++ b/mokapot/parsers/pin.py @@ -18,7 +18,7 @@ # Functions ------------------------------------------------------------------- def read_pin( - pin_files, + pin_file, has_proteins=None, group_column=None, filename_column=None, @@ -98,20 +98,17 @@ def read_pin( PSMs from all of the PIN files. """ logging.info("Parsing PSMs...") - return [ - read_percolator( - file, - has_proteins=has_proteins, - group_column=group_column, - filename_column=filename_column, - calcmass_column=calcmass_column, - expmass_column=expmass_column, - rt_column=rt_column, - charge_column=charge_column, - copy_data=copy_data, - ) - for file in utils.tuplize(pin_files) - ] + return read_percolator( + pin_file, + has_proteins=has_proteins, + group_column=group_column, + filename_column=filename_column, + calcmass_column=calcmass_column, + expmass_column=expmass_column, + rt_column=rt_column, + charge_column=charge_column, + copy_data=copy_data, + ) def read_percolator( @@ -334,14 +331,15 @@ def parse_in_chunks(psms_info, idx, chunk_size): list of dataframes """ train_psms = [[] for _ in range(len(idx))] - for file in psms_info["file"]: - reader = read_file_in_chunks( - file=file, chunk_size=chunk_size, use_cols=psms_info["columns"] - ) - Parallel(n_jobs=-1, require="sharedmem")( - delayed(get_rows_from_dataframe)(idx, chunk, train_psms) - for chunk in reader - ) + reader = read_file_in_chunks( + file=psms_info["file"], + chunk_size=chunk_size, + use_cols=psms_info["columns"], + ) + Parallel(n_jobs=-1, require="sharedmem")( + delayed(get_rows_from_dataframe)(idx, chunk, train_psms) + for chunk in reader + ) return Parallel(n_jobs=-1, require="sharedmem")( delayed(concat_chunks)(df=df, orig_idx=orig_idx) From 0baf57081c18a7c800750e230a84aebab949f1a2 Mon Sep 17 00:00:00 2001 From: samia Date: Tue, 10 Jan 2023 16:25:41 +0100 Subject: [PATCH 3/4] fix protein level confidence --- mokapot/confidence.py | 82 +++++++++++++++++++++++++++++------------- mokapot/mokapot.py | 6 ++-- mokapot/parsers/pin.py | 4 --- 3 files changed, 60 insertions(+), 32 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index eefa6a75..deea6f2f 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -82,6 +82,7 @@ def __init__( dest_dir=None, file_root=None, sep="\t", + proteins=None, combine=False, ): """Initialize a GroupedConfidence object""" @@ -122,6 +123,7 @@ def __init__( file_root=file_root, sep=sep, decoys=decoys, + proteins=proteins, group_column=group, combine=combine, ) @@ -233,23 +235,20 @@ class Confidence: "peptide_pairs": "Peptide Pairs", } - def __init__(self, psms_info): + def __init__(self, psms_info, proteins=None): """Initialize a PsmConfidence object.""" self._score_column = "score" self._target_column = psms_info["target_column"] self._protein_column = psms_info["protein_column"] self._metadata_column = psms_info["metadata_columns"] - self._has_proteins = psms_info["has_proteins"] self.scores = None self.targets = None self.qvals = None self.peps = None - if self._has_proteins: - self._proteins = self._has_proteins - else: - self._proteins = None + self._proteins = proteins + # This attribute holds the results as DataFrames: self.confidence_estimates = {} self.decoy_confidence_estimates = {} @@ -294,11 +293,43 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): The paths to the saved files. """ - + if level == "proteins": + intput_columns = [ + "mokapot protein group", + "best peptide", + "stripped sequence", + "score", + "Label", + ] + output_columns = sep.join( + [ + "mokapot protein group", + "best peptide", + "stripped sequence", + "score", + "q-value", + "posterior_error_prob", + "proteinIds", + "\n", + ] + ) + else: + intput_columns = ["ScanNr", "Peptide", "Proteins"] + output_columns = sep.join( + [ + "PSMId", + "Peptide", + "score", + "q-value", + "posterior_error_prob", + "proteinIds", + "\n", + ] + ) reader = read_file_in_chunks( file=data_path, chunk_size=CONFIDENCE_CHUNK_SIZE, - use_cols=["ScanNr", "Peptide", "Proteins"], + use_cols=intput_columns, ) self.scores = utils.create_chunks( @@ -314,17 +345,6 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): self.targets, chunk_size=CONFIDENCE_CHUNK_SIZE ) - output_columns = sep.join( - [ - "PSMId", - "Peptide", - "score", - "q-value", - "posterior_error_prob", - "proteinIds", - "\n", - ] - ) if file_root is not None: dest_dir = Path(dest_dir, file_root) outfile_t = str(dest_dir) + f"targets.{level}" @@ -453,12 +473,13 @@ def __init__( dest_dir=None, file_root=None, decoys=None, + proteins=None, sep="\t", group_column=None, combine=False, ): """Initialize a a LinearPsmConfidence object""" - super().__init__(psms_info) + super().__init__(psms_info, proteins) self._target_column = psms_info["target_column"] self._psm_columns = psms_info["spectrum_columns"] self._peptide_column = psms_info["peptide_column"] @@ -496,7 +517,7 @@ def __repr__(self): f"{self.accepted['peptides']}\n" ) - if self._has_proteins: + if self._proteins: base += ( f"\t- Protein groups at q<={self._eval_fdr:g}: " f"{self.accepted['proteins']}\n" @@ -555,7 +576,7 @@ def _assign_confidence( """ levels = ["PSMs", "peptides"] level_data_path = [psms_path, peptides_path] - if self._has_proteins: + if self._proteins: data = read_file( peptides_path, use_cols=self._metadata_column + ["score"] ) @@ -577,9 +598,17 @@ def _assign_confidence( LOGGER.info("\t- Found %i unique protein groups.", len(proteins)) for level, data_path in zip(levels, level_data_path): - data = read_file( - data_path, use_cols=self._metadata_column + ["score"] - ) + if level == "proteins": + usecols = [ + "mokapot protein group", + "best peptide", + "stripped sequence", + "score", + "Label", + ] + else: + usecols = self._metadata_column + ["score"] + data = read_file(data_path, use_cols=usecols) data = data.apply(pd.to_numeric, errors="ignore") convert_targets_column( data=data, target_column=self._target_column @@ -800,6 +829,7 @@ def assign_confidence( file_root=None, sep="\t", decoys=False, + proteins=None, group_column=None, combine=False, ): @@ -905,6 +935,7 @@ def assign_confidence( file_root=file_root, sep=sep, decoys=decoys, + proteins=proteins, group_column=group_column, combine=combine, ) @@ -919,6 +950,7 @@ def assign_confidence( file_root=file_root, sep=sep, decoys=decoys, + proteins=proteins, combine=combine, ) diff --git a/mokapot/mokapot.py b/mokapot/mokapot.py index 12cbf5fe..5b66fe03 100644 --- a/mokapot/mokapot.py +++ b/mokapot/mokapot.py @@ -72,8 +72,8 @@ def main(): semi=config.semi, decoy_prefix=config.decoy_prefix, ) - - dataset.add_proteins(proteins) + else: + proteins = None # Define a model: if config.init_weights: @@ -93,7 +93,6 @@ def main(): direction=config.direction, override=config.override, ) - print(dataset) # Fit the models: psms_info, models, scores, desc = brew( @@ -114,6 +113,7 @@ def main(): dest_dir=config.dest_dir, file_root=config.file_root, decoys=config.keep_decoys, + proteins=proteins, ) if config.dest_dir is not None: diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py index aa7c619b..97459311 100644 --- a/mokapot/parsers/pin.py +++ b/mokapot/parsers/pin.py @@ -19,7 +19,6 @@ # Functions ------------------------------------------------------------------- def read_pin( pin_file, - has_proteins=None, group_column=None, filename_column=None, calcmass_column=None, @@ -100,7 +99,6 @@ def read_pin( logging.info("Parsing PSMs...") return read_percolator( pin_file, - has_proteins=has_proteins, group_column=group_column, filename_column=filename_column, calcmass_column=calcmass_column, @@ -113,7 +111,6 @@ def read_pin( def read_percolator( perc_file, - has_proteins=None, group_column=None, filename_column=None, calcmass_column=None, @@ -229,7 +226,6 @@ def read_percolator( "expmass_column": expmass, "rt_column": ret_time, "charge_column": charge, - "has_proteins": has_proteins, } From f650e72ae87e2ab3e4f214035b125d5ca7002f24 Mon Sep 17 00:00:00 2001 From: samia Date: Thu, 12 Jan 2023 09:10:28 +0100 Subject: [PATCH 4/4] fix column names for intermediate files --- mokapot/confidence.py | 96 +++++++++++++++++-------------------------- mokapot/utils.py | 8 +++- 2 files changed, 43 insertions(+), 61 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index deea6f2f..3a565378 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -112,7 +112,7 @@ def __init__( tdc_winners = group_df.index.intersection(idx) group_psms = group_df.loc[tdc_winners, :] group_scores = scores.loc[group_psms.index].values + 1 - psms_info["file"] = ["group_psms.csv"] + psms_info["file"] = "group_psms.csv" group_psms.to_csv(psms_info["file"], sep="\t", index=False) assign_confidence( psms_info, @@ -239,7 +239,8 @@ def __init__(self, psms_info, proteins=None): """Initialize a PsmConfidence object.""" self._score_column = "score" self._target_column = psms_info["target_column"] - self._protein_column = psms_info["protein_column"] + self._protein_column = "proteinIds" + self._group_column = psms_info["group_column"] self._metadata_column = psms_info["metadata_columns"] self.scores = None @@ -266,7 +267,9 @@ def levels(self): """ return list(self.confidence_estimates.keys()) - def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): + def to_txt( + self, data_path, columns, level, decoys, file_root, dest_dir, sep + ): """Save confidence estimates to delimited text files. Parameters ---------- @@ -293,43 +296,10 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): The paths to the saved files. """ - if level == "proteins": - intput_columns = [ - "mokapot protein group", - "best peptide", - "stripped sequence", - "score", - "Label", - ] - output_columns = sep.join( - [ - "mokapot protein group", - "best peptide", - "stripped sequence", - "score", - "q-value", - "posterior_error_prob", - "proteinIds", - "\n", - ] - ) - else: - intput_columns = ["ScanNr", "Peptide", "Proteins"] - output_columns = sep.join( - [ - "PSMId", - "Peptide", - "score", - "q-value", - "posterior_error_prob", - "proteinIds", - "\n", - ] - ) reader = read_file_in_chunks( file=data_path, chunk_size=CONFIDENCE_CHUNK_SIZE, - use_cols=intput_columns, + use_cols=[i for i in columns if i != self._target_column], ) self.scores = utils.create_chunks( @@ -349,12 +319,18 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): dest_dir = Path(dest_dir, file_root) outfile_t = str(dest_dir) + f"targets.{level}" outfile_d = str(dest_dir) + f"decoys.{level}" + + columns.remove(self._target_column) + output_columns = columns + ["q-value", "posterior_error_prob"] + if level != "proteins" and self._protein_column is not None: + output_columns.remove(self._protein_column) + output_columns.append(self._protein_column) if not os.path.exists(outfile_t): with open(outfile_t, "w") as fp: - fp.write(output_columns) + fp.write(f"{sep.join(output_columns)}\n") if decoys and not os.path.exists(outfile_d): with open(outfile_d, "w") as fp: - fp.write(output_columns) + fp.write(f"{sep.join(output_columns)}\n") for data_in, score_in, qvals_in, pep_in, target_in in zip( reader, self.scores, self.qvals, self.peps, self.targets @@ -483,7 +459,7 @@ def __init__( self._target_column = psms_info["target_column"] self._psm_columns = psms_info["spectrum_columns"] self._peptide_column = psms_info["peptide_column"] - self._protein_column = psms_info["protein_column"] + self._protein_column = "proteinIds" self._eval_fdr = eval_fdr LOGGER.info("Performing target-decoy competition...") @@ -577,9 +553,7 @@ def _assign_confidence( levels = ["PSMs", "peptides"] level_data_path = [psms_path, peptides_path] if self._proteins: - data = read_file( - peptides_path, use_cols=self._metadata_column + ["score"] - ) + data = read_file(peptides_path) data = data.apply(pd.to_numeric, errors="ignore") convert_targets_column( data=data, target_column=self._target_column @@ -598,23 +572,15 @@ def _assign_confidence( LOGGER.info("\t- Found %i unique protein groups.", len(proteins)) for level, data_path in zip(levels, level_data_path): - if level == "proteins": - usecols = [ - "mokapot protein group", - "best peptide", - "stripped sequence", - "score", - "Label", - ] - else: - usecols = self._metadata_column + ["score"] - data = read_file(data_path, use_cols=usecols) + data = read_file(data_path) data = data.apply(pd.to_numeric, errors="ignore") + data_columns = list(data.columns) convert_targets_column( data=data, target_column=self._target_column ) self.scores = data.loc[:, self._score_column].values self.targets = data.loc[:, self._target_column].astype(bool).values + del data if all(self.targets): LOGGER.warning( "No decoy PSMs remain for confidence estimation. " @@ -653,11 +619,23 @@ def _assign_confidence( if group_column and not combine: file_root = f"{group_column}." self.to_txt( - data_path, level.lower(), decoys, file_root, dest_dir, sep + data_path, + data_columns, + level.lower(), + decoys, + file_root, + dest_dir, + sep, ) else: self.to_txt( - data_path, level.lower(), decoys, file_root, dest_dir, sep + data_path, + data_columns, + level.lower(), + decoys, + file_root, + dest_dir, + sep, ) def to_flashlfq(self, out_file="mokapot.flashlfq.txt"): @@ -898,7 +876,6 @@ def assign_confidence( mode="a", header=None, ) - metadata_columns = list(chunk_metadata.columns) psms_path = "psms.csv" peptides_path = "peptides.csv" iterable_sorted = utils.sort_file_on_disk( @@ -910,10 +887,11 @@ def assign_confidence( "Keeping the best match per %s columns...", "+".join(psms_info["spectrum_columns"]), ) + metadata_columns = ["PSMId", "Label", "Peptide", "proteinIds", "score"] with open(psms_path, "w") as f_psm: - f_psm.write(sep.join(metadata_columns + ["score", "\n"])) + f_psm.write(f"{sep.join(metadata_columns)}\n") with open(peptides_path, "w") as f_peptide: - f_peptide.write(sep.join(metadata_columns + ["score", "\n"])) + f_peptide.write(f"{sep.join(metadata_columns)}\n") unique_psms, unique_peptides = utils.get_unique_psms_and_peptides( iterable=iterable_sorted, diff --git a/mokapot/utils.py b/mokapot/utils.py index 5bea7b6b..933aa0c8 100644 --- a/mokapot/utils.py +++ b/mokapot/utils.py @@ -83,10 +83,14 @@ def get_unique_psms_and_peptides(iterable, out_psms, out_peptides, sep): line_hash_peptide = line_list[4] if line_hash_psm not in seen_psm: seen_psm.add(line_hash_psm) - f_psm.write(f"{line}\n") + f_psm.write( + f"{sep.join([line_list[0], line_list[1], line_list[-3], line_list[-2], line_list[-1]])}\n" + ) if line_hash_peptide not in seen_peptide: seen_peptide.add(line_hash_peptide) - f_peptide.write(f"{line}\n") + f_peptide.write( + f"{sep.join([line_list[0], line_list[1], line_list[-3], line_list[-2], line_list[-1]])}\n" + ) f_psm.close() f_peptide.close() return [len(seen_psm), len(seen_peptide)]