diff --git a/mokapot/brew.py b/mokapot/brew.py
index 0d12184d..98b01f36 100644
--- a/mokapot/brew.py
+++ b/mokapot/brew.py
@@ -86,27 +86,11 @@ def brew(
     if model is None:
         model = PercolatorModel()
 
-    try:
-        iter(psms_info)
-    except TypeError:
-        psms_info = [psms_info]
-
-    # Check that all of the datasets have the same features:
-    feat_set = set(psms_info[0]["feature_columns"])
-    if not all([set(p["feature_columns"]) == feat_set for p in psms_info]):
-        raise ValueError("All collections of PSMs must use the same features.")
-
-    target_column = psms_info[0]["target_column"]
-    spectrum_columns = psms_info[0]["spectrum_columns"]
-    df_spectra = pd.concat(
-        [
-            read_file(
-                file_name=p["file"],
-                use_cols=spectrum_columns + [target_column],
-            )
-            for p in psms_info
-        ],
-        ignore_index=True,
+    target_column = psms_info["target_column"]
+    spectrum_columns = psms_info["spectrum_columns"]
+    df_spectra = read_file(
+        file_name=psms_info["file"],
+        use_cols=spectrum_columns + [target_column],
     ).apply(pd.to_numeric, errors="ignore")
     df_spectra = convert_targets_column(df_spectra, target_column)
     data_size = len(df_spectra)
@@ -120,9 +104,6 @@ def brew(
         num_targets,
         num_decoys,
     )
-    # once we check all the datasets have the same features we can use only one metedata information
-    psms_info[0]["file"] = [psm["file"] for psm in psms_info]
-    psms_info = psms_info[0]
     df_spectra = df_spectra[spectrum_columns]
     LOGGER.info("Splitting PSMs into %i folds...", folds)
     test_idx = _split(df_spectra, folds)
@@ -163,10 +144,10 @@ def brew(
     elif all([m[0].is_trained for m in models]):
         # If we don't reset, assign scores to each fold:
         models = [m for m, _ in models]
-        scores = [_predict(test_idx, psms_info, models, test_fdr)]
+        scores = _predict(test_idx, psms_info, models, test_fdr)
     else:
         # If model training has failed
-        scores = [np.zeros(len(p.data)) for p in psms_info]
+        scores = np.zeros(data_size)
     # Find which is best: the learned model, the best feature, or
     # a pretrained model.
     if type(model) is not list and not model.override:
@@ -177,27 +158,21 @@ def brew(
     else:
         feat_total = 0
 
-    preds = [
-        update_labels(p, s, test_fdr) for p, s in zip([psms_info], scores)
-    ]
-    pred_total = sum([(pred == 1).sum() for pred in preds])
+    preds = update_labels(psms_info, scores, test_fdr)
+
+    pred_total = (preds == 1).sum()
 
     # Here, f[0] is the name of the best feature, and f[3] is a boolean
     if feat_total > pred_total:
         using_best_feat = True
-        scores = []
-        descs = []
         feat, _, desc = best_feats[best_feat_idx]
-        for dat in psms_info["file"]:
-            df = pd.read_csv(
-                dat,
-                sep="\t",
-                usecols=[feat],
-                index_col=False,
-                on_bad_lines="skip",
-            )
-            scores.append(df.values)
-            descs.append(desc)
+        scores = pd.read_csv(
+            psms_info["file"],
+            sep="\t",
+            usecols=[feat],
+            index_col=False,
+            on_bad_lines="skip",
+        ).values
     else:
         using_best_feat = False
@@ -216,7 +191,7 @@ def brew(
                 "using the original model."
) - return psms_info, models, scores[0], desc + return psms_info, models, scores, desc # Utility Functions ----------------------------------------------------------- diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 5cf3ac19..3a565378 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -82,11 +82,12 @@ def __init__( dest_dir=None, file_root=None, sep="\t", + proteins=None, combine=False, ): """Initialize a GroupedConfidence object""" psms = read_file( - psms_info["file"][0], + psms_info["file"], use_cols=list(psms_info["feature_columns"]) + list(psms_info["metadata_columns"]), ) @@ -111,8 +112,8 @@ def __init__( tdc_winners = group_df.index.intersection(idx) group_psms = group_df.loc[tdc_winners, :] group_scores = scores.loc[group_psms.index].values + 1 - psms_info["file"] = ["group_psms.csv"] - group_psms.to_csv(psms_info["file"][0], sep="\t", index=False) + psms_info["file"] = "group_psms.csv" + group_psms.to_csv(psms_info["file"], sep="\t", index=False) assign_confidence( psms_info, group_scores * (2 * desc - 1), @@ -122,6 +123,7 @@ def __init__( file_root=file_root, sep=sep, decoys=decoys, + proteins=proteins, group_column=group, combine=combine, ) @@ -233,23 +235,21 @@ class Confidence: "peptide_pairs": "Peptide Pairs", } - def __init__(self, psms_info): + def __init__(self, psms_info, proteins=None): """Initialize a PsmConfidence object.""" self._score_column = "score" self._target_column = psms_info["target_column"] - self._protein_column = psms_info["protein_column"] + self._protein_column = "proteinIds" + self._group_column = psms_info["group_column"] self._metadata_column = psms_info["metadata_columns"] - self._has_proteins = psms_info["has_proteins"] self.scores = None self.targets = None self.qvals = None self.peps = None - if self._has_proteins: - self._proteins = self._has_proteins - else: - self._proteins = None + self._proteins = proteins + # This attribute holds the results as DataFrames: self.confidence_estimates = {} self.decoy_confidence_estimates = {} @@ -267,7 +267,9 @@ def levels(self): """ return list(self.confidence_estimates.keys()) - def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): + def to_txt( + self, data_path, columns, level, decoys, file_root, dest_dir, sep + ): """Save confidence estimates to delimited text files. Parameters ---------- @@ -294,11 +296,10 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): The paths to the saved files. 
""" - reader = read_file_in_chunks( file=data_path, chunk_size=CONFIDENCE_CHUNK_SIZE, - use_cols=["ScanNr", "Peptide", "Proteins"], + use_cols=[i for i in columns if i != self._target_column], ) self.scores = utils.create_chunks( @@ -314,27 +315,22 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep): self.targets, chunk_size=CONFIDENCE_CHUNK_SIZE ) - output_columns = sep.join( - [ - "PSMId", - "Peptide", - "score", - "q-value", - "posterior_error_prob", - "proteinIds", - "\n", - ] - ) if file_root is not None: dest_dir = Path(dest_dir, file_root) outfile_t = str(dest_dir) + f"targets.{level}" outfile_d = str(dest_dir) + f"decoys.{level}" + + columns.remove(self._target_column) + output_columns = columns + ["q-value", "posterior_error_prob"] + if level != "proteins" and self._protein_column is not None: + output_columns.remove(self._protein_column) + output_columns.append(self._protein_column) if not os.path.exists(outfile_t): with open(outfile_t, "w") as fp: - fp.write(output_columns) + fp.write(f"{sep.join(output_columns)}\n") if decoys and not os.path.exists(outfile_d): with open(outfile_d, "w") as fp: - fp.write(output_columns) + fp.write(f"{sep.join(output_columns)}\n") for data_in, score_in, qvals_in, pep_in, target_in in zip( reader, self.scores, self.qvals, self.peps, self.targets @@ -453,16 +449,17 @@ def __init__( dest_dir=None, file_root=None, decoys=None, + proteins=None, sep="\t", group_column=None, combine=False, ): """Initialize a a LinearPsmConfidence object""" - super().__init__(psms_info) + super().__init__(psms_info, proteins) self._target_column = psms_info["target_column"] self._psm_columns = psms_info["spectrum_columns"] self._peptide_column = psms_info["peptide_column"] - self._protein_column = psms_info["protein_column"] + self._protein_column = "proteinIds" self._eval_fdr = eval_fdr LOGGER.info("Performing target-decoy competition...") @@ -496,7 +493,7 @@ def __repr__(self): f"{self.accepted['peptides']}\n" ) - if self._has_proteins: + if self._proteins: base += ( f"\t- Protein groups at q<={self._eval_fdr:g}: " f"{self.accepted['proteins']}\n" @@ -555,10 +552,8 @@ def _assign_confidence( """ levels = ["PSMs", "peptides"] level_data_path = [psms_path, peptides_path] - if self._has_proteins: - data = read_file( - peptides_path, use_cols=self._metadata_column + ["score"] - ) + if self._proteins: + data = read_file(peptides_path) data = data.apply(pd.to_numeric, errors="ignore") convert_targets_column( data=data, target_column=self._target_column @@ -577,15 +572,15 @@ def _assign_confidence( LOGGER.info("\t- Found %i unique protein groups.", len(proteins)) for level, data_path in zip(levels, level_data_path): - data = read_file( - data_path, use_cols=self._metadata_column + ["score"] - ) + data = read_file(data_path) data = data.apply(pd.to_numeric, errors="ignore") + data_columns = list(data.columns) convert_targets_column( data=data, target_column=self._target_column ) self.scores = data.loc[:, self._score_column].values self.targets = data.loc[:, self._target_column].astype(bool).values + del data if all(self.targets): LOGGER.warning( "No decoy PSMs remain for confidence estimation. " @@ -624,11 +619,23 @@ def _assign_confidence( if group_column and not combine: file_root = f"{group_column}." 
self.to_txt( - data_path, level.lower(), decoys, file_root, dest_dir, sep + data_path, + data_columns, + level.lower(), + decoys, + file_root, + dest_dir, + sep, ) else: self.to_txt( - data_path, level.lower(), decoys, file_root, dest_dir, sep + data_path, + data_columns, + level.lower(), + decoys, + file_root, + dest_dir, + sep, ) def to_flashlfq(self, out_file="mokapot.flashlfq.txt"): @@ -800,6 +807,7 @@ def assign_confidence( file_root=None, sep="\t", decoys=False, + proteins=None, group_column=None, combine=False, ): @@ -847,17 +855,11 @@ def assign_confidence( if scores is None: feat, _, _, desc = find_best_feature(psms_info, eval_fdr) LOGGER.info("Selected %s as the best feature.", feat) - scores = pd.concat( - [ - read_file(file_name=file, use_cols=[feat]) - for file in psms_info["file"] - ], - ignore_index=True, - ).values + scores = read_file(file_name=psms_info["file"], use_cols=[feat]).values if psms_info["group_column"] is None: reader = read_file_in_chunks( - file=psms_info["file"][0], + file=psms_info["file"], chunk_size=CONFIDENCE_CHUNK_SIZE, use_cols=psms_info["metadata_columns"], ) @@ -874,7 +876,6 @@ def assign_confidence( mode="a", header=None, ) - metadata_columns = list(chunk_metadata.columns) psms_path = "psms.csv" peptides_path = "peptides.csv" iterable_sorted = utils.sort_file_on_disk( @@ -886,10 +887,11 @@ def assign_confidence( "Keeping the best match per %s columns...", "+".join(psms_info["spectrum_columns"]), ) + metadata_columns = ["PSMId", "Label", "Peptide", "proteinIds", "score"] with open(psms_path, "w") as f_psm: - f_psm.write(sep.join(metadata_columns + ["score", "\n"])) + f_psm.write(f"{sep.join(metadata_columns)}\n") with open(peptides_path, "w") as f_peptide: - f_peptide.write(sep.join(metadata_columns + ["score", "\n"])) + f_peptide.write(f"{sep.join(metadata_columns)}\n") unique_psms, unique_peptides = utils.get_unique_psms_and_peptides( iterable=iterable_sorted, @@ -911,6 +913,7 @@ def assign_confidence( file_root=file_root, sep=sep, decoys=decoys, + proteins=proteins, group_column=group_column, combine=combine, ) @@ -925,6 +928,7 @@ def assign_confidence( file_root=file_root, sep=sep, decoys=decoys, + proteins=proteins, combine=combine, ) diff --git a/mokapot/config.py b/mokapot/config.py index 0171f26b..fcc27169 100644 --- a/mokapot/config.py +++ b/mokapot/config.py @@ -48,7 +48,6 @@ def _parser(): parser.add_argument( "psm_files", type=str, - nargs="+", help=( "A collection of PSMs in the Percolator tab-delimited or PepXML " "format." @@ -209,20 +208,6 @@ def _parser(): ), ) - parser.add_argument( - "--aggregate", - default=False, - action="store_true", - help=( - "If used, PSMs from multiple PIN files will be " - "aggregated and analyzed together. Otherwise, " - "a joint model will be trained, but confidence " - "estimates will be calculated separately for " - "each PIN file. This flag only has an effect " - "when multiple PIN files are provided." 
- ), - ) - parser.add_argument( "--subset_max_train", type=int, diff --git a/mokapot/dataset.py b/mokapot/dataset.py index 0c46bdf8..87b0124f 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -624,16 +624,10 @@ def calibrate_scores(scores, targets, eval_fdr, desc=True): def targets_count_by_feature(psms_info, eval_fdr, columns, desc): - df = pd.concat( - [ - read_file( - file_name=file, use_cols=columns + [psms_info["target_column"]] - ) - for file in psms_info["file"] - ], - ignore_index=True, + df = read_file( + file_name=psms_info["file"], + use_cols=columns + [psms_info["target_column"]], ) - return ( df.loc[:, columns].apply( _update_labels, @@ -668,16 +662,11 @@ def find_best_feature(psms_info, eval_fdr): if num_passing > best_positives: best_positives = num_passing best_feat = feat_idx - df = pd.concat( - [ - read_file( - file_name=file, - use_cols=[best_feat, psms_info["target_column"]], - ) - for file in psms_info["file"] - ], - ignore_index=True, + df = read_file( + file_name=psms_info["file"], + use_cols=[best_feat, psms_info["target_column"]], ) + new_labels = _update_labels( scores=df.loc[:, best_feat], targets=df[psms_info["target_column"]], @@ -693,12 +682,8 @@ def find_best_feature(psms_info, eval_fdr): def update_labels(psms_info, scores, eval_fdr=0.01, desc=True): - df = pd.concat( - [ - read_file(file_name=_file, use_cols=[psms_info["target_column"]]) - for _file in psms_info["file"] - ], - ignore_index=True, + df = read_file( + file_name=psms_info["file"], use_cols=[psms_info["target_column"]] ) return _update_labels( scores=scores, diff --git a/mokapot/mokapot.py b/mokapot/mokapot.py index 22511f14..5b66fe03 100644 --- a/mokapot/mokapot.py +++ b/mokapot/mokapot.py @@ -57,11 +57,7 @@ def main(): # Parse Datasets parse = get_parser(config) - if config.aggregate or len(config.psm_files) == 1: - datasets = parse(config.psm_files) - else: - datasets = [parse(f) for f in config.psm_files] - prefixes = [Path(f).stem for f in config.psm_files] + dataset = parse(config.psm_files) # Parse FASTA, if required: if config.proteins is not None: @@ -76,12 +72,8 @@ def main(): semi=config.semi, decoy_prefix=config.decoy_prefix, ) - - if config.aggregate or len(config.psm_files) == 1: - datasets.add_proteins(proteins) - else: - for dataset in datasets: - dataset.add_proteins(proteins) + else: + proteins = None # Define a model: if config.init_weights: @@ -104,7 +96,7 @@ def main(): # Fit the models: psms_info, models, scores, desc = brew( - datasets, + dataset, model=model, test_fdr=config.test_fdr, folds=config.folds, @@ -121,6 +113,7 @@ def main(): dest_dir=config.dest_dir, file_root=config.file_root, decoys=config.keep_decoys, + proteins=proteins, ) if config.dest_dir is not None: diff --git a/mokapot/parsers/pin.py b/mokapot/parsers/pin.py index 95b575d2..97459311 100644 --- a/mokapot/parsers/pin.py +++ b/mokapot/parsers/pin.py @@ -18,8 +18,7 @@ # Functions ------------------------------------------------------------------- def read_pin( - pin_files, - has_proteins=None, + pin_file, group_column=None, filename_column=None, calcmass_column=None, @@ -98,25 +97,20 @@ def read_pin( PSMs from all of the PIN files. 
""" logging.info("Parsing PSMs...") - return [ - read_percolator( - file, - has_proteins=has_proteins, - group_column=group_column, - filename_column=filename_column, - calcmass_column=calcmass_column, - expmass_column=expmass_column, - rt_column=rt_column, - charge_column=charge_column, - copy_data=copy_data, - ) - for file in utils.tuplize(pin_files) - ] + return read_percolator( + pin_file, + group_column=group_column, + filename_column=filename_column, + calcmass_column=calcmass_column, + expmass_column=expmass_column, + rt_column=rt_column, + charge_column=charge_column, + copy_data=copy_data, + ) def read_percolator( perc_file, - has_proteins=None, group_column=None, filename_column=None, calcmass_column=None, @@ -232,7 +226,6 @@ def read_percolator( "expmass_column": expmass, "rt_column": ret_time, "charge_column": charge, - "has_proteins": has_proteins, } @@ -334,14 +327,15 @@ def parse_in_chunks(psms_info, idx, chunk_size): list of dataframes """ train_psms = [[] for _ in range(len(idx))] - for file in psms_info["file"]: - reader = read_file_in_chunks( - file=file, chunk_size=chunk_size, use_cols=psms_info["columns"] - ) - Parallel(n_jobs=-1, require="sharedmem")( - delayed(get_rows_from_dataframe)(idx, chunk, train_psms) - for chunk in reader - ) + reader = read_file_in_chunks( + file=psms_info["file"], + chunk_size=chunk_size, + use_cols=psms_info["columns"], + ) + Parallel(n_jobs=-1, require="sharedmem")( + delayed(get_rows_from_dataframe)(idx, chunk, train_psms) + for chunk in reader + ) return Parallel(n_jobs=-1, require="sharedmem")( delayed(concat_chunks)(df=df, orig_idx=orig_idx) diff --git a/mokapot/utils.py b/mokapot/utils.py index 5bea7b6b..933aa0c8 100644 --- a/mokapot/utils.py +++ b/mokapot/utils.py @@ -83,10 +83,14 @@ def get_unique_psms_and_peptides(iterable, out_psms, out_peptides, sep): line_hash_peptide = line_list[4] if line_hash_psm not in seen_psm: seen_psm.add(line_hash_psm) - f_psm.write(f"{line}\n") + f_psm.write( + f"{sep.join([line_list[0], line_list[1], line_list[-3], line_list[-2], line_list[-1]])}\n" + ) if line_hash_peptide not in seen_peptide: seen_peptide.add(line_hash_peptide) - f_peptide.write(f"{line}\n") + f_peptide.write( + f"{sep.join([line_list[0], line_list[1], line_list[-3], line_list[-2], line_list[-1]])}\n" + ) f_psm.close() f_peptide.close() return [len(seen_psm), len(seen_peptide)]