Merge branch 'feature/single_input_file' into 'develop'
single file input

Closes #32

See merge request msaid/chimerys/mokapot!30
gessulat committed Jan 12, 2023
2 parents 02bb7a3 + f650e72 commit 13c6669
Showing 7 changed files with 111 additions and 171 deletions.
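The substance of the change is visible in the two diffs below: brew() and assign_confidence() previously accepted a list of PSM collections (wrapping a lone collection via the iter()/TypeError idiom and checking that every collection shared one feature set), and now operate on a single psms_info dictionary whose "file" entry is one path instead of a list. A minimal before/after sketch of the calling convention; the dictionary keys follow the diff, but the path and column names are hypothetical:

    import mokapot

    # Hypothetical metadata for one input file, keyed as in the diff.
    psms_info = {
        "file": "sample.pin",                # a single path, no longer a list
        "target_column": "Label",
        "spectrum_columns": ["ScanNr"],
        "feature_columns": ["lnExpect", "deltaCn"],
    }

    # Before: brew([psms_info_a, psms_info_b]) returned a list of score arrays.
    # After: one dict in, one numpy array of scores out (note `scores`, not
    # `scores[0]`, in the new return statement).
    # psms_info, models, scores, desc = mokapot.brew(psms_info)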
61 changes: 18 additions & 43 deletions mokapot/brew.py
@@ -86,27 +86,11 @@ def brew(
     if model is None:
         model = PercolatorModel()
 
-    try:
-        iter(psms_info)
-    except TypeError:
-        psms_info = [psms_info]
-
-    # Check that all of the datasets have the same features:
-    feat_set = set(psms_info[0]["feature_columns"])
-    if not all([set(p["feature_columns"]) == feat_set for p in psms_info]):
-        raise ValueError("All collections of PSMs must use the same features.")
-
-    target_column = psms_info[0]["target_column"]
-    spectrum_columns = psms_info[0]["spectrum_columns"]
-    df_spectra = pd.concat(
-        [
-            read_file(
-                file_name=p["file"],
-                use_cols=spectrum_columns + [target_column],
-            )
-            for p in psms_info
-        ],
-        ignore_index=True,
+    target_column = psms_info["target_column"]
+    spectrum_columns = psms_info["spectrum_columns"]
+    df_spectra = read_file(
+        file_name=psms_info["file"],
+        use_cols=spectrum_columns + [target_column],
     ).apply(pd.to_numeric, errors="ignore")
     df_spectra = convert_targets_column(df_spectra, target_column)
     data_size = len(df_spectra)
@@ -120,9 +104,6 @@ def brew(
         num_targets,
         num_decoys,
     )
-    # once we check all the datasets have the same features we can use only one metedata information
-    psms_info[0]["file"] = [psm["file"] for psm in psms_info]
-    psms_info = psms_info[0]
     df_spectra = df_spectra[spectrum_columns]
     LOGGER.info("Splitting PSMs into %i folds...", folds)
     test_idx = _split(df_spectra, folds)
@@ -163,10 +144,10 @@ def brew(
     elif all([m[0].is_trained for m in models]):
         # If we don't reset, assign scores to each fold:
         models = [m for m, _ in models]
-        scores = [_predict(test_idx, psms_info, models, test_fdr)]
+        scores = _predict(test_idx, psms_info, models, test_fdr)
     else:
         # If model training has failed
-        scores = [np.zeros(len(p.data)) for p in psms_info]
+        scores = np.zeros(data_size)
     # Find which is best: the learned model, the best feature, or
     # a pretrained model.
     if type(model) is not list and not model.override:
@@ -177,27 +158,21 @@ def brew(
     else:
         feat_total = 0
 
-    preds = [
-        update_labels(p, s, test_fdr) for p, s in zip([psms_info], scores)
-    ]
-    pred_total = sum([(pred == 1).sum() for pred in preds])
+    preds = update_labels(psms_info, scores, test_fdr)
+
+    pred_total = sum([(preds == 1).sum()])
 
     # Here, f[0] is the name of the best feature, and f[3] is a boolean
     if feat_total > pred_total:
         using_best_feat = True
-        scores = []
-        descs = []
         feat, _, desc = best_feats[best_feat_idx]
-        for dat in psms_info["file"]:
-            df = pd.read_csv(
-                dat,
-                sep="\t",
-                usecols=[feat],
-                index_col=False,
-                on_bad_lines="skip",
-            )
-            scores.append(df.values)
-            descs.append(desc)
+        scores = pd.read_csv(
+            psms_info["file"],
+            sep="\t",
+            usecols=[feat],
+            index_col=False,
+            on_bad_lines="skip",
+        ).values
 
     else:
         using_best_feat = False
@@ -216,7 +191,7 @@ def brew(
             "using the original model."
         )
 
-    return psms_info, models, scores[0], desc
+    return psms_info, models, scores, desc
 
 
 # Utility Functions -----------------------------------------------------------
102 changes: 53 additions & 49 deletions mokapot/confidence.py
@@ -82,11 +82,12 @@ def __init__(
         dest_dir=None,
         file_root=None,
         sep="\t",
+        proteins=None,
         combine=False,
     ):
         """Initialize a GroupedConfidence object"""
         psms = read_file(
-            psms_info["file"][0],
+            psms_info["file"],
             use_cols=list(psms_info["feature_columns"])
             + list(psms_info["metadata_columns"]),
         )
@@ -111,8 +112,8 @@ def __init__(
             tdc_winners = group_df.index.intersection(idx)
             group_psms = group_df.loc[tdc_winners, :]
             group_scores = scores.loc[group_psms.index].values + 1
-            psms_info["file"] = ["group_psms.csv"]
-            group_psms.to_csv(psms_info["file"][0], sep="\t", index=False)
+            psms_info["file"] = "group_psms.csv"
+            group_psms.to_csv(psms_info["file"], sep="\t", index=False)
             assign_confidence(
                 psms_info,
                 group_scores * (2 * desc - 1),
@@ -122,6 +123,7 @@
                 file_root=file_root,
                 sep=sep,
                 decoys=decoys,
+                proteins=proteins,
                 group_column=group,
                 combine=combine,
             )
@@ -233,23 +235,21 @@ class Confidence:
         "peptide_pairs": "Peptide Pairs",
     }
 
-    def __init__(self, psms_info):
+    def __init__(self, psms_info, proteins=None):
         """Initialize a PsmConfidence object."""
         self._score_column = "score"
         self._target_column = psms_info["target_column"]
-        self._protein_column = psms_info["protein_column"]
+        self._protein_column = "proteinIds"
         self._group_column = psms_info["group_column"]
         self._metadata_column = psms_info["metadata_columns"]
-        self._has_proteins = psms_info["has_proteins"]
 
         self.scores = None
         self.targets = None
         self.qvals = None
         self.peps = None
 
-        if self._has_proteins:
-            self._proteins = self._has_proteins
-        else:
-            self._proteins = None
+        self._proteins = proteins
 
         # This attribute holds the results as DataFrames:
         self.confidence_estimates = {}
         self.decoy_confidence_estimates = {}
@@ -267,7 +267,9 @@ def levels(self):
         """
         return list(self.confidence_estimates.keys())
 
-    def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
+    def to_txt(
+        self, data_path, columns, level, decoys, file_root, dest_dir, sep
+    ):
         """Save confidence estimates to delimited text files.
         Parameters
         ----------
@@ -294,11 +296,10 @@ def to_txt(self, data_path, level, decoys, file_root, dest_dir, sep):
             The paths to the saved files.
         """
 
         reader = read_file_in_chunks(
             file=data_path,
             chunk_size=CONFIDENCE_CHUNK_SIZE,
-            use_cols=["ScanNr", "Peptide", "Proteins"],
+            use_cols=[i for i in columns if i != self._target_column],
         )
 
         self.scores = utils.create_chunks(
@@ -314,27 +315,22 @@
             self.targets, chunk_size=CONFIDENCE_CHUNK_SIZE
         )
 
-        output_columns = sep.join(
-            [
-                "PSMId",
-                "Peptide",
-                "score",
-                "q-value",
-                "posterior_error_prob",
-                "proteinIds",
-                "\n",
-            ]
-        )
         if file_root is not None:
             dest_dir = Path(dest_dir, file_root)
         outfile_t = str(dest_dir) + f"targets.{level}"
         outfile_d = str(dest_dir) + f"decoys.{level}"
 
+        columns.remove(self._target_column)
+        output_columns = columns + ["q-value", "posterior_error_prob"]
+        if level != "proteins" and self._protein_column is not None:
+            output_columns.remove(self._protein_column)
+            output_columns.append(self._protein_column)
         if not os.path.exists(outfile_t):
             with open(outfile_t, "w") as fp:
-                fp.write(output_columns)
+                fp.write(f"{sep.join(output_columns)}\n")
         if decoys and not os.path.exists(outfile_d):
             with open(outfile_d, "w") as fp:
-                fp.write(output_columns)
+                fp.write(f"{sep.join(output_columns)}\n")
 
         for data_in, score_in, qvals_in, pep_in, target_in in zip(
             reader, self.scores, self.qvals, self.peps, self.targets
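One detail worth flagging in the hunk above: the output header is now derived from the data's own columns instead of a hard-coded list, the remove/append pair simply moves the protein column to the end, and the header write drops the old trick of joining "\n" in as if it were an extra column, which left a stray separator before the newline. A quick illustration with example values (sep and cols here are not from the commit):

    sep = "\t"
    cols = ["PSMId", "score"]
    old_header = sep.join(cols + ["\n"])  # 'PSMId\tscore\t\n' -- trailing tab
    new_header = f"{sep.join(cols)}\n"    # 'PSMId\tscore\n'  -- clean header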
@@ -453,16 +449,17 @@ def __init__(
         dest_dir=None,
         file_root=None,
         decoys=None,
+        proteins=None,
         sep="\t",
         group_column=None,
         combine=False,
     ):
         """Initialize a a LinearPsmConfidence object"""
-        super().__init__(psms_info)
+        super().__init__(psms_info, proteins)
         self._target_column = psms_info["target_column"]
         self._psm_columns = psms_info["spectrum_columns"]
         self._peptide_column = psms_info["peptide_column"]
-        self._protein_column = psms_info["protein_column"]
+        self._protein_column = "proteinIds"
         self._eval_fdr = eval_fdr
 
         LOGGER.info("Performing target-decoy competition...")
@@ -496,7 +493,7 @@ def __repr__(self):
             f"{self.accepted['peptides']}\n"
         )
 
-        if self._has_proteins:
+        if self._proteins:
             base += (
                 f"\t- Protein groups at q<={self._eval_fdr:g}: "
                 f"{self.accepted['proteins']}\n"
@@ -555,10 +552,8 @@ def _assign_confidence(
         """
         levels = ["PSMs", "peptides"]
         level_data_path = [psms_path, peptides_path]
-        if self._has_proteins:
-            data = read_file(
-                peptides_path, use_cols=self._metadata_column + ["score"]
-            )
+        if self._proteins:
+            data = read_file(peptides_path)
             data = data.apply(pd.to_numeric, errors="ignore")
             convert_targets_column(
                 data=data, target_column=self._target_column
@@ -577,15 +572,15 @@
             LOGGER.info("\t- Found %i unique protein groups.", len(proteins))
 
         for level, data_path in zip(levels, level_data_path):
-            data = read_file(
-                data_path, use_cols=self._metadata_column + ["score"]
-            )
+            data = read_file(data_path)
             data = data.apply(pd.to_numeric, errors="ignore")
+            data_columns = list(data.columns)
             convert_targets_column(
                 data=data, target_column=self._target_column
             )
             self.scores = data.loc[:, self._score_column].values
             self.targets = data.loc[:, self._target_column].astype(bool).values
+            del data
             if all(self.targets):
                 LOGGER.warning(
                     "No decoy PSMs remain for confidence estimation. "
@@ -624,11 +619,23 @@
             if group_column and not combine:
                 file_root = f"{group_column}."
                 self.to_txt(
-                    data_path, level.lower(), decoys, file_root, dest_dir, sep
+                    data_path,
+                    data_columns,
+                    level.lower(),
+                    decoys,
+                    file_root,
+                    dest_dir,
+                    sep,
                 )
             else:
                 self.to_txt(
-                    data_path, level.lower(), decoys, file_root, dest_dir, sep
+                    data_path,
+                    data_columns,
+                    level.lower(),
+                    decoys,
+                    file_root,
+                    dest_dir,
+                    sep,
                 )
 
     def to_flashlfq(self, out_file="mokapot.flashlfq.txt"):
@@ -800,6 +807,7 @@ def assign_confidence(
     file_root=None,
     sep="\t",
     decoys=False,
+    proteins=None,
     group_column=None,
     combine=False,
 ):
@@ -847,17 +855,11 @@
     if scores is None:
         feat, _, _, desc = find_best_feature(psms_info, eval_fdr)
         LOGGER.info("Selected %s as the best feature.", feat)
-        scores = pd.concat(
-            [
-                read_file(file_name=file, use_cols=[feat])
-                for file in psms_info["file"]
-            ],
-            ignore_index=True,
-        ).values
+        scores = read_file(file_name=psms_info["file"], use_cols=[feat]).values
 
     if psms_info["group_column"] is None:
         reader = read_file_in_chunks(
-            file=psms_info["file"][0],
+            file=psms_info["file"],
             chunk_size=CONFIDENCE_CHUNK_SIZE,
             use_cols=psms_info["metadata_columns"],
         )
@@ -874,7 +876,6 @@
                 mode="a",
                 header=None,
             )
-        metadata_columns = list(chunk_metadata.columns)
         psms_path = "psms.csv"
         peptides_path = "peptides.csv"
         iterable_sorted = utils.sort_file_on_disk(
@@ -886,10 +887,11 @@
             "Keeping the best match per %s columns...",
             "+".join(psms_info["spectrum_columns"]),
         )
+        metadata_columns = ["PSMId", "Label", "Peptide", "proteinIds", "score"]
         with open(psms_path, "w") as f_psm:
-            f_psm.write(sep.join(metadata_columns + ["score", "\n"]))
+            f_psm.write(f"{sep.join(metadata_columns)}\n")
         with open(peptides_path, "w") as f_peptide:
-            f_peptide.write(sep.join(metadata_columns + ["score", "\n"]))
+            f_peptide.write(f"{sep.join(metadata_columns)}\n")
 
         unique_psms, unique_peptides = utils.get_unique_psms_and_peptides(
             iterable=iterable_sorted,
@@ -911,6 +913,7 @@
             file_root=file_root,
             sep=sep,
             decoys=decoys,
+            proteins=proteins,
             group_column=group_column,
             combine=combine,
         )
@@ -925,6 +928,7 @@
             file_root=file_root,
             sep=sep,
             decoys=decoys,
+            proteins=proteins,
             combine=combine,
         )
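Taken together, the confidence.py changes retire the psms_info["has_proteins"] flag in favor of an explicit proteins object threaded through assign_confidence, GroupedConfidence, and LinearPsmConfidence. A sketch of how a caller might now supply it, assuming the upstream mokapot read_fasta helper is available in this fork (the FASTA path is hypothetical):

    import mokapot

    # Protein-level inference now rides on an explicit argument rather than
    # a flag tucked inside psms_info.
    proteins = mokapot.read_fasta("human_target_decoy.fasta")  # hypothetical path
    # assign_confidence(psms_info, scores=scores, proteins=proteins, decoys=True)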
