Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ConvertJob.copy_sequences() method added. #145

Merged
merged 6 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 50 additions & 29 deletions sequence_processing_pipeline/ConvertJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def _get_sample_sheet_info(self):
has_reps = sheet.Bioinformatics['contains_replicates'].tolist()
# assume a validated sample-sheet ensures has_reps has only one
# value, either True or False.
self.contains_replicates = bool(has_reps[0])
self.contains_replicates = has_reps[0]
else:
self.contains_replicates = False

Expand Down Expand Up @@ -295,10 +295,11 @@ def copy_sequences(self, sample_name, source_project, dest_project,
copy_all_replicates=False):
charles-cowart marked this conversation as resolved.
Show resolved Hide resolved
"""
Copies all fastq files related to a sample into another project.
:param source_project: The source project w/qiita_id.
:param dest_project: The destination project w/qiita_id.
:param sample_name: A sample-name.
:param orig_name: A sample-name.
:param source_project: The source project name including qiita_id.
:param dest_project: The destination project name including qiita_id.
:param sample_name: A value from the sample-name column if
copy_all_replicates is False or a value from the orig_name column
otherwise.
:param copy_all_replicates: If True, search for sample_name in the
orig_name column of the sample-sheet. Copy all replicates.
:return: None
Expand All @@ -309,15 +310,10 @@ def copy_sequences(self, sample_name, source_project, dest_project,

project_names = list(self.info.keys())

# confirm source project is a valid one.
if source_project not in project_names:
raise ValueError(f"'{source_project}' is not defined in the "
"sample-sheet")

# confirm destination project is a valid one.
if dest_project not in project_names:
raise ValueError(f"'{dest_project}' is not defined in the "
"sample-sheet")
# confirm source and destination projects are both valid.
for proj in [source_project, dest_project]:
if proj not in project_names:
raise ValueError(f"'{proj}' is not defined in sample-sheet")

if source_project == dest_project:
raise ValueError(f"source '{source_project}' and destination "
Expand All @@ -341,33 +337,58 @@ def copy_sequences(self, sample_name, source_project, dest_project,
# be processed.

if copy_all_replicates is True and self.contains_replicates is False:
raise ValueError("'treat_as_orig_name' is set to 'True' but this "
raise ValueError("'copy_all_replicates' is set to 'True' but this "
"sample-sheet doesn't contain replicates")
charles-cowart marked this conversation as resolved.
Show resolved Hide resolved

samples = self.info[source_project]['samples']
sample_list = []

# Either we will need to copy all replicates, in which case we need to
# match our input to values in the orig_name column, or we want to copy
# a single sample, in which case the input needs to be matched to a
# value in the sample_name column. Matched samples are collected in
# 'results'; if we are not copying all replicates, 'results' should
# contain at most one sample.
results = []

if copy_all_replicates:
for key in samples:
sample = samples[key]
# assume orig_name is present if treat_as_orig_name is True.
for _, sample in samples.items():
# assume orig_name is present if copy_all_replicates is True.
if sample_name == sample['orig_name']:
sample_list.append(sample)
results.append(sample)
else:
# sample_name is a value from the sample_name column. It may or
# may not have a well-id appended and this sample-sheet may or
# may not contain replicates, but in either case a single sample
# either exists or it doesn't.
if sample_name in self.info[source_project]['samples']:
sample_list.append(samples[sample_name])

if len(sample_list) == 0:
# if the sample_list is empty, then sample-name wasn't present in
# either the sample_name or orig_name columns.
raise ValueError(f"'{sample_name}' is not defined in the project"
f" '{source_project}'")

for sample in sample_list:
results.append(samples[sample_name])

if len(results) == 0:
if copy_all_replicates:
# the value of sample_name did not match any value in the
# orig_name column. It may match a value in the sample_name
# column.

# if we want to copy all replicates and we mistakenly provide
# an input from the sample_name column, we could first look
# for a match in the orig_name column and if it fails, look for
# an exact match in the sample_name column. If we get an exact
# match we can move forward using the input's associated
# orig_name. This is not currently implemented, however.
msg = (f"'{sample_name}' did not match any values in the 'orig"
f"_name' column for project '{source_project}'. Your "
f"value '{sample_name}' have a well-id appended to it")
charles-cowart marked this conversation as resolved.
Show resolved Hide resolved
else:
# if we don't want to copy all replicates and we just want to
# copy a particular sample-name, then providing a value from
# the orig_name column would be ambiguous since it could be
# matched to multiple samples. Not much can be done here.
msg = (f"'{sample_name}' did not match any values in the 'samp"
f"le_name' column for project '{source_project}'")

raise ValueError(msg)

for sample in results:
for src_fp in sample['matching_files']:
# split(fp)[1] is simply the original filename, which must
# be provided in the destination path.
Expand Down
22 changes: 11 additions & 11 deletions sequence_processing_pipeline/tests/test_ConvertJob.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,31 +1019,34 @@ def test_copy_sequences_bad_parameters(self):
not_a_sample_name = 'NOT_A_SAMPLE_NAME'
not_a_project = 'NOT_A_PROJECT'

with self.assertRaisesRegex(ValueError, "'NOT_A_SAMPLE_NAME' is not "
"defined in the project "
"'Feist_11661'"):
with self.assertRaisesRegex(ValueError, "'NOT_A_SAMPLE_NAME' did not "
"match any values in the "
"'sample_name' column for "
"project 'Feist_11661'"):
job.copy_sequences(not_a_sample_name,
source_project,
dest_project,
copy_all_replicates=False)

with self.assertRaisesRegex(ValueError, "'CDPH-SAL.Salmonella.Typhi."
"MDL-154' is not defined in "
"the project 'Gerwick_6123'"):
"MDL-154' did not match any "
"values in the 'sample_name' "
"column for project 'Gerwick_"
"6123'"):
job.copy_sequences(sample_name,
not_source_project,
dest_project,
copy_all_replicates=False)

with self.assertRaisesRegex(ValueError, "'NOT_A_PROJECT' is not "
"defined in the sample-sheet"):
"defined in sample-sheet"):
job.copy_sequences(sample_name,
not_a_project,
dest_project,
copy_all_replicates=False)

with self.assertRaisesRegex(ValueError, "'NOT_A_PROJECT' is not "
"defined in the sample-sheet"):
"defined in sample-sheet"):
job.copy_sequences(sample_name,
source_project,
not_a_project,
Expand Down Expand Up @@ -1100,9 +1103,6 @@ def test_copy_sequences_success(self):
copy_all_replicates=False)

sample_info = job.info[source_project]['samples'][sample_name]
# {'Sample_Name': 'CDPH-SAL.Salmonella.Typhi.MDL-154',
# 'Sample_ID': 'CDPH-SAL_Salmonella_Typhi_MDL-154',
# 'matching_files': []}

# get the path for the source fastq file we created above and swap out
# the project-level directory names to confirm and deny the existence
Expand All @@ -1121,7 +1121,7 @@ def test_copy_sequences_success(self):
# correctly, since the setup for this situation has already been
# created here.

msg = ("'treat_as_orig_name' is set to 'True' but this sample-sheet "
msg = ("'copy_all_replicates' is set to 'True' but this sample-sheet "
"doesn't contain replicates")

with self.assertRaisesRegex(ValueError, msg):
Expand Down
Loading