Remove unmapped reads from SAM output #78

Closed
wants to merge 3 commits into from
32 changes: 19 additions & 13 deletions Snakefile
@@ -7,21 +7,22 @@ from snakemake.utils import min_version
 ##################################


+extensions=["fa", "fasta", "fq", "fastq"]
+
 def multiglob(patterns):
     files = []
     for pattern in patterns:
         files.extend(glob.glob(pattern))
+    files = list(map(Path, files))
     return files


 def get_all_query_filepaths():
-    return multiglob(
-        ["queries/*.fa", "queries/*.fq", "queries/*.fasta", "queries/*.fastq"]
-    )
+    return multiglob(expand("queries/*.{ext}", ext=extensions))


 def get_all_query_filenames():
-    return (Path(file).with_suffix("").name for file in get_all_query_filepaths())
+    return (file.with_suffix("").name for file in get_all_query_filepaths())


 def get_filename_for_all_queries():
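
A note on the refactor above, not part of the diff itself: expand() from snakemake.io fills the {ext} placeholder with every value in extensions, so the new one-liner generates the same glob patterns the old hard-coded list spelled out. A minimal sketch, assuming Snakemake is installed:

    # Illustrative sketch only (not part of this PR): shows the glob patterns that
    # expand() produces for the new get_all_query_filepaths() helper.
    from snakemake.io import expand

    extensions = ["fa", "fasta", "fq", "fastq"]
    patterns = expand("queries/*.{ext}", ext=extensions)
    print(patterns)
    # ['queries/*.fa', 'queries/*.fasta', 'queries/*.fq', 'queries/*.fastq']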
@@ -43,11 +44,11 @@ batches = [x.strip() for x in open(config["batches"])]
 print(f"Batches: {batches}")

 qfiles = get_all_query_filepaths()
-print(f"Query files: {qfiles}")
+print(f"Query files: {list(map(str, qfiles))}")

-assemblies_dir = f"{config['download_dir']}/asms"
-cobs_dir = f"{config['download_dir']}/cobs"
-decompression_dir = config.get("decompression_dir", "intermediate/00_cobs")
+assemblies_dir = Path(f"{config['download_dir']}/asms")
+cobs_dir = Path(f"{config['download_dir']}/cobs")
+decompression_dir = Path(config.get("decompression_dir", "intermediate/00_cobs"))


 wildcard_constraints:
@@ -155,13 +156,18 @@ rule download_cobs_batch:
 ##################################
 ## Processing rules
 ##################################
+def get_query_file(wildcards):
+    query_file = multiglob(expand(f"queries/{wildcards.qfile}.{{ext}}", ext=extensions))
+    assert len(query_file) == 1
+    return query_file[0]
+
 rule fix_query:
     """Fix query to expected COBS format: single line fastas composed of ACGT bases only
     """
     output:
         fixed_query="intermediate/fixed_queries/{qfile}.fa",
     input:
-        original_query="queries/{qfile}.fa",
+        original_query=get_query_file,
     threads: 1
     conda:
         "envs/seqtk.yaml"
@@ -179,7 +185,7 @@ rule concatenate_queries:
     """Concatenate all queries into a single file, so we just need to run COBS/minimap2 just once per batch
     """
     output:
-        concatenated_query=f"intermediate/fixed_queries/{get_filename_for_all_queries()}.fa",
+        concatenated_query=f"intermediate/concatenated_query/{get_filename_for_all_queries()}.fa",
     input:
         all_queries=expand(
             "intermediate/fixed_queries/{qfile}.fa", qfile=get_all_query_filenames()
@@ -216,7 +222,7 @@ rule run_cobs:
         match=protected("intermediate/01_match/{batch}____{qfile}.xz"),
     input:
         cobs_index=f"{decompression_dir}/{{batch}}.cobs_classic",
-        fa="intermediate/fixed_queries/{qfile}.fa",
+        fa="intermediate/concatenated_query/{qfile}.fa",
     threads: config["cobs_thr"] # Small number in order to guarantee Snakemake parallelization
     params:
         kmer_thres=config["cobs_kmer_thres"],
@@ -244,7 +250,7 @@ rule decompress_and_run_cobs:
         match=protected("intermediate/01_match/{batch}____{qfile}.xz"),
     input:
         compressed_cobs_index=f"{cobs_dir}/{{batch}}.cobs_classic.xz",
-        fa="intermediate/fixed_queries/{qfile}.fa",
+        fa="intermediate/concatenated_query/{qfile}.fa",
     threads: config["cobs_thr"] # Small number in order to guarantee Snakemake parallelization
     params:
         kmer_thres=config["cobs_kmer_thres"],
@@ -278,7 +284,7 @@ rule translate_matches:
     output:
         fa="intermediate/02_filter/{qfile}.fa",
     input:
-        fa="intermediate/fixed_queries/{qfile}.fa",
+        fa="intermediate/concatenated_query/{qfile}.fa",
         all_matches=[
             f"intermediate/01_match/{batch}____{{qfile}}.xz" for batch in batches
         ],
Binary file removed output/backbone19Kbp.sam_summary.xz
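
The filtering step that gives this PR its title is not visible in the excerpt above. As background, a SAM record is unmapped when bit 0x4 of its FLAG field is set, so dropping unmapped reads amounts to a simple stream filter; a minimal sketch with hypothetical file names (the PR itself may implement this differently, e.g. via samtools):

    # Illustrative sketch only; not necessarily the PR's actual implementation.
    # Keeps header lines and every alignment whose FLAG does not have bit 0x4
    # ("segment unmapped") set. File names are hypothetical.
    with open("queries.sam") as fin, open("queries.mapped.sam", "w") as fout:
        for line in fin:
            if line.startswith("@"):           # SAM header lines start with '@'
                fout.write(line)
                continue
            flag = int(line.split("\t")[1])    # FLAG is the second SAM column
            if not flag & 0x4:                 # 0x4 => read is unmapped
                fout.write(line)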