From da0507375c63882b8e61230312e9fff36b015545 Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Wed, 6 Mar 2024 16:31:30 +1000 Subject: [PATCH] ENH Make split_contigs into a subcommand --- SemiBin/main.py | 48 +++++++++++++++++++ docs/usage.md | 26 +++++----- integration-tests/split_contigs_command.py | 30 ++++++++++++ script/generate_split.py | 56 ---------------------- 4 files changed, 91 insertions(+), 69 deletions(-) create mode 100644 integration-tests/split_contigs_command.py delete mode 100644 script/generate_split.py diff --git a/SemiBin/main.py b/SemiBin/main.py index 8ef0a08..fcbf55e 100644 --- a/SemiBin/main.py +++ b/SemiBin/main.py @@ -102,6 +102,29 @@ def parse_args(args, is_semibin2): default=0, dest='min_length') + split_contigs = subparsers.add_parser('split_contigs', + help = 'Split contigs to generate data (only for strobealign-aemb pipeline)') + + split_contigs.add_argument('-i', '--input-fasta', + required=True, + help='Path to the input fasta file.', + dest='contig_fasta', + default=None, ) + + split_contigs.add_argument('-o', '--output', + required=True, + help='Output directory (will be created if non-existent)', + dest='output', + default=None + ) + + split_contigs.add_argument('-m', '--min-len', + required=False, + type=int, + help='Discard sequences below this length (default:0)', + default=0, + dest='min_length') + generate_sequence_features_single.add_argument('--kmer', required=False, help='Just output data.csv with k-mer features.', @@ -1360,6 +1383,26 @@ def multi_easy_binning(logger, args, device): new_path = os.path.join(args.output, 'bins', new_file) shutil.copyfile(original_path, new_path) + +def split_contigs(logger, contig_fasta, *, output, min_length): + from .utils import possibly_compressed_write + os.makedirs(output, exist_ok=True) + + oname = f"{output}/split_contigs.fna.gz" + with possibly_compressed_write(oname) as ofile: + for h, seq in fasta_iter(contig_fasta): + if len(seq) < min_length: + continue + half = len(seq) // 2 + h1 = h + "_1" + seq1 = seq[:half] + h2 = h + "_2" + seq2 = seq[half:] + ofile.write(f">{h1}\n{seq1}\n") + ofile.write(f">{h2}\n{seq2}\n") + return oname + + def main2(args=None, is_semibin2=True): import tempfile @@ -1573,6 +1616,11 @@ def main2(args=None, is_semibin2=True): from .utils import concatenate_fasta ofname = concatenate_fasta(args.contig_fasta, args.min_length, args.output, args.separator, args.output_compression) logger.info(f'Concatenated contigs written to {ofname}') + elif args.cmd == 'split_contigs': + if not is_semibin2: + logger.error('Command `split_contigs` is not available in SemiBin1. Please upgrade to SemiBin2.') + oname = split_contigs(logger, args.contig_fasta, output=args.output, min_length=args.min_length) + logger.info(f'Split contigs written to {oname}') else: logger.error(f'Could not handle subcommand {arg.cmd}') diff --git a/docs/usage.md b/docs/usage.md index 320feff..f937159 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -426,22 +426,22 @@ Strobealign-aemb is a fast abundance estimation method for metagenomic binning. As strobealign-aemb can not provide the mapping information for every position of the contig, so we can not run SemiBin2 with strobealign-aemb in binning modes where samples used smaller 5 and need to split the contigs to generate the must-link constratints. -1. Split the fasta files +1. Split the fasta files using the `split_contigs` subcommand: ```bash -python script/generate_split.py -c contig.fa -o output +SemiBin2 split_contigs -i contig.fa -o output ``` -2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information +2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information. Note that version 0.13 (or newer) is required ```bash -strobealign --aemb output/split.fa read1_1.fq read1_2.fq -R 6 > sample1.txt -strobealign --aemb output/split.fa read2_1.fq read2_2.fq -R 6 > sample2.txt -strobealign --aemb output/split.fa read3_1.fq read3_2.fq -R 6 > sample3.txt -strobealign --aemb output/split.fa read4_1.fq read4_2.fq -R 6 > sample4.txt -strobealign --aemb output/split.fa read5_1.fq read5_2.fq -R 6 > sample5.txt +strobealign --aemb output/split_contigs.fna.gz read1_1.fq read1_2.fq -R 6 > sample1.tsv +strobealign --aemb output/split_contigs.fna.gz read2_1.fq read2_2.fq -R 6 > sample2.tsv +strobealign --aemb output/split_contigs.fna.gz read3_1.fq read3_2.fq -R 6 > sample3.tsv +strobealign --aemb output/split_contigs.fna.gz read4_1.fq read4_2.fq -R 6 > sample4.tsv +strobealign --aemb output/split_contigs.fna.gz read5_1.fq read5_2.fq -R 6 > sample5.tsv ``` -3. Run SemiBin2 (like running SemiBin with BAM files) +3. Run SemiBin2 (running SemiBin with BAM files) ```bash -SemiBin2 generate_sequence_features_single -i contig.fa -a *.txt -o output -SemiBin2 generate_sequence_features_multi -i contig.fa -a *.txt -s : -o output -SemiBin2 single_easy_bin -i contig.fa -a *.txt -o output -SemiBin2 multi_easy_bin i contig.fa -a *.txt -s : -o output +SemiBin2 generate_sequence_features_single -i contig.fa -a *.tsv -o output +SemiBin2 generate_sequence_features_multi -i contig.fa -a *.tsv -s : -o output +SemiBin2 single_easy_bin -i contig.fa -a *.tsv -o output +SemiBin2 multi_easy_bin i contig.fa -a *.tsv -s : -o output diff --git a/integration-tests/split_contigs_command.py b/integration-tests/split_contigs_command.py new file mode 100644 index 0000000..1b88e8f --- /dev/null +++ b/integration-tests/split_contigs_command.py @@ -0,0 +1,30 @@ +from os import path +import subprocess +import tempfile +import gzip + +tmpdir = '/tmp/SemiBin2' +with tempfile.TemporaryDirectory() as tmpdir: + with open(f'{tmpdir}/S1.fna', 'w') as f: + f.write('>C1\n') + for i in range(1000): + f.write('ATCG') + f.write('\n') + f.write('>C2\n') + for i in range(100): + f.write('GTAA') + subprocess.check_call( + ['SemiBin2', 'split_contigs', + '-i', f'{tmpdir}/S1.fna', + '-o', f'{tmpdir}/output', + '--min-len', '1000',]) + assert path.exists(f'{tmpdir}/output/split_contigs.fna.gz') + + with gzip.open(f'{tmpdir}/output/split_contigs.fna.gz', 'rt') as f: + assert f.readline() == '>C1_1\n' + part1 = f.readline() + assert f.readline() == '>C1_2\n' + part2 = f.readline() + assert len(part1) + len(part2) == 4002 # 4000 + 2 newlines + assert not f.read(1) + diff --git a/script/generate_split.py b/script/generate_split.py deleted file mode 100644 index 1ffc177..0000000 --- a/script/generate_split.py +++ /dev/null @@ -1,56 +0,0 @@ -import argparse -from atomicwrites import atomic_write -from SemiBin.fasta import fasta_iter -import os - -def generate_file(contig_file, output, min_length, name): - os.makedirs(output, exist_ok=True) - - with atomic_write(f"{output}/{name}.fa") as ofile: - for h, seq in fasta_iter(f"{contig_file}"): - if len(seq) < min_length: - continue - half = len(seq) // 2 - h1 = h + "_1" - seq1 = seq[:half] - h2 = h + "_2" - seq2 = seq[half:] - ofile.write(f">{h1}\n{seq1}\n") - ofile.write(f">{h2}\n{seq2}\n") - -def main(): - parser = argparse.ArgumentParser( - description="Generate cannt-link constrains from the output of CAT") - parser.add_argument('-c', '--contig-file', - required=True, - help='Path to the contig fasta file corresponding to the CAT output file.', - dest='contig_file') - parser.add_argument('-o', '--output', - required=True, - help='Output directory (will be created if non-existent)', - dest='output', - default=None - ) - parser.add_argument('-n', '--name', - required=False, - help='Name for the splitted contig.', - dest='name', - default='split' - ) - parser.add_argument('-m','--min-length', - help='Remove contig whose length smaller than this threshold', - required=False, - default=0, - dest='min_length') - - - args = parser.parse_args() - generate_file( - args.contig_file, - args.output, - args.min_length, - args.name) - - -if __name__ == '__main__': - main() \ No newline at end of file