Skip to content

Commit

Permalink
ENH Make split_contigs into a subcommand
Browse files Browse the repository at this point in the history
  • Loading branch information
luispedro committed Mar 6, 2024
1 parent b3c7d5b commit da05073
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 69 deletions.
48 changes: 48 additions & 0 deletions SemiBin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,29 @@ def parse_args(args, is_semibin2):
default=0,
dest='min_length')

split_contigs = subparsers.add_parser('split_contigs',
help = 'Split contigs to generate data (only for strobealign-aemb pipeline)')

split_contigs.add_argument('-i', '--input-fasta',
required=True,
help='Path to the input fasta file.',
dest='contig_fasta',
default=None, )

split_contigs.add_argument('-o', '--output',
required=True,
help='Output directory (will be created if non-existent)',
dest='output',
default=None
)

split_contigs.add_argument('-m', '--min-len',
required=False,
type=int,
help='Discard sequences below this length (default:0)',
default=0,
dest='min_length')

generate_sequence_features_single.add_argument('--kmer',
required=False,
help='Just output data.csv with k-mer features.',
Expand Down Expand Up @@ -1360,6 +1383,26 @@ def multi_easy_binning(logger, args, device):
new_path = os.path.join(args.output, 'bins', new_file)
shutil.copyfile(original_path, new_path)


def split_contigs(logger, contig_fasta, *, output, min_length):
from .utils import possibly_compressed_write
os.makedirs(output, exist_ok=True)

oname = f"{output}/split_contigs.fna.gz"
with possibly_compressed_write(oname) as ofile:
for h, seq in fasta_iter(contig_fasta):
if len(seq) < min_length:
continue
half = len(seq) // 2
h1 = h + "_1"
seq1 = seq[:half]
h2 = h + "_2"
seq2 = seq[half:]
ofile.write(f">{h1}\n{seq1}\n")
ofile.write(f">{h2}\n{seq2}\n")
return oname


def main2(args=None, is_semibin2=True):
import tempfile

Expand Down Expand Up @@ -1573,6 +1616,11 @@ def main2(args=None, is_semibin2=True):
from .utils import concatenate_fasta
ofname = concatenate_fasta(args.contig_fasta, args.min_length, args.output, args.separator, args.output_compression)
logger.info(f'Concatenated contigs written to {ofname}')
elif args.cmd == 'split_contigs':
if not is_semibin2:
logger.error('Command `split_contigs` is not available in SemiBin1. Please upgrade to SemiBin2.')
oname = split_contigs(logger, args.contig_fasta, output=args.output, min_length=args.min_length)
logger.info(f'Split contigs written to {oname}')

else:
logger.error(f'Could not handle subcommand {arg.cmd}')
Expand Down
26 changes: 13 additions & 13 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -426,22 +426,22 @@ Strobealign-aemb is a fast abundance estimation method for metagenomic binning.
As strobealign-aemb can not provide the mapping information for every position of the contig, so we can not run SemiBin2 with strobealign-aemb in binning modes where samples used smaller 5 and need to split the contigs to generate the must-link constratints.


1. Split the fasta files
1. Split the fasta files using the `split_contigs` subcommand:
```bash
python script/generate_split.py -c contig.fa -o output
SemiBin2 split_contigs -i contig.fa -o output
```
2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information
2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information. Note that version 0.13 (or newer) is required
```bash
strobealign --aemb output/split.fa read1_1.fq read1_2.fq -R 6 > sample1.txt
strobealign --aemb output/split.fa read2_1.fq read2_2.fq -R 6 > sample2.txt
strobealign --aemb output/split.fa read3_1.fq read3_2.fq -R 6 > sample3.txt
strobealign --aemb output/split.fa read4_1.fq read4_2.fq -R 6 > sample4.txt
strobealign --aemb output/split.fa read5_1.fq read5_2.fq -R 6 > sample5.txt
strobealign --aemb output/split_contigs.fna.gz read1_1.fq read1_2.fq -R 6 > sample1.tsv
strobealign --aemb output/split_contigs.fna.gz read2_1.fq read2_2.fq -R 6 > sample2.tsv
strobealign --aemb output/split_contigs.fna.gz read3_1.fq read3_2.fq -R 6 > sample3.tsv
strobealign --aemb output/split_contigs.fna.gz read4_1.fq read4_2.fq -R 6 > sample4.tsv
strobealign --aemb output/split_contigs.fna.gz read5_1.fq read5_2.fq -R 6 > sample5.tsv
```
3. Run SemiBin2 (like running SemiBin with BAM files)
3. Run SemiBin2 (running SemiBin with BAM files)
```bash
SemiBin2 generate_sequence_features_single -i contig.fa -a *.txt -o output
SemiBin2 generate_sequence_features_multi -i contig.fa -a *.txt -s : -o output
SemiBin2 single_easy_bin -i contig.fa -a *.txt -o output
SemiBin2 multi_easy_bin i contig.fa -a *.txt -s : -o output
SemiBin2 generate_sequence_features_single -i contig.fa -a *.tsv -o output
SemiBin2 generate_sequence_features_multi -i contig.fa -a *.tsv -s : -o output
SemiBin2 single_easy_bin -i contig.fa -a *.tsv -o output
SemiBin2 multi_easy_bin i contig.fa -a *.tsv -s : -o output

30 changes: 30 additions & 0 deletions integration-tests/split_contigs_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from os import path
import subprocess
import tempfile
import gzip

tmpdir = '/tmp/SemiBin2'
with tempfile.TemporaryDirectory() as tmpdir:
with open(f'{tmpdir}/S1.fna', 'w') as f:
f.write('>C1\n')
for i in range(1000):
f.write('ATCG')
f.write('\n')
f.write('>C2\n')
for i in range(100):
f.write('GTAA')
subprocess.check_call(
['SemiBin2', 'split_contigs',
'-i', f'{tmpdir}/S1.fna',
'-o', f'{tmpdir}/output',
'--min-len', '1000',])
assert path.exists(f'{tmpdir}/output/split_contigs.fna.gz')

with gzip.open(f'{tmpdir}/output/split_contigs.fna.gz', 'rt') as f:
assert f.readline() == '>C1_1\n'
part1 = f.readline()
assert f.readline() == '>C1_2\n'
part2 = f.readline()
assert len(part1) + len(part2) == 4002 # 4000 + 2 newlines
assert not f.read(1)

56 changes: 0 additions & 56 deletions script/generate_split.py

This file was deleted.

0 comments on commit da05073

Please sign in to comment.