From da0507375c63882b8e61230312e9fff36b015545 Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <luis@luispedro.org>
Date: Wed, 6 Mar 2024 16:31:30 +1000
Subject: [PATCH] ENH Make split_contigs into a subcommand

---
 SemiBin/main.py                            | 48 +++++++++++++++++++
 docs/usage.md                              | 26 +++++-----
 integration-tests/split_contigs_command.py | 30 ++++++++++++
 script/generate_split.py                   | 56 ----------------------
 4 files changed, 91 insertions(+), 69 deletions(-)
 create mode 100644 integration-tests/split_contigs_command.py
 delete mode 100644 script/generate_split.py

diff --git a/SemiBin/main.py b/SemiBin/main.py
index 8ef0a08..fcbf55e 100644
--- a/SemiBin/main.py
+++ b/SemiBin/main.py
@@ -102,6 +102,29 @@ def parse_args(args, is_semibin2):
                         default=0,
                         dest='min_length')
 
+    split_contigs = subparsers.add_parser('split_contigs',
+                                          help = 'Split contigs to generate data (only for strobealign-aemb pipeline)')
+
+    split_contigs.add_argument('-i', '--input-fasta',
+                   required=True,
+                   help='Path to the input fasta file.',
+                   dest='contig_fasta',
+                   default=None, )
+
+    split_contigs.add_argument('-o', '--output',
+                        required=True,
+                        help='Output directory (will be created if non-existent)',
+                        dest='output',
+                        default=None
+                        )
+
+    split_contigs.add_argument('-m', '--min-len',
+                        required=False,
+                        type=int,
+                        help='Discard sequences below this length (default:0)',
+                        default=0,
+                        dest='min_length')
+
     generate_sequence_features_single.add_argument('--kmer',
                                                    required=False,
                                                    help='Just output data.csv with k-mer features.',
@@ -1360,6 +1383,26 @@ def multi_easy_binning(logger, args, device):
             new_path = os.path.join(args.output, 'bins', new_file)
             shutil.copyfile(original_path, new_path)
 
+
+def split_contigs(logger, contig_fasta, *, output, min_length):
+    from .utils import possibly_compressed_write
+    os.makedirs(output, exist_ok=True)
+
+    oname = f"{output}/split_contigs.fna.gz"
+    with possibly_compressed_write(oname) as ofile:
+        for h, seq in fasta_iter(contig_fasta):
+            if len(seq) < min_length:
+                continue
+            half = len(seq) // 2
+            h1 = h + "_1"
+            seq1 = seq[:half]
+            h2 = h + "_2"
+            seq2 = seq[half:]
+            ofile.write(f">{h1}\n{seq1}\n")
+            ofile.write(f">{h2}\n{seq2}\n")
+    return oname
+
+
 def main2(args=None, is_semibin2=True):
     import tempfile
 
@@ -1573,6 +1616,11 @@ def main2(args=None, is_semibin2=True):
             from .utils import concatenate_fasta
             ofname = concatenate_fasta(args.contig_fasta, args.min_length, args.output, args.separator, args.output_compression)
             logger.info(f'Concatenated contigs written to {ofname}')
+        elif args.cmd == 'split_contigs':
+            if not is_semibin2:
+                logger.error('Command `split_contigs` is not available in SemiBin1. Please upgrade to SemiBin2.')
+            oname = split_contigs(logger, args.contig_fasta, output=args.output, min_length=args.min_length)
+            logger.info(f'Split contigs written to {oname}')
 
         else:
             logger.error(f'Could not handle subcommand {arg.cmd}')
diff --git a/docs/usage.md b/docs/usage.md
index 320feff..f937159 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -426,22 +426,22 @@ Strobealign-aemb is a fast abundance estimation method for metagenomic binning.
 As strobealign-aemb can not provide the mapping information for every position of the contig, so we can not run SemiBin2 with strobealign-aemb in binning modes where samples used smaller 5 and need to split the contigs to generate the must-link constratints. 
 
 
-1. Split the fasta files 
+1. Split the fasta files using the `split_contigs` subcommand:
 ```bash
-python script/generate_split.py -c contig.fa -o output
+SemiBin2 split_contigs -i contig.fa -o output
 ```
-2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information
+2. Map reads using [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information. Note that version 0.13 (or newer) is required
 ```bash
-strobealign --aemb output/split.fa read1_1.fq read1_2.fq -R 6 > sample1.txt
-strobealign --aemb output/split.fa read2_1.fq read2_2.fq -R 6 > sample2.txt
-strobealign --aemb output/split.fa read3_1.fq read3_2.fq -R 6 > sample3.txt
-strobealign --aemb output/split.fa read4_1.fq read4_2.fq -R 6 > sample4.txt
-strobealign --aemb output/split.fa read5_1.fq read5_2.fq -R 6 > sample5.txt
+strobealign --aemb output/split_contigs.fna.gz read1_1.fq read1_2.fq -R 6 > sample1.tsv
+strobealign --aemb output/split_contigs.fna.gz read2_1.fq read2_2.fq -R 6 > sample2.tsv
+strobealign --aemb output/split_contigs.fna.gz read3_1.fq read3_2.fq -R 6 > sample3.tsv
+strobealign --aemb output/split_contigs.fna.gz read4_1.fq read4_2.fq -R 6 > sample4.tsv
+strobealign --aemb output/split_contigs.fna.gz read5_1.fq read5_2.fq -R 6 > sample5.tsv
 ```
-3. Run SemiBin2 (like running SemiBin with BAM files)
+3. Run SemiBin2 (running SemiBin with BAM files)
 ```bash
-SemiBin2 generate_sequence_features_single -i contig.fa -a *.txt -o output
-SemiBin2 generate_sequence_features_multi -i contig.fa -a *.txt -s : -o output
-SemiBin2 single_easy_bin -i contig.fa -a *.txt -o output
-SemiBin2 multi_easy_bin i contig.fa -a *.txt -s : -o output
+SemiBin2 generate_sequence_features_single -i contig.fa -a *.tsv -o output
+SemiBin2 generate_sequence_features_multi -i contig.fa -a *.tsv -s : -o output
+SemiBin2 single_easy_bin -i contig.fa -a *.tsv -o output
+SemiBin2 multi_easy_bin i contig.fa -a *.tsv -s : -o output
 
diff --git a/integration-tests/split_contigs_command.py b/integration-tests/split_contigs_command.py
new file mode 100644
index 0000000..1b88e8f
--- /dev/null
+++ b/integration-tests/split_contigs_command.py
@@ -0,0 +1,30 @@
+from os import path
+import subprocess
+import tempfile
+import gzip
+
+tmpdir = '/tmp/SemiBin2'
+with tempfile.TemporaryDirectory() as tmpdir:
+    with open(f'{tmpdir}/S1.fna', 'w') as f:
+        f.write('>C1\n')
+        for i in range(1000):
+            f.write('ATCG')
+        f.write('\n')
+        f.write('>C2\n')
+        for i in range(100):
+            f.write('GTAA')
+    subprocess.check_call(
+        ['SemiBin2', 'split_contigs',
+            '-i', f'{tmpdir}/S1.fna',
+            '-o', f'{tmpdir}/output',
+            '--min-len', '1000',])
+    assert path.exists(f'{tmpdir}/output/split_contigs.fna.gz')
+
+    with gzip.open(f'{tmpdir}/output/split_contigs.fna.gz', 'rt') as f:
+        assert f.readline() == '>C1_1\n'
+        part1 = f.readline()
+        assert f.readline() == '>C1_2\n'
+        part2 = f.readline()
+        assert len(part1) + len(part2) == 4002 # 4000 + 2 newlines
+        assert not f.read(1)
+
diff --git a/script/generate_split.py b/script/generate_split.py
deleted file mode 100644
index 1ffc177..0000000
--- a/script/generate_split.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import argparse
-from atomicwrites import atomic_write
-from SemiBin.fasta import fasta_iter
-import os
-
-def generate_file(contig_file, output, min_length, name):
-    os.makedirs(output, exist_ok=True)
-
-    with atomic_write(f"{output}/{name}.fa") as ofile:
-        for h, seq in fasta_iter(f"{contig_file}"):
-            if len(seq) < min_length:
-                continue
-            half = len(seq) // 2
-            h1 = h + "_1"
-            seq1 = seq[:half]
-            h2 = h + "_2"
-            seq2 = seq[half:]
-            ofile.write(f">{h1}\n{seq1}\n")
-            ofile.write(f">{h2}\n{seq2}\n")
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Generate cannt-link constrains from the output of CAT")
-    parser.add_argument('-c', '--contig-file',
-                        required=True,
-                        help='Path to the contig fasta file corresponding to the CAT output file.',
-                        dest='contig_file')
-    parser.add_argument('-o', '--output',
-                        required=True,
-                        help='Output directory (will be created if non-existent)',
-                        dest='output',
-                        default=None
-                        )
-    parser.add_argument('-n', '--name',
-                        required=False,
-                        help='Name for the splitted contig.',
-                        dest='name',
-                        default='split'
-                        )
-    parser.add_argument('-m','--min-length',
-                        help='Remove contig whose length smaller than this threshold',
-                        required=False,
-                        default=0,
-                        dest='min_length')
-
-
-    args = parser.parse_args()
-    generate_file(
-        args.contig_file,
-        args.output,
-        args.min_length,
-        args.name)
-
-
-if __name__ == '__main__':
-    main()
\ No newline at end of file