From b71430e4f23f977632acc9686794d8b7e98e3248 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Tue, 5 Mar 2024 10:27:08 +0100
Subject: [PATCH 1/2] Add param -c to append debugging comments

---
 create_batches.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/create_batches.py b/create_batches.py
index c34d90b..9fe6a22 100755
--- a/create_batches.py
+++ b/create_batches.py
@@ -23,10 +23,11 @@
 def clean_species_name(name):
     return re.sub('[^a-zA-Z0-9 ]+', '', name).replace(" ", "_").lower()
 
+
 class Batching:
 
     def __init__(self, input_fn, cluster_min_size, cluster_max_size,
-                 dustbin_max_size, output_d, col_species, col_fn):
+                 dustbin_max_size, output_d, col_species, col_fn, comments):
         self.input_fn = input_fn
         self.cluster_min_size = cluster_min_size
         self.cluster_max_size = cluster_max_size
@@ -34,10 +35,12 @@ def __init__(self, input_fn, cluster_min_size, cluster_max_size,
         self.output_d = output_d
         self.col_species = col_species
         self.col_fn = col_fn
+        self.comments = comments
 
         self.clusters = collections.defaultdict(list)
         self.pseudoclusters = collections.defaultdict(list)
         self.batches = collections.defaultdict(list)
+        self.dbg_info = {} # fn -> dbg comments
 
     def _load_clusters(self):
         with xopen(self.input_fn) as fo:
@@ -48,6 +51,7 @@ def _load_clusters(self):
                 species = clean_species_name(x[self.col_species])
                 fn = x[self.col_fn]
                 self.clusters[species].append(fn)
+                self.dbg_info[fn] = species
         print(
             f"Loaded {genome_count} genomes across {len(self.clusters)} species clusters",
             file=sys.stderr)
@@ -91,7 +95,11 @@ def _write_batches(self):
         for batch_name, l in self.batches.items():
             fn = os.path.join(self.output_d, f"{batch_name}.txt")
             with open(fn, "w+") as f:
-                f.write("\n".join(l) + "\n")
+                for x in l:
+                    if self.comments:
+                        f.write(f"{x}\t#{self.dbg_info[x]}\n")
+                    else:
+                        f.write(f"{x}\n")
         print(f"Finished", file=sys.stderr)
 
     def run(self):
@@ -158,6 +166,13 @@ def main():
         help=f'column name with filename [{DEFAULT_COLUMN_FN}]',
     )
 
+    parser.add_argument(
+        '-c',
+        dest='comments',
+        action='store_true',
+        help=f'add comments with info to the output text files (for debugging)',
+    )
+
     args = parser.parse_args()
 
     batching = Batching(input_fn=args.input_fn,
@@ -166,7 +181,8 @@ def main():
                         dustbin_max_size=args.dustbin_max_size,
                         output_d=args.output_d,
                         col_species=args.col_species,
-                        col_fn=args.col_fn)
+                        col_fn=args.col_fn,
+                        comments=args.comments)
     batching.run()
 
 

From e6a56c734bf8db020100d0deb4853df53b8af48f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Karel=20B=C5=99inda?= <karel.brinda@gmail.com>
Date: Tue, 5 Mar 2024 10:36:05 +0100
Subject: [PATCH 2/2] Strip debug comments when parsing batch files in snakemake

---
 create_batches.py       | 2 +-
 workflow/rules/init.smk | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/create_batches.py b/create_batches.py
index 9fe6a22..385af79 100755
--- a/create_batches.py
+++ b/create_batches.py
@@ -40,7 +40,7 @@ def __init__(self, input_fn, cluster_min_size, cluster_max_size,
         self.clusters = collections.defaultdict(list)
         self.pseudoclusters = collections.defaultdict(list)
         self.batches = collections.defaultdict(list)
-        self.dbg_info = {} # fn -> dbg comments
+        self.dbg_info = {}  # fn -> dbg comments
 
     def _load_clusters(self):
         with xopen(self.input_fn) as fo:
diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk
index 5e66caa..c0609be 100644
--- a/workflow/rules/init.smk
+++ b/workflow/rules/init.smk
@@ -50,7 +50,8 @@ for x in res:
     BATCHES_FN[batch] = {}
     with open(x) as f:
         for y in f:
-            sample_fn = y.strip()
+            z = y.strip()
+            sample_fn = z.split("\t")[0]
             if sample_fn:
                 sample = _get_sample_from_fn(sample_fn)
                 BATCHES_FN[batch][sample] = sample_fn