From b71430e4f23f977632acc9686794d8b7e98e3248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 5 Mar 2024 10:27:08 +0100 Subject: [PATCH 1/2] Add param -c to append debugging comments --- create_batches.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/create_batches.py b/create_batches.py index c34d90b..9fe6a22 100755 --- a/create_batches.py +++ b/create_batches.py @@ -23,10 +23,11 @@ def clean_species_name(name): return re.sub('[^a-zA-Z0-9 ]+', '', name).replace(" ", "_").lower() + class Batching: def __init__(self, input_fn, cluster_min_size, cluster_max_size, - dustbin_max_size, output_d, col_species, col_fn): + dustbin_max_size, output_d, col_species, col_fn, comments): self.input_fn = input_fn self.cluster_min_size = cluster_min_size self.cluster_max_size = cluster_max_size @@ -34,10 +35,12 @@ def __init__(self, input_fn, cluster_min_size, cluster_max_size, self.output_d = output_d self.col_species = col_species self.col_fn = col_fn + self.comments = comments self.clusters = collections.defaultdict(list) self.pseudoclusters = collections.defaultdict(list) self.batches = collections.defaultdict(list) + self.dbg_info = {} # fn -> dbg comments def _load_clusters(self): with xopen(self.input_fn) as fo: @@ -48,6 +51,7 @@ def _load_clusters(self): species = clean_species_name(x[self.col_species]) fn = x[self.col_fn] self.clusters[species].append(fn) + self.dbg_info[fn] = species print( f"Loaded {genome_count} genomes across {len(self.clusters)} species clusters", file=sys.stderr) @@ -91,7 +95,11 @@ def _write_batches(self): for batch_name, l in self.batches.items(): fn = os.path.join(self.output_d, f"{batch_name}.txt") with open(fn, "w+") as f: - f.write("\n".join(l) + "\n") + for x in l: + if self.comments: + f.write(f"{x}\t#{self.dbg_info[x]}\n") + else: + f.write(f"{x}\n") print(f"Finished", file=sys.stderr) def run(self): @@ -158,6 +166,13 @@ def main(): help=f'column name with filename [{DEFAULT_COLUMN_FN}]', ) + parser.add_argument( + '-c', + dest='comments', + action='store_true', + help=f'add comments with info to the output text files (for debugging)', + ) + args = parser.parse_args() batching = Batching(input_fn=args.input_fn, @@ -166,7 +181,8 @@ def main(): dustbin_max_size=args.dustbin_max_size, output_d=args.output_d, col_species=args.col_species, - col_fn=args.col_fn) + col_fn=args.col_fn, + comments=args.comments) batching.run() From e6a56c734bf8db020100d0deb4853df53b8af48f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Karel=20B=C5=99inda?= Date: Tue, 5 Mar 2024 10:36:05 +0100 Subject: [PATCH 2/2] Remove comments in the snakemake pipeline --- create_batches.py | 2 +- workflow/rules/init.smk | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/create_batches.py b/create_batches.py index 9fe6a22..385af79 100755 --- a/create_batches.py +++ b/create_batches.py @@ -40,7 +40,7 @@ def __init__(self, input_fn, cluster_min_size, cluster_max_size, self.clusters = collections.defaultdict(list) self.pseudoclusters = collections.defaultdict(list) self.batches = collections.defaultdict(list) - self.dbg_info = {} # fn -> dbg comments + self.dbg_info = {} # fn -> dbg comments def _load_clusters(self): with xopen(self.input_fn) as fo: diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk index 5e66caa..c0609be 100644 --- a/workflow/rules/init.smk +++ b/workflow/rules/init.smk @@ -50,7 +50,8 @@ for x in res: BATCHES_FN[batch] = {} with open(x) as f: for y in f: - sample_fn = y.strip() + z = y.strip() + sample_fn = z.split("\t")[0] if sample_fn: sample = _get_sample_from_fn(sample_fn) BATCHES_FN[batch][sample] = sample_fn