Skip to content

Commit

Permalink
Merge pull request #91 from karel-brinda/dbg
Browse files Browse the repository at this point in the history
Add a -c option for adding comments to batching output
  • Loading branch information
karel-brinda authored Mar 5, 2024
2 parents 9fe6107 + e6a56c7 commit 4240186
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 4 deletions.
22 changes: 19 additions & 3 deletions create_batches.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,24 @@
def clean_species_name(name):
    """Turn a species name into a lowercase, underscore-separated identifier.

    Every run of characters other than ASCII letters, digits and spaces is
    removed, each remaining space is replaced by an underscore (consecutive
    spaces yield consecutive underscores), and the result is lowercased.
    """
    alnum_and_spaces = re.sub('[^a-zA-Z0-9 ]+', '', name)
    # Splitting on a single literal space and re-joining with "_" maps every
    # space to exactly one underscore, without collapsing repeated spaces.
    return "_".join(alnum_and_spaces.split(" ")).lower()


class Batching:

def __init__(self, input_fn, cluster_min_size, cluster_max_size,
dustbin_max_size, output_d, col_species, col_fn):
dustbin_max_size, output_d, col_species, col_fn, comments):
self.input_fn = input_fn
self.cluster_min_size = cluster_min_size
self.cluster_max_size = cluster_max_size
self.dustbin_max_size = dustbin_max_size
self.output_d = output_d
self.col_species = col_species
self.col_fn = col_fn
self.comments = comments

self.clusters = collections.defaultdict(list)
self.pseudoclusters = collections.defaultdict(list)
self.batches = collections.defaultdict(list)
self.dbg_info = {} # fn -> dbg comments

def _load_clusters(self):
with xopen(self.input_fn) as fo:
Expand All @@ -48,6 +51,7 @@ def _load_clusters(self):
species = clean_species_name(x[self.col_species])
fn = x[self.col_fn]
self.clusters[species].append(fn)
self.dbg_info[fn] = species
print(
f"Loaded {genome_count} genomes across {len(self.clusters)} species clusters",
file=sys.stderr)
Expand Down Expand Up @@ -91,7 +95,11 @@ def _write_batches(self):
for batch_name, l in self.batches.items():
fn = os.path.join(self.output_d, f"{batch_name}.txt")
with open(fn, "w+") as f:
f.write("\n".join(l) + "\n")
for x in l:
if self.comments:
f.write(f"{x}\t#{self.dbg_info[x]}\n")
else:
f.write(f"{x}\n")
print(f"Finished", file=sys.stderr)

def run(self):
Expand Down Expand Up @@ -158,6 +166,13 @@ def main():
help=f'column name with filename [{DEFAULT_COLUMN_FN}]',
)

parser.add_argument(
'-c',
dest='comments',
action='store_true',
help=f'add comments with info to the output text files (for debugging)',
)

args = parser.parse_args()

batching = Batching(input_fn=args.input_fn,
Expand All @@ -166,7 +181,8 @@ def main():
dustbin_max_size=args.dustbin_max_size,
output_d=args.output_d,
col_species=args.col_species,
col_fn=args.col_fn)
col_fn=args.col_fn,
comments=args.comments)
batching.run()


Expand Down
3 changes: 2 additions & 1 deletion workflow/rules/init.smk
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ for x in res:
BATCHES_FN[batch] = {}
with open(x) as f:
for y in f:
sample_fn = y.strip()
z = y.strip()
sample_fn = z.split("\t")[0]
if sample_fn:
sample = _get_sample_from_fn(sample_fn)
BATCHES_FN[batch][sample] = sample_fn
Expand Down

0 comments on commit 4240186

Please sign in to comment.