-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch-kmer-counts.py
43 lines (35 loc) · 1.44 KB
/
fetch-kmer-counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import os
import sys
import subprocess
from collections import defaultdict
PREFIX_LEN=4
K=40
DAYS=14
kmers_by_prefix = defaultdict(list)
existing_s3_files = set()
def start(spikes_fname, job_name):
with open(spikes_fname) as inf:
for line in inf:
kmer = line.split('\t')[0]
kmers_by_prefix[kmer[:PREFIX_LEN]].append(kmer)
s3_raw_output = subprocess.check_output(
["aws", "s3", "ls", "s3://prjna729801/"])
for record in s3_raw_output.decode('utf-8').split('\n'):
if not record.strip(): continue
existing_s3_files.add(record.split()[-1])
for prefix, kmers in kmers_by_prefix.items():
trie_output_fname = "%s-%s-%s.tsv.gz" % (prefix, K, DAYS)
kmer_count_fname = "%s-%s-%s-%s.tsv" % (job_name, prefix, K, DAYS)
if (trie_output_fname in existing_s3_files and
not os.path.exists(kmer_count_fname)):
print("Extracting %s kmers from %s and writing to %s..." % (
len(kmers), trie_output_fname, kmer_count_fname))
subprocess.check_call(
"aws s3 cp s3://prjna729801/%s-%s-%s.tsv.gz - | "
"gunzip | "
"grep '%s' > %s || echo '%s: no matching kmers'" % (
prefix, K, DAYS,
"\\|".join('^%s' % kmer for kmer in kmers),
kmer_count_fname, prefix), shell=True)
if __name__ == "__main__":
start(*sys.argv[1:])