forked from capoony/ABBABABA-4AF
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSYNC2AF.py
105 lines (83 loc) · 3.07 KB
/
SYNC2AF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import sys
from optparse import OptionParser, OptionGroup
# Author: Martin Kapun
######################################################### HELP #########################################################################
usage = "python %prog --sync input.sync > output.af "
parser = OptionParser(usage=usage)
group = OptionGroup(parser, '''
''')
######################################################### CODE #########################################################################
parser.add_option("--sync", dest="IN", help="Input file")
parser.add_option("--MinCov", dest="mc",
help="minor coverage threshold", default=10)
(options, args) = parser.parse_args()
parser.add_option_group(group)
def sync2freqh(x, ALL, mc):
''' convert string in SYNC format to dictionary of freqencies where x is a string in sync format'''
from collections import defaultdict as d
if x == ".:.:.:.:.:." or x == "0:0:0:0:0:0":
return "na", "na"
nuc = ["A", "T", "C", "G"]
counts = [int(X) for X in x.split(":")[:4]]
if sum(counts) == 0:
return ({"A": 0.0, "T": 0.0, "C": 0.0, "G": 0.0}, 0)
CO = {k: v for k, v in list(zip(*[nuc, counts]))if k in ALL}
h = d(float)
if sum(CO.values()) < mc:
return "na", "na"
for k, v in CO.items():
h[k] = v / float(sum(CO.values()))
return h, sum(CO.values())
def all_alleles(v):
''' returns most common strings'''
from collections import Counter
nuc = v.replace("N", "")
if len(nuc) == 0 or len(set(nuc)) < 2:
return "NA"
return list(zip(*Counter(nuc).most_common()))[0]
def sync2string(x):
''' convert sync format to string of nucleotides where x is a string in sync format '''
string = ""
alleles = ["A", "T", "C", "G"]
if x == ".:.:.:.:.:." or x == "0:0:0:0:0:0":
return "na"
ah = dict(zip(alleles, [int(X) for X in x.split(":")[:4]]))
for k, v in ah.items():
string += v * k
return string
def load_data(x):
''' import data either from a gzipped or or uncrompessed file or from STDIN'''
import gzip
if x == "-":
y = sys.stdin
elif x.endswith(".gz"):
y = gzip.open(x, "rt", encoding="latin-1")
else:
y = open(x, "r", encoding="latin-1")
return y
for l in load_data(options.IN):
# split in columns
a = l.rstrip().split()
ID = a[0] + ":" + a[1]
# make long string of all summed-up alleles across all populations
AlleleStrings = ""
for pop in a[3:]:
Counts = sync2string(pop)
if Counts == "na":
continue
AlleleStrings += Counts
# identify two most common alleles
MA = all_alleles(AlleleStrings)[:2]
# skip if position invariate
if MA == "NA":
continue
fl = []
# calculate and print frequencies of major alleles
for pop in a[3:]:
FH = sync2freqh(pop, MA, int(options.mc))
if "na" in FH:
fl.append("NA")
continue
fl.append(FH[0][MA[0]])
print("\t".join(a[:2]) + "\t" + "/".join(MA[:2])
+ "\t" + "\t".join([str(x) for x in fl]))