-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalculate_d_dplus_genomewide_cows.py
155 lines (138 loc) · 4.99 KB
/
calculate_d_dplus_genomewide_cows.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import random as rd
import os.path
import argparse
import gzip
import sys
#Only skip over missing data if missing data is part of the sample we want#Calculate D genomewide.
#python pytonName.py -infile /path/to/name.vcf.gz -outfile /path/to/outfile.csv -p1 cow_cow -p2 cow_monkey -p3 cow_bison -p4 cow_human -meta meta.csv
#make sure meta file has correct individual names and that population names don't include spaces
parser = argparse.ArgumentParser(description="Calulate D genomewide.")
parser.add_argument('-infile',action="store",dest="infile")
parser.add_argument('-outfile',action="store",dest="outfile")
parser.add_argument('-p1',action="store",dest="p1")
parser.add_argument('-p2',action="store",dest="p2")
parser.add_argument('-p3',action="store",dest="p3")
parser.add_argument('-p4',action="store",dest="p4")
parser.add_argument('-meta',action="store",dest="meta_file")
param=parser.parse_args()
def random_geno(genotype,sep):
geno_list = genotype.split(sep)
geno = rd.sample(geno_list,1)[0]
return(geno)
def isAncestral(allele,ancestral):
return(allele == ancestral)
#Populations to be used
populations = [param.p1,param.p2,param.p3,param.p4] #["bos_taurus","sanga_taurus","bos_indicus","bison_bison"]
pop_name_dicc = {}
with open(param.meta_file,"r") as fp:
for aline in fp:
aspline = aline.split()
pop_name_dicc[aspline[1]]=aspline[0]
#phased or unphased?
sep = "/"
#Mitochondrial tag
mito_tag = "NC_006853.1"
#Open file
if 'gz' in param.infile:
file = gzip.open(param.infile, 'r')
else:
file = open(param.infile, 'r')
line = file.readline()
#Define nucleotides
nucleotides = ["A","T","C","G"]
#Get header
while line.startswith('#'):
header = line.split()
line = file.readline()
#Get indices for populations to be used
pop_index_dicc = {}
for pop in populations:
ind = header.index(pop_name_dicc[pop])
pop_index_dicc[pop] = ind
#Keep count of abba and baba
abba = 0.0;baba = 0.0
#Keep count of baaa and abaa
baaa = 0.0;abaa = 0.0
counter = 0.0
#want to keep track of all sites, including shared alleles between pop 1 & 2 ex. BBAA
all_sites = 0.0
#Go through line by line
for line in file:
counter += 1
spline = line.split()
#Do not include mitochondrial data
if mito_tag in line:
continue
#Ignore missing data
if ".%s." %(sep) in line:
continue
#Only biallelic
if(nucleotides.count(spline[3])!=1 or nucleotides.count(spline[4])!=1):
continue
alleles = {}
pop_count = 1
for pop in populations:
if pop == "bos_taurus":
alleles["pop"+str(pop_count)] = "0"
pop_count+=1
else:
genotype_raw = spline[pop_index_dicc[pop]]
genotype = genotype_raw.split(':')[0]
alleles["pop"+str(pop_count)] = random_geno(genotype,sep)
pop_count+=1
#tracking all sites (counting) ex. BBAA #all sites equals informative & non-informative sites
all_sites += 1
#Ignore sites where p1 and p2 have same allele
if(alleles["pop1"] == alleles["pop2"]):
continue
#Differentiate between abba/baba and baaa/abaa
#Is p3 ancestral?
if(isAncestral(alleles["pop3"],alleles["pop4"])):
#Is it baaa or abaa:
if(isAncestral(alleles["pop2"],alleles["pop4"])):
baaa+=1
elif(isAncestral(alleles["pop1"],alleles["pop4"])):
abaa+=1
#Is it ABBA or BABA
elif not isAncestral(alleles["pop3"],alleles["pop4"]):
#ABBA: p1 has ancestral allele
if(isAncestral(alleles["pop1"],alleles["pop4"])):
abba += 1
elif(isAncestral(alleles["pop2"],alleles["pop4"])):
baba += 1
if(counter%10000==0):
print("On line {}. ABBA is {}. BABA is {}.BAAA is {}. ABAA is {}".format(counter,abba,baba,baaa,abaa))
file.close()
#Calculate D
try:
d = (abba - baba)/(abba + baba)
except ZeroDivisionError:
d = 'nan'
#Calculate D+
dplus_numerator = (abba - baba) + (baaa - abaa)
dplus_denom = (abba + baba) + (baaa + abaa)
try:
dplus = dplus_numerator/dplus_denom
except ZeroDivisionError:
dplus = 'nan'
# counting number of informative site for d
d_informative_sites = abba + baba
#counting number of informative sites for dplus
dplus_informative_sites = abaa + baaa
#Output results. If outfile already exists, append to it instead of creating outfile again
if(os.path.exists(param.outfile)):
fout = open(param.outfile,"a+")
outline = param.p1+"\t"+param.p2+"\t"+param.p3+"\t"+param.p4+"\t"
outline += str(all_sites)+"\t" + str(d_informative_sites)+"\t" + str(dplus_informative_sites)+"\t"
outline += str(abba) + "\t" + str(baba) +"\t"+ str(baaa) +"\t"+ str(abaa) +"\t"+ str(d) + "\t"+str(dplus)+"\n"
fout.write(outline)
fout.close()
else:
fout = open(param.outfile,"w")
outfile_header="p1\tp2\tp3\tp4\tall_sites\td_informative_sites\tdplus_informative_sites\tABBA\tBABA\tBAAA\tABAA\tD\tD+\n"
fout.write(outfile_header)
outline = param.p1+"\t"+param.p2+"\t"+param.p3+"\t"+param.p4+"\t"
outline += str(all_sites)+"\t" + str(d_informative_sites)+"\t" + str(dplus_informative_sites)+"\t"
outline += str(abba) + "\t" + str(baba) +"\t"+ str(baaa) +"\t"+ str(abaa) +"\t"+ str(d)+"\t"+str(dplus)+"\n"
fout.write(outline)
fout.close()