-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathheredity.py
198 lines (167 loc) · 6.42 KB
/
heredity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import sys
import csv
import itertools
from typing import Dict, List, Set
P = {
"G": {
0: {True: 0.96, False: 0.04},
1: {True: 0.03, False: 0.97},
2: {True: 0.01, False: 0.99}
},
"T|G": {
0: {True: 0.01, False: 0.99},
2: {True: 0.65, False: 0.35},
1: {True: 0.56, False: 0.44}
},
"M": {
True: 0.01,
False: 0.99
},
"M|G": {
0: {True: 0.01, False: 0.99},
1: {True: 0.5, False: 0.5},
2: {True: 0.99, False: 0.01}
}
}
def main():
if len(sys.argv) != 2:
sys.exit("Usage: python heredity.py data.csv")
people = load_data(sys.argv[1])
probabilities = initialize_probabilities(people)
calculate_probabilities(people, probabilities)
print_probabilities(people, probabilities)
def load_data(filename: str) -> Dict[str, dict]:
"""
Load gene and trait data from a file into a dictionary.
File assumed to be a CSV containing fields name, mother, father, trait.
mother, father must both be blank, or both be valid names in the CSV.
trait should be 0 or 1 if trait is known, blank otherwise.
"""
data = dict()
with open(filename) as file:
reader = csv.DictReader(file)
for row in reader:
name = row["name"]
data[name] = {
"name": name,
"mother": row["mother"] or None,
"father": row["father"] or None,
"trait": (True if row["trait"] == "1" else
False if row["trait"] == "0" else None)
}
return data
def initialize_probabilities(people: dict) -> Dict[str, dict]:
return {
person: {
"gene": {
0: 0,
1: 0,
2: 0
},
"trait": {
True: 0,
False: 0
}
}
for person in people
}
def calculate_probabilities(people: dict, probabilities: dict) -> None:
"""
Calculate the probabilities for each person having a gene and/or trait.
"""
names = set(people)
for have_trait in powerset(names):
# Check if current set of people violates known information
if evidence_violated(people, have_trait):
continue
# Loop over all sets of people who might have the gene
for one_gene in powerset(names):
for two_genes in powerset(names - one_gene):
# Update probabilities with new joint probability
p = joint_probability(people, one_gene, two_genes, have_trait)
update(probabilities, one_gene, two_genes, have_trait, p)
# Ensure probabilities sum to 1
normalize(probabilities)
def powerset(s: Set[str]) -> List[Set[str]]:
"""
Return a list of all possible subsets of set s.
"""
s = list(s)
return [
set(s) for s in itertools.chain.from_iterable(
itertools.combinations(s, r) for r in range(len(s) + 1)
)
]
def evidence_violated(people: dict, trait: set) -> bool:
"""
Check if the current set of people with the trait violates known information.
"""
return any(people[person]["trait"] != (person in trait)
for person in people
if people[person]["trait"] is not None)
def joint_probability(people: dict, one_gene: set, two_genes: set, trait: set) -> float:
"""
Compute and return a joint probability.
The probability returned should be the probability that
* everyone in set `one_gene` has one copy of the gene, and
* everyone in set `two_genes` has two copies of the gene, and
* everyone not in `one_gene` or `two_gene` does not have the gene, and
* everyone in set `have_trait` has the trait, and
* everyone not in set` have_trait` does not have the trait.
"""
probability = 1
for person in people:
num_genes = 1 if person in one_gene else 2 if person in two_genes else 0
have_trait = True if person in trait else False
mother = people[person]["mother"]
father = people[person]["father"]
if not mother and not father:
probability *= P["G"][num_genes][True] * P["T|G"][num_genes][have_trait]
else:
parents = dict()
for parent in [mother, father]:
n_genes = 2 if parent in two_genes else 1 if parent in one_gene else 0
parents[parent] = P["M|G"][n_genes]
if num_genes == 0:
probability *= parents[mother][False] * parents[father][False]
elif num_genes == 1:
probability *= (
parents[mother][True] * parents[father][False] +
parents[father][True] * parents[mother][False]
)
else:
probability *= parents[mother][True] * parents[mother][True]
probability *= P["T|G"][num_genes][have_trait]
return probability
def update(probabilities: dict, one_gene: set, two_genes: set, trait: set, p: float):
"""
Add to `probabilities` a new joint probability `p`.
Each person should have their "gene" and "trait" distributions updated.
Which value for each distribution is updated depends on whether
the person is in `have_gene` and `have_trait`, respectively.
"""
for person in probabilities:
num_genes = 1 if person in one_gene else 2 if person in two_genes else 0
probabilities[person]["gene"][num_genes] += p
probabilities[person]["trait"][person in trait] += p
def normalize(probabilities: dict) -> None:
"""
Update `probabilities` such that each probability distribution
is normalized (i.e., sums to 1, with relative proportions the same).
"""
for person in probabilities.copy():
for field in probabilities[person]:
total = sum(probabilities[person][field].values())
for value in probabilities[person][field]:
normalized = probabilities[person][field][value] / total
probabilities[person][field][value] = normalized
def print_probabilities(people: dict, probabilities: dict) -> None:
for person in people:
print(person)
for field in probabilities[person]:
print(f" {field.capitalize()}:")
for value in probabilities[person][field]:
p = probabilities[person][field][value]
print(f" {value}: {p:.4f}")
if __name__ == "__main__":
main()