-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhitting_set.py
356 lines (292 loc) · 15.1 KB
/
hitting_set.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
# Top-level code file for modeling the selction of personalized tumor targets as a hitting set problem
# Authors: Pattara Sukprasert & Saba Ahmadi
from __future__ import print_function
import argparse
import pprint
from typing import List, Tuple, Any, Optional, Dict, Set
import sys
import time
# from mh_util import eprint, args, Model, quicksum # Flags are defined here.
from mh_util import *
from patient import Patient
def print_fraction_killed_normal_cells(model: Model, patients: List[Patient]) -> None:
""" Computing and outputing the fraction of normall cells killed for each patient (w.r.t to the given model)
Args:
model: An ILP model, should store the answer of the ILP isntance.
patients: A list of patients.
Returns:
None
"""
average_hit_normal_cells=0
for patient in patients:
hit_normal_cells = set()
for non_tumor_cell_id, normal_cell_name in enumerate(patient.non_tumor_cell_names):
for (gene_name, gene_value) in patient.non_tumor_genes:
if get_val(model, patient.local_vars[gene_name]) > 0.5:
if patient.covers(gene_value[non_tumor_cell_id], gene_name): #check if expression of this gene is high enough for cell to be killed
hit_normal_cells.add(normal_cell_name)
if (len(patient.non_tumor_cell_names) != 0):
ratio_hit_normall_cells = float(len(hit_normal_cells)/len(patient.non_tumor_cell_names))
else:
ratio_hit_normall_cells = 0
print('The percentage of non-cancer cells killed/targeted by the optimal solution for {}: {}'.format(patient.source, ratio_hit_normall_cells))
average_hit_normal_cells += ratio_hit_normall_cells
print ('The average percentage of non-cancer cells killed/targeted by the optimal solution across all the patients', float(average_hit_normal_cells/len(patients)))
def uncovered_cancer_cells_plus_hitting_set_size(model: Model, patients: List[Patient]) -> None:
""" Computing and printing, for each patient, the size of the hitting set plus the number of tumor cells that are not killed.
Args:
model: An ILP model, should store the answer of the ILP isntance.
patients: A list of patients.
Returns:
None
"""
average = 0
for patient in patients:
hitting_set_size = 0
for (gene_name, _) in patient.genes:
if get_val(model, patient.local_vars[gene_name]) > 0.5:
hitting_set_size +=1
# Count the number of unhittable tumor_cells.
count_unhittable_tumor_cells = 0
for (tumor_cell_id, _) in enumerate(patient.tumor_cell_names) :
unhittable = True
for (gene_name, gene_value) in patient.genes:
if patient.covers(gene_value[tumor_cell_id], gene_name):
unhittable = False
if unhittable is True:
count_unhittable_tumor_cells +=1
total_patient = hitting_set_size + count_unhittable_tumor_cells
average += total_patient
print('The number of uncovered cancer cells plus size of the hitting set for {}: {}'.format(patient.source, total_patient))
print('The average of the number of uncovered cancer cells plus size of the hitting set across all the patients', float(average/len(patients)))
def average_hitting_set_size(model: Model, patients: List[Patient]) -> None:
""" Computing and Printing the average hitting set size among all patients.
Args:
model: An ILP model, should store the answer of the ILP isntance.
patients: A list of patients.
Returns:
None
"""
average = 0
for patient in patients:
hitting_set_size = 0
for (gene_name, _) in patient.genes:
if get_val(model, patient.local_vars[gene_name]) > 0.5:
hitting_set_size +=1
average += hitting_set_size
print('Size of the hitting set for {}: {}'.format(patient.source, hitting_set_size))
print('The average hitting set across all the patients', float(average/len(patients)))
def solve_ILP(args: argparse.Namespace) -> Tuple[Model, set, List[Patient]]:
""" Construct and solve the ILP program to find hitting set for the given args.
Args:
args: The parameters given by the program (see parse() for the list of flags).
Returns:
model: The solved ILP model.
global_instance_vars: List of variables of the model indexed by names.
patients: List of patients whose involved in the ILP.
"""
patients = list(Patient(args, idx) for idx, _ in enumerate(args.tumor_files))
model = Model()
q_sum = quicksum
global_instance_vars = {} #this set is changed in the call patient.add_to_model
for patient in patients:
assert isinstance(patient, Patient)
if args.use_absolute_model is False:
patient.solve_local()
print(patient)
patient.add_to_model(model, global_instance_vars)
if args.use_gurobi:
model.setParam(GRB.param.PoolSearchMode, 2)
model.setParam(GRB.param.PoolSolutions, args.num_sol)
model.setObjective(q_sum(var for (_, var) in global_instance_vars.items()))
model.optimize() #call the optimization procedure
assert model.status == GRB.OPTIMAL, (
'No feasible answer for the global instance. '
'This should not be the case since all the local instances can be solved.'
)
else:
#build the objective function, which is the sum of a variable for each of the possible gene used in the possible hitting set
model.setObjective(q_sum(var for (_, var) in global_instance_vars.items()))
if args.silent:
model.hideOutput()
model.optimize() #call the optimization procedure
assert model.getStatus() == 'optimal', (
'No feasible answer for the global instance. '
'This should not be the case since all the local instances can be solved.'
)
return model, global_instance_vars, patients
def solve_greedy(args: argparse.Namespace) -> Tuple[List[Dict[str, Any]], List[Patient]]:
""" Find hitting set for the given args using the greedy approach.
Args:
args: The parameters given by the program (see parse() for the list of flags).
Returns:
global_solutions: List of solutions of different sizes.
patients: List of Patients.
"""
eprint("Start greedy process.")
# Create Patient objects
patients = list(Patient(args, idx) for idx, _ in enumerate(args.tumor_files))
eprint("Patient objects are created.")
def acc_genes() -> Set[str]:
""" Return the list of genes needed to be consider for all patients combined.
"""
# Accumulate all genes
genes = set()
for patient in patients:
genes = genes.union(set(name for name,_ in patient.genes))
return genes
def get_avg_effectiveness(selected_set: Set[str]):
""" Return the average effectiveness of the set of genes for all patients.
"""
ret = 0.0
for patient in patients:
ret = ret + patient.get_effectiveness(selected_set)
ret = ret / len(patients)
return ret
eprint("Accumulating genes from patients.")
genes = sorted(acc_genes())
gene_pairs = []
# Calculate pair of genes
eprint("Building gene pairs from:", len(genes), "genes")
for i in range(0, len(genes)):
for j in range(i, len(genes)):
pair = {
'genes': set([genes[i], genes[j]]),
'eff': get_avg_effectiveness(set([genes[i], genes[j]]))
}
gene_pairs.append(pair)
if i == j :
eprint(genes[i], "Effectiveness: ", pair['eff'])
if ( len(gene_pairs) % 10000 ) == 0:
eprint("Built:", len(gene_pairs), "pairs so far.")
gene_pairs = sorted(gene_pairs, key=lambda e: e['eff'], reverse=True)
eprint("Gathered all the gene pairs in sorted order.")
eprint("Run local greedy with all gene pairs.")
# Run local greedy on patients
solutions = []
for p in patients:
solutions.append(p.greedy(gene_pairs))
eprint("Finish running local greedy.")
eprint("Flatten the genes to get 1-d list in sorted order.")
# Flatten the genes in 1-d array
# This will also remove unused genes.
flatten_genes = []
for pair in gene_pairs:
for e in sorted(pair['genes']):
if e not in flatten_genes:
for s in solutions:
if e in s[-1]['selected_genes']:
flatten_genes.append(e)
break
eprint("Finish flattening the genes.")
eprint("Finding answers of all sizes.")
# Crate k-sized solutions
global_solutions = []
chosen_genes = set()
for gene in flatten_genes:
eprint("Adding", gene)
chosen_genes.add(gene)
k_sized_soln = []
for local_sol in solutions:
copy_sol = local_sol.copy()
for x in local_sol:
if not x['selected_genes'].issubset(chosen_genes):
copy_sol.remove(x)
k_sized_soln.append(copy_sol)
global_solutions.append(
{
'genes': chosen_genes.copy(),
'local_solutions': k_sized_soln
}
)
return global_solutions, patients
def main():
""" The main routine of the program.
"""
# Keep track of the running time.
start = time.time()
if args.use_greedy: # If true, then find the hitting set using greedy approach.
# Initiate a PrettyPrinter (used to format long output).
pp = pprint.PrettyPrinter(indent = 4)
# Run the greedy algorithm.
solutions, patients = solve_greedy(args)
# Print to stderr, the whole output.
eprint(pp.pformat(solutions))
# The actual output to stdout will only depend on the largest solution.
output = solutions[-1]
"""
The following part will output various information that will be processed thereafter.
"""
for i in range(len(output['local_solutions'])):
output['local_solutions'][i] = output['local_solutions'][i][-1]
print('The selected genes are: [{}]'.format(' '.join(output['genes'])))
average_hit_normal_cells = 0
g_average_hitting_set_size = 0
average_uncovered_plus_hitting_set = 0
for i in range(len(patients)):
print('The selected genes for', patients[i].source,': [{}]'.format(' '.join(output['local_solutions'][i]['selected_genes'])))
print('Size of the hitting set for {}: {}'.format(patients[i].source, len(output['local_solutions'][i]['selected_genes'])))
g_average_hitting_set_size += len(output['local_solutions'][i]['selected_genes'])
print('Kill/Hittable/Tumor cells = ', output['local_solutions'][i]['tumor_cells_killed'], '/', patients[i].hittable_cell_size, '/', len(patients[i].tumor_cell_names))
print('The number of uncovered cancer cells plus size of the hitting set for {}: {}'.format(
patients[i].source, len(patients[i].tumor_cell_names) \
- output['local_solutions'][i]['tumor_cells_killed']
))
average_uncovered_plus_hitting_set += len(patients[i].tumor_cell_names) - output['local_solutions'][i]['tumor_cells_killed']
if args.non_tumor_files:
print('Non-tumor killed/Non-tumor cells = ',
output['local_solutions'][i]['non_tumor_cells_killed'],
'/',
len(patients[i].non_tumor_cell_names)
)
if len(patients[i].non_tumor_cell_names) > 0:
ratio_hit_normal_cells =\
float(output['local_solutions'][i]['non_tumor_cells_killed'] / len(patients[i].non_tumor_cell_names))
average_hit_normal_cells += ratio_hit_normal_cells
else:
ratio_hit_normal_cells = 0
print('The percentage of non-cancer cells killed/targeted by the optimal solution for {}: {}'
.format(patients[i].source, ratio_hit_normal_cells))
print('Size of global hitting set is:', len(output['genes']))
if args.non_tumor_files:
print ('The average percentage of non-cancer cells killed/targeted by the optimal solution across all the patients', float(average_hit_normal_cells/len(patients)))
print('The average hitting set across all the patients', float(g_average_hitting_set_size)/len(patients))
print('The average of the number of uncovered cancer cells plus size of the hitting set across all the patients', float(average_uncovered_plus_hitting_set)/len(patients))
else: # Use SCIP/Gurobi to solve ILP.
# Solve the ILP.
model, global_instance_vars, patients = solve_ILP(args)
"""
The following part will output various information that will be processed thereafter.
"""
loop_cnt = 1
if (args.use_gurobi):
loop_cnt = args.num_sol
model_opt = get_obj_val(model)
for i in range(loop_cnt):
if args.use_gurobi:
select_solution(model, i)
if get_obj_val(model) > model_opt:
break
print('=====Solution #{}====='.format(i))
print('The optimal global hitting set is {}'.format(get_obj_val(model)))
# The variables are the genes and any gene assigned a value > 0.5 is considered to be in the global optimal hitting set.
global_ans = list(var_name for (var_name, var) in global_instance_vars.items()
if get_val(model, var) > 0.5)
print('The selected genes are: [{}]'.format(' '.join(global_ans)))
# List those genes that are expressed in each patient.
for patient in patients:
local_ans = list(gene_name for (gene_name, _) in patient.genes
if get_val(model, patient.local_vars[gene_name]) > 0.5)
print('The selected genes for {}: [{}]'.format(patient.source, ' '.join(local_ans)))
print ('Size of global hitting set is:', len(global_ans))
#To compute the fraction of normal cells in each patient hit by the optimal solution
if args.non_tumor_files:
print_fraction_killed_normal_cells(model, patients)
average_hitting_set_size(model, patients)
uncovered_cancer_cells_plus_hitting_set_size(model, patients)
# Output the running time to stderr.
end = time.time()
eprint("The process took:", end - start, "seconds.")
# Call main().
if __name__ == '__main__':
main()