# statistics.py (forked from jonathan-taylor/compare-selection)
import time

import numpy as np
import pandas as pd


def interval_statistic(method, instance, X, Y, beta, l_theory, l_min, l_1se, sigma_reid):
    """Fit `method` on one simulated instance and collect its confidence intervals.

    `instance` is unused here but keeps the signature shared with FDR_statistic.
    """
    tic = time.time()
    M = method(X.copy(), Y.copy(), l_theory.copy(), l_min, l_1se, sigma_reid)
    active, lower, upper = M.generate_intervals()
    target = M.get_target(active, beta)  # for now limited to Gaussian methods
    toc = time.time()
    if len(active) > 0:
        value = pd.DataFrame({'active_variable': active,
                              'lower_confidence': lower,
                              'upper_confidence': upper,
                              'target': target})
        value['Time'] = toc - tic
        return M, value
    else:
        return M, None


def interval_summary(result):
    """Summarize interval results across replicates in a one-row DataFrame."""
    coverage = (np.asarray(result['lower_confidence'] <= result['target']) &
                np.asarray(result['upper_confidence'] >= result['target']))
    length = result['upper_confidence'] - result['lower_confidence']
    instances = result.groupby('instance_id')
    # average number of active (selected) variables per instance
    active_length = np.mean([len(g.index) for _, g in instances])
    # XXX we should group by instances before averaging and computing SD
    value = pd.DataFrame([[len(np.unique(result['instance_id'])),
                           np.mean(coverage),
                           np.std(coverage),
                           np.median(length),
                           np.mean(length),
                           active_length,
                           result['model_target'].values[0]]],
                         columns=['Replicates',
                                  'Coverage',
                                  'SD(Coverage)',
                                  'Median length',
                                  'Mean length',
                                  'Active',
                                  'Model'])
    # carry over any column that is constant across all rows
    for n in result.columns:
        if len(np.unique(result[n])) == 1:
            value[n] = result[n].values[0]
    return value
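

# A sketch of how the interval pipeline might be driven. `SomeGaussianMethod`,
# `some_instance` and `ndraw` are hypothetical stand-ins for a method class,
# a data-generating instance and a replicate count from the surrounding
# benchmark, not names defined in this module; the tuning parameters
# (l_theory, l_min, l_1se, sigma_reid) are assumed to come from the caller.
#
#     results = []
#     for instance_id in range(ndraw):
#         X, Y, beta = some_instance.generate()
#         M, df = interval_statistic(SomeGaussianMethod, some_instance, X, Y,
#                                    beta, l_theory, l_min, l_1se, sigma_reid)
#         if df is not None:
#             df['instance_id'] = instance_id
#             df['model_target'] = model_target  # label expected by interval_summary
#             results.append(df)
#     summary = interval_summary(pd.concat(results))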


def FDR_statistic(method, instance, X, Y, beta, l_theory, l_min, l_1se, sigma_reid):
    """Run `method` on one instance and record power, false discoveries and FDP."""
    tic = time.time()
    M = method(X.copy(), Y.copy(), l_theory.copy(), l_min, l_1se, sigma_reid)
    selected, active = M.select()
    toc = time.time()
    true_active = np.nonzero(beta)[0]
    if active is not None:
        TD = instance.discoveries(selected, true_active)  # true discoveries
        FD = len(selected) - TD                           # false discoveries
        FDP = FD / max(TD + FD, 1)                        # false discovery proportion
        return M, pd.DataFrame([[TD / len(true_active), FD, FDP, toc - tic, len(active)]],
                               columns=['Full model power',
                                        'False discoveries',
                                        'Full model FDP',
                                        'Time',
                                        'Active'])
    else:
        return M, pd.DataFrame([[0, 0, 0, toc - tic, 0]],
                               columns=['Full model power',
                                        'False discoveries',
                                        'Full model FDP',
                                        'Time',
                                        'Active'])
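
# Worked example of the arithmetic above: with 10 true signals and 8 selected
# variables of which 6 are true discoveries, TD = 6, FD = 2, so
# FDP = 2 / max(8, 1) = 0.25 and full-model power = 6 / 10 = 0.6.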


def FDR_summary(result):
    """Summarize FDR results across replicates in a one-row DataFrame."""
    nresult = result['Full model power'].shape[0]
    # note: the 'SD(...)' columns report standard errors of the mean
    value = pd.DataFrame([[nresult,
                           np.mean(result['Full model power']),
                           np.std(result['Full model power']) / np.sqrt(nresult),
                           np.mean(result['False discoveries']),
                           np.mean(result['Full model FDP']),
                           np.std(result['Full model FDP']) / np.sqrt(nresult),
                           np.mean(result['Time']),
                           np.mean(result['Active']),
                           result['model_target'].values[0]]],
                         columns=['Replicates',
                                  'Full model power',
                                  'SD(Full model power)',
                                  'False discoveries',
                                  'Full model FDR',
                                  'SD(Full model FDR)',
                                  'Time',
                                  'Active',
                                  'Model'])
    # carry over any column that is constant across all rows
    for n in result.columns:
        if len(np.unique(result[n])) == 1:
            value[n] = result[n].values[0]
    return value
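

# A sketch of driving the FDR pipeline, mirroring the interval example above;
# `SomeMethod`, `some_instance` and `ndraw` are again hypothetical placeholders
# for names supplied by the surrounding benchmark.
#
#     rows = []
#     for instance_id in range(ndraw):
#         X, Y, beta = some_instance.generate()
#         M, df = FDR_statistic(SomeMethod, some_instance, X, Y,
#                               beta, l_theory, l_min, l_1se, sigma_reid)
#         df['instance_id'] = instance_id
#         df['model_target'] = model_target  # label expected by FDR_summary
#         rows.append(df)
#     summary = FDR_summary(pd.concat(rows))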