-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresult_analyser.py
251 lines (221 loc) · 10.9 KB
/
result_analyser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
import csv
import numpy as np
import json
import scipy.stats as scipy_stats
from distribution_fit_lib import best_fit_distribution, get_params_description
# Gender buckets that are reported; every kept respondent is pooled in 'All'.
GENDER_LIST = [
    # 'Male',
    # 'Female',
    'All',
]
# Answers to the finance-experience question that keep a respondent.
FINANCE_EXP_LIST = [
    'Yes',
    'No',
]
# When False, rows whose "rejected" column is 'Yes' are skipped.
KEEP_REJECTED = False


def is_empty(x):
    """Return True when *x* is the empty string."""
    return x == ''


def is_invalid_answer(x):
    """Return True when *x* is empty or is the literal 'No'."""
    return is_empty(x) or x == 'No'


def format_sus(sus):
    """Convert SUS answers to ints; an empty answer becomes 3."""
    return [3 if item == '' else int(item) for item in sus]


def format_answers(answers):
    """Map each answer to 1 when it is exactly 'Yes', otherwise 0."""
    return [1 if item == 'Yes' else 0 for item in answers]


def format_time(time_str):
    """Parse the first two ':'-separated fields of *time_str* as ints."""
    return tuple(int(part) for part in time_str.split(':')[:2])
def time_to_seconds(time):
    """Convert an ``(hours, minutes)`` pair into a number of seconds."""
    hours, minutes = time
    return 3600 * hours + 60 * minutes
def compute_sus_score(sus):
    """Compute a usability score from a sequence of scale answers.

    Answers at even indexes (1st, 3rd, ...) contribute their total minus 5;
    answers at odd indexes (2nd, 4th, ...) contribute 25 minus their total.
    The two contributions are added and scaled by 2.5.
    The constants presumably assume 10 answers on a 1-5 scale - confirm.
    """
    positively_worded = sus[0::2]
    negatively_worded = sus[1::2]
    positive_part = sum(positively_worded) - 5
    negative_part = 25 - sum(negatively_worded)
    return 2.5 * (positive_part + negative_part)
def get_stat_dict(value_list):
    """Summarise *value_list* with its median, mean and standard deviation.

    Returns a dict with the keys 'median', 'mean' and 'std', computed with
    the corresponding numpy functions.
    """
    # ('max', max) and ('min', min) are currently disabled.
    reducers = (
        ('median', np.median),
        ('mean', np.mean),
        ('std', np.std),
    )
    summary = {}
    for stat_name, reducer in reducers:
        summary[stat_name] = reducer(value_list)
    return summary
# Load both questionnaires into one list. Every CSV row is prefixed with a
# plugin flag: 'Yes' for the alternative questionnaire, 'No' for the baseline.
row_list = []
with open('data/questionnaire_alternative.csv', newline='') as csvfile:
    row_list.extend(
        ['Yes'] + fields for fields in csv.reader(csvfile, delimiter=',')
    )
with open('data/questionnaire_baseline.csv', newline='') as csvfile:
    row_list.extend(
        ['No'] + fields for fields in csv.reader(csvfile, delimiter=',')
    )
# Per-plugin ('Yes' / 'No'), per-gender lists of processed respondent records.
experiment_sus_dict = {
    plugin_flag: {g: [] for g in GENDER_LIST}
    for plugin_flag in ('Yes', 'No')
}

# Number of columns in each section of a questionnaire row.
INTRO_QUESTIONS = 7          # columns unpacked before the effectiveness block
EFFECTIVENESS_QUESTIONS = 7  # each one occupies two columns in a row
END_QUESTIONS = 2            # end time and suggestion
SUS_QUESTIONS = 10
# Column layout of a tagged row: row[0] is the plugin flag prepended on load,
# then the intro columns, the effectiveness columns (two per question), the
# end questions and finally the SUS answers.
answers_start = 1 + INTRO_QUESTIONS
answers_end = answers_start + EFFECTIVENESS_QUESTIONS * 2
sus_start = answers_end + END_QUESTIONS

# Only the very first row of row_list is skipped here (presumably the header
# of the first CSV). NOTE(review): a header row of the second CSV is not
# skipped explicitly; it is only dropped if it fails one of the filters below.
for row in row_list[1:]:
    plugin = row[0]
    # Rejected, Timestamp, Gender, Age,
    # Do you have experience with Credit Approval Systems or Finance,
    # What browser are you using?, What time is it NOW?
    (rejected, timestamp, gender, age, finance_experience, browser,
     start_time_str) = row[1:answers_start]
    answers_and_questions = row[answers_start:answers_end]
    end_time_str, suggestion = row[answers_end:sus_start]
    sus = row[sus_start:]

    if not KEEP_REJECTED and rejected == 'Yes':
        continue
    # Filter by previous experience
    if finance_experience not in FINANCE_EXP_LIST:
        continue

    end_time = format_time(end_time_str)
    start_time = format_time(start_time_str)
    # NOTE(review): only hours and minutes are compared, so a session that
    # crosses midnight yields a negative duration.
    elapsed_seconds = time_to_seconds(end_time) - time_to_seconds(start_time)

    # Columns come in pairs, "Qn - Correct" followed by the free-text answer:
    # Q1 - What automated process was used by the Bank to decide whether to
    #      give a loan?
    # Q2 - What are the known issues of the automated processes (of the
    #      specific Credit Approval System) used by the Bank?
    # Q3 - What did the Credit Approval System decide for Customer 25?
    # Q4 - What is the "Average age of accounts in months" in this specific
    #      context?
    # Q5 - What is the value of "Average age of accounts in months" that the
    #      Bank associated to Customer 25?
    # Q6 - What should Customer 25 do in order to get its loan application
    #      accepted?
    # Q7 - What is the smallest change to the "Average age of accounts in
    #      months" that Customer 25 should do in order to have its loan
    #      application accepted by the Bank?
    # Keep only the first column of every pair.
    answers = answers_and_questions[::2]
    # answers = [answers[0], answers[2], answers[3]] # only answers 1, 3, 4

    # Ignore if all answers are empty/nonsensical
    if all(is_empty(answer) for answer in answers):
        continue

    # SUS statements:
    # 1- I think that I would like to use these explanations frequently.
    # 2- I found the explanations unnecessarily complex.
    # 3- I thought the explanations were clear to understand.
    # 4- I think that I would need the support of an expert to be able to
    #    understand the explanations of the system.
    # 5- I found the various explanation bits in this system were well
    #    integrated.
    # 6- I thought there was too much inconsistency in the explanation bits.
    # 7- I would imagine that most people would learn to understand these
    #    explanations very quickly.
    # 8- I found the explanations very cumbersome to understand.
    # 9- I felt very confident understanding the explanations.
    # 10- I needed to learn a lot of things before I could get going with
    #     these explanations.
    # Ignore if any SUS answer is empty
    if any(is_empty(item) for item in sus):
        continue

    answers = format_answers(answers)
    sus = format_sus(sus)
    sus_score = compute_sus_score(sus)
    # print(plugin, sus_score)
    row_dict = {
        'Elapsed Seconds': elapsed_seconds,
        'Effectiveness': sum(answers),
        'Satisfaction': sus_score,
        'scale': sus,
    }

    groups = experiment_sus_dict[plugin]
    # Look up and index the bucket with the SAME normalised key; the raw
    # value may differ in case from the names used in GENDER_LIST.
    gender_key = gender.capitalize()
    if gender_key != 'All' and gender_key in groups:
        groups[gender_key].append(row_dict)
    # The pooled bucket only exists when 'All' is enabled in GENDER_LIST.
    if 'All' in groups:
        groups['All'].append(row_dict)
# Summary statistics per plugin flag and gender bucket.
result_dict = {
    plugin_flag: {g: [] for g in GENDER_LIST}
    for plugin_flag in ('Yes', 'No')
}
for gender in GENDER_LIST:
    for key, sus_dict in experiment_sus_dict.items():
        records = sus_dict[gender]
        score_list = [record['Satisfaction'] for record in records]
        scale_list = [record['scale'] for record in records]
        efficacy_list = [record['Effectiveness'] for record in records]
        seconds_list = [record['Elapsed Seconds'] for record in records]

        key_result_dict = {}
        key_result_dict['test_count'] = len(score_list)
        key_result_dict['Elapsed Seconds'] = get_stat_dict(seconds_list)
        key_result_dict['Satisfaction'] = get_stat_dict(score_list)
        key_result_dict['Effectiveness'] = get_stat_dict(efficacy_list)

        # Transpose the per-respondent scales into per-question columns and
        # summarise each question separately.
        question_dict = {}
        median_sus = []
        for e, q_list in enumerate(zip(*scale_list)):
            question_stats = get_stat_dict(q_list)
            question_dict[e] = question_stats
            median_sus.append(question_stats['median'])
        key_result_dict['question_dict'] = question_dict
        # Score obtained from the per-question medians.
        key_result_dict['median_score'] = compute_sus_score(median_sus)

        result_dict[key][gender] = key_result_dict
print('stats:', json.dumps(result_dict, indent=4))
# This test can be used to investigate whether two independent samples were selected from populations having the same distribution.
'''
A low pvalue implies that Elapsed Seconds in "No" are statistically greater than Elapsed Seconds in "Yes".
A high pvalue implies that Elapsed Seconds in "No" are not statistically greater than Elapsed Seconds in "Yes".
'''
def test_hypothesis(a, b):
    """Compare two independent samples with non-parametric tests.

    Args:
        a: ``(values, label)`` pair for the first sample.
        b: ``(values, label)`` pair for the second sample.

    Returns:
        A dict with two entries:

        * ``'mannwhitneyu'``: a dict keyed ``'<b_label> is two-sided'``,
          ``'<b_label> is less'`` and ``'<b_label> is greater'``, each
          holding the SciPy Mann-Whitney U result for that alternative.
        * ``'kruskal'``: the SciPy Kruskal-Wallis result for both samples.
    """
    a_value, _ = a
    b_value, b_label = b

    # (A distribution-fit comparison based on best_fit_distribution used to
    # be sketched here; it is disabled.)

    mannwhitneyu_dict = {}
    for alternative in ('two-sided', 'less', 'greater'):
        # SciPy's `alternative` describes the FIRST sample relative to the
        # second one. The keys talk about b ("<b_label> is less"), so b must
        # be passed first; otherwise the one-sided p-values end up under the
        # opposite label. The two-sided p-value does not depend on the order.
        mannwhitneyu_dict[b_label + ' is ' + alternative] = (
            scipy_stats.mannwhitneyu(
                b_value,
                a_value,
                use_continuity=True,
                alternative=alternative,
            )
        )

    return {
        'mannwhitneyu': mannwhitneyu_dict,
        # Due to the assumption that H has a chi square distribution, the
        # number of samples in each group must not be too small. A typical
        # rule is that each sample must have at least 5 measurements.
        'kruskal': scipy_stats.kruskal(a_value, b_value),
    }


# The name starts with "test_", but this is a helper, not a pytest test.
test_hypothesis.__test__ = False
# Original analysis notes, per metric:
# - Elapsed Seconds: follows loglaplace distribution. A low mannwhitneyu
#   pvalue (<0.05) implies that Elapsed Seconds in 'No' are statistically
#   greater than Elapsed Seconds in 'Yes'.
# - Effectiveness: follows gennorm distribution. A low mannwhitneyu pvalue
#   (<0.05) implies that Effectiveness in 'No' are statistically lower than
#   Effectiveness in 'Yes'.
# - Satisfaction: follows dgamma distribution. A high pvalue (>0.95) implies
#   that 'Yes' and 'No' have very similar scores.
for gender in GENDER_LIST:
    baseline_records = experiment_sus_dict['No'][gender]
    alternative_records = experiment_sus_dict['Yes'][gender]

    for metric in ('Elapsed Seconds', 'Effectiveness', 'Satisfaction'):
        outcome = test_hypothesis(
            ([record[metric] for record in baseline_records], 'No'),
            ([record[metric] for record in alternative_records], 'Yes'),
        )
        print(metric, json.dumps(outcome, indent=4))

    print('Single SUS scales:')
    # Transpose so that each tuple holds all answers to one SUS question.
    baseline_scales = zip(*[record['scale'] for record in baseline_records])
    alternative_scales = zip(
        *[record['scale'] for record in alternative_records]
    )
    sus_scale_dict = {}
    paired_scales = zip(baseline_scales, alternative_scales)
    for question_number, (a_list, b_list) in enumerate(paired_scales, start=1):
        sus_scale_dict[question_number] = test_hypothesis(
            (a_list, 'No'),
            (b_list, 'Yes'),
        )
    print(json.dumps(sus_scale_dict, indent=4))
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Metrics shown in the box plots (one facet column each).
plot_list = ['Satisfaction', 'Effectiveness', 'Elapsed Seconds']

# Build one long-format frame: one row per (respondent, metric), tagged with
# whether the user-centred tool was used and with the gender bucket.
df_list = []
for gender in GENDER_LIST:
    for plugin_flag, is_user_centred in (('Yes', True), ('No', False)):
        wide_frame = pd.DataFrame(experiment_sus_dict[plugin_flag][gender])
        long_frame = pd.melt(wide_frame, value_vars=plot_list)
        long_frame['user-centred'] = is_user_centred
        long_frame['gender'] = gender
        df_list.append(long_frame)
df = pd.concat(df_list, ignore_index=True)

is_effectiveness = df['variable'] == 'Effectiveness'
print(df.loc[is_effectiveness])

sns.set_style("whitegrid")
g = sns.FacetGrid(
    df,
    col="variable",
    row='gender',
    sharex=False,
    sharey=False,
)
def my_boxplot(**kwargs):
    """Draw a box plot and print each group's median on top of its box.

    Expects the keyword arguments ``x`` and ``y`` (column names) and ``data``
    (a DataFrame). ``showfliers`` and ``autorange`` are forwarded to
    ``sns.boxplot`` when given (``None`` otherwise); any other keyword
    argument is ignored.

    Returns:
        The axes object returned by ``sns.boxplot``.
    """
    x = kwargs.pop('x')
    y = kwargs.pop('y')
    data = kwargs.pop('data')
    ax = sns.boxplot(
        x=x,
        y=y,
        data=data,
        showfliers=kwargs.get('showfliers'),
        autorange=kwargs.get('autorange'),
    )

    # Median of every x group, in sorted group order; used both as the
    # vertical position and as the text of the label.
    medians = data.groupby(x)[y].median().values

    # Box i is expected at x position i - TODO confirm that the box order
    # always matches the sorted group order.
    for position, median in enumerate(medians):
        ax.text(
            position,
            median,
            str(median),
            size='medium',
            # color='w',
            weight='bold',
            # 'ha' is the short alias of 'horizontalalignment'; pass it once.
            ha="center",
            va="center",
            bbox=dict(boxstyle="round", ec=(1., 0.5, 0.5), fc=(1., 0.8, 0.8)),
        )
    return ax
# Draw one box plot per facet, then title and label the facets.
g.map_dataframe(
    my_boxplot,
    x='user-centred',
    y='value',
    showfliers=False,
    autorange=True,
)
title_box = dict(boxstyle="round", ec=(0., 0., 0.), fc=(0.9, 0.9, 0.9))
g.set_titles("{row_name} | {col_name}", bbox=title_box)
g.set_axis_labels('user-centred tool', 'value')

# Iterate through each axis
for ax in g.axes.flat:
    # Make x and y-axis labels slightly larger
    label_style = {'fontsize': 'x-large', 'fontweight': 'bold'}
    ax.set_xlabel(ax.get_xlabel(), **label_style)
    ax.set_ylabel(ax.get_ylabel(), **label_style)

# plt.legend()
plt.tight_layout()
plt.savefig('boxplot.png')