-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatistics aggregator.py
175 lines (163 loc) · 9.35 KB
/
statistics aggregator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Function my_mean: input is a list and output is the mean of that list
def my_mean(a):
return sum(a)/float(len(a))
# Function my_median: input is a list and output is the median of that list
def my_median(a):
a.sort()
l = len(a)
if l % 2 == 0:
median = (a[(l/2)-1]+a[l/2])/float(2)
else:
median = a[l/2]
return median
#Function my_mode: input is a list and output is the mode of that list
def my_mode(a):
count = [[x, a.count(x)] for x in set(a)]
mode = []
index = 0
for i in range(1, len(count)):
if (count[i][1]>=count[index][1]):
index = i
if(count[index][1] == 1):
mode = ["No mode"]
else:
for j in range(0, len(count)):
if (count[j][1]==count[index][1]):
mode.append(count[j][0])
return mode
#Function my_SD: input is a list and output is the standard deviation of that list
import math
def my_SD(a):
sum = 0
mean = my_mean(a)
for i in range(0, len(a)):
each = pow((a[i]- mean),2)
sum += each
SD = math.sqrt(sum/len(a))
return SD
#Function my_cc: input is a list of tuples which contain the ordered data of two variables
#and output is the correlation coefficient of that two variables
def my_cc(a):
var1 = [x[0] for x in a]
var2 = [x[1] for x in a]
mean1 = my_mean(var1)
mean2 = my_mean(var2)
SD1 = my_SD(var1)
SD2 = my_SD(var2)
standard1 = []
standard2 = []
if (SD1 == 0 or SD2 == 0):
#When either SD equals to 0, we cannot divide by 0 and it makes sense because no variable that has standard deviation 0 could possibly be correlated with another non-constant variable
#then there is no correlation
cc = 0 #I can put it "undefined" here, but later we have to sort the variable based on correlation coeffient, then it is more comfortable to put r as 0 here
else:
for k in var1:
standard1.append((k-mean1)/SD1)
for h in var2:
standard2.append((h-mean2)/SD2)
sum = 0
for l in range(0, len(a)):
sum += standard1[l]*standard2[l]
cc = sum/len(a)
return cc
import math
def my_SD(a):
sum = 0
mean = my_mean(a)
for i in range(0, len(a)):
each = pow((a[i]- mean),2)
sum += each
SD = math.sqrt(sum/len(a))
return SD
#Function my_cc: input is a list of tuples which contain the ordered data of two variables
#and output is the correlation coefficient of that two variables
def my_cc(a):
var1 = [x[0] for x in a]
var2 = [x[1] for x in a]
mean1 = my_mean(var1)
mean2 = my_mean(var2)
SD1 = my_SD(var1)
SD2 = my_SD(var2)
standard1 = []
standard2 = []
if SD1 == 0 or SD2 == 0:
cc = 0
else:
for k in var1:
standard1.append((k-mean1)/SD1)
for h in var2:
standard2.append((h-mean2)/SD2)
sum = 0
for l in range(0, len(a)):
sum += standard1[l]*standard2[l]
cc = sum/len(a)
return cc
def two_variable(tuple_list):
new_l = [] #New_l is a list of small lists. Each small list contains names of two variables and the correlation coefficient
#This two for_loop helps us make all the pairs of variables without repeating
#For example, if we have 3 variables, it will pair var1 and var2 (p=0,q=1), var1 and var3 (p=0,q=2), var2 and var3 (p=1,q=2)
#q should be more than p all the time to avoid repeating
num_var = len(tuple_list[0])
for p in range(0, num_var-1):
for q in range(1, num_var):
if q > p:
l = []
two_var = [(x[p],x[q]) for x in tuple_list]
new_l.append(two_var)
return new_l
def aggregate_stats(tuple_list):
num_var = len(tuple_list[0]) #Number of variables: the length of the first tuple
new_list = [] #I want this new list to have the format as a list of sublist
for i in range(0,num_var):
sub_list = [] #I want sublist to contain: variable name, mean, median, mode, standard deviation of data of that variable
var_name = tuple_list[0][i] #Var_name is variable name, the first element of every sublist
sub_list.append(var_name) #Add var_name to one sublist
one_var = [] #I want this one_var to contain the data of each variable (does not include the name)
for j in range(1,len(tuple_list)):
one_var.append(tuple_list[j][i]) #Append data to each variable
sub_list.append(my_mean(one_var)) #Calculate mean of data of each variable, append the value to list
sub_list.append(my_median(one_var)) #Calculate median of data of each variable, append the value to list
sub_list.append(my_mode(one_var)) #Calculate mode of data of each variable, append the value to list
sub_list.append(my_SD(one_var)) #Calculate standard deviation of data of each variable, append the value to list
new_list.append(sub_list) #Append list to new_list so that new_list contains a list of list, each list contains information about one variable
#When we print new_list, we can get thing like:
#[['temp', 52.33, 52, 'No mode', 2.05], ['pressure', 11.0, 11, 'No mode', 0.82],['precipitation', 0.13, 0.1, [0.1], 0.047]]
sorted_list = sorted(new_list, key=lambda var: var[0]) #I sort new_list based on the name of variable (in alphabetical order), then key is the first element
#The sorted_list should look like:
#[['precipitation', 0.13, 0.1, [0.1], 0.047],['pressure', 11.0, 11, 'No mode', 0.82],['temp', 52.33, 52, 'No mode', 2.05]]
#This for_loop loops through each element of the sorted_list and print out the name, the mean, the median, the mode and the standard deviation of each variable
for m in range(0, num_var):
print "Variable name: %s" % sorted_list[m][0]
print "Mean = %s" % sorted_list[m][1]
print "Median = %s" % sorted_list[m][2]
print "Mode = "+','.join(str(e) for e in sorted_list[m][3])
print "Standard deviation = %s" % sorted_list[m][4]
new_l = [] #New_l is a list of small lists. Each small list contains names of two variables and the correlation coefficient
#This two for_loop helps us make all the pairs of variables without repeating
#For example, if we have 3 variables, it will pair var1 and var2 (p=0,q=1), var1 and var3 (p=0,q=2), var2 and var3 (p=1,q=2)
#q should be more than p all the time to avoid repeating
for p in range(0, num_var-1):
for q in range(1, num_var):
if q > p:
l = []
two_var = [(x[p],x[q]) for x in tuple_list]
#two_var is a list of tuples,
#where the first tuple contains the two variables names and subsequent tuples contain the ordered data
cut = []
#I want "cut" to contain only the data, not the variables names
#so that I can use function my_cc for "cut" to calculate the correlation coefficient
for y in range(1,len(tuple_list)):
cut.append(two_var[y])
r = my_cc(cut)
l.append(two_var[0]) #Append the name of variable to each small list
l.append(r) #Append the correlation coefficient to each small list
new_l.append(l) #Append each small list to the new lis
sorted_l = sorted(new_l, key=lambda var: abs(var[1])) #I sort new_l based on the absolute value of r, then key is the second element
#This for_loop loops through each element of the sorted_l and print out the names of two variables and the relative r
for x in sorted_l:
print "Variable 1: %s" % x[0][0]
print "Variable 2: %s" % x[0][1]
print "Correlation Coefficient = %s" % x[1]
#Test aggregate_stats
aggregate_stats([('temp', 'pressure', 'precipitation'),(50, 10, 0.1),(55, 11, 0.1)])
aggregate_stats([('annual average temperature anomaly', 'annual precipitation anomaly', 'Palmer Drought Index anomaly', 'cooling degree days anomaly', 'heating degree days anomaly'), (-0.41, -0.24, 0.34, 22, 115), (-0.36, 0.39, 0.2, -18, 2), (-0.65, 1.76, 1.58, -57, 136), (0.27, 5.02, 4.45, 25, -223), (0.24, 0.68, 3.07, -81, -154), (-0.52, 3.09, 3.32, -34, -86), (-0.55, -3.63, -0.75, -173, 148), (0.53, 0.5, -1.55, 88, 14), (-0.97, 0.2, 0.52, 31, 360), (-1.14, 2.81, 1.83, -87, 192), (0.37, -1.67, -0.8, 110, 101), (1.1, 0.02, -1.16, 9, -92), (-0.67, 3.92, 2.97, -58, 17), (-0.14, 4.82, 4.61, 48, 19), (-0.04, 1.46, 3.55, 17, -105), (-0.72, 0.03, 0.56, 7, 23), (1.3, 1.44, 1.74, 59, -344), (1.31, -0.93, -0.97, 72, -296), (0.61, -4.04, -3.91, 91, -8), (-0.18, -0.89, -1.25, -27, 47), (1.49, 2.23, -0.37, 76, -619), (1.14, 2.5, 0.75, 140, -445), (0.58, 1.32, 0.79, -135, -228), (-0.76, 2.68, 3.97, 23, 38), (0.85, 0.68, 1.74, 38, -191), (0.63, 2.75, 2.4, 103, -150), (-0.13, 3.76, 2.39, 8, 13), (0.18, 1.92, 3.9, -25, -138), (2.21, 3.95, 1.15, 236, -684), (1.86, -1.47, -0.54, 117, -516), (1.25, -1.72, -4.84, 70, -247), (1.68, -0.92, -3.27, 86, -470), (1.19, -0.89, -2.46, 190, -360), (1.24, 0.57, 0.18, 91, -175), (1.08, 3.31, 0.91, 32, -361), (1.62, 0.14, -0.31, 199, -349), (2.23, -0.12, -2.62, 172, -648), (1.63, -0.76, -2.05, 199, -412), (0.27, 1.3, -0.05, 85, -181), (0.37, 2.36, 0.85, 44, -189), (0.96, 1.43, 2.65, 256, -198), (1.16, 0.16, -0.56, 268, -348), (3.26, -2.41, -4.41, 291, -891), (0.41, 1.12, -0.62, 100, -178), (0.51, 0.9, 0.84, 85, -80)])