-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
292 lines (237 loc) · 10.5 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
import pandas as pd
from prettytable import PrettyTable
import environment.datasets as ds
import fdca.fdca as fdca
import fdca.visualisation as vi
DATASET_PATH = './datasets/'
def handle_manipulation(df):
result = df.copy(True)
# Convert date column to datetime if it exists
if 'date' in result.columns:
result['date'] = pd.to_datetime(result['date'])
print('\nNow you can manipulate the dataframe before applying the FDCA to it.\nType help for available commands.')
command = None
while True:
command = input('> ').split()
if len(command) < 1:
print('Invalid command!')
continue
if command[0] == 'help':
pt = PrettyTable(['Command', 'Description'])
pt.align['Command'] = 'l'
pt.align['Description'] = 'l'
pt.add_row(['end', 'Ends the manipulation step'])
pt.add_row(['help', 'Prints all available commands'])
pt.add_row(['reload', 'Reloads the dataframe'])
pt.add_row(['print {object}', 'Prints the given object. Available objects: df, head, tail, dtypes, columns'])
pt.add_row(['set_index {column}', 'Sets the given column as index'])
pt.add_row(['del_column {column...}', 'Deletes the given columns'])
pt.add_row(['filter_date {start} {end}', 'Filters the dataframe by the given time range. Input datetime as {year}-{month}-{day}-{hour}:{minute}:{second}'])
pt.add_row(['to_dtype {column} {dtype}', 'Converts the given column to the given dtype.'])
print(pt)
elif command[0] == 'set_index':
try:
result = result.set_index(command[1])
except KeyError:
print('The given column doesn\'t exist')
elif command[0] == 'del_column':
try:
result = result.drop(command[1:], axis=1)
except:
print('One of the given columns doesn\'t exist')
elif command[0] == 'filter_date':
if 'date' not in result.columns:
print('This dataframe doesn\'t have a \'date\' column')
continue
try:
result = result[(result['date'] >= command[1]) & (result['date'] <= command[2])]
result = result.reset_index(drop=True)
except:
print('Something went wrong during filtering')
elif command[0] == 'print':
if len(command) < 2:
print('Available objects to print: df, head, tail, dtypes, columns')
continue
if command[1] == 'head':
print(result.head())
elif command[1] == 'tail':
print(result.tail())
elif command[1] == 'dtypes':
print(result.dtypes)
elif command[1] == 'columns':
print(result.columns)
else:
print(result)
elif command[0] == 'to_dtype':
try:
result[command[1]] = result[command[1]].astype(command[2])
except:
print('Something went wrong during conversion')
elif command[0] == 'reload':
result = df.copy(True)
if 'date' in result.columns:
result['date'] = pd.to_datetime(result['date'])
elif command[0] == 'end':
return result
else:
print('Invalid command!')
def find_dc_with_graph(df, parameters):
repeat = True
while repeat:
print('\nWhich parameters should be used to find the dc? Leave input empty to use default value.')
dc_low = input("start value (default: 0): ")
dc_high = input("end value (default: 0.2): ")
step_count = input("number of iterations (default: 200): ")
if dc_low == '':
dc_low = 0
if dc_high == '':
dc_high = 0.2
if step_count == '':
step_count = 200
print('\nExecuting FDCA...', flush=True)
dc_z_map = fdca.calculate_z(df, parameters, dc_low=float(dc_low), dc_high=float(dc_high), step_count=int(step_count))
vi.plot_line(dc_z_map)
print('Success\n')
while True:
prompt = input('Do you want to try other parameters? [y/n] ')
if prompt == 'y':
break
elif prompt == 'n':
repeat = False
break
else:
print('Invalid input!')
def use_dc_for_clustering(df, parameters, tweet_ids):
try_dc = float(input('Type the dc value to use: '))
print('\nExecuting FDCA...', flush=True)
# TODO: Handle output
result_df, result_centers = fdca.execute(df, parameters, try_dc)
if not tweet_ids.empty:
result_df['tweet_id'] = tweet_ids
if 'date' in result_df.columns:
result_df['date'] = result_df['date'].astype('datetime64[ns]')
result_df.to_csv('fdca_twitter.csv')
print('Success')
repeat = True
while repeat:
print('\nHow do you want to plot your data?')
pt = PrettyTable(['Method', 'Description'])
pt.align['Method'] = 'l'
pt.align['Description'] = 'l'
pt.add_row(['2d', 'Plots all data points in a two-dimensional coordinate system and colors them by cluster.'])
pt.add_row(['3d', 'Plots all data points in a three-dimensional coordinate system and colors them by cluster.'])
pt.add_row(['3d_cluster', 'Can be used to show the number of tweets in the clusters over time.\nPlots only the cluster centers in a three-dimensional coordinate system and resizes them depending on the number of tweets in that cluster.\nThe values get split in intervals on the z-axis.'])
pt.add_row(['bar', 'Plots a bar chart with one bar per cluster center and colored parts for different classes/categories.'])
pt.add_row(['map', 'Plots only cluster centers on top of a map and resizes them depending on the number of tweets in that cluster.'])
print(pt)
command = None
while True:
command = input('> ')
if command == '2d':
print('\nWhich parameters should be used for the plot?')
xaxis = input("x-axis: ")
yaxis = input("y-axis: ")
try:
vi.plot_2d(result_df, result_centers, xaxis, yaxis)
except:
print('\nSomething went wrong during plotting.')
break
elif command == "3d":
print('\nWhich parameters should be used for the plot?')
xaxis = input("x-axis: ")
yaxis = input("y-axis: ")
zaxis = input("z-axis: ")
try:
vi.plot_3d(result_df, result_centers, xaxis, yaxis, zaxis)
except:
print('\nSomething went wrong during plotting.')
break
elif command == "3d_cluster":
print('\nWhich parameters should be used for the plot? Leave input empty to use default value.')
xaxis = input("x-axis: ")
yaxis = input("y-axis: ")
zaxis = input("z-axis: ")
steps = input("number of intervalls (default: 200): ")
try:
if steps != '':
vi.plot_x_y_date(result_df, result_centers, xaxis, yaxis, zaxis, int(steps))
else:
vi.plot_x_y_date(result_df, result_centers, xaxis, yaxis, zaxis)
except:
print('\nSomething went wrong during plotting.')
break
elif command == "bar":
print('\nWhich parameters should be used for the plot?')
class_column = input("class column (e.g. lex_info_class): ")
try:
vi.plot_class_bars(result_df, result_centers, class_column)
except:
print('\nSomething went wrong during plotting.')
break
elif command == "map":
print('\nWhich parameters should be used for the plot?')
lon = input("longitude: ")
lat = input("latitude: ")
try:
vi.plot_2d_geo(result_df, result_centers, lon, lat)
except:
print('\nSomething went wrong during plotting.')
break
else:
print('Invalid command!')
while True:
prompt = input('\nDo you want to do another plot? [y/n] ')
if prompt == 'y':
break
elif prompt == 'n':
repeat = False
break
else:
print('Invalid input!')
if __name__ == '__main__':
print('Loading all available datasets...',flush=True)
datasets = ds.DatasetManager(DATASET_PATH)
datasets.info()
dataset = None
while dataset is None:
name = input('Name of the dataset: ')
dataset = datasets.get_by_name(name)
# Save tweet ids
tweet_ids = pd.Series([])
if 'tweet_id' in dataset.frame.columns:
tweet_ids = dataset.frame['tweet_id'].copy(True)
dataset.frame = dataset.frame.drop(['tweet_id'], axis=1)
df = handle_manipulation(dataset.frame)
# Convert datetimes to int
for index, dtype in df.dtypes.iteritems():
if dtype != 'datetime64[ns]': continue
df[index] = df[index].astype('int64')
print('\nSetup parameter types. Type 0 for numeric and 1 for categorical data.')
parameters = []
col_index = 0
while col_index < len(df.columns):
option = int(input(f'{df.columns[col_index]}: '))
if option in [0,1]:
parameters.append(option)
col_index += 1
else:
print('Invalid option. Please try again.')
# Ask user if he wants to cluster with a fixed dc or find the dc with z
print("\nWhat do you want to do?")
pt = PrettyTable(['Utility', 'Description'])
pt.align['Utility'] = 'l'
pt.align['Description'] = 'l'
pt.add_row(['find_dc', 'Use a diagramm of different z and dc values to find the best dc.'])
pt.add_row(['use_dc', 'Use a specific dc value to cluster the data and plot the clustered points.'])
print(pt)
command = None
while(True):
command = input('> ')
if command == 'find_dc':
find_dc_with_graph(df, parameters)
break
elif command == "use_dc":
use_dc_for_clustering(df, parameters, tweet_ids)
break
else:
print('Invalid command!')