dbscan.py

import json
import math
import sys

from sklearn.cluster import DBSCAN

# load all the data first
root = "data_simple-segments/"
with open("edwpmanifest.json") as f:
filenames = json.load(f)
with open("distmatrix.json") as f:
matrix = json.load(f)
# load all segments
segments = []   # flat list of segments (each a list of coordinates)
fileindex = {}  # mapping from segment index to its source filename
c = 0
for filename in filenames:
    with open(root + filename) as f:
        data = json.load(f)
        for traj in data:
            segments.append(traj)
            fileindex[c] = filename
            c += 1

# missing distances are stored as the string "NaN"; replace them with a
# huge value so they can never pull two segments into the same cluster
for row in matrix:
    for i, val in enumerate(row):
        if val == "NaN":
            row[i] = sys.float_info.max

# to be used in Cluster.serialize()
def segobjFromSegids(ids):
    path = []
    for segid in ids:
        seg = segments[segid]
        for coord in seg:
            path.append(coord)
    segobj = {
        "path": path,
        "icao": fileindex[ids[0]].split(".json")[0]
    }
    return segobj
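
# illustrative example (hypothetical ids): if segments 4 and 5 both came
# from "abc123.json", segobjFromSegids([4, 5]) returns
#   {"path": <coords of segment 4 followed by segment 5>, "icao": "abc123"}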

class Cluster:
    def __init__(self, label):
        self.label = int(label)
        self.segmentids = []
        self.rep = None

    def __len__(self):
        return len(self.segmentids)

    def findrep(self):
        # the representative is the cluster medoid: for each segment in the
        # cluster, sum the squared distances to all other members, then pick
        # the segment with the smallest total
        if self.label == -1:
            return None
        distances = []
        for seg1 in self.segmentids:
            total = 0
            for seg2 in self.segmentids:
                # skip unknown distances (stored as float max)
                if matrix[seg1][seg2] != sys.float_info.max:
                    total += matrix[seg1][seg2] ** 2
            distances.append(total)
        # now find the minimum
        m_index = distances.index(min(distances))
        self.rep = self.segmentids[m_index]
        return self.rep

    def add(self, index):
        self.segmentids.append(index)

    # turn into base objects (lists and dicts)
    def serialize(self):
        # inflate ids into paths, joining adjacent segments
        # if they come from the same icao
        obj = {
            "segments": [],
            "label": self.label
        }
        self.segmentids.sort()
        working = []
        for segid in self.segmentids:
            pinchoff = False
            if len(working) == 0:
                working.append(segid)
            else:
                previd = working[-1]
                if fileindex[segid] == fileindex[previd]:
                    t1 = segments[previd][-1][2]
                    t2 = segments[segid][0][2]
                    if t2 - t1 < 60 * 1000:  # must be within 1 min to count as one segment
                        working.append(segid)
                    else:
                        pinchoff = True
                else:
                    pinchoff = True
            # pinch off if the segments are too far apart in time, or from different paths
            if pinchoff:
                obj["segments"].append(segobjFromSegids(working))
                working = [segid]
        if len(working) > 0:
            obj["segments"].append(segobjFromSegids(working))
        return obj
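
# illustrative shape of Cluster.serialize() output (values are hypothetical;
# each coordinate carries its timestamp in the third slot):
#   {
#     "label": 3,
#     "segments": [
#       {"path": [[x, y, t], ...], "icao": "abc123"},
#       ...
#     ]
#   }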

def mindistance(index, data, cluster):
    # smallest distance from point `index` to any member of `cluster`
    return min(data[index][c_in] for c_in in cluster)
# using Lu & Fu's nearest neighbor algorithm
def lufucluster(data, threshold=0.1):
n = len(data)
clusters = [[0]]
# do the clustering
for i in range(1, n):
underthreshold = []
for cluster in clusters:
d = mindistance(i, data, cluster)
if d < threshold:
underthreshold.append([cluster, d])
if len(underthreshold) == 0:
clusters.append([i])
else:
smallest = min(underthreshold, key=lambda el: el[1])
smallest[0].append(i)
# get labels
labels = [-1 for x in data]
for i, cluster in enumerate(clusters):
for el in cluster:
labels[el] = i
return labels
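
# minimal sketch of lufucluster on a toy 3x3 distance matrix (hypothetical
# data): points 0 and 1 are within the threshold of each other, point 2 is
# far from both, so it starts its own cluster:
#   toy = [[0.0, 0.05, 0.9],
#          [0.05, 0.0, 0.8],
#          [0.9, 0.8, 0.0]]
#   lufucluster(toy, threshold=0.1)  # -> [0, 0, 1]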

def mindistance_cluster(cluster, metacluster, data):
    # smallest representative-to-representative distance between `cluster`
    # and any cluster in `metacluster`
    return min(data[cluster.rep][x.rep] for x in metacluster)

# using Lu & Fu's nearest neighbor algo on clusters of clusters
def lufu_metacluster(data, clusters, threshold=0.1):
    # the outlier cluster (label -1) has no representative, so leave it out
    clusters = [c for c in clusters if c.label != -1]
    cluster2 = [[clusters[0]]]
    for cluster in clusters[1:]:
        underthreshold = []
        for metacluster in cluster2:
            d = mindistance_cluster(cluster, metacluster, data)
            if d < threshold:
                underthreshold.append([metacluster, d])
        if len(underthreshold) == 0:
            cluster2.append([cluster])
        else:
            smallest = min(underthreshold, key=lambda el: el[1])
            smallest[0].append(cluster)
    return cluster2

def metric_nodelength(seg):
    # number of points in the segment (not used below)
    return len(seg)

def metric_pathlength(seg):
    # total Euclidean length of the segment's path (not used below)
    s = 0
    for i in range(len(seg) - 1):
        s += math.hypot(seg[i][0] - seg[i+1][0], seg[i][1] - seg[i+1][1])
    return s
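
# e.g. (illustrative): metric_pathlength([[0, 0, 0], [3, 4, 0]]) == 5.0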

DODBSCAN = False
if DODBSCAN:
    db = DBSCAN(eps=0.1, min_samples=2, metric="precomputed")
    db.fit(matrix)
    labels = db.labels_
else:
    labels = lufucluster(matrix, 0.15)

# repackage data: one Cluster object per label
labeldict = {}
for i, seg in enumerate(segments):
    label = labels[i]
    if label not in labeldict:
        labeldict[label] = Cluster(label)
    labeldict[label].add(i)

# combine all the single-element clusters (and DBSCAN noise, label -1)
# into one outlier group
outliers = Cluster(-1)
todelete = []
for cluster in labeldict.values():
    if len(cluster) == 1 or cluster.label == -1:
        for segid in cluster.segmentids:
            outliers.add(segid)
        todelete.append(cluster.label)
for label in todelete:
    del labeldict[label]

# relabel all surviving clusters starting from 0
counter = 0
newlabeldict = {}
for cluster in labeldict.values():
    cluster.label = counter
    newlabeldict[counter] = cluster
    counter += 1
labeldict = newlabeldict
labeldict[-1] = outliers

# also find the rep for purposes of distance
for label in labeldict:
    labeldict[label].findrep()

# make metaclusters
metaclusters = lufu_metacluster(matrix, labeldict.values(), 0.1)

# print a histogram of cluster sizes for quick diagnostics
for label in labeldict:
    cluster = labeldict[label]
    print("{:<2}\t{:>3}".format(int(label), len(cluster)))

# now group the clusters: outliers first, then one group per metacluster
out = [[outliers.serialize()]]
for metacluster in metaclusters:
    metaarr = []
    for cluster in metacluster:
        metaarr.append(cluster.serialize())
    out.append(metaarr)
with open("dbscanned.json", "w") as outfile:
    json.dump(out, outfile)
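
# resulting dbscanned.json layout (illustrative):
# [
#   [<outlier cluster>],            # group 0: all outliers
#   [<cluster>, <cluster>, ...],    # one group per metacluster
#   ...
# ]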