# Some infos and ideas:
# The Cartesian space implied by the drivingTime matrix is (n-1)-dimensional.
# I have tried to write my own algorithms to get the problem down to 2 dimensions:
# (__distanceMatrixToPoints_onlyFirstThree and __distanceMatrixToPoints_midEuclid)
# The midEuclid one does not work very well (maybe because the first points set in 2D are completely
# wrong, or because of an error in the code; I haven't completely checked its correctness).
# The onlyFirstThree algorithm works surprisingly better, although it only considers the distances to the first three points.
# I found the problem under the name "Distance geometry problem with inaccurate distances":
# Maybe the algorithm from "Solving the molecular distance geometry problem with inaccurate distance data" is better,
# but it looks pretty complicated: https://bit.ly/2H9bMGX (minimize function with SciPy: https://bit.ly/2qHJ58P;
# a hedged sketch of that idea is included at the end of the DIMENSION REDUCTION section below)
# Other possible techniques: hierarchical methods, SOM and neural gas (algorithms from https://bit.ly/2HKDS8j)
import numpy as np
import copy
import random
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances_argmin
#from sklearn.cluster import KMeans
#from sklearn.cluster import AgglomerativeClustering
'''
File name: Clustering.py
Author: Sebastian Nichtern
Date created: 13.04.2018
Date last modified: 26.06.2018
Python Version: 3.6.3
'''
############# GLOBAL VARS #############
optimalSecondPoint = None
PLOT_HERE = False
''' DIMENSION REDUCTION '''
def __distanceMatrixToPointsIfExact(A):
    # https://math.stackexchange.com/questions/156161/finding-the-coordinates-of-points-from-distance-matrix
    # This does not work in this case because the matrix consists of travel times, not real distances.
    M = (np.square(A[0, :]) + np.square(np.transpose([A[:, 0]])) - np.square(A)) / 2
    u, s, v = np.linalg.svd(M)  # singular value decomposition
    rank = np.sum(s > 1e-10)  # Alternative: np.linalg.matrix_rank(M). 1e-10 = threshold below which a value counts as zero
    points = []
    for i, singu in enumerate(s):
        if singu > 1e-10:
            points.append(np.sqrt(singu) * v[i])  # M = X*X^T, so the coordinates are sqrt(S)*V (not S*V)
    points = np.array(points).T  # transpose so that each row is one point, like in the other functions
    return points
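# Illustrative self-check for the exact-distance reconstruction above (my addition, not part of the
# original pipeline; the helper name is made up): build exact Euclidean distances for random 2D points
# and verify that the reconstructed configuration reproduces those distances. The coordinates themselves
# are only recovered up to rotation/reflection, so distances are compared, not coordinates.
def _check_exact_reconstruction(n=6, seed=0):
    rng = np.random.RandomState(seed)
    pts = rng.rand(n, 2) * 100  # random ground-truth points, index 0 plays the role of the Lab
    D = np.sqrt(np.square(pts[:, None, :] - pts[None, :, :]).sum(-1))  # exact pairwise distances
    rec = __distanceMatrixToPointsIfExact(D)
    Drec = np.sqrt(np.square(rec[:, None, :] - rec[None, :, :]).sum(-1))  # distances of the reconstruction
    return np.allclose(D, Drec)  # True: the distances are preserved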
def __distanceMatrixToPoints_onlyFirstThree(A, secondPoint=1):
    # Fix two initial points in space, then calculate all other point locations from their distances to them.
    points = np.zeros((A.shape[1], 2))
    # The first fixed point is (0,0) (it's the Lab).
    # The second point should be as far away as possible from the first one: (https://bit.ly/2qHwZg4)
    # secondPoint = 1 # second fixed point at (x=0, y = 1 or the distance to the doc who is farthest away)
    # if farestPoint:
    #     secondPoint = np.argmax(A[0,1:])+1 # exclude the first zero; argmax for the index (amax for the value)
    points[secondPoint] = [0, A[0, secondPoint]]
    for p in range(1, len(points)):  # from the third point to the last point
        if p == secondPoint:  # this point has already been used above
            continue
        d13 = A[0, p]  # "point 3" changes each loop iteration -> p is used instead of 3
        d23 = A[secondPoint, p]
        x1 = 0
        y1 = 0
        x2 = 0  # actually not needed
        y2 = A[0, secondPoint]  # distance to the second point = y2 coordinate
        # Calculate y3 and x3 (formulas generated with Wolfram Alpha):
        y3 = np.divide((-np.square(d13) + np.square(d23) + np.square(y1) - np.square(y2)), 2 * (y1 - y2))
        x3 = x1 + np.sqrt(-np.square(y3) + np.square(d13) + 2 * y3 * y1 - np.square(y1))
        # Decide whether x3 is negative or positive: take the sign whose distance best matches the distance in A.
        if p > 2:  # only now is the third point defined
            dPositive = np.sqrt(np.square(x3 - points[2][0]) + np.square(y3 - points[2][1]))
            dNegative = np.sqrt(np.square(-x3 - points[2][0]) + np.square(y3 - points[2][1]))
            dPositive_DiffToA = np.abs(dPositive - A[2, p])
            dNegative_DiffToA = np.abs(dNegative - A[2, p])
            if dNegative_DiffToA < dPositive_DiffToA:
                x3 = -x3
        # and save the calculated location:
        points[p] = [x3, y3]
    return points
def __distanceMatrixToPoints_midEuclid(A):
    # Places point after point in the 2D space, each at the location whose distances best fit the
    # distances to all already placed points.
    # Could also use random points and optimise them with the __pointsToDistanceMatrix() and __matrixDifference() functions.
    # Problem with this algorithm: points are added one by one, so only the distances to already placed
    # points are taken into account. Newly placed points do consider the positions of the old ones, but
    # if those already lie completely wrong, that does not help anymore.
    points = np.zeros((A.shape[1], 2))
    order = list(range(1, A.shape[1]))
    random.shuffle(order)  # random order in which the points are added
    points[1] = [0, A[0, order[-1]]]  # the first point (Lab) was (0,0), the second point is placed now
    for lastAddedPoint, newPoint in enumerate(order[1:]):
        # lastAddedPoint+2 is the number of elements in points that are already placed and used for the approximation
        # 1. Determine the search space (distance * 1.5 from all placed points)
        xSearchSpace = [0, 0]  # min and max x
        ySearchSpace = [0, 0]  # min and max y
        ENLARGING_FACTOR = 1.5
        for oldPoint in range(lastAddedPoint + 2):
            # distance to the already placed point, enlarged by the factor 1.5
            xMin = (points[oldPoint][0] - A[newPoint, oldPoint]) * ENLARGING_FACTOR
            xMax = (points[oldPoint][0] + A[newPoint, oldPoint]) * ENLARGING_FACTOR
            yMin = (points[oldPoint][1] - A[newPoint, oldPoint]) * ENLARGING_FACTOR
            yMax = (points[oldPoint][1] + A[newPoint, oldPoint]) * ENLARGING_FACTOR
            if xSearchSpace[0] >= xMin:
                xSearchSpace[0] = xMin
            if xSearchSpace[1] <= xMax:
                xSearchSpace[1] = xMax
            if ySearchSpace[0] >= yMin:
                ySearchSpace[0] = yMin
            if ySearchSpace[1] <= yMax:
                ySearchSpace[1] = yMax
        # 2. Find the minimum of the objective (distances to all neighbours) inside the search space.
        #    Either brute force or Monte Carlo, or alternatively: discretise in steps of 100 and then
        #    search again with smaller steps (e.g. 1) around the best results found.
        # Discretised brute force (STEPSIZE):
        STEPSIZE = 200  # higher = faster, but coarser
        bestPosition = [xSearchSpace[0], ySearchSpace[0]]
        lowermostDistanceSum = 10000000000
        for x in np.arange(xSearchSpace[0], xSearchSpace[1], STEPSIZE):
            for y in np.arange(ySearchSpace[0], ySearchSpace[1], STEPSIZE):
                distances = []  # distances to all already placed points
                for oldPoint in range(lastAddedPoint + 2):
                    d = np.sqrt(np.square(x - points[oldPoint][0]) + np.square(y - points[oldPoint][1]))
                    if d < A[newPoint, oldPoint]:  # if the distance falls below the target distance from A
                        d += 1.5 * (A[newPoint, oldPoint] - d)  # then penalise the overshoot -> pseudo-distance
                        # this way, moving closer to points does not count as a better solution
                    distances.append(d)
                if sum(distances) < lowermostDistanceSum:
                    bestPosition = [x, y]
                    lowermostDistanceSum = sum(distances)
        print(str(lastAddedPoint + 2) + ": " + str(bestPosition))
        points[lastAddedPoint + 2] = bestPosition
    # Alternative algorithm (simpler and less accurate): pick 2 random neighbours and determine the
    # position from their distances with the formula from __distanceMatrixToPoints_onlyFirstTwo.
    return points
def __pointsToDistanceMatrix(p):
    dist = np.zeros((p.shape[0], p.shape[0])).astype(int)  # plain int instead of the deprecated np.int
    for i in range(dist.shape[0]):
        for j in range(dist.shape[1]):
            dist[i, j] = np.round(np.sqrt(np.square(p[j, 0] - p[i, 0]) + np.square(p[j, 1] - p[i, 1])), 0).astype(int)
    return dist
def __matrixDifference(A, B):
    # Only for 2D arrays!
    # Hint: this only works if the points have the same order in both matrices.
    err = 0
    if A.shape == B.shape:
        for i in range(A.shape[0]):
            for j in range(A.shape[1]):
                err += np.abs(A[i, j] - B[i, j])
    else:
        print("A and B must have the same shape (in __matrixDifference(A,B))")
    return err
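# Purely illustrative NumPy equivalents of the two helpers above (my addition, not used by the rest of
# this file): they compute the same results as the explicit Python loops, just with broadcasting.
def __pointsToDistanceMatrix_vectorized(p):
    diff = p[:, None, :] - p[None, :, :]  # pairwise coordinate differences, shape (n, n, 2)
    return np.round(np.sqrt(np.square(diff).sum(-1))).astype(int)
def __matrixDifference_vectorized(A, B):
    return np.abs(np.asarray(A) - np.asarray(B)).sum()  # sum of absolute element-wise differences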
def __to2D(A):
    # Iterate over all points and choose the best second point:
    global optimalSecondPoint
    if not optimalSecondPoint:  # if it has not already been calculated for this problem...
        optimalSecondPoint = 1
        minErr = 100000000000
        for secondPoint in range(1, A.shape[1]):
            points = __distanceMatrixToPoints_onlyFirstThree(A, secondPoint)  # n-1 dimensions to 2D (err(10) = 146060, err(30) = 2223666)
            D = __pointsToDistanceMatrix(points)  # distances in the model ([n-1]D to 2D)
            err = __matrixDifference(A, D)  # compare the original distances with the projected distances in 2D
            # print("Point: "+str(secondPoint)+" err: " + str(err))
            # plt.scatter(points[:,0],points[:,1], c=np.random.rand(3,))  # plot for every candidate second point
            # plt.scatter(0,0,color="red")
            # plt.annotate("Lab",(0,0))
            if err < minErr:
                minErr = err
                optimalSecondPoint = secondPoint
                # plt.scatter(points[:, 0], points[:, 1], c=np.random.rand(3, ))
                # for i,p in enumerate(points):
                #     plt.annotate(str(i), (p[0], p[1]))
                # plt.annotate("err: "+str(err),(points[2, 0], points[2, 1]))
                # plt.annotate("Lab",(0,0))
        # plt.show()
    points = __distanceMatrixToPoints_onlyFirstThree(A, optimalSecondPoint)
    # print("Optimal point: "+str(optimalSecondPoint))  # +", minimal err: " + str(minErr))
    # plt.scatter(points[:,0],points[:,1], c=np.random.rand(3,))
    # plt.scatter(0,0,color="red")
    # plt.annotate("Lab",(0,0))
    # plt.show()
    return points
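# Hedged sketch of the SciPy idea mentioned in the header comment (my addition; the function name and
# the choice of L-BFGS-B are assumptions, not taken from the linked paper): lay all points out at random
# in 2D and minimise the total squared mismatch between the layout distances and the travel times in A.
# It is an alternative to __to2D(), not part of the pipeline that cluserting() actually uses.
def __distanceMatrixToPoints_minimizeSketch(A, seed=0):
    from scipy.optimize import minimize  # local import; SciPy is not needed anywhere else in this file
    n = A.shape[0]
    rng = np.random.RandomState(seed)
    x0 = rng.rand(n * 2) * A.max()  # random initial 2D layout, flattened for the optimiser
    def stress(flat):
        p = flat.reshape(n, 2)
        D = np.sqrt(np.square(p[:, None, :] - p[None, :, :]).sum(-1))  # current layout distances
        return np.square(D - A).sum()  # squared mismatch to the travel-time matrix
    res = minimize(stress, x0, method="L-BFGS-B")
    points = res.x.reshape(n, 2)
    return points - points[0]  # shift so that the Lab (index 0) sits at the origin, like in __to2D()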
''' CLUSTERING '''
def labelsToSets(labels, CLUSTER_COUNT):
    clusterSets = [[] for _ in range(CLUSTER_COUNT)]
    for i, label in enumerate(labels):
        clusterSets[label].append(i)
    # Don't wonder about the "array" keyword when you print the return value: the elements don't all have
    # the same size, so NumPy stores them as an object array (dtype=object is required for ragged arrays).
    return np.array(clusterSets, dtype=object)
def approximateTimeNeededWithKNN(A, cluster):
    placesLeft = copy.deepcopy(list(cluster))
    placesLeft = np.array(np.trim_zeros(placesLeft))  # if the start node is in this cluster, remove it
    path = [0]  # the first node is the Laboratory
    distanceSum = 0
    for _ in range(len(placesLeft)):
        lowestDist = 1000000000
        nearestNeighbor = 0
        currentPlace = path[-1]
        for neighbor in placesLeft:
            dist = A[currentPlace, neighbor]
            if dist < lowestDist:
                lowestDist = dist
                nearestNeighbor = neighbor
        distanceSum += lowestDist
        path.append(nearestNeighbor)
        placesLeft = np.delete(placesLeft, np.argwhere(placesLeft == nearestNeighbor))
        # print("places left "+str(placesLeft))
        # print("nearest neighbor: "+str(nearestNeighbor))
    # go back to the start:
    distanceSum += A[path[-1], 0]
    path.append(0)
    return distanceSum, path
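# Minimal usage sketch for the greedy tour above (my addition; the numbers are made-up toy travel times,
# not real data). The tour starts and ends at the Lab (index 0) and greedily visits the nearest unvisited place.
def _demo_approximateTimeNeededWithKNN():
    toyTimes = np.array([[ 0, 10, 20, 15],
                         [10,  0, 12, 30],
                         [20, 12,  0,  8],
                         [15, 30,  8,  0]])
    totalTime, tourPath = approximateTimeNeededWithKNN(toyTimes, [1, 2, 3])
    print(totalTime, tourPath)  # -> 45 [0, 1, 2, 3, 0]
    return totalTime, tourPath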
def my_special_KMeans(X, n_clusters, startAngleRange=120, rseed=0):
    # TODO: I think clusters could also be found in the (n-1)-dimensional space, not only in 2D.
    # 1. Choose initial clusters. # Instead of random ones I calculate meaningful centroid points:
    # rng = np.random.RandomState(rseed)
    # i = rng.permutation(X.shape[0])[:n_clusters]
    # print(i)
    # centers = X[i]
    if n_clusters > X.shape[0]:  # just in case
        print("You want more clusters than points. That's not possible (func: my_special_KMeans)")
    absoluteCenter = X.mean(0)
    startAngle = np.arcsin(np.divide(absoluteCenter[0], np.sqrt(np.square(absoluteCenter[0]) + np.square(absoluteCenter[1])))) * 57.2958  # rad -> deg: * 57.2958
    # startAngle: add this angle to point the start centers in the right direction.
    alpha = startAngleRange / n_clusters
    radius = absoluteCenter[1]  # set the radius to the mean y of all data points
    centers = []
    # Spread the start centers between the positive and the negative axis:
    if n_clusters & 0x1:  # odd
        factor = (n_clusters - 1) / 2
    else:
        factor = n_clusters / 2 - 0.5
    for i in range(n_clusters):
        currentalpha = (-factor + i * factor) / (n_clusters / 2) * alpha + startAngle  # can be optimized
        centers.append([radius * np.sin(0.0174533 * currentalpha), radius * np.cos(0.0174533 * currentalpha)])  # NumPy takes radians, not degrees -> * 0.0174533
    centers = np.array(centers)
    # plt.scatter(X[:, 0], X[:, 1])
    # plt.scatter([c[0] for c in centers], [c[1] for c in centers], color="red")
    # plt.show()
    while True:
        # TODO: I think clusters could also be found in the (n-1)-dimensional space, not only in 2D.
        labels = pairwise_distances_argmin(X, centers)  # assign labels based on the closest center
        new_centers = np.zeros(centers.shape)
        for i in range(n_clusters):  # find new centers as the means of the assigned points
            clusterPoints = np.array(X[labels == i])
            # withLab = np.vstack([clusterPoints,[0,0]])  # always consider the Lab
            # moves the mean in the direction of the Lab, not always good!
            # withLab = np.vstack([withLab,[0,0]])  # double Lab
            if clusterPoints.size != 0:
                new_centers[i] = clusterPoints.mean(0)
            else:  # the cluster got no points assigned -> reseed it with a random point
                new_centers[i] = X[random.randrange(X.shape[0])]
        # new_centers = np.array([X[labels == i].mean(0) for i in range(n_clusters)])
        if np.all(centers == new_centers):  # check for convergence
            break
        centers = new_centers
    for i in list(range(n_clusters)):
        if i not in labels.tolist():
            print("One cluster has no points in it")
    # plt.scatter([c[0] for c in centers], [c[1] for c in centers], color="red")
    # for c in centers:
    #     plt.annotate("Center",(c[0],c[1]))
    return centers, labels
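# Illustrative demo of the angular seeding above (my addition, not called anywhere in the pipeline):
# cluster a small random 2D point cloud and print how many points each cluster received.
def _demo_my_special_KMeans(n_points=40, n_clusters=3, seed=1):
    rng = np.random.RandomState(seed)
    X = rng.rand(n_points, 2) * 1000  # random points in a 1000 x 1000 square
    centers, labels = my_special_KMeans(X, n_clusters)
    print("centers:\n" + str(centers))
    print("points per cluster: " + str(np.bincount(labels, minlength=n_clusters)))
    return centers, labels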
''' PLOTTING '''
def plot(points, clusterSets, DRIVING_TIMES=None, paths=None):
    for clusterNr, cluster in enumerate(clusterSets):
        clusterPoints = [points[p] for p in cluster]
        # Plot:
        plt.scatter([pp[0] for pp in clusterPoints], [pp[1] for pp in clusterPoints])
        for i, p in enumerate(clusterPoints):
            plt.annotate(str(i), (p[0], p[1]))
            # plt.annotate(str(clusterNr), (p[0], p[1]))
        # Plot the path:
        if paths:
            path = paths[clusterNr]
            lastP = None
            for p in path:
                if lastP is None:
                    lastP = p
                    continue
                plt.plot([points[p][0], points[lastP][0]], [points[p][1], points[lastP][1]], c="black")
                lastP = p
    plt.annotate("Lab", (0, 0))
    plt.axis('off')
    plt.show()
''' EXECUTION '''
def cluserting(DRIVING_TIMES, CLUSTER_COUNT):
    # print("Finding best 2D fit...")
    points = __to2D(DRIVING_TIMES)
    # CLUSTER_COUNT = 5  # for testing
    # labels = spectral_clustering(DRIVING_TIMES.astype(np.float64), CLUSTER_COUNT)  # produces a different result every time
    ''' Find clusters '''
    # print("Clustering...")
    # labels = KMeans(CLUSTER_COUNT, random_state=0).fit_predict(points)
    # labels = AgglomerativeClustering(CLUSTER_COUNT).fit_predict(points)
    centers, labels = my_special_KMeans(points, CLUSTER_COUNT)
    # print(labels)
    # TODO: find a more problem-specific way of clustering
    ''' How many points are in each cluster: '''
    unique, counts = np.unique(labels, return_counts=True)
    # print("Points in cluster x: "+str(dict(zip(unique, counts))))
    ''' labels -> sets of points (the real clusters) '''
    clusterSets = labelsToSets(labels, CLUSTER_COUNT)
    if PLOT_HERE:
        # EXAMPLE path finder (KNN):
        times = []
        paths = []
        for cluster in clusterSets:
            # print("__________________________________")
            # print("Cluster: "+str(cluster))
            time, path = approximateTimeNeededWithKNN(DRIVING_TIMES, cluster)
            times.append(time)
            paths.append(path)
            # print("path: "+str(path))
            # print("Cluster: "+str(clusterNr)+" time KNN: "+str(time))
        plot(points, clusterSets, DRIVING_TIMES, paths)
    return points, clusterSets
# Other algorithms (not feasible for this problem):
# 1. Random projection to reduce dimensions operates on a feature matrix, which we don't have here
#    (X: numpy array of shape [n_samples, n_features], y: numpy array of shape [n_samples], optional)
#    About that algorithm: http://scikit-learn.org/stable/modules/random_projection.html
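# Minimal end-to-end demo (my addition): build a synthetic, symmetric "driving time" matrix from random
# 2D locations and run the clustering pipeline on it. The 15 places and the cluster count of 3 are
# arbitrary example values; real driving times would be integers, floats are used here so that the toy
# matrix is exactly consistent. Set PLOT_HERE = True above to also see the matplotlib plot.
if __name__ == "__main__":
    rng = np.random.RandomState(42)
    locations = rng.rand(15, 2) * 3600  # 15 places, pseudo travel-time coordinates in "seconds"
    locations[0] = [0, 0]  # index 0 is the Lab
    diff = locations[:, None, :] - locations[None, :, :]
    demoDrivingTimes = np.sqrt(np.square(diff).sum(-1))  # symmetric toy travel-time matrix
    demoPoints, demoClusterSets = cluserting(demoDrivingTimes, 3)
    print("cluster sets: " + str(demoClusterSets))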