-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering.py
62 lines (54 loc) · 1.78 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import csv, matplotlib
import pandas as pd, time, sys
import matplotlib.pyplot as plt
import os
from sklearn.cluster import KMeans
def main():
# reading in the data
titles = []
ids = []
filename = sys.argv[1]
with open(filename, newline='') as csvfile:
tsvr = csv.reader(csvfile, delimiter='\t')
for line in tsvr:
try:
title = line[1]
titles.append(title)
id = line[0]
ids.append(id)
except:
pass
#####################################################
# making elbow plot to determine optimal k
# (can be commented out if not needed)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words={'english'})
X = vectorizer.fit_transform(titles)
Sum_of_squared_distances = []
K = range(2,10)
for k in K:
km = KMeans(n_clusters=k, max_iter=200, n_init=10)
km = km.fit(X)
Sum_of_squared_distances.append(km.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
fig_name = sys.argv[3] + '.jpeg'
plt.savefig(fig_name)
plt.show()
######################################################
# conducting the k means clustering
true_k = int(sys.argv[2])
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=200, n_init=10)
model.fit(X)
labels=model.labels_
title_cl=pd.DataFrame(list(zip(ids,labels)),columns=['title','cluster'])
pd.set_option("display.max_rows", None, "display.max_columns", None)
# output is printed! pipe out to a file if printing
# to console is not necessary
print(title_cl.sort_values(by=['cluster']))
# times the code if needed
start_time = time.time()
main()
#print("--- %s seconds ---" % (time.time() - start_time))