-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_tsne_with_autolabels.py
executable file
·118 lines (94 loc) · 4.04 KB
/
generate_tsne_with_autolabels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#! /usr/local/Cellar/python/3.7.3/bin/python3
"""
See https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
to learn more about the arguments passed via command line.
"""
import gensim
import os
import collections
import smart_open
import random
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import time
import sys
from doc_classification import svm_classifier as svm_clf
# Command-line overrides, currently disabled in favor of the hard-coded values below:
# early_exaggeration = float(sys.argv[1]) # default: 12.0
#
# doc_vector_size = int(sys.argv[2])
# num_epochs = int(sys.argv[3])
#
# date_made = str(sys.argv[4])
# Passed to TSNE as its early_exaggeration argument; also used in the plot title and output file name.
early_exaggeration = 32.0
# Passed to Doc2Vec as vector_size; also part of the saved-model directory name.
doc_vector_size = 20
# Passed to Doc2Vec as epochs; also part of the saved-model directory name.
num_epochs = 80
# Used as a directory component of the saved-model path.
date_made = "nov15_19"
# Training corpus, resolved against the current working directory; read_corpus treats each line as one document.
train_file = os.getcwd() + '/src/aggregate_train_corpus.txt'
# read_corpus is a generator: each `yield` hands back one document on demand,
# so the file is streamed instead of being loaded into memory all at once.
# A generator can only be iterated over once; wrap it in list() to reuse it.
# https://pythontips.com/2013/09/29/the-python-yield-keyword-explained/
def read_corpus(fname, tokens_only=False):
    """Lazily yield one preprocessed document per line of ``fname``.

    Args:
        fname: Path of a UTF-8 text file opened through ``smart_open``;
            every line is treated as one document.
        tokens_only: When true, yield the bare token list produced by
            ``gensim.utils.simple_preprocess``. Otherwise (the default,
            used for training data) yield a ``TaggedDocument`` whose single
            tag is the zero-based line number.

    Yields:
        A list of tokens, or a ``TaggedDocument`` wrapping that list.
    """
    with smart_open.open(fname, encoding="UTF-8") as corpus_file:
        for line_number, raw_line in enumerate(corpus_file):
            tokens = gensim.utils.simple_preprocess(raw_line)
            if not tokens_only:
                # training data: tag each document with its line number
                yield gensim.models.doc2vec.TaggedDocument(tokens, [line_number])
                continue
            yield tokens
# Materialize the generator into a list: build_vocab and train both consume
# the corpus, and it is indexed by document id further down in the script.
train_corpus = list(read_corpus(train_file))

# One directory per (date, vector size, epochs) combination; it holds both the
# cached model and the plot produced at the end of the script.
model_path = "./src/doc2vec_models/{}/vs_{}_epochs_{}".format(date_made, doc_vector_size, num_epochs)
model_name = model_path + "/aggregate_model.model"

# makedirs (not mkdir) so that missing parent directories such as
# ./src/doc2vec_models/<date_made> are created too; exist_ok avoids the
# check-then-create race and makes re-runs a no-op.
os.makedirs(model_path, exist_ok=True)

if os.path.exists(model_name):
    # Reuse the model saved by a previous run with the same settings.
    # NOTE(review): the cache is keyed on the settings only, not on the corpus
    # contents -- delete the saved model if the training file has changed.
    model = gensim.models.doc2vec.Doc2Vec.load(model_name)
else:
    model = gensim.models.doc2vec.Doc2Vec(vector_size=doc_vector_size, min_count=2, epochs=num_epochs)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(model_name)

# NOTE(review): docvecs.vectors_docs looks like the gensim 3.x attribute name;
# newer gensim releases expose the vectors differently -- confirm against the
# installed version before upgrading.
doc_vectors = np.array(model.docvecs.vectors_docs)

# Project the document vectors down to 2-D for plotting; random_state is fixed
# so repeated runs give the same layout.
embedded_doc_vectors = TSNE(n_components=2, early_exaggeration=early_exaggeration, random_state=1).fit_transform(doc_vectors)
# Training data is labeled in the order it is presented, so the position of a
# file in this list is the numeric label that category receives.
svm_classifier = svm_clf.SVMClassifier(os.getcwd())
category_files = [
    '/src/mtg_articles.txt',  # label 0
    '/src/sports_articles.txt',  # label 1
    '/src/bharatanatyam_pdfs.txt',  # label 2
]
svm_classifier.train(category_files)
# Debugging aid: classify a single document.
# print(svm_classifier.predict(' '.join(train_corpus[1].words)))


def _marker_for(prediction):
    """Return the (label, bbox style) pair used to draw a predicted category.

    The mapping is hardcoded:
        0 -> mtg            ("m", blue)
        1 -> sports         ("s", red)
        anything else -> bharatanatyam ("b", black)

    A fresh style dict is built on every call so no two points share one.
    """
    if prediction == 0:
        return "m", dict(facecolor='blue', alpha=0.5)
    if prediction == 1:
        return "s", dict(facecolor='red', alpha=0.5)
    return "b", dict(facecolor='black', alpha=0.5)


# One [(x, y), label, bbox_style] entry per document, in document-id order.
embedded_doc_vectors_with_words = []
for doc_id, (x_coord, y_coord) in enumerate(embedded_doc_vectors):
    # The classifier takes raw text, so the token list is joined back together.
    document_text = ' '.join(train_corpus[doc_id].words)
    label, color = _marker_for(svm_classifier.predict(document_text))
    embedded_doc_vectors_with_words.append([(x_coord, y_coord), label, color])
# Find the bounds for the axes. ndarray.min()/.max() reduce each column in
# native code instead of iterating the array element by element in Python.
x_coords = embedded_doc_vectors[:, 0]
y_coords = embedded_doc_vectors[:, 1]
minX, maxX = x_coords.min(), x_coords.max()
minY, maxY = y_coords.min(), y_coords.max()

# Pad every side by 10% of the data range so labels at the edges stay inside the plot.
x_offset = 0.1 * (maxX - minX)
y_offset = 0.1 * (maxY - minY)

# Define the axes and draw each document as its one-letter label at its 2-D position.
plt.axis([minX - x_offset, maxX + x_offset, minY - y_offset, maxY + y_offset])
for point, label, color in embedded_doc_vectors_with_words:
    plt.text(point[0], point[1], label, bbox=color)

plt.title("Early Exaggeration: {} -- Vector Size: {} -- Epochs: {}".format(early_exaggeration, doc_vector_size, num_epochs))

# Supported file formats include png, pdf, and some more; jpg and jpeg are not supported.
# The plot is written next to the saved model, named after the early-exaggeration value.
plt.savefig(model_path + "/ee_{}.png".format(early_exaggeration))