save and use pre-computed embeddings, replace tf_idf vectorizer method #16

Status: Open. Wants to merge 4 commits into base: main.
37 changes: 26 additions & 11 deletions leet_topic/leet_topic.py
@@ -12,7 +12,6 @@
 from bokeh.models import ColumnDataSource, CustomJS, DataTable, TableColumn, MultiChoice, HTMLTemplateFormatter, TextAreaInput, Div
 from bokeh.plotting import figure, output_file, show
 import pandas as pd
-from sentence_transformers import SentenceTransformer
 import umap
 import hdbscan
 from bokeh.palettes import Category10, Cividis256, Turbo256
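One wrinkle worth flagging: this hunk drops the module-level SentenceTransformer import, yet the hunks below still call SentenceTransformer(encoding_model) inside create_labels when no embeddings are passed, which would raise a NameError on that path as far as the visible diff shows. A minimal sketch of a lazy import that would keep that path working (hypothetical, not part of this PR; encode_documents is an illustrative helper name):

```python
import pandas as pd


def encode_documents(df: pd.DataFrame, document_field: str,
                     encoding_model: str, embeddings=None):
    """Return pre-computed embeddings, or encode the documents."""
    if embeddings is not None:
        # Reuse pre-computed embeddings; sentence_transformers never loads.
        return embeddings
    # Lazy import: the dependency is only needed when we actually encode.
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer(encoding_model)
    return model.encode(df[document_field])
```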
Expand All @@ -23,6 +22,7 @@
import string
import logging
import warnings
import pickle

warnings.filterwarnings("ignore")
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
@@ -230,16 +230,27 @@ def list_creator(fields, str_type=""):
     show(layout)


-def create_labels(df, document_field, encoding_model,
+def create_labels(df, document_field, encoding_model, embeddings=None,
                   umap_params={"n_neighbors": 50, "min_dist": 0.01, "metric": 'correlation'},
                   hdbscan_params={"min_samples": 10, "min_cluster_size": 50}):

-    #Load Transformer Model
-    model = SentenceTransformer(encoding_model)
+    if embeddings is not None:
+        # Use pre-computed embeddings
+        logging.info("Using pre-computed embeddings")
+        doc_embeddings = embeddings

-    #Create Document Embeddings
-    logging.info("Encoding Documents")
-    doc_embeddings = model.encode(df[document_field])
+    else:
+        #Load Transformer Model
+        model = SentenceTransformer(encoding_model)
+
+        #Create Document Embeddings
+        logging.info("Encoding Documents")
+        doc_embeddings = model.encode(df[document_field])
+
+        # Save embeddings
+        logging.info("Saving Embeddings")
+        with open("embeddings.pickle", "wb") as fichier:
+            pickle.dump(doc_embeddings, fichier)

     #Create UMAP Projection
     logging.info("Creating UMAP Projections")
@@ -303,15 +314,15 @@ def create_tfidf(df, topic_data, document_field, spacy_model):
     df["lemma_docs"] = lemma_docs
     vectorizer = TfidfVectorizer(stop_words="english")
     vectors = vectorizer.fit_transform(lemma_docs)
-    feature_names = vectorizer.get_feature_names()
+    feature_names = vectorizer.get_feature_names_out()
     dense = vectors.todense()
     denselist = dense.tolist()
     tfidf_df = pd.DataFrame(denselist, columns=feature_names)

     top_n = 10
     tfidf_words = []
     for vector in vectors:
-        top_words = (sorted(list(zip(vectorizer.get_feature_names(),
+        top_words = (sorted(list(zip(vectorizer.get_feature_names_out(),
                                      vector.sum(0).getA1())),
                             key=lambda x: x[1], reverse=True)[:top_n])
         tfidf_words.append(top_words)
@@ -326,7 +337,7 @@ def create_tfidf(df, topic_data, document_field, spacy_model):

     for leet_label, data in topic_data.items():
         X = vectorizer.fit_transform(data["doc_lemmas"])
-        words = (sorted(list(zip(vectorizer.get_feature_names(),
+        words = (sorted(list(zip(vectorizer.get_feature_names_out(),
                                  X.sum(0).getA1())),
                         key=lambda x: x[1], reverse=True)[:top_n])
         topic_data[leet_label]["key_words"] = words
@@ -370,6 +381,7 @@ def LeetTopic(df: pd.DataFrame,
               encoding_model='all-MiniLM-L6-v2',
               umap_params={"n_neighbors": 50, "min_dist": 0.01, "metric": 'correlation'},
               hdbscan_params={"min_samples": 10, "min_cluster_size": 50},
+              embeddings=None,
               app_name=""
               ):
     """
@@ -402,6 +414,9 @@
     hdbscan_params: dict (Optional default {"min_samples": 10, "min_cluster_size": 50})
         dictionary of keys to HDBSCAN params and values for those params

+    embeddings: numpy.ndarray (Optional)
+        pre-computed document embeddings; when supplied, the encoding model is not loaded
+
     app_name: str (Optional)
         title of your Bokeh application
@@ -418,7 +433,7 @@

     download_spacy_model(spacy_model)

-    df = create_labels(df, document_field, encoding_model, umap_params=umap_params, hdbscan_params=hdbscan_params)
+    df = create_labels(df, document_field, encoding_model, umap_params=umap_params, hdbscan_params=hdbscan_params, embeddings=embeddings)
     logging.info("Calculating the Center of the Topic Clusters")
     topic_data = find_centers(df)
     logging.info(f"Recalculating clusters based on a max distance of {max_distance} from any topic vector")
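End to end, the new keyword threads from LeetTopic down into create_labels. A sketch of how a caller might use it, assuming the two-value return and the document_field/html_filename parameters shown in the project README (the DataFrame contents and filenames are illustrative):

```python
import pickle

import pandas as pd
from leet_topic import LeetTopic

df = pd.DataFrame({"descriptions": ["first document text",
                                    "second document text"]})

# First run: documents are encoded and embeddings.pickle is written.
new_df, topic_data = LeetTopic(df,
                               document_field="descriptions",
                               html_filename="demo.html")

# Later runs: reload the saved embeddings and skip encoding.
with open("embeddings.pickle", "rb") as f:
    saved_embeddings = pickle.load(f)

new_df, topic_data = LeetTopic(df,
                               document_field="descriptions",
                               html_filename="demo.html",
                               embeddings=saved_embeddings)
```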