From eb2eff9bacde329430402a01b82d9ab855f5a4f8 Mon Sep 17 00:00:00 2001
From: PA <45755573+EquinetPaul@users.noreply.github.com>
Date: Fri, 17 Feb 2023 13:37:02 +0100
Subject: [PATCH 1/4] Update leet_topic.py

Remove the duplicate import of SentenceTransformer.
---
 leet_topic/leet_topic.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/leet_topic/leet_topic.py b/leet_topic/leet_topic.py
index 2b23a94..a303c58 100644
--- a/leet_topic/leet_topic.py
+++ b/leet_topic/leet_topic.py
@@ -12,7 +12,6 @@
 from bokeh.models import ColumnDataSource, CustomJS, DataTable, TableColumn, MultiChoice, HTMLTemplateFormatter, TextAreaInput, Div
 from bokeh.plotting import figure, output_file, show
 import pandas as pd
-from sentence_transformers import SentenceTransformer
 import umap
 import hdbscan
 from bokeh.palettes import Category10, Cividis256, Turbo256

From ff8cc52f4866ab8604c14bd55b6b894cb1dbe5f8 Mon Sep 17 00:00:00 2001
From: PA <45755573+EquinetPaul@users.noreply.github.com>
Date: Fri, 17 Feb 2023 13:38:30 +0100
Subject: [PATCH 2/4] Update leet_topic.py

Replace the "get_feature_names()" method with "get_feature_names_out()" for scikit-learn >= 1.2.
---
 leet_topic/leet_topic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/leet_topic/leet_topic.py b/leet_topic/leet_topic.py
index a303c58..b60165a 100644
--- a/leet_topic/leet_topic.py
+++ b/leet_topic/leet_topic.py
@@ -302,7 +302,7 @@ def create_tfidf(df, topic_data, document_field, spacy_model):
     df["lemma_docs"] = lemma_docs
     vectorizer = TfidfVectorizer(stop_words="english")
     vectors = vectorizer.fit_transform(lemma_docs)
-    feature_names = vectorizer.get_feature_names()
+    feature_names = vectorizer.get_feature_names_out()
     dense = vectors.todense()
     denselist = dense.tolist()
     tfidf_df = pd.DataFrame(denselist, columns=feature_names)
@@ -310,7 +310,7 @@ def create_tfidf(df, topic_data, document_field, spacy_model):
     top_n = 10
     tfidf_words = []
     for vector in vectors:
-        top_words = (sorted(list(zip(vectorizer.get_feature_names(),
+        top_words = (sorted(list(zip(vectorizer.get_feature_names_out(),
                                      vector.sum(0).getA1())),
                             key=lambda x: x[1], reverse=True)[:top_n])
         tfidf_words.append(top_words)
@@ -325,7 +325,7 @@ def create_tfidf(df, topic_data, document_field, spacy_model):
 
     for leet_label, data in topic_data.items():
         X = vectorizer.fit_transform(data["doc_lemmas"])
-        words = (sorted(list(zip(vectorizer.get_feature_names(),
+        words = (sorted(list(zip(vectorizer.get_feature_names_out(),
                               X.sum(0).getA1())),
                      key=lambda x: x[1], reverse=True)[:top_n])
         topic_data[leet_label]["key_words"] = words

From 2784c914d387e9f5c7a3c2020a4dd21c5f9407ee Mon Sep 17 00:00:00 2001
From: PA <45755573+EquinetPaul@users.noreply.github.com>
Date: Fri, 17 Feb 2023 13:54:49 +0100
Subject: [PATCH 3/4] Update leet_topic.py

Add an embeddings parameter to LeetTopic so it can accept pre-computed embeddings and skip the encoding step, speeding up overall processing.
---
 leet_topic/leet_topic.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/leet_topic/leet_topic.py b/leet_topic/leet_topic.py
index b60165a..c0670f4 100644
--- a/leet_topic/leet_topic.py
+++ b/leet_topic/leet_topic.py
@@ -229,16 +229,21 @@ def list_creator(fields, str_type=""):
     show(layout)
 
 
-def create_labels(df, document_field, encoding_model,
+def create_labels(df, document_field, encoding_model, embeddings=None,
                   umap_params={"n_neighbors": 50, "min_dist": 0.01, "metric": 'correlation'},
                   hdbscan_params={"min_samples": 10, "min_cluster_size": 50}):
 
-    #Load Transformer Model
-    model = SentenceTransformer(encoding_model)
+    if embeddings is not None:
+        # Use pre-computed embeddings
+        logging.info("Using pre-computed embeddings")
+        doc_embeddings = embeddings
+    else:
+        #Load Transformer Model
+        model = SentenceTransformer(encoding_model)
 
-    #Create Document Embeddings
-    logging.info("Encoding Documents")
-    doc_embeddings = model.encode(df[document_field])
+        #Create Document Embeddings
+        logging.info("Encoding Documents")
+        doc_embeddings = model.encode(df[document_field])
 
     #Create UMAP Projection
     logging.info("Creating UMAP Projections")
@@ -369,6 +374,7 @@ def LeetTopic(df: pd.DataFrame,
               encoding_model='all-MiniLM-L6-v2',
               umap_params={"n_neighbors": 50, "min_dist": 0.01, "metric": 'correlation'},
               hdbscan_params={"min_samples": 10, "min_cluster_size": 50},
+              embeddings = None,
               app_name=""
               ):
     """
@@ -401,6 +407,9 @@ def LeetTopic(df: pd.DataFrame,
     hdbscan_params: dict (Optional default {"min_samples": 10, "min_cluster_size": 50})
         dictionary of keys to HBDscan params and values for those params
 
+    embeddings: numpy.ndarray (Optional)
+        pre-computed embeddings
+
     app_name: str (Optional)
         title of your Bokeh application
 
@@ -417,7 +426,7 @@ def LeetTopic(df: pd.DataFrame,
 
     download_spacy_model(spacy_model)
 
-    df = create_labels(df, document_field, encoding_model, umap_params=umap_params, hdbscan_params=hdbscan_params)
+    df = create_labels(df, document_field, encoding_model, umap_params=umap_params, hdbscan_params=hdbscan_params, embeddings=embeddings)
     logging.info("Calculating the Center of the Topic Clusters")
     topic_data = find_centers(df)
     logging.info(f"Recalculating clusters based on a max distance of {max_distance} from any topic vector")

From 02932bea7918cb62375f6c648ef2447dfb3e1cb8 Mon Sep 17 00:00:00 2001
From: PA <45755573+EquinetPaul@users.noreply.github.com>
Date: Fri, 17 Feb 2023 14:01:11 +0100
Subject: [PATCH 4/4] Update leet_topic.py

Save the embeddings to a pickle file after computing them.
---
 leet_topic/leet_topic.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/leet_topic/leet_topic.py b/leet_topic/leet_topic.py
index c0670f4..f82266b 100644
--- a/leet_topic/leet_topic.py
+++ b/leet_topic/leet_topic.py
@@ -22,6 +22,7 @@
 import string
 import logging
 import warnings
+import pickle
 
 warnings.filterwarnings("ignore")
 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
@@ -237,6 +238,7 @@ def create_labels(df, document_field, encoding_model, embeddings=None,
         # Use pre-computed embeddings
         logging.info("Using pre-computed embeddings")
         doc_embeddings = embeddings
+
     else:
         #Load Transformer Model
         model = SentenceTransformer(encoding_model)
@@ -245,6 +247,11 @@ def create_labels(df, document_field, encoding_model, embeddings=None,
         logging.info("Encoding Documents")
         doc_embeddings = model.encode(df[document_field])
 
+        # Save embeddings
+        logging.info("Saving Embeddings")
+        with open("embeddings.pickle", "wb") as fichier:
+            pickle.dump(doc_embeddings, fichier)
+
     #Create UMAP Projection
     logging.info("Creating UMAP Projections")
     umap_proj = umap.UMAP(**umap_params).fit_transform(doc_embeddings)
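
Note on PATCH 2/4: get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so get_feature_names_out() is the forward-compatible spelling. If the project also needed to keep running on scikit-learn older than 1.0, a small fallback helper along these lines could cover both APIs; this is a hypothetical sketch, not part of the patch:

    from sklearn.feature_extraction.text import TfidfVectorizer

    def feature_names(vectorizer: TfidfVectorizer):
        # get_feature_names_out() exists since scikit-learn 1.0;
        # get_feature_names() was removed in 1.2.
        if hasattr(vectorizer, "get_feature_names_out"):
            return vectorizer.get_feature_names_out()
        return vectorizer.get_feature_names()  # scikit-learn < 1.0 fallback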
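
Taken together, PATCH 3/4 and PATCH 4/4 enable a two-pass workflow: the first run encodes the documents and writes them to embeddings.pickle, and later runs can pass the loaded array back through the new embeddings parameter to skip encoding entirely. A minimal usage sketch follows; the DataFrame contents, the "descriptions" column name, and the html_filename argument are illustrative assumptions rather than part of the diff:

    import pickle

    import pandas as pd

    from leet_topic import leet_topic

    # Toy data standing in for a real corpus.
    df = pd.DataFrame({"descriptions": ["first document", "second document"]})

    # First run: documents are encoded with the SentenceTransformer model and,
    # per PATCH 4/4, the embeddings are saved to embeddings.pickle as a side effect.
    leet_df, topic_data = leet_topic.LeetTopic(df,
                                               document_field="descriptions",
                                               html_filename="demo.html")

    # Later runs: reload the saved embeddings and hand them to LeetTopic
    # through the embeddings parameter added in PATCH 3/4, skipping encoding.
    with open("embeddings.pickle", "rb") as f:
        doc_embeddings = pickle.load(f)

    leet_df, topic_data = leet_topic.LeetTopic(df,
                                               document_field="descriptions",
                                               html_filename="demo.html",
                                               embeddings=doc_embeddings)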