From 993d98c1e44cb15a4d597e706350cbf471d5def3 Mon Sep 17 00:00:00 2001
From: ospitia <90850810+ospitia@users.noreply.github.com>
Date: Tue, 17 May 2022 14:27:58 +0200
Subject: [PATCH] Allow semantic search with taxonomies #69 (#80)

* Allow semantic search with taxonomies #69

* rdf querying for taxonomy CSO

* Update src/dossier_search/concept_search/faiss_search.py

Co-authored-by: Wojciech Kusa

* rdf search extension for CCS taxonomy

* fix missing requirement

* fix specific exceptions in get_id

* fix(taxonomy): semantic search works on M1 chip

- lexical search on M1 is disabled

* fix(taxonomy): fix settings import

* fix(taxonomy): faiss indexing works on cpu

* style(taxonomy): style with black

* build(taxonomy): update requirements

* fix(taxonomy): sort imports, catch more specific exceptions

* docs(taxonomy): update readme and change default M1_CHIP to False

* fix path parameter optional

Co-authored-by: Wojciech Kusa
Co-authored-by: Wojciech Kusa
Co-authored-by: Ayah Soufan
---
 README.md                                     |   9 +
 data/processed/.gitkeep                       |   0
 requirements.txt                              |   9 +-
 .../concept_search/faiss_search.py            | 121 +++++++
 .../concept_search/lexical_search.py          |  44 +++
 src/dossier_search/concept_search/taxonomy.py | 340 ++++++++++++++++--
 src/dossier_search/dossier_search/settings.py |   3 +
 7 files changed, 496 insertions(+), 30 deletions(-)
 create mode 100644 data/processed/.gitkeep
 create mode 100644 src/dossier_search/concept_search/faiss_search.py
 create mode 100644 src/dossier_search/concept_search/lexical_search.py

diff --git a/README.md b/README.md
index a870209..29a814d 100644
--- a/README.md
+++ b/README.md
@@ -70,3 +70,12 @@ Server should be available at http://127.0.0.1:8000/
 
 (cruise-literature)$ python manage.py runserver YOUR_IP:YOUR_PORT
 ```
+## 3. Troubleshooting
+
+### 3.1 M1 MacBook
+
+If you are using a laptop with the M1 chip, please change the following line in the [settings.py](src/dossier_search/dossier_search/settings.py) file:
+
+```python
+M1_CHIP = True
+```
\ No newline at end of file
diff --git a/data/processed/.gitkeep b/data/processed/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 75d9fc3..741ce3d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,11 @@ wikipedia~=1.4.0
 pandas==1.4.1
 xmltodict==0.12.0
 fuzzywuzzy==0.18.0
-requests~=2.27.1
\ No newline at end of file
+requests~=2.27.1
+torch==1.10.2
+transformers==4.14.1
+python-terrier==0.8.1
+rdflib==6.1.1
+faiss-cpu==1.7.2
+numpy~=1.22.3
+faiss-gpu==1.7.2
\ No newline at end of file
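Both `faiss-cpu` and `faiss-gpu` are pinned above; the new `faiss_search.py` below decides at runtime which build is active by probing for GPU-only attributes and catching `AttributeError`. A minimal sanity check in the same spirit (a sketch, not part of the patch; it only assumes one of the two pinned faiss builds is installed):

```python
# Probe which faiss build is active, mirroring the try/except
# AttributeError pattern used throughout faiss_search.py below.
import faiss

try:
    res = faiss.StandardGpuResources()  # only present in faiss-gpu builds
    print("faiss GPU support available")
except AttributeError:
    print("running with faiss-cpu")
```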
diff --git a/src/dossier_search/concept_search/faiss_search.py b/src/dossier_search/concept_search/faiss_search.py
new file mode 100644
index 0000000..fe6dfa6
--- /dev/null
+++ b/src/dossier_search/concept_search/faiss_search.py
@@ -0,0 +1,121 @@
+from os.path import exists
+
+import faiss
+import numpy as np
+import torch
+from dossier_search.settings import M1_CHIP
+from transformers import AutoTokenizer, AutoModel
+
+if M1_CHIP:
+    # solves problems with the MKL library on M1 MacBooks
+    # FIXME: this should be replaced by proper requirements for M1
+    import os
+
+    os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
+
+
+class SemanticSearch:
+    def __init__(self, data: list, tax_name: str):
+        model_name = "allenai/scibert_scivocab_cased"
+        self.model = AutoModel.from_pretrained(
+            model_name, output_hidden_states=True
+        ).eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=16)
+
+        self.n_dimensions = self.model.pooler.dense.out_features
+        self.data = data
+
+        index_path = "../../data/processed/Taxonomy{}index.bin".format(tax_name)
+
+        if exists(index_path):
+            self.taxonomy_index = faiss.read_index(index_path)
+        else:
+            self.taxonomy_index = self.create_faiss_index()
+            try:
+                faiss.write_index(
+                    faiss.index_gpu_to_cpu(self.taxonomy_index), index_path
+                )
+            except AttributeError:
+                faiss.write_index(self.taxonomy_index, index_path)
+
+        try:
+            res = faiss.StandardGpuResources()
+            self.taxonomy_index = faiss.index_cpu_to_gpu(res, 0, self.taxonomy_index)
+        except AttributeError:
+            pass
+
+    def embedding(self, word: str):
+        """
+        Embeds the input (a string or a list of strings) as the sum of
+        the last 4 hidden states of the BERT embeddings.
+        """
+        model = self.model
+        tokenizer = self.tokenizer
+        word = [word] if word.__class__ != list else word
+        marked_text = ["[CLS] " + i + " [SEP]" for i in word]
+
+        padding = "max_length"
+        tokenized_text = [
+            tokenizer.tokenize(i, padding=padding, truncation=True) for i in marked_text
+        ]
+        tokenized_text_ = [
+            tokenizer.tokenize(i, padding=False, truncation=True) for i in marked_text
+        ]
+        wordpiece_vectors = [len(i) - 1 for i in tokenized_text_]
+
+        indexed_tokens = [tokenizer.convert_tokens_to_ids(i) for i in tokenized_text]
+        tokens_tensor = torch.tensor([indexed_tokens]).squeeze(0)
+
+        with torch.no_grad():
+            outputs = model(tokens_tensor)
+            hidden_states = outputs["hidden_states"]
+
+        # combine the layers to make a single tensor
+        token_embeddings = torch.stack(hidden_states, dim=0)
+        token_embeddings = torch.squeeze(token_embeddings, dim=1)
+
+        try:
+            token_embeddings = token_embeddings.permute(1, 2, 0, 3)
+        except RuntimeError:
+            # RuntimeError: number of dims don't match in permute
+            token_embeddings = token_embeddings.unsqueeze(0).permute(0, 2, 1, 3)
+
+        out = []
+        for item, wordpiece_vector in zip(token_embeddings, wordpiece_vectors):
+            token_vecs_sum = []
+            for token in item:
+                sum_vec = torch.sum(token[-4:], dim=0)
+                token_vecs_sum.append(sum_vec)
+            out_i = torch.stack(token_vecs_sum[1:wordpiece_vector], dim=-1)
+            out_i = out_i.mean(dim=-1)
+            out.append(out_i)
+        return torch.stack(out)
+
+    def create_faiss_index(self):
+        """
+        Creates a faiss index of type FlatL2 from data vectors of n_dimensions.
+        """
+
+        fastIndex = faiss.IndexFlatL2(self.n_dimensions)
+
+        try:
+            # copy the index to GPU
+            res = faiss.StandardGpuResources()
+            fastIndex = faiss.index_cpu_to_gpu(res, 0, fastIndex)
+        except AttributeError:
+            pass
+
+        # index data
+        embeddings = self.embedding(self.data).detach().numpy()
+        fastIndex.add(np.stack(embeddings).astype("float32"))
+        return fastIndex
+
+    def do_faiss_lookup(self, text):
+        """
+        Semantic search of a word in the indexed collection.
+        """
+        vector = self.embedding(text).detach().numpy().astype("float32")
+
+        score, index = self.taxonomy_index.search(vector, 1)
+
+        return self.data[index[0][0]], score
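For reference, a usage sketch of `SemanticSearch` as it is wired up in `taxonomy.py` below (not part of the patch; the concept list is made up, the SciBERT model is downloaded on first use, and the relative path means the index lands in `data/processed/`, which this patch creates):

```python
# Usage sketch with a hypothetical concept list.
from concept_search.faiss_search import SemanticSearch

concepts = ["machine learning", "information retrieval", "neural networks"]
searcher = SemanticSearch(data=concepts, tax_name="Demo")

match, score = searcher.do_faiss_lookup("neural nets")
print(match, score)  # nearest concept by L2 distance over SciBERT embeddings
```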
diff --git a/src/dossier_search/concept_search/lexical_search.py b/src/dossier_search/concept_search/lexical_search.py
new file mode 100644
index 0000000..d5984af
--- /dev/null
+++ b/src/dossier_search/concept_search/lexical_search.py
@@ -0,0 +1,44 @@
+from os.path import exists, join
+
+import pyterrier as pt
+
+if not pt.started():
+    pt.init()
+
+
+class LexicalSearch:
+    def __init__(self, data: list, tax_name: str):
+        self.data = data
+        self.path = "../../data/processed/taxonomy{}iter_index".format(tax_name)
+        self.indexref = self.get_index()
+
+    def collection_iter(self, collection):
+        for idx, text in enumerate(collection):
+            yield {"docno": idx, "text": text}
+
+    def get_index(self):
+        if exists(join(self.path, "data.properties")):
+            indexref = pt.IndexFactory.of(join(self.path, "data.properties"))
+        else:
+            iter_indexer = pt.IterDictIndexer(self.path, blocks=True, overwrite=True)
+
+            doc_iter = self.collection_iter(self.data)
+            indexref = iter_indexer.index(doc_iter)
+
+        return indexref
+
+    def lexical_search(self, query):
+        retr_controls = {
+            "wmodel": "BM25",
+            "string.use_utf": "true",
+            "end": 0,
+        }
+
+        retr = pt.BatchRetrieve(self.indexref, controls=retr_controls)
+
+        res = retr.search(query)
+
+        # -100 is the "not found" sentinel; only trust BM25 scores above 7
+        if len(res) > 0 and res.score.values[0] > 7:
+            return self.data[int(res.docno.values[0])]
+        else:
+            return -100
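`LexicalSearch` is the BM25 fallback that `Taxonomy.get_id` in `taxonomy.py` below tries before resorting to semantic search. A usage sketch under the same assumptions as above (made-up concept list; PyTerrier needs a JVM and builds its index under `data/processed/` on first use):

```python
# Usage sketch, not part of the patch.
from concept_search.lexical_search import LexicalSearch

concepts = ["machine learning", "information retrieval", "neural networks"]
lex = LexicalSearch(data=concepts, tax_name="Demo")

hit = lex.lexical_search("information retreival")  # misspelled on purpose
print(hit)  # the matched concept string, or -100 if the BM25 score is <= 7
```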
diff --git a/src/dossier_search/concept_search/taxonomy.py b/src/dossier_search/concept_search/taxonomy.py
index c0d543e..bf13c1c 100644
--- a/src/dossier_search/concept_search/taxonomy.py
+++ b/src/dossier_search/concept_search/taxonomy.py
@@ -1,12 +1,37 @@
 from abc import ABC, abstractmethod
+from os.path import join as path_join
+from typing import Tuple
+
 import pandas as pd
 import xmltodict
+from dossier_search.settings import M1_CHIP
 from fuzzywuzzy import fuzz
+from rdflib import Graph, Namespace
 
 from .concept import Concept
+from .faiss_search import SemanticSearch
+
+if M1_CHIP:
+
+    class LexicalSearch:
+        """Mockup for LexicalSearch on laptops with the M1 chip.
+        FIXME: this should be replaced at some point by a different implementation
+        of lexical search.
+        """
+
+        def __init__(self, data, tax_name):
+            pass
+
+        def lexical_search(self, query):
+            raise IndexError
+
+else:
+    from .lexical_search import LexicalSearch
 
 
 class Taxonomy(ABC):
+    taxonomy = None
+
     def __init__(self):
         pass
 
@@ -14,17 +39,24 @@ def __init__(self):
     def read_taxonomy(self):
         pass
 
-    def get_id(self, query: str) -> str:
+    def get_id(self, query: str) -> Tuple[int, str]:
         query = query.lower().lstrip()
         taxonomy = self.taxonomy
         try:
             id = taxonomy[taxonomy.text == query].id.values[0]
-        except:
+        except IndexError:
             scores = taxonomy.text.apply(lambda x: fuzz.ratio(x, query))
             max_value, idx = scores.max(), scores.idxmax()
             # -100 is the id we use when the query is not found in the taxonomy
             id = taxonomy.iloc[idx].id if max_value > 90 else -100
-        return id
+        if id == -100:
+            try:
+                query_ = self.lexical_search(query)
+                id = taxonomy[taxonomy.text == query_].id.values[0]
+            except IndexError:
+                query, _ = self.semantic_search(query)
+                id = taxonomy[taxonomy.text == query].id.values[0]
+        return id, query
 
     def get_1st_level_parents(self, id):
         taxonomy = self.taxonomy
@@ -61,7 +93,7 @@ def get_children(self, id):
         return [self.assign_children(item) for item in children]
 
     def search_relationships(self, query):
-        id = self.get_id(query)
+        id, query = self.get_id(query)
         if id == -100:
             return Concept(-100, query)
         query = Concept(id, query)
@@ -87,7 +119,14 @@ class TaxonomyCCS(Taxonomy):
     def __init__(self, path):
         super().__init__()
         self.path = path
-        self.taxonomy = self.read_taxonomy()
+        self.taxonomy, self.concept_list = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CCS"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CCS"
+        ).lexical_search
+
         print("Taxonomy instantiated")
 
     def read_taxonomy(self):
@@ -116,54 +155,297 @@ def read_taxonomy(self):
         table = pd.DataFrame(example_list, columns=["id", "text", "child"])
         table = table.drop_duplicates()
         table.text = table.text.str.lower()
+        table.sort_values("text", inplace=True)
+        concept_list = list(table.text.unique())
+        concept_list.sort()
 
-        return table
+        return table, concept_list
 
 
 class TaxonomyCSO(Taxonomy):
     def __init__(self, path):
         super().__init__()
         self.path = path
-        self.taxonomy = self.read_taxonomy()
+        self.taxonomy, self.concept_list = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CSO"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CSO"
+        ).lexical_search
 
         print("Taxonomy instantiated")
 
     def read_taxonomy(self):
-        df = pd.read_csv(self.path, header=None,
-                         names=['item', 'relationship', 'item_rel'])
+        df = pd.read_csv(
+            self.path, header=None, names=["item", "relationship", "item_rel"]
+        )
         # all the items are urls like '<...>'; [:-1] removes the trailing '>'
-        df = df.applymap(lambda x: x.split('/')[-1][:-1]).copy()
+        df = df.applymap(lambda x: x.split("/")[-1][:-1]).copy()
         # there's a constant substring 'cso#' not needed in relationships
-        df.relationship = df.relationship.apply(lambda x: x.split('#')[-1])
+        df.relationship = df.relationship.apply(lambda x: x.split("#")[-1])
         # labels and type might not be used; not sure yet what contributesTo is
         # related links seem not well curated
-        df = df[~df.relationship.isin(['label', 'type', 'contributesTo', 'relatedLink'])]
+        df = df[~df.relationship.isin(["label", "type", "relatedLink"])]
         # there are some strings with a segment after %
         # e.g., numerical_analysis%2C_computer-assisted
-        df = df.applymap(lambda x: x.split('%')[0])
-        # names are give with '_'
+        df = df.applymap(lambda x: x.split("%")[0])
+        # names are given with '_'
         # e.g. automated_pattern_recognition
-        df = df.applymap(lambda x: ' '.join(x.split('_')).lower().lstrip())
+        df = df.applymap(lambda x: " ".join(x.split("_")).lower().lstrip())
+        # names are given with '-'
+        # e.g. context-aware systems
+        df = df.applymap(lambda x: " ".join(x.split("-")).lower().lstrip())
         # there are several relationships showing different ways of
         # writing/referring to the concept
-        # relationships = ['relatedEquivalent',
-        # 'relatedEquivalent', 'preferentialEquivalent'
-        # 'sameAs', 'relatedLink']
-        # trying to get same format adopted for ccs:
-        taxonomy = df[df.relationship == 'supertopicof'].copy()
+        # relationships = ['relatedequivalent', 'preferentialequivalent',
+        # 'sameas', 'contributesto']
+        df_temp = df[df.item != df.item_rel].copy()
+        unique_concepts = list(
+            df_temp[df_temp.relationship == "preferentialequivalent"].item
+        )
+        unique_concepts += set(df.item) - set(
+            df[df.relationship.isin(["preferentialequivalent"])].item
+        )
+        redundant_concepts = set(df.item) - set(unique_concepts)
+        df = df[
+            df.item.isin(unique_concepts) & (~df.item_rel.isin(redundant_concepts))
+        ].copy()
+        taxonomy = df[df.relationship == "supertopicof"].copy()
         # assign unique numbers to each concept
         text = set(list(taxonomy.item.values) + list(taxonomy.item_rel.values))
         ids = pd.DataFrame(list(text), columns=["text"]).reset_index()
-        ids = ids.rename(columns={'index': 'id'})
+        ids = ids.rename(columns={"index": "id"})
         ids.id = ids.id.astype(str)
         # merge to replace concepts by ids
-        taxonomy = taxonomy.merge(ids, left_on='item', right_on='text', how='outer')
-        taxonomy.drop(columns=['item'], inplace=True)
-        ids = ids.rename(columns={'id': 'child', 'text': 'item'})
-        taxonomy = taxonomy.merge(ids, left_on='item_rel', right_on='item', how='left')
-        taxonomy.drop(columns=['item', 'relationship', 'item_rel'], inplace=True)
+        taxonomy = taxonomy.merge(ids, left_on="item", right_on="text", how="outer")
+        taxonomy.drop(columns=["item"], inplace=True)
+        ids = ids.rename(columns={"id": "child", "text": "item"})
+        taxonomy = taxonomy.merge(ids, left_on="item_rel", right_on="item", how="left")
+        taxonomy.drop(columns=["item", "relationship", "item_rel"], inplace=True)
         taxonomy = taxonomy.drop_duplicates()
+        taxonomy = taxonomy[~(taxonomy.text == "")]
+        # required to match index
+        taxonomy.sort_values("text", inplace=True)
+        concept_list = list(taxonomy.text.unique())
+        concept_list.sort()
+        return taxonomy, concept_list
+
+
+class TaxonomyRDF(Taxonomy):
+    def __init__(self):
+        super().__init__()
+        self.namespace = None
+        self.rdf_query_parents = None
+        self.rdf_query_children = None
+        self.graph = None
+        self.MAX_DEPTH = 2
+        self.path = "../../data/external"
+
+    @abstractmethod
+    def read_taxonomy(self):
+        pass
+
+    def format_taxonomy(self, graph, query):
+        res = graph.query(query)
+
+        df = pd.DataFrame(
+            [
+                (
+                    x["x"].n3(graph.namespace_manager),
+                    x["z"].n3(graph.namespace_manager)[1:-4],
+                )
+                for x in res
+            ],
+            columns=["node", "text"],
+        )
+
+        df.node = df.node.apply(lambda x: x[1:-1])
+        # names are given with '-'
+        # e.g. context-aware systems
+        df.text = df.text.apply(lambda x: x.lower().lstrip())
+        df.text = df.text.apply(lambda x: " ".join(x.split("-")))
+        df.rename(columns={"node": "id"}, inplace=True)
+        df = df[~(df.text == "")]
+        df = df.reset_index(drop=True)
+
+        df.sort_values("text", inplace=True)
+        concept_list = list(df.text.unique())
+        concept_list.sort()
+
+        return df, concept_list
+
+    def assign_children(self, node, text, depth):
+        child = Concept(node, text)
+        if depth == self.MAX_DEPTH:
+            return child
+        child.children = self.get_children(child, depth)
+        return child
+
+    def get_children(self, query, depth):
+        depth += 1
+        rdf_query = self.rdf_query_children % {"node": query.id}
+        res = self.graph.query(rdf_query)
+        return [
+            self.assign_children(
+                i["x"], i["z"].n3(self.graph.namespace_manager)[1:-4], depth
+            )
+            for i in res
+        ]
+
+    def assign_parents(self, node, text, depth):
+        parent = Concept(node, text)
+        if depth == self.MAX_DEPTH:
+            return parent
+        parent.parents = self.get_parents(parent, depth)
+        return parent
+
+    def get_parents(self, query, depth):
+        depth += 1
+        rdf_query = self.rdf_query_parents % {"node": query.id}
+        res = self.graph.query(rdf_query)
+        return [
+            self.assign_parents(
+                i["x"], i["z"].n3(self.graph.namespace_manager)[1:-4], depth
+            )
+            for i in res
+        ]
+
+    def search(self, query):
+        node, query = self.get_id(query)
+        if node == -100:
+            return Concept(-100, query)
+        query = Concept(self.namespace[node], query)
+        query.children, query.parents = (
+            self.get_children(query, depth=0),
+            self.get_parents(query, depth=0),
+        )
+        return query
+
+
+class TaxonomyRDFCSO(TaxonomyRDF):
+    def __init__(self, path=None):
+        super().__init__()
+        if path is not None:
+            self.path = path
+        (
+            self.graph,
+            self.namespace,
+            self.taxonomy,
+            self.concept_list,
+        ) = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CSO_RDF"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CSO_RDF"
+        ).lexical_search
+
+        self.rdf_query_children = f"""
+            select ?x ?z where
+            {{
+                {{
+                    {{
+                        ?x ns0:preferentialEquivalent ?x .
+                    }}
+                    {{
+                        select * where
+                        {{
+                            <%(node)s> ns0:superTopicOf ?x . ?x ns1:label ?z .
+                        }}
+                    }}
+                }}
+                UNION
+                {{
+                    select * where
+                    {{
+                        <%(node)s> ns0:superTopicOf ?x .
+                        ?x ns1:label ?z
+                        FILTER (!EXISTS
+                        {{
+                            ?x ns0:preferentialEquivalent ?y
+                        }})
+                    }}
+                }}
+            }}
+        """
+        self.rdf_query_parents = f"""
+            select ?x ?z where
+            {{
+                {{
+                    {{
+                        ?x ns0:preferentialEquivalent ?x .
+                    }}
+                    {{
+                        select * where
+                        {{
+                            ?x ns0:superTopicOf <%(node)s> . ?x ns1:label ?z .
+                        }}
+                    }}
+                }}
+                UNION
+                {{
+                    select * where
+                    {{
+                        ?x ns0:superTopicOf <%(node)s> . ?x ns1:label ?z
+                        FILTER (!EXISTS
+                        {{
+                            ?x ns0:preferentialEquivalent ?y
+                        }})
+                    }}
+                }}
+            }}
+        """
+        print("Taxonomy instantiated")
+
+    def read_taxonomy(self):
+        namespace = Namespace("")
+        graph = Graph().parse(path_join(self.path, "CSO.3.3.ttl"), format="ttl")
+
+        query = f"select ?x ?z where {{ ?x ns1:label ?z}}"
+        df, concept_list = self.format_taxonomy(graph, query)
+
+        return graph, namespace, df, concept_list
+
+
+class TaxonomyRDFCCS(TaxonomyRDF):
+    def __init__(self, path=None):
+        super().__init__()
+        if path is not None:
+            self.path = path
+        (
+            self.graph,
+            self.namespace,
+            self.taxonomy,
+            self.concept_list,
+        ) = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CCS_RDF"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CCS_RDF"
+        ).lexical_search
+        self.rdf_query_children = (
+            f"select * where {{ <%(node)s> skos:narrower ?x . ?x skos:prefLabel ?z}}"
+        )
+        self.rdf_query_parents = (
+            f"select * where {{ <%(node)s> skos:broader ?x . ?x skos:prefLabel ?z}}"
+        )
+        print("Taxonomy instantiated")
+
+    def read_taxonomy(self):
+        # fix format: the source XML uses 'lang=' where rdflib expects 'xml:lang='
+        with open(path_join(self.path, "acm_ccs.xml"), "r") as f:
+            content = f.read()
+
+        fixed = content.replace("lang=", "xml:lang=")
+
+        with open(path_join(self.path, "acm_ccs_fixed.xml"), "w") as f:
+            f.write(fixed)
+
+        graph = Graph().parse(path_join(self.path, "acm_ccs_fixed.xml"), format="xml")
+        namespace = Namespace("")
+
+        query = f"select ?x ?z where {{ ?x skos:prefLabel ?z}}"
+        df, concept_list = self.format_taxonomy(graph, query)
 
-        # careful! it seems like there are some parenting loops
-        # 'search' method won't work
-        return taxonomy
+        return graph, namespace, df, concept_list
diff --git a/src/dossier_search/dossier_search/settings.py b/src/dossier_search/dossier_search/settings.py
index d9afe1b..36f1403 100644
--- a/src/dossier_search/dossier_search/settings.py
+++ b/src/dossier_search/dossier_search/settings.py
@@ -140,4 +140,7 @@
 
 DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
 
+# change to True if you are running this code on an M1 Apple MacBook
+M1_CHIP = False
+
 AUTH_USER_MODEL = 'users.User'
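Taken together, the RDF classes let a caller resolve free text through the `get_id` cascade (exact match, then fuzzy match, then BM25, then faiss) and walk the graph up to `MAX_DEPTH`. A minimal end-to-end sketch (not part of the patch; it assumes `acm_ccs.xml` is present under `data/external/` and that `Concept` exposes `text`, `children`, and `parents`, as the code above suggests):

```python
# End-to-end sketch; attribute names on Concept are assumptions.
from concept_search.taxonomy import TaxonomyRDFCCS

tax = TaxonomyRDFCCS()  # path defaults to "../../data/external"
concept = tax.search("machine learning")

print(concept.text)
print([c.text for c in concept.children])  # skos:narrower, up to MAX_DEPTH
print([p.text for p in concept.parents])   # skos:broader, up to MAX_DEPTH
```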