From 993d98c1e44cb15a4d597e706350cbf471d5def3 Mon Sep 17 00:00:00 2001
From: ospitia <90850810+ospitia@users.noreply.github.com>
Date: Tue, 17 May 2022 14:27:58 +0200
Subject: [PATCH] Allow semantic search with taxonomies #69 (#80)

* Allow semantic search with taxonomies #69

* rdf querying for taxonomy CSO

* Update src/dossier_search/concept_search/faiss_search.py

Co-authored-by: Wojciech Kusa

* rdf search extension for CCS taxonomy

* fix missing requirement

* fix specific exceptions in get_id

* fix(taxonomy): semantic search works on M1 chip

- lexical search on M1 is disabled

* fix(taxonomy): fix settings import

* fix(taxonomy): faiss indexing works on cpu

* style(taxonomy): style with black

* build(taxonomy): update requirements

* fix(taxonomy): sort imports, catch more specific exceptions

* docs(taxonomy): update readme and change default M1_CHIP to False

* fix path parameter optional

Co-authored-by: Wojciech Kusa
Co-authored-by: Wojciech Kusa
Co-authored-by: Ayah Soufan
---
 README.md                                     |   9 +
 data/processed/.gitkeep                       |   0
 requirements.txt                              |   9 +-
 .../concept_search/faiss_search.py            | 121 +++++++
 .../concept_search/lexical_search.py          |  44 +++
 src/dossier_search/concept_search/taxonomy.py | 340 ++++++++++++++++--
 src/dossier_search/dossier_search/settings.py |   3 +
 7 files changed, 496 insertions(+), 30 deletions(-)
 create mode 100644 data/processed/.gitkeep
 create mode 100644 src/dossier_search/concept_search/faiss_search.py
 create mode 100644 src/dossier_search/concept_search/lexical_search.py

diff --git a/README.md b/README.md
index a870209..29a814d 100644
--- a/README.md
+++ b/README.md
@@ -70,3 +70,12 @@ Server should be available at http://127.0.0.1:8000/
 
 (cruise-literature)$ python manage.py runserver YOUR_IP:YOUR_PORT
 ```
+## 3. Troubleshooting
+
+### 3.1 M1 MacBook
+
+If you are using a laptop with the M1 chip, please change the following line in the [settings.py](src/dossier_search/dossier_search/settings.py) file:
+
+```python
+M1_CHIP = True
+```
\ No newline at end of file
diff --git a/data/processed/.gitkeep b/data/processed/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 75d9fc3..741ce3d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,11 @@ wikipedia~=1.4.0
 pandas==1.4.1
 xmltodict==0.12.0
 fuzzywuzzy==0.18.0
-requests~=2.27.1
\ No newline at end of file
+requests~=2.27.1
+torch==1.10.2
+transformers==4.14.1
+python-terrier==0.8.1
+rdflib==6.1.1
+faiss-cpu==1.7.2
+numpy~=1.22.3
+faiss-gpu==1.7.2
\ No newline at end of file
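Both `faiss-cpu` and `faiss-gpu` are pinned above; the new `faiss_search.py` below decides at runtime which build is active by probing for GPU-only attributes and catching `AttributeError`. A minimal sanity check in the same spirit (a sketch, not part of the patch; it only assumes one of the two pinned faiss builds is installed):

```python
# Probe which faiss build is active, mirroring the try/except
# AttributeError pattern used throughout faiss_search.py below.
import faiss

try:
    res = faiss.StandardGpuResources()  # only present in faiss-gpu builds
    print("faiss GPU support available")
except AttributeError:
    print("running with faiss-cpu")
```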
diff --git a/src/dossier_search/concept_search/faiss_search.py b/src/dossier_search/concept_search/faiss_search.py
new file mode 100644
index 0000000..fe6dfa6
--- /dev/null
+++ b/src/dossier_search/concept_search/faiss_search.py
@@ -0,0 +1,121 @@
+from os.path import exists
+
+import faiss
+import numpy as np
+import torch
+from dossier_search.settings import M1_CHIP
+from transformers import AutoTokenizer, AutoModel
+
+if M1_CHIP:
+    # solves problems with the MKL library on M1 MacBooks
+    # FIXME: this should be replaced by proper requirements for M1
+    import os
+
+    os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
+
+
+class SemanticSearch:
+    def __init__(self, data: list, tax_name: str):
+        model_name = "allenai/scibert_scivocab_cased"
+        self.model = AutoModel.from_pretrained(
+            model_name, output_hidden_states=True
+        ).eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=16)
+
+        self.n_dimensions = self.model.pooler.dense.out_features
+        self.data = data
+
+        index_path = "../../data/processed/Taxonomy{}index.bin".format(tax_name)
+
+        if exists(index_path):
+            self.taxonomy_index = faiss.read_index(index_path)
+        else:
+            self.taxonomy_index = self.create_faiss_index()
+            try:
+                faiss.write_index(
+                    faiss.index_gpu_to_cpu(self.taxonomy_index), index_path
+                )
+            except AttributeError:
+                faiss.write_index(self.taxonomy_index, index_path)
+
+        try:
+            res = faiss.StandardGpuResources()
+            self.taxonomy_index = faiss.index_cpu_to_gpu(res, 0, self.taxonomy_index)
+        except AttributeError:
+            pass
+
+    def embedding(self, word: str):
+        """
+        Embeds the input (a string or a list of strings) as the sum of
+        the last 4 hidden states of the BERT embeddings.
+        """
+        model = self.model
+        tokenizer = self.tokenizer
+        word = [word] if word.__class__ != list else word
+        marked_text = ["[CLS] " + i + " [SEP]" for i in word]
+
+        padding = "max_length"
+        tokenized_text = [
+            tokenizer.tokenize(i, padding=padding, truncation=True) for i in marked_text
+        ]
+        tokenized_text_ = [
+            tokenizer.tokenize(i, padding=False, truncation=True) for i in marked_text
+        ]
+        wordpiece_vectors = [len(i) - 1 for i in tokenized_text_]
+
+        indexed_tokens = [tokenizer.convert_tokens_to_ids(i) for i in tokenized_text]
+        tokens_tensor = torch.tensor([indexed_tokens]).squeeze(0)
+
+        with torch.no_grad():
+            outputs = model(tokens_tensor)
+            hidden_states = outputs["hidden_states"]
+
+        # combine the layers to make a single tensor
+        token_embeddings = torch.stack(hidden_states, dim=0)
+        token_embeddings = torch.squeeze(token_embeddings, dim=1)
+
+        try:
+            token_embeddings = token_embeddings.permute(1, 2, 0, 3)
+        except RuntimeError:
+            # RuntimeError: number of dims don't match in permute
+            token_embeddings = token_embeddings.unsqueeze(0).permute(0, 2, 1, 3)
+
+        out = []
+        for item, wordpiece_vector in zip(token_embeddings, wordpiece_vectors):
+            token_vecs_sum = []
+            for token in item:
+                sum_vec = torch.sum(token[-4:], dim=0)
+                token_vecs_sum.append(sum_vec)
+            out_i = torch.stack(token_vecs_sum[1:wordpiece_vector], dim=-1)
+            out_i = out_i.mean(dim=-1)
+            out.append(out_i)
+        return torch.stack(out)
+
+    def create_faiss_index(self):
+        """
+        Creates a faiss index of type FlatL2 from data vectors of n_dimensions.
+        """
+
+        fastIndex = faiss.IndexFlatL2(self.n_dimensions)
+
+        try:
+            # copy the index to GPU
+            res = faiss.StandardGpuResources()
+            fastIndex = faiss.index_cpu_to_gpu(res, 0, fastIndex)
+        except AttributeError:
+            pass
+
+        # index data
+        embeddings = self.embedding(self.data).detach().numpy()
+        fastIndex.add(np.stack(embeddings).astype("float32"))
+        return fastIndex
+
+    def do_faiss_lookup(self, text):
+        """
+        Semantic search of a word in the indexed collection.
+        """
+        vector = self.embedding(text).detach().numpy().astype("float32")
+
+        score, index = self.taxonomy_index.search(vector, 1)
+
+        return self.data[index[0][0]], score
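For reference, a usage sketch of `SemanticSearch` as it is wired up in `taxonomy.py` below (not part of the patch; the concept list is made up, the SciBERT model is downloaded on first use, and the relative path means the index lands in `data/processed/`, which this patch creates):

```python
# Usage sketch with a hypothetical concept list.
from concept_search.faiss_search import SemanticSearch

concepts = ["machine learning", "information retrieval", "neural networks"]
searcher = SemanticSearch(data=concepts, tax_name="Demo")

match, score = searcher.do_faiss_lookup("neural nets")
print(match, score)  # nearest concept by L2 distance over SciBERT embeddings
```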
diff --git a/src/dossier_search/concept_search/lexical_search.py b/src/dossier_search/concept_search/lexical_search.py
new file mode 100644
index 0000000..d5984af
--- /dev/null
+++ b/src/dossier_search/concept_search/lexical_search.py
@@ -0,0 +1,44 @@
+from os.path import exists, join
+
+import pyterrier as pt
+
+if not pt.started():
+    pt.init()
+
+
+class LexicalSearch:
+    def __init__(self, data: list, tax_name: str):
+        self.data = data
+        self.path = "../../data/processed/taxonomy{}iter_index".format(tax_name)
+        self.indexref = self.get_index()
+
+    def collection_iter(self, collection):
+        for idx, text in enumerate(collection):
+            yield {"docno": idx, "text": text}
+
+    def get_index(self):
+        if exists(join(self.path, "data.properties")):
+            indexref = pt.IndexFactory.of(join(self.path, "data.properties"))
+        else:
+            iter_indexer = pt.IterDictIndexer(self.path, blocks=True, overwrite=True)
+
+            doc_iter = self.collection_iter(self.data)
+            indexref = iter_indexer.index(doc_iter)
+
+        return indexref
+
+    def lexical_search(self, query):
+        retr_controls = {
+            "wmodel": "BM25",
+            "string.use_utf": "true",
+            "end": 0,
+        }
+
+        retr = pt.BatchRetrieve(self.indexref, controls=retr_controls)
+
+        res = retr.search(query)
+
+        # -100 is the "not found" sentinel; only trust BM25 scores above 7
+        if len(res) > 0 and res.score.values[0] > 7:
+            return self.data[int(res.docno.values[0])]
+        else:
+            return -100
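`LexicalSearch` is the BM25 fallback that `Taxonomy.get_id` in `taxonomy.py` below tries before resorting to semantic search. A usage sketch under the same assumptions as above (made-up concept list; PyTerrier needs a JVM and builds its index under `data/processed/` on first use):

```python
# Usage sketch, not part of the patch.
from concept_search.lexical_search import LexicalSearch

concepts = ["machine learning", "information retrieval", "neural networks"]
lex = LexicalSearch(data=concepts, tax_name="Demo")

hit = lex.lexical_search("information retreival")  # misspelled on purpose
print(hit)  # the matched concept string, or -100 if the BM25 score is <= 7
```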
diff --git a/src/dossier_search/concept_search/taxonomy.py b/src/dossier_search/concept_search/taxonomy.py
index c0d543e..bf13c1c 100644
--- a/src/dossier_search/concept_search/taxonomy.py
+++ b/src/dossier_search/concept_search/taxonomy.py
@@ -1,12 +1,37 @@
 from abc import ABC, abstractmethod
+from os.path import join as path_join
+from typing import Tuple
+
 import pandas as pd
 import xmltodict
+from dossier_search.settings import M1_CHIP
 from fuzzywuzzy import fuzz
+from rdflib import Graph, Namespace
 
 from .concept import Concept
+from .faiss_search import SemanticSearch
+
+if M1_CHIP:
+
+    class LexicalSearch:
+        """Mockup for LexicalSearch on laptops with the M1 chip.
+        FIXME: this should be replaced at some point by a different implementation
+        of lexical search.
+        """
+
+        def __init__(self, data, tax_name):
+            pass
+
+        def lexical_search(self, query):
+            raise IndexError
+
+else:
+    from .lexical_search import LexicalSearch
 
 
 class Taxonomy(ABC):
+    taxonomy = None
+
     def __init__(self):
         pass
 
@@ -14,17 +39,24 @@ def __init__(self):
     def read_taxonomy(self):
         pass
 
-    def get_id(self, query: str) -> str:
+    def get_id(self, query: str) -> Tuple[int, str]:
         query = query.lower().lstrip()
         taxonomy = self.taxonomy
         try:
             id = taxonomy[taxonomy.text == query].id.values[0]
-        except:
+        except IndexError:
             scores = taxonomy.text.apply(lambda x: fuzz.ratio(x, query))
             max_value, idx = scores.max(), scores.idxmax()
             # -100 is the id we use when the query is not found in the taxonomy
             id = taxonomy.iloc[idx].id if max_value > 90 else -100
-        return id
+        if id == -100:
+            try:
+                query_ = self.lexical_search(query)
+                id = taxonomy[taxonomy.text == query_].id.values[0]
+            except IndexError:
+                query, _ = self.semantic_search(query)
+                id = taxonomy[taxonomy.text == query].id.values[0]
+        return id, query
 
     def get_1st_level_parents(self, id):
         taxonomy = self.taxonomy
@@ -61,7 +93,7 @@ def get_children(self, id):
         return [self.assign_children(item) for item in children]
 
     def search_relationships(self, query):
-        id = self.get_id(query)
+        id, query = self.get_id(query)
         if id == -100:
             return Concept(-100, query)
         query = Concept(id, query)
@@ -87,7 +119,14 @@ class TaxonomyCCS(Taxonomy):
     def __init__(self, path):
         super().__init__()
         self.path = path
-        self.taxonomy = self.read_taxonomy()
+        self.taxonomy, self.concept_list = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CCS"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CCS"
+        ).lexical_search
+
         print("Taxonomy instantiated")
 
     def read_taxonomy(self):
@@ -116,54 +155,297 @@ def read_taxonomy(self):
         table = pd.DataFrame(example_list, columns=["id", "text", "child"])
         table = table.drop_duplicates()
         table.text = table.text.str.lower()
+        table.sort_values("text", inplace=True)
+        concept_list = list(table.text.unique())
+        concept_list.sort()
 
-        return table
+        return table, concept_list
 
 
 class TaxonomyCSO(Taxonomy):
     def __init__(self, path):
         super().__init__()
         self.path = path
-        self.taxonomy = self.read_taxonomy()
+        self.taxonomy, self.concept_list = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CSO"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CSO"
+        ).lexical_search
 
         print("Taxonomy instantiated")
 
     def read_taxonomy(self):
-        df = pd.read_csv(self.path, header=None,
-                         names=['item', 'relationship', 'item_rel'])
+        df = pd.read_csv(
+            self.path, header=None, names=["item", "relationship", "item_rel"]
+        )
         # all the items are urls like '<...>'; [:-1] removes the trailing '>'
-        df = df.applymap(lambda x: x.split('/')[-1][:-1]).copy()
+        df = df.applymap(lambda x: x.split("/")[-1][:-1]).copy()
         # there's a constant substring 'cso#' not needed in relationships
-        df.relationship = df.relationship.apply(lambda x: x.split('#')[-1])
+        df.relationship = df.relationship.apply(lambda x: x.split("#")[-1])
         # labels and type might not be used; not sure yet what contributesTo is
         # related links seem not well curated
-        df = df[~df.relationship.isin(['label', 'type', 'contributesTo', 'relatedLink'])]
+        df = df[~df.relationship.isin(["label", "type", "relatedLink"])]
         # there are some strings with a segment after %
         # e.g., numerical_analysis%2C_computer-assisted
-        df = df.applymap(lambda x: x.split('%')[0])
-        # names are give with '_'
+        df = df.applymap(lambda x: x.split("%")[0])
+        # names are given with '_'
         # e.g. automated_pattern_recognition
-        df = df.applymap(lambda x: ' '.join(x.split('_')).lower().lstrip())
+        df = df.applymap(lambda x: " ".join(x.split("_")).lower().lstrip())
+        # names are given with '-'
+        # e.g. context-aware systems
+        df = df.applymap(lambda x: " ".join(x.split("-")).lower().lstrip())
         # there are several relationships showing different ways of
         # writing/referring to the concept
-        # relationships = ['relatedEquivalent',
-        # 'relatedEquivalent', 'preferentialEquivalent'
-        # 'sameAs', 'relatedLink']
-        # trying to get same format adopted for ccs:
-        taxonomy = df[df.relationship == 'supertopicof'].copy()
+        # relationships = ['relatedequivalent', 'preferentialequivalent',
+        # 'sameas', 'contributesto']
+        df_temp = df[df.item != df.item_rel].copy()
+        unique_concepts = list(
+            df_temp[df_temp.relationship == "preferentialequivalent"].item
+        )
+        unique_concepts += set(df.item) - set(
+            df[df.relationship.isin(["preferentialequivalent"])].item
+        )
+        redundant_concepts = set(df.item) - set(unique_concepts)
+        df = df[
+            df.item.isin(unique_concepts) & (~df.item_rel.isin(redundant_concepts))
+        ].copy()
+        taxonomy = df[df.relationship == "supertopicof"].copy()
         # assign unique numbers to each concept
         text = set(list(taxonomy.item.values) + list(taxonomy.item_rel.values))
         ids = pd.DataFrame(list(text), columns=["text"]).reset_index()
-        ids = ids.rename(columns={'index': 'id'})
+        ids = ids.rename(columns={"index": "id"})
         ids.id = ids.id.astype(str)
         # merge to replace concepts by ids
-        taxonomy = taxonomy.merge(ids, left_on='item', right_on='text', how='outer')
-        taxonomy.drop(columns=['item'], inplace=True)
-        ids = ids.rename(columns={'id': 'child', 'text': 'item'})
-        taxonomy = taxonomy.merge(ids, left_on='item_rel', right_on='item', how='left')
-        taxonomy.drop(columns=['item', 'relationship', 'item_rel'], inplace=True)
+        taxonomy = taxonomy.merge(ids, left_on="item", right_on="text", how="outer")
+        taxonomy.drop(columns=["item"], inplace=True)
+        ids = ids.rename(columns={"id": "child", "text": "item"})
+        taxonomy = taxonomy.merge(ids, left_on="item_rel", right_on="item", how="left")
+        taxonomy.drop(columns=["item", "relationship", "item_rel"], inplace=True)
         taxonomy = taxonomy.drop_duplicates()
+        taxonomy = taxonomy[~(taxonomy.text == "")]
+        # required to match index
+        taxonomy.sort_values("text", inplace=True)
+        concept_list = list(taxonomy.text.unique())
+        concept_list.sort()
+        return taxonomy, concept_list
+
+
+class TaxonomyRDF(Taxonomy):
+    def __init__(self):
+        super().__init__()
+        self.namespace = None
+        self.rdf_query_parents = None
+        self.rdf_query_children = None
+        self.graph = None
+        self.MAX_DEPTH = 2
+        self.path = "../../data/external"
+
+    @abstractmethod
+    def read_taxonomy(self):
+        pass
+
+    def format_taxonomy(self, graph, query):
+        res = graph.query(query)
+
+        df = pd.DataFrame(
+            [
+                (
+                    x["x"].n3(graph.namespace_manager),
+                    x["z"].n3(graph.namespace_manager)[1:-4],
+                )
+                for x in res
+            ],
+            columns=["node", "text"],
+        )
+
+        df.node = df.node.apply(lambda x: x[1:-1])
+        # names are given with '-'
+        # e.g. context-aware systems
+        df.text = df.text.apply(lambda x: x.lower().lstrip())
+        df.text = df.text.apply(lambda x: " ".join(x.split("-")))
+        df.rename(columns={"node": "id"}, inplace=True)
+        df = df[~(df.text == "")]
+        df = df.reset_index(drop=True)
+
+        df.sort_values("text", inplace=True)
+        concept_list = list(df.text.unique())
+        concept_list.sort()
+
+        return df, concept_list
+
+    def assign_children(self, node, text, depth):
+        child = Concept(node, text)
+        if depth == self.MAX_DEPTH:
+            return child
+        child.children = self.get_children(child, depth)
+        return child
+
+    def get_children(self, query, depth):
+        depth += 1
+        rdf_query = self.rdf_query_children % {"node": query.id}
+        res = self.graph.query(rdf_query)
+        return [
+            self.assign_children(
+                i["x"], i["z"].n3(self.graph.namespace_manager)[1:-4], depth
+            )
+            for i in res
+        ]
+
+    def assign_parents(self, node, text, depth):
+        parent = Concept(node, text)
+        if depth == self.MAX_DEPTH:
+            return parent
+        parent.parents = self.get_parents(parent, depth)
+        return parent
+
+    def get_parents(self, query, depth):
+        depth += 1
+        rdf_query = self.rdf_query_parents % {"node": query.id}
+        res = self.graph.query(rdf_query)
+        return [
+            self.assign_parents(
+                i["x"], i["z"].n3(self.graph.namespace_manager)[1:-4], depth
+            )
+            for i in res
+        ]
+
+    def search(self, query):
+        node, query = self.get_id(query)
+        if node == -100:
+            return Concept(-100, query)
+        query = Concept(self.namespace[node], query)
+        query.children, query.parents = (
+            self.get_children(query, depth=0),
+            self.get_parents(query, depth=0),
+        )
+        return query
+
+
+class TaxonomyRDFCSO(TaxonomyRDF):
+    def __init__(self, path=None):
+        super().__init__()
+        if path is not None:
+            self.path = path
+        (
+            self.graph,
+            self.namespace,
+            self.taxonomy,
+            self.concept_list,
+        ) = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CSO_RDF"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CSO_RDF"
+        ).lexical_search
+
+        self.rdf_query_children = f"""
+            select ?x ?z where
+            {{
+                {{
+                    {{
+                        ?x ns0:preferentialEquivalent ?x .
+                    }}
+                    {{
+                        select * where
+                        {{
+                            <%(node)s> ns0:superTopicOf ?x . ?x ns1:label ?z .
+                        }}
+                    }}
+                }}
+                UNION
+                {{
+                    select * where
+                    {{
+                        <%(node)s> ns0:superTopicOf ?x .
+                        ?x ns1:label ?z
+                        FILTER (!EXISTS
+                        {{
+                            ?x ns0:preferentialEquivalent ?y
+                        }})
+                    }}
+                }}
+            }}
+        """
+        self.rdf_query_parents = f"""
+            select ?x ?z where
+            {{
+                {{
+                    {{
+                        ?x ns0:preferentialEquivalent ?x .
+                    }}
+                    {{
+                        select * where
+                        {{
+                            ?x ns0:superTopicOf <%(node)s> . ?x ns1:label ?z .
+                        }}
+                    }}
+                }}
+                UNION
+                {{
+                    select * where
+                    {{
+                        ?x ns0:superTopicOf <%(node)s> . ?x ns1:label ?z
+                        FILTER (!EXISTS
+                        {{
+                            ?x ns0:preferentialEquivalent ?y
+                        }})
+                    }}
+                }}
+            }}
+        """
+        print("Taxonomy instantiated")
+
+    def read_taxonomy(self):
+        namespace = Namespace("")
+        graph = Graph().parse(path_join(self.path, "CSO.3.3.ttl"), format="ttl")
+
+        query = f"select ?x ?z where {{ ?x ns1:label ?z}}"
+        df, concept_list = self.format_taxonomy(graph, query)
+
+        return graph, namespace, df, concept_list
+
+
+class TaxonomyRDFCCS(TaxonomyRDF):
+    def __init__(self, path=None):
+        super().__init__()
+        if path is not None:
+            self.path = path
+        (
+            self.graph,
+            self.namespace,
+            self.taxonomy,
+            self.concept_list,
+        ) = self.read_taxonomy()
+        self.semantic_search = SemanticSearch(
+            data=self.concept_list, tax_name="CCS_RDF"
+        ).do_faiss_lookup
+        self.lexical_search = LexicalSearch(
+            data=self.concept_list, tax_name="CCS_RDF"
+        ).lexical_search
+        self.rdf_query_children = (
+            f"select * where {{ <%(node)s> skos:narrower ?x . ?x skos:prefLabel ?z}}"
+        )
+        self.rdf_query_parents = (
+            f"select * where {{ <%(node)s> skos:broader ?x . ?x skos:prefLabel ?z}}"
+        )
+        print("Taxonomy instantiated")
+
+    def read_taxonomy(self):
+        # fix format: the source XML uses 'lang=' where rdflib expects 'xml:lang='
+        with open(path_join(self.path, "acm_ccs.xml"), "r") as f:
+            content = f.read()
+
+        fixed = content.replace("lang=", "xml:lang=")
+
+        with open(path_join(self.path, "acm_ccs_fixed.xml"), "w") as f:
+            f.write(fixed)
+
+        graph = Graph().parse(path_join(self.path, "acm_ccs_fixed.xml"), format="xml")
+        namespace = Namespace("")
+
+        query = f"select ?x ?z where {{ ?x skos:prefLabel ?z}}"
+        df, concept_list = self.format_taxonomy(graph, query)
 
-        # careful! it seems like there are some parenting loops
-        # 'search' method won't work
-        return taxonomy
+        return graph, namespace, df, concept_list
diff --git a/src/dossier_search/dossier_search/settings.py b/src/dossier_search/dossier_search/settings.py
index d9afe1b..36f1403 100644
--- a/src/dossier_search/dossier_search/settings.py
+++ b/src/dossier_search/dossier_search/settings.py
@@ -140,4 +140,7 @@
 
 DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
 
+# change to True if you are running this code on an M1 Apple MacBook
+M1_CHIP = False
+
 AUTH_USER_MODEL = 'users.User'
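Taken together, the RDF classes let a caller resolve free text through the `get_id` cascade (exact match, then fuzzy match, then BM25, then faiss) and walk the graph up to `MAX_DEPTH`. A minimal end-to-end sketch (not part of the patch; it assumes `acm_ccs.xml` is present under `data/external/` and that `Concept` exposes `text`, `children`, and `parents`, as the code above suggests):

```python
# End-to-end sketch; attribute names on Concept are assumptions.
from concept_search.taxonomy import TaxonomyRDFCCS

tax = TaxonomyRDFCCS()  # path defaults to "../../data/external"
concept = tax.search("machine learning")

print(concept.text)
print([c.text for c in concept.children])  # skos:narrower, up to MAX_DEPTH
print([p.text for p in concept.parents])   # skos:broader, up to MAX_DEPTH
```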