Skip to content

Commit

Permalink
semsim multi-compare (#612)
Browse files Browse the repository at this point in the history
Add a compare endpoint that takes one set and runs a semsim compare
against a list of other sets
  • Loading branch information
kevinschaper authored Mar 2, 2024
1 parent 501f3a6 commit 8fa071b
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 82 deletions.
82 changes: 4 additions & 78 deletions backend/src/monarch_py/api/config.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os
import requests as rq
from functools import lru_cache
from typing import List

from pydantic import BaseModel

from monarch_py.implementations.solr.solr_implementation import SolrImplementation
from monarch_py.implementations.spacy.spacy_implementation import SpacyImplementation
from monarch_py.datamodels.model import TermSetPairwiseSimilarity, SemsimSearchResult

from monarch_py.service.semsim_service import SemsimianService

class Settings(BaseModel):
solr_host: str = os.getenv("SOLR_HOST") if os.getenv("SOLR_HOST") else "127.0.0.1"
Expand All @@ -28,83 +26,11 @@ def solr():
return SolrImplementation(settings.solr_url)


def convert_nans(input_dict, to_value=None):
    """
    Given an input dict of the form {<term>: {<field>: <value>, ...}},
    convert any <value> of 'NaN' to `to_value` (None by default).

    Mutates `input_dict` in place and also returns it for convenience.
    """
    for term, fields in input_dict.items():
        for field, value in fields.items():
            if value == "NaN":
                # BUG FIX: previously hardcoded None, silently ignoring `to_value`.
                input_dict[term][field] = to_value

    return input_dict


class SemsimianHTTPRequester:
    """A class that makes HTTP requests to the semsimian_server."""

    def convert_tsps_data(self, data):
        """Convert to a format that can be coerced into a TermSetPairwiseSimilarity model
        FIXME: currently, the response returned from semsimian_server doesn't
        100% match the TermSetPairwiseSimilarity model, so we perform some
        transformations below. once it does, we can remove all the code below
        and just return TermSetPairwiseSimilarity(**data)
        """
        # Pull the similarity maps out of the payload; each entry gets folded
        # back into the corresponding *_best_matches dict below.
        object_similarities = convert_nans(data.pop("object_best_matches_similarity_map"))
        subject_similarities = convert_nans(data.pop("subject_best_matches_similarity_map"))

        def flatten(termset):
            # Each termset arrives as a list of single-entry dicts.
            flattened = {}
            for entry in termset:
                flattened.update(entry)
            return flattened

        def with_similarity(best_matches, similarity_map):
            merged = {}
            for key, match in best_matches.items():
                merged[key] = dict(match, similarity=similarity_map[key])
            return merged

        converted = dict(data)
        converted["subject_termset"] = flatten(data["subject_termset"])
        converted["object_termset"] = flatten(data["object_termset"])
        converted["subject_best_matches"] = with_similarity(data["subject_best_matches"], subject_similarities)
        converted["object_best_matches"] = with_similarity(data["object_best_matches"], object_similarities)
        return converted

    def compare(self, subjects: List[str], objects: List[str]):
        """Fetch a pairwise similarity comparison of two termsets from semsimian_server."""
        base = f"http://{settings.semsim_server_host}:{settings.semsim_server_port}"
        url = f"{base}/compare/{','.join(subjects)}/{','.join(objects)}"

        print(f"Fetching {url}...")
        payload = rq.get(url=url).json()
        return TermSetPairwiseSimilarity(**self.convert_tsps_data(payload))

    def search(self, termset: List[str], prefix: str, limit: int):
        """Fetch semsim search hits for a termset, restricted to `prefix`."""
        base = f"http://{settings.semsim_server_host}:{settings.semsim_server_port}"
        url = f"{base}/search/{','.join(termset)}/{prefix}?limit={limit}"

        print(f"Fetching {url}...")
        hits = rq.get(url=url).json()

        results = []
        # Each hit is [score, tsps-payload, entity id].
        for hit in hits:
            results.append(
                SemsimSearchResult(
                    subject=solr().get_entity(hit[2], extra=False),
                    score=hit[0],
                    similarity=self.convert_tsps_data(hit[1]),
                )
            )

        return results


@lru_cache(maxsize=1)
def semsimian():
    # NOTE(review): this function contains two return statements; the second is
    # unreachable. This looks like deleted + added diff lines merged into one
    # body — confirm which implementation is intended (the surrounding change
    # replaces SemsimianHTTPRequester with SemsimianService) and delete the
    # other return.
    return SemsimianHTTPRequester()
    return SemsimianService(semsim_server_host=settings.semsim_server_host,
                            semsim_server_port=settings.semsim_server_port,
                            entity_implementation=solr())


@lru_cache(maxsize=1)
Expand Down
10 changes: 10 additions & 0 deletions backend/src/monarch_py/api/semsim.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from fastapi import APIRouter, Path, Query
from typing import List

from monarch_py.api.additional_models import SemsimCompareRequest, SemsimSearchRequest, SemsimSearchGroup
from monarch_py.api.config import semsimian, solr
Expand Down Expand Up @@ -79,6 +80,15 @@ def _post_compare(request: SemsimCompareRequest):
return semsimian().compare(subjects=request.subjects, objects=request.objects)


# NOTE: open design question — is /multicompare/HP:212,HP:443?objects=HP:123,HP:456&objects=HP:789,HP:101112
# the URL shape we want to keep?
@router.get("/multicompare/{subjects}")
def _multicompare(
    subjects: str = Path(..., title="Comma separated list of subjects for comparison"),
    objects: List[str] = Query(..., title="List of comma separated object sets to compare against"),
):
    """Compare one subject termset against each of several object termsets."""
    subject_terms = subjects.split(",")
    object_term_sets = [entry.split(",") for entry in objects]
    return semsimian().multi_compare(subjects=subject_terms, object_sets=object_term_sets)

@router.get("/search/{termset}/{group}")
def _search(
termset: str = Path(..., title="Termset to search"),
Expand Down
90 changes: 90 additions & 0 deletions backend/src/monarch_py/service/semsim_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from typing import List, Any
import requests

from pydantic import BaseModel

from monarch_py.datamodels.model import TermSetPairwiseSimilarity, SemsimSearchResult
from monarch_py.interfaces.entity_interface import EntityInterface


class SemsimianService(BaseModel):
    """A service that makes HTTP requests to the semsimian_server.

    Fields are pydantic-validated and supplied at construction time.
    """

    # Port the semsimian_server listens on.
    semsim_server_port: int
    # Hostname (no scheme) of the semsimian_server.
    semsim_server_host: str
    # Used to resolve entity IDs returned by search().
    entity_implementation: Any  # TODO: should be EntityInterface

    def convert_tsps_data(self, data):
        """Convert to a format that can be coerced into a TermSetPairwiseSimilarity model
        FIXME: currently, the response returned from semsimian_server doesn't
        100% match the TermSetPairwiseSimilarity model, so we perform some
        transformations below. once it does, we can remove all the code below
        and just return TermSetPairwiseSimilarity(**data)
        """
        # remove these similarity maps and fold them into the _best_matches dicts
        object_best_matches_similarity_map = self._convert_nans(data.pop("object_best_matches_similarity_map"))
        subject_best_matches_similarity_map = self._convert_nans(data.pop("subject_best_matches_similarity_map"))
        converted_data = {
            **data,
            **{
                # flatten the nested termset dicts (each termset arrives as a
                # list of single-entry dicts)
                "subject_termset": {k: v for d in data["subject_termset"] for k, v in d.items()},
                "object_termset": {k: v for d in data["object_termset"] for k, v in d.items()},
                "subject_best_matches": {
                    k: {**v, "similarity": subject_best_matches_similarity_map[k]}
                    for k, v in data["subject_best_matches"].items()
                },
                "object_best_matches": {
                    k: {**v, "similarity": object_best_matches_similarity_map[k]}
                    for k, v in data["object_best_matches"].items()
                },
            },
        }
        return converted_data

    def compare(self, subjects: List[str], objects: List[str]) -> TermSetPairwiseSimilarity:
        """Compare a subject termset against an object termset via semsimian_server."""
        host = f"http://{self.semsim_server_host}:{self.semsim_server_port}"
        path = f"compare/{','.join(subjects)}/{','.join(objects)}"
        url = f"{host}/{path}"

        print(f"Fetching {url}...")
        # NOTE(review): no timeout is set — a hung semsimian_server blocks this
        # request (and the API worker) indefinitely; consider requests.get(url, timeout=...).
        response = requests.get(url=url)
        data = response.json()
        results = self.convert_tsps_data(data)
        return TermSetPairwiseSimilarity(**results)

    def multi_compare(self, subjects: List[str], object_sets: List[List[str]]) -> List[TermSetPairwiseSimilarity]:
        """Run compare() of one subject termset against each of several object termsets."""
        compare_results = [self.compare(subjects, object_set) for object_set in object_sets]
        return compare_results

    def search(self, termset: List[str], prefix: str, limit: int) -> List[SemsimSearchResult]:
        """Search semsimian_server for entities in `prefix` similar to `termset`."""
        host = f"http://{self.semsim_server_host}:{self.semsim_server_port}"
        path = f"search/{','.join(termset)}/{prefix}?limit={limit}"
        url = f"{host}/{path}"

        print(f"Fetching {url}...")
        # NOTE(review): no timeout is set here either — see compare().
        response = requests.get(url=url)
        data = response.json()
        # Each hit is [score, tsps-payload, entity id].
        results = [
            SemsimSearchResult(
                subject=self.entity_implementation.get_entity(i[2], extra=False),
                score=i[0],
                similarity=self.convert_tsps_data(i[1])
            )
            for i in data
        ]

        return results

    @staticmethod
    def _convert_nans(input_dict, to_value=None):
        """
        Given an input dict of the form {<term>: {<field>: <value>, ...}},
        convert any <value> of 'NaN' to `to_value` (None by default).

        Mutates `input_dict` in place and also returns it.
        """
        for k, v in input_dict.items():
            for ik, iv in v.items():
                if iv == "NaN":
                    # BUG FIX: previously hardcoded None, silently ignoring `to_value`.
                    input_dict[k][ik] = to_value

        return input_dict
8 changes: 4 additions & 4 deletions backend/tests/api/test_semsim_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def test_autocomplete_params(mock_autocomplete, autocomplete):
)


@patch("monarch_py.api.config.SemsimianHTTPRequester.compare")
@patch("monarch_py.service.semsim_service.SemsimianService.compare")
def test_get_compare(mock_compare):
mock_compare.return_value = MagicMock()

Expand All @@ -34,7 +34,7 @@ def test_get_compare(mock_compare):
mock_compare.assert_called_once_with(subjects=["HP:123", "HP:456"], objects=["HP:789", "HP:101112"])


@patch("monarch_py.api.config.SemsimianHTTPRequester.compare")
@patch("monarch_py.service.semsim_service.SemsimianService.compare")
def test_post_compare(mock_compare):
mock_compare.return_value = MagicMock()

Expand All @@ -47,7 +47,7 @@ def test_post_compare(mock_compare):
mock_compare.assert_called_once_with(subjects=subjects, objects=objects)


@patch("monarch_py.api.config.SemsimianHTTPRequester.search")
@patch("monarch_py.service.semsim_service.SemsimianService.search")
@pytest.mark.parametrize("termset", ["HP:123,HP:456", "HP:123, HP:456", " HP:123, HP:456 "])
def test_get_search(mock_search, termset: str):
mock_search.return_value = MagicMock()
Expand All @@ -61,7 +61,7 @@ def test_get_search(mock_search, termset: str):
mock_search.assert_called_once_with(termset=["HP:123", "HP:456"], prefix=group.name, limit=limit)


@patch("monarch_py.api.config.SemsimianHTTPRequester.search")
@patch("monarch_py.service.semsim_service.SemsimianService.search")
def test_post_search(mock_search):
mock_search.return_value = MagicMock()

Expand Down

0 comments on commit 8fa071b

Please sign in to comment.