Skip to content

Commit

Permalink
implement acronym expansion in search (#171)
Browse files Browse the repository at this point in the history
* implement acronym replacement in search

* rewrite unit tests, adding another example

* add tests for exact match search

* fix failing test

* bump version to 1.13.0

* fix failing test

* set acronym replacement to off by default until it's tested

* address review comments
  • Loading branch information
kdutia authored Jan 14, 2025
1 parent 07c4753 commit e35b9cb
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 7 deletions.
6 changes: 6 additions & 0 deletions src/cpr_sdk/models/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,12 @@ class SearchParameters(BaseModel):
so can also be used to override YQL or ranking profiles.
"""

replace_acronyms: bool = False
"""
Whether to perform acronym replacement based on the 'acronyms' ruleset.
See docs: https://docs.vespa.ai/en/query-rewriting.html#rule-bases
"""

@model_validator(mode="after")
def validate(self):
"""Validate against mutually exclusive fields"""
Expand Down
4 changes: 2 additions & 2 deletions src/cpr_sdk/version.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
_MAJOR = "1"
_MINOR = "12"
_PATCH = "1"
_MINOR = "13"
_PATCH = "0"
_SUFFIX = ""

VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
Expand Down
9 changes: 9 additions & 0 deletions src/cpr_sdk/vespa.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,15 @@ def build_vespa_request_body(parameters: SearchParameters) -> dict[str, str]:

vespa_request_body = vespa_request_body | parameters.custom_vespa_request_body

if parameters.replace_acronyms:
if parameters.exact_match:
_LOGGER.warning(
"Exact match and replace_acronyms are incompatible. Ignoring replace_acronyms."
)
else:
vespa_request_body["rules.off"] = False
vespa_request_body["rules.rulebase"] = "acronyms"

# Disabling embedding search for descriptions
vespa_request_body["input.query(description_closeness_weight)"] = 0

Expand Down
152 changes: 152 additions & 0 deletions tests/local_vespa/test_app/rules/acronyms.sr
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
bipoc +> "black indigenous and people of colour";
black indigenous and people of colour +> bipoc;

bur +> "biennial update report";
biennial update report +> bur;

c2h6 +> "ethane";
ethane +> c2h6;

c3h8 +> "propane";
propane +> c3h8;

c4h10 +> "butane";
butane +> c4h10;

cbam +> "carbon border adjustment mechanism";
carbon border adjustment mechanism +> cbam;

cbews +> "community based early warning system";
community based early warning system +> cbews;

cfc +> "chlorofluorocarbon";
chlorofluorocarbon +> cfc;

cfcs +> "chlorofluorocarbons";
chlorofluorocarbons +> cfcs;

ch4 +> "methane";
methane +> ch4;

co2 +> "carbon dioxide";
carbon dioxide +> co2;

csrd +> "corporate sustainability reporting directive";
corporate sustainability reporting directive +> csrd;

dfi +> "development finance institutions";
development finance institutions +> dfi;

drm +> "disaster risk management";
disaster risk management +> drm;

erf +> "emission reduction fund";
emission reduction fund +> erf;

ets +> "emission trading system";
emission trading system +> ets;

ews +> "early warning systems";
early warning systems +> ews;

fgm +> "female genital mutilation";
female genital mutilation +> fgm;

gga +> "global goal on adaptation";
global goal on adaptation +> gga;

ghg +> "greenhouse gas";
greenhouse gas +> ghg;

glof +> "glacial lake outburst flood";
glacial lake outburst flood +> glof;

glofs +> "glacial lake outburst floods";
glacial lake outburst floods +> glofs;

glp +> "liquefied petroleum gas";
liquefied petroleum gas +> glp;

gst +> "global stocktake";
global stocktake +> gst;

hcfc +> "hydrochlorofluorocarbon";
hydrochlorofluorocarbon +> hcfc;

hcfcs +> "hydrochlorofluorocarbons";
hydrochlorofluorocarbons +> hcfcs;

hfc +> "hydrofluorocarbons";
hydrofluorocarbons +> hfc;

ifrs +> "international financial reporting standards";
international financial reporting standards +> ifrs;

indc +> "initial nationally determined contribution";
initial nationally determined contribution +> indc;

ipcc +> "intergovernmental panel on climate change";
intergovernmental panel on climate change +> ipcc;

lez +> "low emission zone";
low emission zone +> lez;

lng +> "liquefied natural gas";
liquefied natural gas +> lng;

lpg +> "liquefied petroleum gas";
liquefied petroleum gas +> lpg;

mhews +> "multi hazard early warning systems";
multi hazard early warning systems +> mhews;

n2o +> "nitrous oxide";
nitrous oxide +> n2o;

ndc +> "nationally determined contribution";
nationally determined contribution +> ndc;

nf3 +> "nitrogen trifluoride";
nitrogen trifluoride +> nf3;

ngo +> "non governmental organisation";
non governmental organisation +> ngo;

nh3 +> "ammonia";
ammonia +> nh3;

o2 +> "oxygen";
oxygen +> o2;

o3 +> "ozone";
ozone +> o3;

pfc +> "perfluorocarbon";
perfluorocarbon +> pfc;

pfcs +> "perfluorocarbons";
perfluorocarbons +> pfcs;

ril +> "reduced impact logging";
reduced impact logging +> ril;

sdg +> "sustainable development goal";
sustainable development goal +> sdg;

sf6 +> "sulphur hexafluoride";
sulphur hexafluoride +> sf6;

slr +> "sea level rise";
sea level rise +> slr;

ulez +> "ultra low emission zone";
ultra low emission zone +> ulez;

wfp +> "world food programme";
world food programme +> wfp;

zev +> "zero emissions vehicle";
zero emissions vehicle +> zev;

pv +> "photovoltaic";
photovoltaic +> pv;
4 changes: 2 additions & 2 deletions tests/local_vespa/test_documents/family_document.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
"fields": {
"family_source": "CCLW",
"search_weights_ref": "id:doc_search:search_weights::default_weights",
"family_name": "Climate Change Adaptation and Low Emissions Growth Strategy by 2035",
"family_name": "Nationally Determined Contribution: Climate Change Adaptation and Low Emissions Growth Strategy by 2035",
"document_title": null,
"document_content_type": "text/html",
"family_slug": "climate-change-adaptation-and-low-emissions-growth-strategy-by-2035_75e3",
"document_source_url": "https://unfccc.int/sites/default/files/resource/ENG_CC%20adaptation%20and%20Low%20emission%20development%20Strategy%20BiH%202020-2030.pdf",
"family_geography": "BIH",
"family_geographies": ["BIH", "NOR"],
"family_category": "Executive",
"family_name_index": "Climate Change Adaptation and Low Emissions Growth Strategy by 2035",
"family_name_index": "Nationally Determined Contribution: Climate Change Adaptation and Low Emissions Growth Strategy by 2035",
"document_languages": ["English"],
"document_slug": "climate-change-adaptation-and-low-emissions-growth-strategy-by-2035_6c4c",
"family_description_embedding": {
Expand Down
77 changes: 74 additions & 3 deletions tests/test_search_adaptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,16 +172,15 @@ def test_vespa_search_adaptor__bad_query_string_still_works(test_vespa):

@pytest.mark.vespa
def test_vespa_search_adaptor__hybrid(test_vespa):
    """Search for a known family by its full name and expect it in the results.

    The family name is used verbatim as the query string; every other search
    parameter is left at its default. The test passes when the name appears
    as the ``family_name`` of the first hit of at least one returned family.
    """
    family_name = "Nationally Determined Contribution: Climate Change Adaptation and Low Emissions Growth Strategy by 2035"
    request = SearchParameters(query_string=family_name)
    response = vespa_search(test_vespa, request)

    # Was the family searched for in the results.
    # Note that this is a fairly loose test: only the first hit of each
    # family is inspected, and the position of the match is not checked.
    got_family_names = [fam.hits[0].family_name for fam in response.families]
    assert family_name in got_family_names


Expand Down Expand Up @@ -774,3 +773,75 @@ def test_vespa_search_hybrid_no_closeness_profile(test_vespa):
)

assert response_no_closeness == response_null_closeness_weights


@pytest.mark.vespa
def test_acronym_replacement(test_vespa):
    """Compare acronym queries with ``replace_acronyms`` switched on and off.

    Two acronyms are exercised:

    * ``ndc`` -- with replacement the top hit's family name must contain
      "Nationally Determined Contribution"; without it, it must not.
    * ``ch4`` -- with replacement the top hit must be a ``Passage`` whose
      text mentions "methane"; without it, the top hit must not be such a
      passage.
    """

    def _search(query: str, expand: bool):
        # One search against the test instance, toggling only the acronym flag.
        return vespa_search(
            test_vespa,
            SearchParameters(query_string=query, replace_acronyms=expand),
        )

    def _top_hit(response):
        # First hit of the first family, i.e. the top result.
        return response.families[0].hits[0]

    expansion = "Nationally Determined Contribution"
    ndc_expanded = _search("ndc", True)
    ndc_plain = _search("ndc", False)

    assert expansion in str(_top_hit(ndc_expanded).family_name)
    assert expansion not in str(_top_hit(ndc_plain).family_name)

    ch4_expanded = _search("ch4", True)
    ch4_plain = _search("ch4", False)

    expanded_hit = _top_hit(ch4_expanded)
    assert isinstance(expanded_hit, Passage)
    assert "methane" in expanded_hit.text_block.lower()

    # Without replacement the top hit may be anything except a passage that
    # mentions methane; the isinstance check guards the text_block access.
    plain_hit = _top_hit(ch4_plain)
    assert not (
        isinstance(plain_hit, Passage) and "methane" in plain_hit.text_block.lower()
    )


@pytest.mark.vespa
def test_acronym_replacement_exact_match_search(test_vespa, caplog):
    """Exact match searches must skip acronym replacement and log a warning."""
    params = SearchParameters(
        query_string="ndc",
        exact_match=True,
        replace_acronyms=True,
    )
    response = vespa_search(test_vespa, params)

    # Requesting both flags together should be reported in the captured logs.
    assert "Exact match and replace_acronyms are incompatible." in caplog.text

    # The test data contains no exact matches for "ndc", so any returned
    # family would mean the acronym was expanded despite exact_match.
    assert len(response.families) == 0

0 comments on commit e35b9cb

Please sign in to comment.