Skip to content

Commit

Permalink
Merge pull request #17 from uktrade/feature/orpd-52-sort
Browse files Browse the repository at this point in the history
orpd-52 - feat:add sorting by relevance
  • Loading branch information
hareshkainthdbt authored Oct 14, 2024
2 parents 565675f + 77801a6 commit f98ac82
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 5 deletions.
5 changes: 5 additions & 0 deletions orp/orp_search/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,9 @@ def validate(self):
if self.limit < 0:
logger.error("limit must be a positive integer")
return False

if self.sort_by:
if self.sort_by not in ["recently", "relevance"]:
logger.error("sort_by must be 'recently' or 'relevance'")
return False
return True
37 changes: 33 additions & 4 deletions orp/orp_search/public_gateway.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging

import pandas as pd
import requests # type: ignore

from jinja2 import Template
Expand Down Expand Up @@ -130,17 +131,45 @@ def search(self, config: SearchDocumentConfig):

sorted_df = None

if config.sort_by == "recently_updated":
if config.sort_by == "recently":
# Sort the DataFrame by 'date_modified' in descending order
sorted_df = filtered_df.sort_values(
by="date_modified", ascending=False
# Ensure 'date_issued' is in datetime format
filtered_df["date_issued"] = pd.to_datetime(
filtered_df["date_issued"], format="%d/%m/%Y"
)

elif config.sort_by == "recently_published":
# Sort the DataFrame by 'date_issued' in descending order
sorted_df = filtered_df.sort_values(
by="date_issued", ascending=False
)
elif config.sort_by == "relevance":
# Calculate relevance score
# (based on the number of keywords found)
def calculate_relevance(row, search_terms):
    """Score a row by how many search terms appear in its text fields.

    Matching ignores case and spaces, so the term "trade act" matches
    "Trade Act" as well as "TradeAct". The title and the description are
    scored independently and the two counts are added, so a term found
    in both fields contributes 2.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            "title" and "description".
        search_terms: Iterable of search strings; None is treated as
            "no terms".

    Returns:
        int: Total number of (term, field) matches; 0 when nothing
        matches or there are no usable terms.
    """

    def normalise(value):
        # Strip spaces and lowercase so matching ignores both.
        return value.replace(" ", "").lower()

    # Normalise the terms once rather than once per field. Non-string
    # and blank terms are dropped: a blank term is a substring of every
    # text and would only inflate the score without ranking anything.
    terms = [
        normalise(term)
        for term in (search_terms or [])
        if isinstance(term, str) and normalise(term)
    ]

    def score_text(text):
        # Missing cells (None / NaN) are not strings and match nothing;
        # without this guard a single empty field aborts the whole sort.
        if not isinstance(text, str):
            return 0
        text_processed = normalise(text)
        return sum(1 for term in terms if term in text_processed)

    return score_text(row["title"]) + score_text(row["description"])

filtered_df["relevance_score"] = filtered_df.apply(
calculate_relevance,
axis=1,
search_terms=config.search_terms,
)

# Sort the DataFrame by 'relevance_score' in descending order
sorted_df = filtered_df.sort_values(
by="relevance_score", ascending=False
)

if sorted_df is not None:
results = sorted_df.to_dict(orient="records")
Expand Down
2 changes: 1 addition & 1 deletion orp/orp_search/templates/orp.html
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ <h2 class="govuk-fieldset__heading">
</label>
<select class="govuk-select" id="sort" name="sort">
<option value="date">Recently updated</option>
<option value="sort">???</option>
<option value="sort">Relevance</option>
</select>
</div>
</div>
Expand Down
7 changes: 7 additions & 0 deletions orp/orp_search/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ def search(request: HttpRequest) -> HttpResponse:
else:
logger.info("search query: %s", search_query)

sort_by = request.GET.get("sort", None)
if sort_by:
logger.info("sort by: %s", sort_by)

logger.info("document types: %s", document_types)
logger.info("page: %s", offset)

Expand All @@ -110,6 +114,9 @@ def search(request: HttpRequest) -> HttpResponse:
if publisher:
config.publisher_terms = publisher

if sort_by:
config.sort_by = sort_by

# Check if the response is cached
public_gateway = PublicGateway()
search_results = public_gateway.search(config)
Expand Down

0 comments on commit f98ac82

Please sign in to comment.