Skip to content

Commit

Permalink
Make embedding generation task use correct run (#2074)
Browse files Browse the repository at this point in the history
* switch to using next_run

* adding test

* adding fallback for missing next runs

* adding test

* checking published

* fixing test flakiness
  • Loading branch information
shanbady authored Feb 27, 2025
1 parent 01fc6b6 commit 1001aee
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 2 deletions.
8 changes: 6 additions & 2 deletions vector_search/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ def start_embed_resources(self, indexes, skip_content_files, overwrite):
.order_by("id")
):
run = (
course.runs.filter(published=True)
course.next_run
if course.next_run
else course.runs.filter(published=True)
.order_by("-start_date")
.first()
)
Expand Down Expand Up @@ -193,7 +195,9 @@ def embed_learning_resources_by_id(self, ids, skip_content_files, overwrite):
etl_source__in=RESOURCE_FILE_ETL_SOURCES
).order_by("id"):
run = (
course.runs.filter(published=True)
course.next_run
if course.next_run
else course.runs.filter(published=True)
.order_by("-start_date")
.first()
)
Expand Down
75 changes: 75 additions & 0 deletions vector_search/tasks_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,78 @@ def test_embed_learning_resources_by_id(mocker, mocked_celery):
assert mock_call.args[1] == "content_file"
embedded_resource_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
assert sorted(resource_ids) == sorted(embedded_resource_ids)


def test_embedded_content_from_next_run(mocker, mocked_celery):
    """
    Only the content files attached to a course's next run should be embedded
    """
    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])

    course = CourseFactory.create(etl_source=ETLSource.ocw.value)

    # A run created two days ago; its content files must not be embedded
    two_days_ago = datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(days=2)
    older_run = LearningResourceRunFactory.create(
        learning_resource=course.learning_resource,
        created_on=two_days_ago,
    )
    # A second run for the same resource, created just now
    LearningResourceRunFactory.create(
        learning_resource=course.learning_resource,
        created_on=datetime.datetime.now(tz=datetime.UTC),
    )

    # Content files on the resource's next run are the ones we expect to see
    expected_files = ContentFileFactory.create_batch(
        3, run=course.learning_resource.next_run
    )
    expected_ids = [content_file.id for content_file in expected_files]
    # Content files on the older run act as decoys
    ContentFileFactory.create_batch(3, run=older_run)

    embeddings_mock = mocker.patch(
        "vector_search.tasks.generate_embeddings", autospec=True
    )

    # The mocked celery fixture raises when the task replaces itself
    with pytest.raises(mocked_celery.replace_exception_class):
        start_embed_resources.delay(
            ["course"], skip_content_files=False, overwrite=True
        )

    embeddings_mock.si.assert_called_with(
        expected_ids,
        "content_file",
        True,  # noqa: FBT003
    )


def test_embedded_content_from_latest_run_if_next_missing(mocker, mocked_celery):
    """
    When a course has no next run, the content files of its latest run
    should be the ones embedded
    """
    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])

    course = CourseFactory.create(etl_source=ETLSource.ocw.value)
    # Drop every existing run so the course is left with only the run added below
    course.runs.all().delete()

    an_hour_ago = datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(hours=1)
    only_run = LearningResourceRunFactory.create(
        learning_resource=course.learning_resource,
        created_on=an_hour_ago,
    )

    expected_files = ContentFileFactory.create_batch(3, run=only_run)
    expected_ids = [content_file.id for content_file in expected_files]

    embeddings_mock = mocker.patch(
        "vector_search.tasks.generate_embeddings", autospec=True
    )

    # The mocked celery fixture raises when the task replaces itself
    with pytest.raises(mocked_celery.replace_exception_class):
        start_embed_resources.delay(
            ["course"], skip_content_files=False, overwrite=True
        )

    embeddings_mock.si.assert_called_with(
        expected_ids,
        "content_file",
        True,  # noqa: FBT003
    )

0 comments on commit 1001aee

Please sign in to comment.