diff --git a/vector_search/tasks.py b/vector_search/tasks.py
index 7c6c0d00b3..1a0249e343 100644
--- a/vector_search/tasks.py
+++ b/vector_search/tasks.py
@@ -105,7 +105,9 @@ def start_embed_resources(self, indexes, skip_content_files, overwrite):
                     .order_by("id")
                 ):
                     run = (
-                        course.runs.filter(published=True)
+                        course.next_run
+                        if course.next_run
+                        else course.runs.filter(published=True)
                         .order_by("-start_date")
                         .first()
                     )
@@ -193,7 +195,9 @@ def embed_learning_resources_by_id(self, ids, skip_content_files, overwrite):
                     etl_source__in=RESOURCE_FILE_ETL_SOURCES
                 ).order_by("id"):
                     run = (
-                        course.runs.filter(published=True)
+                        course.next_run
+                        if course.next_run
+                        else course.runs.filter(published=True)
                         .order_by("-start_date")
                         .first()
                     )
diff --git a/vector_search/tasks_test.py b/vector_search/tasks_test.py
index 1c76770c5c..83b76e5705 100644
--- a/vector_search/tasks_test.py
+++ b/vector_search/tasks_test.py
@@ -221,3 +221,78 @@ def test_embed_learning_resources_by_id(mocker, mocked_celery):
     assert mock_call.args[1] == "content_file"
     embedded_resource_ids = generate_embeddings_mock.si.mock_calls[0].args[0]
     assert sorted(resource_ids) == sorted(embedded_resource_ids)
+
+
+def test_embedded_content_from_next_run(mocker, mocked_celery):
+    """
+    Content files to embed should come from next course run
+    """
+
+    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])
+
+    course = CourseFactory.create(etl_source=ETLSource.ocw.value)
+
+    other_run = LearningResourceRunFactory.create(
+        learning_resource=course.learning_resource,
+        created_on=datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(days=2),
+    )
+    LearningResourceRunFactory.create(
+        learning_resource=course.learning_resource,
+        created_on=datetime.datetime.now(tz=datetime.UTC),
+    )
+
+    next_run_contentfiles = [
+        cf.id
+        for cf in ContentFileFactory.create_batch(
+            3, run=course.learning_resource.next_run
+        )
+    ]
+    # create contentfiles using the other run
+    ContentFileFactory.create_batch(3, run=other_run)
+
+    generate_embeddings_mock = mocker.patch(
+        "vector_search.tasks.generate_embeddings", autospec=True
+    )
+
+    with pytest.raises(mocked_celery.replace_exception_class):
+        start_embed_resources.delay(
+            ["course"], skip_content_files=False, overwrite=True
+        )
+
+    generate_embeddings_mock.si.assert_called_with(
+        next_run_contentfiles,
+        "content_file",
+        True,  # noqa: FBT003
+    )
+
+
+def test_embedded_content_from_latest_run_if_next_missing(mocker, mocked_celery):
+    """
+    Content files to embed should come from latest run if the next run is missing
+    """
+
+    mocker.patch("vector_search.tasks.load_course_blocklist", return_value=[])
+
+    course = CourseFactory.create(etl_source=ETLSource.ocw.value)
+    course.runs.all().delete()
+    latest_run = LearningResourceRunFactory.create(
+        learning_resource=course.learning_resource,
+        created_on=datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(hours=1),
+    )
+    latest_run_contentfiles = [
+        cf.id for cf in ContentFileFactory.create_batch(3, run=latest_run)
+    ]
+    generate_embeddings_mock = mocker.patch(
+        "vector_search.tasks.generate_embeddings", autospec=True
+    )
+
+    with pytest.raises(mocked_celery.replace_exception_class):
+        start_embed_resources.delay(
+            ["course"], skip_content_files=False, overwrite=True
+        )
+
+    generate_embeddings_mock.si.assert_called_with(
+        latest_run_contentfiles,
+        "content_file",
+        True,  # noqa: FBT003
+    )