Add all Contentfile metadata to chunk responses (#2075)
* serialize contentfiles like we do with learning resources

* fixing contentfile serialization

* optimize loop and data fetch

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* fixing n+1 queries

* adding block id to embedded metadata

* adding block id as filter parameter

* regenerate spec

* fixing test

* some consolidation

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
shanbady and gemini-code-assist[bot] authored Feb 27, 2025
1 parent 1001aee commit aa02630
Showing 6 changed files with 78 additions and 19 deletions.
18 changes: 18 additions & 0 deletions frontends/api/src/generated/v0/api.ts
@@ -10916,6 +10916,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
* @param {string} [collection_name] Manually specify the name of the Qdrant collection to query
* @param {Array<string>} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/
* @param {Array<string>} [course_number] Course number of the content file
* @param {Array<string>} [edx_block_id] The edx_block_id of the content file
* @param {Array<string>} [file_extension] The extension of the content file.
* @param {Array<string>} [key] The filename of the content file
* @param {number} [limit] Number of results to return per page
@@ -10933,6 +10934,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
collection_name?: string,
content_feature_type?: Array<string>,
course_number?: Array<string>,
edx_block_id?: Array<string>,
file_extension?: Array<string>,
key?: Array<string>,
limit?: number,
@@ -10973,6 +10975,10 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
localVarQueryParameter["course_number"] = course_number
}

if (edx_block_id) {
localVarQueryParameter["edx_block_id"] = edx_block_id
}

if (file_extension) {
localVarQueryParameter["file_extension"] = file_extension
}
@@ -11046,6 +11052,7 @@ export const VectorContentFilesSearchApiFp = function (
* @param {string} [collection_name] Manually specify the name of the Qdrant collection to query
* @param {Array<string>} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/
* @param {Array<string>} [course_number] Course number of the content file
* @param {Array<string>} [edx_block_id] The edx_block_id of the content file
* @param {Array<string>} [file_extension] The extension of the content file.
* @param {Array<string>} [key] The filename of the content file
* @param {number} [limit] Number of results to return per page
@@ -11063,6 +11070,7 @@ export const VectorContentFilesSearchApiFp = function (
collection_name?: string,
content_feature_type?: Array<string>,
course_number?: Array<string>,
edx_block_id?: Array<string>,
file_extension?: Array<string>,
key?: Array<string>,
limit?: number,
@@ -11085,6 +11093,7 @@ export const VectorContentFilesSearchApiFp = function (
collection_name,
content_feature_type,
course_number,
edx_block_id,
file_extension,
key,
limit,
@@ -11140,6 +11149,7 @@ export const VectorContentFilesSearchApiFactory = function (
requestParameters.collection_name,
requestParameters.content_feature_type,
requestParameters.course_number,
requestParameters.edx_block_id,
requestParameters.file_extension,
requestParameters.key,
requestParameters.limit,
@@ -11184,6 +11194,13 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ
*/
readonly course_number?: Array<string>

/**
* The edx_block_id of the content file
* @type {Array<string>}
* @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve
*/
readonly edx_block_id?: Array<string>

/**
* The extension of the content file.
* @type {Array<string>}
@@ -11279,6 +11296,7 @@ export class VectorContentFilesSearchApi extends BaseAPI {
requestParameters.collection_name,
requestParameters.content_feature_type,
requestParameters.course_number,
requestParameters.edx_block_id,
requestParameters.file_extension,
requestParameters.key,
requestParameters.limit,
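In the generated client above, `edx_block_id` is threaded through the param creator, the `Fp` layer, the factory, and the `BaseAPI` class as one more optional array argument, and it typically ends up on the wire as a repeated query key. A quick sketch of that wire format (plain `urllib`, not the generated axios client; the block ids are made up):

```python
# Illustrative only: how an Array<string> parameter such as edx_block_id
# is commonly encoded as a repeated query key (block ids are made up).
from urllib.parse import urlencode

query = urlencode(
    {
        "edx_block_id": [
            "block-v1:MITx+Example+type@problem+block@aaa",
            "block-v1:MITx+Example+type@problem+block@bbb",
        ],
        "limit": 5,
    },
    doseq=True,
)
print(query)  # edx_block_id=...&edx_block_id=...&limit=5
```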
10 changes: 9 additions & 1 deletion learning_resources/models.py
@@ -838,13 +838,21 @@ def for_serialization(self):
return self.select_related("run").prefetch_related(
"content_tags",
"run__learning_resource",
"run__learning_resource__course",
"run__learning_resource__platform",
Prefetch(
"run__learning_resource__topics",
queryset=LearningResourceTopic.objects.for_serialization(),
),
Prefetch(
"run__learning_resource__offered_by",
queryset=LearningResourceOfferor.objects.for_serialization(),
),
Prefetch(
"run__learning_resource__departments",
queryset=LearningResourceDepartment.objects.for_serialization(),
queryset=LearningResourceDepartment.objects.for_serialization(
prefetch_school=True
).select_related("school"),
),
)

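The added `select_related` and `Prefetch` entries back the "fixing n+1 queries" bullet in the commit message: serializing a batch of content files can now walk from each file to its run, learning resource, platform, topics, offerors, departments, and school without issuing a query per row. A minimal sketch of the intended call pattern, assuming a configured Django environment for this repo (the run id is a placeholder):

```python
# Minimal sketch, assuming a configured Django environment; the run id is a placeholder.
from learning_resources.models import ContentFile
from learning_resources.serializers import ContentFileSerializer

queryset = ContentFile.objects.for_serialization().filter(
    run__run_id__in=["course-v1:MITx+Example+2024"]
)
# Related rows (run, learning resource, platform, topics, offerors,
# departments and their schools) are prefetched above, so serializing
# below does not fan out into one query per content file.
data = ContentFileSerializer(queryset, many=True).data
```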
8 changes: 8 additions & 0 deletions openapi/specs/v0.yaml
@@ -854,6 +854,14 @@ paths:
type: string
minLength: 1
description: Course number of the content file
- in: query
name: edx_block_id
schema:
type: array
items:
type: string
minLength: 1
description: The edx_block_id of the content file
- in: query
name: file_extension
schema:
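With the spec regenerated, `edx_block_id` is a repeatable query parameter on the content file vector search endpoint. A hedged example of calling it over HTTP; the host, path, free-text parameter name (`q`), response envelope, and block id are assumptions for illustration, not taken from this diff:

```python
# Hypothetical request; host, path, the "q" parameter name, the "results"
# envelope, and the block id are assumptions for illustration only.
import requests

resp = requests.get(
    "https://example.mit.edu/api/v0/vector_content_files_search/",
    params={
        "q": "binary search trees",
        "edx_block_id": ["block-v1:MITx+Example+type@problem+block@abc123"],
        "limit": 5,
    },
    timeout=30,
)
resp.raise_for_status()
for hit in resp.json().get("results", []):
    # edx_block_id, key, and url are part of the chunk payload per this commit
    print(hit.get("edx_block_id"), hit.get("key"), hit.get("url"))
```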
6 changes: 6 additions & 0 deletions vector_search/constants.py
@@ -14,6 +14,11 @@
"run_readable_id": "run_readable_id",
"resource_readable_id": "resource_readable_id",
"run_title": "run_title",
"edx_block_id": "edx_block_id",
"content_type": "content_type",
"description": "description",
"url": "url",
"file_type": "file_type",
}

QDRANT_RESOURCE_PARAM_MAP = {
@@ -66,4 +71,5 @@
"run_readable_id": models.PayloadSchemaType.INTEGER,
"resource_readable_id": models.PayloadSchemaType.KEYWORD,
"run_title": models.PayloadSchemaType.KEYWORD,
"edx_block_id": models.PayloadSchemaType.KEYWORD,
}
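Every key in `QDRANT_CONTENT_FILE_PARAM_MAP` is now both embedded in each chunk's payload and available as a filter, and `edx_block_id` gets a KEYWORD payload index so exact-match filtering stays cheap. A sketch of the kind of Qdrant filter this enables, using `qdrant_client` directly (assumes a recent qdrant-client; the URL, collection name, embedding vector, and block id are placeholders):

```python
# Sketch only: URL, collection name, embedding, and block id are placeholders.
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")
block_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="edx_block_id",
            match=models.MatchAny(
                any=["block-v1:MITx+Example+type@problem+block@abc123"]
            ),
        )
    ]
)
hits = client.query_points(
    collection_name="content_files",
    query=[0.0] * 768,  # placeholder embedding of the search phrase
    query_filter=block_filter,
    limit=5,
).points
```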
5 changes: 5 additions & 0 deletions vector_search/serializers.py
@@ -229,6 +229,11 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer):
"The readable_id value of the parent learning resource for the content file"
),
)
edx_block_id = serializers.ListField(
required=False,
child=serializers.CharField(),
help_text="The edx_block_id of the content file",
)
collection_name = serializers.CharField(
required=False,
help_text=("Manually specify the name of the Qdrant collection to query"),
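The new `ListField` mirrors the existing filter parameters, so `edx_block_id` validates as an optional list of strings. A standalone DRF sketch of how such a field behaves (not the project's serializer; the block id is made up):

```python
# Standalone DRF sketch; not the project's serializer, block id is made up.
from rest_framework import serializers


class BlockIdFilterSerializer(serializers.Serializer):
    edx_block_id = serializers.ListField(
        required=False, child=serializers.CharField()
    )


params = BlockIdFilterSerializer(
    data={"edx_block_id": ["block-v1:MITx+Example+type@problem+block@abc123"]}
)
params.is_valid(raise_exception=True)
print(params.validated_data["edx_block_id"])
```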
50 changes: 32 additions & 18 deletions vector_search/utils.py
@@ -6,8 +6,11 @@
from langchain_experimental.text_splitter import SemanticChunker
from qdrant_client import QdrantClient, models

from learning_resources.models import LearningResource
from learning_resources.serializers import LearningResourceSerializer
from learning_resources.models import ContentFile, LearningResource
from learning_resources.serializers import (
ContentFileSerializer,
LearningResourceSerializer,
)
from learning_resources_search.constants import CONTENT_FILE_TYPE
from learning_resources_search.serializers import (
serialize_bulk_content_files,
@@ -235,21 +238,8 @@ def _process_content_embeddings(serialized_content):
"chunk_content": d.page_content,
**{
key: d.metadata[key]
for key in [
"run_title",
"platform",
"offered_by",
"run_readable_id",
"resource_readable_id",
"content_type",
"file_extension",
"content_feature_type",
"course_number",
"file_type",
"description",
"key",
"url",
]
for key in QDRANT_CONTENT_FILE_PARAM_MAP
if key in d.metadata
},
}
for chunk_id, d in enumerate(split_docs)
@@ -368,7 +358,31 @@ def _resource_vector_hits(search_result):


def _content_file_vector_hits(search_result):
return [hit.payload for hit in search_result]
run_readable_ids = [hit.payload["run_readable_id"] for hit in search_result]
keys = [hit.payload["key"] for hit in search_result]

serialized_content_files = ContentFileSerializer(
ContentFile.objects.for_serialization().filter(
run__run_id__in=run_readable_ids, key__in=keys
),
many=True,
).data
contentfiles_dict = {
(cf["run_readable_id"], cf["key"]): cf for cf in serialized_content_files
}
results = []
for hit in search_result:
payload = hit.payload
serialized = contentfiles_dict.get((payload["run_readable_id"], payload["key"]))
if serialized:
if "content" in serialized:
serialized.pop("content")
payload.update(serialized)
results.append(payload)
return results


def vector_search(
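Instead of returning raw Qdrant payloads, `_content_file_vector_hits` now joins each payload to its fully serialized content file on `(run_readable_id, key)` and drops the bulky `content` field before returning. A toy illustration of that merge, using made-up dictionaries rather than real search hits:

```python
# Toy data standing in for a Qdrant chunk payload and a serialized ContentFile.
payload = {"run_readable_id": "run-1", "key": "lecture1.pdf", "chunk_content": "..."}
serialized = {
    "run_readable_id": "run-1",
    "key": "lecture1.pdf",
    "content": "full extracted text",
    "title": "Lecture 1",
}

lookup = {(serialized["run_readable_id"], serialized["key"]): serialized}
match = dict(lookup[(payload["run_readable_id"], payload["key"])])
match.pop("content", None)  # the full extracted text is not returned per chunk
payload.update(match)
print(sorted(payload))  # ['chunk_content', 'key', 'run_readable_id', 'title']
```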
