Add all Contentfile metadata to chunk responses (#2075)
* serialize contentfiles like we do with learning resources

* fixing contentfile serialization

* optimize loop and data fetch

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* fixing n+1 queries

* adding block id to embedded metadata

* adding block id as filter parameter

* regenerate spec

* fixing test

* some consolidation

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
shanbady and gemini-code-assist[bot] authored Feb 27, 2025
1 parent 1001aee commit aa02630
Showing 6 changed files with 78 additions and 19 deletions.
18 changes: 18 additions & 0 deletions frontends/api/src/generated/v0/api.ts
@@ -10916,6 +10916,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
* @param {string} [collection_name] Manually specify the name of the Qdrant collection to query
* @param {Array<string>} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/
* @param {Array<string>} [course_number] Course number of the content file
* @param {Array<string>} [edx_block_id] The edx_block_id of the content file
* @param {Array<string>} [file_extension] The extension of the content file.
* @param {Array<string>} [key] The filename of the content file
* @param {number} [limit] Number of results to return per page
@@ -10933,6 +10934,7 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
collection_name?: string,
content_feature_type?: Array<string>,
course_number?: Array<string>,
edx_block_id?: Array<string>,
file_extension?: Array<string>,
key?: Array<string>,
limit?: number,
@@ -10973,6 +10975,10 @@ export const VectorContentFilesSearchApiAxiosParamCreator = function (
localVarQueryParameter["course_number"] = course_number
}

if (edx_block_id) {
localVarQueryParameter["edx_block_id"] = edx_block_id
}

if (file_extension) {
localVarQueryParameter["file_extension"] = file_extension
}
@@ -11046,6 +11052,7 @@ export const VectorContentFilesSearchApiFp = function (
* @param {string} [collection_name] Manually specify the name of the Qdrant collection to query
* @param {Array<string>} [content_feature_type] The feature type of the content file. Possible options are at api/v1/course_features/
* @param {Array<string>} [course_number] Course number of the content file
* @param {Array<string>} [edx_block_id] The edx_block_id of the content file
* @param {Array<string>} [file_extension] The extension of the content file.
* @param {Array<string>} [key] The filename of the content file
* @param {number} [limit] Number of results to return per page
@@ -11063,6 +11070,7 @@ export const VectorContentFilesSearchApiFp = function (
collection_name?: string,
content_feature_type?: Array<string>,
course_number?: Array<string>,
edx_block_id?: Array<string>,
file_extension?: Array<string>,
key?: Array<string>,
limit?: number,
@@ -11085,6 +11093,7 @@ export const VectorContentFilesSearchApiFp = function (
collection_name,
content_feature_type,
course_number,
edx_block_id,
file_extension,
key,
limit,
@@ -11140,6 +11149,7 @@ export const VectorContentFilesSearchApiFactory = function (
requestParameters.collection_name,
requestParameters.content_feature_type,
requestParameters.course_number,
requestParameters.edx_block_id,
requestParameters.file_extension,
requestParameters.key,
requestParameters.limit,
@@ -11184,6 +11194,13 @@ export interface VectorContentFilesSearchApiVectorContentFilesSearchRetrieveRequ
*/
readonly course_number?: Array<string>

/**
* The edx_block_id of the content file
* @type {Array<string>}
* @memberof VectorContentFilesSearchApiVectorContentFilesSearchRetrieve
*/
readonly edx_block_id?: Array<string>

/**
* The extension of the content file.
* @type {Array<string>}
@@ -11279,6 +11296,7 @@ export class VectorContentFilesSearchApi extends BaseAPI {
requestParameters.collection_name,
requestParameters.content_feature_type,
requestParameters.course_number,
requestParameters.edx_block_id,
requestParameters.file_extension,
requestParameters.key,
requestParameters.limit,
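In the generated client above, `edx_block_id` is threaded through the param creator, the `Fp` layer, the factory, and the `BaseAPI` class as one more optional array argument, and it typically ends up on the wire as a repeated query key. A quick sketch of that wire format (plain `urllib`, not the generated axios client; the block ids are made up):

```python
# Illustrative only: how an Array<string> parameter such as edx_block_id
# is commonly encoded as a repeated query key (block ids are made up).
from urllib.parse import urlencode

query = urlencode(
    {
        "edx_block_id": [
            "block-v1:MITx+Example+type@problem+block@aaa",
            "block-v1:MITx+Example+type@problem+block@bbb",
        ],
        "limit": 5,
    },
    doseq=True,
)
print(query)  # edx_block_id=...&edx_block_id=...&limit=5
```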
10 changes: 9 additions & 1 deletion learning_resources/models.py
@@ -838,13 +838,21 @@ def for_serialization(self):
return self.select_related("run").prefetch_related(
"content_tags",
"run__learning_resource",
"run__learning_resource__course",
"run__learning_resource__platform",
Prefetch(
"run__learning_resource__topics",
queryset=LearningResourceTopic.objects.for_serialization(),
),
Prefetch(
"run__learning_resource__offered_by",
queryset=LearningResourceOfferor.objects.for_serialization(),
),
Prefetch(
"run__learning_resource__departments",
queryset=LearningResourceDepartment.objects.for_serialization(),
queryset=LearningResourceDepartment.objects.for_serialization(
prefetch_school=True
).select_related("school"),
),
)

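The added `select_related` and `Prefetch` entries back the "fixing n+1 queries" bullet in the commit message: serializing a batch of content files can now walk from each file to its run, learning resource, platform, topics, offerors, departments, and school without issuing a query per row. A minimal sketch of the intended call pattern, assuming a configured Django environment for this repo (the run id is a placeholder):

```python
# Minimal sketch, assuming a configured Django environment; the run id is a placeholder.
from learning_resources.models import ContentFile
from learning_resources.serializers import ContentFileSerializer

queryset = ContentFile.objects.for_serialization().filter(
    run__run_id__in=["course-v1:MITx+Example+2024"]
)
# Related rows (run, learning resource, platform, topics, offerors,
# departments and their schools) are prefetched above, so serializing
# below does not fan out into one query per content file.
data = ContentFileSerializer(queryset, many=True).data
```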
8 changes: 8 additions & 0 deletions openapi/specs/v0.yaml
@@ -854,6 +854,14 @@ paths:
type: string
minLength: 1
description: Course number of the content file
- in: query
name: edx_block_id
schema:
type: array
items:
type: string
minLength: 1
description: The edx_block_id of the content file
- in: query
name: file_extension
schema:
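With the spec regenerated, `edx_block_id` is a repeatable query parameter on the content file vector search endpoint. A hedged example of calling it over HTTP; the host, path, free-text parameter name (`q`), response envelope, and block id are assumptions for illustration, not taken from this diff:

```python
# Hypothetical request; host, path, the "q" parameter name, the "results"
# envelope, and the block id are assumptions for illustration only.
import requests

resp = requests.get(
    "https://example.mit.edu/api/v0/vector_content_files_search/",
    params={
        "q": "binary search trees",
        "edx_block_id": ["block-v1:MITx+Example+type@problem+block@abc123"],
        "limit": 5,
    },
    timeout=30,
)
resp.raise_for_status()
for hit in resp.json().get("results", []):
    # edx_block_id, key, and url are part of the chunk payload per this commit
    print(hit.get("edx_block_id"), hit.get("key"), hit.get("url"))
```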
6 changes: 6 additions & 0 deletions vector_search/constants.py
@@ -14,6 +14,11 @@
"run_readable_id": "run_readable_id",
"resource_readable_id": "resource_readable_id",
"run_title": "run_title",
"edx_block_id": "edx_block_id",
"content_type": "content_type",
"description": "description",
"url": "url",
"file_type": "file_type",
}

QDRANT_RESOURCE_PARAM_MAP = {
@@ -66,4 +71,5 @@
"run_readable_id": models.PayloadSchemaType.INTEGER,
"resource_readable_id": models.PayloadSchemaType.KEYWORD,
"run_title": models.PayloadSchemaType.KEYWORD,
"edx_block_id": models.PayloadSchemaType.KEYWORD,
}
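Every key in `QDRANT_CONTENT_FILE_PARAM_MAP` is now both embedded in each chunk's payload and available as a filter, and `edx_block_id` gets a KEYWORD payload index so exact-match filtering stays cheap. A sketch of the kind of Qdrant filter this enables, using `qdrant_client` directly (assumes a recent qdrant-client; the URL, collection name, embedding vector, and block id are placeholders):

```python
# Sketch only: URL, collection name, embedding, and block id are placeholders.
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")
block_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="edx_block_id",
            match=models.MatchAny(
                any=["block-v1:MITx+Example+type@problem+block@abc123"]
            ),
        )
    ]
)
hits = client.query_points(
    collection_name="content_files",
    query=[0.0] * 768,  # placeholder embedding of the search phrase
    query_filter=block_filter,
    limit=5,
).points
```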
5 changes: 5 additions & 0 deletions vector_search/serializers.py
@@ -229,6 +229,11 @@ class ContentFileVectorSearchRequestSerializer(serializers.Serializer):
"The readable_id value of the parent learning resource for the content file"
),
)
edx_block_id = serializers.ListField(
required=False,
child=serializers.CharField(),
help_text="The edx_block_id of the content file",
)
collection_name = serializers.CharField(
required=False,
help_text=("Manually specify the name of the Qdrant collection to query"),
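The new `ListField` mirrors the existing filter parameters, so `edx_block_id` validates as an optional list of strings. A standalone DRF sketch of how such a field behaves (not the project's serializer; the block id is made up):

```python
# Standalone DRF sketch; not the project's serializer, block id is made up.
from rest_framework import serializers


class BlockIdFilterSerializer(serializers.Serializer):
    edx_block_id = serializers.ListField(
        required=False, child=serializers.CharField()
    )


params = BlockIdFilterSerializer(
    data={"edx_block_id": ["block-v1:MITx+Example+type@problem+block@abc123"]}
)
params.is_valid(raise_exception=True)
print(params.validated_data["edx_block_id"])
```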
50 changes: 32 additions & 18 deletions vector_search/utils.py
@@ -6,8 +6,11 @@
from langchain_experimental.text_splitter import SemanticChunker
from qdrant_client import QdrantClient, models

from learning_resources.models import LearningResource
from learning_resources.serializers import LearningResourceSerializer
from learning_resources.models import ContentFile, LearningResource
from learning_resources.serializers import (
ContentFileSerializer,
LearningResourceSerializer,
)
from learning_resources_search.constants import CONTENT_FILE_TYPE
from learning_resources_search.serializers import (
serialize_bulk_content_files,
@@ -235,21 +238,8 @@ def _process_content_embeddings(serialized_content):
"chunk_content": d.page_content,
**{
key: d.metadata[key]
for key in [
"run_title",
"platform",
"offered_by",
"run_readable_id",
"resource_readable_id",
"content_type",
"file_extension",
"content_feature_type",
"course_number",
"file_type",
"description",
"key",
"url",
]
for key in QDRANT_CONTENT_FILE_PARAM_MAP
if key in d.metadata
},
}
for chunk_id, d in enumerate(split_docs)
@@ -368,7 +358,31 @@ def _resource_vector_hits(search_result):


def _content_file_vector_hits(search_result):
return [hit.payload for hit in search_result]
run_readable_ids = [hit.payload["run_readable_id"] for hit in search_result]
keys = [hit.payload["key"] for hit in search_result]

serialized_content_files = ContentFileSerializer(
ContentFile.objects.for_serialization().filter(
run__run_id__in=run_readable_ids, key__in=keys
),
many=True,
).data
contentfiles_dict = {
(cf["run_readable_id"], cf["key"]): cf for cf in serialized_content_files
}
results = []
for hit in search_result:
payload = hit.payload
serialized = contentfiles_dict.get((payload["run_readable_id"], payload["key"]))
if serialized:
if "content" in serialized:
serialized.pop("content")
payload.update(serialized)
results.append(payload)
return results


def vector_search(
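Instead of returning raw Qdrant payloads, `_content_file_vector_hits` now joins each payload to its fully serialized content file on `(run_readable_id, key)` and drops the bulky `content` field before returning. A toy illustration of that merge, using made-up dictionaries rather than real search hits:

```python
# Toy data standing in for a Qdrant chunk payload and a serialized ContentFile.
payload = {"run_readable_id": "run-1", "key": "lecture1.pdf", "chunk_content": "..."}
serialized = {
    "run_readable_id": "run-1",
    "key": "lecture1.pdf",
    "content": "full extracted text",
    "title": "Lecture 1",
}

lookup = {(serialized["run_readable_id"], serialized["key"]): serialized}
match = dict(lookup[(payload["run_readable_id"], payload["key"])])
match.pop("content", None)  # the full extracted text is not returned per chunk
payload.update(match)
print(sorted(payload))  # ['chunk_content', 'key', 'run_readable_id', 'title']
```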
