diff --git a/src/nv_ingest/modules/sinks/vdb_task_sink.py b/src/nv_ingest/modules/sinks/vdb_task_sink.py index 42bc4bbf..d133a0e0 100644 --- a/src/nv_ingest/modules/sinks/vdb_task_sink.py +++ b/src/nv_ingest/modules/sinks/vdb_task_sink.py @@ -211,6 +211,7 @@ def extract_df(ctrl_msg: ControlMessage, filter_errors: bool): mdf["embedding"] = mdf["metadata"].struct.field("embedding") mdf["_source_metadata"] = mdf["metadata"].struct.field("source_metadata") + mdf["_content_metadata"] = mdf["metadata"].struct.field("content_metadata") df = mdf[mdf["_contains_embeddings"]].copy() df = df[ @@ -218,9 +219,10 @@ def extract_df(ctrl_msg: ControlMessage, filter_errors: bool): "embedding", "_content", "_source_metadata", + "_content_metadata", ] ] - df.columns = ["vector", "text", "source"] + df.columns = ["vector", "text", "source", "content_metadata"] return df, resource_name diff --git a/src/nv_ingest/schemas/vdb_task_sink_schema.py b/src/nv_ingest/schemas/vdb_task_sink_schema.py index c5faadee..74df980e 100644 --- a/src/nv_ingest/schemas/vdb_task_sink_schema.py +++ b/src/nv_ingest/schemas/vdb_task_sink_schema.py @@ -67,6 +67,11 @@ def build_default_milvus_config(embedding_size: int = 1024) -> typing.Dict[str, dtype=pymilvus.DataType.JSON, description="Source document and raw data extracted content", ).to_dict(), + pymilvus.FieldSchema( + name="content_metadata", + dtype=pymilvus.DataType.JSON, + description="Content metadata", + ).to_dict(), ], "description": "NV-INGEST collection schema", },