diff --git a/docs/contributing/migration.md b/docs/contributing/migration.md index a2f3badbe..a492aef6f 100644 --- a/docs/contributing/migration.md +++ b/docs/contributing/migration.md @@ -8,13 +8,13 @@ schema. If upgrading DB, You must do this step!! 1. Make desired changes to SQLAlchemy orm models in `src/core/trulens/core/database/orm.py`. -1. Get a database with the new changes: - 1. `rm default.sqlite` - 1. Run `TruSession()` to create a fresh database that uses the new ORM. 1. Run automatic alembic revision script generator. This will generate a new python script in `src/core/trulens/core/database/migrations`. 1. `cd src/core/trulens/core/database/migrations` 1. `SQLALCHEMY_URL="sqlite:///../../../../../../default.sqlite" alembic revision --autogenerate -m "" --rev-id ""` 1. Check over the automatically generated script in `src/core/trulens/core/database/migration/versions` to make sure it looks correct. +1. Get a database with the new changes: + 1. `rm default.sqlite` + 1. Run `TruSession()` to create a fresh database that uses the new ORM. 1. Add the version to `src/core/trulens/core/database/migrations/data.py` in the variable `sql_alchemy_migration_versions` 1. Make any `sqlalchemy_upgrade_paths` updates in `src/core/trulens/core/database/migrations/data.py` if a backfill is necessary. @@ -30,32 +30,32 @@ Note: Some of these instructions may be outdated and are in progress if being up github; which will invalidate it upon commit) 1. cd `tests/docs_notebooks/notebooks_to_test` 1. remove any local dbs - * `rm -rf default.sqlite` + - `rm -rf default.sqlite` 1. run below notebooks (Making sure you also run with the most recent code in trulens) TODO: Move these to a script - * all_tools.ipynb # `cp ../../../generated_files/all_tools.ipynb ./` - * llama_index_quickstart.ipynb # `cp - ../../../examples/quickstart/llama_index_quickstart.ipynb ./` - * langchain-retrieval-augmentation-with-trulens.ipynb # `cp - ../../../examples/vector-dbs/pinecone/langchain-retrieval-augmentation-with-trulens.ipynb - ./` - * Add any other notebooks you think may have possible breaking changes + - all_tools.ipynb # `cp ../../../generated_files/all_tools.ipynb ./` + - llama_index_quickstart.ipynb # `cp +../../../examples/quickstart/llama_index_quickstart.ipynb ./` + - langchain-retrieval-augmentation-with-trulens.ipynb # `cp +../../../examples/vector-dbs/pinecone/langchain-retrieval-augmentation-with-trulens.ipynb +./` + - Add any other notebooks you think may have possible breaking changes 1. replace the last compatible db with this new db file - * Use the version you chose for --rev-id - * `mkdir release_dbs/sql_alchemy_/` - * `cp default.sqlite - release_dbs/sql_alchemy_/` + - Use the version you chose for --rev-id + - `mkdir release_dbs/sql_alchemy_/` + - `cp default.sqlite +release_dbs/sql_alchemy_/` 1. `git add release_dbs` ## Testing the DB Run the tests with the requisite env vars. - ```bash - HUGGINGFACE_API_KEY="" \ - OPENAI_API_KEY="" \ - PINECONE_API_KEY="" \ - PINECONE_ENV="" \ - HUGGINGFACEHUB_API_TOKEN="" \ - python -m pytest tests/docs_notebooks -k backwards_compat - ``` +```bash +HUGGINGFACE_API_KEY="" \ +OPENAI_API_KEY="" \ +PINECONE_API_KEY="" \ +PINECONE_ENV="" \ +HUGGINGFACEHUB_API_TOKEN="" \ +python -m pytest tests/docs_notebooks -k backwards_compat +``` diff --git a/examples/experimental/otel_exporter.ipynb b/examples/experimental/otel_exporter.ipynb new file mode 100644 index 000000000..0eddcb41c --- /dev/null +++ b/examples/experimental/otel_exporter.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install opentelemetry-api\n", + "# !pip install opentelemetry-sdk" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import sys\n", + "\n", + "# Add base dir to path to be able to access test folder.\n", + "base_dir = Path().cwd().parent.parent.resolve()\n", + "if str(base_dir) not in sys.path:\n", + " print(f\"Adding {base_dir} to sys.path\")\n", + " sys.path.append(str(base_dir))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from opentelemetry import trace\n", + "from opentelemetry.sdk.resources import Resource\n", + "from opentelemetry.sdk.trace import TracerProvider\n", + "from opentelemetry.sdk.trace.export import BatchSpanProcessor\n", + "from opentelemetry.sdk.trace.export import ConsoleSpanExporter\n", + "\n", + "SERVICE_NAME = \"trulens\"\n", + "\n", + "\n", + "def init():\n", + " # Use Resource.create() instead of constructor directly\n", + " resource = Resource.create({\"service.name\": SERVICE_NAME})\n", + "\n", + " trace.set_tracer_provider(TracerProvider(resource=resource))\n", + " trace.get_tracer_provider().add_span_processor(\n", + " BatchSpanProcessor(ConsoleSpanExporter())\n", + " )\n", + "\n", + "\n", + "init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Callable\n", + "\n", + "from trulens.apps.custom import instrument\n", + "\n", + "SERVICE_NAME = \"trulens\"\n", + "\n", + "\n", + "def decorator(func: Callable):\n", + " tracer = trace.get_tracer(SERVICE_NAME)\n", + "\n", + " def wrapper(*args, **kwargs):\n", + " print(\"start wrap\")\n", + "\n", + " with tracer.start_as_current_span(\"custom\"):\n", + " result = func(*args, **kwargs)\n", + " span = trace.get_current_span()\n", + " print(\"---span---\")\n", + " print(span.get_span_context())\n", + " span.set_attribute(\"result\", result)\n", + " span.set_status(trace.Status(trace.StatusCode.OK))\n", + " return result\n", + "\n", + " return wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from examples.dev.dummy_app.dummy import Dummy\n", + "\n", + "\n", + "class TestApp(Dummy):\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " @decorator\n", + " @instrument\n", + " def respond_to_query(self, query: str) -> str:\n", + " return f\"answer: {self.nested(query)}\"\n", + "\n", + " @decorator\n", + " @instrument\n", + " def nested(self, query: str) -> str:\n", + " return f\"nested: {query}\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from trulens.core.session import TruSession\n", + "\n", + "SERVICE_NAME = \"trulens\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "session = TruSession()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "trulens", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/core/trulens/core/database/migrations/data.py b/src/core/trulens/core/database/migrations/data.py index 2c0dd3274..f5a6ea1dd 100644 --- a/src/core/trulens/core/database/migrations/data.py +++ b/src/core/trulens/core/database/migrations/data.py @@ -15,7 +15,7 @@ from trulens.core.schema import record as record_schema from trulens.core.utils import pyschema as pyschema_utils -sql_alchemy_migration_versions: List[int] = [1, 2, 3] +sql_alchemy_migration_versions: List[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] """DB versions.""" sqlalchemy_upgrade_paths: Dict[int, Tuple[int, Callable[[DB]]]] = { diff --git a/src/core/trulens/core/database/migrations/versions/10_create_event_table.py b/src/core/trulens/core/database/migrations/versions/10_create_event_table.py new file mode 100644 index 000000000..3dfe4eae6 --- /dev/null +++ b/src/core/trulens/core/database/migrations/versions/10_create_event_table.py @@ -0,0 +1,44 @@ +"""create event table + +Revision ID: 10 +Revises: 9 +Create Date: 2024-12-11 09:32:48.976169 +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = "10" +down_revision = "9" +branch_labels = None +depends_on = None + + +def upgrade(config) -> None: + prefix = config.get_main_option("trulens.table_prefix") + + if prefix is None: + raise RuntimeError("trulens.table_prefix is not set") + + op.create_table( + prefix + "events", + sa.Column("event_id", sa.VARCHAR(length=256), nullable=False), + sa.Column("record", sa.JSON(), nullable=False), + sa.Column("record_attributes", sa.JSON(), nullable=False), + sa.Column("record_type", sa.VARCHAR(length=256), nullable=False), + sa.Column("resource_attributes", sa.JSON(), nullable=False), + sa.Column("start_timestamp", sa.TIMESTAMP(), nullable=False), + sa.Column("timestamp", sa.TIMESTAMP(), nullable=False), + sa.Column("trace", sa.JSON(), nullable=False), + sa.PrimaryKeyConstraint("event_id"), + ) + + +def downgrade(config) -> None: + prefix = config.get_main_option("trulens.table_prefix") + + if prefix is None: + raise RuntimeError("trulens.table_prefix is not set") + + op.drop_table(prefix + "events") diff --git a/src/core/trulens/core/database/orm.py b/src/core/trulens/core/database/orm.py index 67a2dec75..bc8387f55 100644 --- a/src/core/trulens/core/database/orm.py +++ b/src/core/trulens/core/database/orm.py @@ -5,9 +5,12 @@ from sqlite3 import Connection as SQLite3Connection from typing import ClassVar, Dict, Generic, Type, TypeVar +from sqlalchemy import JSON +from sqlalchemy import TIMESTAMP from sqlalchemy import VARCHAR from sqlalchemy import Column from sqlalchemy import Engine +from sqlalchemy import Enum from sqlalchemy import Float from sqlalchemy import ForeignKey from sqlalchemy import Text @@ -22,6 +25,7 @@ from trulens.core.database import base as core_db from trulens.core.schema import app as app_schema from trulens.core.schema import dataset as dataset_schema +from trulens.core.schema import event as event_schema from trulens.core.schema import feedback as feedback_schema from trulens.core.schema import groundtruth as groundtruth_schema from trulens.core.schema import record as record_schema @@ -116,6 +120,7 @@ class ORM(abc.ABC, Generic[T]): FeedbackResult: Type[T] GroundTruth: Type[T] Dataset: Type[T] + Event: Type[T] def new_orm(base: Type[T], prefix: str = "trulens_") -> Type[ORM[T]]: @@ -401,6 +406,81 @@ def parse( dataset_json=obj.model_dump_json(redact_keys=redact_keys), ) + class Event(base): + """ + ORM class for OTEL traces/spans. This should be kept in line with the event table + https://docs.snowflake.com/en/developer-guide/logging-tracing/event-table-columns#data-for-trace-events + https://docs.snowflake.com/en/developer-guide/logging-tracing/event-table-columns#label-event-table-record-column-span + """ + + _table_base_name = "events" + + record = Column(JSON, nullable=False) + """ + For a span, this is an object that includes: + - name: the function/procedure that emitted the data + - kind: SPAN_KIND_TRULENS + - parent_span_id: the unique identifier for the parent span + - status: STATUS_CODE_ERROR when the span corresponds to an unhandled exception. Otherwise, STATUS_CODE_UNSET. + """ + + event_id = Column(TYPE_ID, nullable=False, primary_key=True) + """ + Used as primary key for the schema. Not technically present in the event table schema, + but having it here makes it easier to work with the ORM. + """ + + record_attributes = Column(JSON, nullable=False) + """ + Attributes of the record that can either come from the user, or based on the TruLens semantic conventions. + """ + + record_type = Column( + Enum(event_schema.EventRecordType), nullable=False + ) + """ + Specifies the kind of record specified by this row. This will always be "SPAN" for TruLens. + """ + + resource_attributes = Column(JSON, nullable=False) + """ + Reserved. + """ + + start_timestamp = Column(TIMESTAMP, nullable=False) + """ + The timestamp when the span started. This is a UNIX timestamp in milliseconds. + Note: The Snowflake event table uses the TIMESTAMP_NTZ data type for this column. + """ + + timestamp = Column(TIMESTAMP, nullable=False) + """ + The timestamp when the span concluded. This is a UNIX timestamp in milliseconds. + Note: The Snowflake event table uses the TIMESTAMP_NTZ data type for this column. + """ + + trace = Column(JSON, nullable=False) + """ + Contains the span context, including the trace_id, parent_id, and span_id for the span. + """ + + @classmethod + def parse( + cls, + obj: event_schema.Event, + redact_keys: bool = False, + ) -> ORM.EventTable: + return cls( + event_id=obj.event_id, + record=obj.record, + record_attributes=obj.record_attributes, + record_type=obj.record_type, + resource_attributes=obj.resource_attributes, + start_timestamp=obj.start_timestamp, + timestamp=obj.timestamp, + trace=obj.trace, + ) + configure_mappers() # IMPORTANT # Without the above, orm class attributes which are defined using backref # will not be visible, i.e. orm.AppDefinition.records. diff --git a/src/core/trulens/core/schema/event.py b/src/core/trulens/core/schema/event.py new file mode 100644 index 000000000..63fc96e5a --- /dev/null +++ b/src/core/trulens/core/schema/event.py @@ -0,0 +1,86 @@ +"""Serializable event-related classes.""" + +from __future__ import annotations + +from datetime import datetime +import enum +import logging +from typing import Any, Dict, Hashable + +from trulens.core.utils import serial as serial_utils +from typing_extensions import TypedDict + +logger = logging.getLogger(__name__) + + +class EventRecordType(enum.Enum): + """The enumeration of the possible record types for an event.""" + + SPAN = "SPAN" + + +class Trace(TypedDict): + """The type hint for a trace dictionary.""" + + trace_id: str + parent_id: str + span_id: str + + +class Event(serial_utils.SerialModel, Hashable): + """The class that represents a single event data entry.""" + + event_id: str + """ + The unique identifier for the event. + """ + + record: Dict[str, Any] + """ + For a span, this is an object that includes: + - name: the function/procedure that emitted the data + - kind: SPAN_KIND_TRULENS + - parent_span_id: the unique identifier for the parent span + - status: STATUS_CODE_ERROR when the span corresponds to an unhandled exception. Otherwise, STATUS_CODE_UNSET. + """ + + record_attributes: Dict[str, Any] + """ + Attributes of the record that can either come from the user, or based on the TruLens semantic conventions. + """ + + record_type: EventRecordType + """ + Specifies the kind of record specified by this row. This will always be "SPAN" for TruLens. + """ + + resource_attributes: Dict[str, Any] + """ + Reserved. + """ + + start_timestamp: datetime + """ + The timestamp when the span started. This is a UNIX timestamp in milliseconds. + Note: The Snowflake event table uses the TIMESTAMP_NTZ data type for this column. + """ + + timestamp: datetime + """ + The timestamp when the span concluded. This is a UNIX timestamp in milliseconds. + Note: The Snowflake event table uses the TIMESTAMP_NTZ data type for this column. + """ + + trace: Trace + """ + The trace context information for the span. + """ + + def __init__( + self, + **kwargs, + ): + super().__init__(**kwargs) + + def __hash__(self): + return self.trace["span_id"] diff --git a/src/core/trulens/core/schema/types.py b/src/core/trulens/core/schema/types.py index 18428c715..5384bb34f 100644 --- a/src/core/trulens/core/schema/types.py +++ b/src/core/trulens/core/schema/types.py @@ -83,3 +83,7 @@ def new_call_id() -> CallID: By default these are hashes of dataset content as json. """ + +EventID: TypeAlias = str +"""Unique identifier for a event. +""" diff --git a/tests/unit/static/golden/api.trulens.3.11.yaml b/tests/unit/static/golden/api.trulens.3.11.yaml index 1465da6eb..ae63b815b 100644 --- a/tests/unit/static/golden/api.trulens.3.11.yaml +++ b/tests/unit/static/golden/api.trulens.3.11.yaml @@ -4,7 +4,7 @@ trulens: lows: {} trulens.benchmark: __class__: builtins.module - __version__: 1.2.8 + __version__: 1.2.11 highs: {} lows: __version__: builtins.str @@ -60,7 +60,7 @@ trulens.benchmark.test_cases: generate_summeval_groundedness_golden_set: builtins.function trulens.core: __class__: builtins.module - __version__: 1.2.8 + __version__: 1.2.11 highs: Feedback: pydantic._internal._model_construction.ModelMetaclass FeedbackMode: enum.EnumType @@ -816,6 +816,41 @@ trulens.core.schema.dataset.Dataset: dataset_id: builtins.str meta: typing.Dict name: builtins.str +trulens.core.schema.event: + __class__: builtins.module + highs: {} + lows: + Event: pydantic._internal._model_construction.ModelMetaclass + EventRecordType: enum.EnumType + Trace: typing_extensions._TypedDictMeta +trulens.core.schema.event.Event: + __bases__: + - trulens.core.utils.serial.SerialModel + - collections.abc.Hashable + - typing.Generic + __class__: pydantic._internal._model_construction.ModelMetaclass + attributes: + event_id: builtins.str + record: typing.Dict[builtins.str, typing.Any] + record_attributes: typing.Dict[builtins.str, typing.Any] + record_type: trulens.core.schema.event.EventRecordType + resource_attributes: typing.Dict[builtins.str, typing.Any] + start_timestamp: datetime.datetime + timestamp: datetime.datetime + trace: trulens.core.schema.event.Trace +trulens.core.schema.event.EventRecordType: + __bases__: + - enum.Enum + __class__: enum.EnumType + attributes: + SPAN: trulens.core.schema.event.EventRecordType + name: enum.property + value: enum.property +trulens.core.schema.event.Trace: + __bases__: + - builtins.dict + __class__: typing_extensions._TypedDictMeta + attributes: {} trulens.core.schema.feedback: __class__: builtins.module highs: {} @@ -1678,7 +1713,7 @@ trulens.core.utils.trulens.Other: attributes: {} trulens.dashboard: __class__: builtins.module - __version__: 1.2.8 + __version__: 1.2.11 highs: run_dashboard: builtins.function run_dashboard_sis: builtins.function @@ -1966,7 +2001,7 @@ trulens.dashboard.ux.styles.ResultCategoryType: value: enum.property trulens.feedback: __class__: builtins.module - __version__: 1.2.8 + __version__: 1.2.11 highs: Embeddings: pydantic._internal._model_construction.ModelMetaclass GroundTruthAggregator: pydantic._internal._model_construction.ModelMetaclass