feat(product-assistant): product memory #27270

Open · wants to merge 58 commits into base: master

Changes from all commits (58 commits):
e4e5597
feat: perplexity prompt
skoob13 Dec 16, 2024
900558a
chore: migrations
skoob13 Dec 17, 2024
4afa58b
feat: initial memory scraping
skoob13 Dec 18, 2024
995e1f0
feat: block onboarding when someone has started it
skoob13 Dec 18, 2024
8c4fa2e
feat: assistant onboarding
skoob13 Dec 18, 2024
1af9f54
chore: migration
skoob13 Dec 20, 2024
66d3ec2
feat: flag for resumed conversation
skoob13 Dec 19, 2024
7cacd2f
fix: mypy
skoob13 Dec 20, 2024
d47b848
feat: stricter prompt for perplexity
skoob13 Dec 20, 2024
7143371
feat: simple dynamic forms
skoob13 Jan 2, 2025
92e3d44
feat: style form options
skoob13 Jan 2, 2025
4eedbd6
feat: formatting core memory
skoob13 Jan 2, 2025
7c31136
feat: core memory prompts
skoob13 Jan 2, 2025
db9d55d
feat: compress memory
skoob13 Jan 3, 2025
9dcf2bc
fix: update mobile prompts
skoob13 Jan 3, 2025
40ffbf1
fix: disable streaming for compressor
skoob13 Jan 3, 2025
4a5211b
feat: memory collector nodes
skoob13 Jan 3, 2025
51644c2
feat: append/replace memory
skoob13 Jan 3, 2025
12bd3f9
fix: mypy
skoob13 Jan 3, 2025
f8c3cd8
feat: memory router
skoob13 Jan 6, 2025
5a6b4e7
Update UI snapshots for `chromium` (1)
github-actions[bot] Jan 6, 2025
7b0cd1f
feat: re-use prompts
skoob13 Jan 6, 2025
e1e24fa
fix: excessive detection
skoob13 Jan 6, 2025
4df2af3
test: eval tests
skoob13 Jan 6, 2025
b124610
fix: broader check for the planner
skoob13 Jan 6, 2025
fc8a9eb
fix: filter out summarizer messages
skoob13 Jan 6, 2025
b3d8c04
fix: assistant tests
skoob13 Jan 6, 2025
5a7d502
test: core memory model
skoob13 Jan 6, 2025
eeec86c
Merge branch 'feat/core-agent-memory' of github.com:PostHog/posthog i…
skoob13 Jan 6, 2025
86dd54e
test: onboarding node
skoob13 Jan 7, 2025
ec1b6d8
fix: allow optional state
skoob13 Jan 7, 2025
dd0bf31
test: memory initializer
skoob13 Jan 7, 2025
3145b81
test: memory initializer interrupt
skoob13 Jan 7, 2025
d942f68
test: memory collector node
skoob13 Jan 7, 2025
b59724e
test: memory collector tools
skoob13 Jan 7, 2025
cf72449
test: abstract nodes
skoob13 Jan 7, 2025
6d9c76a
fix: use product description from project for initial memory
skoob13 Jan 7, 2025
27a45db
fix: rename tests
skoob13 Jan 7, 2025
d207440
Merge branch 'fix/taxonomy-planner-failover' into feat/core-agent-memory
skoob13 Jan 7, 2025
2859068
fix: set skipped status after rejecting scraped memory
skoob13 Jan 7, 2025
3e06455
test: assistant tests
skoob13 Jan 7, 2025
4a7ed05
chore: document code and move out messages to prompts
skoob13 Jan 7, 2025
e6eb4f7
fix: recompile dev requirements
skoob13 Jan 7, 2025
855561a
fix: mypy
skoob13 Jan 7, 2025
8036ebe
fix: hide form actions after submitting a response
skoob13 Jan 7, 2025
405e42c
fix: missing messages from memory initializer
skoob13 Jan 7, 2025
a089774
Merge branch 'master' into feat/core-agent-memory
skoob13 Jan 7, 2025
d13df1f
Merge branch 'master' into feat/core-agent-memory
Twixes Jan 8, 2025
c731539
Prevent memory being left in pending state
Twixes Jan 9, 2025
5461600
Tweak wording of the initialization flow
Twixes Jan 9, 2025
aa9a645
Tweak initialization prompts
Twixes Jan 9, 2025
6d9967b
Merge branch 'master' into feat/core-agent-memory
Twixes Jan 9, 2025
3732092
Unify `isFinalGroup` naming
Twixes Jan 9, 2025
4c4f723
Fix missing __init__.py in memory tests
Twixes Jan 9, 2025
3c68f38
Update UI snapshots for `chromium` (1)
github-actions[bot] Jan 9, 2025
6f307db
Add one more __init__.py
Twixes Jan 9, 2025
f9a9701
Merge branch 'master' into feat/core-agent-memory
Twixes Jan 9, 2025
5c7af30
Update UI snapshots for `chromium` (1)
github-actions[bot] Jan 9, 2025
45 changes: 34 additions & 11 deletions ee/hogai/assistant.py
@@ -1,6 +1,6 @@
import json
from collections.abc import Generator, Iterator
from typing import Any, Optional
from typing import Any, Optional, cast
from uuid import uuid4

from langchain_core.messages import AIMessageChunk
@@ -12,6 +12,7 @@
from ee import settings
from ee.hogai.funnels.nodes import FunnelGeneratorNode
from ee.hogai.graph import AssistantGraph
from ee.hogai.memory.nodes import MemoryInitializerNode
from ee.hogai.retention.nodes import RetentionGeneratorNode
from ee.hogai.schema_generator.nodes import SchemaGeneratorNode
from ee.hogai.trends.nodes import TrendsGeneratorNode
@@ -57,6 +58,17 @@
AssistantNodeName.RETENTION_GENERATOR: RetentionGeneratorNode,
}

STREAMING_NODES: set[AssistantNodeName] = {
AssistantNodeName.MEMORY_ONBOARDING,
AssistantNodeName.MEMORY_INITIALIZER,
AssistantNodeName.SUMMARIZER,
}
"""Nodes that can stream messages to the client."""


VERBOSE_NODES = STREAMING_NODES | {AssistantNodeName.MEMORY_INITIALIZER_INTERRUPT}
"""Nodes that can send messages to the client."""


class Assistant:
_team: Team
@@ -117,8 +129,11 @@ def _stream(self) -> Generator[str, None, None]:
# Check if the assistant has requested help.
state = self._graph.get_state(config)
if state.next:
interrupt_value = state.tasks[0].interrupts[0].value
yield self._serialize_message(
AssistantMessage(content=state.tasks[0].interrupts[0].value, id=str(uuid4()))
AssistantMessage(content=interrupt_value, id=str(uuid4()))
if isinstance(interrupt_value, str)
else interrupt_value
)
else:
self._report_conversation_state(last_viz_message)
@@ -227,26 +242,34 @@ def _process_value_update(self, update: GraphValueUpdateTuple) -> BaseModel | No
return node_val.messages[0]
elif node_val.intermediate_steps:
return AssistantGenerationStatusEvent(type=AssistantGenerationStatusType.GENERATION_ERROR)
elif node_val := state_update.get(AssistantNodeName.SUMMARIZER):
if isinstance(node_val, PartialAssistantState) and node_val.messages:
self._chunks = AIMessageChunk(content="")
return node_val.messages[0]

for node_name in VERBOSE_NODES:
if node_val := state_update.get(node_name):
if isinstance(node_val, PartialAssistantState) and node_val.messages:
self._chunks = AIMessageChunk(content="")
return node_val.messages[0]

return None

def _process_message_update(self, update: GraphMessageUpdateTuple) -> BaseModel | None:
langchain_message, langgraph_state = update[1]
if isinstance(langchain_message, AIMessageChunk):
if langgraph_state["langgraph_node"] in VISUALIZATION_NODES.keys():
node_name = langgraph_state["langgraph_node"]
if node_name in VISUALIZATION_NODES.keys():
self._chunks += langchain_message # type: ignore
parsed_message = VISUALIZATION_NODES[langgraph_state["langgraph_node"]].parse_output(
self._chunks.tool_calls[0]["args"]
)
parsed_message = VISUALIZATION_NODES[node_name].parse_output(self._chunks.tool_calls[0]["args"])
if parsed_message:
initiator_id = self._state.start_id if self._state is not None else None
return VisualizationMessage(answer=parsed_message.query, initiator=initiator_id)
elif langgraph_state["langgraph_node"] == AssistantNodeName.SUMMARIZER:
elif node_name in STREAMING_NODES:
self._chunks += langchain_message # type: ignore
if node_name == AssistantNodeName.MEMORY_INITIALIZER:
if not MemoryInitializerNode.should_process_message_chunk(langchain_message):
return None
else:
return AssistantMessage(
content=MemoryInitializerNode.format_message(cast(str, self._chunks.content))
)
return AssistantMessage(content=self._chunks.content)
return None

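For orientation, here is a minimal, self-contained sketch of the streaming gate this diff introduces: nodes in `STREAMING_NODES` have their token chunks forwarded to the client as they arrive, while the remaining `VERBOSE_NODES` only surface a final message. The `NodeName` enum and `Update` type below are simplified stand-ins, not the real `AssistantNodeName` or LangGraph update tuples.

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional


class NodeName(str, Enum):
    MEMORY_ONBOARDING = "memory_onboarding"
    MEMORY_INITIALIZER = "memory_initializer"
    MEMORY_INITIALIZER_INTERRUPT = "memory_initializer_interrupt"
    SUMMARIZER = "summarizer"
    ROUTER = "router"


STREAMING_NODES = {NodeName.MEMORY_ONBOARDING, NodeName.MEMORY_INITIALIZER, NodeName.SUMMARIZER}
VERBOSE_NODES = STREAMING_NODES | {NodeName.MEMORY_INITIALIZER_INTERRUPT}


@dataclass
class Update:
    node: NodeName
    chunk: Optional[str] = None    # token-level update from a streaming LLM call
    message: Optional[str] = None  # full message produced when a node finishes


def handle(update: Update) -> Optional[str]:
    """Return text to emit to the client, or None to stay silent."""
    if update.chunk is not None:
        # Token chunks are forwarded only for nodes explicitly marked as streaming.
        return update.chunk if update.node in STREAMING_NODES else None
    if update.message is not None:
        # Final messages are emitted for any verbose node (streaming nodes included).
        return update.message if update.node in VERBOSE_NODES else None
    return None


assert handle(Update(NodeName.SUMMARIZER, chunk="Hel")) == "Hel"
assert handle(Update(NodeName.ROUTER, chunk="internal")) is None
assert handle(Update(NodeName.MEMORY_INITIALIZER_INTERRUPT, message="Does this look right?")) == "Does this look right?"
```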
27 changes: 27 additions & 0 deletions ee/hogai/eval/conftest.py
@@ -8,6 +8,7 @@
from langchain_core.runnables import RunnableConfig

from ee.models import Conversation
from ee.models.assistant import CoreMemory
from posthog.demo.matrix.manager import MatrixManager
from posthog.models import Organization, Project, Team, User
from posthog.tasks.demo_create_data import HedgeboxMatrix
@@ -78,6 +79,32 @@ def user(team, django_db_blocker) -> Generator[User, None, None]:
user.delete()


@pytest.fixture(scope="package")
def core_memory(team) -> Generator[CoreMemory, None, None]:
initial_memory = """Hedgebox is a cloud storage service enabling users to store, share, and access files across devices.

The company operates in the cloud storage and collaboration market for individuals and businesses.

Their audience includes professionals and organizations seeking file management and collaboration solutions.

Hedgebox’s freemium model provides free accounts with limited storage and paid subscription plans for additional features.

Core features include file storage, synchronization, sharing, and collaboration tools for seamless file access and sharing.

It integrates with third-party applications to enhance functionality and streamline workflows.

Hedgebox sponsors the YouTube channel Marius Tech Tips."""

core_memory = CoreMemory.objects.create(
team=team,
text=initial_memory,
initial_text=initial_memory,
scraping_status=CoreMemory.ScrapingStatus.COMPLETED,
)
yield core_memory
core_memory.delete()


@pytest.mark.django_db(transaction=True)
@pytest.fixture
def runnable_config(team, user) -> Generator[RunnableConfig, None, None]:
178 changes: 178 additions & 0 deletions ee/hogai/eval/tests/test_eval_memory.py
@@ -0,0 +1,178 @@
import json
from collections.abc import Callable
from typing import Optional

import pytest
from deepeval import assert_test
from deepeval.metrics import GEval, ToolCorrectnessMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from langchain_core.messages import AIMessage
from langchain_core.runnables.config import RunnableConfig
from langgraph.graph.state import CompiledStateGraph

from ee.hogai.assistant import AssistantGraph
from ee.hogai.utils.types import AssistantNodeName, AssistantState
from posthog.schema import HumanMessage


@pytest.fixture
def retrieval_metrics():
retrieval_correctness_metric = GEval(
name="Correctness",
criteria="Determine whether the actual output is factually correct based on the expected output.",
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK",
"The actual fact must only contain information about the user's company or product",
"Context must not contain similar information to the actual fact",
],
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.CONTEXT,
LLMTestCaseParams.EXPECTED_OUTPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
],
threshold=0.7,
)

return [ToolCorrectnessMetric(), retrieval_correctness_metric]


@pytest.fixture
def replace_metrics():
retrieval_correctness_metric = GEval(
name="Correctness",
criteria="Determine whether the actual output tuple is factually correct based on the expected output tuple. The first element is the original fact from the context to replace with, while the second element is the new fact to replace it with.",
evaluation_steps=[
"Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
"You should also heavily penalize omission of detail",
"Vague language, or contradicting OPINIONS, are OK",
"The actual fact must only contain information about the user's company or product",
"Context must contain the first element of the tuples",
"For deletion, the second element should be an empty string in both the actual and expected output",
],
evaluation_params=[
LLMTestCaseParams.INPUT,
LLMTestCaseParams.CONTEXT,
LLMTestCaseParams.EXPECTED_OUTPUT,
LLMTestCaseParams.ACTUAL_OUTPUT,
],
threshold=0.7,
)

return [ToolCorrectnessMetric(), retrieval_correctness_metric]


@pytest.fixture
def call_node(team, runnable_config: RunnableConfig) -> Callable[[str], Optional[AIMessage]]:
graph: CompiledStateGraph = (
AssistantGraph(team).add_memory_collector(AssistantNodeName.END, AssistantNodeName.END).compile()
)

def callable(query: str) -> Optional[AIMessage]:
state = graph.invoke(
AssistantState(messages=[HumanMessage(content=query)]),
runnable_config,
)
validated_state = AssistantState.model_validate(state)
if not validated_state.memory_collection_messages:
return None
return validated_state.memory_collection_messages[-1]

return callable


def test_saves_relevant_fact(call_node, retrieval_metrics, core_memory):
query = "calculate ARR: use the paid_bill event and the amount property."
actual_output = call_node(query)
tool = actual_output.tool_calls[0]

test_case = LLMTestCase(
input=query,
expected_output="The product uses the event paid_bill and the property amount to calculate Annual Recurring Revenue (ARR).",
expected_tools=["core_memory_append"],
context=[core_memory.formatted_text],
actual_output=tool["args"]["memory_content"],
tools_called=[tool["name"]],
)
assert_test(test_case, retrieval_metrics)


def test_saves_company_related_information(call_node, retrieval_metrics, core_memory):
query = "Our secondary target audience is technical founders or highly-technical product managers."
actual_output = call_node(query)
tool = actual_output.tool_calls[0]

test_case = LLMTestCase(
input=query,
expected_output="The company's secondary target audience is technical founders or highly-technical product managers.",
expected_tools=["core_memory_append"],
context=[core_memory.formatted_text],
actual_output=tool["args"]["memory_content"],
tools_called=[tool["name"]],
)
assert_test(test_case, retrieval_metrics)


def test_omits_irrelevant_personal_information(call_node):
query = "My name is John Doherty."
actual_output = call_node(query)
assert actual_output is None


def test_omits_irrelevant_excessive_info_from_insights(call_node):
query = "Build a pageview trend for users with name John."
actual_output = call_node(query)
assert actual_output is None


def test_fact_replacement(call_node, core_memory, replace_metrics):
query = "Hedgebox doesn't sponsor the YouTube channel Marius Tech Tips anymore."
actual_output = call_node(query)
tool = actual_output.tool_calls[0]

test_case = LLMTestCase(
input=query,
expected_output=json.dumps(
[
"Hedgebox sponsors the YouTube channel Marius Tech Tips.",
"Hedgebox no longer sponsors the YouTube channel Marius Tech Tips.",
]
),
expected_tools=["core_memory_replace"],
context=[core_memory.formatted_text],
actual_output=json.dumps([tool["args"]["original_fragment"], tool["args"]["new_fragment"]]),
tools_called=[tool["name"]],
)
assert_test(test_case, replace_metrics)


def test_fact_removal(call_node, core_memory, replace_metrics):
query = "Delete info that Hedgebox sponsored the YouTube channel Marius Tech Tips."
actual_output = call_node(query)
tool = actual_output.tool_calls[0]

test_case = LLMTestCase(
input=query,
expected_output=json.dumps(["Hedgebox sponsors the YouTube channel Marius Tech Tips.", ""]),
expected_tools=["core_memory_replace"],
context=[core_memory.formatted_text],
actual_output=json.dumps([tool["args"]["original_fragment"], tool["args"]["new_fragment"]]),
tools_called=[tool["name"]],
)
assert_test(test_case, replace_metrics)


def test_parallel_calls(call_node):
query = "Delete info that Hedgebox sponsored the YouTube channel Marius Tech Tips, and we don't have file sharing."
actual_output = call_node(query)

tool = actual_output.tool_calls
test_case = LLMTestCase(
input=query,
expected_tools=["core_memory_replace", "core_memory_append"],
actual_output=actual_output.content,
tools_called=[tool[0]["name"], tool[1]["name"]],
)
assert_test(test_case, [ToolCorrectnessMetric()])
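For reference, a hedged sketch of the tool-call shapes these eval tests expect from the memory collector. The messages below are hand-built stand-ins; in the tests themselves they come out of the compiled LLM graph.

```python
from langchain_core.messages import AIMessage

# Appending a new fact: one tool call named core_memory_append with memory_content.
append_call = AIMessage(
    content="",
    tool_calls=[
        {
            "name": "core_memory_append",
            "args": {"memory_content": "The product uses the paid_bill event and the amount property to calculate ARR."},
            "id": "call_1",
        }
    ],
)

# Replacing (or deleting) an existing fact: core_memory_replace carries the original
# fragment and its replacement; an empty new_fragment signals deletion.
replace_call = AIMessage(
    content="",
    tool_calls=[
        {
            "name": "core_memory_replace",
            "args": {
                "original_fragment": "Hedgebox sponsors the YouTube channel Marius Tech Tips.",
                "new_fragment": "",
            },
            "id": "call_2",
        }
    ],
)

# The tests read tool_calls[0]["name"] and tool_calls[0]["args"] and score them
# against expected_tools / expected_output using the deepeval metrics above.
assert append_call.tool_calls[0]["name"] == "core_memory_append"
assert replace_call.tool_calls[0]["args"]["new_fragment"] == ""
```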
2 changes: 1 addition & 1 deletion ee/hogai/eval/tests/test_eval_router.py
@@ -13,7 +13,7 @@
def call_node(team, runnable_config) -> Callable[[str | list], str]:
graph: CompiledStateGraph = (
AssistantGraph(team)
.add_start()
.add_edge(AssistantNodeName.START, AssistantNodeName.ROUTER)
.add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END})
.compile()
)
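A small sketch of the eval-only wiring used in the change above, presumably to bypass the new memory onboarding nodes that `add_start()` would now pull in so that only routing behaviour is evaluated (the reasoning is inferred, not stated in the PR):

```python
from ee.hogai.assistant import AssistantGraph
from ee.hogai.utils.types import AssistantNodeName


def build_router_eval_graph(team):
    # Wire START directly to the router instead of going through the full start flow.
    return (
        AssistantGraph(team)
        .add_edge(AssistantNodeName.START, AssistantNodeName.ROUTER)
        .add_router(path_map={"trends": AssistantNodeName.END, "funnel": AssistantNodeName.END})
        .compile()
    )
```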
11 changes: 5 additions & 6 deletions ee/hogai/funnels/prompts.py
@@ -2,16 +2,15 @@
<agent_info>
You are an expert product analyst agent specializing in data visualization and funnel analysis. Your primary task is to understand a user's data taxonomy and create a plan for building a visualization that answers the user's question. This plan should focus on funnel insights, including a sequence of events, property filters, and values of property filters.

{{#product_description}}
The product being analyzed is described as follows:
<product_description>
{{.}}
</product_description>
{{/product_description}}
{{core_memory_instructions}}

{{react_format}}
</agent_info>

<core_memory>
{{core_memory}}
</core_memory>

{{react_human_in_the_loop}}

Below you will find information on how to correctly discover the taxonomy of the user's data.
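The funnels prompt now injects shared core-memory content instead of a per-prompt product description. Below is a minimal sketch of how the `{{core_memory_instructions}}` and `{{core_memory}}` placeholders could be rendered, assuming LangChain's mustache template support; the template text and variable values are illustrative, not the real prompts.

```python
from langchain_core.prompts import ChatPromptTemplate

# Illustrative template only; the real prompt lives in ee/hogai/funnels/prompts.py.
FUNNEL_SYSTEM_TEMPLATE = (
    "<agent_info>\n"
    "You are an expert product analyst agent specializing in funnel analysis.\n"
    "{{core_memory_instructions}}\n"
    "</agent_info>\n"
    "\n"
    "<core_memory>\n"
    "{{core_memory}}\n"
    "</core_memory>"
)

prompt = ChatPromptTemplate.from_messages(
    [("system", FUNNEL_SYSTEM_TEMPLATE)],
    template_format="mustache",
)

messages = prompt.format_messages(
    core_memory_instructions="You have access to the core memory about the user's product below.",
    core_memory="Hedgebox is a cloud storage service for storing, sharing, and accessing files across devices.",
)
print(messages[0].content)
```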