diff --git a/README.md b/README.md index f7309fcd..274f0c3d 100644 --- a/README.md +++ b/README.md @@ -134,8 +134,9 @@ Manage multiple projects with ease. 2. Goal Fulfillment Rate (goal_fulfillment_rate) 3. Tool Call Correctness Rate (tool_call_correctness_rate) 4. Tool Call Success Rate (tool_call_success_rate) -5. Error Detection Rate (error_detection_rate) -5. Context Retention Rate (context_retention_rate) +5. Response Latency (response_latency) +6. Error Detection Rate (error_detection_rate) +7. Context Retention Rate (context_retention_rate) - **Run multiple metrics together** ```python diff --git a/agentneo/evaluation/evaluation.py b/agentneo/evaluation/evaluation.py index a8b87c12..74cc124e 100644 --- a/agentneo/evaluation/evaluation.py +++ b/agentneo/evaluation/evaluation.py @@ -21,6 +21,7 @@ execute_tool_selection_accuracy_metric, execute_tool_usage_efficiency_metric, execute_plan_adaptibility_metric, + execute_response_latency_metric, execute_error_detection_rate_metric, execute_context_retention_metric ) @@ -79,6 +80,11 @@ def _execute_metric(self, metric, config, metadata, custom_criteria, context): trace_json=self.trace_data, config=config, ) + elif metric == 'response_latency': + return execute_response_latency_metric( + trace_json=self.trace_data, + config=config, + ) elif metric == 'error_detection_rate': return execute_error_detection_rate_metric( trace_json=self.trace_data, diff --git a/agentneo/evaluation/metrics/__init__.py b/agentneo/evaluation/metrics/__init__.py index a9484f53..a5c5a0b8 100644 --- a/agentneo/evaluation/metrics/__init__.py +++ b/agentneo/evaluation/metrics/__init__.py @@ -5,6 +5,7 @@ from .tool_usage_efficiency import execute_tool_usage_efficiency_metric from .goal_decomposition_efficiency import execute_goal_decomposition_efficiency_metric from .plan_adaptibility import execute_plan_adaptibility_metric +from .response_latency import execute_response_latency_metric from .error_detection_rate import 
execute_error_detection_rate_metric from .context_retention_rate import execute_context_retention_metric from .custom_evaluation_metric import execute_custom_evaluation_metric @@ -17,6 +18,7 @@ "execute_tool_usage_efficiency_metric", "execute_goal_decomposition_efficiency_metric", "execute_plan_adaptibility_metric", + "execute_response_latency_metric", "execute_error_detection_rate_metric", "execute_context_retention_metric" "execute_custom_evaluation_metric" diff --git a/agentneo/evaluation/metrics/response_latency.py b/agentneo/evaluation/metrics/response_latency.py new file mode 100644 index 00000000..b113b394 --- /dev/null +++ b/agentneo/evaluation/metrics/response_latency.py @@ -0,0 +1,78 @@ +import json +from typing import List, Optional, Dict, Any +from datetime import datetime + +def parse_timestamp(timestamp: str) -> datetime: + return datetime.fromisoformat(timestamp) + +def calculate_latency(start_time: str, end_time: str) -> Optional[float]: + start = parse_timestamp(start_time) + end = parse_timestamp(end_time) + if start and end: + return (end - start).total_seconds() + else: + return None + +def extract_latency_data(trace_json: Dict[str, Any]) -> List[float]: + latencies = [] + for call_type in ["llm_calls", "tool_calls"]: + if call_type in trace_json: + for call in trace_json[call_type]: + if "start_time" in call and "end_time" in call: + latency = calculate_latency(call["start_time"], call["end_time"]) + if latency is not None: + latencies.append(latency) + return latencies + +def execute_response_latency_metric(trace_json: Dict[str, Any], config: Dict[str, Any] = {}) -> Dict[str, Any]: + try: + # Extract latency data + latencies = extract_latency_data(trace_json) + + if not latencies: + return { + "metric_name": "response_latency", + "config": config, + "result": { + "score": None, + "reason": "No response latencies found in the trace.", + }, + } + + # Calculate primary statistics + average_latency = sum(latencies) / len(latencies) + min_latency = 
min(latencies) + max_latency = max(latencies) + + # Calculate additional statistics + latencies.sort() + median_latency = latencies[len(latencies) // 2] + p90_latency = latencies[int(len(latencies) * 0.9)] + std_dev_latency = (sum((x - average_latency) ** 2 for x in latencies) / len(latencies)) ** 0.5 + + return { + "metric_name": "response_latency", + "config": config, + "result": { + "score": average_latency, + "reason": "Response latency metrics successfully calculated.", + "details": { + "average_latency": average_latency, + "min_latency": min_latency, + "max_latency": max_latency, + "median_latency": median_latency, + "p90_latency": p90_latency, + "std_dev_latency": std_dev_latency, + }, + }, + } + + except Exception as e: + return { + "metric_name": "response_latency", + "config": config, + "result": { + "score": None, + "reason": f"An error occurred during evaluation: {str(e)}", + }, + } \ No newline at end of file diff --git a/docs/content/index.md b/docs/content/index.md index 7f0f1aad..f3f6b17a 100644 --- a/docs/content/index.md +++ b/docs/content/index.md @@ -55,6 +55,7 @@ Welcome to AgentNeo! This documentation will help you understand and utilize Age - Goal Decomposition Efficiency - Goal Fulfillment Rate - Tool Call Metrics + - Response Latency - Error Detection Rate - Context Retention Rate - [Configuration](metrics/configuration.md) diff --git a/docs/content/metrics/supported-metrics.md b/docs/content/metrics/supported-metrics.md index e02a0ced..e1026369 100644 --- a/docs/content/metrics/supported-metrics.md +++ b/docs/content/metrics/supported-metrics.md @@ -56,12 +56,21 @@ Measures the reliability of tool executions. exe.evaluate(metric_list=['tool_call_success_rate']) ``` -### 5. Error Detection Rate +### 5. Response Latency +Measures the time taken to complete LLM and tool calls. + +```python +exe.evaluate(metric_list=['response_latency']) +``` + +### 6. 
Error Detection Rate Measures the system's ability to detect and identify errors during execution. ```python exe.evaluate(metric_list=['error_detection_rate']) -### 5. Context Retention Rate +``` + +### 7. Context Retention Rate Measures the Context Retention during calls. ```python @@ -73,8 +82,9 @@ exe.evaluate(metric_list=['context_retention_rate']) config = { "model": "gpt-4o-mini", } +``` -### 6. Custom User Defined Metrics +### 8. Custom User Defined Metrics Enables users to define there own personlized metrics system ##### Default metrics @@ -157,6 +167,7 @@ exe.evaluate( 'goal_fulfillment_rate', 'tool_call_correctness_rate', 'tool_call_success_rate', + 'response_latency' 'error_detection_rate 'context_retention_rate' ] diff --git a/examples/FinancialAnalysisSystem.ipynb b/examples/FinancialAnalysisSystem.ipynb index b5d0f504..f7dcbb1c 100644 --- a/examples/FinancialAnalysisSystem.ipynb +++ b/examples/FinancialAnalysisSystem.ipynb @@ -20,17 +20,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:httpx:HTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json \"HTTP/1.1 200 OK\"\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "os.chdir('..')\n", @@ -59,13 +51,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "Project connected successfully\n", "Project 'financial_analysis_project2' found.\n", "Tracing Started.\n" ] @@ -75,8 +68,15 @@ "# Initialize AgentNeo session\n", "neo_session = AgentNeo(session_name=\"financial_analysis_session2\")\n", "\n", + "project_name = \"financial_analysis_project2\"\n", "# Create project\n", - "neo_session.create_project(project_name=\"financial_analysis_project2\")\n", + "try:\n", + " 
neo_session.create_project(project_name=project_name)\n", + " print(\"Project created successfully\")\n", + "# Connect project\n", + "except:\n", + " neo_session.connect_project(project_name=project_name)\n", + " print(\"Project connected successfully\")\n", "\n", "# Start tracing\n", "tracer = Tracer(session=neo_session)\n", @@ -85,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -203,7 +203,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -212,6 +212,7 @@ "text": [ "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "DEBUG:agentneo.tracing.agent_tracer:Successfully updated and committed AgentCallModel with id 15\n" "DEBUG:agentneo.tracing.agent_tracer:Successfully updated and committed AgentCallModel with id 1\n" ] }, @@ -221,6 +222,19 @@ "text": [ "\n", "Analysis for AAPL:\n", + "Stock Data: {'symbol': 'AAPL', 'price': 170.99, 'change': -4.44}\n", + "News Sentiment: 0.04545454545454545\n", + "Economic Indicators: {'gdp_growth': -0.86, 'unemployment_rate': 8.18, 'inflation_rate': 2.61}\n", + "\n", + "Investment Recommendation:\n", + "Given the market outlook and the investor's moderate risk tolerance, a specific investment recommendation would involve a balanced approach that accounts for the potential rewards associated with Apple Inc. (AAPL) while also mitigating risks through diversification and careful selection of securities. Here's the recommendation:\n", + "\n", + "**Diversified Investment Strategy with a Focus on Apple Inc. (AAPL):**\n", + "\n", + "1. 
**AAPL Position Sizing:** Given AAPL's current trading situation, with a slight decline and a relatively low yet positive news sentiment, it may be advisable to include AAPL in the investment portfolio but with limited exposure. Consider allocating a portion of the portfolio to AAPL that reflects the investor's moderate risk tolerance, such as 10-15%. This size allows the investor to potentially benefit from AAPL's growth and recovery without significantly exposing the portfolio to the stock's volatility.\n", + "\n", + "2. **Sector Diversification:** To reduce the impact of sector-specific risks, particularly from the tech sector that AAPL is a part\n", + "No action taken based on the current recommendation.\n", "Stock Data: {'symbol': 'AAPL', 'price': 353.24, 'change': -1.08}\n", "News Sentiment: 0.04545454545454545\n", "Economic Indicators: {'gdp_growth': 1.18, 'unemployment_rate': 5.0, 'inflation_rate': 1.14}\n", @@ -260,6 +274,7 @@ }, { "cell_type": "code", + "execution_count": 18, "execution_count": null, "metadata": {}, "outputs": [ @@ -267,11 +282,15 @@ "name": "stderr", "output_type": "stream", "text": [ + "\u001b[92m17:49:13 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", "\u001b[92m12:36:44 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", "INFO:LiteLLM:\n", "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "\u001b[92m17:49:15 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", + "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", + "\u001b[92m17:49:15 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", "\u001b[92m12:36:47 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", "\u001b[92m12:36:47 - LiteLLM:INFO\u001b[0m: utils.py:2687 
- \n", @@ -293,6 +312,9 @@ "INFO:LiteLLM:\n", "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", + "\u001b[92m17:49:17 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", + "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", + "\u001b[92m17:49:17 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", "\u001b[92m12:36:57 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", "\u001b[92m12:36:57 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", @@ -300,338 +322,6 @@ "INFO:LiteLLM:\n", "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:03 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:03 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:04 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:04 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:09 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed 
Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:09 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:10 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:10 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:12 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:12 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:13 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:13 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:14 - 
LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:14 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:15 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:15 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:16 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:16 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:18 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:18 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-3.5-turbo; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 
200 OK\"\n", - "\u001b[92m12:37:19 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:19 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:21 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:21 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:21 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:21 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:22 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:22 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST 
https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:24 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:24 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:25 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:25 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:26 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:26 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:28 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:28 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - 
"INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:29 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:29 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:31 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:31 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:32 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:32 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:34 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:34 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; 
provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:36 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:36 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:37 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:37 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:39 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:39 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:41 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:41 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() 
model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:42 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:42 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:44 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n", - "\u001b[92m12:37:44 - LiteLLM:INFO\u001b[0m: utils.py:2687 - \n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:LiteLLM:\n", - "LiteLLM completion() model= gpt-4o-mini; provider = openai\n", - "INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n", - "\u001b[92m12:37:47 - LiteLLM:INFO\u001b[0m: utils.py:938 - Wrapper: Completed Call, calling success_handler\n", - "INFO:LiteLLM:Wrapper: Completed Call, calling success_handler\n" - ] - } - ], - "source": [ - "exe = Evaluation(session=neo_session, trace_id=tracer.trace_id)\n", - "\n", - "# run a single metric\n", - "exe.evaluate(metric_list=['goal_decomposition_efficiency', \n", - " 'goal_fulfillment_rate', \n", - " 'tool_call_correctness_rate', \n", - " 'tool_call_success_rate',\n", - " 'erro_detection_rate'])" - " 'context_retention_rate'])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'metric_name': 'goal_decomposition_efficiency',\n", - " 'score': 0.85,\n", - " 'reason': \"The AI effectively decomposed the original goal into clear and relevant 
sub-tasks that align well with the user's objectives. Each sub-task is logically sequenced, allowing for a structured approach to achieving the overall goal. The tools used correspond appropriately to the sub-tasks, and the decomposition covers all necessary aspects of the investment decision-making process. However, there could be slight improvements in granularity, particularly in the analysis of market conditions, which could benefit from more detailed insights. Overall, the decomposition is efficient and facilitates a comprehensive approach to the user's investment strategy.\",\n", - " 'result_detail': {'metric_name': 'goal_fulfillment_rate',\n", - " 'config': {},\n", - " 'result': {'originalGoal': 'Make informed investment decisions regarding Apple Inc. (AAPL) stock by analyzing current stock data, news sentiment, and economic indicators, and executing a buy order.',\n", - " 'subtasks': ['Fetch current stock data for AAPL.',\n", - " 'Retrieve recent news articles related to AAPL.',\n", - " 'Analyze sentiment for each news article.',\n", - " 'Gather relevant economic indicators.',\n", - " 'Analyze market conditions based on stock data, news sentiment, and economic indicators.',\n", - " 'Generate an investment recommendation based on the market outlook and investor risk tolerance.',\n", - " 'Execute a buy order for AAPL.'],\n", - " 'score': 0.85,\n", - " 'reason': \"The AI effectively decomposed the original goal into clear and relevant sub-tasks that align well with the user's objectives. Each sub-task is logically sequenced, allowing for a structured approach to achieving the overall goal. The tools used correspond appropriately to the sub-tasks, and the decomposition covers all necessary aspects of the investment decision-making process. However, there could be slight improvements in granularity, particularly in the analysis of market conditions, which could benefit from more detailed insights. 
Overall, the decomposition is efficient and facilitates a comprehensive approach to the user's investment strategy.\"}},\n", - " 'config': {},\n", - " 'start_time': '2024-11-19T12:36:44.319561',\n", - " 'end_time': '2024-11-19T12:36:54.867682',\n", - " 'duration': 10.548121},\n", - " {'metric_name': 'goal_fulfillment_rate',\n", - " 'score': 0.75,\n", - " 'reason': \"The system responses effectively address the user's intent to make informed investment decisions regarding Apple Inc. (AAPL) stock. The responses include fetching current stock data, relevant news articles, sentiment analysis, and economic indicators, which are crucial for the user's analysis. The stock data indicates a slight decline in price, which aligns with the user's concern about recent performance. The news articles provide context, although the sentiment analysis scores are low, indicating mixed or slightly positive sentiment, which may not fully satisfy the user's need for comprehensive sentiment evaluation. The economic indicators are relevant and provide additional context for market conditions. The investment recommendation is well-aligned with the user's moderate risk tolerance and suggests a balanced approach, which is appropriate given the user's intent. However, the lack of an executed buy order indicates that the final step of the user's goal is not fulfilled, which slightly lowers the overall score. Overall, the system responses are informative and relevant, but the incomplete execution of the buy order prevents a perfect score.\",\n", - " 'result_detail': {'metric_name': 'goal_fulfillment_rate',\n", - " 'config': {},\n", - " 'result': {'inputGoal': \"The user aims to make informed investment decisions regarding Apple Inc. (AAPL) stock. They seek to analyze current stock data, news sentiment, and economic indicators to understand the market conditions better. 
After evaluating the information, the user intends to execute a buy order for AAPL, indicating a belief in its long-term growth potential despite a recent decline in stock price. Overall, the user's goal is to strategically invest in AAPL while considering their moderate risk tolerance.\",\n", - " 'relevantResponses': \"fetch_stock_data: {'symbol': 'AAPL', 'price': 353.24, 'change': -1.08}\\n\\nfetch_news_articles: ['AAPL announces new product line', 'AAPL reports quarterly earnings', 'AAPL faces regulatory scrutiny']\\n\\nanalyze_sentiment: 0.13636363636363635\\n\\nanalyze_sentiment: 0.0\\n\\nanalyze_sentiment: 0.0\\n\\nfetch_economic_indicators: {'gdp_growth': 1.18, 'unemployment_rate': 5.0, 'inflation_rate': 1.14}\\n\\nanalyze_market_conditions: Based on the provided data, we can create a concise market outlook focusing on the stock in question, Apple Inc. (AAPL), in the context of the prevailing economic indicators and sentiment analysis.\\n\\n### Stock Analysis: Apple Inc. (AAPL)\\n- **Current Stock Price:** $353.24\\n- **Percentage Change:** -1.08%\\n\\nThis decline in Appleā€™s stock price points to a negative market reaction on the trading day in question. However, this should be contextualized within broader market trends, the company's performance, and specific news events impacting the stock.\\n\\n### News Sentiment: 0.04545454545454545\\nThe news sentiment score, marginally above zero, suggests a slightly positive bias in news coverage concerning\\n\\ngenerate_investment_recommendation: Given the provided market outlook for Apple Inc. (AAPL), which highlights a recent decline in stock price and a slightly positive news sentiment, coupled with your moderate risk tolerance, a specific investment recommendation would entail a balanced approach that combines growth with a degree of caution. Here's the strategy:\\n\\n### Recommended Investment Strategy: Buy and Hold with Monitoring\\n\\n1. 
**Initial Purchase**: Considering the slight positive bias in news sentiment and your moderate risk tolerance, initiating a modest position in AAPL could be a prudent move. The recent dip in stock price may present a buying opportunity, assuming you believe in the company's long-term growth prospects. Apple, as a leading technology company, has shown resilience and innovation over the years, which could translate to stock recovery and growth over time.\\n\\n2. **Diversification**: To mitigate the risks associated with investing in a single stock and to balance your investment portfolio, consider diversifying across different sectors and asset classes. This may include bonds, ETFs, and stocks in\\n\\nexecute_buy_order: None\",\n", - " 'score': 0.75,\n", - " 'reason': \"The system responses effectively address the user's intent to make informed investment decisions regarding Apple Inc. (AAPL) stock. The responses include fetching current stock data, relevant news articles, sentiment analysis, and economic indicators, which are crucial for the user's analysis. The stock data indicates a slight decline in price, which aligns with the user's concern about recent performance. The news articles provide context, although the sentiment analysis scores are low, indicating mixed or slightly positive sentiment, which may not fully satisfy the user's need for comprehensive sentiment evaluation. The economic indicators are relevant and provide additional context for market conditions. The investment recommendation is well-aligned with the user's moderate risk tolerance and suggests a balanced approach, which is appropriate given the user's intent. However, the lack of an executed buy order indicates that the final step of the user's goal is not fulfilled, which slightly lowers the overall score. 
Overall, the system responses are informative and relevant, but the incomplete execution of the buy order prevents a perfect score.\"}},\n", - " 'config': {},\n", - " 'start_time': '2024-11-19T12:36:54.867682',\n", - " 'end_time': '2024-11-19T12:37:03.324747',\n", - " 'duration': 8.457065},\n", - " {'metric_name': 'tool_call_correctness_rate',\n", - " 'score': 0.42857142857142855,\n", - " 'reason': \"The correctness rate of 0.43 (or 43%) in this interaction can be explained by analyzing the tool usage against the intended tools and the actual calls made.\\n\\n1. **Intended Tools**: The intended tools for the query were:\\n - `fetch_news_articles`: To gather relevant news articles that could impact the stock's performance.\\n - `fetch_economic_indicators`: To obtain current economic data that influences market conditions.\\n - `fetch_stock_data`: To retrieve the latest stock information for AAPL.\\n\\n2. **Actual Tool Usage**: The actual calls made were 7, but only 3 of them were correct, meaning they aligned with the intended tools. The remaining 4 calls likely included tools that were not necessary for the analysis, such as `analyze_sentiment` and `execute_buy_order`, which do not directly contribute to providing a market outlook based on the given data.\\n\\n3. **Analysis of Correctness Rate**: The correctness rate is calculated as the number of correct calls divided by the total calls made:\\n \\\\[\\n \\\\text{Correctness Rate} = \\\\frac{\\\\text{Correct Calls}}{\\\\text{Total Calls}} = \\\\frac{3}{7} \\\\approx 0.43\\n \\\\]\\n This indicates that while some relevant tools were used correctly, a significant number of unnecessary or incorrect tools were also called, leading to a lower overall correctness rate.\\n\\nIn summary, the correctness rate reflects the effectiveness of tool selection in relation to the query's requirements. 
The presence of unnecessary tool calls diluted the effectiveness of the correct calls, resulting in a lower correctness rate.\",\n", - " 'result_detail': {'metric_name': 'tool_correctness',\n", - " 'config': {},\n", - " 'result': {'score': 0.42857142857142855,\n", - " 'reason': \"The correctness rate of 0.43 (or 43%) in this interaction can be explained by analyzing the tool usage against the intended tools and the actual calls made.\\n\\n1. **Intended Tools**: The intended tools for the query were:\\n - `fetch_news_articles`: To gather relevant news articles that could impact the stock's performance.\\n - `fetch_economic_indicators`: To obtain current economic data that influences market conditions.\\n - `fetch_stock_data`: To retrieve the latest stock information for AAPL.\\n\\n2. **Actual Tool Usage**: The actual calls made were 7, but only 3 of them were correct, meaning they aligned with the intended tools. The remaining 4 calls likely included tools that were not necessary for the analysis, such as `analyze_sentiment` and `execute_buy_order`, which do not directly contribute to providing a market outlook based on the given data.\\n\\n3. **Analysis of Correctness Rate**: The correctness rate is calculated as the number of correct calls divided by the total calls made:\\n \\\\[\\n \\\\text{Correctness Rate} = \\\\frac{\\\\text{Correct Calls}}{\\\\text{Total Calls}} = \\\\frac{3}{7} \\\\approx 0.43\\n \\\\]\\n This indicates that while some relevant tools were used correctly, a significant number of unnecessary or incorrect tools were also called, leading to a lower overall correctness rate.\\n\\nIn summary, the correctness rate reflects the effectiveness of tool selection in relation to the query's requirements. 
The presence of unnecessary tool calls diluted the effectiveness of the correct calls, resulting in a lower correctness rate.\",\n", - " 'details': {'correct_calls': 3,\n", - " 'total_calls': 7,\n", - " 'intended_tools': ['fetch_news_articles',\n", - " 'fetch_economic_indicators',\n", - " 'fetch_stock_data'],\n", - " 'available_tools': ['fetch_news_articles',\n", - " 'analyze_sentiment',\n", - " 'fetch_stock_data',\n", - " 'execute_buy_order',\n", - " 'fetch_economic_indicators']}}},\n", - " 'config': {},\n", - " 'start_time': '2024-11-19T12:37:03.324747',\n", - " 'end_time': '2024-11-19T12:37:09.816785',\n", - " 'duration': 6.492038},\n", - " {'metric_name': 'tool_call_success_rate',\n", - " 'score': 0.8571428571428571,\n", - " 'reason': 'The overall performance of the tool call was successful, with a high success rate of 0.86. Most of the tool call results indicated successful execution, with valid outputs and expected information being returned. However, there was one instance where the tool call output was None, indicating a potential error during execution. This highlights the need for further investigation and potential improvement in error handling to ensure consistent successful outcomes.',\n", - " 'result_detail': {'metric_name': 'tool_call_success_rate',\n", - " 'config': {},\n", - " 'result': {'score': 0.8571428571428571,\n", - " 'reason': 'The overall performance of the tool call was successful, with a high success rate of 0.86. Most of the tool call results indicated successful execution, with valid outputs and expected information being returned. However, there was one instance where the tool call output was None, indicating a potential error during execution. 
This highlights the need for further investigation and potential improvement in error handling to ensure consistent successful outcomes.'}},\n", - " 'config': {},\n", - " 'start_time': '2024-11-19T12:37:09.816785',\n", - " 'end_time': '2024-11-19T12:37:19.802576',\n", - " 'duration': 9.985791},\n", - " {'metric_name': 'context_retention_rate',\n", - " 'score': 0.24,\n", - " 'reason': 'The conversation demonstrates some coherence in terms of maintaining relevant information about the stock AAPL and its associated news articles. However, there are significant issues with context retention and information reuse. The context switches are frequent, leading to a fragmented flow of information, and the context maintenance score is notably low, indicating that the system struggles to retain and utilize previously mentioned information effectively. Additionally, while the referenced information is relevant, it is not consistently applied across interactions, leading to a lack of clarity in the final investment recommendation. Overall, the conversation lacks the necessary coherence and effectiveness in context retention to achieve a higher score.',\n", - " 'result_detail': {'metric_name': 'context_retention_rate',\n", - " 'config': {},\n", - " 'result': {'score': 0.24,\n", - " 'reason': 'The conversation demonstrates some coherence in terms of maintaining relevant information about the stock AAPL and its associated news articles. However, there are significant issues with context retention and information reuse. The context switches are frequent, leading to a fragmented flow of information, and the context maintenance score is notably low, indicating that the system struggles to retain and utilize previously mentioned information effectively. Additionally, while the referenced information is relevant, it is not consistently applied across interactions, leading to a lack of clarity in the final investment recommendation. 
Overall, the conversation lacks the necessary coherence and effectiveness in context retention to achieve a higher score.',\n", - " 'context_metrics': {'context_switches': 8,\n", - " 'context_maintenance_score': 0.0,\n", - " 'total_key_points': 33},\n", - " 'coherence_analysis': {'coherence_score': 0.4,\n", - " 'key_observations': [\"The conversation maintains relevant information about AAPL's stock price and news articles, but struggles with context retention.\",\n", - " 'Frequent context switches disrupt the flow of the conversation, making it harder to follow.',\n", - " 'The context maintenance score is 0.0, indicating poor retention of previously mentioned information.',\n", - " 'Information reuse is consistently low, suggesting that the system does not effectively leverage past interactions.',\n", - " 'The final investment recommendation lacks clarity due to inconsistent application of referenced information.']},\n", - " 'execution_stats': {'total_interactions': 9, 'average_info_reuse': 0.0}}},\n", - " 'config': {},\n", - " 'start_time': '2024-11-19T12:37:19.802576',\n", - " 'end_time': '2024-11-19T12:37:47.898536',\n", - " 'duration': 28.09596}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "results = exe.get_results()\n", - "results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analyzing the Results\n", - "\n", - "After running the analysis, you can examine the output to see the stock data, news sentiment, economic indicators, and the investment recommendation. 
The AgentNeo tracer will have logged all the steps of the process, which you can later analyze using the AgentNeo dashboard.\n", - "\n", - "To launch the AgentNeo dashboard and analyze the traces, you can use:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, "metadata": {}, "outputs": [ { diff --git a/examples/test_response_latency_metric.py b/examples/test_response_latency_metric.py new file mode 100644 index 00000000..63e25068 --- /dev/null +++ b/examples/test_response_latency_metric.py @@ -0,0 +1,83 @@ +import pytest +from datetime import datetime +from agentneo.evaluation.metrics import execute_response_latency_metric + +@pytest.fixture +def valid_trace_json(): + return { + "llm_calls": [ + {"start_time": "2024-11-16T10:00:00", "end_time": "2024-11-16T10:00:02"}, + {"start_time": "2024-11-16T10:05:00", "end_time": "2024-11-16T10:05:03"}, + ], + "tool_calls": [ + {"start_time": "2024-11-16T10:10:00", "end_time": "2024-11-16T10:10:01"}, + {"start_time": "2024-11-16T10:15:00", "end_time": "2024-11-16T10:15:04"}, + ], + } + + +@pytest.fixture +def trace_json_with_missing_times(): + return { + "llm_calls": [ + {"start_time": "2024-11-16T10:00:00", "end_time": "2024-11-16T10:00:02"}, + {"start_time": "2024-11-16T10:05:00"}, # Missing end_time + ], + "tool_calls": [ + {"start_time": "2024-11-16T10:10:00", "end_time": "2024-11-16T10:10:01"}, + {"end_time": "2024-11-16T10:15:04"}, # Missing start_time + ], + } + + +@pytest.fixture +def empty_trace_json(): + return {} + + +@pytest.fixture +def config(): + return {"metric_threshold": 2.0} + + +def test_valid_trace_json(valid_trace_json, config): + result = execute_response_latency_metric(valid_trace_json, config) + + assert result["metric_name"] == "response_latency" + assert result["result"]["score"] == 2.5 # Average latency: [2, 3, 1, 4] + assert result["result"]["details"]["min_latency"] == 1 + assert result["result"]["details"]["max_latency"] == 4 + assert result["result"]["details"]["median_latency"] == 
2.5 # Median of [1, 2, 3, 4] + assert result["result"]["details"]["p90_latency"] == 4 # 90th percentile + assert "std_dev_latency" in result["result"]["details"] + + + def test_trace_json_with_missing_times(trace_json_with_missing_times, config): + result = execute_response_latency_metric(trace_json_with_missing_times, config) + + assert result["metric_name"] == "response_latency" + assert result["result"]["score"] == 1.5 # Average latency: [2, 1] + assert result["result"]["details"]["min_latency"] == 1 + assert result["result"]["details"]["max_latency"] == 2 + + + def test_empty_trace_json(empty_trace_json, config): + result = execute_response_latency_metric(empty_trace_json, config) + + assert result["metric_name"] == "response_latency" + assert result["result"]["score"] is None + assert "No response latencies found" in result["result"]["reason"] + + + def test_error_handling(config, monkeypatch): + def faulty_extract_latency_data(_): + raise ValueError("Simulated error") + + # Patch the helper at its importable module path so the metric's + # try/except error path is exercised. + monkeypatch.setattr( + "agentneo.evaluation.metrics.response_latency.extract_latency_data", + faulty_extract_latency_data, + ) + + result = execute_response_latency_metric({}, config) + + assert result["metric_name"] == "response_latency" + assert result["result"]["score"] is None + assert "An error occurred during evaluation" in result["result"]["reason"] \ No newline at end of file