Merge pull request #108 from NastyRunner13/response_latency-Metrics
Add Response Latency Metric
vijayc9 authored Jan 16, 2025
2 parents 4d56440 + 1e3ce97 commit 0b6c737
Showing 8 changed files with 222 additions and 351 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -134,8 +134,9 @@ Manage multiple projects with ease.
2. Goal Fulfillment Rate (goal_fulfillment_rate)
3. Tool Call Correctness Rate (tool_call_correctness_rate)
4. Tool Call Success Rate (tool_call_success_rate)
5. Error Detection Rate (error_detection_rate)
5. Context Retention Rate (context_retention_rate)
5. Response Latency (response_latency)
6. Error Detection Rate (error_detection_rate)
7. Context Retention Rate (context_retention_rate)

- **Run multiple metrics together**
5 changes: 5 additions & 0 deletions agentneo/evaluation/evaluation.py
@@ -21,6 +21,7 @@
    execute_tool_selection_accuracy_metric,
    execute_tool_usage_efficiency_metric,
    execute_plan_adaptibility_metric,
    execute_response_latency_metric,
    execute_error_detection_rate_metric,
    execute_context_retention_metric
)
@@ -79,6 +80,10 @@ def _execute_metric(self, metric, config, metadata, custom_criteria, context):
                trace_json=self.trace_data,
                config=config,
            )
        elif metric == 'response_latency':
            return execute_response_latency_metric(
                trace_json=self.trace_data,
                config=config,
            )
        elif metric == 'error_detection_rate':
            return execute_error_detection_rate_metric(
                trace_json=self.trace_data,
2 changes: 2 additions & 0 deletions agentneo/evaluation/metrics/__init__.py
@@ -5,6 +5,7 @@
from .tool_usage_efficiency import execute_tool_usage_efficiency_metric
from .goal_decomposition_efficiency import execute_goal_decomposition_efficiency_metric
from .plan_adaptibility import execute_plan_adaptibility_metric
from .response_latency import execute_response_latency_metric
from .error_detection_rate import execute_error_detection_rate_metric
from .context_retention_rate import execute_context_retention_metric
from .custom_evaluation_metric import execute_custom_evaluation_metric
@@ -17,6 +18,7 @@
    "execute_tool_usage_efficiency_metric",
    "execute_goal_decomposition_efficiency_metric",
    "execute_plan_adaptibility_metric",
    "execute_response_latency_metric",
    "execute_error_detection_rate_metric",
    "execute_context_retention_metric",
    "execute_custom_evaluation_metric"
78 changes: 78 additions & 0 deletions agentneo/evaluation/metrics/response_latency.py
@@ -0,0 +1,78 @@
import json
from typing import List, Optional, Dict, Any
from datetime import datetime

def parse_timestamp(timestamp: str) -> datetime:
    return datetime.fromisoformat(timestamp)

def calculate_latency(start_time: str, end_time: str) -> Optional[float]:
    start = parse_timestamp(start_time)
    end = parse_timestamp(end_time)
    if start and end:
        return (end - start).total_seconds()
    else:
        return None

def extract_latency_data(trace_json: Dict[str, Any]) -> List[float]:
    latencies = []
    for call_type in ["llm_calls", "tool_calls"]:
        if call_type in trace_json:
            for call in trace_json[call_type]:
                if "start_time" in call and "end_time" in call:
                    latency = calculate_latency(call["start_time"], call["end_time"])
                    if latency is not None:
                        latencies.append(latency)
    return latencies

def execute_response_latency_metric(trace_json: Dict[str, Any], config: Dict[str, Any] = {}) -> Dict[str, Any]:
    try:
        # Extract latency data
        latencies = extract_latency_data(trace_json)

        if not latencies:
            return {
                "metric_name": "response_latency",
                "config": config,
                "result": {
                    "score": None,
                    "reason": "No response latencies found in the trace.",
                },
            }

        # Calculate primary statistics
        average_latency = sum(latencies) / len(latencies)
        min_latency = min(latencies)
        max_latency = max(latencies)

        # Calculate additional statistics
        latencies.sort()
        median_latency = latencies[len(latencies) // 2]
        p90_latency = latencies[int(len(latencies) * 0.9)]
        std_dev_latency = (sum((x - average_latency) ** 2 for x in latencies) / len(latencies)) ** 0.5

        return {
            "metric_name": "response_latency",
            "config": config,
            "result": {
                "score": average_latency,
                "reason": "Response latency metrics successfully calculated.",
                "details": {
                    "average_latency": average_latency,
                    "min_latency": min_latency,
                    "max_latency": max_latency,
                    "median_latency": median_latency,
                    "p90_latency": p90_latency,
                    "std_dev_latency": std_dev_latency,
                },
            },
        }

    except Exception as e:
        return {
            "metric_name": "response_latency",
            "config": config,
            "result": {
                "score": None,
                "reason": f"An error occurred during evaluation: {str(e)}",
            },
        }
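
For orientation, a minimal usage sketch of the new metric follows. The `sample_trace` values are hypothetical, but the shape matches what `extract_latency_data` reads (top-level `llm_calls` / `tool_calls` lists whose entries carry ISO-8601 `start_time` / `end_time` fields), and the import path follows the package layout in this PR.

```python
# Hypothetical trace fragment; field names mirror what extract_latency_data expects.
from agentneo.evaluation.metrics import execute_response_latency_metric

sample_trace = {
    "llm_calls": [
        {"start_time": "2025-01-16T10:00:00", "end_time": "2025-01-16T10:00:02"},
        {"start_time": "2025-01-16T10:00:05", "end_time": "2025-01-16T10:00:06"},
    ],
    "tool_calls": [
        {"start_time": "2025-01-16T10:00:03", "end_time": "2025-01-16T10:00:04"},
    ],
}

result = execute_response_latency_metric(trace_json=sample_trace)
print(result["result"]["score"])    # mean latency in seconds (~1.33 here)
print(result["result"]["details"])  # min/max/median/p90/std dev
```

With these numbers the score (mean latency) comes out to roughly 1.33 s; note that `median_latency` and `p90_latency` are taken by simple list indexing after sorting, so on very small samples the p90 value can coincide with the maximum.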
1 change: 1 addition & 0 deletions docs/content/index.md
@@ -55,6 +55,7 @@ Welcome to AgentNeo! This documentation will help you understand and utilize Age
- Goal Decomposition Efficiency
- Goal Fulfillment Rate
- Tool Call Metrics
- Response Latency
- Error Detection Rate
- Context Retention Rate
- [Configuration](metrics/configuration.md)
17 changes: 14 additions & 3 deletions docs/content/metrics/supported-metrics.md
@@ -56,12 +56,21 @@ Measures the reliability of tool executions.
exe.evaluate(metric_list=['tool_call_success_rate'])
```

### 5. Error Detection Rate
### 5. Response Latency
Measures the time taken to complete LLM and tool calls.

```python
exe.evaluate(metric_list=['response_latency'])
```

### 6. Error Detection Rate
Measures the system's ability to detect and identify errors during execution.

```python
exe.evaluate(metric_list=['error_detection_rate'])
### 5. Context Retention Rate
```

### 7. Context Retention Rate
Measures how well context is retained across calls.

```python
@@ -73,8 +82,9 @@ exe.evaluate(metric_list=['context_retention_rate'])
config = {
    "model": "gpt-4o-mini",
}
```
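
As a side note, a hedged sketch of how such a config might be passed when running metrics — the `config` keyword on `evaluate` is an assumption inferred from the `_execute_metric(self, metric, config, metadata, custom_criteria, context)` signature shown earlier in this diff, not a confirmed part of the public API.

```python
# Assumed invocation; the config keyword is inferred from _execute_metric's
# parameters in agentneo/evaluation/evaluation.py and may differ in practice.
config = {
    "model": "gpt-4o-mini",
}

# 'exe' is the Evaluation object used in the snippets above.
exe.evaluate(
    metric_list=['goal_fulfillment_rate', 'response_latency'],
    config=config,
)
```

The `response_latency` metric itself only reads timestamps from the trace, so the model setting has no effect on it.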

### 6. Custom User Defined Metrics
### 8. Custom User Defined Metrics
Enables users to define their own personalized metrics.

##### Default metrics
@@ -157,6 +167,7 @@ exe.evaluate(
        'goal_fulfillment_rate',
        'tool_call_correctness_rate',
        'tool_call_success_rate',
        'response_latency',
        'error_detection_rate',
        'context_retention_rate'
    ]
