Merge pull request #108 from NastyRunner13/response_latency-Metrics
Add Response Latency Metric
vijayc9 authored Jan 16, 2025
2 parents 4d56440 + 1e3ce97 commit 0b6c737
Showing 8 changed files with 222 additions and 351 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -134,8 +134,9 @@ Manage multiple projects with ease.
2. Goal Fulfillment Rate (goal_fulfillment_rate)
3. Tool Call Correctness Rate (tool_call_correctness_rate)
4. Tool Call Success Rate (tool_call_success_rate)
5. Error Detection Rate (error_detection_rate)
5. Context Retention Rate (context_retention_rate)
5. Response Latency (response_latency)
6. Error Detection Rate (error_detection_rate)
7. Context Retention Rate (context_retention_rate)

- **Run multiple metrics together**
5 changes: 5 additions & 0 deletions agentneo/evaluation/evaluation.py
@@ -21,6 +21,7 @@
    execute_tool_selection_accuracy_metric,
    execute_tool_usage_efficiency_metric,
    execute_plan_adaptibility_metric,
    execute_response_latency_metric,
    execute_error_detection_rate_metric,
    execute_context_retention_metric
)
@@ -79,6 +80,10 @@ def _execute_metric(self, metric, config, metadata, custom_criteria, context):
                trace_json=self.trace_data,
                config=config,
            )
        elif metric == 'response_latency':
            return execute_response_latency_metric(
                trace_json=self.trace_data,
                config=config,
            )
        elif metric == 'error_detection_rate':
            return execute_error_detection_rate_metric(
                trace_json=self.trace_data,
2 changes: 2 additions & 0 deletions agentneo/evaluation/metrics/__init__.py
@@ -5,6 +5,7 @@
from .tool_usage_efficiency import execute_tool_usage_efficiency_metric
from .goal_decomposition_efficiency import execute_goal_decomposition_efficiency_metric
from .plan_adaptibility import execute_plan_adaptibility_metric
from .response_latency import execute_response_latency_metric
from .error_detection_rate import execute_error_detection_rate_metric
from .context_retention_rate import execute_context_retention_metric
from .custom_evaluation_metric import execute_custom_evaluation_metric
@@ -17,6 +18,7 @@
    "execute_tool_usage_efficiency_metric",
    "execute_goal_decomposition_efficiency_metric",
    "execute_plan_adaptibility_metric",
    "execute_response_latency_metric",
    "execute_error_detection_rate_metric",
    "execute_context_retention_metric",
    "execute_custom_evaluation_metric"
78 changes: 78 additions & 0 deletions agentneo/evaluation/metrics/response_latency.py
@@ -0,0 +1,78 @@
import json
from typing import List, Optional, Dict, Any
from datetime import datetime

def parse_timestamp(timestamp: str) -> datetime:
    return datetime.fromisoformat(timestamp)

def calculate_latency(start_time: str, end_time: str) -> Optional[float]:
    start = parse_timestamp(start_time)
    end = parse_timestamp(end_time)
    if start and end:
        return (end - start).total_seconds()
    else:
        return None

def extract_latency_data(trace_json: Dict[str, Any]) -> List[float]:
    latencies = []
    for call_type in ["llm_calls", "tool_calls"]:
        if call_type in trace_json:
            for call in trace_json[call_type]:
                if "start_time" in call and "end_time" in call:
                    latency = calculate_latency(call["start_time"], call["end_time"])
                    if latency is not None:
                        latencies.append(latency)
    return latencies

def execute_response_latency_metric(trace_json: Dict[str, Any], config: Dict[str, Any] = {}) -> Dict[str, Any]:
    try:
        # Extract latency data
        latencies = extract_latency_data(trace_json)

        if not latencies:
            return {
                "metric_name": "response_latency",
                "config": config,
                "result": {
                    "score": None,
                    "reason": "No response latencies found in the trace.",
                },
            }

        # Calculate primary statistics
        average_latency = sum(latencies) / len(latencies)
        min_latency = min(latencies)
        max_latency = max(latencies)

        # Calculate additional statistics
        latencies.sort()
        median_latency = latencies[len(latencies) // 2]
        p90_latency = latencies[int(len(latencies) * 0.9)]
        std_dev_latency = (sum((x - average_latency) ** 2 for x in latencies) / len(latencies)) ** 0.5

        return {
            "metric_name": "response_latency",
            "config": config,
            "result": {
                "score": average_latency,
                "reason": "Response latency metrics successfully calculated.",
                "details": {
                    "average_latency": average_latency,
                    "min_latency": min_latency,
                    "max_latency": max_latency,
                    "median_latency": median_latency,
                    "p90_latency": p90_latency,
                    "std_dev_latency": std_dev_latency,
                },
            },
        }

    except Exception as e:
        return {
            "metric_name": "response_latency",
            "config": config,
            "result": {
                "score": None,
                "reason": f"An error occurred during evaluation: {str(e)}",
            },
        }
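
For orientation, a minimal usage sketch of the new metric follows. The `sample_trace` values are hypothetical, but the shape matches what `extract_latency_data` reads (top-level `llm_calls` / `tool_calls` lists whose entries carry ISO-8601 `start_time` / `end_time` fields), and the import path follows the package layout in this PR.

```python
# Hypothetical trace fragment; field names mirror what extract_latency_data expects.
from agentneo.evaluation.metrics import execute_response_latency_metric

sample_trace = {
    "llm_calls": [
        {"start_time": "2025-01-16T10:00:00", "end_time": "2025-01-16T10:00:02"},
        {"start_time": "2025-01-16T10:00:05", "end_time": "2025-01-16T10:00:06"},
    ],
    "tool_calls": [
        {"start_time": "2025-01-16T10:00:03", "end_time": "2025-01-16T10:00:04"},
    ],
}

result = execute_response_latency_metric(trace_json=sample_trace)
print(result["result"]["score"])    # mean latency in seconds (~1.33 here)
print(result["result"]["details"])  # min/max/median/p90/std dev
```

With these numbers the score (mean latency) comes out to roughly 1.33 s; note that `median_latency` and `p90_latency` are taken by simple list indexing after sorting, so on very small samples the p90 value can coincide with the maximum.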
1 change: 1 addition & 0 deletions docs/content/index.md
@@ -55,6 +55,7 @@ Welcome to AgentNeo! This documentation will help you understand and utilize Age
- Goal Decomposition Efficiency
- Goal Fulfillment Rate
- Tool Call Metrics
- Response Latency
- Error Detection Rate
- Context Retention Rate
- [Configuration](metrics/configuration.md)
17 changes: 14 additions & 3 deletions docs/content/metrics/supported-metrics.md
@@ -56,12 +56,21 @@ Measures the reliability of tool executions.
exe.evaluate(metric_list=['tool_call_success_rate'])
```

### 5. Error Detection Rate
### 5. Response Latency
Measures the time taken to complete LLM and tool calls.

```python
exe.evaluate(metric_list=['response_latency'])
```

### 6. Error Detection Rate
Measures the system's ability to detect and identify errors during execution.

```python
exe.evaluate(metric_list=['error_detection_rate'])
### 5. Context Retention Rate
```

### 7. Context Retention Rate
Measures how well context is retained across calls.

```python
@@ -73,8 +82,9 @@ exe.evaluate(metric_list=['context_retention_rate'])
config = {
    "model": "gpt-4o-mini",
}
```
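
As a side note, a hedged sketch of how such a config might be passed when running metrics — the `config` keyword on `evaluate` is an assumption inferred from the `_execute_metric(self, metric, config, metadata, custom_criteria, context)` signature shown earlier in this diff, not a confirmed part of the public API.

```python
# Assumed invocation; the config keyword is inferred from _execute_metric's
# parameters in agentneo/evaluation/evaluation.py and may differ in practice.
config = {
    "model": "gpt-4o-mini",
}

# 'exe' is the Evaluation object used in the snippets above.
exe.evaluate(
    metric_list=['goal_fulfillment_rate', 'response_latency'],
    config=config,
)
```

The `response_latency` metric itself only reads timestamps from the trace, so the model setting has no effect on it.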

### 6. Custom User Defined Metrics
### 8. Custom User Defined Metrics
Enables users to define their own personalized metrics.

##### Default metrics
@@ -157,6 +167,7 @@ exe.evaluate(
        'goal_fulfillment_rate',
        'tool_call_correctness_rate',
        'tool_call_success_rate',
        'response_latency',
        'error_detection_rate',
        'context_retention_rate'
    ]
