
Commit

merge main
Signed-off-by: Stephanie <[email protected]>
yangcao77 committed Jan 23, 2025
2 parents e0be713 + e794248 commit 0568e76
Showing 34 changed files with 2,691 additions and 22 deletions.
126 changes: 124 additions & 2 deletions docs/openapi.json
@@ -510,6 +510,10 @@
"username": {
"type": "string",
"title": "Username"
},
"skip_user_id_check": {
"type": "boolean",
"title": "Skip User Id Check"
}
},
"type": "object",
@@ -518,14 +522,132 @@
"username"
],
"title": "AuthorizationResponse",
"description": "Model representing a response to an authorization request.\n\nAttributes:\n user_id: The ID of the logged in user.\n username: The name of the logged in user.",
"description": "Model representing a response to an authorization request.\n\nAttributes:\n user_id: The ID of the logged in user.\n username: The name of the logged in user.\n skip_user_id_check: Skip user_id suid check.",
"examples": [
{
"user_id": "123e4567-e89b-12d3-a456-426614174000",
"username": "user1"
"username": "user1",
"skip_user_id_check": false
}
]
},
"BaseMessage": {
"additionalProperties": true,
"description": "Base abstract message class.\n\n Messages are the inputs and outputs of ChatModels.",
"properties": {
"additional_kwargs": {
"title": "Additional Kwargs",
"type": "object"
},
"content": {
"anyOf": [
{
"type": "string"
},
{
"items": {
"anyOf": [
{
"type": "string"
},
{
"type": "object"
}
]
},
"type": "array"
}
],
"title": "Content"
},
"id": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Id"
},
"name": {
"anyOf": [
{
"type": "string"
},
{
"type": "null"
}
],
"title": "Name"
},
"response_metadata": {
"title": "Response Metadata",
"type": "object"
},
"type": {
"title": "Type",
"type": "string"
}
},
"required": [
"content",
"type"
],
"title": "BaseMessage",
"type": "object"
},
"ChatHistoryResponse": {
"description": "Model representing a response to a list conversation request.\n\n Attributes:\n chat_history: List of conversation messages.",
"examples": [
{
"chat_history": [
{
"content": "what is openshift",
"type": "human"
},
{
"content": " OpenShift is a container orchestration platform built by Red Hat...",
"type": "ai"
}
]
}
],
"properties": {
"chat_history": {
"items": {
"$ref": "#/components/schemas/BaseMessage"
},
"title": "Chat History",
"type": "array"
}
},
"required": [
"chat_history"
],
"title": "ChatHistoryResponse",
"type": "object"
},
"ConversationDeletionResponse": {
"description": "Model representing a response to a conversation deletion request.\n\n Attributes:\n response: The response of the conversation deletion request.\n\n Example:\n ```python\n conversation_deletion_response = ConversationDeletionResponse(response='conversation deleted')\n ```",
"examples": [
{
"response": "conversation deleted"
}
],
"properties": {
"response": {
"title": "Response",
"type": "string"
}
},
"required": [
"response"
],
"title": "ConversationDeletionResponse",
"type": "object"
},
"ErrorResponse": {
"properties": {
"detail": {
5 changes: 5 additions & 0 deletions ols/src/prompts/prompt_generator.py
@@ -14,6 +14,11 @@
from ols.customize import prompts


def restructure_rag_context(text: str, model: str) -> str:
    """Restructure rag text by appending special characters."""
    return restructure_rag_context_post(restructure_rag_context_pre(text, model), model)


def restructure_rag_context_pre(text: str, model: str) -> str:
    """Restructure rag text - pre truncation."""
    if ModelFamily.GRANITE in model:
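
A minimal usage sketch for the new wrapper, assuming a development environment where the `ols` package is importable; the model id below is only an example and is assumed to match `ModelFamily.GRANITE`:

```python
# Hedged usage sketch: restructure_rag_context composes the existing
# pre- and post-truncation helpers, so a RAG chunk is formatted in one call.
from ols.src.prompts.prompt_generator import restructure_rag_context

chunk = "OpenShift is a container orchestration platform."
model = "granite-3-8b-instruct"  # example id; assumed to match ModelFamily.GRANITE

formatted = restructure_rag_context(chunk, model)
print(formatted)  # the chunk wrapped with the model-specific special characters
```
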
14 changes: 11 additions & 3 deletions ols/src/query_helpers/docs_summarizer.py
@@ -13,7 +13,11 @@
from ols.app.models.models import RagChunk, SummarizerResponse
from ols.constants import RAG_CONTENT_LIMIT, GenericLLMParameters
from ols.customize import prompts, reranker
from ols.src.prompts.prompt_generator import GeneratePrompt
from ols.src.prompts.prompt_generator import (
    GeneratePrompt,
    restructure_history,
    restructure_rag_context,
)
from ols.src.query_helpers.query_helper import QueryHelper
from ols.utils.token_handler import TokenHandler

@@ -86,9 +90,13 @@ def _prepare_prompt(
        # Use sample text for context/history to get complete prompt
        # instruction. This is used to calculate available tokens.
        temp_prompt, temp_prompt_input = GeneratePrompt(
            query, ["sample"], [AIMessage("sample")], self._system_prompt
            # The sample context/history must be restructured for the given model
            # so that the available-token calculation below is accurate.
            query,
            [restructure_rag_context("sample", self.model)],
            [restructure_history(AIMessage("sample"), self.model)],
            self._system_prompt,
        ).generate_prompt(self.model)

        available_tokens = token_handler.calculate_and_check_available_tokens(
            temp_prompt.format(**temp_prompt_input),
            self.model_config.context_window_size,
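
A toy illustration of why the placeholder text is restructured before counting tokens; the whitespace "tokenizer" and the wrapper string below are stand-ins, not the real `TokenHandler` or prompt format:

```python
# Illustrative only: restructuring adds model-specific wrapper text, so the
# placeholder used for the budget calculation must be restructured too,
# otherwise the available-token estimate would be too optimistic.
def toy_token_count(text: str) -> int:
    return len(text.split())  # stand-in for the real tokenizer

raw_placeholder = "sample"
restructured_placeholder = "[Document]\nsample\n[End]"  # assumed wrapper format

assert toy_token_count(restructured_placeholder) > toy_token_count(raw_placeholder)
```
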
2 changes: 1 addition & 1 deletion ols/utils/token_handler.py
@@ -104,7 +104,7 @@ def calculate_and_check_available_tokens(
            context_window_size - max_tokens_for_response - prompt_token_count
        )

        if available_tokens <= 0:
        if available_tokens < 0:
            limit = context_window_size - max_tokens_for_response
            raise PromptTooLongError(
                f"Prompt length {prompt_token_count} exceeds LLM "
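
A small worked example of the boundary case this change affects, using illustrative numbers rather than anything from the repository: an exactly-zero token budget used to raise `PromptTooLongError` and is now allowed.

```python
# Illustrative numbers only.
context_window_size = 4096
max_tokens_for_response = 512
prompt_token_count = 3584  # the prompt consumes everything that is left

available_tokens = context_window_size - max_tokens_for_response - prompt_token_count
assert available_tokens == 0  # previously rejected (<= 0), now accepted (only < 0 raises)
```
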
84 changes: 84 additions & 0 deletions scripts/evaluation/README.md
@@ -0,0 +1,84 @@
# Evaluation

## Description
Currently there are two types of evaluation.
1. `consistency`: Compares responses against the ground-truth answer for a specific provider+model. The objective of this evaluation is to flag any variation in that provider+model's responses. A combination of similarity distances is used to calculate the final score, and cut-off scores are used to flag deviations. This evaluation also stores a .csv file with the query, pre-defined answer, API response, and score. Input for this is a [json file](eval_data/question_answer_pair.json).

2. `model`: Compares responses against a single ground-truth answer. Here we can evaluate more than one provider+model at a time. This creates a json file as a summary report with scores (F1-score) for each provider+model. Along with selected QnAs from the above json file, we can also provide additional QnAs via an optional parquet file. [Sample QnA set (parquet)](eval_data/interview_qna_30_per_title.parquet) with 30 queries per OCP documentation title.

**Notes**
- QnAs should `not` be used for model training or tuning. They were created only for evaluation purposes.
- QnAs were generated from OCP docs by LLMs, so some questions/answers may not be entirely correct. We are continuously verifying both questions and answers manually; if you find a QnA pair that should be modified or removed, please create a PR.
- The OLS API should be up and running with all required provider+model combinations configured.
- When running both consistency and model evaluation together, the *model* evaluation first checks the .csv file generated by the *consistency* evaluation to avoid repeated API calls for the same query; the API is called only if the response is not already present in that csv file.

### e2e test case

These evaluations are also part of the **e2e test cases**. Currently, the *consistency* evaluation is primarily used to gate PRs. The final e2e suite also invokes the *model* evaluation, which uses the .csv files generated by earlier suites; if any of those files is missing, the last suite will fail.

### Usage
```
python -m scripts.evaluation.driver
```

### Input Data/QnA pool
[Json file](eval_data/question_answer_pair.json)

[Sample QnA set (parquet)](eval_data/interview_qna_30_per_title.parquet)

Please refer to the above files for the structure and add new data accordingly.

### Arguments
**eval_type**: Controls which evaluation to run. Currently there are 3 options.
1. `consistency` -> Compares the model-specific answers for the QnAs provided in the json file.
2. `model` -> Compares a set of models based on their responses and generates a summary report. Additional QnAs can be provided in parquet format, along with the json file.
3. `all` -> Runs both of the above evaluations.

**eval_api_url**: OLS API URL. Default is `http://localhost:8080`. If OLS is deployed in a cluster, pass the cluster API URL.

**eval_api_token_file**: Path to a text file containing the OLS API token. Required if OLS is deployed in a cluster.

**eval_scenario**: Primarily identifies which pre-defined answers need to be compared. Values can be `with_rag` or `without_rag`. Currently we always evaluate the API with RAG.

**eval_query_ids**: Option to give a set of query IDs for evaluation. By default, all queries are processed.

**eval_provider_model_id**: A set of provider/model combinations, given as IDs, to compare.

**qna_pool_file**: Applicable only to the `model` evaluation. File path to the parquet file with additional QnAs. Default is None.

**eval_out_dir**: Directory where output csv/json files will be saved.

**eval_metrics**: By default all scores/metrics are calculated; this argument decides which scores are used to create the graph. This is a list of metrics, e.g. cosine and euclidean distance, precision/recall/F1 score, answer relevancy score, and LLM-based similarity score.

**judge_provider / judge_model**: Provider and model for the judge LLM. Required for LLM-based evaluation (answer relevancy score, LLM-based similarity score). These need to be configured correctly in the config yaml file. [Sample provider/model configuration](../../examples/olsconfig.yaml)

**eval_modes**: Apart from the OLS API, we may want to evaluate the vanilla model, or the model with just the OLS parameters/prompt/RAG, to establish baseline scores. This is a list of modes, e.g. vanilla, ols_param, ols_prompt, ols_rag, and ols (the actual API). An example invocation combining several of these arguments is shown below.
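
A hedged example invocation combining several of the arguments above. The flag spellings are assumed from the argument names (check the driver's help output for the authoritative forms), and the token file path and output directory are placeholders:

```
python -m scripts.evaluation.driver \
  --eval_type consistency \
  --eval_api_url http://localhost:8080 \
  --eval_api_token_file <path/to/token.txt> \
  --eval_out_dir ./eval_output
```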

### Outputs
The evaluation scripts create the files below.
- CSV file with responses for the given provider/model & modes.
- Response evaluation result with scores (for the consistency check).
- Final csv file with all results, plus a json score summary & graph (for the model evaluation).

[Evaluation Result](eval_data/result/README.md)


# RAG retrieval script
```
python -m scripts.evaluation.query_rag
```
This generates a .csv file with the retrieved chunks and their similarity scores for a given set of queries. It is not part of the actual evaluation, but it is useful as a spot check to understand the text we send to LLMs as context (which may explain deviations in the responses). An example invocation is shown after the argument list below.

#### Arguments
*db-path*: Path to the RAG index

*product-index*: RAG index ID

*model-path*: Path or name of the embedding model

*queries*: Set of queries separated by spaces. If not passed, default queries are used.

*top-k*: Number of chunks to retrieve. Default is 10.

*output_dir*: Directory where the .csv file is saved.
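
A hedged example invocation; the flag spellings follow the argument names above, and the paths, index id, and embedding model name are placeholders:

```
python -m scripts.evaluation.query_rag \
  --db-path <path/to/rag_index> \
  --product-index <index_id> \
  --model-path <embedding_model_name_or_path> \
  --queries "what is openshift" "how do I create a pod" \
  --top-k 5 \
  --output_dir ./eval_output
```
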
1 change: 1 addition & 0 deletions scripts/evaluation/__init__.py
@@ -0,0 +1 @@
"""Modules for evaluation."""