feat: add ragas_cli pip pkg

kubeagi · Jan 15, 2024 · c23b163 · c23b163
1 parent ef1b6f9
commit c23b163
Show file tree

Hide file tree

Showing 5 changed files with 225 additions and 88 deletions.
diff --git a/evaluation/README.md b/evaluation/README.md
@@ -0,0 +1,54 @@
+# Ragas CLI
+
+A one-step Ragas cli tool to evaluate QCAG testsets generated by RAG apps. (Q = Question, C = Contexts, A = Answer, G = Ground_truth)
+
+## Install with pip
+
+```bash
+pip install ragacli
+```
+
+## Arguments
+
+- `--model`: Specifies the model to use for evaluation.
+    - Default value is "gpt-3.5-turbo". Langchain compatible.
+- `--api_base`: Specifies the base URL for the API.
+    - Default value is "https://api.openai.com/v1".
+- `--api_key`: Specifies the API key to authenticate requests. 
+    - Not required when using a pseudo-OpenAI API server, e.g. vLLM, FastChat, etc.
+- `--embeddings`: Specifies the Huggingface embeddings model to use for evaluation. 
+    - Embeddings will run **locally**.
+    - Will use OpenAI embeddings if not set.
+    - Recommended when using a pseudo-OpenAI API server.
+- `--metrics`: Specifies the metrics to use for evaluation.
+    - Will use Ragas default metrics if not set.
+    - Default metrics: `["answer_relevancy", "context_precision", "faithfulness", "context_recall", "context_relevancy"]`
+    - Other metrics: `"answer_similarity", "answer_correctness"`
+- `--dataset`: Specifies the path to the dataset for evaluation.    
+    - Dataset format must meet RAGAS requirements.
+    - Will use fiqa dataset as demo if not set.
+
+## Usage
+
+### Fiqa dataset demo:
+
+```bash
+python3 -m ragacli --api_key "YOUR_OPENAI_API_KEY"
+```
+
+### Evaluate with GPT-4 and `BAAI/bge-small-en` embeddings
+
+The Hugging Face embeddings run locally, so **make sure your machine can run them and has [sentence-transformers](https://pypi.org/project/sentence-transformers/) installed:**
+
+```bash
+pip install sentence-transformers
+```
+Then run:
+
+```bash
+python3 -m ragacli --model "gpt-4" --api_key "YOUR_OPENAI_API_KEY" --embeddings "BAAI/bge-small-en" --dataset "path/to/dataset.csv"
+```
+
+### Prepare Dataset
+
+See [**Ragas documentation**](https://docs.ragas.io/en/stable/howtos/applications/data_preparation.html)
diff --git a/evaluation/ragas-sample.py b/evaluation/ragas-sample.py
diff --git a/evaluation/run/run.py b/evaluation/run/run.py
@@ -0,0 +1,48 @@
+import argparse
+import src.pkg as pkg
+from ragas import evaluate
+from datasets import load_dataset
+
+def run_evaluation():
+    parser = argparse.ArgumentParser(description='RAGAS CLI')
+    parser.add_argument("--model", type=str, default="gpt-3.5-turbo",
+                        help="Specifies the model to use for evaluation. Defaults to gpt-3.5-turbo.")
+    parser.add_argument("--api_base", type=str, default="https://api.openai.com/v1",
+                        help="Specifies the base URL for the API. Defaults to OpenAI.")
+    parser.add_argument("--api_key", type=str,
+                        help="Specifies the API key to authenticate requests.")
+    parser.add_argument("--embeddings", type=str,
+                        help="Specifies Huggingface embeddings model (or its path) to use for evaluation. Will use OpenAI embeddings if not set.")
+    parser.add_argument("--metrics", type=list, default=[],
+                        help="Specifies the metrics to use for evaluation.")
+    parser.add_argument("--dataset", type=str,
+                        help="Specifies the path to the dataset for evaluation. Will use fiqa dataset if not set.")
+
+    args = parser.parse_args()
+
+    model = args.model
+    api_base = args.api_base
+    api_key = args.api_key
+    metrics = args.metrics
+    dataset = args.dataset
+
+    judge_model = pkg.wrap_langchain_llm(model, api_base, api_key)
+
+    embeddings_model_name = args.embeddings
+
+    if embeddings_model_name:
+        embeddings = pkg.wrap_embeddings('huggingface', embeddings_model_name, None)
+    else:
+        embeddings = pkg.wrap_embeddings('openai', None, api_key)
+
+    if dataset:
+        test_set = load_dataset('csv', data_files=dataset)
+    else:
+        print('test_set not provided, using fiqa dataset')
+        fiqa = load_dataset('explodinggradients/fiqa', 'ragas_eval')
+        test_set = fiqa["baseline"].select(range(5))
+
+    ms = pkg.set_metrics(metrics, judge_model, embeddings, metrics)
+
+    return evaluate(test_set, ms)
+
diff --git a/evaluation/setup.py b/evaluation/setup.py
@@ -0,0 +1,27 @@
+# make a setup.py for evaluation package
+
+from setuptools import setup, find_packages
+
+with open("README.md", "r", encoding="utf-8") as f:
+    long_description = f.read()
+
+setup(
+    name="ragacli",
+    version="0.0.1",
+    author="Kielo",
+    author_email="[email protected]",
+    description="A one-step cli tool for RAGAS",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    packages=find_packages(),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.8",
+    install_requires=[
+        'ragas',
+        'langchain==0.0.354'
+    ]
+)
diff --git a/evaluation/src/pkg.py b/evaluation/src/pkg.py
@@ -0,0 +1,96 @@
+import os
+from langchain.chat_models import ChatOpenAI
+from ragas.llms import RagasLLM
+from ragas.llms import LangchainLLM
+from ragas.embeddings import RagasEmbeddings
+from ragas.embeddings import OpenAIEmbeddings
+from ragas.embeddings import HuggingfaceEmbeddings
+from ragas.metrics.base import Metric
+
+from ragas.metrics import (
+    context_precision,
+    context_recall,
+    context_relevancy,
+    answer_relevancy,
+    answer_correctness,
+    answer_similarity,
+    faithfulness
+)
+
+DEFAULT_METRICS = [
+        "answer_relevancy",
+        "context_precision",
+        "faithfulness",
+        "context_recall",
+        "context_relevancy"
+    ]
+
+def wrap_langchain_llm(
+    model: str,
+    api_base: str | None,
+    api_key: str | None
+) -> LangchainLLM:
+    if api_base is None:
+        print('api_base not provided, assuming OpenAI default')
+        api_base = 'https://api.openai.com/v1'
+        os.environ["OPENAI_API_KEY"] = api_key
+        if api_key is None:
+            raise ValueError("api_key must be provided")
+        base = ChatOpenAI(model_name=model)
+    else:
+        os.environ["OPENAI_API_KEY"] = api_key
+        os.environ["OPENAI_API_BASE"] = api_base
+        base = ChatOpenAI(
+            model_name=model,
+            openai_api_key=api_key,
+            openai_api_base=api_base
+        )
+    return LangchainLLM(llm=base)
+
+
+def set_metrics(
+    metrics: list[str],
+    llm: RagasLLM | None,
+    embeddings: RagasEmbeddings | None
+) -> list[Metric]:
+    ms = []
+    if llm:
+        context_precision.llm = llm
+        context_recall.llm = llm
+        context_relevancy.llm = llm
+        answer_correctness.llm = llm
+        answer_similarity.llm = llm
+        faithfulness.llm = llm
+    if embeddings:
+        answer_relevancy.embeddings = embeddings
+        answer_correctness.embeddings = embeddings
+    if not metrics:
+        metrics = DEFAULT_METRICS
+    for m in metrics:
+        if m == 'context_precision':
+            ms.append(context_precision)
+        elif m == 'context_recall':
+            ms.append(context_recall)
+        elif m == 'context_relevancy':
+            ms.append(context_relevancy)
+        elif m == 'answer_relevancy':
+            ms.append(answer_relevancy)
+        elif m == 'answer_correctness':
+            ms.append(answer_correctness)
+        elif m == 'answer_similarity':
+            ms.append(answer_similarity)
+        elif m == 'faithfulness':
+            ms.append(faithfulness)
+    return ms
+
+def wrap_embeddings(
+    model_type: str,
+    model_name: str | None,
+    api_key: str | None
+) -> RagasEmbeddings:
+    if model_type == 'openai':
+        return OpenAIEmbeddings(api_key=api_key)
+    elif model_type == 'huggingface':
+        return HuggingfaceEmbeddings(model_name=model_name)
+    else:
+        raise ValueError(f"Invalid model type: {model_type}")