From 39443df711db8dbaceca0f3692de4ea567e3d0e1 Mon Sep 17 00:00:00 2001
From: Miguel Neves <61327611+MiNeves00@users.noreply.github.com>
Date: Wed, 5 Feb 2025 09:20:08 +0000
Subject: [PATCH] feat: added support for o3-mini and updated o1-mini prices.
 also updated integration tests to support o3 (#202)

---
 examples/core.py                          |   4 +-
 libs/core/llmstudio_core/config.yaml      |  12 +-
 .../test_cache_and_reasoning_costs.py     | 192 ++++++++++++++++++
 .../test_cache_and_reasoning_costs.py     |  11 +-
 4 files changed, 208 insertions(+), 11 deletions(-)
 create mode 100644 libs/core/tests/integration_tests/test_cache_and_reasoning_costs.py

diff --git a/examples/core.py b/examples/core.py
index 255bcae4..e84ea89b 100644
--- a/examples/core.py
+++ b/examples/core.py
@@ -97,7 +97,7 @@ async def async_stream():
     return latencies
 
 def build_chat_request(model: str, chat_input: str, is_stream: bool, max_tokens: int=1000):
-    if model == "o1-preview" or model == "o1-mini":
+    if model.startswith(('o1', 'o3')):
         chat_request = {
             "chat_input": chat_input,
             "model": model,
@@ -132,7 +132,7 @@ def multiple_provider_runs(provider:str, model:str, num_runs:int, api_key:str, *
 
 # OpenAI
 multiple_provider_runs(provider="openai", model="gpt-4o-mini", api_key=os.environ["OPENAI_API_KEY"], num_runs=1)
-multiple_provider_runs(provider="openai", model="o1-mini", api_key=os.environ["OPENAI_API_KEY"], num_runs=1)
+multiple_provider_runs(provider="openai", model="o3-mini", api_key=os.environ["OPENAI_API_KEY"], num_runs=1)
 
 #multiple_provider_runs(provider="openai", model="o1-preview", api_key=os.environ["OPENAI_API_KEY"], num_runs=1)
 
diff --git a/libs/core/llmstudio_core/config.yaml b/libs/core/llmstudio_core/config.yaml
index 86b84c96..b608bd01 100644
--- a/libs/core/llmstudio_core/config.yaml
+++ b/libs/core/llmstudio_core/config.yaml
@@ -213,9 +213,15 @@ providers:
       o1-mini:
         mode: chat
         max_completion_tokens: 128000
-        input_token_cost: 0.000003
-        cached_token_cost: 0.0000015
-        output_token_cost: 0.000012
+        input_token_cost: 0.0000011
+        cached_token_cost: 0.00000055
+        output_token_cost: 0.0000044
+      o3-mini:
+        mode: chat
+        max_completion_tokens: 200000
+        input_token_cost: 0.0000011
+        cached_token_cost: 0.00000055
+        output_token_cost: 0.0000044
       gpt-4o-mini:
         mode: chat
         max_tokens: 128000
diff --git a/libs/core/tests/integration_tests/test_cache_and_reasoning_costs.py b/libs/core/tests/integration_tests/test_cache_and_reasoning_costs.py
new file mode 100644
index 00000000..8dd3b18c
--- /dev/null
+++ b/libs/core/tests/integration_tests/test_cache_and_reasoning_costs.py
@@ -0,0 +1,192 @@
+import asyncio
+import os
+from pprint import pprint
+
+import pytest
+from dotenv import load_dotenv
+from llmstudio_core.providers import LLMCore
+
+load_dotenv()
+
+# input prompt has to be >1024 tokens to auto cache on OpenAI
+input_prompt = """
+What is Lorem Ipsum? json
+Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
+
+Why do we use it?
+It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like).
+
+
+Where does it come from?
+Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.
+
+What is Lorem Ipsum? json
+Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
+
+Why do we use it?
+It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like).
+
+
+Where does it come from?
+Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.
+
+What is Lorem Ipsum? json
+Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.
+
+Why do we use it?
+It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like).
+
+
+Where does it come from?
+Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.
+
+After reading this answer just in one line, saying hello how are you in latin.
+"""
+
+
+def run_provider(provider, model, api_key, **kwargs):
+    print(f"\n\n###RUNNING for <{provider}>, <{model}> ###")
+    llm = LLMCore(provider=provider, api_key=api_key, **kwargs)
+
+    metrics = {}
+
+    print("\nAsync Non-Stream")
+
+    chat_request = build_chat_request(model, chat_input=input_prompt, is_stream=False)
+
+    response_async = asyncio.run(llm.achat(**chat_request))
+    pprint(response_async)
+    metrics["async non-stream"] = response_async.metrics
+
+    print("\nAsync Stream")
+
+    async def async_stream():
+        chat_request = build_chat_request(
+            model, chat_input=input_prompt, is_stream=True
+        )
+
+        response_async = await llm.achat(**chat_request)
+        async for p in response_async:
+            if not p.metrics:
+                print("that: ", p.chat_output_stream)
+            if p.metrics:
+                pprint(p)
+                metrics["async stream"] = p.metrics
+
+    asyncio.run(async_stream())
+
+    print("\nSync Non-Stream")
+    chat_request = build_chat_request(model, chat_input=input_prompt, is_stream=False)
+
+    response_sync = llm.chat(**chat_request)
+    pprint(response_sync)
+    metrics["sync non-stream"] = response_sync.metrics
+
+    print("\nSync Stream")
+    chat_request = build_chat_request(model, chat_input=input_prompt, is_stream=True)
+
+    response_sync_stream = llm.chat(**chat_request)
+    for p in response_sync_stream:
+        if p.metrics:
+            pprint(p)
+            metrics["sync stream"] = p.metrics
+
+    print(f"\n\n###REPORT for <{provider}>, <{model}> ###")
+    return metrics
+
+
+def build_chat_request(
+    model: str, chat_input: str, is_stream: bool, max_tokens: int = 1000
+):
+    if model.startswith(("o1", "o3")):
+        chat_request = {
+            "chat_input": chat_input,
+            "model": model,
+            "is_stream": is_stream,
+            "retries": 0,
+            "parameters": {"max_completion_tokens": max_tokens},
+        }
+    else:
+        chat_request = {
+            "chat_input": chat_input,
+            "model": model,
+            "is_stream": is_stream,
+            "retries": 0,
+            "parameters": {
+                "temperature": 0,
+                "max_tokens": max_tokens,
+                "response_format": {"type": "json_object"},
+                "functions": None,
+            },
+        }
+    return chat_request
+
+
+# Fixture for provider-model pairs
+@pytest.fixture(
+    scope="module", params=[("openai", "gpt-4o-mini"), ("openai", "o1-mini")]
+)
+def provider_model(request):
+    return request.param
+
+
+# Fixture for metrics, computes them once per provider-model pair
+@pytest.fixture(scope="module")
+def metrics(provider_model):
+    provider, model = provider_model
+    print(f"Running provider {provider} with model {model}")
+    return run_provider(
+        provider=provider, model=model, api_key=os.environ["OPENAI_API_KEY"]
+    )
+
+
+# Test cache metrics (runs for all models)
+def test_metrics_cache(provider_model, metrics):
+    provider, model = provider_model
+    at_least_one_cached = False
+    for current_metrics in metrics.values():
+        print(current_metrics)
+        assert current_metrics["input_tokens"] > current_metrics["cached_tokens"]
+        if current_metrics["cached_tokens"] > 0:
+            at_least_one_cached = True
+    assert at_least_one_cached == True
+    print(f"All Cache Tests Passed for {provider} - {model}")
+
+
+# Test reasoning metrics (only runs for o1 and o3 family)
+def test_metrics_reasoning(provider_model, metrics):
+    provider, model = provider_model
+
+    if not model.startswith(("o1", "o3")):
+        pytest.skip(f"Reasoning metrics test not applicable for model {model}")
+
+    for current_metrics in metrics.values():
+        assert current_metrics["reasoning_tokens"] > 0
+        assert current_metrics["reasoning_tokens"] < current_metrics["total_tokens"]
+        assert (
+            current_metrics["input_tokens"]
+            + current_metrics["output_tokens"]
+            + current_metrics["reasoning_tokens"]
+            == current_metrics["total_tokens"]
+        )
+    print(f"All Reasoning Tests Passed for {provider} - {model}")
+
+
+def usage_when_max_tokens_reached():
+    """
+    Usefull to test handling of Usage in other finish_reason scenarios
+    """
+    provider, model = ("openai", "o1-mini")
+    api_key = os.environ["OPENAI_API_KEY"]
+
+    llm = LLMCore(provider=provider, api_key=api_key)
+    chat_request = build_chat_request(
+        model, chat_input=input_prompt, is_stream=False, max_tokens=7
+    )
+    response = asyncio.run(llm.achat(**chat_request))
+
+    assert response.metrics["async non-stream"]["reasoning_tokens"] > 0
+    assert response.metrics["sync non-stream"]["reasoning_tokens"] > 0
+    assert response.metrics["async stream"]["reasoning_tokens"] > 0
+    assert response.metrics["sync stream"]["reasoning_tokens"] > 0
+    print(f"All Max Tokens Usage Reached Tests Passed for {provider} - {model}")
diff --git a/libs/llmstudio/tests/integration_tests/test_cache_and_reasoning_costs.py b/libs/llmstudio/tests/integration_tests/test_cache_and_reasoning_costs.py
index 4a910814..50f2777b 100644
--- a/libs/llmstudio/tests/integration_tests/test_cache_and_reasoning_costs.py
+++ b/libs/llmstudio/tests/integration_tests/test_cache_and_reasoning_costs.py
@@ -98,7 +98,7 @@ async def async_stream():
 def build_chat_request(
     model: str, chat_input: str, is_stream: bool, max_tokens: int = 1000
 ):
-    if model == "o1-preview" or model == "o1-mini":
+    if model.startswith(("o1", "o3")):
         chat_request = {
             "chat_input": chat_input,
             "model": model,
@@ -124,7 +124,7 @@ def build_chat_request(
 
 # Fixture for provider-model pairs
 @pytest.fixture(
-    scope="module", params=[("openai", "gpt-4o-mini"), ("openai", "o1-mini")]
+    scope="module", params=[("openai", "gpt-4o-mini"), ("openai", "o3-mini")]
 )
 def provider_model(request):
     return request.param
@@ -153,12 +153,11 @@ def test_metrics_cache(provider_model, metrics):
     print(f"All Cache Tests Passed for {provider} - {model}")
 
 
-# Test reasoning metrics (only runs for o1-mini and o1-preview)
+# Test reasoning metrics (only runs for o1 and o3 family)
 def test_metrics_reasoning(provider_model, metrics):
     provider, model = provider_model
 
-    # Skip test if model is not o1-mini or o1-preview
-    if model not in ["o1-mini", "o1-preview"]:
+    if not model.startswith(("o1", "o3")):
         pytest.skip(f"Reasoning metrics test not applicable for model {model}")
 
     for current_metrics in metrics.values():
@@ -177,7 +176,7 @@ def usage_when_max_tokens_reached():
     """
     Usefull to test handling of Usage in other finish_reason scenarios
     """
-    provider, model = ("openai", "o1-mini")
+    provider, model = ("openai", "o3-mini")
     api_key = os.environ["OPENAI_API_KEY"]
 
     llm = LLMCore(provider=provider, api_key=api_key)