
Commit fbdbdab
Merge pull request #10 from valentinfrlch/dev
Added support for 'detail' parameter
valentinfrlch authored May 31, 2024
2 parents 7246472 + 9e499a4 commit fbdbdab
Showing 6 changed files with 67 additions and 39 deletions.
README.md (15 changes: 9 additions & 6 deletions)
````diff
@@ -108,20 +108,23 @@ To get GPT's analysis of a local image, use the following service call.
 ```yaml
 service: gpt4vision.image_analyzer
 data:
-  provider: OpenAI
-  message: Describe what you see?
   max_tokens: 100
+  message: Describe what you see
-  model: gpt-4o
   image_file: |-
     /config/www/tmp/example.jpg
     /config/www/tmp/example2.jpg
+  provider: LocalAI
+  model: gpt-4-vision-preview
   target_width: 1280
+  detail: low
   temperature: 0.5
 ```
-The parameters `message`, `max_tokens`, `image_file`, `provider` and `temperature` are required. You can send multiple images per service call. Note that each path must be on a new line.
-
-Optionally, the `model` and `target_width` properties can be set. For available models check these pages: [OpenAI](https://platform.openai.com/docs/models) and [LocalAI](https://localai.io/models/).
+The parameters `provider`, `message`, `max_tokens`, `image_file` and `temperature` are required. You can send multiple images per service call. Note that each path must be on a new line.
+
+Optionally, the `model`, `target_width` and `detail` properties can be set.
+- For available **models** check these pages: [supported models for OpenAI](https://platform.openai.com/docs/models) and [LocalAI model gallery](https://localai.io/models/).
+- The **target_width** is an integer between 640 and 3840 representing the image width in pixels. It is used to downscale the image before encoding it.
+- The **detail** parameter can be set to `low` or `high`. If it is not set, it defaults to `auto` and OpenAI will use the image size to determine the detail level. For more information check the [OpenAI documentation](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
 ## How to report a bug or request a feature
 > [!NOTE]
 > **Bugs:** If you encounter any bugs and have followed the instructions carefully, feel free to file a bug report.
````
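As a usage illustration (not part of the diff): with the new parameter, a call that forces high fidelity on OpenAI follows the same schema as the example above. The values below are illustrative:

```yaml
service: gpt4vision.image_analyzer
data:
  provider: OpenAI
  message: Describe what you see
  max_tokens: 100
  image_file: /config/www/tmp/example.jpg
  detail: high
  temperature: 0.5
```

Per OpenAI's documentation, `high` samples the image at higher resolution and consumes more tokens, while `low` is cheaper and faster.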
custom_components/gpt4vision/__init__.py (40 changes: 23 additions & 17 deletions)
```diff
@@ -1,18 +1,19 @@
 # Declare variables
 from .const import (
     DOMAIN,
-    CONF_PROVIDER,
     CONF_OPENAI_API_KEY,
-    CONF_MAXTOKENS,
-    CONF_TARGET_WIDTH,
-    CONF_MODEL,
-    CONF_MESSAGE,
-    CONF_IMAGE_FILE,
     CONF_LOCALAI_IP_ADDRESS,
     CONF_LOCALAI_PORT,
     CONF_OLLAMA_IP_ADDRESS,
     CONF_OLLAMA_PORT,
-    CONF_TEMPERATURE
+    PROVIDER,
+    MAXTOKENS,
+    TARGET_WIDTH,
+    MODEL,
+    MESSAGE,
+    IMAGE_FILE,
+    TEMPERATURE,
+    DETAIL
 )
 from .request_handlers import (
     handle_localai_request,
```
```diff
@@ -22,11 +23,14 @@
 import base64
 import io
 import os
+import logging
 from homeassistant.helpers.aiohttp_client import async_get_clientsession
 from homeassistant.core import SupportsResponse
 from homeassistant.exceptions import ServiceValidationError
 from PIL import Image

+_LOGGER = logging.getLogger(__name__)
+

 async def async_setup_entry(hass, entry):
     """Set up gpt4vision from a config entry."""
```
```diff
@@ -103,30 +107,32 @@ async def image_analyzer(data_call):
         ollama_port = hass.data.get(DOMAIN, {}).get(CONF_OLLAMA_PORT)

         # Read data from service call
-        mode = str(data_call.data.get(CONF_PROVIDER))
+        mode = str(data_call.data.get(PROVIDER))
         # Message to be sent to AI model
-        message = str(data_call.data.get(CONF_MESSAGE)[0:2000])
+        message = str(data_call.data.get(MESSAGE)[0:2000])
         # Local path to your image. Example: "/config/www/images/garage.jpg"
-        image_path = data_call.data.get(CONF_IMAGE_FILE)
+        image_path = data_call.data.get(IMAGE_FILE)
         # create a list of image paths (separator: newline character)
         image_paths = image_path.split("\n")
         # Resolution (width only) of the image. Example: 1280 for 720p etc.
-        target_width = data_call.data.get(CONF_TARGET_WIDTH, 1280)
+        target_width = data_call.data.get(TARGET_WIDTH, 1280)
         # Temperature parameter. Default is 0.5
-        temperature = float(data_call.data.get(CONF_TEMPERATURE, 0.5))
+        temperature = float(data_call.data.get(TEMPERATURE, 0.5))
         # Maximum number of tokens used by model. Default is 100.
-        max_tokens = int(data_call.data.get(CONF_MAXTOKENS))
+        max_tokens = int(data_call.data.get(MAXTOKENS))
+        # Detail one of ["high", "low", "auto"] default is "auto"
+        detail = str(data_call.data.get(DETAIL, "auto"))

         # Validate configuration and input data and set model
         if mode == 'OpenAI':
             validate(mode, api_key, image_paths)
-            model = str(data_call.data.get(CONF_MODEL, "gpt-4o"))
+            model = str(data_call.data.get(MODEL, "gpt-4o"))
         elif mode == 'LocalAI':
             validate(mode, None, image_paths, localai_ip_address, localai_port)
-            model = str(data_call.data.get(CONF_MODEL, "gpt-4-vision-preview"))
+            model = str(data_call.data.get(MODEL, "gpt-4-vision-preview"))
         elif mode == 'Ollama':
             validate(mode, None, image_paths, ollama_ip_address, ollama_port)
-            model = str(data_call.data.get(CONF_MODEL, "llava"))
+            model = str(data_call.data.get(MODEL, "llava"))


         def encode_image(image_path):
```
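Note that `detail` is read with a default of `"auto"` but is not validated against the accepted values, so a typo would only surface later as an API error. A hypothetical guard (a sketch, not part of this commit) could reuse the integration's existing `ServiceValidationError`:

```python
from homeassistant.exceptions import ServiceValidationError

# Hypothetical helper, not part of this commit: fail fast on detail values
# the OpenAI API would reject.
VALID_DETAIL = {"high", "low", "auto"}


def validate_detail(detail: str) -> str:
    """Return detail unchanged, or raise if it is not an accepted value."""
    if detail not in VALID_DETAIL:
        raise ServiceValidationError(
            f"detail must be one of {sorted(VALID_DETAIL)}, got {detail!r}"
        )
    return detail
```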
```diff
@@ -171,7 +177,7 @@ def encode_image(image_path):
             response_text = await handle_localai_request(session, model, message, base64_images, localai_ip_address, localai_port, max_tokens, temperature)

         elif mode == "OpenAI":
-            response_text = await handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature)
+            response_text = await handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature, detail)

         elif mode == 'Ollama':
             response_text = await handle_ollama_request(session, model, message, base64_images, ollama_ip_address, ollama_port, max_tokens, temperature)
```
custom_components/gpt4vision/config_flow.py (19 changes: 14 additions & 5 deletions)
```diff
@@ -3,8 +3,7 @@
 from homeassistant.exceptions import ServiceValidationError
 from homeassistant.helpers.aiohttp_client import async_get_clientsession
 from .const import (
-    DOMAIN,
-    CONF_PROVIDER,
+    DOMAIN,
     CONF_OPENAI_API_KEY,
     CONF_LOCALAI_IP_ADDRESS,
     CONF_LOCALAI_PORT,
```
```diff
@@ -19,7 +18,7 @@

 async def validate_mode(user_input: dict):
     # check CONF_MODE is not empty
-    if not user_input[CONF_PROVIDER]:
+    if not user_input["provider"]:
         raise ServiceValidationError("empty_mode")


```
```diff
@@ -33,6 +32,7 @@ async def validate_localai(hass, user_input: dict):
         raise ServiceValidationError("empty_port")
     # perform handshake with LocalAI server
     if not await validate_connection(hass, user_input[CONF_LOCALAI_IP_ADDRESS], user_input[CONF_LOCALAI_PORT], "/readyz"):
+        _LOGGER.error("Could not connect to LocalAI server.")
         raise ServiceValidationError("handshake_failed")


```
```diff
@@ -46,12 +46,14 @@ async def validate_ollama(hass, user_input: dict):
         raise ServiceValidationError("empty_port")
     # perform handshake with LocalAI server
     if not await validate_connection(hass, user_input[CONF_OLLAMA_IP_ADDRESS], user_input[CONF_OLLAMA_PORT], "/api/tags"):
+        _LOGGER.error("Could not connect to Ollama server.")
         raise ServiceValidationError("handshake_failed")


 def validate_openai(user_input: dict):
     # check CONF_API_KEY is not empty
     if not user_input[CONF_OPENAI_API_KEY]:
+        _LOGGER.error("OpenAI API key is empty.")
         raise ServiceValidationError("empty_api_key")


```
```diff
@@ -65,6 +67,7 @@ async def validate_connection(hass, ip_address, port, endpoint, expected_status=
         else:
             return False
     except Exception as e:
+        _LOGGER.error(f"Could not connect to {url}: {e}")
         return False


```
```diff
@@ -86,16 +89,19 @@ async def async_step_user(self, user_input=None):

         if user_input is not None:
             self.init_info = user_input
-            if user_input[CONF_PROVIDER] == "LocalAI":
+            if user_input["provider"] == "LocalAI":
                 if DOMAIN in self.hass.data and CONF_LOCALAI_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_LOCALAI_PORT in self.hass.data[DOMAIN]:
+                    _LOGGER.error("LocalAI already configured.")
                     return self.async_abort(reason="already_configured")
                 return await self.async_step_localai()
-            elif user_input[CONF_PROVIDER] == "Ollama":
+            elif user_input["provider"] == "Ollama":
                 if DOMAIN in self.hass.data and CONF_OLLAMA_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_OLLAMA_PORT in self.hass.data[DOMAIN]:
+                    _LOGGER.error("Ollama already configured.")
                     return self.async_abort(reason="already_configured")
                 return await self.async_step_ollama()
             else:
                 if DOMAIN in self.hass.data and CONF_OPENAI_API_KEY in self.hass.data[DOMAIN]:
+                    _LOGGER.error("OpenAI already configured.")
                     return self.async_abort(reason="already_configured")
                 return await self.async_step_openai()

```
```diff
@@ -117,6 +123,7 @@ async def async_step_localai(self, user_input=None):
                 # add the mode to user_input
                 return self.async_create_entry(title="GPT4Vision LocalAI", data=user_input)
             except ServiceValidationError as e:
+                _LOGGER.error(f"Validation failed: {e}")
                 return self.async_show_form(
                     step_id="localai",
                     data_schema=data_schema,
```
```diff
@@ -140,6 +147,7 @@ async def async_step_ollama(self, user_input=None):
                 # add the mode to user_input
                 return self.async_create_entry(title="GPT4Vision Ollama", data=user_input)
             except ServiceValidationError as e:
+                _LOGGER.error(f"Validation failed: {e}")
                 return self.async_show_form(
                     step_id="ollama",
                     data_schema=data_schema,
```
```diff
@@ -163,6 +171,7 @@ async def async_step_openai(self, user_input=None):
                 user_input["provider"] = self.init_info["provider"]
                 return self.async_create_entry(title="GPT4Vision OpenAI", data=user_input)
             except ServiceValidationError as e:
+                _LOGGER.error(f"Validation failed: {e}")
                 return self.async_show_form(
                     step_id="openai",
                     data_schema=data_schema,
```
custom_components/gpt4vision/const.py (19 changes: 10 additions & 9 deletions)
```diff
@@ -1,20 +1,21 @@
 """ Constants for gpt4vision component"""

-# Global values
+# Global constants
 DOMAIN = "gpt4vision"

 # Configuration values from setup
-CONF_PROVIDER = 'provider'
 CONF_OPENAI_API_KEY = 'api_key'
 CONF_LOCALAI_IP_ADDRESS = 'localai_ip'
 CONF_LOCALAI_PORT = 'localai_port'
 CONF_OLLAMA_IP_ADDRESS = 'ollama_ip'
 CONF_OLLAMA_PORT = 'ollama_port'

-# Values from service call
-CONF_MAXTOKENS = 'max_tokens'
-CONF_TARGET_WIDTH = 'target_width'
-CONF_MODEL = 'model'
-CONF_MESSAGE = 'message'
-CONF_IMAGE_FILE = 'image_file'
-CONF_TEMPERATURE = 'temperature'
+# service call constants
+PROVIDER = 'provider'
+MAXTOKENS = 'max_tokens'
+TARGET_WIDTH = 'target_width'
+MODEL = 'model'
+MESSAGE = 'message'
+IMAGE_FILE = 'image_file'
+TEMPERATURE = 'temperature'
+DETAIL = 'detail'
```
custom_components/gpt4vision/request_handlers.py (4 changes: 2 additions & 2 deletions)
```diff
@@ -31,7 +31,7 @@ async def handle_localai_request(session, model, message, base64_images, ip_addr
     return response_text


-async def handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature):
+async def handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature, detail):
     headers = {'Content-type': 'application/json',
                'Authorization': 'Bearer ' + api_key}
     data = {"model": model,
```
```diff
@@ -45,7 +45,7 @@ async def handle_openai_request(session, model, message, base64_images, api_key,
     # Add the images to the request
     for image in base64_images:
         data["messages"][0]["content"].append(
-            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}})
+            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": detail}})

     try:
         response = await session.post(
```
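For context, the request body that `handle_openai_request` posts to OpenAI's `/v1/chat/completions` endpoint now carries the detail hint inside every `image_url` entry. A sketch of the resulting payload shape, with illustrative values rather than captured output:

```python
# Illustrative payload shape after this change; values are examples.
data = {
    "model": "gpt-4o",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what you see"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "data:image/jpeg;base64,<base64-image>",
                        "detail": "low",  # "low", "high", or "auto"
                    },
                },
            ],
        }
    ],
    "max_tokens": 100,
    "temperature": 0.5,
}
```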
custom_components/gpt4vision/services.yaml (9 changes: 9 additions & 0 deletions)
```diff
@@ -50,6 +50,15 @@ image_analyzer:
        number:
          min: 640
          max: 3840
+  detail:
+    required: false
+    description: "Detail parameter (OpenAI only), leave empty for 'auto'"
+    default: 'high'
+    selector:
+      select:
+        options:
+          - 'high'
+          - 'low'
   temperature:
     required: true
     description: 'Randomness. Lower is more accurate, higher is more creative'
```
