
Commit fbdbdab
Merge pull request #10 from valentinfrlch/dev
Added support for 'detail' parameter
valentinfrlch authored May 31, 2024
2 parents 7246472 + 9e499a4 commit fbdbdab
Showing 6 changed files with 67 additions and 39 deletions.
README.md (15 changes: 9 additions & 6 deletions)
````diff
@@ -108,20 +108,23 @@ To get GPT's analysis of a local image, use the following service call.
 ```yaml
 service: gpt4vision.image_analyzer
 data:
-  provider: OpenAI
-  message: Describe what you see?
   max_tokens: 100
+  message: Describe what you see
-  model: gpt-4o
   image_file: |-
     /config/www/tmp/example.jpg
     /config/www/tmp/example2.jpg
+  provider: LocalAI
+  model: gpt-4-vision-preview
   target_width: 1280
+  detail: low
   temperature: 0.5
 ```
-The parameters `message`, `max_tokens`, `image_file`, `provider` and `temperature` are required. You can send multiple images per service call. Note that each path must be on a new line.
-
-Optionally, the `model` and `target_width` properties can be set. For available models check these pages: [OpenAI](https://platform.openai.com/docs/models) and [LocalAI](https://localai.io/models/).
+The parameters `provider`, `message`, `max_tokens`, `image_file` and `temperature` are required. You can send multiple images per service call. Note that each path must be on a new line.
+
+Optionally, the `model`, `target_width` and `detail` properties can be set.
+- For available **models** check these pages: [supported models for OpenAI](https://platform.openai.com/docs/models) and [LocalAI model gallery](https://localai.io/models/).
+- The **target_width** is an integer between 640 and 3840 representing the image width in pixels. It is used to downscale the image before encoding it.
+- The **detail** parameter can be set to `low` or `high`. If it is not set, it defaults to `auto` and OpenAI will use the image size to determine the detail level. For more information check the [OpenAI documentation](https://platform.openai.com/docs/guides/vision/low-or-high-fidelity-image-understanding).
 ## How to report a bug or request a feature
 > [!NOTE]
 > **Bugs:** If you encounter any bugs and have followed the instructions carefully, feel free to file a bug report.
````
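As a usage illustration (not part of the diff): with the new parameter, a call that forces high fidelity on OpenAI follows the same schema as the example above. The values below are illustrative:

```yaml
service: gpt4vision.image_analyzer
data:
  provider: OpenAI
  message: Describe what you see
  max_tokens: 100
  image_file: /config/www/tmp/example.jpg
  detail: high
  temperature: 0.5
```

Per OpenAI's documentation, `high` samples the image at higher resolution and consumes more tokens, while `low` is cheaper and faster.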
custom_components/gpt4vision/__init__.py (40 changes: 23 additions & 17 deletions)
```diff
@@ -1,18 +1,19 @@
 # Declare variables
 from .const import (
     DOMAIN,
-    CONF_PROVIDER,
     CONF_OPENAI_API_KEY,
-    CONF_MAXTOKENS,
-    CONF_TARGET_WIDTH,
-    CONF_MODEL,
-    CONF_MESSAGE,
-    CONF_IMAGE_FILE,
     CONF_LOCALAI_IP_ADDRESS,
     CONF_LOCALAI_PORT,
     CONF_OLLAMA_IP_ADDRESS,
     CONF_OLLAMA_PORT,
-    CONF_TEMPERATURE
+    PROVIDER,
+    MAXTOKENS,
+    TARGET_WIDTH,
+    MODEL,
+    MESSAGE,
+    IMAGE_FILE,
+    TEMPERATURE,
+    DETAIL
 )
 from .request_handlers import (
     handle_localai_request,
```
```diff
@@ -22,11 +23,14 @@
 import base64
 import io
 import os
+import logging
 from homeassistant.helpers.aiohttp_client import async_get_clientsession
 from homeassistant.core import SupportsResponse
 from homeassistant.exceptions import ServiceValidationError
 from PIL import Image

+_LOGGER = logging.getLogger(__name__)
+

 async def async_setup_entry(hass, entry):
     """Set up gpt4vision from a config entry."""
```
```diff
@@ -103,30 +107,32 @@ async def image_analyzer(data_call):
         ollama_port = hass.data.get(DOMAIN, {}).get(CONF_OLLAMA_PORT)

         # Read data from service call
-        mode = str(data_call.data.get(CONF_PROVIDER))
+        mode = str(data_call.data.get(PROVIDER))
         # Message to be sent to AI model
-        message = str(data_call.data.get(CONF_MESSAGE)[0:2000])
+        message = str(data_call.data.get(MESSAGE)[0:2000])
         # Local path to your image. Example: "/config/www/images/garage.jpg"
-        image_path = data_call.data.get(CONF_IMAGE_FILE)
+        image_path = data_call.data.get(IMAGE_FILE)
         # create a list of image paths (separator: newline character)
         image_paths = image_path.split("\n")
         # Resolution (width only) of the image. Example: 1280 for 720p etc.
-        target_width = data_call.data.get(CONF_TARGET_WIDTH, 1280)
+        target_width = data_call.data.get(TARGET_WIDTH, 1280)
         # Temperature parameter. Default is 0.5
-        temperature = float(data_call.data.get(CONF_TEMPERATURE, 0.5))
+        temperature = float(data_call.data.get(TEMPERATURE, 0.5))
         # Maximum number of tokens used by model. Default is 100.
-        max_tokens = int(data_call.data.get(CONF_MAXTOKENS))
+        max_tokens = int(data_call.data.get(MAXTOKENS))
+        # Detail one of ["high", "low", "auto"] default is "auto"
+        detail = str(data_call.data.get(DETAIL, "auto"))

         # Validate configuration and input data and set model
         if mode == 'OpenAI':
             validate(mode, api_key, image_paths)
-            model = str(data_call.data.get(CONF_MODEL, "gpt-4o"))
+            model = str(data_call.data.get(MODEL, "gpt-4o"))
         elif mode == 'LocalAI':
             validate(mode, None, image_paths, localai_ip_address, localai_port)
-            model = str(data_call.data.get(CONF_MODEL, "gpt-4-vision-preview"))
+            model = str(data_call.data.get(MODEL, "gpt-4-vision-preview"))
         elif mode == 'Ollama':
             validate(mode, None, image_paths, ollama_ip_address, ollama_port)
-            model = str(data_call.data.get(CONF_MODEL, "llava"))
+            model = str(data_call.data.get(MODEL, "llava"))


         def encode_image(image_path):
```
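Note that `detail` is read with a default of `"auto"` but is not validated against the accepted values, so a typo would only surface later as an API error. A hypothetical guard (a sketch, not part of this commit) could reuse the integration's existing `ServiceValidationError`:

```python
from homeassistant.exceptions import ServiceValidationError

# Hypothetical helper, not part of this commit: fail fast on detail values
# the OpenAI API would reject.
VALID_DETAIL = {"high", "low", "auto"}


def validate_detail(detail: str) -> str:
    """Return detail unchanged, or raise if it is not an accepted value."""
    if detail not in VALID_DETAIL:
        raise ServiceValidationError(
            f"detail must be one of {sorted(VALID_DETAIL)}, got {detail!r}"
        )
    return detail
```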
```diff
@@ -171,7 +177,7 @@ def encode_image(image_path):
             response_text = await handle_localai_request(session, model, message, base64_images, localai_ip_address, localai_port, max_tokens, temperature)

         elif mode == "OpenAI":
-            response_text = await handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature)
+            response_text = await handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature, detail)

         elif mode == 'Ollama':
             response_text = await handle_ollama_request(session, model, message, base64_images, ollama_ip_address, ollama_port, max_tokens, temperature)
```
custom_components/gpt4vision/config_flow.py (19 changes: 14 additions & 5 deletions)
```diff
@@ -3,8 +3,7 @@
 from homeassistant.exceptions import ServiceValidationError
 from homeassistant.helpers.aiohttp_client import async_get_clientsession
 from .const import (
-    DOMAIN,
-    CONF_PROVIDER,
+    DOMAIN,
     CONF_OPENAI_API_KEY,
     CONF_LOCALAI_IP_ADDRESS,
     CONF_LOCALAI_PORT,
```
```diff
@@ -19,7 +18,7 @@

 async def validate_mode(user_input: dict):
     # check CONF_MODE is not empty
-    if not user_input[CONF_PROVIDER]:
+    if not user_input["provider"]:
         raise ServiceValidationError("empty_mode")


```
```diff
@@ -33,6 +32,7 @@ async def validate_localai(hass, user_input: dict):
         raise ServiceValidationError("empty_port")
     # perform handshake with LocalAI server
     if not await validate_connection(hass, user_input[CONF_LOCALAI_IP_ADDRESS], user_input[CONF_LOCALAI_PORT], "/readyz"):
+        _LOGGER.error("Could not connect to LocalAI server.")
         raise ServiceValidationError("handshake_failed")


```
```diff
@@ -46,12 +46,14 @@ async def validate_ollama(hass, user_input: dict):
         raise ServiceValidationError("empty_port")
     # perform handshake with LocalAI server
     if not await validate_connection(hass, user_input[CONF_OLLAMA_IP_ADDRESS], user_input[CONF_OLLAMA_PORT], "/api/tags"):
+        _LOGGER.error("Could not connect to Ollama server.")
         raise ServiceValidationError("handshake_failed")


 def validate_openai(user_input: dict):
     # check CONF_API_KEY is not empty
     if not user_input[CONF_OPENAI_API_KEY]:
+        _LOGGER.error("OpenAI API key is empty.")
         raise ServiceValidationError("empty_api_key")


```
```diff
@@ -65,6 +67,7 @@ async def validate_connection(hass, ip_address, port, endpoint, expected_status=
         else:
             return False
     except Exception as e:
+        _LOGGER.error(f"Could not connect to {url}: {e}")
         return False


```
```diff
@@ -86,16 +89,19 @@ async def async_step_user(self, user_input=None):

         if user_input is not None:
             self.init_info = user_input
-            if user_input[CONF_PROVIDER] == "LocalAI":
+            if user_input["provider"] == "LocalAI":
                 if DOMAIN in self.hass.data and CONF_LOCALAI_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_LOCALAI_PORT in self.hass.data[DOMAIN]:
+                    _LOGGER.error("LocalAI already configured.")
                     return self.async_abort(reason="already_configured")
                 return await self.async_step_localai()
-            elif user_input[CONF_PROVIDER] == "Ollama":
+            elif user_input["provider"] == "Ollama":
                 if DOMAIN in self.hass.data and CONF_OLLAMA_IP_ADDRESS in self.hass.data[DOMAIN] and CONF_OLLAMA_PORT in self.hass.data[DOMAIN]:
+                    _LOGGER.error("Ollama already configured.")
                     return self.async_abort(reason="already_configured")
                 return await self.async_step_ollama()
             else:
                 if DOMAIN in self.hass.data and CONF_OPENAI_API_KEY in self.hass.data[DOMAIN]:
+                    _LOGGER.error("OpenAI already configured.")
                     return self.async_abort(reason="already_configured")
                 return await self.async_step_openai()

```
```diff
@@ -117,6 +123,7 @@ async def async_step_localai(self, user_input=None):
                 # add the mode to user_input
                 return self.async_create_entry(title="GPT4Vision LocalAI", data=user_input)
             except ServiceValidationError as e:
+                _LOGGER.error(f"Validation failed: {e}")
                 return self.async_show_form(
                     step_id="localai",
                     data_schema=data_schema,
```
```diff
@@ -140,6 +147,7 @@ async def async_step_ollama(self, user_input=None):
                 # add the mode to user_input
                 return self.async_create_entry(title="GPT4Vision Ollama", data=user_input)
             except ServiceValidationError as e:
+                _LOGGER.error(f"Validation failed: {e}")
                 return self.async_show_form(
                     step_id="ollama",
                     data_schema=data_schema,
```
```diff
@@ -163,6 +171,7 @@ async def async_step_openai(self, user_input=None):
                 user_input["provider"] = self.init_info["provider"]
                 return self.async_create_entry(title="GPT4Vision OpenAI", data=user_input)
             except ServiceValidationError as e:
+                _LOGGER.error(f"Validation failed: {e}")
                 return self.async_show_form(
                     step_id="openai",
                     data_schema=data_schema,
```
custom_components/gpt4vision/const.py (19 changes: 10 additions & 9 deletions)
```diff
@@ -1,20 +1,21 @@
 """ Constants for gpt4vision component"""

-# Global values
+# Global constants
 DOMAIN = "gpt4vision"

 # Configuration values from setup
-CONF_PROVIDER = 'provider'
 CONF_OPENAI_API_KEY = 'api_key'
 CONF_LOCALAI_IP_ADDRESS = 'localai_ip'
 CONF_LOCALAI_PORT = 'localai_port'
 CONF_OLLAMA_IP_ADDRESS = 'ollama_ip'
 CONF_OLLAMA_PORT = 'ollama_port'

-# Values from service call
-CONF_MAXTOKENS = 'max_tokens'
-CONF_TARGET_WIDTH = 'target_width'
-CONF_MODEL = 'model'
-CONF_MESSAGE = 'message'
-CONF_IMAGE_FILE = 'image_file'
-CONF_TEMPERATURE = 'temperature'
+# service call constants
+PROVIDER = 'provider'
+MAXTOKENS = 'max_tokens'
+TARGET_WIDTH = 'target_width'
+MODEL = 'model'
+MESSAGE = 'message'
+IMAGE_FILE = 'image_file'
+TEMPERATURE = 'temperature'
+DETAIL = 'detail'
```
custom_components/gpt4vision/request_handlers.py (4 changes: 2 additions & 2 deletions)
```diff
@@ -31,7 +31,7 @@ async def handle_localai_request(session, model, message, base64_images, ip_addr
     return response_text


-async def handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature):
+async def handle_openai_request(session, model, message, base64_images, api_key, max_tokens, temperature, detail):
     headers = {'Content-type': 'application/json',
                'Authorization': 'Bearer ' + api_key}
     data = {"model": model,
```
```diff
@@ -45,7 +45,7 @@ async def handle_openai_request(session, model, message, base64_images, api_key,
     # Add the images to the request
     for image in base64_images:
         data["messages"][0]["content"].append(
-            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}})
+            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}", "detail": detail}})

     try:
         response = await session.post(
```
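For context, the request body that `handle_openai_request` posts to OpenAI's `/v1/chat/completions` endpoint now carries the detail hint inside every `image_url` entry. A sketch of the resulting payload shape, with illustrative values rather than captured output:

```python
# Illustrative payload shape after this change; values are examples.
data = {
    "model": "gpt-4o",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe what you see"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "data:image/jpeg;base64,<base64-image>",
                        "detail": "low",  # "low", "high", or "auto"
                    },
                },
            ],
        }
    ],
    "max_tokens": 100,
    "temperature": 0.5,
}
```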
custom_components/gpt4vision/services.yaml (9 changes: 9 additions & 0 deletions)
```diff
@@ -50,6 +50,15 @@ image_analyzer:
        number:
          min: 640
          max: 3840
+  detail:
+    required: false
+    description: "Detail parameter (OpenAI only), leave empty for 'auto'"
+    default: 'high'
+    selector:
+      select:
+        options:
+          - 'high'
+          - 'low'
   temperature:
     required: true
     description: 'Randomness. Lower is more accurate, higher is more creative'
```
