diff --git a/.env.example b/.env.example
index df2af46..1d73364 100644
--- a/.env.example
+++ b/.env.example
@@ -2,4 +2,5 @@ OPENAI_API_KEY="your_openai_api_key"
 GROQ_API_KEY="your_groq_api_key"
 ANTHROPIC_API_KEY="your_anthropic_api_key"
 AZURE_OPENAI_API_KEY="your_azure_openai_api_key"
-# Additional keys
\ No newline at end of file
+# Additional keys
+WORKSPACE_DIR="agent_workspace"
\ No newline at end of file
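For reference, a minimal sketch of how the new `WORKSPACE_DIR` entry would be consumed at runtime. The variable name comes from the hunk above; the `python-dotenv` loading pattern is an assumption, though the same pattern already appears in `gpt4_vision_api.py`:

```python
# Hypothetical consumer of the new WORKSPACE_DIR setting; the
# load_dotenv() call mirrors how other modules in this repo read .env.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env into the process environment
workspace = os.getenv("WORKSPACE_DIR", "agent_workspace")
os.makedirs(workspace, exist_ok=True)  # agents write artifacts here
```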
"OpenAIFunctionCaller", "OllamaModel", + "GroundedSAMTwo", ] diff --git a/swarm_models/base_embedding_model.py b/swarm_models/base_embedding_model.py deleted file mode 100644 index 215cb7e..0000000 --- a/swarm_models/base_embedding_model.py +++ /dev/null @@ -1,73 +0,0 @@ -from __future__ import annotations - -from abc import ABC, abstractmethod -from dataclasses import dataclass - -import numpy as np -from typing import Callable -from swarm_models.artifacts.text_artifact import TextArtifact -from swarm_models.utils.exponential_backoff import ( - ExponentialBackoffMixin, -) - - -@dataclass -class BaseEmbeddingModel( - ExponentialBackoffMixin, - ABC, - # SerializableMixin -): - """ - Attributes: - model: The name of the model to use. - tokenizer: An instance of `BaseTokenizer` to use when calculating tokens. - """ - - model: str = None - tokenizer: Callable = None - chunker: Callable = None - - def embed_text_artifact( - self, artifact: TextArtifact - ) -> list[float]: - return self.embed_string(artifact.to_text()) - - def embed_string(self, string: str) -> list[float]: - for attempt in self.retrying(): - with attempt: - if ( - self.tokenizer - and self.tokenizer.count_tokens(string) - > self.tokenizer.max_tokens - ): - return self._embed_long_string(string) - else: - return self.try_embed_chunk(string) - - else: - raise RuntimeError("Failed to embed string.") - - @abstractmethod - def try_embed_chunk(self, chunk: str) -> list[float]: ... - - def _embed_long_string(self, string: str) -> list[float]: - """Embeds a string that is too long to embed in one go.""" - chunks = self.chunker.chunk(string) - - embedding_chunks = [] - length_chunks = [] - for chunk in chunks: - embedding_chunks.append(self.try_embed_chunk(chunk.value)) - length_chunks.append(len(chunk)) - - # generate weighted averages - embedding_chunks = np.average( - embedding_chunks, axis=0, weights=length_chunks - ) - - # normalize length to 1 - embedding_chunks = embedding_chunks / np.linalg.norm( - embedding_chunks - ) - - return embedding_chunks.tolist() diff --git a/swarm_models/base_llm.py b/swarm_models/base_llm.py index d740b22..982c477 100644 --- a/swarm_models/base_llm.py +++ b/swarm_models/base_llm.py @@ -4,10 +4,9 @@ import time from abc import abstractmethod from typing import List, Optional -from swarm_models.structs.base_structure import BaseStructure -class BaseLLM(BaseStructure): +class BaseLLM: """Abstract Language Model that defines the interface for all language models Args: diff --git a/swarm_models/gpt4_vision_api.py b/swarm_models/gpt4_vision_api.py index 36fd8cf..c52e32e 100644 --- a/swarm_models/gpt4_vision_api.py +++ b/swarm_models/gpt4_vision_api.py @@ -8,7 +8,7 @@ import requests from dotenv import load_dotenv from termcolor import colored -from swarm_models.utils.loguru_logger import logger +from loguru import logger from swarm_models.base_multimodal_model import BaseMultiModalModel # Load environment variables diff --git a/swarm_models/ollama_model.py b/swarm_models/ollama_model.py index 1604ee5..95ce223 100644 --- a/swarm_models/ollama_model.py +++ b/swarm_models/ollama_model.py @@ -15,7 +15,7 @@ class Message(BaseModel): role: str = Field( ..., - regex="^(user|system|assistant)$", + pattern="^(user|system|assistant)$", description="The role of the message sender.", ) content: str = Field( diff --git a/swarm_models/openai_function_caller.py b/swarm_models/openai_function_caller.py index d716bbf..de5841d 100644 --- a/swarm_models/openai_function_caller.py +++ b/swarm_models/openai_function_caller.py @@ 
diff --git a/swarm_models/openai_function_caller.py b/swarm_models/openai_function_caller.py
index d716bbf..de5841d 100644
--- a/swarm_models/openai_function_caller.py
+++ b/swarm_models/openai_function_caller.py
@@ -1,7 +1,7 @@
 import openai
 from pydantic import BaseModel
 import os
-from swarm_models.utils.loguru_logger import logger
+from loguru import logger
 from swarm_models.base_llm import BaseLLM
 from typing import List
 
diff --git a/swarm_models/sam_two.py b/swarm_models/sam_two.py
new file mode 100644
index 0000000..e269289
--- /dev/null
+++ b/swarm_models/sam_two.py
@@ -0,0 +1,221 @@
+import json
+import os
+from typing import Dict, Optional
+from loguru import logger
+from pydantic import BaseModel
+import importlib
+import subprocess
+import sys
+
+
+# Define a Pydantic model to handle JSON output format
+class AnnotationResult(BaseModel):
+    image_name: str
+    annotated_image_path: Optional[str] = None
+    detections: list
+
+
+class GroundedSAMTwo:
+    def __init__(
+        self,
+        ontology_dict: Dict[str, str],
+        model_name: str = "Grounding DINO",
+        grounding_dino_box_threshold: float = 0.25,
+        extension: str = "jpg",
+        output_dir: Optional[str] = None,
+    ):
+        """
+        Initialize the GroundedSAMTwo class with a caption ontology and load the model.
+
+        :param ontology_dict: Dictionary for mapping captions to classes, e.g., {"shipping container": "container"}.
+        :param model_name: Name of the model to use (default: "Grounding DINO").
+        :param grounding_dino_box_threshold: Threshold for the bounding box confidence (default: 0.25).
+        :param extension: Image file extension used when labeling a dataset (default: "jpg").
+        :param output_dir: Optional directory where annotated images are saved.
+        """
+        self.ontology_dict = ontology_dict
+        self.model_name = model_name
+        self.grounding_dino_box_threshold = (
+            grounding_dino_box_threshold
+        )
+        self.extension = extension
+        self.output_dir = output_dir
+        self.base_model = None  # Model will be loaded lazily
+
+        logger.info(
+            "GroundedSAMTwo initialized with model: {}, box threshold: {}",
+            model_name,
+            grounding_dino_box_threshold,
+        )
+
+    def _install_and_import(self):
+        """
+        Install and import the necessary packages at runtime.
+        """
+        try:
+            # Dynamically import required modules
+            global cv2, sv, CaptionOntology, GroundedSAM2
+
+            cv2 = importlib.import_module("cv2")
+            sv = importlib.import_module("supervision")
+            CaptionOntology = importlib.import_module(
+                "autodistill.detection"
+            ).CaptionOntology
+            GroundedSAM2 = importlib.import_module(
+                "autodistill_grounded_sam_2"
+            ).GroundedSAM2
+        except ImportError:
+            logger.warning(
+                "Some packages are missing. Installing required packages: supervision, autodistill, autodistill-grounded-sam-2"
+            )
+            subprocess.check_call(
+                [
+                    sys.executable,
+                    "-m",
+                    "pip",
+                    "install",
+                    "-U",
+                    "supervision",
+                    "autodistill",
+                    "autodistill-grounded-sam-2",
+                ]
+            )
+            # Retry imports after installation
+            self._install_and_import()
+
+    def _load_model(self):
+        """
+        Lazily load the GroundedSAM2 model.
+        """
+        if self.base_model is None:
+            self._install_and_import()  # Install and import required packages
+            self.base_model = GroundedSAM2(
+                ontology=CaptionOntology(self.ontology_dict),
+                model=self.model_name,
+                grounding_dino_box_threshold=self.grounding_dino_box_threshold,
+            )
+            logger.info("GroundedSAM2 model loaded.")
+
+    def run(self, input_path: str) -> Optional[str]:
+        """
+        Annotate an image or label a dataset directory.
+
+        :param input_path: Path to an image or directory of images.
+        :return: JSON string output of the annotation results or None.
+        """
+        self._load_model()  # Load the model if not already loaded
+
+        if os.path.isdir(input_path):
+            logger.info("Processing directory: {}", input_path)
+            return self._label_dataset(input_path, self.extension)
+        else:
+            logger.info("Processing single image: {}", input_path)
+            return self._annotate_single_image(
+                input_path, self.output_dir
+            )
+
+    def _label_dataset(
+        self, image_dir: str, extension: str = "jpg"
+    ) -> str:
+        """
+        Label all images in the provided directory.
+
+        :param image_dir: Directory containing images to label.
+        :param extension: Image file extension (default: "jpg").
+        :return: JSON string of the annotated results for the dataset.
+        """
+        self._load_model()
+        logger.info(
+            "Labeling dataset in directory: {} with extension: {}",
+            image_dir,
+            extension,
+        )
+        self.base_model.label(image_dir, extension=extension)
+
+        # Output results (could be adjusted to store or handle annotations)
+        result = {"directory": image_dir, "status": "Labeled"}
+        return json.dumps(result, indent=4)
+
+    def _annotate_single_image(
+        self, image_path: str, output_dir: Optional[str] = None
+    ) -> str:
+        """
+        Annotate a single image and optionally save the annotated image.
+
+        :param image_path: Path to the image.
+        :param output_dir: Optional directory to save the annotated image.
+        :return: JSON string of the annotation result.
+        """
+        self._load_model()
+        logger.info("Annotating image: {}", image_path)
+
+        try:
+            # Make predictions and apply non-max suppression
+            results = self.base_model.predict(image_path).with_nms()
+            results = results[results.confidence > 0.3]
+
+            # Read the image and annotate
+            image = cv2.imread(image_path)
+            box_annotator = sv.BoxAnnotator()
+            annotated_image = box_annotator.annotate(
+                image.copy(), detections=results
+            )
+
+            # Display the annotated image
+            sv.plot_image(image=annotated_image, size=(8, 8))
+
+            # Save the annotated image if an output directory is provided
+            annotated_image_path = None
+            if output_dir:
+                os.makedirs(output_dir, exist_ok=True)
+                annotated_image_path = os.path.join(
+                    output_dir, os.path.basename(image_path)
+                )
+                cv2.imwrite(annotated_image_path, annotated_image)
+                logger.info(
+                    "Annotated image saved to: {}",
+                    annotated_image_path,
+                )
+
+            # Convert detections to plain dicts; sv.Detections has no
+            # to_dict() method and is not directly JSON-serializable
+            detections_list = [
+                {
+                    "xyxy": box.tolist(),
+                    "confidence": float(conf),
+                    "class_id": int(cls),
+                }
+                for box, conf, cls in zip(
+                    results.xyxy,
+                    results.confidence,
+                    results.class_id,
+                )
+            ]
+
+            # Prepare the JSON result using Pydantic
+            annotation_result = AnnotationResult(
+                image_name=os.path.basename(image_path),
+                annotated_image_path=annotated_image_path,
+                detections=detections_list,
+            )
+
+            # Return the result as a JSON string
+            return annotation_result.model_dump_json(indent=4)
+        except Exception as e:
+            logger.error("Error during image annotation: {}", e)
+            return json.dumps({"error": str(e)})
+
+
+# Example usage:
+# ontology = {"shipping container": "container"}
+# runner = GroundedSAMTwo(ontology, output_dir="annotated_images")
+#
+# # Run on a single image
+# image_path = "path/to/your/image.jpg"
+# json_output = runner.run(image_path)
+# logger.info("Annotation result: \n{}", json_output)
+#
+# # Run on a dataset (directory)
+# image_dir = "path/to/your/dataset"
+# json_output = runner.run(image_dir)
+# logger.info("Dataset labeling result: \n{}", json_output)
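Since `run()` returns a JSON string rather than objects, here is a short sketch of consuming its output, assuming the `AnnotationResult` schema defined above (on failure the payload instead contains a single `error` key). The image path is a placeholder:

```python
import json

from swarm_models.sam_two import GroundedSAMTwo

runner = GroundedSAMTwo({"shipping container": "container"})
json_output = runner.run("path/to/your/image.jpg")  # placeholder path

payload = json.loads(json_output)
if "error" not in payload:
    print(payload["image_name"], payload.get("annotated_image_path"))
    for det in payload["detections"]:
        # field names match the detection dicts built in _annotate_single_image
        print(det["class_id"], det["confidence"], det["xyxy"])
```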
+ """ + self._load_model() # Load the model if not already loaded + + if os.path.isdir(input_path): + logger.info("Processing directory: {}", input_path) + return self._label_dataset(input_path, self.extension) + else: + logger.info("Processing single image: {}", input_path) + return self._annotate_single_image( + input_path, self.output_dir + ) + + def _label_dataset( + self, image_dir: str, extension: str = "jpg" + ) -> str: + """ + Label all images in the provided directory. + + :param image_dir: Directory containing images to label. + :param extension: Image file extension (default: "jpg"). + :return: JSON string of the annotated results for the dataset. + """ + self._load_model() + logger.info( + "Labeling dataset in directory: {} with extension: {}", + image_dir, + extension, + ) + self.base_model.label(image_dir, extension=extension) + + # Output results (could be adjusted to store or handle annotations) + result = {"directory": image_dir, "status": "Labeled"} + return json.dumps(result, indent=4) + + def _annotate_single_image( + self, image_path: str, output_dir: Optional[str] = None + ) -> str: + """ + Annotate a single image and optionally save the annotated image. + + :param image_path: Path to the image. + :param output_dir: Optional directory to save the annotated image. + :return: JSON string of the annotation result. + """ + self._load_model() + logger.info("Annotating image: {}", image_path) + + try: + # Make predictions and apply non-max suppression + results = self.base_model.predict(image_path).with_nms() + results = results[results.confidence > 0.3] + + # Read the image and annotate + image = cv2.imread(image_path) + mask_annotator = sv.BoxAnnotator() + annotated_image = mask_annotator.annotate( + image.copy(), detections=results + ) + + # Display the annotated image + sv.plot_image(image=annotated_image, size=(8, 8)) + + # Save the annotated image if an output directory is provided + annotated_image_path = None + if output_dir: + os.makedirs(output_dir, exist_ok=True) + annotated_image_path = os.path.join( + output_dir, os.path.basename(image_path) + ) + cv2.imwrite(annotated_image_path, annotated_image) + logger.info( + "Annotated image saved to: {}", + annotated_image_path, + ) + + # Prepare the JSON result using Pydantic + annotation_result = AnnotationResult( + image_name=os.path.basename(image_path), + annotated_image_path=annotated_image_path, + detections=results.to_dict( + orient="records" + ), # Assuming the results object supports conversion to dict + ) + + # Return the result as a JSON string + return annotation_result.json(indent=4) + except Exception as e: + logger.error("Error during image annotation: {}", e) + return json.dumps({"error": str(e)}) + + +# Example usage: +# ontology = {"shipping container": "container"} +# runner = GroundedSAMTwo(ontology) +# +# # Run on a single image +# image_path = "path/to/your/image.jpg" +# json_output = runner.run(image_path, output_dir="annotated_images") +# logger.info("Annotation result: \n{}", json_output) +# +# # Run on a dataset (directory) +# image_dir = "path/to/your/dataset" +# json_output = runner.run(image_dir) +# logger.info("Dataset labeling result: \n{}", json_output) diff --git a/swarm_models/utils/__init__.py b/swarm_models/utils/__init__.py new file mode 100644 index 0000000..3365cec --- /dev/null +++ b/swarm_models/utils/__init__.py @@ -0,0 +1,3 @@ +from swarm_models.utils.download_img_from_url import download_image + +__all__ = ["download_image"] diff --git 
diff --git a/swarm_models/utils/download_img_from_url.py b/swarm_models/utils/download_img_from_url.py
new file mode 100644
index 0000000..034ec6a
--- /dev/null
+++ b/swarm_models/utils/download_img_from_url.py
@@ -0,0 +1,80 @@
+import requests
+from loguru import logger
+import os
+from typing import Optional
+from urllib.parse import urlparse
+import time
+
+
+def download_image(
+    url: str, save_dir: str, retries: int = 3, timeout: int = 10
+) -> Optional[str]:
+    """
+    Downloads an image from a given URL and saves it to the specified directory.
+
+    :param url: URL of the image to download.
+    :param save_dir: Directory where the image will be saved.
+    :param retries: Number of retries in case of failure (default: 3).
+    :param timeout: Timeout for the request in seconds (default: 10).
+    :return: Path to the saved image or None if the download fails.
+    """
+    logger.info(f"Starting download of image from {url}")
+
+    # Parse the image file name from the URL
+    parsed_url = urlparse(url)
+    image_name = os.path.basename(parsed_url.path)
+
+    if not image_name:
+        logger.error(f"Could not parse image name from URL: {url}")
+        return None
+
+    # Ensure the save directory exists
+    os.makedirs(save_dir, exist_ok=True)
+    save_path = os.path.join(save_dir, image_name)
+
+    attempt = 0
+    while attempt < retries:
+        try:
+            # Attempt to download the image
+            logger.info(
+                f"Attempt {attempt + 1} to download {image_name} from {url}"
+            )
+            response = requests.get(url, timeout=timeout)
+
+            # Check if the request was successful
+            if response.status_code == 200:
+                # Save the image to the specified path
+                with open(save_path, "wb") as f:
+                    f.write(response.content)
+                logger.info(
+                    f"Successfully downloaded and saved image to {save_path}"
+                )
+                return save_path
+            else:
+                logger.error(
+                    f"Failed to download image. Status code: {response.status_code}"
+                )
+
+        except requests.exceptions.RequestException as e:
+            # Handle request errors and log them
+            logger.error(f"Error downloading image from {url}: {e}")
+
+        # Wait before retrying if not successful
+        attempt += 1
+        if attempt < retries:
+            logger.warning(
+                f"Retrying download in 3 seconds... ({attempt}/{retries})"
+            )
+            time.sleep(3)
+
+    logger.error(
+        f"Failed to download image from {url} after {retries} attempts"
+    )
+    return None
+
+
+# Example usage
+# if __name__ == "__main__":
+#     image_url = "https://example.com/path/to/image.jpg"
+#     save_directory = "downloaded_images"
+#     download_image(image_url, save_directory)
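Finally, a hedged end-to-end sketch tying the two new modules together: fetch a remote image with `download_image`, then annotate it with `GroundedSAMTwo`. The URL and ontology are placeholders:

```python
# Sketch only: combines the two utilities added in this patch.
from swarm_models.sam_two import GroundedSAMTwo
from swarm_models.utils import download_image

saved_path = download_image(
    "https://example.com/path/to/image.jpg", "downloaded_images"
)
if saved_path is not None:
    runner = GroundedSAMTwo(
        {"shipping container": "container"},
        output_dir="annotated_images",
    )
    print(runner.run(saved_path))  # JSON string with the detections
```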