diff --git a/multi_object_tracking/yolo_sam/.dockerignore b/multi_object_tracking/yolo_sam/.dockerignore
new file mode 100644
index 000000000..894795019
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/.dockerignore
@@ -0,0 +1,18 @@
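+# NOTE(review): presumably meant to exclude everything via `**`; as written only _wsgi.py is excluded, and `!*.py` below re-includes it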
+_wsgi.py
+
+# Include Dockerfile and docker-compose for reference (optional, decide based on your use case)
+!Dockerfile
+!docker-compose.yml
+
+# Include Python application files
+!*.py
+
+# Include requirements files
+!requirements*.txt
+
+# Include script
+!*.sh
+
+# Exclude specific requirements if necessary
+# requirements-test.txt (Uncomment if you decide to exclude this)
diff --git a/multi_object_tracking/yolo_sam/Dockerfile b/multi_object_tracking/yolo_sam/Dockerfile
new file mode 100644
index 000000000..033a3263d
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/Dockerfile
@@ -0,0 +1,73 @@
+FROM pytorch/pytorch:2.1.2-cuda12.1-cudnn8-runtime
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG TEST_ENV
+
+WORKDIR /app
+
+# Update Conda
+RUN conda update conda -y
+
+# Install system dependencies
+RUN --mount=type=cache,target="/var/cache/apt",sharing=locked \
+    --mount=type=cache,target="/var/lib/apt/lists",sharing=locked \
+    apt-get -y update \
+    && apt-get install -y git wget g++ freeglut3-dev build-essential \
+    libx11-dev libxmu-dev libxi-dev libglu1-mesa libglu1-mesa-dev \
+    libfreeimage-dev ffmpeg libsm6 libxext6 libffi-dev python3-dev \
+    python3-pip gcc
+
+# Environment variables (SEGMENT_ANYTHING_2_REPO_PATH must match the /sam2 checkout created by the git clone below)
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_CACHE_DIR=/.cache \
+    PORT=9090 \
+    WORKERS=2 \
+    THREADS=4 \
+    CUDA_HOME=/usr/local/cuda \
+    TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6+PTX;8.9;9.0" \
+    SEGMENT_ANYTHING_2_REPO_PATH=/sam2 \
+    PYTHONPATH=/app
+
+# Install CUDA toolkit via Conda
+RUN conda install -c "nvidia/label/cuda-12.1.1" cuda -y
+
+# Install Python dependencies
+COPY requirements-base.txt .
+RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
+    pip install -r requirements-base.txt
+
+COPY requirements.txt .
+RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
+    pip install -r requirements.txt
+
+# Install segment-anything-2
+RUN cd / && git clone --depth 1 --branch main --single-branch https://github.com/facebookresearch/sam2.git
+WORKDIR /sam2
+RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
+    pip install -e .
+RUN cd checkpoints && ./download_ckpts.sh
+
+# Return to app working directory
+WORKDIR /app
+
+# Install test dependencies (optional)
+COPY requirements-test.txt .
+RUN --mount=type=cache,target=${PIP_CACHE_DIR},sharing=locked \
+    if [ "$TEST_ENV" = "true" ]; then \
+      pip install -r requirements-test.txt; \
+    fi
+
+# Copy app files first: the download step below reads /app/tests/car.jpg and checks /app/models
+COPY . ./
+
+# Download YOLO models (skipped when /app/models/yolov8m.pt was already copied in)
+RUN /bin/sh -c 'if [ ! -f /app/models/yolov8m.pt ]; then \
+    yolo predict model=/app/models/yolov8m.pt source=/app/tests/car.jpg \
+    && yolo predict model=/app/models/yolov8n.pt source=/app/tests/car.jpg \
+    && yolo predict model=/app/models/yolov8n-cls.pt source=/app/tests/car.jpg \
+    && yolo predict model=/app/models/yolov8n-seg.pt source=/app/tests/car.jpg; \
+    fi'
+
+# Default command
+CMD ["/app/start.sh"]
diff --git a/multi_object_tracking/yolo_sam/README.md b/multi_object_tracking/yolo_sam/README.md
new file mode 100644
index 000000000..0bcfbca74
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/README.md
@@ -0,0 +1,58 @@
+This guide describes the simplest way to start using the ML backend with Label Studio.
+
+## Running with Docker (Recommended)
+
+1. Start the Machine Learning backend on `http://localhost:9090` with the prebuilt image:
+
+```bash
+docker-compose up
+```
+
+2. Validate that the backend is running:
+
+```bash
+$ curl http://localhost:9090/
+{"status":"UP"}
+```
+
+3. Connect to the backend from Label Studio running on the same host: go to your project `Settings -> Machine Learning -> Add Model` and specify `http://localhost:9090` as a URL.
+
+
+## Building from source (Advanced)
+
+To build the ML backend from source, you have to clone the repository and build the Docker image:
+
+```bash
+docker-compose build
+```
+
+## Running without Docker (Advanced)
+
+To run the ML backend without Docker, you have to clone the repository and install all dependencies using pip:
+
+```bash
+python -m venv ml-backend
+source ml-backend/bin/activate
+pip install -r requirements.txt
+```
+
+Then you can start the ML backend:
+
+```bash
+label-studio-ml start ./dir_with_your_model
+```
+
+# Configuration
+Parameters can be set in `docker-compose.yml` before running the container.
+
+
+The following common parameters are available:
+- `BASIC_AUTH_USER` - specify the basic auth user for the model server
+- `BASIC_AUTH_PASS` - specify the basic auth password for the model server
+- `LOG_LEVEL` - set the log level for the model server
+- `WORKERS` - specify the number of workers for the model server
+- `THREADS` - specify the number of threads for the model server
+
+# Customization
+
+The ML backend can be customized by adding your own models and logic inside the `./dir_with_your_model` directory. 
\ No newline at end of file
diff --git a/multi_object_tracking/yolo_sam/_wsgi.py b/multi_object_tracking/yolo_sam/_wsgi.py
new file mode 100644
index 000000000..789f04669
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/_wsgi.py
@@ -0,0 +1,125 @@
+import os
+import argparse
+import json
+import logging
+import logging.config
+
+# Set a default log level if LOG_LEVEL is not defined
+log_level = os.getenv("LOG_LEVEL", "INFO")
+
+logging.config.dictConfig({
+  "version": 1,
+  "disable_existing_loggers": False,  # Prevent overriding existing loggers
+  "formatters": {
+    "standard": {
+      "format": "[%(asctime)s] [%(levelname)s] [%(name)s::%(funcName)s::%(lineno)d] %(message)s"
+    }
+  },
+  "handlers": {
+    "console": {
+      "class": "logging.StreamHandler",
+      "level": log_level,
+      "stream": "ext://sys.stdout",
+      "formatter": "standard"
+    }
+  },
+  "root": {
+    "level": log_level,
+    "handlers": [
+      "console"
+    ],
+    "propagate": True
+  }
+})
+
+from label_studio_ml.api import init_app
+from model import NewModel
+
+
+_DEFAULT_CONFIG_PATH = os.path.join(os.path.dirname(__file__), 'config.json')
+
+
+def get_kwargs_from_config(config_path=_DEFAULT_CONFIG_PATH):
+    if not os.path.exists(config_path):
+        return dict()
+    with open(config_path) as f:
+        config = json.load(f)
+    assert isinstance(config, dict)
+    return config
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Label studio')
+    parser.add_argument(
+        '-p', '--port', dest='port', type=int, default=9090,
+        help='Server port')
+    parser.add_argument(
+        '--host', dest='host', type=str, default='0.0.0.0',
+        help='Server host')
+    parser.add_argument(  # split on the first '=' only, so a value may itself contain '='
+        '--kwargs', '--with', dest='kwargs', metavar='KEY=VAL', nargs='+', type=lambda kv: kv.split('=', 1),
+        help='Additional LabelStudioMLBase model initialization kwargs')
+    parser.add_argument(
+        '-d', '--debug', dest='debug', action='store_true',
+        help='Switch debug mode')
+    parser.add_argument(
+        '--log-level', dest='log_level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], default=log_level,
+        help='Logging level')
+    parser.add_argument(
+        '--model-dir', dest='model_dir', default=os.path.dirname(__file__),
+        help='Directory where models are stored (relative to the project directory)')
+    parser.add_argument(
+        '--check', dest='check', action='store_true',
+        help='Validate model instance before launching server')
+    parser.add_argument('--basic-auth-user',
+                        default=os.environ.get('ML_SERVER_BASIC_AUTH_USER', None),
+                        help='Basic auth user')
+    
+    parser.add_argument('--basic-auth-pass',
+                        default=os.environ.get('ML_SERVER_BASIC_AUTH_PASS', None),
+                        help='Basic auth pass')    
+    
+    args = parser.parse_args()
+
+    # setup logging level
+    if args.log_level:
+        logging.root.setLevel(args.log_level)
+
+    def isfloat(value):
+        try:
+            float(value)
+            return True
+        except ValueError:
+            return False
+
+    def parse_kwargs():
+        param = dict()
+        for k, v in args.kwargs:
+            if v.isdigit():
+                param[k] = int(v)
+            elif v == 'True' or v == 'true':
+                param[k] = True
+            elif v == 'False' or v == 'false':
+                param[k] = False
+            elif isfloat(v):
+                param[k] = float(v)
+            else:
+                param[k] = v
+        return param
+
+    kwargs = get_kwargs_from_config()
+
+    if args.kwargs:
+        kwargs.update(parse_kwargs())
+
+    if args.check:
+        print('Check "' + NewModel.__name__ + '" instance creation..')
+        model = NewModel(**kwargs)
+
+    app = init_app(model_class=NewModel, basic_auth_user=args.basic_auth_user, basic_auth_pass=args.basic_auth_pass)
+
+    app.run(host=args.host, port=args.port, debug=args.debug)
+
+else:
+    # for uWSGI use
+    app = init_app(model_class=NewModel)
diff --git a/multi_object_tracking/yolo_sam/control_models/__init__.py b/multi_object_tracking/yolo_sam/control_models/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/multi_object_tracking/yolo_sam/control_models/base.py b/multi_object_tracking/yolo_sam/control_models/base.py
new file mode 100644
index 000000000..fbc3e1cc4
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/base.py
@@ -0,0 +1,201 @@
+import os
+import logging
+
+from pydantic import BaseModel
+from typing import Optional, List, Dict, ClassVar
+from ultralytics import YOLO
+
+from label_studio_ml.model import LabelStudioMLBase
+from label_studio_ml.utils import DATA_UNDEFINED_NAME
+from label_studio_sdk._extensions.label_studio_tools.core.utils.io import get_local_path
+from label_studio_sdk.label_interface.control_tags import ControlTag
+from label_studio_sdk.label_interface import LabelInterface
+
+
+# use matplotlib plots for debug
+DEBUG_PLOT = os.getenv("DEBUG_PLOT", "false").lower() in ["1", "true"]
+MODEL_SCORE_THRESHOLD = float(os.getenv("MODEL_SCORE_THRESHOLD", 0.5))
+DEFAULT_MODEL_ROOT = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models")
+MODEL_ROOT = os.getenv("MODEL_ROOT", DEFAULT_MODEL_ROOT)
+os.makedirs(MODEL_ROOT, exist_ok=True)
+# if true, allow to use custom model path from the control tag in the labeling config
+ALLOW_CUSTOM_MODEL_PATH = os.getenv("ALLOW_CUSTOM_MODEL_PATH", "true").lower() in [
+    "1",
+    "true",
+]
+
+# Global cache for YOLO models
+_model_cache = {}
+logger = logging.getLogger(__name__)
+
+
+def get_bool(attr, attr_name, default="false"):
+    return attr.get(attr_name, default).lower() in ["1", "true", "yes"]
+
+
+class ControlModel(BaseModel):
+    """
+    Represents a control tag in Label Studio, which is associated with a specific type of labeling task
+    and is used to generate predictions using a YOLO model.
+
+    Attributes:
+        type (str): Type of the control, e.g., RectangleLabels, Choices, etc.
+        control (ControlTag): The actual control element from the Label Studio configuration.
+        from_name (str): The name of the control tag, used to link the control to the data.
+        to_name (str): The name of the data field that this control is associated with.
+        value (str): The value name from the object that this control operates on, e.g., an image or text field.
+        model (object): The model instance (e.g., YOLO) used to generate predictions for this control.
+        model_path (str): Path to the YOLO model file.
+        model_score_threshold (float): Threshold for prediction scores; predictions below this value will be ignored.
+        label_map (Optional[Dict[str, str]]): A mapping of model labels to Label Studio labels.
+    """
+
+    type: ClassVar[str]
+    control: ControlTag
+    from_name: str
+    to_name: str
+    value: str
+    model: YOLO
+    model_path: ClassVar[str]
+    model_score_threshold: float = 0.5
+    label_map: Optional[Dict[str, str]] = {}
+    label_studio_ml_backend: LabelStudioMLBase
+    project_id: Optional[str] = None
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+    @classmethod
+    def is_control_matched(cls, control) -> bool:
+        """Check if the control tag matches the model type.
+        Args:
+            control (ControlTag): The control tag from the Label Studio Interface.
+        """
+        raise NotImplementedError("This method should be overridden in derived classes")
+
+    @staticmethod
+    def get_from_name_for_label_map(
+        label_interface: LabelInterface, target_name: str
+    ) -> str:
+        """Return the 'from_name' used to build the label map; this default returns `target_name` unchanged."""
+        return target_name
+
+    @classmethod
+    def create(cls, mlbackend: LabelStudioMLBase, control: ControlTag):
+        """Factory method to create an instance of a specific control model class.
+        Args:
+            mlbackend (LabelStudioMLBase): The ML backend instance.
+            control (ControlTag): The control tag from the Label Studio Interface.
+        """
+        from_name = control.name
+        to_name = control.to_name[0]
+        value = control.objects[0].value_name
+
+        # if skip is true, don't process this control
+        if get_bool(control.attr, "model_skip", "false"):
+            logger.info(
+                f"Skipping control tag '{control.tag}' with name '{from_name}', model_skip=true found"
+            )
+            return None
+        # read threshold attribute from the control tag, e.g.: <RectangleLabels model_score_threshold="0.5">
+        model_score_threshold = float(
+            control.attr.get("model_score_threshold")
+            or control.attr.get(
+                "score_threshold"
+            )  # not recommended option, use `model_score_threshold`
+            or MODEL_SCORE_THRESHOLD
+        )
+        # read `model_path` attribute from the control tag
+        model_path = (
+            ALLOW_CUSTOM_MODEL_PATH and control.attr.get("model_path")
+        ) or cls.model_path
+
+        model = cls.get_cached_model(model_path)
+        model_names = model.names.values()  # class names from the model
+        # from_name for label mapping can be differed from control.name (e.g. VideoRectangle)
+        label_map_from_name = cls.get_from_name_for_label_map(
+            mlbackend.label_interface, from_name
+        )
+        label_map = mlbackend.build_label_map(label_map_from_name, model_names)
+
+        return cls(
+            control=control,
+            from_name=from_name,
+            to_name=to_name,
+            value=value,
+            model=model,
+            model_score_threshold=model_score_threshold,
+            label_map=label_map,
+            label_studio_ml_backend=mlbackend,
+            project_id=mlbackend.project_id,
+        )
+
+    @classmethod
+    def load_yolo_model(cls, filename) -> YOLO:
+        """Load the YOLO model at `filename` (joined onto MODEL_ROOT) and log its names."""
+        path = os.path.join(MODEL_ROOT, filename)
+        logger.info(f"Loading yolo model: {path}")
+        model = YOLO(path)
+        logger.info(f"Model {path} names:\n{model.names}")
+        return model
+
+    @classmethod
+    def get_cached_model(cls, path: str) -> YOLO:
+        if path not in _model_cache:
+            _model_cache[path] = cls.load_yolo_model(path)
+        return _model_cache[path]
+
+    def debug_plot(self, image):
+        if not DEBUG_PLOT:
+            return
+
+        import matplotlib.pyplot as plt
+
+        plt.figure(figsize=(10, 10))
+        plt.imshow(image[..., ::-1])
+        plt.axis("off")
+        plt.title(self.type)
+        plt.show()
+
+    def predict_regions(self, path) -> List[Dict]:
+        """Predict regions in the image using the YOLO model.
+        Args:
+            path (str): Path to the file with media
+        """
+        raise NotImplementedError("This method should be overridden in derived classes")
+
+    def fit(self, event, data, **kwargs):
+        """Fit the model; this base implementation only logs a warning and returns False."""
+        logger.warning("The fit method is not implemented for this control model")
+        return False
+
+    def get_path(self, task):
+        task_path = task["data"].get(self.value) or task["data"].get(
+            DATA_UNDEFINED_NAME
+        )
+        if task_path is None:
+            raise ValueError(
+                f"Can't load path using key '{self.value}' from task {task}"
+            )
+        if not isinstance(task_path, str):
+            raise ValueError(f"Path should be a string, but got {task_path}")
+
+        # try path as local file or try to load it from Label Studio instance/download via http
+        path = (
+            task_path
+            if os.path.exists(task_path)
+            else get_local_path(task_path, task_id=task.get("id"))
+        )
+        logger.debug(f"load_image: {task_path} => {path}")
+        return path
+
+    def __str__(self):
+        """Return a string with full representation of the control tag."""
+        return (
+            f"{self.type} from_name={self.from_name}, "
+            f"label_map={self.label_map}, model_score_threshold={self.model_score_threshold}"
+        )
+
+    class Config:
+        arbitrary_types_allowed = True
+        protected_namespaces = ("__.*__", "_.*")  # Excludes 'model_'
diff --git a/multi_object_tracking/yolo_sam/control_models/choices.py b/multi_object_tracking/yolo_sam/control_models/choices.py
new file mode 100644
index 000000000..fc9dac106
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/choices.py
@@ -0,0 +1,92 @@
+import logging
+import numpy as np
+
+from control_models.base import ControlModel
+from typing import List, Dict
+
+
+logger = logging.getLogger(__name__)
+
+
+class ChoicesModel(ControlModel):
+    """
+    Class representing a Choices (classes) control tag for YOLO model.
+    """
+
+    type = "Choices"
+    model_path = "yolov8n-cls.pt"
+
+    @classmethod
+    def is_control_matched(cls, control) -> bool:
+        # check object tag type
+        if control.objects[0].tag != "Image":
+            return False
+        # support both Choices and Taxonomy because of their similarity
+        return control.tag in [cls.type, "Taxonomy"]
+
+    def predict_regions(self, path) -> List[Dict]:
+        results = self.model.predict(path)
+        self.debug_plot(results[0].plot())
+        return self.create_choices(results, path)
+
+    def create_choices(self, results, path):
+        """Build one `choices` region from classification results; return [] if nothing qualifies."""
+        logger.debug(f"create_choices: {self.from_name}")
+        mode = self.control.attr.get("choice", "single")
+        data = results[0].probs.data.cpu().numpy()
+
+        # single
+        if mode in ["single", "single-radio"]:
+            # keep only the classes present in label_map: the best choice is searched among them
+            indexes = [i for i, name in self.model.names.items() if name in self.label_map]
+            if not indexes:  # no model class is mapped, and np.argmax raises on an empty array
+                logger.debug("No choices found")
+                return []
+            data = data[indexes]
+            model_names = [self.model.names[i] for i in indexes]
+            # find the best choice
+            index = np.argmax(data)
+            probs = [data[index]]
+            names = [model_names[index]]
+        # multi
+        else:
+            # get indexes of data where data >= self.model_score_threshold
+            indexes = np.where(data >= self.model_score_threshold)
+            probs = data[indexes].tolist()
+            names = [self.model.names[int(i)] for i in indexes[0]]
+
+        if not probs:
+            logger.debug("No choices found")
+            return []
+
+        score = np.mean(probs)
+        logger.debug(
+            "----------------------\n"
+            f"task id > {path}\n"
+            f"control: {self.control}\n"
+            f"probs > {probs}\n"
+            f"score > {score}\n"
+            f"names > {names}\n"
+        )
+
+        if score < self.model_score_threshold:
+            logger.debug(f"Score is too low for single choice: {names[0]} = {probs[0]}")
+            return []
+
+        # map to Label Studio labels
+        output_labels = [self.label_map[name] for name in names if name in self.label_map]
+
+        # add new region with the selected choices
+        return [
+            {
+                "from_name": self.from_name,
+                "to_name": self.to_name,
+                "type": "choices",
+                "value": {"choices": output_labels},
+                "score": float(score),
+            }
+        ]
+
+
+# pre-load and cache default model at startup
+ChoicesModel.get_cached_model(ChoicesModel.model_path)
diff --git a/multi_object_tracking/yolo_sam/control_models/keypoint_labels.py b/multi_object_tracking/yolo_sam/control_models/keypoint_labels.py
new file mode 100644
index 000000000..86199d4d1
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/keypoint_labels.py
@@ -0,0 +1,173 @@
+import logging
+from control_models.base import ControlModel, get_bool
+from typing import List, Dict
+
+logger = logging.getLogger(__name__)
+
+
+class KeypointLabelsModel(ControlModel):
+    """
+    Class representing a KeypointLabels control tag for YOLO model.
+    """
+
+    type = "KeyPointLabels"
+    model_path = (
+        "yolov8n-pose.pt"  # Adjust the model path to your keypoint detection model
+    )
+    add_bboxes: bool = True
+    point_size: float = 1
+    point_threshold: float = 0
+    point_map: Dict = {}
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+        self.add_bboxes = get_bool(self.control.attr, "model_add_bboxes", "true")
+        self.point_size = float(self.control.attr.get("model_point_size", 1))
+        self.point_threshold = float(self.control.attr.get("model_point_threshold", 0))
+        self.point_map = self.build_point_mapping()
+
+    @classmethod
+    def is_control_matched(cls, control) -> bool:
+        # Check object tag type
+        if control.objects[0].tag != "Image":
+            return False
+        return control.tag == cls.type
+
+    def build_point_mapping(self):
+        """Build a mapping between points and Label Studio labels, e.g.
+        <Label value="nose" predicted_values="person" model_index="0" /> => {"person::0": "nose"}
+        """
+        mapping = {}
+        for value, label_tag in self.control.labels_attrs.items():
+            model_name = label_tag.attr.get("predicted_values")
+            model_index = label_tag.attr.get("model_index")
+            if model_name and not model_index:
+                logger.warning(
+                    f"`model_index` is not provided for Label tag: {label_tag}"
+                )
+            if not model_name and model_index:
+                logger.warning(
+                    f"`predicted_values` is not provided for Label tag: {label_tag}"
+                )
+            if model_name and model_index:
+                mapping[f"{model_name}::{model_index}"] = value
+
+        if not mapping:
+            logger.error(
+                f"No point to label mapping found for control tag: {self.control}"
+            )
+        return mapping
+
+    def predict_regions(self, path) -> List[Dict]:
+        results = self.model.predict(path)
+        return self.create_keypoints(results, path)
+
+    def create_keypoints(self, results, path):
+        logger.debug(f"create_keypoints: {self.from_name}")
+        keypoints_data = results[0].keypoints  # Get keypoints from the first frame
+        bbox_data = results[0].boxes
+        image_width = results[0].orig_shape[1]
+        model_names = self.model.names
+        regions = []
+
+        for bbox_index in range(
+            keypoints_data.shape[0]
+        ):  # Iterate over detected bboxes
+            bbox_conf = bbox_data.conf[bbox_index]
+            point_xyn = (
+                keypoints_data.xyn[bbox_index] * 100
+            )  # Convert normalized keypoints to percentages
+            model_label = model_names[int(results[0].boxes.cls[bbox_index])]
+
+            point_logs = "\n".join(
+                [f' model_index="{i}", xy={xyn}' for i, xyn in enumerate(point_xyn)]
+            )
+            logger.debug(
+                "----------------------\n"
+                f"task id > {path}\n"
+                f"type: {self.control}\n"
+                f"model label > {model_label}\n"
+                f"keypoints >\n{point_logs}\n"
+                f"confidences > {bbox_conf}\n"
+            )
+
+            # bbox score is too low
+            if bbox_conf < self.model_score_threshold:
+                continue
+
+            # There is no mapping between model label and LS label
+            if model_label not in self.label_map:
+                continue
+
+            # Add parent bbox that contains all keypoints
+            if self.add_bboxes:
+                region = self.create_bounding_box(
+                    bbox_conf, bbox_data, bbox_index, model_label
+                )
+                regions.append(region)
+
+            for point_index, xyn in enumerate(point_xyn):
+                point_conf = keypoints_data.conf[bbox_index][point_index]
+                if point_conf < self.point_threshold:
+                    continue
+
+                x, y = xyn.tolist()
+                index_name = f"{model_label}::{point_index}"
+                if index_name not in self.point_map:
+                    logger.warning(
+                        f"Point {index_name} not found in point map, "
+                        f"you have to define it in the labeling config, e.g.:\n"
+                        f'<Label value="nose" predicted_values="person" model_index="0" />'
+                    )
+                    continue
+                point_label = self.point_map[index_name]
+
+                # Add new region with keypoint
+                region = {
+                    "from_name": self.from_name,
+                    "to_name": self.to_name,
+                    "type": "keypointlabels",
+                    "value": {
+                        # point label
+                        "keypointlabels": [point_label],
+                        # point width, just visual styling
+                        "width": self.point_size / image_width * 100,
+                        "x": x,
+                        "y": y,
+                    },
+                    "meta": {
+                        "text": [f"bbox-{bbox_index}"]  # Group keypoints by bbox index
+                    },
+                    "score": float(point_conf),
+                }
+                # If bboxes are used, group keypoints by bbox
+                if self.add_bboxes:
+                    region["parentID"] = f"bbox-{bbox_index}"
+                regions.append(region)
+        return regions
+
+    def create_bounding_box(self, bbox_conf, bbox_data, bbox_index, model_label):
+        # Add parent bbox that contains all keypoints
+        x, y, w, h = bbox_data.xywhn[bbox_index].tolist()
+        region = {
+            "id": f"bbox-{bbox_index}",
+            "from_name": self.from_name + "_bbox",
+            "to_name": self.to_name,
+            "type": "rectanglelabels",
+            "value": {
+                "rectanglelabels": [model_label],
+                "x": (x - w / 2) * 100,
+                "y": (y - h / 2) * 100,
+                "width": w * 100,
+                "height": h * 100,
+            },
+            "meta": {"text": [f"bbox-{bbox_index}"]},  # Group keypoints by bbox index
+            "score": float(bbox_conf),
+            "hidden": True,
+        }
+        return region
+
+
+# Pre-load and cache default model at startup
+KeypointLabelsModel.get_cached_model(KeypointLabelsModel.model_path)
diff --git a/multi_object_tracking/yolo_sam/control_models/polygon_labels.py b/multi_object_tracking/yolo_sam/control_models/polygon_labels.py
new file mode 100644
index 000000000..95a6ab690
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/polygon_labels.py
@@ -0,0 +1,77 @@
+import logging
+
+from control_models.base import ControlModel
+from typing import List, Dict
+
+
+logger = logging.getLogger(__name__)
+
+
+class PolygonLabelsModel(ControlModel):
+    """
+    Class representing a PolygonLabels control tag for YOLO model.
+    """
+
+    type = "PolygonLabels"
+    model_path = "yolov8n-seg.pt"
+
+    @classmethod
+    def is_control_matched(cls, control) -> bool:
+        # check object tag type
+        if control.objects[0].tag != "Image":
+            return False
+        return control.tag == cls.type
+
+    def predict_regions(self, path) -> List[Dict]:
+        results = self.model.predict(path)
+        return self.create_polygons(results, path)
+
+    def create_polygons(self, results, path):
+        """Convert the first result's masks into `polygonlabels` regions; return [] if there are no masks."""
+        logger.debug(f"create_polygons: {self.from_name}")
+        data = results[0].masks  # take masks from the first frame
+        if data is None:  # no masks in the result (e.g. nothing detected): len(None) would raise
+            return []
+        regions = []
+
+        for i in range(len(data)):
+            score = float(results[0].boxes.conf[i])  # tensor => float
+            points = data.xyn[i] * 100  # polygon points of the current instance, scaled by 100
+            model_label = self.model.names[int(results[0].boxes.cls[i])]
+
+            logger.debug(
+                "----------------------\n"
+                f"task id > {path}\n"
+                f"type: {self.control}\n"
+                f"polygon points > {points}\n"
+                f"model label > {model_label}\n"
+                f"score > {score}\n"
+            )
+
+            # bbox score is too low
+            if score < self.model_score_threshold:
+                continue
+
+            # there is no mapping between model label and LS label
+            if model_label not in self.label_map:
+                continue
+            output_label = self.label_map[model_label]
+
+            # add new region with polygon
+            region = {
+                "from_name": self.from_name,
+                "to_name": self.to_name,
+                "type": "polygonlabels",
+                "value": {
+                    "polygonlabels": [output_label],
+                    "points": points.tolist(),  # Converting the tensor to a list for JSON serialization
+                    "closed": True,
+                },
+                "score": score,
+            }
+            regions.append(region)
+        return regions
+
+
+# pre-load and cache default model at startup
+PolygonLabelsModel.get_cached_model(PolygonLabelsModel.model_path)
diff --git a/multi_object_tracking/yolo_sam/control_models/rectangle_labels.py b/multi_object_tracking/yolo_sam/control_models/rectangle_labels.py
new file mode 100644
index 000000000..035eb7d1e
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/rectangle_labels.py
@@ -0,0 +1,99 @@
+import logging
+
+from control_models.base import ControlModel, get_bool
+from typing import List, Dict
+from label_studio_sdk.label_interface.control_tags import ControlTag
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_obb(control: ControlTag) -> bool:
+    """Tell whether the control tag enables oriented bounding boxes via its `model_obb` attribute."""
+    tag_attributes = control.attr
+    use_obb = get_bool(tag_attributes, "model_obb", "false")
+    return use_obb
+
+
+class RectangleLabelsModel(ControlModel):
+    """
+    RectangleLabels control tag backed by a YOLO model (bounding boxes without rotation).
+    """
+
+    type = "RectangleLabels"
+    model_path = "yolov8m.pt"
+
+    @classmethod
+    def is_control_matched(cls, control) -> bool:
+        """Match a non-OBB RectangleLabels control attached to an Image object tag."""
+        on_image = control.objects[0].tag == "Image"
+        return on_image and not is_obb(control) and control.tag == cls.type
+
+    def predict_regions(self, path) -> List[Dict]:
+        """Run the YOLO model on `path` and return Label Studio rectangle regions.
+
+        Raises:
+            ValueError: if the first result carries oriented boxes (`obb`) but no
+                plain `boxes`, i.e. `model_obb="true"` is missing in the config.
+        """
+        results = self.model.predict(path)
+        first = results[0]
+        self.debug_plot(first.plot())
+
+        # an OBB-only result should be processed by RectangleLabelsObbModel instead
+        only_obb = first.obb is not None and first.boxes is None
+        if only_obb:
+            raise ValueError(
+                "Oriented bounding boxes are detected in the YOLO model results. "
+                'However, `model_obb="true"` is not set at the RectangleLabels tag '
+                "in the labeling config."
+            )
+
+        return self.create_rectangles(results, path)
+
+    def create_rectangles(self, results, path):
+        """Convert the plain (non-rotated) boxes of the first result into regions."""
+        logger.debug(f"create_rectangles: {self.from_name}")
+        data = results[0].boxes
+        model_names = self.model.names
+        regions = []
+
+        for i in range(data.shape[0]):
+            score = float(data.conf[i])  # tensor => float
+            x, y, w, h = data.xywhn[i].tolist()
+            model_label = model_names[int(data.cls[i])]
+
+            logger.debug(
+                "----------------------\n"
+                f"task id > {path}\n"
+                f"type: {self.control}\n"
+                f"x, y, w, h > {x, y, w, h}\n"
+                f"model label > {model_label}\n"
+                f"score > {score}\n"
+            )
+
+            # skip low-score boxes and model labels that have no LS label mapping
+            if score < self.model_score_threshold or model_label not in self.label_map:
+                continue
+
+            # x, y are treated as the box center: shift by half size and scale to percent
+            regions.append(
+                {
+                    "from_name": self.from_name,
+                    "to_name": self.to_name,
+                    "type": "rectanglelabels",
+                    "value": {
+                        "rectanglelabels": [self.label_map[model_label]],
+                        "x": (x - w / 2) * 100,
+                        "y": (y - h / 2) * 100,
+                        "width": w * 100,
+                        "height": h * 100,
+                    },
+                    "score": score,
+                }
+            )
+        return regions
+
+
+# Module-level side effect (runs on import): pre-load and cache the default model of RectangleLabelsModel
+RectangleLabelsModel.get_cached_model(RectangleLabelsModel.model_path)
diff --git a/multi_object_tracking/yolo_sam/control_models/rectangle_labels_obb.py b/multi_object_tracking/yolo_sam/control_models/rectangle_labels_obb.py
new file mode 100644
index 000000000..ce66aae9b
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/rectangle_labels_obb.py
@@ -0,0 +1,93 @@
+import logging
+
+from control_models.base import ControlModel
+from control_models.rectangle_labels import is_obb
+from typing import List, Dict
+from label_studio_sdk.converter.utils import convert_yolo_obb_to_annotation
+
+
+logger = logging.getLogger(__name__)
+
+
+class RectangleLabelsObbModel(ControlModel):
+    """
+    RectangleLabels control tag served by a YOLO OBB model,
+    i.e. oriented (rotated) bounding boxes.
+    """
+
+    type = "RectangleLabels"
+    model_path = "yolov8n-obb.pt"
+
+    @classmethod
+    def is_control_matched(cls, control) -> bool:
+        """Match an OBB-enabled RectangleLabels control attached to an Image object tag."""
+        if control.objects[0].tag != "Image" or not is_obb(control):
+            return False
+        return control.tag == cls.type
+
+    def predict_regions(self, path) -> List[Dict]:
+        """Run the YOLO OBB model on `path` and return rotated rectangle regions.
+
+        Raises:
+            ValueError: if the first result has no `obb` data although the
+                control tag sets `model_obb="true"`.
+        """
+        results = self.model.predict(path)
+        first = results[0]
+        self.debug_plot(first.plot())
+
+        # no `obb` in the result: only simple bounding boxes were produced
+        if first.obb is None:
+            raise ValueError(
+                "Simple bounding boxes are detected in the YOLO model results. "
+                'However, `model_obb="true"` is set at the RectangleLabels tag '
+                "in the labeling config. Set it to `false` to use simple bounding boxes."
+            )
+
+        return self.create_rotated_rectangles(results, path)
+
+    def create_rotated_rectangles(self, results, path):
+        """Convert the oriented boxes of the first result into rectangle regions."""
+        logger.debug(f"create_rotated_rectangles: {self.from_name}")
+        data = results[0].obb
+        model_names = self.model.names
+        regions = []
+
+        for i in range(data.shape[0]):
+            score = float(data.conf[i])  # tensor => float
+            model_label = model_names[int(data.cls[i])]
+            # orig_shape is unpacked as (height, width); the converter takes width first
+            original_height, original_width = data.orig_shape
+            corners = data.xyxyxyxy[i].tolist()
+            value = convert_yolo_obb_to_annotation(
+                corners, original_width, original_height
+            )
+
+            logger.debug(
+                "----------------------\n"
+                f"task id > {path}\n"
+                f"type: {self.control}\n"
+                f"x, y, w, h, r > {value}\n"
+                f"model label > {model_label}\n"
+                f"score > {score}\n"
+            )
+
+            # skip low-score boxes and model labels that have no LS label mapping
+            if score < self.model_score_threshold or model_label not in self.label_map:
+                continue
+
+            value["rectanglelabels"] = [self.label_map[model_label]]
+            regions.append(
+                {
+                    "from_name": self.from_name,
+                    "to_name": self.to_name,
+                    "type": "rectanglelabels",
+                    "value": value,
+                    "score": score,
+                }
+            )
+        return regions
+
+
+# Module-level side effect (runs on import): pre-load and cache the default model of RectangleLabelsObbModel
+RectangleLabelsObbModel.get_cached_model(RectangleLabelsObbModel.model_path)
diff --git a/multi_object_tracking/yolo_sam/control_models/timeline_labels.py b/multi_object_tracking/yolo_sam/control_models/timeline_labels.py
new file mode 100644
index 000000000..ebf11721f
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/timeline_labels.py
@@ -0,0 +1,244 @@
+import logging
+import os.path
+
+from control_models.base import ControlModel, MODEL_ROOT, get_bool
+from typing import List, Dict
+from utils.neural_nets import (
+    BaseNN,
+    MultiLabelLSTM,
+    cached_feature_extraction,
+    cached_yolo_predict,
+)
+from utils.converter import (
+    get_label_map,
+    convert_timelinelabels_to_probs,
+    convert_probs_to_timelinelabels,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class TimelineLabelsModel(ControlModel):
+    """
+    Class representing a TimelineLabels control tag for YOLO model.
+    See README_TIMELINE_LABELS.md for more details.
+    """
+
+    type = "TimelineLabels"
+    model_path = "yolov8n-cls.pt"
+    trainable: bool = False  # set per instance in create() from `model_trainable`
+
+    @classmethod
+    def is_control_matched(cls, control) -> bool:
+        """Match a TimelineLabels control attached to a Video object tag."""
+        if control.objects[0].tag != "Video":
+            return False
+        return control.tag == cls.type
+
+    @classmethod
+    def create(cls, *args, **kwargs):
+        """Create the instance and set it up for trainable or simple (YOLO classes) mode."""
+        instance = super().create(*args, **kwargs)
+        # timeline models can be trainable and based on YOLO trained classes directly
+        instance.trainable = get_bool(instance.control.attr, "model_trainable", "false")
+        # if it's trainable, we need to use labels from the labeling config as is because we will train them
+        if instance.trainable:
+            instance.label_map = {label: label for label in instance.control.labels}
+        elif not instance.label_map:
+            raise ValueError(
+                f"TimelinesLabels model works in simple mode (without training), "
+                f"but no labels from YOLO model names are matched:\n{instance.control.name}\n"
+                f"Add labels from YOLO model names to the labeling config or use `predicted_values` to map them. "
+                f'As alternative option, you can set `model_trainable="true"` in the TimelineLabels control tag '
+                f"to train the model on the labels from the labeling config."
+            )
+        return instance
+
+    def predict_regions(self, video_path) -> List[Dict]:
+        """Predict timeline regions with the trained classifier or directly from YOLO classes."""
+        if self.trainable:
+            return self.create_timelines_trainable(video_path)
+        return self.create_timelines_simple(video_path)
+
+    def create_timelines_simple(self, video_path):
+        """Build timeline regions straight from per-frame YOLO classification probabilities."""
+        logger.debug(f"create_timelines_simple: {self.from_name}")
+        # get yolo predictions
+        frame_results = cached_yolo_predict(
+            self.model, video_path, self.model.model_name
+        )
+
+        # keep only the YOLO classes that are mapped to labels of this control (single pass)
+        needed = [
+            (i, name) for i, name in self.model.names.items() if name in self.label_map
+        ]
+        needed_ids = [i for i, _ in needed]
+
+        # per-frame probabilities restricted to the needed classes, in `needed` order
+        probs = [frame.probs.data[needed_ids].cpu().numpy() for frame in frame_results]
+        # output label => column index in `probs`
+        label_map = {self.label_map[name]: idx for idx, (_, name) in enumerate(needed)}
+
+        return convert_probs_to_timelinelabels(
+            probs, label_map, self.control.name, self.model_score_threshold
+        )
+
+    def create_timelines_trainable(self, video_path):
+        """Predict timelines with the trained temporal classifier; raise ValueError if it is missing."""
+        logger.debug(f"create_timelines_trainable: {self.from_name}")
+        # extract features based on pre-trained yolo classification model
+        frame_results = cached_feature_extraction(
+            self.model, video_path, self.model.model_name
+        )
+
+        yolo_probs = [frame.probs for frame in frame_results]
+        # one classifier file per project / yolo model / control name, see get_classifier_path()
+        path = self.get_classifier_path(self.project_id)
+        classifier = BaseNN.load_cached_model(path)
+        if not classifier:
+            raise ValueError(
+                f"Temporal classifier model '{path}' not found for "
+                f"'{self.control.name}', maybe it's not trained yet"
+            )
+
+        # run predict and convert to timelinelabels
+        probs = classifier.predict(yolo_probs)
+        return convert_probs_to_timelinelabels(
+            probs,
+            classifier.get_label_map(),
+            self.control.name,
+            self.model_score_threshold,
+        )
+
+    def fit(self, event, data, **kwargs):
+        """Handle a training event; only annotation create/update events train the classifier."""
+        if not self.trainable:
+            logger.debug(
+                'TimelineLabels model is in not trainable mode. '
+                'Use model_trainable="true" to enable training.'
+            )
+            return
+
+        if event == "START_TRAINING":
+            # TODO: the full training makes a lot of sense here, but it's not implemented yet
+            raise NotImplementedError(
+                f"The event START_TRAINING is not supported for this control model: {self.control.tag}"
+            )
+
+        if event in ("ANNOTATION_CREATED", "ANNOTATION_UPDATED"):
+            features, labels, label_map, project_id = self.load_features_and_labels(
+                data
+            )
+            classifier, path = self.load_classifier(features, label_map, project_id)
+            return self.train_classifier(classifier, features, labels, path)
+
+    def train_classifier(self, classifier, features, labels, path):
+        """Train the classifier model for timelinelabels using incremental partial learning."""
+        # Stop training when accuracy or f1 score reaches this threshold, it helps to avoid overfitting
+        # because we partially train it on a small dataset from one annotation only
+        get = self.control.attr.get
+        epochs = int(
+            get("model_classifier_epochs", 1000)
+        )  # Maximum number of training epochs
+        f1_threshold = float(get("model_classifier_f1_threshold", 0.95))
+        accuracy_threshold = float(get("model_classifier_accuracy_threshold", 1.00))
+
+        # Train and save
+        result = classifier.partial_fit(
+            features,
+            labels,
+            epochs=epochs,
+            f1_threshold=f1_threshold,
+            accuracy_threshold=accuracy_threshold,
+        )
+        classifier.save_and_cache(path)
+        return result
+
+    def load_classifier(self, features, label_map, project_id):
+        """Load or create a classifier model for timelinelabels.
+        1. Load neural network parameters from labeling config.
+        2. Try loading classifier model from memory cache, then from disk.
+        3. Or create a new classifier instance if there wasn't successful loading, or if parameters have changed.
+        """
+        get = self.control.attr.get
+        # LSTM sequence size
+        sequence_size = int(get("model_classifier_sequence_size", 16))
+        # LSTM hidden state size
+        hidden_size = int(get("model_classifier_hidden_size", 32))
+        # LSTM num layers
+        num_layers = int(get("model_classifier_num_layers", 1))
+
+        # Load classifier
+        path = self.get_classifier_path(project_id)
+        classifier = BaseNN.load_cached_model(path)
+
+        # Create a new classifier instance if it doesn't exist
+        # or if labeling config has changed
+        if (
+            not classifier
+            or classifier.label_map != label_map
+            or classifier.sequence_size != sequence_size
+            or classifier.hidden_size != hidden_size
+            or classifier.num_layers != num_layers
+        ):
+            logger.info("Creating a new classifier model for timelinelabels")
+            input_size = len(features[0])
+            output_size = len(label_map)
+            classifier = MultiLabelLSTM(
+                input_size,
+                output_size,
+                sequence_size=sequence_size,
+                hidden_size=hidden_size,
+                num_layers=num_layers,
+            )
+            classifier.set_label_map(label_map)
+
+        return classifier, path
+
+    def load_features_and_labels(self, data):
+        """Load features and labels from the annotation
+        Args:
+            data: event data, dictionary with keys 'task' and 'annotation'
+        Returns:
+            features: List of features, 2D array with shape (num_frames, num_features)
+            labels: List of labels, 2D array with shape (num_frames, num_labels)
+            label_map: Label map, dictionary mapping label names to indices in the labels array
+            project_id: Project ID from Label Studio
+        Raises:
+            ValueError: if the annotation uses labels missing from the labeling config
+        """
+        task = data["task"]
+        project_id = task["project"]
+        annotation = data["annotation"]
+        regions = annotation["result"]
+
+        # Get the features and labels for training
+        video_path = self.get_path(task)
+        frames = cached_feature_extraction(
+            self.model, video_path, self.model.model_name
+        )
+        features = [frame.probs for frame in frames]
+        label_map = get_label_map(self.control.labels)
+        labels, used_labels = convert_timelinelabels_to_probs(
+            regions, label_map=label_map, max_frame=len(frames)
+        )
+
+        if not used_labels.issubset(label_map.keys()):
+            raise ValueError(
+                f"Annotation labels set ({used_labels}) is not subset "
+                f"of labels from the labeling config:\n{self.control}\n"
+                f"It can be caused by the mismatch between the labeling config "
+                f"and labels in the annotation #{annotation['id']} "
+                f"of project #{project_id}."
+            )
+        return features, labels, label_map, project_id
+
+    def get_classifier_path(self, project_id):
+        """Return the classifier file path built from project id, YOLO model name and control name."""
+        yolo_base_name = os.path.splitext(os.path.basename(self.model.model_name))[0]
+        return f"{MODEL_ROOT}/timelinelabels-{project_id}-{yolo_base_name}-{self.from_name}.pkl"
+
+
+# Module-level side effect (runs on import): preload and cache the default yolo model of TimelineLabelsModel
+TimelineLabelsModel.get_cached_model(TimelineLabelsModel.model_path)
diff --git a/multi_object_tracking/yolo_sam/control_models/video_rectangle.py b/multi_object_tracking/yolo_sam/control_models/video_rectangle.py
new file mode 100644
index 000000000..cfa1159c0
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/control_models/video_rectangle.py
@@ -0,0 +1,228 @@
+import os
+import cv2
+import logging
+import yaml
+import hashlib
+
+from collections import defaultdict
+from control_models.base import ControlModel, MODEL_ROOT
+from label_studio_sdk.label_interface.control_tags import ControlTag
+from typing import List, Dict, Union
+
+
+logger = logging.getLogger(__name__)
+
+
+class VideoRectangleModel(ControlModel):
+    """
+    Class representing a VideoRectangle control tag (tracked bounding boxes in video) for YOLO model.
+    """
+
+    type = "VideoRectangle"
+    model_path = "yolov10x.pt"
+
+    @classmethod
+    def is_control_matched(cls, control: ControlTag) -> bool:
+        """Match a VideoRectangle control attached to a Video object tag."""
+        # check object tag type
+        if control.objects[0].tag != "Video":
+            return False
+        # check control type VideoRectangle
+        return control.tag == cls.type
+
+    @staticmethod
+    def get_from_name_for_label_map(label_interface, target_name) -> Union[str, None]:
+        """VideoRectangle doesn't have labels inside, and we should find a connected Labels tag
+        and return its name as a source for the label map.
+        Returns None (after logging an error) when no such Labels tag is connected.
+        """
+        target: ControlTag = label_interface.get_control(target_name)
+        if not target:
+            raise ValueError(f'Control tag with name "{target_name}" not found')
+
+        for connected in label_interface.controls:
+            if connected.tag == "Labels" and connected.to_name == target.to_name:
+                return connected.name
+
+        logger.error("VideoRectangle detected, but no connected 'Labels' tag found")
+        return None
+
+    @staticmethod
+    def get_video_duration(path):
+        """Return `(frame_count, duration_in_seconds)` of the video file at `path`.
+
+        Raises:
+            ValueError: if the file is missing or OpenCV reports a non-positive fps.
+        """
+        if not os.path.exists(path):
+            raise ValueError(f"Video file not found: {path}")
+        video = cv2.VideoCapture(path)
+        try:
+            fps = video.get(cv2.CAP_PROP_FPS)
+            frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+        finally:
+            video.release()  # free the capture handle in every case
+        if fps <= 0:
+            # a zero fps would otherwise surface as an opaque ZeroDivisionError
+            raise ValueError(f"Cannot read fps from video file: {path}")
+        duration = frame_count / fps
+        logger.info(
+            f"Video duration: {duration} seconds, {frame_count} frames, {fps} fps"
+        )
+        return frame_count, duration
+
+    def predict_regions(self, path) -> List[Dict]:
+        """Track objects in the video at `path` and return videorectangle regions."""
+        # bounding box parameters
+        # https://docs.ultralytics.com/modes/track/?h=track#tracking-arguments
+        conf = float(self.control.attr.get("model_conf", 0.25))
+        iou = float(self.control.attr.get("model_iou", 0.70))
+
+        # tracker is chosen with the `model_tracker` attribute, e.g. "botsort" or "bytetrack"
+        tracker_name = self.control.attr.get("model_tracker", "botsort")
+        original = f"{MODEL_ROOT}/{tracker_name}.yaml"
+        tmp_yaml = self.update_tracker_params(original, prefix=tracker_name + "_")
+        tracker = tmp_yaml if tmp_yaml else original
+
+        try:
+            # stream=True yields results lazily, so they are converted inside `try`,
+            # i.e. before the temporary tracker yaml is removed
+            results = self.model.track(
+                path, conf=conf, iou=iou, tracker=tracker, stream=True
+            )
+            return self.create_video_rectangles(results, path)
+        finally:
+            # clean temporary file
+            if tmp_yaml and os.path.exists(tmp_yaml):
+                os.remove(tmp_yaml)
+
+    def create_video_rectangles(self, results, path):
+        """Create regions of video rectangles from the yolo tracker results"""
+        frames_count, duration = self.get_video_duration(path)
+        model_names = self.model.names
+        logger.debug(
+            f"create_video_rectangles: {self.from_name}, {frames_count} frames"
+        )
+
+        tracks = defaultdict(list)
+        track_labels = dict()
+        # frames are numbered from 1 in the emitted sequence
+        for frame, result in enumerate(results, start=1):
+            data = result.boxes
+            if not data.is_track:
+                continue
+
+            for i, track_id in enumerate(data.id.tolist()):
+                score = float(data.conf[i])
+                x, y, w, h = data.xywhn[i].tolist()
+                # get label
+                model_label = model_names[int(data.cls[i])]
+                if model_label not in self.label_map:
+                    continue
+                output_label = self.label_map[model_label]
+                track_labels[track_id] = output_label
+
+                box = {
+                    "frame": frame,
+                    "enabled": True,
+                    "rotation": 0,
+                    "x": (x - w / 2) * 100,
+                    "y": (y - h / 2) * 100,
+                    "width": w * 100,
+                    "height": h * 100,
+                    "time": frame * (duration / frames_count),
+                    "score": score,
+                }
+                tracks[track_id].append(box)
+
+        regions = []
+        for track_id, sequence in tracks.items():
+            sequence = self.process_lifespans_enabled(sequence)
+
+            label = track_labels[track_id]
+            region = {
+                "from_name": self.from_name,
+                "to_name": self.to_name,
+                "type": "videorectangle",
+                "value": {
+                    "framesCount": frames_count,
+                    "duration": duration,
+                    "sequence": sequence,
+                    "labels": [label],
+                },
+                "score": max(frame_info["score"] for frame_info in sequence),
+                "origin": "manual",
+            }
+            regions.append(region)
+
+        return regions
+
+    @staticmethod
+    def process_lifespans_enabled(sequence: List[Dict]) -> List[Dict]:
+        """This function detects gaps in the sequence of bboxes
+        and disables lifespan line for the gaps assigning "enabled": False
+        to the last bboxes in the whole span sequence.
+        The list is modified in place and returned; an empty list is returned as is.
+        """
+        for box, next_box in zip(sequence, sequence[1:]):
+            if next_box["frame"] - box["frame"] > 1:
+                box["enabled"] = False
+
+        # the last frame enabled is false to turn off lifespan line
+        if sequence:
+            sequence[-1]["enabled"] = False
+        return sequence
+
+    @staticmethod
+    def generate_hash_filename(extension=".yaml"):
+        """Store yaml configs as temporary files just for one model.track() run"""
+        hash_name = hashlib.sha256(os.urandom(16)).hexdigest()
+        os.makedirs(f"{MODEL_ROOT}/tmp/", exist_ok=True)
+        return f"{MODEL_ROOT}/tmp/{hash_name}{extension}"
+
+    def update_tracker_params(self, yaml_path: str, prefix: str) -> Union[str, None]:
+        """Update tracker parameters in the yaml file with the attributes from the ControlTag,
+        e.g. <VideoRectangle model_tracker="bytetrack" bytetrack_max_age="10" bytetrack_min_hits="3" />
+        or <VideoRectangle model_tracker="botsort" botsort_max_age="10" botsort_min_hits="3" />
+        Args:
+            yaml_path: Path to the original yaml file.
+            prefix: Prefix for attributes of control tag to extract
+        Returns:
+            The file path for new yaml with updated parameters, or None if there are no custom parameters
+        """
+        # without custom (prefixed) parameters in the labeling config there is nothing to write
+        if not any(name.startswith(prefix) for name, _ in self.control.attr.items()):
+            return None
+
+        # Load the original yaml file
+        with open(yaml_path, "r") as file:
+            config = yaml.safe_load(file)
+
+        # Extract parameters with prefix from ControlTag
+        for attr_name, attr_value in self.control.attr.items():
+            if attr_name.startswith(prefix):
+                # Remove prefix and update the corresponding yaml key
+                key = attr_name[len(prefix) :]
+
+                # Convert value to the type of the yaml default (bool first: bool is a subclass of int)
+                if isinstance(config[key], bool):
+                    attr_value = attr_value.lower() == "true"
+                elif isinstance(config[key], int):
+                    attr_value = int(attr_value)
+                elif isinstance(config[key], float):
+                    attr_value = float(attr_value)
+
+                config[key] = attr_value
+
+        # Generate a new filename with a random hash
+        new_yaml_filename = self.generate_hash_filename()
+
+        # Save the updated config to a new yaml file
+        with open(new_yaml_filename, "w") as file:
+            yaml.dump(config, file)
+
+        return new_yaml_filename
+
+
+# Module-level side effect (runs on import): pre-load and cache the default model of VideoRectangleModel
+VideoRectangleModel.get_cached_model(VideoRectangleModel.model_path)
diff --git a/multi_object_tracking/yolo_sam/docker-compose.yml b/multi_object_tracking/yolo_sam/docker-compose.yml
new file mode 100644
index 000000000..f69ace33a
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/docker-compose.yml
@@ -0,0 +1,50 @@
+version: "3.8"
+
+services:
+  MOT_yolo_sam:
+    container_name: MOT_yolo_sam
+    image: humansignal/mot_yolo_sam:v0  # image repository names must be lowercase
+    build:
+      context: .
+      args:
+        TEST_ENV: ${TEST_ENV}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            -   driver: nvidia
+                count: 1
+                capabilities: [ gpu ]
+    environment:
+      # specify these parameters if you want to use basic auth for the model server
+      - BASIC_AUTH_USER=
+      - BASIC_AUTH_PASS=
+      # set the log level for the model server
+      - LOG_LEVEL=DEBUG
+      # any other parameters that you want to pass to the model server
+      - ANY=PARAMETER
+      # specify the number of workers and threads for the model server
+      - WORKERS=1
+      - THREADS=8
+      # specify the model directory (likely you don't need to change this)
+      - MODEL_DIR=/data/models
+      # specify device
+      - DEVICE=cuda  # or 'cpu' (coming soon)
+      # SAM2 model config
+      - MODEL_CONFIG=./configs/sam2.1/sam2.1_hiera_l.yaml
+      # SAM2 checkpoint
+      - MODEL_CHECKPOINT=sam2.1_hiera_large.pt
+      - MAX_FRAMES_TO_TRACK=2000
+      # Specify the Label Studio URL and API key to access
+      # uploaded, local storage and cloud storage files.
+      # Do not use 'localhost' as it does not work within Docker containers.
+      # Use prefix 'http://' or 'https://' for the URL always.
+      # Determine the actual IP using 'ifconfig' (Linux/Mac) or 'ipconfig' (Windows); set both values in the host env or .env, never commit a real API key.
+      - LABEL_STUDIO_URL=${LABEL_STUDIO_URL:-}
+      - LABEL_STUDIO_API_KEY=${LABEL_STUDIO_API_KEY:-}
+    ports:
+      - "9090:9090"
+    volumes:
+      - "./data/server:/data"
+      - ./models:/app/models  # Mount the local 'models' directory
+      - "./cache_dir:/app/cache_dir"
diff --git a/multi_object_tracking/yolo_sam/model.py b/multi_object_tracking/yolo_sam/model.py
new file mode 100644
index 000000000..c67544401
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/model.py
@@ -0,0 +1,607 @@
+import os
+import pathlib
+import tempfile
+import logging
+import json
+from typing import List, Dict, Optional, Literal, cast
+import sys
+
+import cv2
+import torch
+import numpy as np
+import requests
+from label_studio_ml.model import LabelStudioMLBase
+from label_studio_ml.response import ModelResponse
+from label_studio_sdk._extensions.label_studio_tools.core.utils.io import get_local_path
+from label_studio_sdk.label_interface.objects import PredictionValue
+# from PIL import Image
+from collections import defaultdict
+from label_studio_sdk.client import LabelStudio
+
+# YOLO imports:
+from control_models.base import ControlModel
+from control_models.choices import ChoicesModel
+from control_models.rectangle_labels import RectangleLabelsModel
+from control_models.rectangle_labels_obb import RectangleLabelsObbModel
+from control_models.polygon_labels import PolygonLabelsModel
+from control_models.keypoint_labels import KeypointLabelsModel
+from control_models.video_rectangle import VideoRectangleModel
+from control_models.timeline_labels import TimelineLabelsModel
+from typing import List, Dict, Optional
+
+# Control model classes that detect_control_models() tries, in list order, against each control tag
+available_model_classes = [
+    ChoicesModel,
+    RectangleLabelsModel,
+    RectangleLabelsObbModel,
+    PolygonLabelsModel,
+    KeypointLabelsModel,
+    VideoRectangleModel,
+    TimelineLabelsModel,
+]
+
+
+# read the environment variables and set the paths just before importing the sam2 module
+SEGMENT_ANYTHING_2_REPO_PATH = os.getenv('SEGMENT_ANYTHING_2_REPO_PATH', 'sam2')
+sys.path.append(SEGMENT_ANYTHING_2_REPO_PATH)
+from sam2.build_sam import build_sam2, build_sam2_video_predictor
+
+logger = logging.getLogger(__name__)
+
+DEVICE = os.getenv('DEVICE', 'cuda')
+MODEL_CONFIG = os.getenv('MODEL_CONFIG', './configs/sam2.1/sam2.1_hiera_l.yaml')
+MODEL_CHECKPOINT = os.getenv('MODEL_CHECKPOINT', 'sam2.1_hiera_large.pt')
+MAX_FRAMES_TO_TRACK = int(os.getenv('MAX_FRAMES_TO_TRACK', 400))
+PROMPT_TYPE = cast(Literal["box", "point"], os.getenv('PROMPT_TYPE', 'box'))
+ANNOTATION_WORKAROUND = os.getenv('ANNOTATION_WORKAROUND', False)
+DEBUG = os.getenv('DEBUG', False)
+LABEL_STUDIO_API_KEY = os.getenv('LABEL_STUDIO_API_KEY', '')
+
+if DEVICE == 'cuda':
+    # use bfloat16 for the entire notebook
+    torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
+
+    if torch.cuda.get_device_properties(0).major >= 8:
+        # turn on tfloat32 for Ampere GPUs (https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)
+        torch.backends.cuda.matmul.allow_tf32 = True
+        torch.backends.cudnn.allow_tf32 = True
+
+
+# build path to the model checkpoint
+sam2_checkpoint = str(pathlib.Path(__file__).parent / SEGMENT_ANYTHING_2_REPO_PATH / "checkpoints" / MODEL_CHECKPOINT)
+logger.debug(f'Model checkpoint: {sam2_checkpoint}')
+logger.debug(f'Model config: {MODEL_CONFIG}')
+predictor = build_sam2_video_predictor(MODEL_CONFIG, sam2_checkpoint)
+
+
+# manage cache for inference state
+# TODO: make it process-safe and implement cache invalidation
+_predictor_state_key = ''
+_inference_state = None
+
+def get_inference_state(video_dir):
+    global _predictor_state_key, _inference_state
+    if _predictor_state_key != video_dir:
+        _predictor_state_key = video_dir
+        _inference_state = predictor.init_state(video_path=video_dir)
+    return _inference_state
+
+class YOLO_SAM(LabelStudioMLBase):
+    """Custom ML Backend model
+    """
+
+    def setup(self) -> None:
+        """Store "yolo_sam" as the `model_version` value of this backend."""
+        self.set("model_version", "yolo_sam")
+
+
+    def detect_control_models(self) -> List[ControlModel]:
+        """Detect control models based on the labeling config.
+        Control models are used to predict regions for different control tags in the labeling config.
+        """
+        control_models = []
+
+        for control in self.label_interface.controls:
+            # skipping tags without toName
+            if not control.to_name:
+                logger.warning(
+                    f'{control.tag} {control.name} has no "toName" attribute, skipping it'
+                )
+                continue
+
+            # match control tag with available control models
+            for model_class in available_model_classes:
+                if model_class.is_control_matched(control):
+                    instance = model_class.create(self, control)
+                    if not instance:
+                        logger.debug(
+                            f"No instance created for {control.tag} {control.name}"
+                        )
+                        continue
+                    if not instance.label_map:
+                        logger.error(
+                            f"No label map built for the '{control.tag}' control tag '{instance.from_name}'.\n"
+                            f"This indicates that your Label Studio config labels do not match the model's labels.\n"
+                            f"To fix this, ensure that the 'value' or 'predicted_values' attribute "
+                            f"in your Label Studio config matches one or more of these model labels.\n"
+                            f"If you don't want to use this control tag for predictions, "
+                            f'add `model_skip="true"` to it.\n'
+                            f"Examples:\n"
+                            f'  <Label value="Car"/>\n'
+                            f'  <Label value="YourLabel" predicted_values="label1,label2"/>\n'
+                            f"Labels provided in your labeling config:\n"
+                            f"  {str(control.labels_attrs)}\n"
+                            f"Available '{instance.model_path}' model labels:\n"
+                            f"  {list(instance.model.names.values())}"
+                        )
+                        continue
+
+                    control_models.append(instance)
+                    logger.debug(f"Control tag with model detected: {instance}")
+                    break
+
+        if not control_models:
+            control_tags = ", ".join([c.type for c in available_model_classes])
+            raise ValueError(
+                f"No suitable control tags (e.g. {control_tags} connected to Image or Video object tags) "
+                f"detected in the label config:\n{self.label_config}"
+            )
+
+        return control_models
+
+
+    def split_frames(self, video_path, temp_dir, start_frame=0, end_frame=100):
+        logger.debug(f'Opening video file: {video_path}')
+        video = cv2.VideoCapture(video_path)
+        fps = video.get(cv2.CAP_PROP_FPS)
+        frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT)
+        logger.debug(f'fps: {fps}, frame_count: {frame_count}')
+        duration = frame_count / fps
+        print(f'duration: {duration}')
+
+        if not video.isOpened():
+            raise ValueError(f"Could not open video file: {video_path}")
+
+        logger.debug(f'Number of frames: {int(video.get(cv2.CAP_PROP_FRAME_COUNT))}')
+
+        frame_count = 0
+        while True:
+            success, frame = video.read()
+
+            if not success:
+                logger.error(f'Failed to read frame {frame_count}')
+                # manage this (frame 57 of acutal video test)
+                # poi risovli il problema del label con diverse etichette
+                break
+
+            if frame_count < start_frame:
+                frame_count += 1
+                continue
+
+            if frame_count >= end_frame:
+                break
+
+            frame_filename = os.path.join(temp_dir, f'{frame_count:05d}.jpg')
+
+            if not os.path.exists(frame_filename):
+                cv2.imwrite(frame_filename, frame)
+
+            logger.debug(f'Frame {frame_count}: {frame_filename}')
+            yield frame_filename, frame
+            frame_count += 1
+
+        video.release()
+
+
+    def get_prompts(self, context) -> List[Dict]:
+        logger.debug(f'Extracting keypoints from context: {context}')
+        prompts = []
+        for ctx in context['result']:
+            # Process each video tracking object separately
+            obj_id = ctx['id']
+            for obj in ctx['value']['sequence']:
+                x = obj['x'] / 100
+                y = obj['y'] / 100
+                box_width = obj['width'] / 100
+                box_height = obj['height'] / 100
+                frame_idx = obj['frame'] - 1
+
+                if PROMPT_TYPE == 'point':
+                    # SAM2 video works with keypoints - convert the rectangle to the set of keypoints within the rectangle
+                    # bbox (x, y) is top-left corner
+                    kps = [
+                        # center of the bbox
+                        [x + box_width / 2, y + box_height / 2],
+                        # half of the bbox width to the left
+                        [x + box_width / 4, y + box_height / 2],
+                        # half of the bbox width to the right
+                        [x + 3 * box_width / 4, y + box_height / 2],
+                        # half of the bbox height to the top
+                        [x + box_width / 2, y + box_height / 4],
+                        # half of the bbox height to the bottom
+                        [x + box_width / 2, y + 3 * box_height / 4]
+                    ]
+                elif PROMPT_TYPE == 'box':
+                    # SAM2 video works with boxes - use the rectangle inf xyxy format
+                    kps = [x, y, x + box_width, y + box_height]
+                else:
+                    raise ValueError(f'Invalid prompt type: {PROMPT_TYPE}')
+
+                points = np.array(kps, dtype=np.float32)
+                # labels are not used for box prompts
+                labels = np.array([1] * len(kps), dtype=np.int32) if PROMPT_TYPE == 'point' else None
+                prompts.append({
+                    'points': points,
+                    'labels': labels,
+                    'frame_idx': frame_idx,
+                    'obj_id': obj_id
+                })
+
+        return prompts
+
+
+    def _get_fps(self, context):
+        # get the fps from the context
+        frames_count = context['result'][0]['value']['framesCount']
+        duration = context['result'][0]['value']['duration']
+        return frames_count, duration
+
+    # def convert_mask_to_bbox(self, mask):
+    #     # convert mask to bbox
+    #     h, w = mask.shape[-2:]
+    #     mask_int = mask.reshape(h, w, 1).astype(np.uint8)
+    #     contours, _ = cv2.findContours(mask_int, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+    #     if len(contours) == 0:
+    #         return None
+    #     x, y, w, h = cv2.boundingRect(contours[0])
+    #     return {
+    #         'x': x,
+    #         'y': y,
+    #         'width': w,
+    #         'height': h
+    #     }
+
+    def convert_mask_to_bbox(self, mask):
+        # squeeze
+        mask = mask.squeeze()
+
+        y_indices, x_indices = np.where(mask == 1)
+        if len(x_indices) == 0 or len(y_indices) == 0:
+            return None
+
+        # Find the min and max indices
+        xmin, xmax = np.min(x_indices), np.max(x_indices)
+        ymin, ymax = np.min(y_indices), np.max(y_indices)
+
+        # Get mask dimensions
+        height, width = mask.shape
+
+        # Calculate bounding box dimensions
+        box_width = xmax - xmin + 1
+        box_height = ymax - ymin + 1
+
+        # Normalize and scale to percentage
+        x_pct = (xmin / width) * 100
+        y_pct = (ymin / height) * 100
+        width_pct = (box_width / width) * 100
+        height_pct = (box_height / height) * 100
+
+        return {
+            "x": round(x_pct, 2),
+            "y": round(y_pct, 2),
+            "width": round(width_pct, 2),
+            "height": round(height_pct, 2)
+        }
+
+
+    def dump_image_with_mask(self, frame, mask, output_file, obj_id=None, random_color=False):
+        from matplotlib import pyplot as plt
+        if random_color:
+            color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
+        else:
+            cmap = plt.get_cmap("tab10")
+            cmap_idx = 0 if obj_id is None else obj_id
+            color = np.array([*cmap(cmap_idx)[:3], 0.6])
+        h, w = mask.shape[-2:]
+        mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
+
+        # create an image file to display image overlayed with mask
+        mask_image = (mask_image * 255).astype(np.uint8)
+        mask_image = cv2.cvtColor(mask_image, cv2.COLOR_BGRA2BGR)
+        mask_image = cv2.addWeighted(frame, 1.0, mask_image, 0.8, 0)
+        logger.debug(f'Shapes: frame={frame.shape}, mask={mask.shape}, mask_image={mask_image.shape}')
+        # save in file
+        logger.debug(f'Saving image with mask to {output_file}')
+        cv2.imwrite(output_file, mask_image)
+
+    def predict(
+            self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs
+    ) -> ModelResponse:
+        """Run YOLO predictions on the tasks
+        :param tasks: [Label Studio tasks in JSON format](https://labelstud.io/guide/task_format.html)
+        :param context: [Label Studio context in JSON format](https://labelstud.io/guide/ml_create)
+        :return model_response
+            ModelResponse(predictions=predictions) with
+            predictions [Predictions array in JSON format]
+            (https://labelstud.io/guide/export.html#Label-Studio-JSON-format-of-annotated-tasks)
+        """
+        logger.info(
+            f"Run prediction on {len(tasks)} tasks, project ID = {self.project_id}"
+        )
+        control_models = self.detect_control_models()
+
+        predictions = []
+        for task in tasks:
+
+            regions = []
+            for model in control_models:
+                path = model.get_path(task)
+                regions += model.predict_regions(path)
+
+            # calculate final score
+            all_scores = [region["score"] for region in regions if "score" in region]
+            avg_score = sum(all_scores) / max(len(all_scores), 1)
+
+            # compose final prediction
+            prediction = {
+                "result": regions,
+                "score": avg_score,
+                "model_version": self.model_version,
+            }
+            predictions.append(prediction)
+
+        return ModelResponse(predictions=predictions)
+
+    def sam_predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -> ModelResponse:
+        """
+        Returns the predicted mask for a smart keypoint that has been placed.
+
+        This function is responsible for processing video annotation tasks and predicting the mask of an object for a given video frame. It uses Label Studio context and draft data to determine the bounding boxes or keypoints that need to be predicted. The prediction is performed using a video tracking model, which processes multiple frames to create a coherent annotation for the target object across a sequence of video frames.
+
+        For multi-object tracking, it is necessary to refer to the drafts instead of the context because the context contains only the data of the box that was most recently modified.
+
+        The logic is as follows: each time the model is called, the prediction starts from the frame containing the last label of the object that appears the earliest in the video. By calling the model multiple times, the prediction is always performed moving forward.
+
+        Steps involved in the process:
+        1. Extract the relevant data from `tasks` and `context` to determine the prompts for the model.
+        2. Cache the video locally and extract relevant frames using `split_frames`.
+        3. Use the prompts to guide the model in identifying and tracking the object of interest.
+        4. Generate a mask for each frame where the object is detected and track the object through subsequent frames.
+        5. Propagate the detected objects through the video sequence to refine annotations and maintain consistency.
+        6. Create or update the annotation in Label Studio to provide feedback to the user.
+
+        Args:
+            tasks (List[Dict]): List of tasks that need annotation.
+            context (Optional[Dict]): Additional information about the current annotation context.
+            kwargs: Optional additional arguments.
+
+        Returns:
+            ModelResponse: Response containing predicted annotations for the video frames.
+        """
+        from_name, to_name, value = self.get_first_tag_occurence('VideoRectangle', 'Video')
+        try:
+            drafts = tasks[0]['drafts'][0]
+        except IndexError:
+            logger.error('Drafts not found, using annotations')
+            try:
+                drafts = tasks[0]['annotations'][0]
+            except IndexError:
+                logger.error('Annotations not found, using context')
+                drafts = context
+        if not len(drafts):
+            logger.info('Draft empty, using context')
+            drafts = context
+        task = tasks[0]
+        task_id = task['id']
+        # Get the video URL from the task
+        video_url = task['data'][value]
+
+        # cache the video locally
+        video_path = get_local_path(video_url, task_id=task_id)
+        logger.debug(f'Video path: {video_path}')
+
+        # get prompts from context
+        # prompts = self.get_prompts(context)
+        prompts = self.get_prompts(drafts)
+
+        context_ids = set([ctx['id'] for ctx in context['result']])
+        all_obj_ids = set([p['id'] for p in drafts['result']] +
+                          ([p['id'] for p in tasks[0]['annotations'][0]['result']] if len(tasks[0]['annotations']) else []))
+        if not context_ids.issubset( all_obj_ids):
+            # Returning here because the case where object ids in the context do not match the ids found in the annotations is not supported.
+            # This remains an open issue but is not considered a substantial problem.
+            raise NotImplementedError(f'Context id {context_ids} not found in drafts result: {all_obj_ids}'
+                                      f'TODO merge context and drafts')
+
+        # create a map from obj_id to integer
+        obj_ids = {obj_id: i for i, obj_id in enumerate(all_obj_ids)}
+        # find the last frame index
+        # if there is only one object, use the last frame of the object: continue tracking from last tracked frame
+        # if there are multiple objects, use the smallest frame index of all objects
+        if len(all_obj_ids) == 1:
+            first_frame_idx = min(p['frame_idx'] for p in prompts) if prompts else 0
+            last_frame_idx = max(p['frame_idx'] for p in prompts) if prompts else 0
+        else:
+            first_frame_idx = min(p['frame_idx'] for p in prompts) if prompts else 0
+            # the minimum of the maximum frame_idx of all objects grouped by id
+            last_frame_idx = min(max(p['frame_idx'] for p in prompts if p['obj_id'] == obj_id) for obj_id in all_obj_ids)
+        frames_count, duration = self._get_fps(context)
+        fps = frames_count / duration
+
+        logger.debug(
+            f'Number of prompts: {len(prompts)}, '
+            f'first frame index: {first_frame_idx}, '
+            f'last frame index: {last_frame_idx}, '
+            f'obj_ids: {obj_ids}')
+
+        frames_to_track = min(MAX_FRAMES_TO_TRACK, frames_count - last_frame_idx)
+
+        # Split the video into frames
+        with tempfile.TemporaryDirectory() as temp_dir:
+
+            # # use persisted dir for debug
+            # temp_dir = '/tmp/frames'
+            # os.makedirs(temp_dir, exist_ok=True)
+
+            # get all frames
+            frames = list(self.split_frames(
+                video_path, temp_dir,
+                start_frame=first_frame_idx,
+                end_frame=last_frame_idx + frames_to_track
+            ))
+            height, width, _ = frames[0][1].shape
+            logger.debug(f'Video width={width}, height={height}')
+
+            # get inference state
+            inference_state = get_inference_state(temp_dir)
+            predictor.reset_state(inference_state)
+
+            # Group prompts by 'obj_id' and sort them by 'frame_idx' in one step
+            prompt_id_dict = defaultdict(list)
+            [prompt_id_dict[prompt['obj_id']].append(prompt) for prompt in prompts]
+
+            # Sort the prompts and extract the highest frame index for each object ID
+            highest_frames = [sorted(prompts, key=lambda x: x['frame_idx'])[-1]['frame_idx'] for prompts in
+                              prompt_id_dict.values() if prompts]
+
+            # Get the minimum value of the highest frame indices
+            prompt_idx = min(highest_frames) if highest_frames else None
+
+            for prompt in prompts:
+
+                frame_idx = prompt['frame_idx'] - first_frame_idx
+                # sam 2 not predict other frame if are present prompts after the frame: the prompt must be set in the same frame for each object
+                if frame_idx > prompt_idx:
+                    logger.warning(f'Prompt frame index {frame_idx} is out of bounds')
+                    continue
+
+
+                if PROMPT_TYPE == 'point':
+                    # multiply points by the frame size
+                    prompt['points'][:, 0] *= width
+                    prompt['points'][:, 1] *= height
+                    _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
+                        inference_state=inference_state,
+                        frame_idx=frame_idx,
+                        obj_id=obj_ids[prompt['obj_id']],
+                        points=prompt['points'],
+                        labels=prompt['labels']
+                    )
+                elif PROMPT_TYPE == 'box':
+                    # multiply points by the frame size
+                    prompt['points'][0] *= width
+                    prompt['points'][1] *= height
+                    prompt['points'][2] *= width
+                    prompt['points'][3] *= height
+                    _, out_obj_ids, out_mask_logits = predictor.add_new_points_or_box(
+                        inference_state=inference_state,
+                        frame_idx=frame_idx,
+                        obj_id=obj_ids[prompt['obj_id']],
+                        box=prompt['points'],
+                    )
+            if DEBUG:
+                debug_dir = './debug-frames'
+                os.makedirs(debug_dir, exist_ok=True)
+
+            sequences = dict()
+            logger.info(f'Propagating in video from frame {last_frame_idx} to {last_frame_idx + frames_to_track}')
+            for out_frame_idx, out_obj_ids, out_mask_logits in predictor.propagate_in_video(
+                    inference_state=inference_state,
+                    start_frame_idx=last_frame_idx,
+                    max_frame_num_to_track=frames_to_track
+            ):
+                real_frame_idx = out_frame_idx + first_frame_idx
+                for i, out_obj_id in enumerate(out_obj_ids):
+                    mask = (out_mask_logits[i] > 0.0).cpu().numpy()
+
+                    if DEBUG:
+
+                        # to debug, save the mask as an image
+                        self.dump_image_with_mask(frames[out_frame_idx][1], mask, f'{debug_dir}/{out_frame_idx:05d}_{out_obj_id}.jpg', obj_id=out_obj_id, random_color=True)
+
+                    bbox = self.convert_mask_to_bbox(mask)
+                    if bbox:
+                        obj_id = next((k for k, v in obj_ids.items() if v == out_obj_id), None)
+                        sequences[obj_id] = sequences.get(obj_id, [])
+                        sequences[obj_id].append({
+                            'frame': real_frame_idx + 1,
+                            # 'x': bbox['x'] / width * 100,
+                            # 'y': bbox['y'] / height * 100,
+                            # 'width': bbox['width'] / width * 100,
+                            # 'height': bbox['height'] / height * 100,
+                            'x': bbox['x'],
+                            'y': bbox['y'],
+                            'width': bbox['width'],
+                            'height': bbox['height'],
+                            'enabled': True,
+                            'rotation': 0,
+                            'time': out_frame_idx / fps
+                        })
+            result = []
+            for obj_id in all_obj_ids:
+                # find the context to use by searching on drafts by obj_id
+                context_result_sequence = next((ctx['value']['sequence'] for ctx in drafts["result"] if ctx['id'] == obj_id), [])
+                # take the old sequence only for the frames before the first frame of the new sequence
+                # and after the last frame of the new sequence
+                new_sequence = [s for s in context_result_sequence if s['frame'] < sequences[obj_id][0]['frame']] + \
+                               sequences[obj_id] + \
+                               [s for s in context_result_sequence if s['frame'] >= sequences[obj_id][-1]['frame']]
+                # take the old labels: take from context if present, otherwise from drafts
+                labels = next((ctx['value'].get('labels', None) for ctx in context["result"] if ctx['id'] == obj_id), None) or \
+                         next((ctx['value'].get('labels', None) for ctx in drafts["result"] if ctx['id'] == obj_id), None)
+                result.append({
+                    'value': {
+                        'framesCount': frames_count,
+                        'duration': duration,
+                        'sequence': new_sequence,
+                        'labels': labels if labels else []
+                    },
+                    'from_name': 'box',
+                    'to_name': 'video',
+                    'type': 'videorectangle',
+                    'origin': 'manual',
+                    'id': obj_id
+                })
+
+
+            prediction = PredictionValue(
+                model_version=MODEL_CHECKPOINT,
+                score=1.0,
+                result=result
+            )
+            logger.debug(f'Prediction: {prediction.model_dump()}')
+            if DEBUG:
+                with open('prediction.json', 'w') as f:
+                    json.dump(prediction.model_dump(), f)
+
+            if ANNOTATION_WORKAROUND:
+                # this is a workaround to update the annotation in the Label Studio since using the model response shows all the objects with the same label
+                # also if the label is different for each object
+                client = LabelStudio(
+                    api_key=LABEL_STUDIO_API_KEY,
+                )
+                if len(tasks[0]['annotations']) == 0:
+                    logger.debug('Creating new annotation')
+                    ann = client.annotations.create(
+                        id=task_id,
+                        result=result,
+                        task=tasks[0]['id'],
+                        project=tasks[0]['project']
+                    )
+                    client.annotations.get(id=ann.id)
+                else:
+                    logger.debug(f'Updating annotation: {tasks[0]["annotations"][0]["id"]}')
+                    ann = client.annotations.update(
+                        id=tasks[0]['annotations'][0]['id'],
+                        result=result,
+                        task=task_id,
+                        project=tasks[0]['project']
+                    ) # perche se non lo faccio nella UI mette tutti gli oggetti con la stessa label! sempre!
+                # convert annotation to draft making POST request to http://<IP>/api/annotations/{id}/convert-to-draft
+                url = f'{os.getenv("LABEL_STUDIO_URL")}/api/annotations/{ann.id}/convert-to-draft'
+                headers = {
+                    'Authorization': f'Token {os.getenv("LABEL_STUDIO_API_KEY")}'
+                }
+                response = requests.post(url, headers=headers)
+            # raise NotImplementedError('Stop here')
+            return ModelResponse(predictions=[prediction])
diff --git a/multi_object_tracking/yolo_sam/models/README.md b/multi_object_tracking/yolo_sam/models/README.md
new file mode 100644
index 000000000..fe9d093d1
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/models/README.md
@@ -0,0 +1 @@
+Put your YOLO models here.
\ No newline at end of file
diff --git a/multi_object_tracking/yolo_sam/models/botsort.yaml b/multi_object_tracking/yolo_sam/models/botsort.yaml
new file mode 100644
index 000000000..2ec6e3150
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/models/botsort.yaml
@@ -0,0 +1,19 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Default YOLO tracker settings for BoT-SORT tracker https://github.com/NirAharon/BoT-SORT
+# This file is taken from https://github.com/ultralytics/ultralytics/tree/main/ultralytics/cfg/trackers
+
+tracker_type: botsort # tracker type, ['botsort', 'bytetrack']
+track_high_thresh: 0.5 # threshold for the first association
+track_low_thresh: 0.1 # threshold for the second association
+new_track_thresh: 0.6 # threshold for init new track if the detection does not match any tracks
+track_buffer: 30 # buffer to calculate the time when to remove tracks
+match_thresh: 0.8 # threshold for matching tracks
+fuse_score: True # Whether to fuse confidence scores with the iou distances before matching
+# min_box_area: 10  # threshold for min box areas(for tracker evaluation, not used for now)
+
+# BoT-SORT settings
+gmc_method: sparseOptFlow # method of global motion compensation
+# ReID model related thresh (not supported yet)
+proximity_thresh: 0.5
+appearance_thresh: 0.25
+with_reid: False
\ No newline at end of file
diff --git a/multi_object_tracking/yolo_sam/models/bytetrack.yaml b/multi_object_tracking/yolo_sam/models/bytetrack.yaml
new file mode 100644
index 000000000..3bed2182a
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/models/bytetrack.yaml
@@ -0,0 +1,12 @@
+# Ultralytics YOLO 🚀, AGPL-3.0 license
+# Default YOLO tracker settings for ByteTrack tracker https://github.com/ifzhang/ByteTrack
+# This file is taken from https://github.com/ultralytics/ultralytics/tree/main/ultralytics/cfg/trackers
+
+tracker_type: bytetrack # tracker type, ['botsort', 'bytetrack']
+track_high_thresh: 0.5 # threshold for the first association
+track_low_thresh: 0.1 # threshold for the second association
+new_track_thresh: 0.6 # threshold for init new track if the detection does not match any tracks
+track_buffer: 30 # buffer to calculate the time when to remove tracks
+match_thresh: 0.8 # threshold for matching tracks
+fuse_score: True # Whether to fuse confidence scores with the iou distances before matching
+# min_box_area: 10  # threshold for min box areas(for tracker evaluation, not used for now)
diff --git a/multi_object_tracking/yolo_sam/models/yolo.yaml b/multi_object_tracking/yolo_sam/models/yolo.yaml
new file mode 100644
index 000000000..95225e6fe
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/models/yolo.yaml
@@ -0,0 +1,4 @@
+# Inference configuration for YOLO
+conf: 0.5         # Confidence threshold
+iou: 0.45         # IoU threshold for NMS
+max_det: 50       # Maximum number of detections
\ No newline at end of file
diff --git a/multi_object_tracking/yolo_sam/requirements-base.txt b/multi_object_tracking/yolo_sam/requirements-base.txt
new file mode 100644
index 000000000..68ce357c7
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/requirements-base.txt
@@ -0,0 +1,2 @@
+gunicorn==22.0.0
+label-studio-ml @ git+https://github.com/HumanSignal/label-studio-ml-backend.git
\ No newline at end of file
diff --git a/multi_object_tracking/yolo_sam/requirements-test.txt b/multi_object_tracking/yolo_sam/requirements-test.txt
new file mode 100644
index 000000000..cffeec658
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/requirements-test.txt
@@ -0,0 +1,2 @@
+pytest
+pytest-cov
\ No newline at end of file
diff --git a/multi_object_tracking/yolo_sam/requirements.txt b/multi_object_tracking/yolo_sam/requirements.txt
new file mode 100644
index 000000000..eb032f3bc
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/requirements.txt
@@ -0,0 +1,5 @@
+ultralytics~=8.2.76
+tqdm
+torchmetrics~=1.4.2
+opencv-python
+cuda-python
\ No newline at end of file
diff --git a/multi_object_tracking/yolo_sam/test_api.py b/multi_object_tracking/yolo_sam/test_api.py
new file mode 100644
index 000000000..ca7767be1
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/test_api.py
@@ -0,0 +1,47 @@
+"""
+This file contains tests for the API of your model. You can run these tests by installing test requirements:
+
+    ```bash
+    pip install -r requirements-test.txt
+    ```
+Then execute `pytest` in the directory of this file.
+
+- Change `NewModel` to the name of the class in your model.py file.
+- Change the `request` and `expected_response` variables to match the input and output of your model.
+"""
+
+import pytest
+import json
+from model import NewModel
+
+
+@pytest.fixture
+def client():  # pytest fixture: yields a test client for the app built around NewModel
+    from _wsgi import init_app  # imported inside the fixture, i.e. only when it is used
+    app = init_app(model_class=NewModel)
+    app.config['TESTING'] = True
+    with app.test_client() as client:
+        yield client  # the `with` block is left only after the requesting test is done
+
+
+def test_predict(client):  # POSTs one task to /predict and checks status code and body
+    request = {
+        'tasks': [{
+            'data': {
+                # Your input test data here
+            }
+        }],
+        # Your labeling configuration here
+        'label_config': '<View></View>'
+    }
+    # NOTE(review): request data and expected result are still template placeholders
+    expected_response = {
+        'results': [{
+            # Your expected result here
+        }]
+    }
+    # Send the request as JSON, then compare the decoded body with the expectation
+    response = client.post('/predict', data=json.dumps(request), content_type='application/json')
+    assert response.status_code == 200
+    response = json.loads(response.data)
+    assert response == expected_response
diff --git a/multi_object_tracking/yolo_sam/utils/__init__.py b/multi_object_tracking/yolo_sam/utils/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/multi_object_tracking/yolo_sam/utils/converter.py b/multi_object_tracking/yolo_sam/utils/converter.py
new file mode 100644
index 000000000..554187eb4
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/utils/converter.py
@@ -0,0 +1,140 @@
+import numpy as np
+
+from typing import List, Dict
+
+
+def get_label_map(labels: List[str]) -> Dict:
+    """Build a name-to-index lookup for the given labels.
+    Args:
+        labels: List of label names
+    Returns:
+        label_map: Dictionary mapping each label name to its index in the sorted label list
+    """
+    ordered = sorted(labels)
+    return dict(zip(ordered, range(len(ordered))))
+
+
+def convert_timelinelabels_to_probs(
+    regions: List[Dict], label_map: Dict[str, int], max_frame=None
+) -> (np.ndarray, set):
+    """Generate a numpy array with shape (num_frames, num_labels) from timeline regions.
+
+    Frame numbers in `ranges` are 1-based and inclusive: a range with start=1, end=3
+    sets rows 0, 1 and 2 of the array.
+    Every range of a region is applied to every label of that region.
+    Args:
+        regions: List of timeline regions from annotation
+        label_map: Dictionary mapping label names to indices
+        max_frame: Maximum frame number in video; when None, the largest range
+            "end" found in `regions` is used (0 if there are no ranges)
+    Returns:
+        labels_array: Integer numpy array with shape (num_frames, num_labels)
+        used_labels: Set of label names that were used in the regions
+    Raises:
+        KeyError: If a region uses a label that is missing from `label_map`
+    """
+    # Step 1: Identify all unique labels that appear in the regions
+    used_labels = set()
+    for region in regions:
+        used_labels.update(region["value"]["timelinelabels"])
+
+    # Step 2: Find the maximum frame index to define the array's X-axis size
+    if max_frame is None:
+        max_frame = 0
+        for region in regions:
+            for r in region["value"]["ranges"]:
+                max_frame = max(max_frame, r["end"])
+
+    # Step 3: Create a zero-filled array (no label assigned to any frame yet)
+    labels_array = np.zeros((max_frame, len(label_map)), dtype=int)
+
+    # Step 4: Mark each range of each region for all of the region's labels
+    # (handling only the first range/label would silently drop the others)
+    for region in regions:
+        columns = [label_map[name] for name in region["value"]["timelinelabels"]]
+        for r in region["value"]["ranges"]:
+            # Rows beyond max_frame are clipped by numpy slicing
+            labels_array[r["start"] - 1 : r["end"], columns] = 1
+
+    return labels_array, used_labels
+
+
+def convert_probs_to_timelinelabels(
+    probs, label_mapping, from_name, score_threshold=0.5
+) -> List[Dict]:
+    """
+    Turn per-frame label probabilities into Label Studio timeline regions.
+
+    Every maximal run of consecutive frames on which a label's probability is at or
+    above the threshold becomes one region, scored with the mean probability of the run.
+
+    Args:
+    - probs: 2D numpy array or tensor of probabilities (shape: [num_frames, num_labels])
+    - label_mapping: dict mapping label names to indices in the probs array
+    - from_name: name of the control tag in the Label Studio configuration
+    - score_threshold: a label is active on a frame when its probability is >= this value
+
+    Returns:
+    - regions: List of regions in Label Studio format; frame ranges are 1-based, inclusive
+    """
+    regions, next_idx = [], 0
+    # One mutable record per label; an empty dict means "no run currently open"
+    open_runs = {name: {} for name in label_mapping}
+
+    num_frames = len(probs)
+    if num_frames == 0:
+        return regions
+
+    def close(run, last_frame):
+        # Finish a run at 1-based frame `last_frame` and emit its region
+        run["end"] = last_frame
+        run["score"] /= last_frame - (run["start"] - 1)  # summed score -> mean
+        regions.append(create_timeline_region(**run))
+
+    for frame_no in range(1, num_frames + 1):
+        frame_probs = probs[frame_no - 1]
+        for name, column in label_mapping.items():
+            prob = frame_probs[column]
+            run = open_runs[name]
+
+            if not prob >= score_threshold:
+                # Below threshold: an open run ended on the previous frame
+                if run:
+                    close(run, frame_no - 1)
+                    run.clear()
+                continue
+
+            if run:
+                # Run continues: accumulate the probability for the final mean
+                run["score"] += float(prob)
+                continue
+
+            # Threshold reached with no open run: start a new one on this frame
+            run.update(
+                idx=next_idx, start=frame_no, label=name, score=float(prob), from_name=from_name
+            )
+            next_idx += 1
+
+    # Runs still open after the last frame extend to the end of the video
+    for run in open_runs.values():
+        if run:
+            close(run, num_frames)
+
+    return regions
+
+
+def create_timeline_region(idx, start, end, label, score, from_name):
+    """
+    Build a single Label Studio `timelinelabels` region dict for one frame range.
+    """
+    frame_range = {"start": start, "end": end}
+    region = {
+        "id": f"{idx}_{start}_{end}",
+        "type": "timelinelabels",
+        "value": {"ranges": [frame_range], "timelinelabels": [label]},
+    }
+    # Target tag is fixed to "video"; customize here if the config names it differently
+    region["to_name"] = "video"
+    region["from_name"] = from_name
+    region["score"] = score
+    return region
diff --git a/multi_object_tracking/yolo_sam/utils/neural_nets.py b/multi_object_tracking/yolo_sam/utils/neural_nets.py
new file mode 100644
index 000000000..3caf3ef4b
--- /dev/null
+++ b/multi_object_tracking/yolo_sam/utils/neural_nets.py
@@ -0,0 +1,339 @@
+import os
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import logging
+
+from torch.utils.data import DataLoader, TensorDataset
+from torch.nn.utils.rnn import pad_sequence
+from torchmetrics.classification import (
+    MultilabelPrecision,
+    MultilabelRecall,
+    MultilabelF1Score,
+    MultilabelAccuracy,
+)
+from typing import List, Union
+from joblib import Memory
+
+
+logger = logging.getLogger(__name__)
+memory = Memory("./cache_dir", verbose=0)  # Set up disk-based caching for model results
+_models = {}  # in-process cache: model file path -> BaseNN instance (see load_cached_model)
+
+
+@memory.cache(ignore=["yolo_model"])
+def cached_yolo_predict(yolo_model, video_path, cache_params):
+    """Run YOLO prediction over a video and cache the per-frame results using joblib.
+    Args:
+        yolo_model (YOLO): YOLO model instance, listed in the decorator's `ignore`
+        video_path (str): Path to the video file
+        cache_params (str): Not read in the body; it is only seen by @memory.cache
+    Returns:
+        list: One result object per frame, each with `orig_img` set to None
+    """
+    results = []
+    # stream=True returns a generator, so results are consumed one at a time
+    for result in yolo_model.predict(video_path, stream=True):
+        result.orig_img = None  # remove image from cache to reduce size
+        results.append(result)
+    return results
+
+
+@memory.cache(ignore=["yolo_model"])
+def cached_feature_extraction(yolo_model, video_path, cache_params):
+    """Capture the inputs of the YOLO model's last linear layer per frame, cached via joblib.
+    Args:
+        yolo_model (YOLO): YOLO model instance, listed in the decorator's `ignore`
+        video_path (str): Path to the video file
+        cache_params (str): Not read in the body; it is only seen by @memory.cache
+    Returns:
+        list: Per-frame results whose `probs` is replaced by the captured features
+            and whose `orig_img` is set to None
+    """
+    captured = [None]
+
+    def get_last_layer_output(module, input, output):
+        # Keep the layer *input*, i.e. the features fed into the final linear layer
+        captured[0] = input
+
+    # Register the hook on the last layer of the model
+    layer = yolo_model.model.model[-1].linear
+    hook_handle = layer.register_forward_hook(get_last_layer_output)
+    frames = []
+    try:
+        # Run model prediction, use stream to avoid out of memory
+        for frame in yolo_model.predict(video_path, stream=True):
+            frame.orig_img = None  # remove image from cache to reduce size
+            frame.probs = captured[0][0][0]  # => tensor, 1280 floats for yolov8n-cls
+            frames.append(frame)
+    finally:
+        hook_handle.remove()  # always detach the hook, even if prediction fails midway
+    return frames
+
+
+class BaseNN(nn.Module):
+    """Base torch module that holds a label map and can save/load itself as a whole object."""
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.label_map = None
+
+    def set_label_map(self, label_map):
+        self.label_map = label_map
+
+    def get_label_map(self):
+        return self.label_map
+
+    def save(self, path):
+        # ultralytics yolo11 patches torch.save to use dill, which leads to
+        # serialization errors, so dill is switched off when the patched signature is seen
+        dill_off = {"use_dill": False} if "use_dill" in torch.save.__code__.co_varnames else {}
+        torch.save(self, path, **dill_off)
+        logger.info(f"Model saved to {path}")
+
+    @classmethod
+    def load(cls, path) -> "BaseNN":
+        restored = torch.load(path)
+        restored.eval()  # Set the model to evaluation mode
+        logger.info(f"Model loaded from {path}")
+        return restored
+
+    @classmethod
+    def load_cached_model(cls, model_path: str) -> Union["BaseNN", None]:
+        """Return the model for `model_path` from `_models`, loading it on a miss; None if no file."""
+        if not os.path.exists(model_path):
+            return None
+        if model_path in _models:
+            return _models[model_path]
+
+        # First request for this path: load the per-project classifier and remember it
+        model = _models[model_path] = BaseNN.load(model_path)
+        return model
+
+    def save_and_cache(self, path):
+        self.save(path)
+        _models[path] = self
+
+
+class MultiLabelLSTM(BaseNN):
+    """Bidirectional LSTM that scores every label at every step of a feature sequence."""
+
+    def __init__(
+        self,
+        input_size,
+        output_size,
+        fc_size=128,
+        hidden_size=16,
+        num_layers=1,
+        sequence_size=16,
+        learning_rate=1e-4,
+        weight_decay=1e-5,
+        dropout_rate=0.2,
+        device=None,
+        **kwargs,
+    ):
+        """Initialize the MultiLabelLSTM model.
+        Args:
+            input_size (int): Number of features in the input data
+            output_size (int): Number of labels in the output data
+            fc_size (int): Size of the fully connected layer
+            hidden_size (int): Size of the hidden state in the LSTM
+            num_layers (int): Number of layers in the LSTM
+            sequence_size (int): Size of the input sequence, used for chunking
+            learning_rate (float): Learning rate for the optimizer
+            weight_decay (float): Weight decay for the optimizer for L2 regularization
+            dropout_rate (float): Dropout rate for the fully connected layer
+            device (torch.device): Device to run the model on (CPU or GPU)
+        """
+        super(MultiLabelLSTM, self).__init__()
+
+        self.input_size = input_size
+        self.output_size = output_size
+        self.fc_size = fc_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.sequence_size = sequence_size
+        self.learning_rate = learning_rate
+        self.weight_decay = weight_decay
+        self.dropout_rate = dropout_rate
+
+        # Reduce dimensionality of input data
+        self.fc_input = nn.Linear(input_size, fc_size)
+        self.layer_norm = nn.LayerNorm(fc_size)
+        self.dropout = nn.Dropout(self.dropout_rate)
+
+        # LSTM layer for handling sequential data
+        self.lstm = nn.LSTM(
+            fc_size, hidden_size, num_layers, batch_first=True, bidirectional=True
+        )
+
+        # Fully connected layer for classification at each time step
+        # 2 because of bidirectional LSTM
+        self.fc = nn.Linear(2 * hidden_size, output_size)
+
+        # Initialize the loss function and optimizer
+        # (binary cross-entropy on logits for multi-label classification)
+        self.criterion = nn.BCEWithLogitsLoss()
+        self.optimizer = optim.Adam(
+            self.parameters(), lr=learning_rate, weight_decay=weight_decay
+        )
+
+        # Initialize device (CPU or GPU)
+        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = device if device else target_device
+        self.to(self.device)
+
+    def forward(self, x):
+        """Map (batch_size, seq_len, input_size) features to per-step logits."""
+        # Reduce the dimensionality of the input data
+        x = torch.relu(self.fc_input(x))
+        x = self.layer_norm(x)
+        x = self.dropout(x)
+
+        # lstm_out contains outputs for all time steps
+        lstm_out, (_, _) = self.lstm(x)
+
+        # Apply fully connected layer at each time step to get output with final label number
+        out = self.fc(lstm_out)
+
+        # Output shape: (batch_size, seq_len, output_size); raw logits, no sigmoid applied
+        return out
+
+    def preprocess_sequence(self, sequence: List[torch.Tensor], labels=None, overlap=2):
+        """Cut a sequence (and its labels) into zero-padded chunks of `sequence_size` steps.
+        Consecutive chunks start `sequence_size // overlap` steps apart (at least 1).
+        Returns (chunks, labels) batched on dim 0; labels stays None when not given.
+        """
+        sequence = torch.stack(sequence) if isinstance(sequence, list) else sequence
+        size = self.sequence_size
+        step = max(1, size // overlap)  # a zero step would make range() raise
+
+        def split(items):
+            # Trailing chunks may be shorter; pad_sequence fills them with zeros
+            parts = [items[i : i + size] for i in range(0, len(items), step)]
+            return pad_sequence(parts, batch_first=True, padding_value=0)
+
+        chunks = split(sequence)
+        if labels is not None:
+            # as_tensor avoids the copy-construct warning when labels is already a tensor
+            labels = split(torch.as_tensor(labels, dtype=torch.float32))
+
+        return chunks, labels
+
+    def evaluate_metrics(self, dataloader, threshold=0.5):
+        """Compute macro-averaged multilabel metrics over all batches of `dataloader`.
+        Args:
+            dataloader: Iterable of (data, labels) batches, as built in `partial_fit`
+            threshold (float): Probability cut-off passed to the torchmetrics metrics
+        Returns:
+            dict: "precision", "recall", "f1_score" and "accuracy" as Python floats
+        """
+        self.eval()
+        params = {
+            "num_labels": self.output_size,
+            "average": "macro",
+            "threshold": threshold,
+            "zero_division": 1,
+        }
+        metrics = {
+            "precision": MultilabelPrecision(**params).to(self.device),
+            "recall": MultilabelRecall(**params).to(self.device),
+            "f1_score": MultilabelF1Score(**params).to(self.device),
+            "accuracy": MultilabelAccuracy(**params).to(self.device),
+        }
+
+        with torch.no_grad():
+            for data, labels in dataloader:
+                data = data.to(self.device)
+                labels = labels.to(self.device)
+                outputs = torch.sigmoid(self(data))
+                # Reshape outputs and labels from (batch_size, seq_len, num_labels)
+                # to (batch_size * seq_len, num_labels)
+                outputs = outputs.view(-1, self.output_size)
+                labels = labels.view(-1, self.output_size)
+                # No need to threshold manually; the metrics handle it
+                for metric in metrics.values():
+                    metric.update(outputs, labels)
+
+        return {name: metric.compute().item() for name, metric in metrics.items()}
+
+    def partial_fit(
+        self,
+        sequence,
+        labels,
+        batch_size=32,
+        epochs=1000,
+        accuracy_threshold=1.0,
+        f1_threshold=1.0,
+    ):
+        """Train the model on the given sequence data.
+        Args:
+            sequence (List[torch.Tensor]): List of tensors containing the input data
+            labels (List[List[int]]): List of lists containing the labels for each time step
+            batch_size (int): Batch size for training
+            epochs (int): Maximum number of training epochs
+            accuracy_threshold (float): Stop training once accuracy reaches this threshold
+            f1_threshold (float): Stop training once F1 score reaches this threshold
+        Returns:
+            dict: Metrics of the last finished epoch plus "loss" and "epoch"; {} if epochs is 0
+        """
+        batches, label_batches = self.preprocess_sequence(sequence, labels)
+
+        # Batch the chunks; label_batches is already a float32 tensor, so no torch.tensor() copy
+        metrics = {}
+        dataset = TensorDataset(batches, label_batches)
+        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+        for epoch in range(epochs):
+            self.train()  # Set the model to training mode
+            epoch_loss = 0
+            for batch_data, batch_labels in dataloader:
+                # Move batch data and labels to the appropriate device
+                batch_data = batch_data.to(self.device)
+                batch_labels = batch_labels.to(self.device)
+
+                self.optimizer.zero_grad()
+                outputs = self(batch_data)  # Forward pass
+                loss = self.criterion(outputs, batch_labels)  # Calculate loss
+                loss.backward()  # Back propagation
+                self.optimizer.step()  # Update model parameters
+
+                epoch_loss += loss.item()
+
+            # metrics and threshold stops to avoid overfitting
+            metrics = self.evaluate_metrics(dataloader)
+            metrics["loss"] = epoch_loss / len(dataloader)
+            metrics["epoch"] = epoch + 1
+
+            logger.info(
+                f"Epoch {epoch + 1}, Loss: {epoch_loss / len(dataloader)}, {metrics}"
+            )
+            if metrics["accuracy"] >= accuracy_threshold:
+                logger.info(
+                    f"Accuracy >= {accuracy_threshold} threshold, model training stopped."
+                )
+                break
+            if metrics["f1_score"] >= f1_threshold:
+                logger.info(
+                    f"F1 score >= {f1_threshold} threshold, model training stopped."
+                )
+                break
+
+        return metrics
+
+    def predict(self, sequence):
+        """Split sequence into chunks with sequence_size and predict by chunks.
+        Then concatenate all predictions into one (len(sequence), output_size)
+        tensor of sigmoid probabilities. Runs in eval mode and without gradients.
+        """
+        length = len(sequence)
+        if length == 0:
+            return torch.tensor([])
+
+        batches, _ = self.preprocess_sequence(sequence, overlap=1)
+        self.eval()
+        with torch.no_grad():  # inference only: do not build an autograd graph
+            probs = torch.sigmoid(self(batches.to(self.device)))
+
+        # Flatten the chunks back into one sequence and drop the zero-padded tail
+        return probs.reshape(-1, probs.shape[2])[:length]