Enhance Clip_Score to calculate similarities between same modalities #2875

Open
wants to merge 75 commits into base: master
Changes from 51 commits
8302300
Fix: Handle zero division error in binary IoU (Jaccard index) calcula…
rittik9 Sep 9, 2024
9098d0a
chlog
Borda Sep 9, 2024
7803302
Merge branch 'master' into fix/handle-zero-division-iou-calculation
Borda Sep 9, 2024
65b2714
Merge branch 'master' into fix/handle-zero-division-iou-calculation
mergify[bot] Sep 10, 2024
31087e3
Merge branch 'master' into fix/handle-zero-division-iou-calculation
mergify[bot] Sep 10, 2024
b792368
[wip]feat: enchance clip_score to claculate similarity between same m…
rittik9 Dec 19, 2024
74ccbb1
Update CHANGELOG.md
rittik9 Dec 19, 2024
67540e3
Merge branch 'master' into enhance/clip
rittik9 Dec 19, 2024
eb3590c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2024
f2761fd
Update clip_score.py
rittik9 Dec 20, 2024
5af7443
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
ec82ed5
Update clip_score.py
rittik9 Dec 20, 2024
244a4d7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
0917814
Update clip_score.py
rittik9 Dec 20, 2024
29c0a9a
Update clip_score.py
rittik9 Dec 20, 2024
4f7a4b6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
8124d0e
Update test_clip_score.py
rittik9 Dec 20, 2024
47f4fc8
refactor: clip_score.py
rittik9 Dec 20, 2024
2b46025
refactor
rittik9 Dec 20, 2024
37ab156
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
7557008
refactor: replace deprecated `List` with built-in `list` for type ann…
rittik9 Dec 20, 2024
36f0b72
refactor
rittik9 Dec 20, 2024
f20ecf2
fix: resolve mypy type errors by adding runtime type checks
rittik9 Dec 20, 2024
7b23137
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
043bb0c
refactor: clip_score.py
rittik9 Dec 20, 2024
3af99bb
Merge branch 'enhance/clip' of https://github.com/rittik9/torchmetric…
rittik9 Dec 20, 2024
7283005
refactor: clip_score.py
rittik9 Dec 20, 2024
9f6fc32
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
93c2830
refactor
rittik9 Dec 20, 2024
82ba7d6
Merge branch 'enhance/clip' of https://github.com/rittik9/torchmetric…
rittik9 Dec 20, 2024
1d4f16b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
a18616e
refactor
rittik9 Dec 20, 2024
28fbef4
refactor
rittik9 Dec 20, 2024
c6c433e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
a33d69a
refactor
rittik9 Dec 20, 2024
e6a9ede
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
58262f4
refactor
rittik9 Dec 20, 2024
fe5a42e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 20, 2024
76fbcaa
Update clip_score.py
rittik9 Dec 21, 2024
01bc8bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 21, 2024
fec61e3
Merge branch 'master' into enhance/clip
rittik9 Dec 21, 2024
7ae0d2f
Update clip_score.py
rittik9 Dec 23, 2024
bd40ffb
Update test_clip_score.py
rittik9 Dec 23, 2024
f41bc55
Merge branch 'master' into enhance/clip
rittik9 Dec 24, 2024
6504ee4
Merge branch 'master' into enhance/clip
rittik9 Dec 26, 2024
89d96f8
Merge branch 'master' into enhance/clip
rittik9 Jan 2, 2025
20a218f
Update test_clip_score.py
rittik9 Jan 2, 2025
6711b4d
Merge branch 'master' into enhance/clip
rittik9 Jan 6, 2025
37c1b8e
Merge branch 'master' into enhance/clip
rittik9 Jan 6, 2025
43d955a
Merge branch 'master' into enhance/clip
rittik9 Jan 7, 2025
fd2b3d2
uncomment test
rittik9 Jan 7, 2025
ad38bb0
Apply suggestions from code review
Borda Jan 7, 2025
db7e199
Merge branch 'Lightning-AI:master' into enhance/clip
rittik9 Jan 7, 2025
caa02ff
Update clip_score.py
rittik9 Jan 7, 2025
4cffc23
Update clip_score.py
rittik9 Jan 7, 2025
4ff62e8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 7, 2025
7ab790a
Update clip_score.py
rittik9 Jan 7, 2025
4f476a0
Update clip_score.py
rittik9 Jan 7, 2025
0b29244
update docs
rittik9 Jan 8, 2025
2b7347d
typefix
rittik9 Jan 8, 2025
7c93760
improve _get_features
rittik9 Jan 8, 2025
00ee2e9
improve _get_features docs
rittik9 Jan 8, 2025
81b4405
clip_score.py
rittik9 Jan 9, 2025
cd3663f
Revert "clip_score.py"
rittik9 Jan 9, 2025
03efd65
add tests
rittik9 Jan 9, 2025
b71fe12
add doctest for same modality
rittik9 Jan 9, 2025
9690417
fix device
rittik9 Jan 9, 2025
887be9d
fix doctests
rittik9 Jan 9, 2025
0a56001
fix doctests
rittik9 Jan 9, 2025
3f2a5c3
fix doctests
rittik9 Jan 9, 2025
045faad
fix doctests
rittik9 Jan 9, 2025
4c074c5
add unittests
rittik9 Jan 9, 2025
579d200
add doctests
rittik9 Jan 10, 2025
1571f8c
add random seed in doctests
rittik9 Jan 10, 2025
1156b18
modify doctest
rittik9 Jan 10, 2025
188 changes: 140 additions & 48 deletions src/torchmetrics/functional/multimodal/clip_score.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, List, Union
from typing import TYPE_CHECKING, List, Union, cast

import torch
from torch import Tensor
@@ -41,53 +41,140 @@ def _download_clip_for_clip_score() -> None:
_CLIPProcessor = None


def _detect_modality(input_data: Union[Tensor, List[Tensor], List[str], str]) -> Literal["image", "text"]:
"""Automatically detect the modality of the input data.

Args:
input_data: Input data that can be either image tensors or text strings

Returns:
str: Either "image" or "text"

Raises:
ValueError: If the input_data is an empty list or modality cannot be determined

"""
if isinstance(input_data, Tensor):
return "image"

if isinstance(input_data, list):
if len(input_data) == 0:
raise ValueError("Empty input list")
if isinstance(input_data[0], Tensor):
return "image"
if isinstance(input_data[0], str):
return "text"

if isinstance(input_data, str):
return "text"

raise ValueError("Could not automatically determine modality for input_data")


def _process_image_data(images: Union[Tensor, List[Tensor]]) -> List[Tensor]:
"""Helper function to process image data."""
if isinstance(images, Tensor):
if images.ndim == 3:
return [images]
raise ValueError("Expected all images to be 3d but found image that has either more or less")
if not all(i.ndim == 3 for i in images):
raise ValueError("Expected all images to be 3d but found image that has either more or less")
return images


def _process_text_data(texts: Union[str, List[str]]) -> List[str]:
"""Helper function to process text data."""
if not isinstance(texts, list):
texts = [texts]
return texts


def _get_features(
data: List[Union[Tensor, str]],
modality: Literal["image", "text"],
device: torch.device,
model: "_CLIPModel",
processor: "_CLIPProcessor",
) -> Tensor:
"""Get features from the CLIP model for either images or text.

Args:
data: List of input data (images or text)
modality: Type of input data ("image" or "text")
device: Device to run the model on
model: CLIP model instance
processor: CLIP processor instance
Returns:
Tensor of features from the CLIP model

"""
if modality == "image":
# Add type checking for images
image_data = [i for i in data if isinstance(i, Tensor)]
processed = processor(images=[i.cpu() for i in image_data], return_tensors="pt", padding=True)
features = model.get_image_features(processed["pixel_values"].to(device))
else:
processed = processor(text=data, return_tensors="pt", padding=True)
max_position_embeddings = model.config.text_config.max_position_embeddings
if processed["attention_mask"].shape[-1] > max_position_embeddings:
rank_zero_warn(
f"Encountered caption longer than {max_position_embeddings=}. Will truncate captions to this length."
"If longer captions are needed, initialize argument `model_name_or_path` with a model that supports"
"longer sequences",
UserWarning,
)
processed["attention_mask"] = processed["attention_mask"][..., :max_position_embeddings]
processed["input_ids"] = processed["input_ids"][..., :max_position_embeddings]
features = model.get_text_features(processed["input_ids"].to(device), processed["attention_mask"].to(device))

return features


def _clip_score_update(
images: Union[Tensor, List[Tensor]],
text: Union[str, list[str]],
source: Union[Tensor, List[Tensor], List[str], str],
target: Union[Tensor, List[Tensor], List[str], str],
model: _CLIPModel,
processor: _CLIPProcessor,
) -> tuple[Tensor, int]:
if not isinstance(images, list):
if images.ndim == 3:
images = [images]
else: # unwrap into list
images = list(images)
source_modality = _detect_modality(source)
target_modality = _detect_modality(target)

if not all(i.ndim == 3 for i in images):
raise ValueError("Expected all images to be 3d but found image that has either more or less")

if not isinstance(text, list):
text = [text]
source_data = (
_process_image_data(cast(Union[Tensor, List[Tensor]], source))
if source_modality == "image"
else _process_text_data(cast(Union[str, List[str]], source))
)
target_data = (
_process_image_data(cast(Union[Tensor, List[Tensor]], target))
if target_modality == "image"
else _process_text_data(cast(Union[str, List[str]], target))
)

if len(text) != len(images):
# Verify matching lengths
if len(source_data) != len(target_data):
raise ValueError(
f"Expected the number of images and text examples to be the same but got {len(images)} and {len(text)}"
)
device = images[0].device
processed_input = processor(text=text, images=[i.cpu() for i in images], return_tensors="pt", padding=True)

img_features = model.get_image_features(processed_input["pixel_values"].to(device))
img_features = img_features / img_features.norm(p=2, dim=-1, keepdim=True)

max_position_embeddings = model.config.text_config.max_position_embeddings
if processed_input["attention_mask"].shape[-1] > max_position_embeddings:
rank_zero_warn(
f"Encountered caption longer than {max_position_embeddings=}. Will truncate captions to this length."
"If longer captions are needed, initialize argument `model_name_or_path` with a model that supports"
"longer sequences",
UserWarning,
"Expected the number of source and target examples to be the same but got "
f"{len(source_data)} and {len(target_data)}"
)
processed_input["attention_mask"] = processed_input["attention_mask"][..., :max_position_embeddings]
processed_input["input_ids"] = processed_input["input_ids"][..., :max_position_embeddings]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if source_modality == "image" and isinstance(source_data[0], Tensor):
device = source_data[0].device
elif target_modality == "image" and isinstance(target_data[0], Tensor):
device = target_data[0].device
model = model.to(device)

txt_features = model.get_text_features(
processed_input["input_ids"].to(device), processed_input["attention_mask"].to(device)
source_features = _get_features(
cast(List[Union[Tensor, str]], source_data), source_modality, device, model, processor
)
txt_features = txt_features / txt_features.norm(p=2, dim=-1, keepdim=True)
target_features = _get_features(
cast(List[Union[Tensor, str]], target_data), target_modality, device, model, processor
)
source_features = source_features / source_features.norm(p=2, dim=-1, keepdim=True)
target_features = target_features / target_features.norm(p=2, dim=-1, keepdim=True)

# cosine similarity between feature vectors
score = 100 * (img_features * txt_features).sum(axis=-1)
return score, len(text)
# Calculate cosine similarity
score = 100 * (source_features * target_features).sum(axis=-1)
return score, len(source_data)


def _get_clip_model_and_processor(
@@ -113,8 +200,8 @@ def _get_clip_model_and_processor(


def clip_score(
images: Union[Tensor, List[Tensor]],
text: Union[str, list[str]],
source: Union[Tensor, List[Tensor], List[str], str],
target: Union[Tensor, List[Tensor], List[str], str],
model_name_or_path: Literal[
"openai/clip-vit-base-patch16",
"openai/clip-vit-base-patch32",
@@ -135,15 +222,21 @@
textual CLIP embedding :math:`E_C` for a caption :math:`C`. The score is bound between 0 and 100 and the closer
to 100 the better.

.. caution::
Metric is not scriptable
.. note:: Metric is not scriptable

Args:
images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors
text: Either a single caption or a list of captions
model_name_or_path: string indicating the version of the CLIP model to use. Available models are
`"openai/clip-vit-base-patch16"`, `"openai/clip-vit-base-patch32"`, `"openai/clip-vit-large-patch14-336"`
and `"openai/clip-vit-large-patch14"`,
source: Source input. This can be:
- Images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors.
- Text: Either a single caption or a list of captions.
target: Target input. This can be:
- Images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors.
- Text: Either a single caption or a list of captions.
model_name_or_path: String indicating the version of the CLIP model to use. Available models are:
- `"openai/clip-vit-base-patch16"`
- `"openai/clip-vit-base-patch32"`
- `"openai/clip-vit-large-patch14-336"`
- `"openai/clip-vit-large-patch14"`


Raises:
ModuleNotFoundError:
Expand All @@ -161,7 +254,6 @@ def clip_score(

"""
model, processor = _get_clip_model_and_processor(model_name_or_path)
device = images.device if isinstance(images, Tensor) else images[0].device
score, _ = _clip_score_update(images, text, model.to(device), processor)
score, _ = _clip_score_update(source, target, model, processor)
score = score.mean(0)
return torch.max(score, torch.zeros_like(score))
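
For readers skimming the diff, here is a minimal usage sketch of the reworked functional API — not part of the PR itself, and it assumes `transformers` is installed and that the `openai/clip-vit-base-patch16` weights can be downloaded at call time.

import torch
from torchmetrics.functional.multimodal import clip_score

_ = torch.manual_seed(42)

# image-text pairing (the original behaviour, now expressed as source/target)
image = torch.randint(255, (3, 224, 224))
score_it = clip_score(image, "a photo of a cat", model_name_or_path="openai/clip-vit-base-patch16")

# text-text pairing (new: both inputs are detected as the "text" modality)
score_tt = clip_score("a photo of a cat", "a picture of a cat", model_name_or_path="openai/clip-vit-base-patch16")

# image-image pairing (new: both inputs are detected as the "image" modality)
score_ii = clip_score(image, torch.randint(255, (3, 224, 224)), model_name_or_path="openai/clip-vit-base-patch16")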
17 changes: 11 additions & 6 deletions src/torchmetrics/multimodal/clip_score.py
@@ -11,8 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Sequence
from typing import Any, List, Optional, Union
from typing import Any, List, Optional, Sequence, Union

import torch
from torch import Tensor
@@ -118,12 +117,18 @@ def __init__(
self.add_state("score", torch.tensor(0.0), dist_reduce_fx="sum")
self.add_state("n_samples", torch.tensor(0, dtype=torch.long), dist_reduce_fx="sum")

def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, list[str]]) -> None:
def update(
self, source: Union[Tensor, List[Tensor], List[str], str], target: Union[Tensor, List[Tensor], List[str], str]
) -> None:
"""Update CLIP score on a batch of images and text.

Args:
images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors
text: Either a single caption or a list of captions
source: Source input. This can be:
- Images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors.
- Text: Either a single caption or a list of captions.
target: Target input. This can be:
- Images: Either a single [N, C, H, W] tensor or a list of [C, H, W] tensors.
- Text: Either a single caption or a list of captions.

Raises:
ValueError:
@@ -132,7 +137,7 @@ def update(self, images: Union[Tensor, List[Tensor]], text: Union[str, list[str]
If the number of images and captions do not match

"""
score, n_samples = _clip_score_update(images, text, self.model, self.processor)
score, n_samples = _clip_score_update(source, target, self.model, self.processor)
self.score += score.sum(0)
self.n_samples += n_samples

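
A similarly hedged sketch for the modular metric — again an illustration rather than part of the diff, assuming the same model weights are available; `compute()` returns the accumulated score averaged over samples and clamped at zero.

import torch
from torchmetrics.multimodal.clip_score import CLIPScore

_ = torch.manual_seed(42)
metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

# image-image update: both inputs are detected as the "image" modality
metric.update(torch.randint(255, (3, 224, 224)), torch.randint(255, (3, 224, 224)))

# a second, text-text update accumulates into the same running score
metric.update("a photo of a cat", "a picture of a cat")

print(metric.compute())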
2 changes: 1 addition & 1 deletion tests/unittests/multimodal/test_clip_score.py
@@ -110,7 +110,7 @@ def test_clip_score_differentiability(self, inputs, model_name_or_path):
def test_error_on_not_same_amount_of_input(self, inputs, model_name_or_path):
"""Test that an error is raised if the number of images and text examples does not match."""
metric = CLIPScore(model_name_or_path=model_name_or_path)
with pytest.raises(ValueError, match="Expected the number of images and text examples to be the same.*"):
with pytest.raises(ValueError, match="Expected the number of source and target examples to be the same.*"):
metric(torch.randint(255, (2, 3, 64, 64)), "28-year-old chef found dead in San Francisco mall")

@skip_on_connection_issues()
Expand Down
Loading