From fc57497db0a1bd43c0e8b3568a604619d77dcbc5 Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan
Date: Thu, 14 Dec 2023 15:26:25 +0100
Subject: [PATCH] Initial set of unittests in CI (#43)

* Add quantization tests

* Add some tooling for the CI

* Added initial set of tests for quantization and sharding

* Use Protocol instead of ABC

* Add utility to ensure we can run on GPU and define how much we need

* Attempt to enable the workflow

* Second attempt

* Let's relax numpy version for now.

* Disable a few things

* Again

* Again Again

* Refactored tests and utils

* Update command for unittests

* Quality

* Once more

* One last?

* Add Makefile to make it easier to run common commands

---
 .github/workflows/pr_fast_tests.yml        | 53 ++++++++++++++++++++++
 Makefile                                   |  7 +++
 pyproject.toml                             |  8 +++-
 setup.py                                   |  2 +-
 src/optimum/nvidia/configs/__init__.py     |  2 +-
 src/optimum/nvidia/utils/tests/__init__.py |  1 +
 src/optimum/nvidia/utils/tests/utils.py    | 43 ++++++++++++++++++
 tests/models/test_llama.py                 | 11 +++++
 tests/quantization.py                      |  0
 tests/test_quantization.py                 | 14 ++++++
 tests/test_sharding.py                     | 39 ++++++++++++++++
 11 files changed, 177 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/pr_fast_tests.yml
 create mode 100644 Makefile
 create mode 100644 src/optimum/nvidia/utils/tests/__init__.py
 create mode 100644 src/optimum/nvidia/utils/tests/utils.py
 create mode 100644 tests/models/test_llama.py
 delete mode 100644 tests/quantization.py
 create mode 100644 tests/test_quantization.py
 create mode 100644 tests/test_sharding.py

diff --git a/.github/workflows/pr_fast_tests.yml b/.github/workflows/pr_fast_tests.yml
new file mode 100644
index 00000000..6acc23c8
--- /dev/null
+++ b/.github/workflows/pr_fast_tests.yml
@@ -0,0 +1,53 @@
+name: CPU Only Test Suite on PRs
+
+on:
+  pull_request:
+    branches:
+      - main
+  push:
+    branches:
+      - ci-*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  OPTIMUM_NVIDIA_IS_CI: ON
+  RUN_CPU_ONLY: ON
+
+jobs:
+  run_fast_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: Fast Optimum-Nvidia Test Suite
+            runner: [ci, nvidia-gpu]
+            image: huggingface/optimum-nvidia
+            report: cpu_only
+
+    name: ${{ matrix.config.name }}
+    runs-on: ${{ matrix.config.runner }}
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: Checkout optimum-nvidia
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade -e .[quality,tests]
+
+      - name: Run fast optimum-nvidia CPU tests
+        run: |
+          python -m pytest -s -v -p no:warnings tests
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..5e4634ec
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,7 @@
+fix-quality:
+	python3 -m ruff check --fix examples scripts src tests
+	python3 -m ruff format examples scripts src tests
+
+quality:
+	python3 -m ruff check examples scripts src tests
+	python3 -m ruff format examples scripts src tests --check
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index b20c0730..a5d800da 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,4 +24,10 @@
 indent-style = "space"
 skip-magic-trailing-comma = false
 # Like Black, automatically detect the appropriate line ending.
-line-ending = "auto" \ No newline at end of file +line-ending = "auto" + + +[tool.pytest.ini_options] +pythonpath = [ + "src" +] \ No newline at end of file diff --git a/setup.py b/setup.py index 928dd81b..d9d6dc60 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ "fsspec", "huggingface_hub >= 0.14.0", "hf-transfer", - "numpy >= 1.24.0", + "numpy >= 1.22.0", "onnx >= 1.12.0", "optimum >= 1.13.0", "transformers >= 4.32.1", diff --git a/src/optimum/nvidia/configs/__init__.py b/src/optimum/nvidia/configs/__init__.py index e8273a0b..1eac3d2b 100644 --- a/src/optimum/nvidia/configs/__init__.py +++ b/src/optimum/nvidia/configs/__init__.py @@ -15,4 +15,4 @@ from typing import Protocol from .base import ModelConfig, TransformersConfig -from .quantization import QuantizationConfig +from .quantization import NO_QUANTIZATION, QuantizationConfig diff --git a/src/optimum/nvidia/utils/tests/__init__.py b/src/optimum/nvidia/utils/tests/__init__.py new file mode 100644 index 00000000..b6d12cfe --- /dev/null +++ b/src/optimum/nvidia/utils/tests/__init__.py @@ -0,0 +1 @@ +from .utils import nightly, requires_gpu, slow diff --git a/src/optimum/nvidia/utils/tests/utils.py b/src/optimum/nvidia/utils/tests/utils.py new file mode 100644 index 00000000..925ad830 --- /dev/null +++ b/src/optimum/nvidia/utils/tests/utils.py @@ -0,0 +1,43 @@ +import functools +import os +from distutils.util import strtobool + +import pytest + +from optimum.nvidia.utils.nvml import get_device_count + + +INT_TRUE_VALUE = 1 + +# Environment variable controlling test set +ENVVAR_NAME_RUN_NIGHTLY = "RUN_NIGHTLY" +ENVVAR_NAME_RUN_SLOW = "RUN_SLOW" +ENVVAR_NAME_RUN_CPU_ONLY = "RUN_CPU_ONLY" + + +@functools.cache +def parse_flag_from_env(name: str, default: bool) -> bool: + """ + Parse the environment variable `name` as a boolean + :param name: Name of target environment variable + :param default: The default value to apply if `name` is not present + :return: Boolean value + """ + + # Retrieve the value or `default` if not present + value = os.environ.get(name, str(default)) + + try: + return strtobool(value) == INT_TRUE_VALUE + except ValueError: + raise ValueError(f"Failed to convert environment variable {name}={value} to a bool") + + +nightly = pytest.mark.skipif(parse_flag_from_env(ENVVAR_NAME_RUN_NIGHTLY, False), reason="Nightly test") +slow = pytest.mark.skipif(parse_flag_from_env(ENVVAR_NAME_RUN_SLOW, False), reason="Slow test") + +requires_gpu = pytest.mark.skipif( + parse_flag_from_env(ENVVAR_NAME_RUN_CPU_ONLY, False) or not get_device_count(), + reason=f"RUN_CPU_ONLY={parse_flag_from_env(ENVVAR_NAME_RUN_CPU_ONLY, False)} or " + f"no GPU detected (num_gpus={get_device_count()})", +) diff --git a/tests/models/test_llama.py b/tests/models/test_llama.py new file mode 100644 index 00000000..ab705b5c --- /dev/null +++ b/tests/models/test_llama.py @@ -0,0 +1,11 @@ +from parameterized import parameterized + +from optimum.nvidia.models.llama import LLamaForCausalLM as TrtLlamaForCausalLM +from optimum.nvidia.utils.tests import requires_gpu + + +@parameterized.expand(["float16", "bfloat16"]) +@requires_gpu +def test_build_engine_7b_with_tp(dtype: str): + model = TrtLlamaForCausalLM.from_pretrained("huggingface/llama-7b", dtype=dtype) + assert model diff --git a/tests/quantization.py b/tests/quantization.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_quantization.py b/tests/test_quantization.py new file mode 100644 index 00000000..1fb6e325 --- /dev/null +++ b/tests/test_quantization.py @@ -0,0 +1,14 @@ 
+from tensorrt_llm.quantization import QuantMode
+
+from optimum.nvidia.configs import NO_QUANTIZATION, QuantizationConfig
+
+
+def test_no_quantization_has_quantization_step():
+    qconfig = QuantizationConfig(NO_QUANTIZATION)
+    assert not qconfig.has_quantization_step
+
+
+def test_float8_quantization_has_quantization_step():
+    qconfig = QuantizationConfig(QuantMode.from_description(use_fp8_qdq=True, use_fp8_kv_cache=True))
+
+    assert qconfig.has_quantization_step
diff --git a/tests/test_sharding.py b/tests/test_sharding.py
new file mode 100644
index 00000000..63b956a4
--- /dev/null
+++ b/tests/test_sharding.py
@@ -0,0 +1,39 @@
+from unittest import TestCase
+
+import numpy as np
+from parameterized import parameterized
+
+from optimum.nvidia.weights import shard
+
+
+TENSOR_DIM_0 = 1024
+TENSOR_DIM_1 = 4096
+
+
+class MatrixShardingTestCase(TestCase):
+    def setUp(self):
+        self.tensor = np.random.rand(TENSOR_DIM_0, TENSOR_DIM_1)
+
+    def test_no_sharding(self):
+        sharded_tensor = shard(self.tensor, 0, 1, axis=0)
+        self.assertTrue(np.array_equal(sharded_tensor, self.tensor))
+
+    @parameterized.expand([1, 2, 4, 8])
+    def test_sharding_tensor_parallelism_axis_0(self, tp_degree: int):
+        shard_size = TENSOR_DIM_0 // tp_degree
+
+        shards = [shard(self.tensor, rank, tp_degree, axis=0) for rank in range(tp_degree)]
+
+        for rank, tensor in enumerate(shards):
+            self.assertTupleEqual(tensor.shape, (TENSOR_DIM_0 // tp_degree, TENSOR_DIM_1))
+            self.assertTrue(np.array_equal(tensor, self.tensor[rank * shard_size : (rank + 1) * shard_size]))
+
+    @parameterized.expand([1, 2, 4, 8])
+    def test_sharding_tensor_parallelism_axis_1(self, tp_degree: int):
+        shard_size = TENSOR_DIM_1 // tp_degree
+
+        shards = [shard(self.tensor, rank, tp_degree, axis=1) for rank in range(tp_degree)]
+
+        for rank, tensor in enumerate(shards):
+            self.assertTupleEqual(tensor.shape, (TENSOR_DIM_0, TENSOR_DIM_1 // tp_degree))
+            self.assertTrue(np.array_equal(tensor, self.tensor[:, rank * shard_size : (rank + 1) * shard_size]))
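-- 

Reviewer note: the sharding tests above pin down the contract of
`optimum.nvidia.weights.shard` without showing its body. A minimal sketch
consistent with that contract (the actual implementation in
`optimum.nvidia.weights` is not part of this patch and may differ):

    import numpy as np

    def shard(tensor: np.ndarray, rank: int, tp_degree: int, axis: int = 0) -> np.ndarray:
        # Return the `rank`-th of `tp_degree` equal slices of `tensor` along `axis`.
        # tp_degree == 1 degenerates to the identity, matching test_no_sharding.
        if tp_degree == 1:
            return tensor
        # np.split requires the axis length to divide evenly by tp_degree,
        # which holds for the 1024x4096 tensors used in the tests.
        return np.split(tensor, tp_degree, axis=axis)[rank]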
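Reviewer note: the workflow exports `RUN_CPU_ONLY: ON`, and `parse_flag_from_env`
delegates to `distutils.util.strtobool`, which lower-cases its input and maps
"on"/"1"/"true"/"yes" to 1, so that value parses as expected. A standalone
check (hypothetical, mirroring the `env:` block of pr_fast_tests.yml):

    import os
    from distutils.util import strtobool

    os.environ["RUN_CPU_ONLY"] = "ON"  # same value the workflow exports

    # Parses as True, so `requires_gpu`-marked tests are skipped on this runner.
    assert strtobool(os.environ["RUN_CPU_ONLY"]) == 1

Note that `distutils` is deprecated since Python 3.10 and removed in 3.12, so
this helper will eventually need a local replacement for `strtobool`.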