diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml
new file mode 100644
index 000000000000..5ca159074a4d
--- /dev/null
+++ b/.github/workflows/nv-sd.yml
@@ -0,0 +1,79 @@
+# Stable Diffusion inference tests on a self-hosted NVIDIA runner.
+name: nv-sd
+
+on:
+  schedule:
+    # Weekly, Sundays at 00:00 (GitHub evaluates cron schedules in UTC).
+    - cron: "0 0 * * 0"
+  workflow_dispatch:
+  pull_request:
+    paths:
+      # Re-run when this workflow itself changes, not only the code it tests.
+      - ".github/workflows/nv-sd.yml"
+      - "deepspeed/ops/transformer/inference/diffusers_**"
+      - "tests/unit/inference/test_stable_diffusion.py"
+      - "deepspeed/model_implementations/diffusers/unet.py"
+      - "deepspeed/model_implementations/diffusers/vae.py"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  # Lets the last step open or update an issue when the scheduled run fails.
+  issues: write
+
+jobs:
+  sd-tests:
+    runs-on: [self-hosted, nvidia, a6000]
+    container:
+      image: nvcr.io/nvidia/pytorch:23.03-py3
+      ports:
+        - 80
+      options: --gpus all --shm-size "8G"
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Check container state
+        run: |
+          ldd --version
+          nvcc --version
+          nvidia-smi
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+      - name: Install transformers
+        run: |
+          git clone --depth 1 https://github.com/huggingface/transformers
+          cd transformers
+          git rev-parse --short HEAD
+          python -m pip install .
+
+      - name: Install deepspeed
+        run: |
+          python -m pip install image-similarity-measures
+          python -m pip install "opencv-python==4.6.*" --force-reinstall
+          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+          python -m pip install ".[dev,1bit,autotuning,sd]"
+          ds_report
+
+      - name: Python environment
+        run: |
+          python -m pip list
+
+      - name: Unit tests
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          cd tests
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12"
+
+      - name: Open GitHub issue if weekly CI fails
+        if: ${{ failure() && (github.event_name == 'schedule') }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/requirements/requirements-sd.txt b/requirements/requirements-sd.txt
index 086a8e3f4879..cb679ae3771d 100644
--- a/requirements/requirements-sd.txt
+++ b/requirements/requirements-sd.txt
@@ -1,2 +1,2 @@
 diffusers
-triton
+triton>=2.1.0
diff --git a/tests/pytest.ini b/tests/pytest.ini
index cc6b6564daa8..8d043c8b3f9d 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -1,5 +1,5 @@
 [pytest]
-addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops"
+addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion"
 markers =
     sequential:Tests that need to be run sequentially
     inference:Inference model tests
@@ -9,3 +9,4 @@ markers =
     seq_inference:Inference model tests to run sequentially
     nightly:Tests that should be run nightly
     world_size:Change world size of individual tests in a class
+    stable_diffusion:Tests that run Stable Diffusion
diff --git a/tests/unit/inference/test_stable_diffusion.py b/tests/unit/inference/test_stable_diffusion.py
new file mode 100644
index 000000000000..ac39b7ab12fa
--- /dev/null
+++ b/tests/unit/inference/test_stable_diffusion.py
@@ -0,0 +1,58 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import os
+import torch
+import pytest
+import deepspeed
+import numpy
+from unit.common import DistributedTest
+from deepspeed.accelerator import get_accelerator
+
+
+# Setup for these models is different from other pipelines, so we add a separate test
+@pytest.mark.stable_diffusion
+class TestStableDiffusion(DistributedTest):
+    """Compare a Stable Diffusion image generated with and without DeepSpeed kernel injection."""
+    world_size = 1
+
+    def test(self):
+        """Generate one image with the stock pipeline and one after ``deepspeed.init_inference``.
+
+        Both runs use the same prompt, guidance scale and re-seeded generator; the test
+        passes when the RMSE between the two images is at most 0.01.
+        """
+        # Imported inside the test rather than at module level (presumably so collection works without the SD extras).
+        from diffusers import DiffusionPipeline
+        from image_similarity_measures.quality_metrics import rmse
+
+        # Build the generator and the pipeline on the same accelerator device for this rank.
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+        device = torch.device(get_accelerator().device_name(local_rank))
+        generator = torch.Generator(device=device)
+        seed = 0xABEDABE7
+        generator.manual_seed(seed)
+        prompt = "a dog on a rocket"
+        model = "prompthero/midjourney-v4-diffusion"
+
+        pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half)
+        pipe = pipe.to(device)
+        baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0]
+
+        pipe = deepspeed.init_inference(
+            pipe,
+            mp_size=1,
+            dtype=torch.half,
+            replace_with_kernel_inject=True,
+            enable_cuda_graph=True,
+        )
+        # Re-seed so the DeepSpeed run starts from the same generator state as the baseline.
+        generator.manual_seed(seed)
+        deepspeed_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0]
+
+        rmse_value = rmse(org_img=numpy.asarray(baseline_image), pred_img=numpy.asarray(deepspeed_image))
+
+        # RMSE threshold value is arbitrary, may need to adjust as needed
+        assert rmse_value <= 0.01, f"RMSE between baseline and DeepSpeed images too high: {rmse_value}"