Skip to content

Commit

Permalink
Add utility to help launch milabench with docker
Browse files Browse the repository at this point in the history
  • Loading branch information
pierre.delaunay committed Jan 17, 2025
1 parent df7d8a1 commit cf751f7
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 26 deletions.
16 changes: 11 additions & 5 deletions config/examples/system.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,17 @@ system:
arch: cuda
# sshkey used in remote milabench operations
sshkey: ~/.ssh/id_ed25519
# Docker image to use
docker_image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly

docker:
executable: podman
image: ghcr.io/mila-iqia/milabench:${system.arch}-nightly
base: /tmp/workspace/
args: [
-it, --rm, --ipc=host, --gpus=all,
-e, MILABENCH_HF_TOKEN=<TOKEN>,
-v, "${system.docker.base}/data:/milabench/envs/data",
-v, "${system.docker.base}/runs:/milabench/envs/runs",
]

# Nodes list
nodes:
Expand All @@ -27,9 +36,6 @@ system:
main: false
user: username




multirun:
runs:
# Force batch size to populate the sizing model
Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile-cuda
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# FROM ubuntu:22.04

# For cuda-gdb
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
FROM nvidia/cuda:12.6.3-devel-ubuntu22.04

# Arguments
# ---------
Expand Down Expand Up @@ -41,7 +41,7 @@ COPY . /milabench/milabench/

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y &&\
apt-get install -y --no-install-recommends git build-essential curl python3 python-is-python3 python3-pip &&\
apt-get install -y --no-install-recommends git build-essential curl python3 python-is-python3 python3-pip libgl1-mesa-glx &&\
curl -o /etc/apt/trusted.gpg.d/mellanox.asc https://content.mellanox.com/ofed/RPM-GPG-KEY-Mellanox &&\
curl -o /etc/apt/sources.list.d/mellanox.list https://linux.mellanox.com/public/repo/mlnx_ofed/${MOFED_VERSION}/ubuntu22.04/mellanox_mlnx_ofed.list &&\
apt-get update -y &&\
Expand Down
47 changes: 47 additions & 0 deletions docker/Dockerfile-ngc
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
FROM nvcr.io/nvidia/pytorch:24.02-py3

# Arguments
# ---------

# Use ofed_info -s to get your local version
ARG MOFED_VERSION=5.4-3.4.0.0
ARG CONFIG=standard.yaml
ARG ARCH=cuda

ENV MILABENCH_GPU_ARCH=$ARCH
ENV MILABENCH_CONFIG_NAME=$CONFIG
ENV MILABENCH_DOCKER=1

# Paths
# -----

ENV MILABENCH_CONFIG=/milabench/milabench/config/$MILABENCH_CONFIG_NAME
ENV MILABENCH_BASE=/milabench/envs
ENV MILABENCH_OUTPUT=/milabench/results/
ENV MILABENCH_ARGS=""

# Copy milabench
# --------------

WORKDIR /milabench
COPY . /milabench/milabench/

# Install Dependencies
# --------------------

# curl: used to download anaconda and rust
# git: used by milabench
# libibverbs: used for infiniband
# rustc: used by BERT models inside https://pypi.org/project/tokenizers/
# build-essential: for rust


ENV DEBIAN_FRONTEND=noninteractive
# Install milabench inside a virtualenv that can still see the packages
# already present in the base image (--system-site-packages), then remove
# the torch packages pulled into the virtualenv and purge the pip cache.
# Every line but the last ends with a continuation so the whole chain runs
# as a single RUN instruction, and the editable install uses the absolute
# path the sources were copied to, so it does not depend on `cd $HOME`.
RUN cd $HOME &&\
    rm -rf /milabench/env &&\
    pip install virtualenv &&\
    virtualenv --system-site-packages /milabench/env &&\
    /milabench/env/bin/pip install -e /milabench/milabench &&\
    /milabench/env/bin/milabench install --use-current-env &&\
    /milabench/env/bin/pip uninstall torch torchvision torchaudio -y &&\
    /milabench/env/bin/pip cache purge
25 changes: 25 additions & 0 deletions docs/Contributing/recipes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ Run a benchmark without milabench
Containers
----------

NGC
^^^

When using containers where some dependencies are already installed, we need to use a dummy virtualenv
to make milabench install its dependencies there; the duplicate dependencies can then be removed.

Expand Down Expand Up @@ -231,6 +234,28 @@ so make milabench install its dependencies there, then the duplicate dependencie
milabench prepare --use-current-env
milabench run --use-current-env
Nightly
^^^^^^^

.. code-block:: bash
podman run -it --rm --ipc=host --gpus=all \
-e MILABENCH_HF_TOKEN=<TOKEN> \
-v /tmp/workspace/data:/milabench/envs/data \
-v /tmp/workspace/runs:/milabench/envs/runs \
ghcr.io/mila-iqia/milabench:cuda-nightly milabench prepare
podman run -it --rm --ipc=host --gpus=all \
-e MILABENCH_HF_TOKEN=<TOKEN> \
-v /tmp/workspace/data:/milabench/envs/data \
-v /tmp/workspace/runs:/milabench/envs/runs \
ghcr.io/mila-iqia/milabench:cuda-nightly milabench run
Multi Node & Docker
^^^^^^^^^^^^^^^^^^^



Example Reports
---------------
Expand Down
4 changes: 4 additions & 0 deletions milabench/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .prepare_run import cli_prepare_run
from .gated import cli_gated
from .sharedsetup import cli_shared_setup
from .docker import cli_docker


class Main:
Expand Down Expand Up @@ -112,6 +113,9 @@ def gated():
def sharedsetup():
    """Entry point of the `sharedsetup` sub-command; delegates to `cli_shared_setup`."""
    cli_shared_setup()

def docker():
    """Entry point of the `docker` sub-command; delegates to `cli_docker`."""
    cli_docker()


def main(argv=None):
sys.path.insert(0, os.path.abspath(os.curdir))
Expand Down
55 changes: 55 additions & 0 deletions milabench/cli/docker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from __future__ import annotations

from coleo import Option, tooled


from ..commands import DockerRunCommand, CmdCommand
from ..pack import Package
from ..common import get_multipack
from ..system import DockerConfig


def dummy_pack(packs) -> Package:
    """Build a placeholder package named "setup" from the first pack.

    Args:
        packs: mapping whose values expose a ``config`` mapping; only the
            first value is consulted.

    Returns:
        A ``Package`` whose name, tag and definition are fixed and whose
        run name, directories, config locations and system section are
        copied from the first pack's configuration.
    """
    template = list(packs.values())[0].config
    label = "setup"

    settings = {
        "name": label,
        "tag": [label],
        "definition": ".",
    }

    # Entries carried over unchanged from the first pack's configuration.
    for key in ("run_name", "dirs", "config_base", "config_file", "system"):
        settings[key] = template[key]

    return Package(settings)


@tooled
def cli_docker(args=None):
    """Builds the docker command to execute from the system configuration

    Prints, one per line, the container command lines wrapping
    ``milabench prepare`` and then ``milabench run``.

    Args:
        args: parsed run arguments; when ``None`` they are obtained from
            ``milabench.cli.run.arguments()``. Only ``run_name`` is read.
    """
    from .run import arguments

    if args is None:
        args = arguments()

    mp = get_multipack(run_name=args.run_name)

    pack = dummy_pack(mp.packs)

    system = pack.config["system"]
    # A missing (or empty) `docker` section must not crash on `**None`;
    # DockerConfig is then built from its own field defaults.
    config = DockerConfig(**(system.get("docker") or {}))

    for action in ("prepare", "run"):
        plan = DockerRunCommand(CmdCommand(pack, "milabench", action), config)

        # Do not reuse the name `pack` for the loop variable: the dummy pack
        # is still needed to build the plan of the next action.
        for _pack, argv, _kwargs in plan.commands():
            print(" ".join(argv))
30 changes: 11 additions & 19 deletions milabench/commands/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from ..merge import merge
from ..utils import select_nodes
from .executors import execute_command
from ..system import option
from ..system import option, DockerConfig


def clone_with(cfg, new_cfg):
Expand Down Expand Up @@ -328,23 +328,15 @@ class DockerRunCommand(WrapperCommand):
"""

def __init__(
self, executor: SingleCmdCommand, image: str, *docker_argv, **kwargs
self, executor: SingleCmdCommand, config: DockerConfig, *docker_argv, **kwargs
) -> None:
self.config = config
self.extra_args = docker_argv

super().__init__(
executor,
"docker",
"run",
"-i",
"--rm",
"--network",
"host",
"--privileged",
"--gpus",
"all",
*docker_argv,
**kwargs,
)
self.image = image

def as_container_path(self, path):
# replace local output path with docker path
Expand Down Expand Up @@ -386,7 +378,7 @@ def is_inside_docker(self):
def _argv(self, **kwargs) -> List:
# if the command is executed remotely it does not matter
# if we are inside docker or not
if (self.image is None) or (self.is_inside_docker() and not self.remote):
if (self.config.image is None) or (self.is_inside_docker() and not self.remote):
# No-op when there's no docker image to run or inside a docker
# container
return []
Expand All @@ -395,11 +387,11 @@ def _argv(self, **kwargs) -> List:

env = self.pack.make_env()
for var in ("XDG_CACHE_HOME", "OMP_NUM_THREADS"):
argv.append("--env")
argv.append(f"{var}='{self.as_container_path(env[var])}'")
if var in env:
argv.append("--env")
argv.append(f"{var}='{self.as_container_path(env[var])}'")

argv.append(self.image)
return argv
return self.config.command(argv)


class SSHCommand(WrapperCommand):
Expand Down Expand Up @@ -923,7 +915,7 @@ def make_new_node_executor(self, rank, node, base):

return DockerRunCommand(
AccelerateLaunchCommand(executor, rank=rank, **self.options),
config["system"].get("docker_image"),
DockerConfig(**config["system"].get("docker")),
)


Expand Down
28 changes: 28 additions & 0 deletions milabench/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,34 @@ class DatasetConfig:
buffer: str = defaultfield("data.buffer", str, default="${dirs.base}/buffer")


def default_docker_args(base="/tmp/workspace"):
    """Return the default argument list passed to the container ``run`` command.

    Args:
        base: host directory whose ``data`` and ``runs`` sub-directories are
            mounted on ``/milabench/envs/data`` and ``/milabench/envs/runs``.
            A trailing ``/`` is ignored. The default reproduces the
            previously hard-coded ``/tmp/workspace`` mounts.

    Returns:
        A new list of command line arguments. The ``MILABENCH_HF_TOKEN``
        entry is read from the environment at call time and falls back to
        the literal ``Undefined`` when the variable is not set.
    """
    # Normalise so that "/tmp/workspace/" does not yield "/tmp/workspace//data".
    base = str(base).rstrip("/")
    token = os.getenv("MILABENCH_HF_TOKEN", "Undefined")

    return [
        "-it", "--rm", "--ipc=host", "--gpus=all",
        "--network", "host",
        "--privileged",
        "-e", f"MILABENCH_HF_TOKEN={token}",
        "-v", f"{base}/data:/milabench/envs/data",
        "-v", f"{base}/runs:/milabench/envs/runs",
    ]


@dataclass
class DockerConfig:
    """Settings used to assemble a container ``run`` command line."""

    # Program placed first on the generated command line.
    executable: str = defaultfield("docker.executable", str, "podman")
    # Image reference, appended last on the generated command line.
    image: str = defaultfield("docker.image", str, None)
    # NOTE(review): not read by `command`; presumably the host workspace
    # directory referenced from the configuration file — confirm.
    base: str = defaultfield("docker.base", str, "/tmp/workspace")
    # Arguments inserted right after `run`.
    args: list = defaultfield("docker.args", list, default_docker_args())

    def command(self, extra_args):
        """Return the full command as a list.

        The layout is: executable, ``run``, the configured ``args``, then
        ``extra_args``, and finally the image.
        """
        argv = [self.executable, "run"]
        argv.extend(self.args)
        argv.extend(extra_args)
        argv.append(self.image)
        return argv


@dataclass
class Dirs:
"""Common directories used by milabench. This can be used to override
Expand Down

0 comments on commit cf751f7

Please sign in to comment.