Skip to content

Commit

Permalink
new acpt env for torch2.1 and cuda12.1 (#2186)
Browse files Browse the repository at this point in the history
* new env for cuda12.1

* updated
  • Loading branch information
savitamittal1 authored Jan 31, 2024
1 parent 5eda5db commit cda956c
Show file tree
Hide file tree
Showing 7 changed files with 183 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Asset definition for the acpt-pytorch-2.1-cuda12.1 environment.
name: acpt-pytorch-2.1-cuda12.1
version: auto
type: environment
# Companion files that describe this asset.
spec: spec.yaml
extra_config: environment.yaml
# Test settings: pytest is enabled, with its pip requirements file and test
# directory given by the paths below.
# NOTE(review): nesting indentation is not visible in this view — presumably
# `pytest` sits under `test` and the next three keys under `pytest`; confirm.
test:
pytest:
enabled: true
pip_requirements: tests/requirements.txt
tests_dir: tests
categories: ["PyTorch", "Training"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Base image; the {{...}} tag is a template placeholder, not a literal tag.
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu121-py38-torch210:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}

# Python dependencies (pip's download cache is not kept in the layer)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Inference stack: copy the prebuilt assets into /var, run the bundled system
# requirements installer, then install the rsyslog and nginx configuration and
# enable the "app" nginx site in place of the default one.
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
RUN /var/requirements/install_system_requirements.sh \
    && cp /var/configuration/rsyslog.conf /etc/rsyslog.conf \
    && cp /var/configuration/nginx.conf /etc/nginx/sites-available/app \
    && ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app \
    && rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit \
    WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# Support the DeepSpeed launcher requirement of passwordless ssh login.
# `apt-get update` and `apt-get install` share one RUN so the install can never
# reuse a stale, separately cached package index; the index is removed in the
# same layer so it does not add to the image size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server && \
    rm -rf /var/lib/apt/lists/*
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Azure ML SDK packages; {{latest-pypi-version}} is a template placeholder.
azureml-core=={{latest-pypi-version}}
azureml-dataset-runtime=={{latest-pypi-version}}
azureml-defaults=={{latest-pypi-version}}
azure-ml=={{latest-pypi-version}}
azure-ml-component=={{latest-pypi-version}}
azureml-mlflow=={{latest-pypi-version}}
azureml-contrib-services=={{latest-pypi-version}}
azureml-automl-common-tools=={{latest-pypi-version}}
# Profiling and inference serving
torch-tb-profiler~=0.4.0
azureml-inference-server-http~=0.8.0
inference-schema~=1.5.0
# Supporting libraries
MarkupSafe==2.1.2
regex
pybind11
# Minimum-version floors
urllib3>=1.26.18
cryptography>=41.0.4
aiohttp>=3.8.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Image build and publish specification for the curated environment image.
# NOTE(review): nesting indentation is not visible in this view — presumably
# every key below sits under `image`, with `dir`/`dockerfile`/`template_files`
# under `context` and `location`/`visibility` under `publish`; confirm.
image:
name: azureml/curated/acpt-pytorch-2.1-cuda12.1
os: linux
# Build context directory and the Dockerfile inside it.
context:
dir: context
dockerfile: Dockerfile
# Files listed as templates (both contain {{...}} placeholders).
template_files:
- Dockerfile
- requirements.txt
# Publish target and visibility.
publish:
location: mcr
visibility: public
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

# Human-readable summary of the environment (folded into a single string).
description: >-
  Recommended environment for Deep Learning in public preview with PyTorch on Azure containing the Azure ML SDK with the latest compatible versions of Ubuntu, Python, PyTorch, CUDA\RocM, combined with optimizers like ORT Training,+DeepSpeed+MSCCL+ORT MoE and more. The image introduces newly released PyTorch 2.1 for early testing, and preview of new fastcheckpointing capability called Nebula.
  Azure Container Registry:mcr.microsoft.com/azureml/curated/acpt-pytorch-2.1-cuda12.1
# Name, version and build paths are template placeholders.
name: "{{asset.name}}"
version: "{{asset.version}}"

build:
  path: "{{image.context.path}}"
  dockerfile_path: "{{image.dockerfile.path}}"

os_type: linux

# Tags describing the contents of the image. The GPU tag reflects the CUDA 12.1
# base image this environment is built from.
tags:
  PyTorch: "2.1"
  GPU: Cuda12
  OS: Ubuntu20.04
  Training: ""
  Preview: ""
  Python: "3.8"
  DeepSpeed: "0.11.1"
  ONNXRuntime: "1.16.0"
  torch_ORT: "1.16.0"
  Checkpointing:Nebula: "0.16.10"
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests running a sample job in the pytorch 2.1 environment."""
import os
import time
from pathlib import Path
from azure.ai.ml import command, Output, MLClient, PyTorchDistribution
from azure.ai.ml.entities import Environment, BuildContext, JobResourceConfiguration
from azure.identity import AzureCliCredential
import subprocess

# Docker build context of the environment, resolved relative to this test file.
BUILD_CONTEXT = Path("../context")
# Source directory of the sample training job, resolved relative to this test file.
JOB_SOURCE_CODE = "../../acpt-tests/src"
# Environment variables are always strings, so convert explicitly; otherwise a
# configured "timeout_minutes" would break the timeout arithmetic in the test.
TIMEOUT_MINUTES = int(os.environ.get("timeout_minutes", 60))
# Location of the job's std log, read after the job artifacts are downloaded.
STD_LOG = Path("artifacts/user_logs/std_log.txt")


def test_pytorch_2_1():
    """Run a sample training job using pytorch 2.1 as the environment.

    Registers an environment built from the local Docker context, submits a
    DeepSpeed BERT/GLUE training command on two instances, polls the job until
    it reaches a terminal status or the timeout elapses, and asserts that the
    job completed.

    The workspace coordinates and compute target are read from the
    ``subscription_id``, ``resource_group``, ``workspace`` and
    ``gpu_v100_cluster`` environment variables.
    """
    this_dir = Path(__file__).parent

    subscription_id = os.environ.get("subscription_id")
    resource_group = os.environ.get("resource_group")
    workspace_name = os.environ.get("workspace")

    ml_client = MLClient(
        AzureCliCredential(), subscription_id, resource_group, workspace_name
    )

    env_name = "acpt-pytorch-2_1-cuda12_1"

    env_docker_context = Environment(
        build=BuildContext(path=this_dir / BUILD_CONTEXT),
        name=env_name,
        description="Pytorch 2.1 environment created from a Docker context.",
    )
    ml_client.environments.create_or_update(env_docker_context)

    # create the command
    job = command(
        code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
        command=(
            "pip install -r requirements.txt"
            " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
            " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
            " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
            " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
            " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
            " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
            " --model_checkpoint \"bert-large-uncased\""
        ),
        outputs={
            "output": Output(
                type="uri_folder",
                mode="rw_mount",
                path="azureml://datastores/workspaceblobstore/paths/outputs"
            )
        },
        environment=f"{env_name}@latest",
        compute=os.environ.get("gpu_v100_cluster"),
        display_name="bert-pretrain-GLUE",
        description="Pretrain the BERT model on the GLUE dataset.",
        experiment_name="pytorch21_Cuda121_Experiment",
        distribution=PyTorchDistribution(process_count_per_instance=1),
        resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
    )

    returned_job = ml_client.create_or_update(job)
    assert returned_job is not None

    # Statuses after which the job will not make further progress. The service
    # reports a cancelled job as "Canceled"; the "Cancelled" spelling is kept
    # so the previous check still matches if it is ever returned.
    unsuccessful_statuses = ("Failed", "Canceled", "Cancelled")
    terminal_statuses = ("Completed",) + unsuccessful_statuses

    # Poll until final status is reached or timed out. float() keeps the
    # arithmetic valid even when TIMEOUT_MINUTES was read from the environment
    # as a string.
    current_status = None
    timeout = time.time() + float(TIMEOUT_MINUTES) * 60
    while time.time() <= timeout:
        current_status = ml_client.jobs.get(returned_job.name).status
        if current_status in terminal_statuses:
            break
        time.sleep(30)  # sleep 30 seconds

    # Print the working directory listing as a debugging aid; stderr is not
    # captured, so the second print shows None.
    listing = subprocess.run(["ls"], stdout=subprocess.PIPE)
    print(listing.stdout)
    print(listing.stderr)

    if current_status in unsuccessful_statuses:
        # Download the job artifacts and echo the std log, if present, so the
        # cause of the failure shows up in the test output.
        ml_client.jobs.download(returned_job.name)
        if STD_LOG.exists():
            print(f"*** BEGIN {STD_LOG} ***")
            with open(STD_LOG, "r") as f:
                print(f.read(), end="")
            print(f"*** END {STD_LOG} ***")
    else:
        ml_client.jobs.stream(returned_job.name)

    assert current_status == "Completed"
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Pinned Azure client packages.
# NOTE(review): presumably the pytest requirements for the environment test,
# which imports azure.ai.ml and azure.identity — confirm.
azure-ai-ml==1.13.0
# NOTE(review): written with a dot rather than the usual hyphenated
# distribution name; presumably resolves to azure-identity — confirm.
azure.identity==1.15.0

0 comments on commit cda956c

Please sign in to comment.