-
Notifications
You must be signed in to change notification settings - Fork 136
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
new acpt env for torch2.1 and cuda12.1 (#2186)
* new env for cuda12.1 * updated
- Loading branch information
1 parent
5eda5db
commit cda956c
Showing
7 changed files
with
183 additions
and
0 deletions.
There are no files selected for viewing
11 changes: 11 additions & 0 deletions
11
assets/training/general/environments/acpt-pytorch-2.1-cuda12.1/asset.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Asset definition for the ACPT PyTorch 2.1 / CUDA 12.1 environment.
name: acpt-pytorch-2.1-cuda12.1
version: auto
type: environment

# Companion files that live next to this one.
spec: spec.yaml
extra_config: environment.yaml

# pytest-based validation of the environment.
test:
  pytest:
    enabled: true
    pip_requirements: tests/requirements.txt
    tests_dir: tests

categories:
  - "PyTorch"
  - "Training"
20 changes: 20 additions & 0 deletions
20
assets/training/general/environments/acpt-pytorch-2.1-cuda12.1/context/Dockerfile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu121-py38-torch210:{{latest-image-tag:biweekly\.\d{6}\.\d{1}.*}}

# Install pip dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Inference requirements
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/
RUN /var/requirements/install_system_requirements.sh && \
    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
    rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# support Deepspeed launcher requirement of passwordless ssh login
# `apt-get update` and `apt-get install` share one layer so a cached, stale
# package index can never be paired with a fresh install step. The index is
# removed in the same layer so it does not add to the image size.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        openssh-client \
        openssh-server && \
    rm -rf /var/lib/apt/lists/*
18 changes: 18 additions & 0 deletions
18
assets/training/general/environments/acpt-pytorch-2.1-cuda12.1/context/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Azure ML packages; versions are filled in from the template placeholders.
azureml-core=={{latest-pypi-version}}
azureml-dataset-runtime=={{latest-pypi-version}}
azureml-defaults=={{latest-pypi-version}}
azure-ml=={{latest-pypi-version}}
azure-ml-component=={{latest-pypi-version}}
azureml-mlflow=={{latest-pypi-version}}
azureml-contrib-services=={{latest-pypi-version}}
azureml-automl-common-tools=={{latest-pypi-version}}
torch-tb-profiler~=0.4.0
azureml-inference-server-http~=0.8.0
inference-schema~=1.5.0
MarkupSafe==2.1.2
regex
pybind11
# Minimum versions for the packages below.
urllib3>=1.26.18
cryptography>=41.0.4
aiohttp>=3.8.5
12 changes: 12 additions & 0 deletions
12
assets/training/general/environments/acpt-pytorch-2.1-cuda12.1/environment.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Image build and publish settings for the environment.
image:
  name: azureml/curated/acpt-pytorch-2.1-cuda12.1
  os: linux

  # Docker build context, relative to this file.
  context:
    dir: context
    dockerfile: Dockerfile
    # Both files contain {{...}} placeholders.
    template_files: [Dockerfile, requirements.txt]

  publish:
    location: mcr
    visibility: public
26 changes: 26 additions & 0 deletions
26
assets/training/general/environments/acpt-pytorch-2.1-cuda12.1/spec.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json

# The description names PyTorch 2.1 to match the environment name and tags.
description: >-
  Recommended environment for Deep Learning in public preview with PyTorch on Azure containing the Azure ML SDK with the latest compatible versions of Ubuntu, Python, PyTorch, CUDA\RocM, combined with optimizers like ORT Training,+DeepSpeed+MSCCL+ORT MoE and more. The image introduces newly released PyTorch 2.1 for early testing, and preview of new fastcheckpointing capability called Nebula.
    Azure Container Registry:mcr.microsoft.com/azureml/curated/acpt-pytorch-2.1-cuda12.1
name: "{{asset.name}}"
version: "{{asset.version}}"

build:
  path: "{{image.context.path}}"
  dockerfile_path: "{{image.dockerfile.path}}"

os_type: linux

tags:
  PyTorch: "2.1"
  # The environment is built on a cu121 base image, so the tag is Cuda12.
  GPU: Cuda12
  OS: Ubuntu20.04
  Training: ""
  Preview: ""
  Python: "3.8"
  DeepSpeed: "0.11.1"
  ONNXRuntime: "1.16.0"
  torch_ORT: "1.16.0"
  Checkpointing:Nebula: "0.16.10"
94 changes: 94 additions & 0 deletions
94
...s/training/general/environments/acpt-pytorch-2.1-cuda12.1/tests/pytorch2_1_sample_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# Copyright (c) Microsoft Corporation. | ||
# Licensed under the MIT License. | ||
|
||
"""Tests running a sample job in the pytorch 2.0 environment.""" | ||
import os | ||
import time | ||
from pathlib import Path | ||
from azure.ai.ml import command, Output, MLClient, PyTorchDistribution | ||
from azure.ai.ml.entities import Environment, BuildContext, JobResourceConfiguration | ||
from azure.identity import AzureCliCredential | ||
import subprocess | ||
|
||
# Docker build context for the environment under test; joined onto this
# file's directory by the test.
BUILD_CONTEXT = Path("../context")
# Source directory of the sample training job; joined onto this file's
# directory by the test.
JOB_SOURCE_CODE = "../../acpt-tests/src"
# Environment variables are always strings, so coerce to int: otherwise a
# configured "timeout_minutes" would make `TIMEOUT_MINUTES * 60` a repeated
# string rather than a number of seconds.
TIMEOUT_MINUTES = int(os.environ.get("timeout_minutes", 60))
# Path checked for the job's standard log after a failed job is downloaded.
STD_LOG = Path("artifacts/user_logs/std_log.txt")
|
||
|
||
def test_pytorch_2_1():
    """Tests a sample job using pytorch 2.1 as the environment.

    Steps:
      1. Register an environment built from the Docker context next to this
         test file.
      2. Submit a two-instance distributed command job that uses it.
      3. Poll the job status until it reaches a terminal state or
         ``TIMEOUT_MINUTES`` elapses.
      4. If the job did not succeed, download it and print ``STD_LOG`` when
         that file exists; otherwise stream the job logs.

    The workspace and compute target are read from the ``subscription_id``,
    ``resource_group``, ``workspace`` and ``gpu_v100_cluster`` environment
    variables.

    Raises:
        AssertionError: if the job cannot be submitted or does not end in the
            "Completed" state.
    """
    this_dir = Path(__file__).parent

    subscription_id = os.environ.get("subscription_id")
    resource_group = os.environ.get("resource_group")
    workspace_name = os.environ.get("workspace")

    ml_client = MLClient(
        AzureCliCredential(), subscription_id, resource_group, workspace_name
    )

    env_name = "acpt-pytorch-2_1-cuda12_1"

    env_docker_context = Environment(
        build=BuildContext(path=this_dir / BUILD_CONTEXT),
        name=env_name,
        description="Pytorch 2.1 environment created from a Docker context.",
    )
    ml_client.environments.create_or_update(env_docker_context)

    # create the command
    job = command(
        code=this_dir / JOB_SOURCE_CODE,  # local path where the code is stored
        command=(
            "pip install -r requirements.txt"
            " && python pretrain_glue.py --tensorboard_log_dir \"/outputs/runs/\""
            " --deepspeed ds_config.json --num_train_epochs 5 --output_dir outputs --disable_tqdm 1"
            " --local_rank $RANK --evaluation_strategy \"epoch\" --logging_strategy \"epoch\""
            " --per_device_train_batch_size 93 --gradient_accumulation_steps 1"
            " --per_device_eval_batch_size 93 --learning_rate 3e-05 --adam_beta1 0.8 --adam_beta2 0.999"
            " --weight_decay 3e-07 --warmup_steps 500 --fp16 --logging_steps 1000"
            " --model_checkpoint \"bert-large-uncased\""
        ),
        outputs={
            "output": Output(
                type="uri_folder",
                mode="rw_mount",
                path="azureml://datastores/workspaceblobstore/paths/outputs"
            )
        },
        environment=f"{env_name}@latest",
        compute=os.environ.get("gpu_v100_cluster"),
        display_name="bert-pretrain-GLUE",
        description="Pretrain the BERT model on the GLUE dataset.",
        experiment_name="pytorch21_Cuda121_Experiment",
        distribution=PyTorchDistribution(process_count_per_instance=1),
        resources=JobResourceConfiguration(instance_count=2, shm_size='3100m'),
    )

    returned_job = ml_client.create_or_update(job)
    assert returned_job is not None

    # Both spellings of the cancelled state are accepted so that a cancelled
    # job ends the polling loop instead of waiting for the full timeout.
    unsuccessful_states = ("Failed", "Canceled", "Cancelled")
    terminal_states = ("Completed",) + unsuccessful_states

    # TIMEOUT_MINUTES may be a string when it comes from the environment;
    # float() accepts both the string and the numeric default.
    deadline = time.time() + float(TIMEOUT_MINUTES) * 60

    # Poll until final status is reached or timed out. The status is fetched
    # once before the loop so `current_status` is always bound.
    current_status = ml_client.jobs.get(returned_job.name).status
    while current_status not in terminal_states and time.time() <= deadline:
        time.sleep(30)  # sleep 30 seconds
        current_status = ml_client.jobs.get(returned_job.name).status

    if current_status in unsuccessful_states:
        ml_client.jobs.download(returned_job.name)
        if STD_LOG.exists():
            print(f"*** BEGIN {STD_LOG} ***")
            with open(STD_LOG, "r") as f:
                print(f.read(), end="")
            print(f"*** END {STD_LOG} ***")
    else:
        ml_client.jobs.stream(returned_job.name)

    assert current_status == "Completed", (
        f"Job {returned_job.name} ended with status {current_status!r}"
    )
2 changes: 2 additions & 0 deletions
2
assets/training/general/environments/acpt-pytorch-2.1-cuda12.1/tests/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Client packages imported by the environment test.
azure-ai-ml==1.13.0
# Canonical hyphenated project name; pip resolves it to the same package as "azure.identity".
azure-identity==1.15.0