Skip to content

Commit

Permalink
Merge pull request #3098 from GoogleCloudPlatform/main
Browse files Browse the repository at this point in the history
Merge v1.40.0 release into develop branch
  • Loading branch information
tpdownes authored Oct 3, 2024
2 parents b07c131 + f9f9256 commit 279ba8d
Show file tree
Hide file tree
Showing 45 changed files with 84 additions and 56 deletions.
2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`,
logging.Fatal("cmd.Help function failed: %s", err)
}
},
Version: "v1.39.0",
Version: "v1.40.0",
Annotations: annotation,
}
)
Expand Down
1 change: 1 addition & 0 deletions community/examples/hpc-slurm6-tpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ deployment_groups:
use: [tpu_nodeset]
settings:
partition_name: tpu
resume_timeout: 600

- id: slurm_login
source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ terraform {
}

provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.40.0"
}
}
2 changes: 1 addition & 1 deletion community/modules/compute/mig/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:mig/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:mig/v1.40.0"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.40.0"
}
required_version = ">= 1.1"
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.40.0"
}
required_version = ">= 0.13.0"
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.40.0"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ terraform {
required_version = ">= 1.3"

provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.40.0"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.40.0"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ terraform {
required_version = ">= 1.3"

provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.40.0"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.40.0"
}
provider_meta "google-beta" {
module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.40.0"
}

required_version = ">= 0.13.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.40.0"
}
required_version = ">= 0.14.0"
}
2 changes: 1 addition & 1 deletion community/modules/file-system/nfs-server/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.40.0"
}

required_version = ">= 0.14.0"
Expand Down
4 changes: 2 additions & 2 deletions community/modules/files/fsi-montecarlo-on-batch/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.40.0"
}
provider_meta "google-beta" {
module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.40.0"
}
}
4 changes: 2 additions & 2 deletions community/modules/network/private-service-access/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.40.0"
}

provider_meta "google-beta" {
module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.40.0"
}

required_version = ">= 1.2"
Expand Down
2 changes: 1 addition & 1 deletion community/modules/project/service-enablement/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.40.0"
}

required_version = ">= 0.14.0"
Expand Down
4 changes: 2 additions & 2 deletions community/modules/pubsub/bigquery-sub/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.40.0"
}
provider_meta "google-beta" {
module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.40.0"
}
required_version = ">= 1.0"
}
2 changes: 1 addition & 1 deletion community/modules/pubsub/topic/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:topic/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:topic/v1.40.0"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.40.0"
}

required_version = ">= 1.1"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.40.0"
}

required_version = ">= 1.1.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.40.0"
}

required_version = ">= 1.3.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.40.0"
}
required_version = ">= 1.1"
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.40.0"
}
required_version = ">= 1.1"
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.40.0"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.40.0"
}
}
2 changes: 1 addition & 1 deletion community/modules/scripts/wait-for-startup/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.40.0"
}

required_version = ">= 0.14.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

terraform {
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.40.0"
}

required_version = ">= 0.14.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ deployment_groups:
content: |
#!/bin/bash
curl -s --create-dirs -o /opt/apps/adm/slurm/scripts/receive-data-path-manager \
https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/v5/tools/prologs-epilogs/receive-data-path-manager
https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/receive-data-path-manager
chmod 0755 /opt/apps/adm/slurm/scripts/receive-data-path-manager
mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-prolog_slurmd.d
mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-epilog_slurmd.d
Expand Down
12 changes: 11 additions & 1 deletion examples/machine-learning/a3-highgpu-8g/nemo-framework/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,16 @@ README

3. Run an example NeMo Framework Pre-Training

First, prepare the cache. This will download several files to the
~/.cache/huggingface folder which are needed to load the tokenizer for
training.

```shell
pip install transformers
python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('gpt2')"
```

This will run an example of training a 5B parameter GPT3 model for 10 steps
using mock data as the input.

Expand All @@ -36,7 +46,7 @@ README
stages=[training] \
env_vars.TRANSFORMERS_OFFLINE=0 \
container=../nemofw+tcpx-23.11.sqsh \
container_mounts='["/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"]' \
container_mounts=[${HOME}/.cache,"/var/lib/tcpx/lib64","/run/tcpx-\${SLURM_JOB_ID}:/run/tcpx"] \
cluster.srun_args=["--container-writable"] \
training.model.data.data_impl=mock \
training.model.data.data_prefix=[] \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,9 @@ deployment_groups:
destination: stage_scripts.sh
content: |
#!/bin/bash
# use script from master branch which is actively maintained
curl -s --create-dirs -o /opt/apps/adm/slurm/scripts/receive-data-path-manager \
https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/v5/tools/prologs-epilogs/receive-data-path-manager
https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/master/tools/prologs-epilogs/receive-data-path-manager
chmod 0755 /opt/apps/adm/slurm/scripts/receive-data-path-manager
mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-prolog_slurmd.d
mkdir -p /opt/apps/adm/slurm/partition-$(vars.a3_partition_name)-epilog_slurmd.d
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

ARG NEMOFW_VERSION=23.11
FROM nvcr.io/nvidia/nemo:${NEMOFW_VERSION}.framework
ARG NEMOFW_VERSION=24.07
FROM nvcr.io/nvidia/nemo:$NEMOFW_VERSION

ENV NCCL_FASTRAK_CTRL_DEV=enp0s12
ENV NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0
ENV NCCL_SOCKET_IFNAME=enp0s12
ENV GLOO_SOCKET_IFNAME=enp0s12
ENV NCCL_CROSS_NIC=0
ENV NCCL_ALGO=Ring
ENV NCCL_ALGO=Ring,Tree
ENV NCCL_PROTO=Simple
ENV NCCL_MIN_NCHANNELS=4
ENV NCCL_DYNAMIC_CHUNK_SIZE=524288
Expand All @@ -35,6 +36,11 @@ ENV NCCL_NET_GDR_LEVEL=PIX
ENV NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=0
ENV NCCL_FASTRAK_USE_LLCM=1
ENV NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices
ENV NCCL_TUNER_PLUGIN=libnccl-tuner.so
ENV NCCL_TUNER_CONFIG_PATH=/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto
ENV NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/var/lib/tcpxo/lib64/a3plus_guest_config.textproto
ENV NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS=600000
ENV NCCL_NVLS_ENABLE=0

RUN echo "/var/lib/tcpxo/lib64" >> /etc/ld.so.conf.d/tcpxo.conf && ldconfig
ENV LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:$LD_LIBRARY_PATH
18 changes: 14 additions & 4 deletions examples/machine-learning/a3-megagpu-8g/nemo-framework/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ README

1. Set up NeMo Framework Container

This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:23.11.framework](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)
This makes a few environment variable modifications to the [nvcr.io/nvidia/nemo:24.07](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo)
container, and submits a Slurm job to copy the framework launcher scripts and a
few other auxiliary files into your working directory.

Expand All @@ -21,13 +21,23 @@ README
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt # Copied from the NeMo Framework Container earlier
# This is needed to use 23.11 and python3.11, which is what is present on
# This is needed to use 24.07 and python3.11, which is what is present on
# Debian 12
pip install -U hydra-core
```

3. Run an example NeMo Framework Pre-Training

First, prepare the cache. This will download several files to the
~/.cache/huggingface folder which are needed to load the tokenizer for
training.

```shell
pip install transformers
python -c "from transformers import AutoTokenizer; \
AutoTokenizer.from_pretrained('gpt2')"
```

This will run an example of training a 5B parameter GPT3 model for 10 steps
using mock data as the input.

Expand All @@ -43,8 +53,8 @@ README
stages=[training] \
training=gpt3/5b \
env_vars.TRANSFORMERS_OFFLINE=0 \
container=../nemofw+tcpxo-23.11.sqsh \
container_mounts='["/var/lib/tcpxo/lib64"]' \
container=../nemofw+tcpxo-24.07.sqsh \
container_mounts=[${HOME}/.cache,/var/lib/tcpxo/lib64] \
cluster.srun_args=["--container-writable"] \
training.model.data.data_impl=mock \
training.model.data.data_prefix=[] \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#SBATCH --partition=a3mega
#SBATCH --exclusive

: "${NEMOFW_VERSION:=23.11}"
: "${NEMOFW_VERSION:=24.07}"

srun docker build --build-arg="NEMOFW_VERSION=${NEMOFW_VERSION}" -t nemofw:tcpxo-"${NEMOFW_VERSION}" .
srun rm -f nemofw+tcpxo-"${NEMOFW_VERSION}".sqsh
Expand All @@ -27,4 +27,4 @@ srun enroot import dockerd://nemofw:tcpxo-"${NEMOFW_VERSION}"
srun \
--container-mounts="${PWD}":/workspace/mount_dir,/var/tmp:/var/tmp \
--container-image=./nemofw+tcpxo-"${NEMOFW_VERSION}".sqsh \
bash -c "cp -r /opt/NeMo-Megatron-Launcher/requirements.txt /opt/NeMo-Megatron-Launcher/launcher_scripts /opt/NeMo-Megatron-Launcher/auto_configurator /workspace/mount_dir/"
bash -c "cp -r /opt/NeMo-Framework-Launcher/requirements.txt /opt/NeMo-Framework-Launcher/launcher_scripts /opt/NeMo-Framework-Launcher/auto_configurator /workspace/mount_dir/"
2 changes: 1 addition & 1 deletion modules/compute/gke-node-pool/versions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ terraform {
}
}
provider_meta "google" {
module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.39.0"
module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.40.0"
}
}
Loading

0 comments on commit 279ba8d

Please sign in to comment.