From 5ada631f8b623adc204dbfa19c0892275e13e8a4 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 10:15:41 +0000 Subject: [PATCH 1/7] Update README for parallelstore related example blueprint --- examples/README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/examples/README.md b/examples/README.md index 30883ce0f9..4abeb289f9 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1518,6 +1518,30 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml +### [gke-storage-parallelstore.yaml] ![core-badge] ![experimental-badge] + +This blueprint shows how to use parallelstore storage options with GKE in the toolkit. + +The blueprint contains the following: + +* A K8s Job that uses a parallelstore storage volume option. +* A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. + +> **Warning**: In this example, when storage type `Parallelstore` is specified in `gke-storage` module. +> The lifecycle of the parallelstore is not managed by the blueprint. +> On glcuster destroy ops, the Parallelstore created will also be destroyed. +> +> [!Note] +> The Kubernetes API server will only allow requests from authorized networks. +> The `gke-cluster` module needs access to the Kubernetes API server +> to create a Persistent Volume and a Persistent Volume Claim. **You must use +> the `authorized_cidr` variable to supply an authorized network which contains +> the IP address of the machine deploying the blueprint, for example +> `--vars authorized_cidr=/32`.** You can use a service like +> [whatismyip.com](https://whatismyip.com) to determine your IP address. + +[gke-storage-parallelstore.yaml]: ../examples/gke-storage-parallelstore.yaml + ### [gke-a3-megagpu.yaml] ![core-badge] ![experimental-badge] This blueprint shows how to provision a GKE cluster with A3 Mega machines in the toolkit. 
From e74ca300335688a8c8a43250c90c766b5887af32 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 11:11:11 +0000 Subject: [PATCH 2/7] Update README with GKE parallelstore related example blueprint details --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 4abeb289f9..95cea23dae 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1528,8 +1528,8 @@ The blueprint contains the following: * A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. > **Warning**: In this example, when storage type `Parallelstore` is specified in `gke-storage` module. -> The lifecycle of the parallelstore is not managed by the blueprint. -> On glcuster destroy ops, the Parallelstore created will also be destroyed. +> The lifecycle of the parallelstore is managed by the blueprint. +> On glcuster destroy ops, the Parallelstore storage created will also be destroyed. > > [!Note] > The Kubernetes API server will only allow requests from authorized networks. From 2e497b138e886e99b7b6414462f1388312731700 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Mon, 16 Dec 2024 14:10:50 +0000 Subject: [PATCH 3/7] Update README with GKE parallelstore related example blueprint details --- examples/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index 95cea23dae..b2a7bd7b3c 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1527,7 +1527,7 @@ The blueprint contains the following: * A K8s Job that uses a parallelstore storage volume option. * A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. -> **Warning**: In this example, when storage type `Parallelstore` is specified in `gke-storage` module. +> **Warning**: In this example blueprint, when storage type `Parallelstore` is specified in `gke-storage` module. 
> The lifecycle of the parallelstore is managed by the blueprint. > On glcuster destroy ops, the Parallelstore storage created will also be destroyed. > From d7723f47d706134b8a9b77e5d4b2fd332e646879 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Tue, 17 Dec 2024 06:33:34 +0000 Subject: [PATCH 4/7] Updated blueprint name from gke-storage-parallelstore to gke-storage-managed-parallelstore --- examples/README.md | 10 +++++----- ...ore.yaml => gke-storage-managed-parallelstore.yaml} | 4 ++-- modules/file-system/gke-storage/README.md | 2 +- ...ore.yaml => gke-storage-managed-parallelstore.yaml} | 6 +++--- ...store.yml => gke-storage-managed-parallelstore.yml} | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) rename examples/{gke-storage-parallelstore.yaml => gke-storage-managed-parallelstore.yaml} (98%) rename tools/cloud-build/daily-tests/builds/{gke-storage-parallelstore.yaml => gke-storage-managed-parallelstore.yaml} (93%) rename tools/cloud-build/daily-tests/tests/{gke-storage-parallelstore.yml => gke-storage-managed-parallelstore.yml} (77%) diff --git a/examples/README.md b/examples/README.md index b2a7bd7b3c..73272df3cb 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1518,14 +1518,14 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml -### [gke-storage-parallelstore.yaml] ![core-badge] ![experimental-badge] +### [gke-storage-managed-parallelstore.yaml] ![core-badge] ![experimental-badge] -This blueprint shows how to use parallelstore storage options with GKE in the toolkit. +This blueprint shows how to use managed parallelstore storage options with GKE in the toolkit. The blueprint contains the following: -* A K8s Job that uses a parallelstore storage volume option. -* A K8s Job that demonstrates ML training workload with parallelstore storage disk ops. +* A K8s Job that uses a managed parallelstore storage volume option. 
+* A K8s Job that demonstrates ML training workload with managed parallelstore storage disk ops. > **Warning**: In this example blueprint, when storage type `Parallelstore` is specified in `gke-storage` module. > The lifecycle of the parallelstore is managed by the blueprint. @@ -1540,7 +1540,7 @@ The blueprint contains the following: > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. -[gke-storage-parallelstore.yaml]: ../examples/gke-storage-parallelstore.yaml +[gke-storage-managed-parallelstore.yaml]: ../examples/gke-storage-managed-parallelstore.yaml ### [gke-a3-megagpu.yaml] ![core-badge] ![experimental-badge] diff --git a/examples/gke-storage-parallelstore.yaml b/examples/gke-storage-managed-parallelstore.yaml similarity index 98% rename from examples/gke-storage-parallelstore.yaml rename to examples/gke-storage-managed-parallelstore.yaml index ac8f5773b9..414a2b180d 100644 --- a/examples/gke-storage-parallelstore.yaml +++ b/examples/gke-storage-managed-parallelstore.yaml @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -blueprint_name: gke-storage-parallelstore +blueprint_name: gke-storage-managed-parallelstore vars: project_id: ## Set GCP Project ID Here ## - deployment_name: gke-storage-ps + deployment_name: gke-storage-managed-ps region: us-central1 zone: us-central1-c # Cidr block containing the IP of the machine calling terraform. diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 17c718aa37..f4ebd8add0 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -39,7 +39,7 @@ then use them in a `gke-job-template` to dynamically provision the resource. 
``` See example -[gke-storage-parallelstore.yaml](../../../examples/README.md#gke-storage-parallelstoreyaml--) blueprint +[gke-storage-managed-parallelstore.yaml](../../../examples/README.md#gke-storage-managed-parallelstoreyaml--) blueprint for a complete example. ### Authorized Network diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml similarity index 93% rename from tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml rename to tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml index a51c8cebab..8fbc9c1794 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage-parallelstore.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml @@ -27,7 +27,7 @@ timeout: 14400s # 4hr steps: ## Test GKE -- id: gke-storage-parallelstore +- id: gke-storage-managed-parallelstore name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: @@ -40,7 +40,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/gke-storage-parallelstore.yaml + SG_EXAMPLE=examples/gke-storage-managed-parallelstore.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} @@ -58,4 +58,4 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml similarity index 77% rename from 
tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml rename to tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml index a6de4bf239..bfb8bc32d7 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage-parallelstore.yml +++ b/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -test_name: gke-storage-parallelstore -deployment_name: gke-storage-parallelstore-{{ build }} +test_name: gke-storage-managed-parallelstore +deployment_name: gke-storage-managed-parallelstore-{{ build }} zone: us-central1-a # for remote node region: us-central1 workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-storage-parallelstore.yaml" +blueprint_yaml: "{{ workspace }}/examples/gke-storage-managed-parallelstore.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: -- test-validation/test-gke-storage-parallelstore.yml +- test-validation/test-gke-storage-managed-parallelstore.yml custom_vars: project: "{{ project }}" cli_deployment_vars: From 2bc964e2e9ab564dc43dd5c47c521adfc9962c6f Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Wed, 18 Dec 2024 19:19:26 +0000 Subject: [PATCH 5/7] Update ops to operation --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 73272df3cb..46ab3d11c0 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1525,11 +1525,11 @@ This blueprint shows how to use managed parallelstore storage options with GKE i The blueprint contains the following: * A K8s Job that uses a managed parallelstore storage volume option. -* A K8s Job that demonstrates ML training workload with managed parallelstore storage disk ops. +* A K8s Job that demonstrates an ML training workload with managed parallelstore storage disk operations. 
> **Warning**: In this example blueprint, when storage type `Parallelstore` is specified in `gke-storage` module. > The lifecycle of the parallelstore is managed by the blueprint. -> On glcuster destroy ops, the Parallelstore storage created will also be destroyed. +> On cluster destroy operation, the Parallelstore storage created will also be destroyed. > > [!Note] > The Kubernetes API server will only allow requests from authorized networks. From ee06379599aa960c8da1c88896304d584cca4c16 Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Thu, 19 Dec 2024 06:30:13 +0000 Subject: [PATCH 6/7] Fix gke parallelstore blueprint name going beyond network char limit --- examples/README.md | 4 ++-- ...-parallelstore.yaml => gke-managed-parallelstore.yaml} | 2 +- modules/file-system/gke-storage/README.md | 2 +- ...-parallelstore.yaml => gke-managed-parallelstore.yaml} | 6 +++--- ...ed-parallelstore.yml => gke-managed-parallelstore.yml} | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) rename examples/{gke-storage-managed-parallelstore.yaml => gke-managed-parallelstore.yaml} (98%) rename tools/cloud-build/daily-tests/builds/{gke-storage-managed-parallelstore.yaml => gke-managed-parallelstore.yaml} (90%) rename tools/cloud-build/daily-tests/tests/{gke-storage-managed-parallelstore.yml => gke-managed-parallelstore.yml} (77%) diff --git a/examples/README.md b/examples/README.md index 46ab3d11c0..29db27df94 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1518,7 +1518,7 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml -### [gke-storage-managed-parallelstore.yaml] ![core-badge] ![experimental-badge] +### [gke-managed-parallelstore.yaml] ![core-badge] ![experimental-badge] This blueprint shows how to use managed parallelstore storage options with GKE in the toolkit. 
@@ -1540,7 +1540,7 @@ The blueprint contains the following: > `--vars authorized_cidr=/32`.** You can use a service like > [whatismyip.com](https://whatismyip.com) to determine your IP address. -[gke-storage-managed-parallelstore.yaml]: ../examples/gke-storage-managed-parallelstore.yaml +[gke-managed-parallelstore.yaml]: ../examples/gke-managed-parallelstore.yaml ### [gke-a3-megagpu.yaml] ![core-badge] ![experimental-badge] diff --git a/examples/gke-storage-managed-parallelstore.yaml b/examples/gke-managed-parallelstore.yaml similarity index 98% rename from examples/gke-storage-managed-parallelstore.yaml rename to examples/gke-managed-parallelstore.yaml index 414a2b180d..4425f13181 100644 --- a/examples/gke-storage-managed-parallelstore.yaml +++ b/examples/gke-managed-parallelstore.yaml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -blueprint_name: gke-storage-managed-parallelstore +blueprint_name: gke-managed-parallelstore vars: project_id: ## Set GCP Project ID Here ## deployment_name: gke-storage-managed-ps diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index f4ebd8add0..fc65e76d4d 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -39,7 +39,7 @@ then use them in a `gke-job-template` to dynamically provision the resource. ``` See example -[gke-storage-managed-parallelstore.yaml](../../../examples/README.md#gke-storage-managed-parallelstoreyaml--) blueprint +[gke-managed-parallelstore.yaml](../../../examples/README.md#gke-managed-parallelstoreyaml--) blueprint for a complete example. 
### Authorized Network diff --git a/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml b/tools/cloud-build/daily-tests/builds/gke-managed-parallelstore.yaml similarity index 90% rename from tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml rename to tools/cloud-build/daily-tests/builds/gke-managed-parallelstore.yaml index 8fbc9c1794..01010a0435 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage-managed-parallelstore.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-managed-parallelstore.yaml @@ -27,7 +27,7 @@ timeout: 14400s # 4hr steps: ## Test GKE -- id: gke-storage-managed-parallelstore +- id: gke-managed-parallelstore name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: @@ -40,7 +40,7 @@ steps: cd /workspace && make BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} - SG_EXAMPLE=examples/gke-storage-managed-parallelstore.yaml + SG_EXAMPLE=examples/gke-managed-parallelstore.yaml # adding vm to act as remote node echo ' - id: remote-node' >> $${SG_EXAMPLE} @@ -58,4 +58,4 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml" diff --git a/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml b/tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml similarity index 77% rename from tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml rename to tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml index bfb8bc32d7..cd9e7f712b 100644 --- a/tools/cloud-build/daily-tests/tests/gke-storage-managed-parallelstore.yml +++ 
b/tools/cloud-build/daily-tests/tests/gke-managed-parallelstore.yml @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -test_name: gke-storage-managed-parallelstore -deployment_name: gke-storage-managed-parallelstore-{{ build }} +test_name: gke-managed-parallelstore +deployment_name: gke-managed-parallelstore-{{ build }} zone: us-central1-a # for remote node region: us-central1 workspace: /workspace -blueprint_yaml: "{{ workspace }}/examples/gke-storage-managed-parallelstore.yaml" +blueprint_yaml: "{{ workspace }}/examples/gke-managed-parallelstore.yaml" network: "{{ deployment_name }}-net" remote_node: "{{ deployment_name }}-0" post_deploy_tests: -- test-validation/test-gke-storage-managed-parallelstore.yml +- test-validation/test-gke-managed-parallelstore.yml custom_vars: project: "{{ project }}" cli_deployment_vars: From f6133471ac53b35f76bd6116caa1d3d1ff7c2f9d Mon Sep 17 00:00:00 2001 From: Mohit Chaurasia Date: Thu, 19 Dec 2024 07:15:43 +0000 Subject: [PATCH 7/7] Updated ansible playbook test file name --- ...orage-parallelstore.yml => test-gke-managed-parallelstore.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tools/cloud-build/daily-tests/ansible_playbooks/test-validation/{test-gke-storage-parallelstore.yml => test-gke-managed-parallelstore.yml} (100%) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml b/tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-parallelstore.yml similarity index 100% rename from tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-storage-parallelstore.yml rename to tools/cloud-build/daily-tests/ansible_playbooks/test-validation/test-gke-managed-parallelstore.yml