Skip to content

Commit

Permalink
KEP-2170: Add Torch Distributed Runtime (#2328)
Browse files Browse the repository at this point in the history
* KEP-2170: Add Torch Distributed Runtime

Signed-off-by: Andrey Velichkevich <[email protected]>

* Add pip list

Signed-off-by: Andrey Velichkevich <[email protected]>

---------

Signed-off-by: Andrey Velichkevich <[email protected]>
  • Loading branch information
andreyvelich authored Nov 28, 2024
1 parent 59b233c commit 4cc1709
Show file tree
Hide file tree
Showing 10 changed files with 74 additions and 10 deletions.
9 changes: 0 additions & 9 deletions manifests/v2/base/kustomization.yaml

This file was deleted.

2 changes: 2 additions & 0 deletions manifests/v2/base/manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
# Kustomization for the manager base: pulls in manager.yaml and sets the
# kubeflow-system namespace on it.
# TODO (andreyvelich): Move the namespace field to overlays once we copy the
# JobSet manifests.
namespace: kubeflow-system
resources:
  - manager.yaml
2 changes: 2 additions & 0 deletions manifests/v2/base/rbac/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ resources:
- role.yaml
- role_binding.yaml
- service_account.yaml
# TODO (andreyvelich): Move the namespace field to overlays once we copy the JobSet manifests.
namespace: kubeflow-system
4 changes: 4 additions & 0 deletions manifests/v2/base/runtimes/pre-training/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Kustomization for the pre-training runtimes; the only resource listed is
# torch-distributed.yaml.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - torch-distributed.yaml
33 changes: 33 additions & 0 deletions manifests/v2/base/runtimes/pre-training/torch-distributed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# ClusterTrainingRuntime "torch-distributed", labelled for the pre-training
# phase. Runs a single "trainer" container that prints diagnostic
# information and exits.
apiVersion: kubeflow.org/v2alpha1
kind: ClusterTrainingRuntime
metadata:
  name: torch-distributed
  labels:
    training.kubeflow.org/phase: pre-training
spec:
  # Defaults for the ML policy: one node, per-node process count set to
  # "auto".
  mlPolicy:
    numNodes: 1
    torch:
      numProcPerNode: auto
  # Job template with one replicated job (NOTE(review): presumably a JobSet
  # spec -- confirm against the ClusterTrainingRuntime CRD).
  template:
    spec:
      replicatedJobs:
        - name: trainer-node
          template:
            spec:
              template:
                spec:
                  containers:
                    - name: trainer
                      image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime
                      # Prints a banner, the environment variables whose
                      # names or values contain "PET_", and the installed
                      # pip packages.
                      command:
                        - /bin/bash
                        - -c
                        - |
                          echo "Torch Distributed Runtime"
                          echo "--------------------------------------"
                          echo "Torch Default Runtime Env"
                          env | grep PET_
                          pip list
2 changes: 2 additions & 0 deletions manifests/v2/base/webhook/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ patches:
kind: ValidatingWebhookConfiguration
configurations:
- kustomizeconfig.yaml
# TODO (andreyvelich): Move the namespace field to overlays once we copy the JobSet manifests.
namespace: kubeflow-system
18 changes: 18 additions & 0 deletions manifests/v2/overlays/only-manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# "only-manager" overlay: the namespace, CRDs, manager, RBAC, and webhook
# bases plus the upstream JobSet v0.6.0 manifests. The runtimes base is not
# included here.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - ../../base/crds
  - ../../base/manager
  - ../../base/rbac
  - ../../base/webhook
  # TODO (andreyvelich): JobSet should support kubeflow-system namespace.
  - https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml
# Image override: sets the tag of kubeflow/training-operator-v2 to "latest".
images:
  - name: kubeflow/training-operator-v2
    newTag: latest
# Generates the webhook certificate Secret in kubeflow-system. The name-suffix
# hash is disabled so the Secret keeps the exact name given below.
secretGenerator:
  - name: training-operator-v2-webhook-cert
    namespace: kubeflow-system
    options:
      disableNameSuffixHash: true
4 changes: 4 additions & 0 deletions manifests/v2/overlays/only-manager/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Namespace object named kubeflow-system.
apiVersion: v1
kind: Namespace
metadata:
  name: kubeflow-system
4 changes: 4 additions & 0 deletions manifests/v2/overlays/only-runtimes/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# "only-runtimes" overlay: includes only the pre-training runtimes base.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../../base/runtimes/pre-training
6 changes: 5 additions & 1 deletion manifests/v2/overlays/standalone/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- ../../base
- ../../base/crds
- ../../base/manager
- ../../base/rbac
- ../../base/webhook
- ../../base/runtimes/pre-training
# TODO (andreyvelich): JobSet should support kubeflow-system namespace.
- https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml
images:
Expand Down

0 comments on commit 4cc1709

Please sign in to comment.