diff --git a/manifests/v2/base/kustomization.yaml b/manifests/v2/base/kustomization.yaml deleted file mode 100644 index 43eb72b72e..0000000000 --- a/manifests/v2/base/kustomization.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -# We can't set namespace in the overlays since we use remote JobSet manifests in the resources. -namespace: kubeflow-system -resources: - - ./crds - - ./rbac - - ./webhook - - ./manager diff --git a/manifests/v2/base/manager/kustomization.yaml b/manifests/v2/base/manager/kustomization.yaml index 7394a6d059..a62e9473d9 100644 --- a/manifests/v2/base/manager/kustomization.yaml +++ b/manifests/v2/base/manager/kustomization.yaml @@ -1,2 +1,4 @@ resources: - manager.yaml +# TODO (andreyvelich): Move it to overlays once we copy the JobSet manifests. +namespace: kubeflow-system diff --git a/manifests/v2/base/rbac/kustomization.yaml b/manifests/v2/base/rbac/kustomization.yaml index 25a37bf74f..e9fca6afba 100644 --- a/manifests/v2/base/rbac/kustomization.yaml +++ b/manifests/v2/base/rbac/kustomization.yaml @@ -2,3 +2,5 @@ resources: - role.yaml - role_binding.yaml - service_account.yaml +# TODO (andreyvelich): Move it to overlays once we copy the JobSet manifests. +namespace: kubeflow-system diff --git a/manifests/v2/base/runtimes/pre-training/kustomization.yaml b/manifests/v2/base/runtimes/pre-training/kustomization.yaml new file mode 100644 index 0000000000..1fb6985131 --- /dev/null +++ b/manifests/v2/base/runtimes/pre-training/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - torch-distributed.yaml diff --git a/manifests/v2/base/runtimes/pre-training/torch-distributed.yaml b/manifests/v2/base/runtimes/pre-training/torch-distributed.yaml new file mode 100644 index 0000000000..a28523de0e --- /dev/null +++ b/manifests/v2/base/runtimes/pre-training/torch-distributed.yaml @@ -0,0 +1,33 @@ +apiVersion: kubeflow.org/v2alpha1 +kind: ClusterTrainingRuntime +metadata: + name: torch-distributed + labels: + training.kubeflow.org/phase: pre-training +spec: + mlPolicy: + numNodes: 1 + torch: + numProcPerNode: auto + template: + spec: + replicatedJobs: + - name: trainer-node + template: + spec: + template: + spec: + containers: + - name: trainer + image: pytorch/pytorch:2.5.0-cuda12.4-cudnn9-runtime + command: + - /bin/bash + - -c + - | + echo "Torch Distributed Runtime" + + echo "--------------------------------------" + echo "Torch Default Runtime Env" + env | grep PET_ + + pip list diff --git a/manifests/v2/base/webhook/kustomization.yaml b/manifests/v2/base/webhook/kustomization.yaml index 5723808d02..1ea670ceef 100644 --- a/manifests/v2/base/webhook/kustomization.yaml +++ b/manifests/v2/base/webhook/kustomization.yaml @@ -10,3 +10,5 @@ patches: kind: ValidatingWebhookConfiguration configurations: - kustomizeconfig.yaml +# TODO (andreyvelich): Move it to overlays once we copy the JobSet manifests. +namespace: kubeflow-system diff --git a/manifests/v2/overlays/only-manager/kustomization.yaml b/manifests/v2/overlays/only-manager/kustomization.yaml new file mode 100644 index 0000000000..b6f81239d8 --- /dev/null +++ b/manifests/v2/overlays/only-manager/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml + - ../../base/crds + - ../../base/manager + - ../../base/rbac + - ../../base/webhook + # TODO (andreyvelich): JobSet should support kubeflow-system namespace. + - https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml +images: + - name: kubeflow/training-operator-v2 + newTag: latest +secretGenerator: + - name: training-operator-v2-webhook-cert + namespace: kubeflow-system + options: + disableNameSuffixHash: true diff --git a/manifests/v2/overlays/only-manager/namespace.yaml b/manifests/v2/overlays/only-manager/namespace.yaml new file mode 100644 index 0000000000..6bfc4968bd --- /dev/null +++ b/manifests/v2/overlays/only-manager/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kubeflow-system diff --git a/manifests/v2/overlays/only-runtimes/kustomization.yaml b/manifests/v2/overlays/only-runtimes/kustomization.yaml new file mode 100644 index 0000000000..41fb29b783 --- /dev/null +++ b/manifests/v2/overlays/only-runtimes/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../../base/runtimes/pre-training diff --git a/manifests/v2/overlays/standalone/kustomization.yaml b/manifests/v2/overlays/standalone/kustomization.yaml index 1ddb0a6f9e..2a59e17ed4 100644 --- a/manifests/v2/overlays/standalone/kustomization.yaml +++ b/manifests/v2/overlays/standalone/kustomization.yaml @@ -2,7 +2,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - namespace.yaml - - ../../base + - ../../base/crds + - ../../base/manager + - ../../base/rbac + - ../../base/webhook + - ../../base/runtimes/pre-training # TODO (andreyvelich): JobSet should support kubeflow-system namespace. - https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml images: