From ee39d1d70ff6a73a6389ecd66ac2b6fa2fb1cfdd Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 18 Dec 2024 08:39:20 +0100 Subject: [PATCH 1/7] dra: move all jobs into a single file This makes editing a bit easier because one doesn't have to jump back and forth between different files while trying to keep periodic and presubmit jobs in sync. In the past, changes were made to one but not the other, perhaps because author and reviewer forgot about the other half of the jobs. It also enables usage of YAML anchors and aliases to define some settings only in one place. They are scoped to one document. --- .../sig-node/dynamic-resource-allocation.yaml | 281 +++++++++++++ .../sig-node/sig-node-presubmit.yaml | 383 ------------------ 2 files changed, 281 insertions(+), 383 deletions(-) diff --git a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml index 916da5dc9fa4..18ad04ec8acf 100644 --- a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml +++ b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml @@ -1,4 +1,8 @@ +# This file contains periodic and presubmit jobs which run tests covering +# Dynamic Resource Allocation (DRA). + periodics: + # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) # on a kind cluster with containerd updated to a version with CDI support. - name: ci-kind-dra @@ -264,3 +268,280 @@ periodics: requests: cpu: 2 memory: 9Gi + +presubmits: + kubernetes/kubernetes: + + # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) + # on a kind cluster with containerd updated to a version with CDI support. 
+ - name: pull-kubernetes-kind-dra + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + annotations: + testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-kind-dra + decorate: true + path_alias: k8s.io/kubernetes + # Not relevant for most PRs. + always_run: false + # This covers most of the code related to dynamic resource allocation. + # Periodic variant: ci-kind-dra + run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go + optional: true + decoration_config: + timeout: 90m + labels: + preset-service-account: "true" + preset-dind-enabled: "true" + preset-kind-volume-mounts: "true" + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + args: + - /bin/sh + - -xc + - > + make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" && + curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind && + kind build node-image --image=dra/node:latest . 
&& + trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT && + kind create cluster --retain --config test/e2e/dra/kind.yaml --image dra/node:latest && + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter='Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow' + + # docker-in-docker needs privileged mode + securityContext: + privileged: true + resources: + requests: + # these are both a bit below peak usage during build + # this is mostly for building kubernetes + memory: "9000Mi" + # during the tests more like 3-20m is used + cpu: 2000m + limits: + memory: "9000Mi" + cpu: 2000m + + # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (partly alpha, partly beta) + # on a kind cluster with containerd updated to a version with CDI support. + # + # Compared to pull-kubernetes-dra, this one enables all DRA-related features. + - name: pull-kubernetes-kind-dra-all + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + annotations: + testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-kind-dra-all + decorate: true + path_alias: k8s.io/kubernetes + # Not relevant for most PRs. + always_run: false + # This covers most of the code related to dynamic resource allocation. + # Periodic variant: ci-kind-dra-all + run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go + # The tests might still be flaky or this job might get triggered accidentally for + # an unrelated PR. 
+ optional: true + decoration_config: + timeout: 90m + labels: + preset-service-account: "true" + preset-dind-enabled: "true" + preset-kind-volume-mounts: "true" + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + args: + - /bin/bash + - -xc + - | + set -ex + make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" + curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind + kind build node-image --image=dra/node:latest . + trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT + # Which DRA features exist depends on the PR that is being tested. + features=( $(grep '"DRA' pkg/features/kube_features.go | sed 's/.*"\(.*\)"/\1/') ) + echo "Enabling DRA feature(s): ${features[*]}." + # Those additional features are not in kind.yaml, but they can be added at the end. + kind create cluster --retain --config <(cat test/e2e/dra/kind.yaml; for feature in ${features}; do echo " ${feature}: true"; done) --image dra/node:latest + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Alpha, Beta, DynamicResourceAllocation$(for feature in ${features}; do echo , ${feature}; done)} && !Flaky && !Slow" + + # docker-in-docker needs privileged mode + securityContext: + privileged: true + resources: + requests: + # these are both a bit below peak usage during build + # this is mostly for building kubernetes + memory: "9000Mi" + # during the tests more like 3-20m is used + cpu: 2000m + limits: + memory: "9000Mi" + cpu: 2000m + + - name: pull-kubernetes-node-e2e-crio-cgrpv1-dra + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + always_run: false + # Automatically testing with one container runtime in one 
configuration is sufficient to detect basic problems in kubelet early. + # CRI-O was picked because it was solid for testing so far. + # Periodic variant: ci-node-e2e-crio-cgrpv1-dra-features + run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) + optional: true + skip_report: false + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + preset-pull-kubernetes-e2e: "true" + preset-pull-kubernetes-e2e-gce: "true" + annotations: + testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-node-kubelet-crio-cgrpv1-dra + decorate: true + decoration_config: + timeout: 90m + path_alias: k8s.io/kubernetes + extra_refs: + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --env=KUBE_SSH_USER=core + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' + - --timeout=65m + - 
--node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml + env: + - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE + value: "1" + resources: + requests: + cpu: 4 + memory: 6Gi + limits: + cpu: 4 + memory: 6Gi + + - name: pull-kubernetes-node-e2e-crio-cgrpv2-dra + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + always_run: false + # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. + # CRI-O was picked because it was solid for testing so far. + # Periodic variant: ci-node-e2e-cgrpv2-crio-dra + # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) + optional: true + skip_report: false + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + preset-pull-kubernetes-e2e: "true" + preset-pull-kubernetes-e2e-gce: "true" + annotations: + testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-node-kubelet-crio-cgrpv2-dra + decorate: true + decoration_config: + timeout: 90m + path_alias: k8s.io/kubernetes + extra_refs: + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --env=KUBE_SSH_USER=core + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= 
--kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' + - --timeout=65m + - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml + env: + - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE + value: "1" + resources: + requests: + cpu: 4 + memory: 6Gi + limits: + cpu: 4 + memory: 6Gi + + - name: pull-kubernetes-node-e2e-containerd-1-7-dra + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + always_run: false + # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. + # CRI-O was picked because it was solid for testing so far. 
+ # Periodic variant: ci-node-e2e-containerd-1-7-dra + # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) + optional: true + skip_report: false + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + annotations: + testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-node-kubelet-containerd-dra + decorate: true + decoration_config: + timeout: 90m + path_alias: k8s.io/kubernetes + extra_refs: + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///run/containerd/containerd.sock --container-runtime-process-name=/usr/bin/containerd --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/containerd.service" --extra-log="{\"name\": \"containerd.log\", \"journalctl\": [\"-u\", \"containerd\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' + - --timeout=65m + - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/dra/image-config-containerd-1.7.yaml + resources: + requests: + cpu: 4 + memory: 6Gi + limits: + cpu: 4 + memory: 6Gi diff --git a/config/jobs/kubernetes/sig-node/sig-node-presubmit.yaml 
b/config/jobs/kubernetes/sig-node/sig-node-presubmit.yaml index c8cc801dec1d..608961381ab6 100644 --- a/config/jobs/kubernetes/sig-node/sig-node-presubmit.yaml +++ b/config/jobs/kubernetes/sig-node/sig-node-presubmit.yaml @@ -3085,121 +3085,6 @@ presubmits: cpu: 4 memory: 6Gi - # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) - # on a kind cluster with containerd updated to a version with CDI support. - - name: pull-kubernetes-kind-dra - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - annotations: - testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-kind-dra - decorate: true - path_alias: k8s.io/kubernetes - # Not relevant for most PRs. - always_run: false - # This covers most of the code related to dynamic resource allocation. - # Periodic variant: ci-kind-dra - run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go - optional: true - decoration_config: - timeout: 90m - labels: - preset-service-account: "true" - preset-dind-enabled: "true" - preset-kind-volume-mounts: "true" - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - args: - - /bin/sh - - -xc - - > - make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" && - curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind && - kind build node-image --image=dra/node:latest . 
&& - trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT && - kind create cluster --retain --config test/e2e/dra/kind.yaml --image dra/node:latest && - KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter='Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow' - - # docker-in-docker needs privileged mode - securityContext: - privileged: true - resources: - requests: - # these are both a bit below peak usage during build - # this is mostly for building kubernetes - memory: "9000Mi" - # during the tests more like 3-20m is used - cpu: 2000m - limits: - memory: "9000Mi" - cpu: 2000m - - # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (partly alpha, partly beta) - # on a kind cluster with containerd updated to a version with CDI support. - # - # Compared to pull-kubernetes-dra, this one enables all DRA-related features. - - name: pull-kubernetes-kind-dra-all - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - annotations: - testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-kind-dra-all - decorate: true - path_alias: k8s.io/kubernetes - # Not relevant for most PRs. - always_run: false - # This covers most of the code related to dynamic resource allocation. - # Periodic variant: ci-kind-dra-all - run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go - # The tests might still be flaky or this job might get triggered accidentally for - # an unrelated PR. 
- optional: true - decoration_config: - timeout: 90m - labels: - preset-service-account: "true" - preset-dind-enabled: "true" - preset-kind-volume-mounts: "true" - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - args: - - /bin/bash - - -xc - - | - set -ex - make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" - curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind - kind build node-image --image=dra/node:latest . - trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT - # Which DRA features exist depends on the PR that is being tested. - features=( $(grep '"DRA' pkg/features/kube_features.go | sed 's/.*"\(.*\)"/\1/') ) - echo "Enabling DRA feature(s): ${features[*]}." - # Those additional features are not in kind.yaml, but they can be added at the end. - kind create cluster --retain --config <(cat test/e2e/dra/kind.yaml; for feature in ${features}; do echo " ${feature}: true"; done) --image dra/node:latest - KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Alpha, Beta, DynamicResourceAllocation$(for feature in ${features}; do echo , ${feature}; done)} && !Flaky && !Slow" - - # docker-in-docker needs privileged mode - securityContext: - privileged: true - resources: - requests: - # these are both a bit below peak usage during build - # this is mostly for building kubernetes - memory: "9000Mi" - # during the tests more like 3-20m is used - cpu: 2000m - limits: - memory: "9000Mi" - cpu: 2000m - - name: pull-kubernetes-e2e-gce-kubelet-credential-provider cluster: k8s-infra-prow-build always_run: false @@ -4169,274 +4054,6 @@ presubmits: cpu: 4 memory: 6Gi - - name: pull-kubernetes-node-e2e-crio-cgrpv1-dra - cluster: 
k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - always_run: false - # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. - # CRI-O was picked because it was solid for testing so far. - # Periodic variant: ci-node-e2e-crio-cgrpv1-dra-features - run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - preset-pull-kubernetes-e2e: "true" - preset-pull-kubernetes-e2e-gce: "true" - annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-node-kubelet-crio-cgrpv1-dra - decorate: true - decoration_config: - timeout: 90m - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --env=KUBE_SSH_USER=core - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h 
--label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml - env: - - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - resources: - requests: - cpu: 4 - memory: 6Gi - limits: - cpu: 4 - memory: 6Gi - - - name: pull-kubernetes-node-e2e-crio-cgrpv1-dra-kubetest2 # experimental alternative to pull-kubernetes-node-e2e-crio-cgrpv1-dra - cluster: k8s-infra-prow-build - # explicitly needs /test pull-kubernetes-node-e2e-crio-cgrpv1-dra-kubetest2 to run - always_run: false - # Don't run automatically while experimental! - # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - skip_branches: - - release-\d+\.\d+ # per-release image - decorate: true - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra - decoration_config: - timeout: 90m - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - preset-pull-kubernetes-e2e: "true" - preset-pull-kubernetes-e2e-gce: "true" - annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-presubmits - testgrid-tab-name: pr-node-kubelet-crio-cgrpv1-dra-kubetest2 - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - args: - - kubetest2 - - noop - - --test=node - - -- - - --repo-root=. 
- - --gcp-zone=us-west1-b - - --parallelism=1 - - '--test-args=--ginkgo.timeout=1h --ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf DynamicResourceAllocation && !Flaky && !Slow" --feature-gates="DynamicResourceAllocation=true" --service-feature-gates="DynamicResourceAllocation=true" --runtime-config=api/alpha=true,api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' - - --image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml - resources: - limits: - cpu: 4 - memory: 6Gi - requests: - cpu: 4 - memory: 6Gi - env: - - name: KUBE_SSH_USER - value: core - - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - - - name: pull-kubernetes-node-e2e-crio-cgrpv2-dra - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - always_run: false - # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. - # CRI-O was picked because it was solid for testing so far. 
- # Periodic variant: ci-node-e2e-cgrpv2-crio-dra - # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - preset-pull-kubernetes-e2e: "true" - preset-pull-kubernetes-e2e-gce: "true" - annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-node-kubelet-crio-cgrpv2-dra - decorate: true - decoration_config: - timeout: 90m - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --env=KUBE_SSH_USER=core - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml - env: - - name: 
IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - resources: - requests: - cpu: 4 - memory: 6Gi - limits: - cpu: 4 - memory: 6Gi - - - name: pull-kubernetes-node-e2e-crio-cgrpv2-dra-kubetest2 # experimental alternative to pull-kubernetes-node-e2e-crio-cgrpv2-dra - cluster: k8s-infra-prow-build - # explicitly needs /test pull-kubernetes-node-e2e-crio-cgrpv2-dra-kubetest2 to run - always_run: false - # Don't run automatically while experimental! - # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - skip_branches: - - release-\d+\.\d+ # per-release image - decorate: true - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra - decoration_config: - timeout: 90m - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - preset-pull-kubernetes-e2e: "true" - preset-pull-kubernetes-e2e-gce: "true" - annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-presubmits - testgrid-tab-name: pr-node-kubelet-crio-cgrpv2-dra-kubetest2 - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - args: - - kubetest2 - - noop - - --test=node - - -- - - --repo-root=. 
- - --gcp-zone=us-west1-b - - --parallelism=1 - - '--test-args=--ginkgo.timeout=1h --ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf DynamicResourceAllocation && !Flaky && !Slow" --feature-gates="DynamicResourceAllocation=true" --service-feature-gates="DynamicResourceAllocation=true" --runtime-config=api/alpha=true,api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' - - --image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml - resources: - limits: - cpu: 4 - memory: 6Gi - requests: - cpu: 4 - memory: 6Gi - env: - - name: KUBE_SSH_USER - value: core - - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - - name: pull-kubernetes-node-e2e-containerd-1-7-dra - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - always_run: false - # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. - # CRI-O was picked because it was solid for testing so far. 
- # Periodic variant: ci-node-e2e-containerd-1-7-dra - # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - annotations: - testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-node-kubelet-containerd-dra - decorate: true - decoration_config: - timeout: 90m - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///run/containerd/containerd.sock --container-runtime-process-name=/usr/bin/containerd --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/containerd.service" --extra-log="{\"name\": \"containerd.log\", \"journalctl\": [\"-u\", \"containerd\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/dra/image-config-containerd-1.7.yaml - resources: - requests: - cpu: 4 - memory: 6Gi - limits: - cpu: 4 - memory: 6Gi - - name: pull-kubernetes-node-e2e-resource-health-status cluster: k8s-infra-prow-build 
skip_branches: From 449711eec22b37fc88bd89dd57cbfc9dcf17ca74 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 18 Dec 2024 09:03:23 +0100 Subject: [PATCH 2/7] dra: add canary jobs Breaking the jobs which are in use while making changes is annoying, but hard to avoid because testing these jobs locally is difficult. The https://docs.prow.k8s.io/docs/build-test-update/#how-to-test-a-prowjob method doesn't work (or at least not easily) because of nested containers (kind inside kind, for E2E) and the need for a special test environment (E2E node). --- .../dynamic-resource-allocation-canary.yaml | 558 ++++++++++++++++++ .../sig-node/dynamic-resource-allocation.yaml | 12 + 2 files changed, 570 insertions(+) create mode 100644 config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml diff --git a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml new file mode 100644 index 000000000000..a9f62b8bc245 --- /dev/null +++ b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml @@ -0,0 +1,558 @@ +# This file contains canary periodic and presubmit jobs which run tests covering +# Dynamic Resource Allocation (DRA). +# +# The intent is to make all changes to DRA jobs first for these canary jobs, +# then copy the changes into dynamic-resource-allocation.yaml. Unless some +# experimental changes are being tested, a diff between the two files should +# be limited to: +# - this comment +# - job names +# - interval for the periodic jobs +# +# This command can be used to check this: +# diff dynamic-resource-allocation.yaml <(sed -e 's/-canary//' dynamic-resource-allocation-canary.yaml) + +periodics: + + # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) + # on a kind cluster with containerd updated to a version with CDI support. 
+ - name: ci-kind-dra-canary + cluster: eks-prow-build-cluster + interval: 1000000h # Run only once on creation and when manually triggered. + annotations: + testgrid-dashboards: sig-node-dynamic-resource-allocation + testgrid-tab-name: ci-kind-dra-canary + description: Runs E2E tests for Dynamic Resource Allocation beta features against a Kubernetes master cluster created with sigs.k8s.io/kind + testgrid-alert-email: patrick.ohly@intel.com + fork-per-release: "true" + decorate: true + decoration_config: + timeout: 3h + labels: + preset-service-account: "true" + preset-dind-enabled: "true" + preset-kind-volume-mounts: "true" + extra_refs: + - org: kubernetes + repo: kubernetes + base_ref: master + path_alias: k8s.io/kubernetes + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + args: + - /bin/sh + - -xc + - > + make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" && + curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind && + kind build node-image --image=dra/node:latest . && + trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT && + kind create cluster --retain --config test/e2e/dra/kind.yaml --image dra/node:latest && + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=2h30m hack/ginkgo-e2e.sh -ginkgo.label-filter='Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky' + + # docker-in-docker needs privileged mode + securityContext: + privileged: true + resources: + limits: + cpu: 2 + memory: 9Gi + requests: + cpu: 2 + memory: 9Gi + + # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently alpha, soon beta) + # on a kind cluster with containerd updated to a version with CDI support. 
+ # + # Compared to ci-kind-dra-canary, this one enables all DRA-related features. + - name: ci-kind-dra-all-canary + cluster: eks-prow-build-cluster + interval: 1000000h # Run only once on creation and when manually triggered. + annotations: + testgrid-dashboards: sig-node-dynamic-resource-allocation + testgrid-tab-name: ci-kind-dra-all-canary + description: Runs E2E tests for Dynamic Resource Allocation alpha and beta features against a Kubernetes master cluster created with sigs.k8s.io/kind + testgrid-alert-email: patrick.ohly@intel.com + fork-per-release: "true" + decorate: true + decoration_config: + timeout: 3h + labels: + preset-service-account: "true" + preset-dind-enabled: "true" + preset-kind-volume-mounts: "true" + extra_refs: + - org: kubernetes + repo: kubernetes + base_ref: master + path_alias: k8s.io/kubernetes + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + args: + - /bin/bash + - -xc + - | + set -ex + make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" + curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind + kind build node-image --image=dra/node:latest . + trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT + # Which DRA features exist can change over time. + features=( $(grep '"DRA' pkg/features/kube_features.go | sed 's/.*"\(.*\)"/\1/') ) + echo "Enabling DRA feature(s): ${features[*]}." + # Those additional features are not in kind.yaml, but they can be added at the end. 
+ kind create cluster --retain --config <(cat test/e2e/dra/kind.yaml; for feature in ${features}; do echo " ${feature}: true"; done) --image dra/node:latest + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Alpha, Beta, DynamicResourceAllocation$(for feature in ${features}; do echo , ${feature}; done)} && !Flaky && !Slow" + + # docker-in-docker needs privileged mode + securityContext: + privileged: true + resources: + limits: + cpu: 2 + memory: 9Gi + requests: + cpu: 2 + memory: 9Gi + + # This job runs e2e_node.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) + - name: ci-node-e2e-cgrpv1-crio-dra-canary + cluster: k8s-infra-prow-build + interval: 1000000h # Run only once on creation and when manually triggered. + annotations: + testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation + testgrid-tab-name: ci-node-e2e-cgrpv1-crio-dra-canary + description: Runs E2E node tests for Dynamic Resource Allocation beta features with CRI-O using cgroup v1 + testgrid-alert-email: eduard.bartosh@intel.com,patrick.ohly@intel.com + fork-per-release: "true" + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + decorate: true + decoration_config: + timeout: 90m + extra_refs: + - org: kubernetes + repo: kubernetes + base_ref: master + path_alias: k8s.io/kubernetes + workdir: true + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --env=KUBE_SSH_USER=core + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true 
--service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky"' + - --timeout=65m + - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml + env: + - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE + value: "1" + - name: GOPATH + value: /go + resources: + limits: + cpu: 2 + memory: 9Gi + requests: + cpu: 2 + memory: 9Gi + + # This job runs e2e_node.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) + - name: ci-node-e2e-cgrpv2-crio-dra-canary + cluster: k8s-infra-prow-build + interval: 1000000h # Run only once on creation and when manually triggered. 
+ annotations: + testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation + description: Runs E2E node tests for Dynamic Resource Allocation beta features with CRI-O using cgroup v2 + testgrid-tab-name: ci-node-e2e-cgrpv2-crio-dra-canary + testgrid-alert-email: eduard.bartosh@intel.com,patrick.ohly@intel.com + fork-per-release: "true" + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + decorate: true + decoration_config: + timeout: 90m + extra_refs: + - org: kubernetes + repo: kubernetes + base_ref: master + path_alias: k8s.io/kubernetes + workdir: true + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --env=KUBE_SSH_USER=core + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky"' + - --timeout=65m + - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml + env: + - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE + value: "1" + - name: GOPATH + value: /go + resources: + limits: + cpu: 2 + memory: 9Gi + 
requests: + cpu: 2 + memory: 9Gi + + # This job runs the same tests as ci-node-e2e-crio-dra-canary with Containerd 1.7 runtime + - name: ci-node-e2e-containerd-1-7-dra-canary + cluster: k8s-infra-prow-build + interval: 1000000h # Run only once on creation and when manually triggered. + annotations: + testgrid-dashboards: sig-node-dynamic-resource-allocation + testgrid-tab-name: ci-node-e2e-containerd-1-7-dra-canary + description: Runs E2E node tests for Dynamic Resource Allocation beta features with containerd + testgrid-alert-email: eduard.bartosh@intel.com,patrick.ohly@intel.com + fork-per-release: "true" + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + decorate: true + decoration_config: + timeout: 90m + extra_refs: + - org: kubernetes + repo: kubernetes + base_ref: master + path_alias: k8s.io/kubernetes + workdir: true + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --container-runtime-process-name=/usr/local/bin/containerd --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/containerd.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"containerd.log\", \"journalctl\": [\"-u\", \"containerd\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky"' + - --timeout=65m + - 
--node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/dra/image-config-containerd-1.7.yaml + resources: + limits: + cpu: 2 + memory: 9Gi + requests: + cpu: 2 + memory: 9Gi + +presubmits: + kubernetes/kubernetes: + + # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) + # on a kind cluster with containerd updated to a version with CDI support. + - name: pull-kubernetes-kind-dra-canary + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + annotations: + testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-kind-dra-canary + decorate: true + path_alias: k8s.io/kubernetes + # Not relevant for most PRs. + always_run: false + # This covers most of the code related to dynamic resource allocation. + # Periodic variant: ci-kind-dra-canary + run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go + optional: true + decoration_config: + timeout: 90m + labels: + preset-service-account: "true" + preset-dind-enabled: "true" + preset-kind-volume-mounts: "true" + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + args: + - /bin/sh + - -xc + - > + make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" && + curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind && + kind build node-image --image=dra/node:latest . 
&& + trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT && + kind create cluster --retain --config test/e2e/dra/kind.yaml --image dra/node:latest && + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter='Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow' + + # docker-in-docker needs privileged mode + securityContext: + privileged: true + resources: + requests: + # these are both a bit below peak usage during build + # this is mostly for building kubernetes + memory: "9000Mi" + # during the tests more like 3-20m is used + cpu: 2000m + limits: + memory: "9000Mi" + cpu: 2000m + + # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (partly alpha, partly beta) + # on a kind cluster with containerd updated to a version with CDI support. + # + # Compared to pull-kubernetes-dra-canary, this one enables all DRA-related features. + - name: pull-kubernetes-kind-dra-canary-all + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + annotations: + testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-kind-dra-canary-all + decorate: true + path_alias: k8s.io/kubernetes + # Not relevant for most PRs. + always_run: false + # This covers most of the code related to dynamic resource allocation. + # Periodic variant: ci-kind-dra-canary-all + run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go + # The tests might still be flaky or this job might get triggered accidentally for + # an unrelated PR. 
+ optional: true + decoration_config: + timeout: 90m + labels: + preset-service-account: "true" + preset-dind-enabled: "true" + preset-kind-volume-mounts: "true" + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + args: + - /bin/bash + - -xc + - | + set -ex + make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" + curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind + kind build node-image --image=dra/node:latest . + trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT + # Which DRA features exist depends on the PR that is being tested. + features=( $(grep '"DRA' pkg/features/kube_features.go | sed 's/.*"\(.*\)"/\1/') ) + echo "Enabling DRA feature(s): ${features[*]}." + # Those additional features are not in kind.yaml, but they can be added at the end. + kind create cluster --retain --config <(cat test/e2e/dra/kind.yaml; for feature in ${features}; do echo " ${feature}: true"; done) --image dra/node:latest + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Alpha, Beta, DynamicResourceAllocation$(for feature in ${features}; do echo , ${feature}; done)} && !Flaky && !Slow" + + # docker-in-docker needs privileged mode + securityContext: + privileged: true + resources: + requests: + # these are both a bit below peak usage during build + # this is mostly for building kubernetes + memory: "9000Mi" + # during the tests more like 3-20m is used + cpu: 2000m + limits: + memory: "9000Mi" + cpu: 2000m + + - name: pull-kubernetes-node-e2e-crio-cgrpv1-dra-canary + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + always_run: false + # Automatically testing with one container runtime in 
one configuration is sufficient to detect basic problems in kubelet early. + # CRI-O was picked because it was solid for testing so far. + # Periodic variant: ci-node-e2e-crio-cgrpv1-dra-features-canary + run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) + optional: true + skip_report: false + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + preset-pull-kubernetes-e2e: "true" + preset-pull-kubernetes-e2e-gce: "true" + annotations: + testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-node-kubelet-crio-cgrpv1-dra-canary + decorate: true + decoration_config: + timeout: 90m + path_alias: k8s.io/kubernetes + extra_refs: + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --env=KUBE_SSH_USER=core + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' + - --timeout=65m + 
- --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml + env: + - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE + value: "1" + resources: + requests: + cpu: 4 + memory: 6Gi + limits: + cpu: 4 + memory: 6Gi + + - name: pull-kubernetes-node-e2e-crio-cgrpv2-dra-canary + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + always_run: false + # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. + # CRI-O was picked because it was solid for testing so far. + # Periodic variant: ci-node-e2e-cgrpv2-crio-dra-canary + # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) + optional: true + skip_report: false + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + preset-pull-kubernetes-e2e: "true" + preset-pull-kubernetes-e2e-gce: "true" + annotations: + testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-node-kubelet-crio-cgrpv2-dra-canary + decorate: true + decoration_config: + timeout: 90m + path_alias: k8s.io/kubernetes + extra_refs: + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --env=KUBE_SSH_USER=core + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio 
--container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' + - --timeout=65m + - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml + env: + - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE + value: "1" + resources: + requests: + cpu: 4 + memory: 6Gi + limits: + cpu: 4 + memory: 6Gi + + - name: pull-kubernetes-node-e2e-containerd-1-7-dra-canary + cluster: k8s-infra-prow-build + skip_branches: + - release-\d+\.\d+ # per-release image + always_run: false + # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. + # CRI-O was picked because it was solid for testing so far. 
+ # Periodic variant: ci-node-e2e-containerd-1-7-dra-canary + # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) + optional: true + skip_report: false + labels: + preset-service-account: "true" + preset-k8s-ssh: "true" + annotations: + testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation + testgrid-tab-name: pr-node-kubelet-containerd-dra-canary + decorate: true + decoration_config: + timeout: 90m + path_alias: k8s.io/kubernetes + extra_refs: + - org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + spec: + containers: + - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + command: + - runner.sh + - /workspace/scenarios/kubernetes_e2e.py + args: + - --deployment=node + - --gcp-zone=us-west1-b + - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///run/containerd/containerd.sock --container-runtime-process-name=/usr/bin/containerd --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/containerd.service" --extra-log="{\"name\": \"containerd.log\", \"journalctl\": [\"-u\", \"containerd\"]}"' + - --node-tests=true + - --provider=gce + - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' + - --timeout=65m + - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/dra/image-config-containerd-1.7.yaml + resources: + requests: + cpu: 4 + memory: 6Gi + limits: + cpu: 4 + memory: 6Gi diff --git a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml 
b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml index 18ad04ec8acf..5870ebf9a029 100644 --- a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml +++ b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml @@ -1,5 +1,17 @@ # This file contains periodic and presubmit jobs which run tests covering # Dynamic Resource Allocation (DRA). +# +# The intent is to make all changes to DRA jobs first for the canary +# jobs defined in dynamic-resource-allocation-canary.yaml, +# then copy the changes into dynamic-resource-allocation.yaml. Unless some +# experimental changes are being tested, a diff between the two files should +# be limited to: +# - this comment +# - job names +# - interval for the periodic jobs +# +# This command can be used to check this: +# diff dynamic-resource-allocation.yaml <(sed -e 's/-canary//' dynamic-resource-allocation-canary.yaml) periodics: From b4f79a9286e86e6c74b01cc0041d5b22424bd8a4 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 18 Dec 2024 09:10:12 +0100 Subject: [PATCH 3/7] dra canary: fix comment The wrong CI job got referenced. --- .../kubernetes/sig-node/dynamic-resource-allocation-canary.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml index a9f62b8bc245..46e377ebf7e3 100644 --- a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml +++ b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml @@ -405,7 +405,7 @@ presubmits: always_run: false # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. # CRI-O was picked because it was solid for testing so far. 
- # Periodic variant: ci-node-e2e-crio-cgrpv1-dra-features-canary + # Periodic variant: ci-node-e2e-crio-cgrpv1-dra-canary run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) optional: true skip_report: false From 056f34d0fffe6816d4f4ef7acd2e14811ff5c67a Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 18 Dec 2024 10:06:10 +0100 Subject: [PATCH 4/7] dra: add shell script for applying canary changes The benefit of trying out changes in canary jobs is diminished if the actual change then still needs to be done manually. There has been at least one case elsewhere where the canary job changes were okay, but then copying them into the real jobs was bungled such that they broke. To avoid this, the shell script automates copying of changes. To use it, run dra-sync.sh on a new, clean branch and submit the generated commit in a PR. --- .../kubernetes/sig-node/.dra-sync-settings | 2 + config/jobs/kubernetes/sig-node/dra-sync.sh | 62 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 config/jobs/kubernetes/sig-node/.dra-sync-settings create mode 100755 config/jobs/kubernetes/sig-node/dra-sync.sh diff --git a/config/jobs/kubernetes/sig-node/.dra-sync-settings b/config/jobs/kubernetes/sig-node/.dra-sync-settings new file mode 100644 index 000000000000..b1bd8cdc0b97 --- /dev/null +++ b/config/jobs/kubernetes/sig-node/.dra-sync-settings @@ -0,0 +1,2 @@ +# Last commit which was synced into dynamic-resource-allocation.yaml. +last_sync=449711eec22b37fc88bd89dd57cbfc9dcf17ca74 diff --git a/config/jobs/kubernetes/sig-node/dra-sync.sh b/config/jobs/kubernetes/sig-node/dra-sync.sh new file mode 100755 index 000000000000..b2e9c7e5f46b --- /dev/null +++ b/config/jobs/kubernetes/sig-node/dra-sync.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +# Copyright 2024 The Kubernetes Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Running this script will automatically take changes made to +# dynamic-resource-allocation-canary.yaml since the last sync +# (tracked in .dra-sync-settings) and create a commit which +# applies those changes to dynamic-resource-allocation.yaml. + +set -o errexit +set -o nounset +set -o pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd -P)" +cd "${REPO_ROOT}" + +# get "last_sync" +source "config/jobs/kubernetes/sig-node/.dra-sync-settings" + +if [ -n "$(git diff --cached 2>&1)" ]; then + echo >&2 "ERROR: The git staging area must be clean." + exit 1 +fi + +new_sync=$(git rev-parse HEAD) + +diff=$(git diff ${last_sync}..${new_sync} config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml | sed -e 's/-canary//g') + +if [ -z "${diff}" ]; then + echo "No changes since last sync, nothing to do." + exit 0 +fi + +# Generate a "git format-patch" alike patch and apply it. 
+git am < +Date: $(date --rfc-email) +Subject: [PATCH 1/1] dra: apply changes from canary jobs + +--- +${diff} +$(diff -u config/jobs/kubernetes/sig-node/.dra-sync-settings <(sed -e "s/last_sync=.*/last_sync=${new_sync}/" config/jobs/kubernetes/sig-node/.dra-sync-settings) | sed -e 's;^--- .*;--- a/config/jobs/kubernetes/sig-node/.dra-sync-settings;' -e 's;+++ .*;+++ b/config/jobs/kubernetes/sig-node/.dra-sync-settings;') +EOF + +git log -p -n1 + +cat < Date: Wed, 18 Dec 2024 10:09:29 +0100 Subject: [PATCH 5/7] dra: apply changes from canary jobs --- config/jobs/kubernetes/sig-node/.dra-sync-settings | 2 +- .../jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/jobs/kubernetes/sig-node/.dra-sync-settings b/config/jobs/kubernetes/sig-node/.dra-sync-settings index b1bd8cdc0b97..efbb28833e99 100644 --- a/config/jobs/kubernetes/sig-node/.dra-sync-settings +++ b/config/jobs/kubernetes/sig-node/.dra-sync-settings @@ -1,2 +1,2 @@ # Last commit which was synced into dynamic-resource-allocation.yaml. -last_sync=449711eec22b37fc88bd89dd57cbfc9dcf17ca74 +last_sync=056f34d0fffe6816d4f4ef7acd2e14811ff5c67a diff --git a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml index 5870ebf9a029..8c12210d22cb 100644 --- a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml +++ b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation.yaml @@ -406,7 +406,7 @@ presubmits: always_run: false # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. # CRI-O was picked because it was solid for testing so far. 
- # Periodic variant: ci-node-e2e-crio-cgrpv1-dra-features + # Periodic variant: ci-node-e2e-crio-cgrpv1-dra run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) optional: true skip_report: false From 71e5959467e6de8f2c438814116b2cdc7ccc7835 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 18 Dec 2024 14:17:23 +0100 Subject: [PATCH 6/7] node: add YAML test It's convenient to run `go test .` while editing the YAML files to see if there are any parse errors. Furthermore, `go test -v` prints the files in a normalized form. This can be used for before/after comparisons when making larger changes. --- .../jobs/kubernetes/sig-node/sig_node_test.go | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 config/jobs/kubernetes/sig-node/sig_node_test.go diff --git a/config/jobs/kubernetes/sig-node/sig_node_test.go b/config/jobs/kubernetes/sig-node/sig_node_test.go new file mode 100644 index 000000000000..a0259e188fc6 --- /dev/null +++ b/config/jobs/kubernetes/sig-node/sig_node_test.go @@ -0,0 +1,74 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package policy + +// This file validates Kubernetes's jobs configs against policies. 
+ +import ( + "encoding/json" + "io/fs" + "os" + "path/filepath" + "strings" + "testing" + + "sigs.k8s.io/prow/pkg/config" + "sigs.k8s.io/yaml" +) + +func TestYaml(t *testing.T) { + if err := filepath.WalkDir(".", func(path string, d fs.DirEntry, err error) error { + if err != nil || !strings.HasSuffix(path, ".yaml") { + return err // nil for skipped non-YAML entries, non-nil when WalkDir could not read the entry + } + t.Run(path, func(t *testing.T) { + var content struct { + Presets []config.Preset `json:"presets,omitempty"` + Templates any `json:"templates,omitempty"` + Periodics []config.Periodic `json:"periodics,omitempty"` + PreSubmits map[string][]config.Presubmit `json:"presubmits,omitempty"` + } + + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("error reading file: %v", err) + } + // Because of https://github.com/kubernetes-sigs/yaml/issues/46, + // strict parsing cannot be used as it would complain about + // repeated keys in maps, which is intentional and (in YAML) + // valid when using aliases. + if err := yaml.Unmarshal(data, &content, func(d *json.Decoder) *json.Decoder { + d.DisallowUnknownFields() + return d + }); err != nil { + t.Fatalf("error unmarshaling data: %v", err) + } + + // The Templates are just helpers, what matters are the jobs. + content.Templates = nil + + data, err = yaml.Marshal(&content) + if err != nil { + t.Fatalf("error re-marshaling content: %v", err) + } + t.Logf("\n%s", string(data)) + }) + return nil + }); err != nil { + t.Errorf("Error looking for YAML files: %v", err) + } +} From 3942639dbf072b7c272b21039dae59dc9bf10988 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 18 Dec 2024 15:35:41 +0100 Subject: [PATCH 7/7] dra: use YAML anchors and aliases to avoid duplication These different jobs were all similar, with small variations.
Keeping them in sync was an on-going challenge that we lost: - at some point, features were enabled differently in E2E node presubmits and periodics (didn't make a difference, but they weren't the same anymore) - resource settings for containerd vs CRI-O were different and it is unclear whether that was intentional (no comment about it) - clusters were different Defining common elements via YAML anchors once and reusing them via YAML aliases avoids this. `go test -v -run=TestYaml/dynamic-resource-allocation-canary.yaml .` can be used to see the full job definitions. --- .../dynamic-resource-allocation-canary.yaml | 706 ++++++------------ 1 file changed, 234 insertions(+), 472 deletions(-) diff --git a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml index 46e377ebf7e3..b3fb10e9b0fe 100644 --- a/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml +++ b/config/jobs/kubernetes/sig-node/dynamic-resource-allocation-canary.yaml @@ -12,103 +12,95 @@ # This command can be used to check this: # diff dynamic-resource-allocation.yaml <(sed -e 's/-canary//' dynamic-resource-allocation-canary.yaml) -periodics: - # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) - # on a kind cluster with containerd updated to a version with CDI support. - - name: ci-kind-dra-canary +# `templates` has no special meaning. It just holds YAML anchors (= re-usable content) +# that get referenced below via YAML aliases: `<<: *job` includes the content +# and then allows adding or overwriting fields. This is done at the root. If a field +# contains lists or objects, that content gets replaced instead of merged. +# +# Lists cannot be extended the same way (https://github.com/yaml/yaml/issues/35).
+# +# If unsure what the expanded jobs look like or to test parsing, run `go test -v .` +templates: + - &job cluster: eks-prow-build-cluster - interval: 1000000h # Run only once on creation and when manually triggered. - annotations: + annotations: &annotations testgrid-dashboards: sig-node-dynamic-resource-allocation - testgrid-tab-name: ci-kind-dra-canary - description: Runs E2E tests for Dynamic Resource Allocation beta features against a Kubernetes master cluster created with sigs.k8s.io/kind - testgrid-alert-email: patrick.ohly@intel.com - fork-per-release: "true" + # Alerting is enabled also for PRs. If someone repeatedly tests a broken + # PR where the change introduced by the PR breaks tests, an alert will + # eventually be triggered. This is a good thing because then we can + # show the PR author how to do local testing... + # + # Disabled for canary jobs, enabled for real jobs. + testgrid-alert-email: # patrick.ohly@intel.com,eduard.bartosh@intel.com # #wg-device-management on Slack + fork-per-release: "false" # Only for canary jobs, must be true for real jobs. decorate: true - decoration_config: - timeout: 3h - labels: - preset-service-account: "true" - preset-dind-enabled: "true" - preset-kind-volume-mounts: "true" - extra_refs: - - org: kubernetes - repo: kubernetes - base_ref: master - path_alias: k8s.io/kubernetes - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - args: - - /bin/sh - - -xc - - > - make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" && - curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind && - kind build node-image --image=dra/node:latest . 
&& - trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT && - kind create cluster --retain --config test/e2e/dra/kind.yaml --image dra/node:latest && - KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=2h30m hack/ginkgo-e2e.sh -ginkgo.label-filter='Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky' - # docker-in-docker needs privileged mode - securityContext: - privileged: true - resources: - limits: - cpu: 2 - memory: 9Gi - requests: - cpu: 2 - memory: 9Gi + - &kubernetes-master + org: kubernetes + repo: kubernetes + base_ref: master + path_alias: k8s.io/kubernetes - # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (currently alpha, soon beta) - # on a kind cluster with containerd updated to a version with CDI support. - # - # Compared to ci-kind-dra-canary, this one enables all DRA-related features. - - name: ci-kind-dra-all-canary - cluster: eks-prow-build-cluster + - &test-infra-master + org: kubernetes + repo: test-infra + base_ref: master + path_alias: k8s.io/test-infra + + - &periodic-job interval: 1000000h # Run only once on creation and when manually triggered. - annotations: - testgrid-dashboards: sig-node-dynamic-resource-allocation - testgrid-tab-name: ci-kind-dra-all-canary - description: Runs E2E tests for Dynamic Resource Allocation alpha and beta features against a Kubernetes master cluster created with sigs.k8s.io/kind - testgrid-alert-email: patrick.ohly@intel.com - fork-per-release: "true" - decorate: true + extra_refs: + - *kubernetes-master + + - &periodic-node-job + <<: *periodic-job + extra_refs: + - *kubernetes-master + - *test-infra-master # For test-infra/jobs/e2e_node files. 
+ + - &presubmit-job + skip_branches: + - release-\d+\.\d+ # per-release image + always_run: false + optional: true + skip_report: false + # run_if_changed is set judiciously on some jobs. + + - &e2e-kind-job + <<: *job decoration_config: timeout: 3h labels: preset-service-account: "true" preset-dind-enabled: "true" preset-kind-volume-mounts: "true" - extra_refs: - - org: kubernetes - repo: kubernetes - base_ref: master - path_alias: k8s.io/kubernetes + decoration_config: + timeout: 3h spec: containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + - &e2e-kind-container + image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master command: - runner.sh - args: - - /bin/bash - - -xc + - /bin/bash # the script below uses bash-only arrays and <(...) process substitution + - -xce - | - set -ex make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind kind build node-image --image=dra/node:latest . trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT - # Which DRA features exist can change over time. - features=( $(grep '"DRA' pkg/features/kube_features.go | sed 's/.*"\(.*\)"/\1/') ) - echo "Enabling DRA feature(s): ${features[*]}." - # Those additional features are not in kind.yaml, but they can be added at the end. - kind create cluster --retain --config <(cat test/e2e/dra/kind.yaml; for feature in ${features}; do echo " ${feature}: true"; done) --image dra/node:latest - KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Alpha, Beta, DynamicResourceAllocation$(for feature in ${features}; do echo , ${feature}; done)} && !Flaky && !Slow" + if ${with_all_features:-false}; then + # Which DRA features exist can change over time.
+ features=( $(grep '"DRA' pkg/features/kube_features.go | sed 's/.*"\(.*\)"/\1/') ) + echo "Enabling DRA feature(s): ${features[*]}." + # Those additional features are not in kind.yaml, but they can be added at the end. Iterate over the whole array: a plain ${features} is only its first element. + kind create cluster --retain --config <(cat test/e2e/dra/kind.yaml; for feature in "${features[@]}"; do echo " ${feature}: true"; done) --image dra/node:latest + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Alpha, Beta, DynamicResourceAllocation$(for feature in "${features[@]}"; do echo , ${feature}; done)} && !Flaky && !Slow" + else + kind create cluster --retain --config test/e2e/dra/kind.yaml --image dra/node:latest + KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=2h30m hack/ginkgo-e2e.sh -ginkgo.label-filter='Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky' + fi # docker-in-docker needs privileged mode securityContext: @@ -121,157 +113,61 @@ periodics: cpu: 2 memory: 9Gi - # This job runs e2e_node.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) - - name: ci-node-e2e-cgrpv1-crio-dra-canary - cluster: k8s-infra-prow-build - interval: 1000000h # Run only once on creation and when manually triggered.
- annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation - testgrid-tab-name: ci-node-e2e-cgrpv1-crio-dra-canary - description: Runs E2E node tests for Dynamic Resource Allocation beta features with CRI-O using cgroup v1 - testgrid-alert-email: eduard.bartosh@intel.com,patrick.ohly@intel.com - fork-per-release: "true" + - &e2e-node-job + <<: *job labels: preset-service-account: "true" preset-k8s-ssh: "true" - decorate: true decoration_config: timeout: 90m - extra_refs: - - org: kubernetes - repo: kubernetes - base_ref: master - path_alias: k8s.io/kubernetes - workdir: true - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra spec: containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master + - &e2e-node-container + image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --env=KUBE_SSH_USER=core - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml - env: - - name: 
IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - - name: GOPATH - value: /go - resources: - limits: - cpu: 2 - memory: 9Gi - requests: - cpu: 2 - memory: 9Gi + - /bin/bash + - -ce + - | + export IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE=1 + export GOPATH=/go - # This job runs e2e_node.test with a focus on tests for the Dynamic Resource Allocation feature (currently beta) - - name: ci-node-e2e-cgrpv2-crio-dra-canary - cluster: k8s-infra-prow-build - interval: 1000000h # Run only once on creation and when manually triggered. - annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation - description: Runs E2E node tests for Dynamic Resource Allocation beta features with CRI-O using cgroup v2 - testgrid-tab-name: ci-node-e2e-cgrpv2-crio-dra-canary - testgrid-alert-email: eduard.bartosh@intel.com,patrick.ohly@intel.com - fork-per-release: "true" - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - decorate: true - decoration_config: - timeout: 90m - extra_refs: - - org: kubernetes - repo: kubernetes - base_ref: master - path_alias: k8s.io/kubernetes - workdir: true - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --env=KUBE_SSH_USER=core - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", 
\"journalctl\": [\"-u\", \"crio\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml - env: - - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - - name: GOPATH - value: /go - resources: - limits: - cpu: 2 - memory: 9Gi - requests: - cpu: 2 - memory: 9Gi + # Compose the command depending on the `runtime` variable set for the job. + cmd=( + runner.sh + /workspace/scenarios/kubernetes_e2e.py + --deployment=node + --env=KUBE_SSH_USER=core + --gcp-zone=us-west1-b + --node-tests=true + --provider=gce + '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky"' + --timeout=65m + ) + case ${runtime:-containerd} in + containerd) + cmd+=( + '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///run/containerd/containerd.sock --container-runtime-process-name=/usr/bin/containerd --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/containerd.service" --extra-log="{\"name\": \"containerd.log\", \"journalctl\": [\"-u\", \"containerd\"]}"' + --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/dra/image-config-containerd-1.7.yaml + ) + ;; + crio-cgroupv1) + cmd+=( + '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock 
--container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml + ) + ;; + crio-cgroupv2) + cmd+=( + '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' + --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml + ) + ;; + esac + + # Run it. The expansion is quoted so that each array element stays a single argument. + set -x + "${cmd[@]}" - # This job runs the same tests as ci-node-e2e-crio-dra-canary with Containerd 1.7 runtime - - name: ci-node-e2e-containerd-1-7-dra-canary - cluster: k8s-infra-prow-build - interval: 1000000h # Run only once on creation and when manually triggered.
- annotations: - testgrid-dashboards: sig-node-dynamic-resource-allocation - testgrid-tab-name: ci-node-e2e-containerd-1-7-dra-canary - description: Runs E2E node tests for Dynamic Resource Allocation beta features with containerd - testgrid-alert-email: eduard.bartosh@intel.com,patrick.ohly@intel.com - fork-per-release: "true" - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - decorate: true - decoration_config: - timeout: 90m - extra_refs: - - org: kubernetes - repo: kubernetes - base_ref: master - path_alias: k8s.io/kubernetes - workdir: true - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --container-runtime-process-name=/usr/local/bin/containerd --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/containerd.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"containerd.log\", \"journalctl\": [\"-u\", \"containerd\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/dra/image-config-containerd-1.7.yaml resources: limits: cpu: 2 @@ -280,279 +176,145 @@ periodics: cpu: 2 memory: 9Gi -presubmits: - kubernetes/kubernetes: +periodics: - # This jobs runs e2e.test with a focus on tests 
for the Dynamic Resource Allocation feature (currently beta) - # on a kind cluster with containerd updated to a version with CDI support. - - name: pull-kubernetes-kind-dra-canary - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image + - <<: *e2e-kind-job + <<: *periodic-job + name: ci-kind-dra-canary annotations: - testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-kind-dra-canary - decorate: true - path_alias: k8s.io/kubernetes - # Not relevant for most PRs. - always_run: false - # This covers most of the code related to dynamic resource allocation. - # Periodic variant: ci-kind-dra-canary - run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go - optional: true - decoration_config: - timeout: 90m - labels: - preset-service-account: "true" - preset-dind-enabled: "true" - preset-kind-volume-mounts: "true" - spec: - containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - args: - - /bin/sh - - -xc - - > - make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" && - curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind && - kind build node-image --image=dra/node:latest . 
&& - trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT && - kind create cluster --retain --config test/e2e/dra/kind.yaml --image dra/node:latest && - KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter='Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow' - - # docker-in-docker needs privileged mode - securityContext: - privileged: true - resources: - requests: - # these are both a bit below peak usage during build - # this is mostly for building kubernetes - memory: "9000Mi" - # during the tests more like 3-20m is used - cpu: 2000m - limits: - memory: "9000Mi" - cpu: 2000m + <<: *annotations + testgrid-tab-name: ci-kind-dra-canary + description: Runs E2E tests for Dynamic Resource Allocation beta features against a Kubernetes master cluster created with sigs.k8s.io/kind - # This jobs runs e2e.test with a focus on tests for the Dynamic Resource Allocation feature (partly alpha, partly beta) - # on a kind cluster with containerd updated to a version with CDI support. - # - # Compared to pull-kubernetes-dra-canary, this one enables all DRA-related features. - - name: pull-kubernetes-kind-dra-canary-all - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image + - <<: *e2e-kind-job + <<: *periodic-job + name: ci-kind-dra-all-canary annotations: - testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-kind-dra-canary-all - decorate: true - path_alias: k8s.io/kubernetes - # Not relevant for most PRs. - always_run: false - # This covers most of the code related to dynamic resource allocation. 
- # Periodic variant: ci-kind-dra-canary-all - run_if_changed: /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go - # The tests might still be flaky or this job might get triggered accidentally for - # an unrelated PR. - optional: true - decoration_config: - timeout: 90m - labels: - preset-service-account: "true" - preset-dind-enabled: "true" - preset-kind-volume-mounts: "true" + <<: *annotations + testgrid-tab-name: ci-kind-dra-all-canary + description: Runs E2E tests for Dynamic Resource Allocation alpha and beta features against a Kubernetes master cluster created with sigs.k8s.io/kind spec: containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - args: - - /bin/bash - - -xc - - | - set -ex - make WHAT="github.com/onsi/ginkgo/v2/ginkgo k8s.io/kubernetes/test/e2e/e2e.test" - curl -sSL https://kind.sigs.k8s.io/dl/latest/linux-amd64.tgz | tar xvfz - -C "${PATH%%:*}/" kind - kind build node-image --image=dra/node:latest . - trap 'kind export logs "${ARTIFACTS}/kind"; kind delete cluster' EXIT - # Which DRA features exist depends on the PR that is being tested. - features=( $(grep '"DRA' pkg/features/kube_features.go | sed 's/.*"\(.*\)"/\1/') ) - echo "Enabling DRA feature(s): ${features[*]}." - # Those additional features are not in kind.yaml, but they can be added at the end. 
- kind create cluster --retain --config <(cat test/e2e/dra/kind.yaml; for feature in ${features}; do echo " ${feature}: true"; done) --image dra/node:latest - KUBERNETES_PROVIDER=local KUBECONFIG=${HOME}/.kube/config GINKGO_PARALLEL_NODES=8 E2E_REPORT_DIR=${ARTIFACTS} GINKGO_TIMEOUT=1h hack/ginkgo-e2e.sh -ginkgo.label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Alpha, Beta, DynamicResourceAllocation$(for feature in ${features}; do echo , ${feature}; done)} && !Flaky && !Slow" - - # docker-in-docker needs privileged mode - securityContext: - privileged: true - resources: - requests: - # these are both a bit below peak usage during build - # this is mostly for building kubernetes - memory: "9000Mi" - # during the tests more like 3-20m is used - cpu: 2000m - limits: - memory: "9000Mi" - cpu: 2000m + - <<: *e2e-kind-container + env: + - name: with_all_features + value: "true" - - name: pull-kubernetes-node-e2e-crio-cgrpv1-dra-canary - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - always_run: false - # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. - # CRI-O was picked because it was solid for testing so far. 
- # Periodic variant: ci-node-e2e-crio-cgrpv1-dra-canary - run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - preset-pull-kubernetes-e2e: "true" - preset-pull-kubernetes-e2e-gce: "true" + - <<: *e2e-node-job + <<: *periodic-node-job + name: ci-node-e2e-cgrpv1-crio-dra-canary annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-node-kubelet-crio-cgrpv1-dra-canary - decorate: true - decoration_config: - timeout: 90m - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra + <<: *annotations + testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation + testgrid-tab-name: ci-node-e2e-cgrpv1-crio-dra-canary + description: Runs E2E node tests for Dynamic Resource Allocation beta features with CRI-O using cgroup v1 spec: containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --env=KUBE_SSH_USER=core - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' - - --node-tests=true - - 
--provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv1-serial.yaml + - <<: *e2e-node-container env: - - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - resources: - requests: - cpu: 4 - memory: 6Gi - limits: - cpu: 4 - memory: 6Gi + - name: runtime + value: crio-cgroupv1 - - name: pull-kubernetes-node-e2e-crio-cgrpv2-dra-canary - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - always_run: false - # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. - # CRI-O was picked because it was solid for testing so far. - # Periodic variant: ci-node-e2e-cgrpv2-crio-dra-canary - # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" - preset-pull-kubernetes-e2e: "true" - preset-pull-kubernetes-e2e-gce: "true" + - <<: *e2e-node-job + <<: *periodic-node-job + name: ci-node-e2e-cgrpv2-crio-dra-canary annotations: - testgrid-dashboards: sig-node-cri-o, sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-node-kubelet-crio-cgrpv2-dra-canary - decorate: true - decoration_config: - timeout: 90m - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra + <<: *annotations + testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation + description: Runs E2E node tests for Dynamic Resource Allocation beta features 
with CRI-O using cgroup v2 + testgrid-tab-name: ci-node-e2e-cgrpv2-crio-dra-canary spec: containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --env=KUBE_SSH_USER=core - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///var/run/crio/crio.sock --container-runtime-process-name=/usr/local/bin/crio --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/crio.service --kubelet-cgroups=/system.slice/kubelet.service" --extra-log="{\"name\": \"crio.log\", \"journalctl\": [\"-u\", \"crio\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' - - --timeout=65m - - --node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/crio/latest/image-config-cgroupv2-serial.yaml + - <<: *e2e-node-container env: - - name: IGNITION_INJECT_GCE_SSH_PUBLIC_KEY_FILE - value: "1" - resources: - requests: - cpu: 4 - memory: 6Gi - limits: - cpu: 4 - memory: 6Gi + - name: runtime + value: crio-cgroupv2 # selects the cgroup v2 image config in the container script - - name: pull-kubernetes-node-e2e-containerd-1-7-dra-canary - cluster: k8s-infra-prow-build - skip_branches: - - release-\d+\.\d+ # per-release image - always_run: false - # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. - # CRI-O was picked because it was solid for testing so far.
- # Periodic variant: ci-node-e2e-containerd-1-7-dra-canary - # run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) - optional: true - skip_report: false - labels: - preset-service-account: "true" - preset-k8s-ssh: "true" + - <<: *e2e-node-job + <<: *periodic-node-job + name: ci-node-e2e-containerd-dra-canary annotations: - testgrid-dashboards: sig-node-presubmits, sig-node-dynamic-resource-allocation - testgrid-tab-name: pr-node-kubelet-containerd-dra-canary - decorate: true - decoration_config: - timeout: 90m - path_alias: k8s.io/kubernetes - extra_refs: - - org: kubernetes - repo: test-infra - base_ref: master - path_alias: k8s.io/test-infra + <<: *annotations + description: Runs E2E node tests for Dynamic Resource Allocation beta features with containerd + testgrid-tab-name: ci-node-e2e-containerd-dra-canary spec: containers: - - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20241128-8df65c072f-master - command: - - runner.sh - - /workspace/scenarios/kubernetes_e2e.py - args: - - --deployment=node - - --gcp-zone=us-west1-b - - '--node-test-args=--feature-gates=DynamicResourceAllocation=true --service-feature-gates=DynamicResourceAllocation=true --runtime-config=api/beta=true --container-runtime-endpoint=unix:///run/containerd/containerd.sock --container-runtime-process-name=/usr/bin/containerd --container-runtime-pid-file= --kubelet-flags="--cgroup-driver=systemd --cgroups-per-qos=true --cgroup-root=/ --runtime-cgroups=/system.slice/containerd.service" --extra-log="{\"name\": \"containerd.log\", \"journalctl\": [\"-u\", \"containerd\"]}"' - - --node-tests=true - - --provider=gce - - '--test_args=--timeout=1h --label-filter="Feature: containsAny DynamicResourceAllocation && Feature: isSubsetOf { Beta, DynamicResourceAllocation } && !Flaky && !Slow"' - - --timeout=65m - -
--node-args=--image-config-file=/home/prow/go/src/k8s.io/test-infra/jobs/e2e_node/dra/image-config-containerd-1.7.yaml - resources: - requests: - cpu: 4 - memory: 6Gi - limits: - cpu: 4 - memory: 6Gi + - <<: *e2e-node-container + env: + - name: runtime + value: containerd + +presubmits: + kubernetes/kubernetes: + + - <<: *e2e-kind-job + <<: *presubmit-job + name: pull-kubernetes-kind-dra-canary + annotations: + <<: *annotations + testgrid-tab-name: pull-kubernetes-kind-dra-canary + description: Runs E2E tests for Dynamic Resource Allocation beta features against a Kubernetes master cluster created with sigs.k8s.io/kind + run_if_changed: &e2e-run-if-changed-e2e /(dra|dynamicresources|resourceclaim|deviceclass|resourceslice|resourceclaimtemplate|dynamic-resource-allocation|pkg/apis/resource|api/resource)/.*.go + + - <<: *e2e-kind-job + <<: *presubmit-job + name: pull-kubernetes-kind-dra-all-canary + annotations: + <<: *annotations + testgrid-tab-name: pull-kubernetes-kind-dra-all-canary + description: Runs E2E tests for Dynamic Resource Allocation alpha and beta features against a Kubernetes master cluster created with sigs.k8s.io/kind + run_if_changed: *e2e-run-if-changed-e2e + spec: + containers: + - <<: *e2e-kind-container + env: + - name: with_all_features + value: "true" + + - <<: *e2e-node-job + <<: *presubmit-job + name: pull-kubernetes-node-e2e-cgrpv1-crio-dra-canary + annotations: + <<: *annotations + testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation + testgrid-tab-name: pull-kubernetes-node-e2e-cgrpv1-crio-dra-canary + description: Runs E2E node tests for Dynamic Resource Allocation beta features with CRI-O using cgroup v1 + # Automatically testing with one container runtime in one configuration is sufficient to detect basic problems in kubelet early. + # CRI-O was picked because it was solid for testing so far. 
+ # Periodic variant: ci-node-e2e-crio-cgrpv1-dra + run_if_changed: (/dra/|/dynamicresources/|/resourceclaim/|/deviceclass/|/resourceslice/|/resourceclaimtemplate/|/dynamic-resource-allocation/|/pkg/apis/resource/|/api/resource/|/test/e2e_node/dra_).*\.(go|yaml) + spec: + containers: + - <<: *e2e-node-container + env: + - name: runtime + value: crio-cgroupv1 + + - <<: *e2e-node-job + <<: *presubmit-job + name: pull-kubernetes-node-e2e-cgrpv2-crio-dra-canary + annotations: + <<: *annotations + testgrid-dashboards: sig-node-cri-o, sig-node-dynamic-resource-allocation + description: Runs E2E node tests for Dynamic Resource Allocation beta features with CRI-O using cgroup v2 + testgrid-tab-name: pull-kubernetes-node-e2e-cgrpv2-crio-dra-canary + spec: + containers: + - <<: *e2e-node-container + env: + - name: runtime + value: crio-cgroupv2 + + - <<: *e2e-node-job + <<: *presubmit-job + name: pull-kubernetes-node-e2e-containerd-dra-canary + annotations: + <<: *annotations + description: Runs E2E node tests for Dynamic Resource Allocation beta features with containerd + testgrid-tab-name: pull-kubernetes-node-e2e-containerd-dra-canary + spec: + containers: + - <<: *e2e-node-container + env: + - name: runtime + value: containerd