From e5c96bad083b85fbcc169c377a97b8d8c061cefd Mon Sep 17 00:00:00 2001 From: michael mccune Date: Fri, 5 Jan 2024 16:49:43 -0500 Subject: [PATCH] UPSTREAM: carry: improve replica counting on openshift This change adds logic to count the number of owned machines by each machineset when calculating the replica count to the core autoscaler. It is needed because the machine-api controllers do not include machines in deleting phase when updating their replica field. This causes a problem with the core autoscaler as the count of nodes will not match the resources from the cloud provider. This can be removed when the machine-api controllers have been fully removed from openshift. --- .../clusterapi/clusterapi_controller.go | 1 + .../clusterapi/clusterapi_unstructured.go | 57 +++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go index e6296ea46807..8ef245b28261 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go @@ -46,6 +46,7 @@ const ( machinePoolProviderIDIndex = "machinePoolProviderIDIndex" nodeProviderIDIndex = "nodeProviderIDIndex" defaultCAPIGroup = "cluster.x-k8s.io" + openshiftMAPIGroup = "machine.openshift.io" // CAPIGroupEnvVar contains the environment variable name which allows overriding defaultCAPIGroup. CAPIGroupEnvVar = "CAPI_GROUP" // CAPIVersionEnvVar contains the environment variable name which allows overriding the Cluster API group version. diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go index b9801f2575d2..907c3ff7e31b 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go @@ -18,6 +18,7 @@ package clusterapi import ( "context" + "encoding/json" "fmt" "path" "strings" @@ -94,6 +95,13 @@ func (r unstructuredScalableResource) Replicas() (int, error) { return 0, err } + // this function needs to differentiate between machine-api and cluster-api + // due to the fact that the machine-api controllers exclude machines in + // deleting phase when calculating replicas. + if gvr.Group == openshiftMAPIGroup { + return r.replicas_openshift() + } + s, err := r.controller.managementScaleClient.Scales(r.Namespace()).Get(context.TODO(), gvr.GroupResource(), r.Name(), metav1.GetOptions{}) if err != nil { return 0, err @@ -105,6 +113,55 @@ func (r unstructuredScalableResource) Replicas() (int, error) { return int(s.Spec.Replicas), nil } +func (r unstructuredScalableResource) replicas_openshift() (int, error) { + gvr, err := r.GroupVersionResource() + if err != nil { + return 0, err + } + + if gvr.Group != openshiftMAPIGroup { + return 0, fmt.Errorf("incorrect group for replica count on %s %s/%s", r.Kind(), r.Namespace(), r.Name()) + } + + // get the selector labels from the scalable resource to find the machines + selector, found, err := unstructured.NestedMap(r.unstructured.Object, "spec", "selector") + if !found || err != nil { + return 0, err + } + + // we want to massage the unstructured selector data into a LabelSelector struct + // so that we can more easily create the necessary string for the ListOptions struct, + // the following code helps with that. + data, err := json.Marshal(selector) + if err != nil { + return 0, err + } + + var labelSelector metav1.LabelSelector + err = json.Unmarshal(data, &labelSelector) + if err != nil { + return 0, err + } + + // get a list of machines filtered by the namespace and the selector labels from the scalable resource + machinesList, err := r.controller.managementClient.Resource(r.controller.machineResource).Namespace(r.Namespace()).List(context.TODO(), metav1.ListOptions{LabelSelector: labelSelector.String()}) + if err != nil { + return 0, err + } + + // filter out inactive machines + var activeMachines []unstructured.Unstructured + for _, item := range machinesList.Items { + if metav1.GetControllerOf(&item) != nil && !metav1.IsControlledBy(&item, r.unstructured) { + continue + } + + activeMachines = append(activeMachines, item) + } + + return len(activeMachines), nil +} + func (r unstructuredScalableResource) SetSize(nreplicas int) error { switch { case nreplicas > r.maxSize: