Skip to content

Commit

Permalink
Merge pull request #1063 from mmerkes/release-1.31
Browse files Browse the repository at this point in the history
Cherry picks gracefully handles getting instance topology and unhandled variants in 1.31
  • Loading branch information
k8s-ci-robot authored Nov 25, 2024
2 parents 5bd8cb9 + 881a6c0 commit 31b87bc
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 1 deletion.
6 changes: 5 additions & 1 deletion pkg/providers/v1/instances_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog/v2"
)

func (c *Cloud) getProviderID(ctx context.Context, node *v1.Node) (string, error) {
Expand Down Expand Up @@ -82,8 +83,11 @@ func (c *Cloud) getAdditionalLabels(ctx context.Context, zoneName string, instan
// If topology labels are already set, skip.
if _, ok := existingLabels[LabelNetworkNodePrefix+"1"]; !ok {
nodeTopology, err := c.instanceTopologyManager.GetNodeTopology(ctx, instanceType, region, instanceID)
// We've seen some edge cases where this functionality is problematic, so swallowing errors and logging
// to avoid short-circuiting syncing nodes. If it's an intermittent issue, the labels will be added
// on subsequent attempts.
if err != nil {
return nil, err
klog.Warningf("Failed to get node topology. Moving on without setting labels: %q", err)
} else if nodeTopology != nil {
for index, networkNode := range nodeTopology.NetworkNodes {
layer := index + 1
Expand Down
25 changes: 25 additions & 0 deletions pkg/providers/v1/instances_v2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/stretchr/testify/mock"
v1 "k8s.io/api/core/v1"
"k8s.io/cloud-provider-aws/pkg/resourcemanagers"
"k8s.io/cloud-provider-aws/pkg/services"
)

func TestGetProviderId(t *testing.T) {
Expand Down Expand Up @@ -236,6 +237,30 @@ func TestInstanceMetadata(t *testing.T) {
// Validate that labels are unchanged.
assert.Equal(t, map[string]string{}, result.AdditionalLabels)
})

t.Run("Should swallow errors if getting node topology fails", func(t *testing.T) {
instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true)
c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance})
var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager
c.instanceTopologyManager = &mockedTopologyManager
mockedTopologyManager.On("GetNodeTopology", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil,
services.NewMockAPIError("InvalidParameterValue", "Nope."))
node := &v1.Node{
Spec: v1.NodeSpec{
ProviderID: fmt.Sprintf("aws:///us-west-2c/1abc-2def/%s", *instance.InstanceId),
},
}

result, err := c.InstanceMetadata(context.TODO(), node)
if err != nil {
t.Errorf("Should not error getting InstanceMetadata: %s", err)
}

mockedTopologyManager.AssertNumberOfCalls(t, "GetNodeTopology", 1)
assert.Equal(t, map[string]string{
LabelZoneID: "az1",
}, result.AdditionalLabels)
})
}

func getCloudWithMockedDescribeInstances(instanceExists bool, instanceState string) *Cloud {
Expand Down
5 changes: 5 additions & 0 deletions pkg/resourcemanagers/topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ func (t *instanceTopologyManager) addUnsupported(key string) {
}

func (t *instanceTopologyManager) mightSupportTopology(instanceType string, region string) bool {
// In the case of fargate and possibly other variants, the instance type will be empty.
if len(instanceType) == 0 {
return false
}

if _, exists, err := t.unsupportedKeyStore.GetByKey(region); exists {
return false
} else if err != nil {
Expand Down
15 changes: 15 additions & 0 deletions pkg/resourcemanagers/topology_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,21 @@ import (
)

func TestGetNodeTopology(t *testing.T) {
t.Run("Should skip nodes that don't have instance type set", func(t *testing.T) {
mockedEc2SdkV2 := services.MockedEc2SdkV2{}
topologyManager := NewInstanceTopologyManager(&mockedEc2SdkV2)
// Loop multiple times to check cache use
topology, err := topologyManager.GetNodeTopology(context.TODO(), "" /* empty instance type */, "some-region", "some-id")
if err != nil {
t.Errorf("Should not error getting node topology: %s", err)
}
if topology != nil {
t.Errorf("Should not be returning a topology: %v", topology)
}

mockedEc2SdkV2.AssertNumberOfCalls(t, "DescribeInstanceTopology", 0)
})

t.Run("Should handle unsupported regions and utilize cache", func(t *testing.T) {
mockedEc2SdkV2 := services.MockedEc2SdkV2{}
topologyManager := NewInstanceTopologyManager(&mockedEc2SdkV2)
Expand Down

0 comments on commit 31b87bc

Please sign in to comment.