Skip to content

Commit

Permalink
Remove unnecessary code to check for ComputeDomain existence and reacat
Browse files Browse the repository at this point in the history
We no longer need this code now that we have a finalizer on the
ComputeDomain object itself that only gets removed if all other linked
objects have been removed.

Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed Jan 27, 2025
1 parent c19f6c0 commit 800d905
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 197 deletions.
8 changes: 2 additions & 6 deletions cmd/nvidia-dra-imex-controller/computedomain.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (

nvapi "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/v1beta1"
nvinformers "github.com/NVIDIA/k8s-dra-driver/pkg/nvidia.com/informers/externalversions"
nvlisters "github.com/NVIDIA/k8s-dra-driver/pkg/nvidia.com/listers/resource/v1beta1"
)

type GetComputeDomainFunc func(uid string) (*nvapi.ComputeDomain, error)
Expand All @@ -47,7 +46,6 @@ type ComputeDomainManager struct {

factory nvinformers.SharedInformerFactory
informer cache.SharedIndexInformer
lister nvlisters.ComputeDomainLister

deploymentManager *DeploymentManager
deviceClassManager *DeviceClassManager
Expand All @@ -58,17 +56,15 @@ type ComputeDomainManager struct {
func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
factory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, informerResyncPeriod)
informer := factory.Resource().V1beta1().ComputeDomains().Informer()
lister := nvlisters.NewComputeDomainLister(informer.GetIndexer())

m := &ComputeDomainManager{
config: config,
factory: factory,
informer: informer,
lister: lister,
}
m.deploymentManager = NewDeploymentManager(config, m.Get)
m.deviceClassManager = NewDeviceClassManager(config, m.Get)
m.resourceClaimManager = NewResourceClaimManager(config, m.Get)
m.deviceClassManager = NewDeviceClassManager(config)
m.resourceClaimManager = NewResourceClaimManager(config)

return m
}
Expand Down
33 changes: 8 additions & 25 deletions cmd/nvidia-dra-imex-controller/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/client-go/informers"
appsv1listers "k8s.io/client-go/listers/apps/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"

Expand Down Expand Up @@ -63,7 +62,6 @@ type DeploymentManager struct {

factory informers.SharedInformerFactory
informer cache.SharedIndexInformer
lister appsv1listers.DeploymentLister

resourceClaimTemplateManager *ResourceClaimTemplateManager
imexChannelManager *ImexChannelManager
Expand All @@ -90,18 +88,16 @@ func NewDeploymentManager(config *ManagerConfig, getComputeDomain GetComputeDoma
)

informer := factory.Apps().V1().Deployments().Informer()
lister := factory.Apps().V1().Deployments().Lister()

m := &DeploymentManager{
config: config,
getComputeDomain: getComputeDomain,
factory: factory,
informer: informer,
lister: lister,
podManagers: make(map[string]*DeploymentPodManager),
}
m.imexChannelManager = NewImexChannelManager(config)
m.resourceClaimTemplateManager = NewResourceClaimTemplateManager(config, getComputeDomain)
m.resourceClaimTemplateManager = NewResourceClaimTemplateManager(config)

return m
}
Expand Down Expand Up @@ -307,26 +303,8 @@ func (m *DeploymentManager) onAddOrUpdate(ctx context.Context, obj any) error {
return fmt.Errorf("failed to cast to Deployment")
}

d, err := m.lister.Deployments(d.Namespace).Get(d.Name)
if err != nil && errors.IsNotFound(err) {
return nil
}
if err != nil {
return fmt.Errorf("erroring retreiving Deployment: %w", err)
}

klog.Infof("Processing added or updated Deployment: %s/%s", d.Namespace, d.Name)

cd, err := m.getComputeDomain(d.Labels[computeDomainLabelKey])
if err != nil {
return fmt.Errorf("error getting ComputeDomain: %w", err)
}
if cd == nil {
if err := m.Delete(ctx, d.Labels[computeDomainLabelKey]); err != nil {
return fmt.Errorf("error deleting Deployment '%s/%s': %w", d.Namespace, d.Name, err)
}
return nil
}

Check failure on line 308 in cmd/nvidia-dra-imex-controller/deployment.go

View workflow job for this annotation

GitHub Actions / check

File is not properly formatted (gofmt)
if err := m.addPodManager(ctx, d.Spec.Selector, int(*d.Spec.Replicas)); err != nil {
return fmt.Errorf("error adding Pod manager '%s/%s': %w", d.Namespace, d.Name, err)
Expand All @@ -336,14 +314,19 @@ func (m *DeploymentManager) onAddOrUpdate(ctx context.Context, obj any) error {
return nil
}

if err := m.createOrUpdatePool(d, cd); err != nil {
if err := m.createOrUpdatePool(d); err != nil {
return fmt.Errorf("error creating or updating pool: %w", err)
}

return nil
}

func (m *DeploymentManager) createOrUpdatePool(d *appsv1.Deployment, cd *nvapi.ComputeDomain) error {
func (m *DeploymentManager) createOrUpdatePool(d *appsv1.Deployment) error {
cd, err := m.getComputeDomain(d.Labels[computeDomainLabelKey])
if cd == nil || err != nil {
return fmt.Errorf("error getting ComputeDomain: %w", err)
}

var nodeNames []string
for _, node := range cd.Status.Nodes {
nodeNames = append(nodeNames, node.Name)
Expand Down
62 changes: 7 additions & 55 deletions cmd/nvidia-dra-imex-controller/deviceclass.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,22 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/informers"
resourcelisters "k8s.io/client-go/listers/resource/v1beta1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"

nvapi "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/v1beta1"
)

type DeviceClassManager struct {
config *ManagerConfig
waitGroup sync.WaitGroup
cancelContext context.CancelFunc
getComputeDomain GetComputeDomainFunc
config *ManagerConfig
waitGroup sync.WaitGroup
cancelContext context.CancelFunc

factory informers.SharedInformerFactory
informer cache.SharedIndexInformer
lister resourcelisters.DeviceClassLister
}

func NewDeviceClassManager(config *ManagerConfig, getComputeDomain GetComputeDomainFunc) *DeviceClassManager {
func NewDeviceClassManager(config *ManagerConfig) *DeviceClassManager {
labelSelector := &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Expand All @@ -62,14 +59,11 @@ func NewDeviceClassManager(config *ManagerConfig, getComputeDomain GetComputeDom
)

informer := factory.Resource().V1beta1().DeviceClasses().Informer()
lister := factory.Resource().V1beta1().DeviceClasses().Lister()

m := &DeviceClassManager{
config: config,
getComputeDomain: getComputeDomain,
factory: factory,
informer: informer,
lister: lister,
config: config,
factory: factory,
informer: informer,
}

return m
Expand All @@ -91,18 +85,6 @@ func (m *DeviceClassManager) Start(ctx context.Context) (rerr error) {
return fmt.Errorf("error adding indexer for MulitNodeEnvironment label: %w", err)
}

_, err := m.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj any) {
m.config.workQueue.Enqueue(obj, m.onAddOrUpdate)
},
UpdateFunc: func(objOld, objNew any) {
m.config.workQueue.Enqueue(objNew, m.onAddOrUpdate)
},
})
if err != nil {
return fmt.Errorf("error adding event handlers for DeviceClass informer: %w", err)
}

m.waitGroup.Add(1)
go func() {
defer m.waitGroup.Done()
Expand Down Expand Up @@ -237,33 +219,3 @@ func (m *DeviceClassManager) RemoveFinalizer(ctx context.Context, cdUID string)

return nil
}

func (m *DeviceClassManager) onAddOrUpdate(ctx context.Context, obj any) error {
dc, ok := obj.(*resourceapi.DeviceClass)
if !ok {
return fmt.Errorf("failed to cast to DeviceClass")
}

dc, err := m.lister.Get(dc.Name)
if err != nil && errors.IsNotFound(err) {
return nil
}
if err != nil {
return fmt.Errorf("erroring retreiving DeviceClass: %w", err)
}

klog.Infof("Processing added or updated DeviceClass: %s", dc.Name)

cd, err := m.getComputeDomain(dc.Labels[computeDomainLabelKey])
if err != nil {
return fmt.Errorf("error getting ComputeDomain: %w", err)
}
if cd == nil {
if err := m.Delete(ctx, dc.Labels[computeDomainLabelKey]); err != nil {
return fmt.Errorf("error deleting DeviceClass '%s': %w", dc.Name, err)
}
return nil
}

return nil
}
63 changes: 7 additions & 56 deletions cmd/nvidia-dra-imex-controller/resourceclaim.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,22 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/informers"
resourcelisters "k8s.io/client-go/listers/resource/v1beta1"
"k8s.io/client-go/tools/cache"
"k8s.io/klog/v2"

nvapi "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/v1beta1"
)

type ResourceClaimManager struct {
config *ManagerConfig
waitGroup sync.WaitGroup
cancelContext context.CancelFunc
getComputeDomain GetComputeDomainFunc
config *ManagerConfig
waitGroup sync.WaitGroup
cancelContext context.CancelFunc

factory informers.SharedInformerFactory
informer cache.SharedIndexInformer
lister resourcelisters.ResourceClaimLister
}

func NewResourceClaimManager(config *ManagerConfig, getComputeDomain GetComputeDomainFunc) *ResourceClaimManager {
func NewResourceClaimManager(config *ManagerConfig) *ResourceClaimManager {
labelSelector := &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Expand All @@ -62,14 +59,11 @@ func NewResourceClaimManager(config *ManagerConfig, getComputeDomain GetComputeD
)

informer := factory.Resource().V1beta1().ResourceClaims().Informer()
lister := factory.Resource().V1beta1().ResourceClaims().Lister()

m := &ResourceClaimManager{
config: config,
getComputeDomain: getComputeDomain,
factory: factory,
informer: informer,
lister: lister,
config: config,
factory: factory,
informer: informer,
}

return m
Expand All @@ -91,18 +85,6 @@ func (m *ResourceClaimManager) Start(ctx context.Context) (rerr error) {
return fmt.Errorf("error adding indexer for MulitNodeEnvironment label: %w", err)
}

_, err := m.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(obj any) {
m.config.workQueue.Enqueue(obj, m.onAddOrUpdate)
},
UpdateFunc: func(objOld, objNew any) {
m.config.workQueue.Enqueue(objNew, m.onAddOrUpdate)
},
})
if err != nil {
return fmt.Errorf("error adding event handlers for ResourceClaim informer: %w", err)
}

m.waitGroup.Add(1)
go func() {
defer m.waitGroup.Done()
Expand Down Expand Up @@ -196,7 +178,6 @@ func (m *ResourceClaimManager) RemoveFinalizer(ctx context.Context, cdUID string
return nil
}


for _, rc := range rcs {
newRC := rc.DeepCopy()
newRC.Finalizers = []string{}
Expand All @@ -216,33 +197,3 @@ func (m *ResourceClaimManager) RemoveFinalizer(ctx context.Context, cdUID string

return nil
}

func (m *ResourceClaimManager) onAddOrUpdate(ctx context.Context, obj any) error {
rc, ok := obj.(*resourceapi.ResourceClaim)
if !ok {
return fmt.Errorf("failed to cast to ResourceClaim")
}

rc, err := m.lister.ResourceClaims(rc.Namespace).Get(rc.Name)
if err != nil && errors.IsNotFound(err) {
return nil
}
if err != nil {
return fmt.Errorf("erroring retreiving ResourceClaim: %w", err)
}

klog.Infof("Processing added or updated ResourceClaim: %s/%s", rc.Namespace, rc.Name)

cd, err := m.getComputeDomain(rc.Labels[computeDomainLabelKey])
if err != nil {
return fmt.Errorf("error getting ComputeDomain: %w", err)
}
if cd == nil {
if err := m.Delete(ctx, rc.Labels[computeDomainLabelKey]); err != nil {
return fmt.Errorf("error deleting ResourceClaim '%s/%s': %w", rc.Namespace, rc.Name, err)
}
return nil
}

return nil
}
Loading

0 comments on commit 800d905

Please sign in to comment.