Refactor UpgradeStrategy to UpgradeSpec.Type (#2678)
ryanaoleary authored Jan 3, 2025
1 parent 6350033 commit f0ff2c1
Showing 11 changed files with 178 additions and 68 deletions.
20 changes: 18 additions & 2 deletions docs/reference/api.md
@@ -209,7 +209,7 @@ _Appears in:_
| `serviceUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `deploymentUnhealthySecondThreshold` _integer_ | Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685 | | |
| `serveService` _[Service](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.28/#service-v1-core)_ | ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics. | | |
| `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None` | | |
| `upgradeStrategy` _[RayServiceUpgradeStrategy](#rayserviceupgradestrategy)_ | UpgradeStrategy defines the scaling policy used when upgrading the RayService. | | |
| `serveConfigV2` _string_ | Important: Run "make" to regenerate code after modifying this file<br />Defines the applications and deployments to deploy, should be a YAML multi-line scalar string. | | |
| `rayClusterConfig` _[RayClusterSpec](#rayclusterspec)_ | | | |
| `excludeHeadPodFromServeSvc` _boolean_ | If the field is set to true, the value of the label `ray.io/serve` on the head Pod should always be false.<br />Therefore, the head Pod's endpoint will not be added to the Kubernetes Serve service. | | |
@@ -219,7 +219,7 @@ _Appears in:_

#### RayServiceUpgradeStrategy

_Underlying type:_ _string_




@@ -228,6 +228,22 @@ _Underlying type:_ _string_
_Appears in:_
- [RayServiceSpec](#rayservicespec)

| Field | Description | Default | Validation |
| --- | --- | --- | --- |
| `type` _[RayServiceUpgradeType](#rayserviceupgradetype)_ | Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`. | | |


#### RayServiceUpgradeType

_Underlying type:_ _string_





_Appears in:_
- [RayServiceUpgradeStrategy](#rayserviceupgradestrategy)



#### ScaleStrategy
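For the `upgradeStrategy` field documented above, here is a minimal sketch of how the new nested field is populated from Go client code, equivalent to setting `upgradeStrategy.type: NewCluster` in a RayService manifest. The object name is illustrative, and required fields such as the RayCluster spec and Serve config are omitted:

```go
package main

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

func main() {
	// Request a zero-downtime upgrade: a new cluster is created and traffic
	// switches over once it becomes ready.
	upgradeType := rayv1.NewCluster

	svc := rayv1.RayService{
		ObjectMeta: metav1.ObjectMeta{Name: "example-rayservice"},
		Spec: rayv1.RayServiceSpec{
			// Before this commit: UpgradeStrategy was itself the string-typed value.
			// After this commit: the value lives under UpgradeStrategy.Type.
			UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
				Type: &upgradeType,
			},
			// RayClusterSpec, ServeConfigV2, etc. are omitted for brevity.
		},
	}
	_ = svc
}
```

Omitting `upgradeStrategy` keeps the default behavior: zero-downtime upgrade stays enabled unless the operator's `ENABLE_ZERO_DOWNTIME` environment variable is set to `false` (see the controller change below).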
5 changes: 4 additions & 1 deletion helm-chart/kuberay-operator/crds/ray.io_rayservices.yaml

Some generated files are not rendered by default.

13 changes: 9 additions & 4 deletions ray-operator/apis/ray/v1/rayservice_types.go
@@ -16,13 +16,13 @@ const (
Restarting ServiceStatus = "Restarting"
)

type RayServiceUpgradeStrategy string
type RayServiceUpgradeType string

const (
// During upgrade, NewCluster strategy will create new upgraded cluster and switch to it when it becomes ready
NewCluster RayServiceUpgradeStrategy = "NewCluster"
NewCluster RayServiceUpgradeType = "NewCluster"
// No new cluster will be created while the strategy is set to None
None RayServiceUpgradeStrategy = "None"
None RayServiceUpgradeType = "None"
)

// These statuses should match Ray Serve's application statuses
@@ -54,6 +54,11 @@ var DeploymentStatusEnum = struct {
UNHEALTHY: "UNHEALTHY",
}

type RayServiceUpgradeStrategy struct {
// Type represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`.
Type *RayServiceUpgradeType `json:"type,omitempty"`
}

// RayServiceSpec defines the desired state of RayService
type RayServiceSpec struct {
// Deprecated: This field is not used anymore. ref: https://github.com/ray-project/kuberay/issues/1685
@@ -62,7 +67,7 @@ type RayServiceSpec struct {
DeploymentUnhealthySecondThreshold *int32 `json:"deploymentUnhealthySecondThreshold,omitempty"`
// ServeService is the Kubernetes service for head node and worker nodes who have healthy http proxy to serve traffics.
ServeService *corev1.Service `json:"serveService,omitempty"`
// UpgradeStrategy represents the strategy used when upgrading the RayService. Currently supports `NewCluster` and `None`
// UpgradeStrategy defines the scaling policy used when upgrading the RayService.
UpgradeStrategy *RayServiceUpgradeStrategy `json:"upgradeStrategy,omitempty"`
// Important: Run "make" to regenerate code after modifying this file
// Defines the applications and deployments to deploy, should be a YAML multi-line scalar string.
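Because both `UpgradeStrategy` and its `Type` field in the new struct above are optional pointers, callers need a nil check before dereferencing. Below is a minimal nil-safe accessor sketch, assuming the same `rayv1` import alias used in the repository; the helper name and package are illustrative and not part of this commit:

```go
package rayutil

import (
	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// getUpgradeType returns the configured upgrade type, or the empty string when
// the upgrade strategy or its type is unset.
func getUpgradeType(spec *rayv1.RayServiceSpec) rayv1.RayServiceUpgradeType {
	if spec == nil || spec.UpgradeStrategy == nil || spec.UpgradeStrategy.Type == nil {
		return ""
	}
	return *spec.UpgradeStrategy.Type
}
```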
22 changes: 21 additions & 1 deletion ray-operator/apis/ray/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

5 changes: 4 additions & 1 deletion ray-operator/config/crd/bases/ray.io_rayservices.yaml

Some generated files are not rendered by default.

31 changes: 18 additions & 13 deletions ray-operator/controllers/ray/rayservice_controller.go
@@ -238,10 +238,12 @@ func validateRayServiceSpec(rayService *rayv1.RayService) error {
if headSvc := rayService.Spec.RayClusterSpec.HeadGroupSpec.HeadService; headSvc != nil && headSvc.Name != "" {
return fmt.Errorf("spec.rayClusterConfig.headGroupSpec.headService.metadata.name should not be set")
}

if upgradeStrategy := rayService.Spec.UpgradeStrategy; upgradeStrategy != nil {
if *upgradeStrategy != rayv1.NewCluster && *upgradeStrategy != rayv1.None {
return fmt.Errorf("spec.UpgradeStrategy value %s is invalid, valid options are %s or %s", *upgradeStrategy, rayv1.NewCluster, rayv1.None)
if rayService.Spec.UpgradeStrategy == nil {
return nil
}
if upgradeType := rayService.Spec.UpgradeStrategy.Type; upgradeType != nil {
if *upgradeType != rayv1.NewCluster && *upgradeType != rayv1.None {
return fmt.Errorf("Spec.UpgradeStrategy.Type value %s is invalid, valid options are %s or %s", *upgradeType, rayv1.NewCluster, rayv1.None)
}
}
return nil
@@ -411,24 +413,27 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
// For LLM serving, some users might not have sufficient GPU resources to run two RayClusters simultaneously.
// Therefore, KubeRay offers ENABLE_ZERO_DOWNTIME as a feature flag for zero-downtime upgrades.
zeroDowntimeEnvVar := os.Getenv(ENABLE_ZERO_DOWNTIME)
rayServiceSpecUpgradeStrategy := rayServiceInstance.Spec.UpgradeStrategy
// There are two ways to enable zero downtime upgrade. Through ENABLE_ZERO_DOWNTIME env var or setting Spec.UpgradeStrategy.
var rayServiceSpecUpgradeType *rayv1.RayServiceUpgradeType
if rayServiceInstance.Spec.UpgradeStrategy != nil {
rayServiceSpecUpgradeType = rayServiceInstance.Spec.UpgradeStrategy.Type
}
// There are two ways to enable zero downtime upgrade. Through ENABLE_ZERO_DOWNTIME env var or setting Spec.UpgradeStrategy.Type.
// If no fields are set, zero downtime upgrade by default is enabled.
// Spec.UpgradeStrategy takes precedence over ENABLE_ZERO_DOWNTIME.
// Spec.UpgradeStrategy.Type takes precedence over ENABLE_ZERO_DOWNTIME.
enableZeroDowntime := true
strategyMessage := ""
typeMessage := ""
if zeroDowntimeEnvVar != "" {
enableZeroDowntime = strings.ToLower(zeroDowntimeEnvVar) != "false"
strategyMessage = fmt.Sprintf("ENABLE_ZERO_DOWNTIME environmental variable is set to %q", strings.ToLower(zeroDowntimeEnvVar))
typeMessage = fmt.Sprintf("ENABLE_ZERO_DOWNTIME environmental variable is set to %q", strings.ToLower(zeroDowntimeEnvVar))
}
if rayServiceSpecUpgradeStrategy != nil {
enableZeroDowntime = *rayServiceSpecUpgradeStrategy == rayv1.NewCluster
strategyMessage = fmt.Sprintf("Upgrade Strategy is set to %q", *rayServiceSpecUpgradeStrategy)
if rayServiceSpecUpgradeType != nil {
enableZeroDowntime = *rayServiceSpecUpgradeType == rayv1.NewCluster
typeMessage = fmt.Sprintf("Upgrade Type is set to %q", *rayServiceSpecUpgradeType)
}

if enableZeroDowntime || !enableZeroDowntime && activeRayCluster == nil {
if enableZeroDowntime && activeRayCluster != nil {
r.Recorder.Event(rayServiceInstance, "Normal", "UpgradeStrategy", strategyMessage)
r.Recorder.Event(rayServiceInstance, "Normal", "UpgradeStrategy.Type", typeMessage)
}
// Add a pending cluster name. In the next reconcile loop, shouldPrepareNewRayCluster will return DoNothing and we will
// actually create the pending RayCluster instance.
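The change above can be summarized as a precedence rule: zero-downtime upgrade defaults to enabled, `ENABLE_ZERO_DOWNTIME` can turn it off, and an explicit `Spec.UpgradeStrategy.Type` overrides the environment variable. The condensed standalone sketch below is illustrative and is not a function in the repository:

```go
package rayutil

import (
	"os"
	"strings"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

// zeroDowntimeEnabled mirrors the decision made in reconcileRayCluster:
// enabled by default, optionally disabled via ENABLE_ZERO_DOWNTIME, with an
// explicit Spec.UpgradeStrategy.Type taking precedence over the env var.
func zeroDowntimeEnabled(spec rayv1.RayServiceSpec) bool {
	enabled := true
	if env := os.Getenv("ENABLE_ZERO_DOWNTIME"); env != "" {
		enabled = strings.ToLower(env) != "false"
	}
	if spec.UpgradeStrategy != nil && spec.UpgradeStrategy.Type != nil {
		enabled = *spec.UpgradeStrategy.Type == rayv1.NewCluster
	}
	return enabled
}
```

With this ordering, setting `type: None` prevents a new upgraded cluster from being created even when `ENABLE_ZERO_DOWNTIME` is unset or `true`, as exercised by test case 9 in the unit tests below.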
75 changes: 39 additions & 36 deletions ray-operator/controllers/ray/rayservice_controller_unit_test.go
@@ -47,13 +47,15 @@ func TestValidateRayServiceSpec(t *testing.T) {
})
assert.NoError(t, err, "The RayService spec is valid.")

var upgradeStrat rayv1.RayServiceUpgradeStrategy = "invalidStrategy"
var upgradeStrat rayv1.RayServiceUpgradeType = "invalidStrategy"
err = validateRayServiceSpec(&rayv1.RayService{
Spec: rayv1.RayServiceSpec{
UpgradeStrategy: &upgradeStrat,
UpgradeStrategy: &rayv1.RayServiceUpgradeStrategy{
Type: &upgradeStrat,
},
},
})
assert.Error(t, err, "spec.UpgradeStrategy is invalid")
assert.Error(t, err, "spec.UpgradeSpec.Type is invalid")
}

func TestGenerateHashWithoutReplicasAndWorkersToDelete(t *testing.T) {
@@ -771,13 +773,13 @@ func TestReconcileRayCluster(t *testing.T) {
}

tests := map[string]struct {
activeCluster *rayv1.RayCluster
rayServiceUpgradeStrategy rayv1.RayServiceUpgradeStrategy
kubeRayVersion string
updateRayClusterSpec bool
enableZeroDowntime bool
shouldPrepareNewCluster bool
updateKubeRayVersion bool
activeCluster *rayv1.RayCluster
rayServiceUpgradeType rayv1.RayServiceUpgradeType
kubeRayVersion string
updateRayClusterSpec bool
enableZeroDowntime bool
shouldPrepareNewCluster bool
updateKubeRayVersion bool
}{
// Test 1: Neither active nor pending clusters exist. The `markRestart` function will be called, so the `PendingServiceStatus.RayClusterName` should be set.
"Zero-downtime upgrade is enabled. Neither active nor pending clusters exist.": {
@@ -827,42 +829,42 @@ func TestReconcileRayCluster(t *testing.T) {
},
// Test 7: Zero downtime upgrade is enabled, but is enabled through the RayServiceSpec
"Zero-downtime upgrade enabled. The active cluster exist. Zero-downtime upgrade is triggered through RayServiceSpec.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: true,
rayServiceUpgradeStrategy: rayv1.NewCluster,
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: true,
rayServiceUpgradeType: rayv1.NewCluster,
},
// Test 8: Zero downtime upgrade is enabled. Env var is set to false but RayServiceSpec is set to NewCluster. Trigger the zero-downtime upgrade.
"Zero-downtime upgrade is enabled through RayServiceSpec and not through env var. Active cluster exist. Trigger the zero-downtime upgrade.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: false,
shouldPrepareNewCluster: true,
rayServiceUpgradeStrategy: rayv1.NewCluster,
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: false,
shouldPrepareNewCluster: true,
rayServiceUpgradeType: rayv1.NewCluster,
},
// Test 9: Zero downtime upgrade is disabled. Env var is set to true but RayServiceSpec is set to None.
"Zero-downtime upgrade is disabled. Env var is set to true but RayServiceSpec is set to None.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: false,
rayServiceUpgradeStrategy: rayv1.None,
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: true,
shouldPrepareNewCluster: false,
rayServiceUpgradeType: rayv1.None,
},
// Test 10: Zero downtime upgrade is enabled. Neither the env var nor the RayServiceSpec is set. Trigger the zero-downtime upgrade.
"Zero-downtime upgrade is enabled. Neither the env var nor the RayServiceSpec is set.": {
activeCluster: nil,
updateRayClusterSpec: true,
shouldPrepareNewCluster: true,
rayServiceUpgradeStrategy: "",
activeCluster: nil,
updateRayClusterSpec: true,
shouldPrepareNewCluster: true,
rayServiceUpgradeType: "",
},
// Test 11: Zero downtime upgrade is disabled. Both the env var and the RayServiceSpec is set to disable zero-downtime upgrade.
"Zero-downtime upgrade is disabled by both env var and RayServiceSpec.": {
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: false,
shouldPrepareNewCluster: false,
rayServiceUpgradeStrategy: rayv1.None,
activeCluster: activeCluster.DeepCopy(),
updateRayClusterSpec: true,
enableZeroDowntime: false,
shouldPrepareNewCluster: false,
rayServiceUpgradeType: rayv1.None,
},
}

@@ -888,8 +890,9 @@ func TestReconcileRayCluster(t *testing.T) {
Recorder: record.NewFakeRecorder(1),
}
service := rayService.DeepCopy()
if tc.rayServiceUpgradeStrategy != "" {
service.Spec.UpgradeStrategy = &tc.rayServiceUpgradeStrategy
service.Spec.UpgradeStrategy = &rayv1.RayServiceUpgradeStrategy{}
if tc.rayServiceUpgradeType != "" {
service.Spec.UpgradeStrategy.Type = &tc.rayServiceUpgradeType
}
if tc.updateRayClusterSpec {
service.Spec.RayClusterSpec.RayVersion = "new-version"

Some generated files are not rendered by default.
