Simplify DataScienceCluster conditions
This commit simplifies the DataScienceCluster status conditions by
removing the transient conditions (Reconciling, Deploying, etc.) and
replacing them with two conditions:

- Available reflects the work done by the controller: an Available
  condition with status False signals that the controller ran into
  trouble while reconciling the state of the cluster.
- Ready reflects the status of the components managed by the
  controller.

It is perfectly possible to be in a state where Available=False and
Ready=True: this can happen when the controller fails to run some of
its reconciliation logic because of e.g. a transient Kubernetes error,
while at the same time all the components are up and running.
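
A minimal sketch of how the two conditions can be derived during
reconciliation, using the conditionsv1 helpers that also show up in
the diff below (the function name, reconcileErr and notReady are
illustrative, not part of this commit):

    package datasciencecluster

    import (
        "fmt"
        "strings"

        conditionsv1 "github.com/openshift/custom-resource-status/conditions/v1"
        corev1 "k8s.io/api/core/v1"
    )

    // setClusterConditions is an illustrative helper: it derives the
    // Available condition from the outcome of the reconciliation
    // (reconcileErr) and the Ready condition from the list of managed
    // components that are not yet ready (notReady).
    func setClusterConditions(conditions *[]conditionsv1.Condition, reconcileErr error, notReady []string) {
        available := conditionsv1.Condition{
            Type:    conditionsv1.ConditionAvailable,
            Status:  corev1.ConditionTrue,
            Reason:  "Available",
            Message: "DataScienceCluster resource reconciled successfully",
        }
        if reconcileErr != nil {
            // The controller had trouble reconciling the cluster state.
            available.Status = corev1.ConditionFalse
            available.Reason = "Degraded"
            available.Message = reconcileErr.Error()
        }
        conditionsv1.SetStatusCondition(conditions, available)

        ready := conditionsv1.Condition{
            Type:    conditionsv1.ConditionType("Ready"),
            Status:  corev1.ConditionTrue,
            Reason:  "Ready",
            Message: "Ready",
        }
        if len(notReady) > 0 {
            // At least one managed component has not reported Ready yet.
            ready.Status = corev1.ConditionFalse
            ready.Reason = "NotReady"
            ready.Message = fmt.Sprintf("Some components are not ready: %s", strings.Join(notReady, ","))
        }
        conditionsv1.SetStatusCondition(conditions, ready)
    }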

The object would then look something like:

    apiVersion: datasciencecluster.opendatahub.io/v1
    kind: DataScienceCluster
    metadata:
      name: default-dsc
    spec: {}
    status:
      conditions:
      - lastHeartbeatTime: "2025-01-15T12:54:20Z"
        lastTransitionTime: "2025-01-15T12:54:20Z"
        message: DataScienceCluster resource reconciled successfully
        reason: Available
        status: "True"
        type: Available
      - lastHeartbeatTime: "2025-01-15T12:54:20Z"
        lastTransitionTime: "2025-01-15T12:54:20Z"
        message: Ready
        reason: Ready
        status: "True"
        type: Ready

It is possible to wait for the DataScienceCluster to be ready by using
the kubectl wait command:

    kubectl wait --for=condition=ready datascienceclusters.datasciencecluster.opendatahub.io default-dsc
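
Waiting for the controller to report a successful reconciliation works
the same way through the Available condition (assuming, as the example
above suggests, that kubectl matches the condition name
case-insensitively):

    kubectl wait --for=condition=available datascienceclusters.datasciencecluster.opendatahub.io default-dsc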
lburgazzoli committed Jan 16, 2025
1 parent 8622753 commit fd42704
Showing 10 changed files with 703 additions and 136 deletions.
4 changes: 2 additions & 2 deletions controllers/components/modelcontroller/modelcontroller.go
@@ -16,15 +16,15 @@ import (
     dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1"
     "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status"
     "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
-    "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
+    cr "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
     odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
     "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/annotations"
 )

 type componentHandler struct{}

 func init() { //nolint:gochecknoinits
-    componentsregistry.Add(&componentHandler{})
+    cr.Add(&componentHandler{})
 }

 func (s *componentHandler) GetName() string {
4 changes: 2 additions & 2 deletions controllers/components/modelmeshserving/modelmeshserving.go
@@ -16,15 +16,15 @@ import (
     dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1"
     "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status"
     "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
-    "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
+    cr "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
     odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy"
     "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/annotations"
 )

 type componentHandler struct{}

 func init() { //nolint:gochecknoinits
-    componentsregistry.Add(&componentHandler{})
+    cr.Add(&componentHandler{})
 }

 func (s *componentHandler) GetName() string {
138 changes: 9 additions & 129 deletions controllers/datasciencecluster/datasciencecluster_controller.go
@@ -23,12 +23,9 @@ import (
     "slices"
     "strings"

-    operatorv1 "github.com/openshift/api/operator/v1"
     conditionsv1 "github.com/openshift/custom-resource-status/conditions/v1"
     corev1 "k8s.io/api/core/v1"
     k8serr "k8s.io/apimachinery/pkg/api/errors"
-    "k8s.io/apimachinery/pkg/api/meta"
-    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/apimachinery/pkg/types"
     "k8s.io/client-go/tools/record"
@@ -39,12 +36,10 @@ import (
     logf "sigs.k8s.io/controller-runtime/pkg/log"
     "sigs.k8s.io/controller-runtime/pkg/reconcile"

-    "github.com/opendatahub-io/opendatahub-operator/v2/apis/common"
     componentApi "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1alpha1"
     dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1"
     dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1"
     "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status"
-    "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster"
     cr "github.com/opendatahub-io/opendatahub-operator/v2/pkg/componentsregistry"
     odhClient "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/client"
     "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/handlers"
@@ -97,14 +92,18 @@ func (r *DataScienceClusterReconciler) Reconcile(ctx context.Context, req ctrl.R
     // validate pre-requisites
     if err := r.validate(ctx, instance); err != nil {
         log.Info(err.Error())
-        status.SetCondition(&instance.Status.Conditions, "Degraded", status.ReconcileFailed, err.Error(), corev1.ConditionTrue)
+
+        status.SetCondition(
+            &instance.Status.Conditions,
+            string(conditionsv1.ConditionAvailable),
+            status.DegradedReason,
+            err.Error(),
+            corev1.ConditionFalse,
+        )
     }

     // deploy components
-    if err := r.reconcileComponents(ctx, instance, cr.DefaultRegistry()); err != nil {
-        log.Info(err.Error())
-        status.SetCondition(&instance.Status.Conditions, "Degraded", status.ReconcileFailed, err.Error(), corev1.ConditionTrue)
-    }
+    reconcileComponents(ctx, r.Client, instance, cr.DefaultRegistry())

     // keep conditions sorted
     slices.SortFunc(instance.Status.Conditions, func(a, b conditionsv1.Condition) int {
@@ -150,125 +149,6 @@ func (r *DataScienceClusterReconciler) validate(ctx context.Context, _ *dscv1.Da
     return nil
 }

-func (r *DataScienceClusterReconciler) reconcileComponents(
-    ctx context.Context,
-    instance *dscv1.DataScienceCluster,
-    reg *cr.Registry) error {
-    log := logf.FromContext(ctx).WithName("DataScienceCluster")
-
-    notReadyComponents := make([]string, 0)
-
-    // all DSC defined components
-    componentErrors := reg.ForEach(func(component cr.ComponentHandler) error {
-        ci, err := r.reconcileComponent(ctx, instance, component)
-        if err != nil {
-            return err
-        }
-
-        if !cr.IsManaged(component, instance) {
-            return nil
-        }
-
-        if !meta.IsStatusConditionTrue(ci.GetStatus().Conditions, status.ConditionTypeReady) {
-            notReadyComponents = append(notReadyComponents, component.GetName())
-        }
-
-        return nil
-    })
-
-    // Process errors for components
-    if componentErrors != nil {
-        log.Info("DataScienceCluster Deployment Incomplete.")
-
-        status.SetCompleteCondition(
-            &instance.Status.Conditions,
-            status.ReconcileCompletedWithComponentErrors,
-            fmt.Sprintf("DataScienceCluster resource reconciled with component errors: %v", componentErrors),
-        )
-
-        r.Recorder.Eventf(instance, corev1.EventTypeNormal,
-            "DataScienceClusterComponentFailures",
-            "DataScienceCluster instance %s created, but have some failures in component %v", instance.Name, componentErrors)
-    } else {
-        log.Info("DataScienceCluster Deployment Completed.")
-
-        // finalize reconciliation
-        status.SetCompleteCondition(
-            &instance.Status.Conditions,
-            status.ReconcileCompleted,
-            "DataScienceCluster resource reconciled successfully",
-        )
-    }
-
-    if len(notReadyComponents) != 0 {
-        instance.Status.Phase = status.PhaseNotReady
-
-        conditionsv1.SetStatusCondition(&instance.Status.Conditions, conditionsv1.Condition{
-            Type:    conditionsv1.ConditionType(status.ConditionTypeReady),
-            Status:  corev1.ConditionFalse,
-            Reason:  "NotReady",
-            Message: fmt.Sprintf("Some components are not ready: %s", strings.Join(notReadyComponents, ",")),
-        })
-    } else {
-        instance.Status.Phase = status.PhaseReady
-
-        conditionsv1.SetStatusCondition(&instance.Status.Conditions, conditionsv1.Condition{
-            Type:    conditionsv1.ConditionType(status.ConditionTypeReady),
-            Status:  corev1.ConditionTrue,
-            Reason:  "Ready",
-            Message: "Ready",
-        })
-    }
-
-    instance.Status.Release = cluster.GetRelease()
-    instance.Status.ObservedGeneration = instance.Generation
-
-    if componentErrors != nil {
-        return componentErrors
-    }
-
-    return nil
-}
-
-func (r *DataScienceClusterReconciler) reconcileComponent(
-    ctx context.Context,
-    instance *dscv1.DataScienceCluster,
-    component cr.ComponentHandler,
-) (common.PlatformObject, error) {
-    ms := component.GetManagementState(instance)
-    componentCR := component.NewCRObject(instance)
-
-    switch ms {
-    case operatorv1.Managed:
-        err := ctrl.SetControllerReference(instance, componentCR, r.Scheme)
-        if err != nil {
-            return nil, err
-        }
-        err = r.Client.Apply(ctx, componentCR, client.FieldOwner(fieldOwner), client.ForceOwnership)
-        if err != nil {
-            return nil, err
-        }
-    case operatorv1.Removed:
-        err := r.Client.Delete(ctx, componentCR, client.PropagationPolicy(metav1.DeletePropagationForeground))
-        if err != nil && !k8serr.IsNotFound(err) {
-            return nil, err
-        }
-    default:
-        return nil, fmt.Errorf("unsupported management state: %s", ms)
-    }
-
-    if instance.Status.InstalledComponents == nil {
-        instance.Status.InstalledComponents = make(map[string]bool)
-    }
-
-    err := component.UpdateDSCStatus(instance, componentCR)
-    if err != nil {
-        return nil, fmt.Errorf("failed to update status of DataScienceCluster component %s: %w", component.GetName(), err)
-    }
-
-    return componentCR, nil
-}

 func (r *DataScienceClusterReconciler) reportError(ctx context.Context, err error, instance *dscv1.DataScienceCluster, message string) {
     logf.FromContext(ctx).Error(err, message, "instance.Name", instance.Name)
     r.Recorder.Eventf(instance, corev1.EventTypeWarning, "DataScienceClusterReconcileError",