diff --git a/.github/prompts/refactor.prompt.md b/.github/prompts/refactor.prompt.md index a1db33e39..d2cb03620 100644 --- a/.github/prompts/refactor.prompt.md +++ b/.github/prompts/refactor.prompt.md @@ -1,12 +1,12 @@ -You are an expert Go developer tasked with refactoring a Go codebase. Apply systematic, incremental improvements using a top-down approach (Module -> File -> Function) while preserving all existing behavior. +You are an expert Go developer tasked with refactoring a Go codebase. Apply systematic, incremental improvements using a top-down approach (Module -> File -> Function) while preserving all existing behavior. Be thorough, and **DO NOT** stop until the codebase is totally clean. ## Workflow 1. **Load** — Load skills and relevant context 1. **Analyze** — Use `search` and `read` to understand the codebase structure before making changes -2. **Plan** — Create a prioritized todo list using `todo`, focusing on high-impact refactors first -3. **Execute** — Make one safe change at a time -4. **Validate** — Run `go test ./...` after each change; if tests fail, fix immediately before proceeding -5. **Polish** — Run `golangci-lint run --fix`, `jscpd --config .jscpd.json` and `go test ./...` to resolve any remaining issues +1. **Plan** — Create a prioritized todo list using `todo`, focusing on high-impact refactors first +1. **Execute** — Make one safe change at a time +1. **Validate** — Run `go test ./...` after each change; if tests fail, fix immediately before proceeding +1. **Polish** — Run `golangci-lint run --fix`, `jscpd --config .jscpd.json` and `go test ./...` to resolve any remaining issues Repeat steps 1-5 until the codebase is clean, has high cohesion, low coupling, and adheres to Go best practices. diff --git a/docs/src/content/docs/cli-flags/cluster/cluster-root.mdx b/docs/src/content/docs/cli-flags/cluster/cluster-root.mdx index 4f29ff9c2..7e11afe2e 100644 --- a/docs/src/content/docs/cli-flags/cluster/cluster-root.mdx +++ b/docs/src/content/docs/cli-flags/cluster/cluster-root.mdx @@ -21,6 +21,7 @@ Available Commands: list List clusters start Start a stopped cluster stop Stop a running cluster + update Update a cluster configuration Flags: -h, --help help for cluster diff --git a/docs/src/content/docs/cli-flags/cluster/cluster-update.mdx b/docs/src/content/docs/cli-flags/cluster/cluster-update.mdx new file mode 100644 index 000000000..b64a672a6 --- /dev/null +++ b/docs/src/content/docs/cli-flags/cluster/cluster-update.mdx @@ -0,0 +1,53 @@ +--- +title: "ksail cluster update" +description: "Update a Kubernetes cluster to match the current configuration." +--- + +{/* This page is auto-generated by .github/scripts/generate-cli-flags-docs.sh */} + +```text +Update a Kubernetes cluster to match the current configuration. + +This command applies changes from your ksail.yaml configuration to a running cluster. + +For Talos clusters, many configuration changes can be applied in-place without +cluster recreation (e.g., network settings, kubelet config, registry mirrors). + +For Kind/K3d clusters, in-place updates are more limited. Worker node scaling +is supported for K3d, but most other changes require cluster recreation. + +Changes are classified into three categories: + - In-Place: Applied without disruption + - Reboot-Required: Applied but may require node reboots + - Recreate-Required: Require full cluster recreation + +Use --dry-run to preview changes without applying them. 
+ +Usage: + ksail cluster update [flags] + +Flags: + --cert-manager CertManager Cert-Manager configuration (Enabled: install, Disabled: skip) + --cni CNI Container Network Interface (CNI) to use + -c, --context string Kubernetes context of cluster + --control-planes int32 Number of control-plane nodes (default 1) + --csi CSI Container Storage Interface (Default: use distribution, Enabled: install CSI, Disabled: skip CSI) + -d, --distribution Distribution Kubernetes distribution to use + --distribution-config string Configuration file for the distribution + --dry-run Preview changes without applying them + --force Skip confirmation prompt and proceed with cluster recreation + -g, --gitops-engine GitOpsEngine GitOps engine to use (None disables GitOps, Flux installs Flux controllers, ArgoCD installs Argo CD) (default None) + -h, --help help for update + --import-images string Path to tar archive with container images to import after cluster creation but before component installation + -k, --kubeconfig string Path to kubeconfig file (default "~/.kube/config") + --local-registry string Local registry specification: [user:pass@]host[:port][/path] (e.g., localhost:5050, ghcr.io/myorg, ${USER}:${PASS}@ghcr.io:443/org) + --metrics-server MetricsServer Metrics Server (Default: use distribution, Enabled: install, Disabled: uninstall) + --mirror-registry strings Configure mirror registries with format 'host=upstream' (e.g., docker.io=https://registry-1.docker.io) + -n, --name string Cluster name used for container names, registry names, and kubeconfig context + --policy-engine PolicyEngine Policy engine (None: skip, Kyverno: install Kyverno, Gatekeeper: install Gatekeeper) + --provider Provider Infrastructure provider backend (e.g., Docker) + --workers int32 Number of worker nodes + +Global Flags: + --benchmark Show per-activity benchmark output +``` diff --git a/pkg/cli/cmd/cluster/cluster.go b/pkg/cli/cmd/cluster/cluster.go index 8728938b3..5b5a43203 100644 --- a/pkg/cli/cmd/cluster/cluster.go +++ b/pkg/cli/cmd/cluster/cluster.go @@ -25,6 +25,7 @@ func NewClusterCmd(runtimeContainer *runtime.Runtime) *cobra.Command { cmd.AddCommand(NewInitCmd(runtimeContainer)) cmd.AddCommand(NewCreateCmd(runtimeContainer)) + cmd.AddCommand(NewUpdateCmd(runtimeContainer)) cmd.AddCommand(NewDeleteCmd(runtimeContainer)) cmd.AddCommand(NewStartCmd(runtimeContainer)) cmd.AddCommand(NewStopCmd(runtimeContainer)) diff --git a/pkg/cli/cmd/cluster/diff.go b/pkg/cli/cmd/cluster/diff.go new file mode 100644 index 000000000..e4a015ef7 --- /dev/null +++ b/pkg/cli/cmd/cluster/diff.go @@ -0,0 +1,364 @@ +package cluster + +import ( + "strconv" + + "github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1" + "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types" +) + +// DiffEngine computes configuration differences and classifies their impact. +type DiffEngine struct { + distribution v1alpha1.Distribution + provider v1alpha1.Provider +} + +// NewDiffEngine creates a new diff engine for the given distribution and provider. +func NewDiffEngine(distribution v1alpha1.Distribution, provider v1alpha1.Provider) *DiffEngine { + return &DiffEngine{ + distribution: distribution, + provider: provider, + } +} + +// ComputeDiff compares old and new ClusterSpec and categorizes all changes. 
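The diff engine is pure computation over two `ClusterSpec` values, so it can be exercised in isolation. Below is a minimal sketch of driving it, assuming the `pkg/cli/cmd/cluster` package is importable under the module path used elsewhere in this diff; the spec values are hypothetical, and only the distribution differs, so the change lands in the recreate-required bucket.

```go
package main

import (
	"fmt"

	"github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1"
	"github.com/devantler-tech/ksail/v5/pkg/cli/cmd/cluster"
)

func main() {
	// Hypothetical current and desired specs: only the distribution differs.
	oldSpec := &v1alpha1.ClusterSpec{
		Distribution: v1alpha1.DistributionTalos,
		Provider:     v1alpha1.ProviderDocker,
	}
	newSpec := &v1alpha1.ClusterSpec{
		Distribution: v1alpha1.DistributionK3s,
		Provider:     v1alpha1.ProviderDocker,
	}

	engine := cluster.NewDiffEngine(oldSpec.Distribution, oldSpec.Provider)
	result := engine.ComputeDiff(oldSpec, newSpec)

	// Each Change carries the field path, both values, and a human-readable reason.
	for _, change := range result.RecreateRequired {
		fmt.Printf("%s: %s -> %s (%s)\n",
			change.Field, change.OldValue, change.NewValue, change.Reason)
	}
}
```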
+func (e *DiffEngine) ComputeDiff(oldSpec, newSpec *v1alpha1.ClusterSpec) *types.UpdateResult { + result := &types.UpdateResult{ + InPlaceChanges: make([]types.Change, 0), + RebootRequired: make([]types.Change, 0), + RecreateRequired: make([]types.Change, 0), + } + + if oldSpec == nil || newSpec == nil { + return result + } + + // Check distribution change (always requires recreate) + e.checkDistributionChange(oldSpec, newSpec, result) + + // Check provider change (always requires recreate) + e.checkProviderChange(oldSpec, newSpec, result) + + // Check component changes (usually in-place via Helm) + e.checkCNIChange(oldSpec, newSpec, result) + e.checkCSIChange(oldSpec, newSpec, result) + e.checkMetricsServerChange(oldSpec, newSpec, result) + e.checkCertManagerChange(oldSpec, newSpec, result) + e.checkPolicyEngineChange(oldSpec, newSpec, result) + e.checkGitOpsEngineChange(oldSpec, newSpec, result) + e.checkLocalRegistryChange(oldSpec, newSpec, result) + + // Check distribution-specific options + e.checkVanillaOptionsChange(oldSpec, newSpec, result) + e.checkTalosOptionsChange(oldSpec, newSpec, result) + e.checkHetznerOptionsChange(oldSpec, newSpec, result) + + return result +} + +// checkDistributionChange checks if the distribution has changed. +func (e *DiffEngine) checkDistributionChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.Distribution != newSpec.Distribution { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.distribution", + OldValue: oldSpec.Distribution.String(), + NewValue: newSpec.Distribution.String(), + Category: types.ChangeCategoryRecreateRequired, + Reason: "changing the Kubernetes distribution requires cluster recreation", + }) + } +} + +// checkProviderChange checks if the provider has changed. +func (e *DiffEngine) checkProviderChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.Provider != newSpec.Provider { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.provider", + OldValue: oldSpec.Provider.String(), + NewValue: newSpec.Provider.String(), + Category: types.ChangeCategoryRecreateRequired, + Reason: "changing the infrastructure provider requires cluster recreation", + }) + } +} + +// checkCNIChange checks if the CNI has changed. +func (e *DiffEngine) checkCNIChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.CNI != newSpec.CNI { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.cni", + OldValue: oldSpec.CNI.String(), + NewValue: newSpec.CNI.String(), + Category: types.ChangeCategoryInPlace, + Reason: "CNI can be switched via Helm upgrade/uninstall", + }) + } +} + +// checkCSIChange checks if the CSI has changed. +func (e *DiffEngine) checkCSIChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.CSI != newSpec.CSI { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.csi", + OldValue: oldSpec.CSI.String(), + NewValue: newSpec.CSI.String(), + Category: types.ChangeCategoryInPlace, + Reason: "CSI can be switched via Helm install/uninstall", + }) + } +} + +// checkMetricsServerChange checks if the metrics server setting has changed. 
+func (e *DiffEngine) checkMetricsServerChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.MetricsServer != newSpec.MetricsServer { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.metricsServer", + OldValue: oldSpec.MetricsServer.String(), + NewValue: newSpec.MetricsServer.String(), + Category: types.ChangeCategoryInPlace, + Reason: "metrics-server can be installed/uninstalled via Helm", + }) + } +} + +// checkCertManagerChange checks if cert-manager setting has changed. +func (e *DiffEngine) checkCertManagerChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.CertManager != newSpec.CertManager { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.certManager", + OldValue: oldSpec.CertManager.String(), + NewValue: newSpec.CertManager.String(), + Category: types.ChangeCategoryInPlace, + Reason: "cert-manager can be installed/uninstalled via Helm", + }) + } +} + +// checkPolicyEngineChange checks if the policy engine has changed. +func (e *DiffEngine) checkPolicyEngineChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.PolicyEngine != newSpec.PolicyEngine { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.policyEngine", + OldValue: oldSpec.PolicyEngine.String(), + NewValue: newSpec.PolicyEngine.String(), + Category: types.ChangeCategoryInPlace, + Reason: "policy engine can be switched via Helm install/uninstall", + }) + } +} + +// checkGitOpsEngineChange checks if the GitOps engine has changed. +func (e *DiffEngine) checkGitOpsEngineChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.GitOpsEngine != newSpec.GitOpsEngine { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.gitOpsEngine", + OldValue: oldSpec.GitOpsEngine.String(), + NewValue: newSpec.GitOpsEngine.String(), + Category: types.ChangeCategoryInPlace, + Reason: "GitOps engine can be switched via Helm install/uninstall", + }) + } +} + +// checkLocalRegistryChange checks if local registry config has changed. 
+func (e *DiffEngine) checkLocalRegistryChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if oldSpec.LocalRegistry.Registry != newSpec.LocalRegistry.Registry { + // For Kind, registry changes require recreate (containerd config is baked in) + // For Talos/K3d, registry mirrors can be updated in-place + switch e.distribution { + case v1alpha1.DistributionVanilla: + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.localRegistry.registry", + OldValue: oldSpec.LocalRegistry.Registry, + NewValue: newSpec.LocalRegistry.Registry, + Category: types.ChangeCategoryRecreateRequired, + Reason: "Kind requires cluster recreate to change containerd registry config", + }) + case v1alpha1.DistributionTalos: + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.localRegistry.registry", + OldValue: oldSpec.LocalRegistry.Registry, + NewValue: newSpec.LocalRegistry.Registry, + Category: types.ChangeCategoryInPlace, + Reason: "Talos supports .machine.registries updates without reboot", + }) + case v1alpha1.DistributionK3s: + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.localRegistry.registry", + OldValue: oldSpec.LocalRegistry.Registry, + NewValue: newSpec.LocalRegistry.Registry, + Category: types.ChangeCategoryInPlace, + Reason: "K3d supports registries.yaml updates", + }) + } + } +} + +// checkVanillaOptionsChange checks Vanilla (Kind) specific option changes. +func (e *DiffEngine) checkVanillaOptionsChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if e.distribution != v1alpha1.DistributionVanilla { + return + } + + // MirrorsDir change requires recreate (containerd config is baked at creation) + if oldSpec.Vanilla.MirrorsDir != newSpec.Vanilla.MirrorsDir { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.vanilla.mirrorsDir", + OldValue: oldSpec.Vanilla.MirrorsDir, + NewValue: newSpec.Vanilla.MirrorsDir, + Category: types.ChangeCategoryRecreateRequired, + Reason: "Kind containerd mirror config is baked at cluster creation", + }) + } +} + +// checkTalosOptionsChange checks Talos-specific option changes. 
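The local-registry check above is the clearest case of classification depending on the engine's distribution rather than on the field itself: the same edit is recreate-required on Kind but in-place on Talos. A short sketch of that behaviour; the registry endpoints are hypothetical and the copy-and-tweak spec construction is only for illustration.

```go
package main

import (
	"fmt"

	"github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1"
	"github.com/devantler-tech/ksail/v5/pkg/cli/cmd/cluster"
)

func main() {
	oldSpec := &v1alpha1.ClusterSpec{Provider: v1alpha1.ProviderDocker}
	oldSpec.LocalRegistry.Registry = "localhost:5050"

	newSpec := *oldSpec
	newSpec.LocalRegistry.Registry = "localhost:5051" // hypothetical new endpoint

	// Kind bakes containerd registry config at creation, so this is recreate-required.
	kind := cluster.NewDiffEngine(v1alpha1.DistributionVanilla, v1alpha1.ProviderDocker)
	fmt.Println("kind recreate-required:", len(kind.ComputeDiff(oldSpec, &newSpec).RecreateRequired))

	// Talos can patch .machine.registries without a reboot, so the same edit is in-place.
	talos := cluster.NewDiffEngine(v1alpha1.DistributionTalos, v1alpha1.ProviderDocker)
	fmt.Println("talos in-place:", len(talos.ComputeDiff(oldSpec, &newSpec).InPlaceChanges))
}
```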
+func (e *DiffEngine) checkTalosOptionsChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if e.distribution != v1alpha1.DistributionTalos { + return + } + + // Control plane count change - can scale via provider + if oldSpec.Talos.ControlPlanes != newSpec.Talos.ControlPlanes { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.talos.controlPlanes", + OldValue: strconv.Itoa(int(oldSpec.Talos.ControlPlanes)), + NewValue: strconv.Itoa(int(newSpec.Talos.ControlPlanes)), + Category: types.ChangeCategoryInPlace, + Reason: "Talos supports adding/removing control-plane nodes via provider", + }) + } + + // Worker count change - can scale via provider + if oldSpec.Talos.Workers != newSpec.Talos.Workers { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.talos.workers", + OldValue: strconv.Itoa(int(oldSpec.Talos.Workers)), + NewValue: strconv.Itoa(int(newSpec.Talos.Workers)), + Category: types.ChangeCategoryInPlace, + Reason: "Talos supports adding/removing worker nodes via provider", + }) + } + + // ISO change doesn't affect existing nodes, only new ones + if oldSpec.Talos.ISO != newSpec.Talos.ISO { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.talos.iso", + OldValue: strconv.FormatInt(oldSpec.Talos.ISO, 10), + NewValue: strconv.FormatInt(newSpec.Talos.ISO, 10), + Category: types.ChangeCategoryInPlace, + Reason: "ISO change only affects newly provisioned nodes", + }) + } +} + +// checkHetznerOptionsChange checks Hetzner-specific option changes. +// +//nolint:funlen // Multiple Hetzner options need to be checked sequentially +func (e *DiffEngine) checkHetznerOptionsChange( + oldSpec, newSpec *v1alpha1.ClusterSpec, + result *types.UpdateResult, +) { + if e.provider != v1alpha1.ProviderHetzner { + return + } + + // Server type changes require node replacement + if oldSpec.Hetzner.ControlPlaneServerType != newSpec.Hetzner.ControlPlaneServerType { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.hetzner.controlPlaneServerType", + OldValue: oldSpec.Hetzner.ControlPlaneServerType, + NewValue: newSpec.Hetzner.ControlPlaneServerType, + Category: types.ChangeCategoryRecreateRequired, + Reason: "existing control-plane servers cannot change VM type", + }) + } + + // Worker server type - new workers use new type, existing keep old + if oldSpec.Hetzner.WorkerServerType != newSpec.Hetzner.WorkerServerType { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.hetzner.workerServerType", + OldValue: oldSpec.Hetzner.WorkerServerType, + NewValue: newSpec.Hetzner.WorkerServerType, + Category: types.ChangeCategoryInPlace, + Reason: "new worker servers will use the new type; existing workers unchanged", + }) + } + + // Location change requires full recreate + if oldSpec.Hetzner.Location != newSpec.Hetzner.Location { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.hetzner.location", + OldValue: oldSpec.Hetzner.Location, + NewValue: newSpec.Hetzner.Location, + Category: types.ChangeCategoryRecreateRequired, + Reason: "datacenter location cannot be changed for existing servers", + }) + } + + // Network name change requires recreate + if oldSpec.Hetzner.NetworkName != newSpec.Hetzner.NetworkName { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.hetzner.networkName", + OldValue: oldSpec.Hetzner.NetworkName, + 
NewValue: newSpec.Hetzner.NetworkName, + Category: types.ChangeCategoryRecreateRequired, + Reason: "cannot migrate servers between networks", + }) + } + + // Network CIDR change requires recreate + if oldSpec.Hetzner.NetworkCIDR != newSpec.Hetzner.NetworkCIDR { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "cluster.hetzner.networkCidr", + OldValue: oldSpec.Hetzner.NetworkCIDR, + NewValue: newSpec.Hetzner.NetworkCIDR, + Category: types.ChangeCategoryRecreateRequired, + Reason: "network CIDR change requires PKI regeneration", + }) + } + + // SSH key change only affects new servers + if oldSpec.Hetzner.SSHKeyName != newSpec.Hetzner.SSHKeyName { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "cluster.hetzner.sshKeyName", + OldValue: oldSpec.Hetzner.SSHKeyName, + NewValue: newSpec.Hetzner.SSHKeyName, + Category: types.ChangeCategoryInPlace, + Reason: "SSH key change only affects newly provisioned servers", + }) + } +} diff --git a/pkg/cli/cmd/cluster/update.go b/pkg/cli/cmd/cluster/update.go new file mode 100644 index 000000000..e615094a2 --- /dev/null +++ b/pkg/cli/cmd/cluster/update.go @@ -0,0 +1,516 @@ +package cluster + +import ( + "fmt" + "strings" + + "github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1" + "github.com/devantler-tech/ksail/v5/pkg/cli/annotations" + "github.com/devantler-tech/ksail/v5/pkg/cli/helpers" + "github.com/devantler-tech/ksail/v5/pkg/cli/lifecycle" + "github.com/devantler-tech/ksail/v5/pkg/cli/setup/localregistry" + runtime "github.com/devantler-tech/ksail/v5/pkg/di" + ksailconfigmanager "github.com/devantler-tech/ksail/v5/pkg/io/config-manager/ksail" + clusterprovisioner "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster" + clustererrors "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/errors" + "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types" + "github.com/devantler-tech/ksail/v5/pkg/utils/notify" + "github.com/spf13/cobra" +) + +// NewUpdateCmd creates the cluster update command. +// The update command applies configuration changes to a running cluster. +// It supports in-place updates where possible and falls back to recreation when necessary. +func NewUpdateCmd(runtimeContainer *runtime.Runtime) *cobra.Command { + cmd := &cobra.Command{ + Use: "update", + Short: "Update a cluster configuration", + Long: `Update a Kubernetes cluster to match the current configuration. + +This command applies changes from your ksail.yaml configuration to a running cluster. + +For Talos clusters, many configuration changes can be applied in-place without +cluster recreation (e.g., network settings, kubelet config, registry mirrors). + +For Kind/K3d clusters, in-place updates are more limited. Worker node scaling +is supported for K3d, but most other changes require cluster recreation. 
+ +Changes are classified into three categories: + - In-Place: Applied without disruption + - Reboot-Required: Applied but may require node reboots + - Recreate-Required: Require full cluster recreation + +Use --dry-run to preview changes without applying them.`, + SilenceUsage: true, + Annotations: map[string]string{ + annotations.AnnotationPermission: "write", + }, + } + + // Use the same field selectors as create command + fieldSelectors := ksailconfigmanager.DefaultClusterFieldSelectors() + fieldSelectors = append(fieldSelectors, ksailconfigmanager.DefaultProviderFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.DefaultCNIFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.DefaultMetricsServerFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.DefaultCertManagerFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.DefaultPolicyEngineFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.DefaultCSIFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.DefaultImportImagesFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.ControlPlanesFieldSelector()) + fieldSelectors = append(fieldSelectors, ksailconfigmanager.WorkersFieldSelector()) + + cfgManager := ksailconfigmanager.NewCommandConfigManager(cmd, fieldSelectors) + + cmd.Flags().StringSlice("mirror-registry", []string{}, + "Configure mirror registries with format 'host=upstream' (e.g., docker.io=https://registry-1.docker.io)") + _ = cfgManager.Viper.BindPFlag("mirror-registry", cmd.Flags().Lookup("mirror-registry")) + + cmd.Flags().StringP("name", "n", "", + "Cluster name used for container names, registry names, and kubeconfig context") + _ = cfgManager.Viper.BindPFlag("name", cmd.Flags().Lookup("name")) + + cmd.Flags().Bool("force", false, + "Skip confirmation prompt and proceed with cluster recreation") + _ = cfgManager.Viper.BindPFlag("force", cmd.Flags().Lookup("force")) + + cmd.Flags().Bool("dry-run", false, + "Preview changes without applying them") + _ = cfgManager.Viper.BindPFlag("dry-run", cmd.Flags().Lookup("dry-run")) + + cmd.RunE = lifecycle.WrapHandler(runtimeContainer, cfgManager, handleUpdateRunE) + + return cmd +} + +// handleUpdateRunE executes the cluster update logic. +// It computes a diff between current and desired configuration, then applies +// changes in-place where possible, falling back to cluster recreation when necessary. 
+// +//nolint:cyclop,funlen // Update logic has inherent complexity from multiple code paths +func handleUpdateRunE( + cmd *cobra.Command, + cfgManager *ksailconfigmanager.ConfigManager, + deps lifecycle.Deps, +) error { + deps.Timer.Start() + + outputTimer := helpers.MaybeTimer(cmd, deps.Timer) + + // Load cluster configuration + ctx, err := loadClusterConfiguration(cfgManager, outputTimer) + if err != nil { + return err + } + + // Apply cluster name override from --name flag if provided + nameOverride := cfgManager.Viper.GetString("name") + if nameOverride != "" { + validationErr := v1alpha1.ValidateClusterName(nameOverride) + if validationErr != nil { + return fmt.Errorf("invalid --name flag: %w", validationErr) + } + + err = applyClusterNameOverride(ctx, nameOverride) + if err != nil { + return err + } + } + + // Validate distribution x provider combination + err = ctx.ClusterCfg.Spec.Cluster.Provider.ValidateForDistribution( + ctx.ClusterCfg.Spec.Cluster.Distribution, + ) + if err != nil { + return fmt.Errorf("invalid configuration: %w", err) + } + + // Get cluster name for messaging + clusterName := resolveClusterNameFromContext(ctx) + + // Create provisioner + factory := getProvisionerFactory(ctx) + + provisioner, _, err := factory.Create( + cmd.Context(), + ctx.ClusterCfg, + ) + if err != nil { + return fmt.Errorf("failed to create provisioner: %w", err) + } + + // Check if cluster exists + exists, err := provisioner.Exists(cmd.Context(), clusterName) + if err != nil { + return fmt.Errorf("failed to check cluster existence: %w", err) + } + + if !exists { + return fmt.Errorf("%w: %q", clustererrors.ErrClusterDoesNotExist, clusterName) + } + + // Get flags + dryRun := cfgManager.Viper.GetBool("dry-run") + force := cfgManager.Viper.GetBool("force") + + // Check if provisioner supports updates + updater, supportsUpdate := provisioner.(clusterprovisioner.ClusterUpdater) + if !supportsUpdate { + // Fall back to recreate flow + return executeRecreateFlow(cmd, cfgManager, ctx, deps, clusterName, force) + } + + // Get current configuration from running cluster + currentSpec, err := updater.GetCurrentConfig() + if err != nil { + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: "Could not retrieve current cluster configuration, falling back to recreate", + Writer: cmd.OutOrStderr(), + }) + + return executeRecreateFlow(cmd, cfgManager, ctx, deps, clusterName, force) + } + + // Compute diff between current and desired configuration + diff, err := updater.DiffConfig( + cmd.Context(), + clusterName, + currentSpec, + &ctx.ClusterCfg.Spec.Cluster, + ) + if err != nil { + return fmt.Errorf("failed to compute configuration diff: %w", err) + } + + // Display changes summary + displayChangesSummary(cmd, diff) + + // If dry-run, stop here + if dryRun { + notify.WriteMessage(notify.Message{ + Type: notify.InfoType, + Content: "Dry run complete. 
No changes applied.", + Writer: cmd.OutOrStdout(), + }) + + return nil + } + + // If there are recreate-required changes, need confirmation + if diff.HasRecreateRequired() { + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: fmt.Sprintf( + "%d changes require cluster recreation", + len(diff.RecreateRequired), + ), + Writer: cmd.OutOrStderr(), + }) + + return executeRecreateFlow(cmd, cfgManager, ctx, deps, clusterName, force) + } + + // Apply in-place and reboot-required changes + if diff.HasInPlaceChanges() || diff.HasRebootRequired() { + updateOpts := types.UpdateOptions{ + DryRun: false, + RollingReboot: true, + } + + notify.WriteMessage(notify.Message{ + Type: notify.ActivityType, + Emoji: "🔄", + Content: "applying configuration changes in-place", + Timer: outputTimer, + Writer: cmd.OutOrStdout(), + }) + + result, err := updater.Update( + cmd.Context(), + clusterName, + currentSpec, + &ctx.ClusterCfg.Spec.Cluster, + updateOpts, + ) + if err != nil { + return fmt.Errorf("failed to apply updates: %w", err) + } + + // Display results + if len(result.AppliedChanges) > 0 { + notify.WriteMessage(notify.Message{ + Type: notify.SuccessType, + Content: fmt.Sprintf("applied %d changes successfully", len(result.AppliedChanges)), + Timer: outputTimer, + Writer: cmd.OutOrStdout(), + }) + } + + if len(result.FailedChanges) > 0 { + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: fmt.Sprintf("%d changes failed to apply", len(result.FailedChanges)), + Writer: cmd.OutOrStderr(), + }) + + for _, change := range result.FailedChanges { + notify.WriteMessage(notify.Message{ + Type: notify.ErrorType, + Content: fmt.Sprintf(" - %s: %s", change.Field, change.Reason), + Writer: cmd.OutOrStderr(), + }) + } + } + } else { + notify.WriteMessage(notify.Message{ + Type: notify.InfoType, + Content: "No changes detected", + Writer: cmd.OutOrStdout(), + }) + } + + return nil +} + +// displayChangesSummary outputs a human-readable summary of configuration changes. +func displayChangesSummary(cmd *cobra.Command, diff *types.UpdateResult) { + totalChanges := len(diff.InPlaceChanges) + len(diff.RebootRequired) + len(diff.RecreateRequired) + + if totalChanges == 0 { + return + } + + notify.WriteMessage(notify.Message{ + Type: notify.InfoType, + Content: fmt.Sprintf("Detected %d configuration changes:", totalChanges), + Writer: cmd.OutOrStdout(), + }) + + for _, change := range diff.InPlaceChanges { + notify.WriteMessage(notify.Message{ + Type: notify.InfoType, + Content: fmt.Sprintf(" ✓ %s (in-place)", change.Field), + Writer: cmd.OutOrStdout(), + }) + } + + for _, change := range diff.RebootRequired { + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: fmt.Sprintf(" ⚡ %s (reboot required)", change.Field), + Writer: cmd.OutOrStdout(), + }) + } + + for _, change := range diff.RecreateRequired { + notify.WriteMessage(notify.Message{ + Type: notify.ErrorType, + Content: fmt.Sprintf(" ✗ %s (recreate required)", change.Field), + Writer: cmd.OutOrStdout(), + }) + } +} + +// executeRecreateFlow performs the delete + create flow with confirmation. 
+// +//nolint:funlen // Recreate flow has sequential steps that are clearer kept together +func executeRecreateFlow( + cmd *cobra.Command, + cfgManager *ksailconfigmanager.ConfigManager, + ctx *localregistry.Context, + deps lifecycle.Deps, + clusterName string, + force bool, +) error { + outputTimer := helpers.MaybeTimer(cmd, deps.Timer) + + // Show warning + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: "Update will delete and recreate the cluster", + Writer: cmd.OutOrStderr(), + }) + + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: "All workloads and data will be lost", + Writer: cmd.OutOrStderr(), + }) + + // Get confirmation unless --force is set + if !force { + confirmed := promptForUpdateConfirmation(cmd, clusterName) + if !confirmed { + notify.WriteMessage(notify.Message{ + Type: notify.InfoType, + Content: "Update cancelled", + Writer: cmd.OutOrStdout(), + }) + + return nil + } + } + + // Create provisioner for delete + factory := getProvisionerFactory(ctx) + + provisioner, _, err := factory.Create(cmd.Context(), ctx.ClusterCfg) + if err != nil { + return fmt.Errorf("failed to create provisioner: %w", err) + } + + // Execute delete + notify.WriteMessage(notify.Message{ + Type: notify.ActivityType, + Emoji: "🗑️", + Content: "deleting existing cluster", + Timer: outputTimer, + Writer: cmd.OutOrStdout(), + }) + + err = provisioner.Delete(cmd.Context(), clusterName) + if err != nil { + return fmt.Errorf("failed to delete existing cluster: %w", err) + } + + notify.WriteMessage(notify.Message{ + Type: notify.SuccessType, + Content: "cluster deleted", + Timer: outputTimer, + Writer: cmd.OutOrStdout(), + }) + + // Execute create + return executeClusterCreation(cmd, cfgManager, ctx, deps) +} + +// executeClusterCreation performs the cluster creation workflow. +// This extracts the core creation logic to be reused by both create and update commands. 
+// +//nolint:funlen // Creation workflow has sequential steps that are clearer kept together +func executeClusterCreation( + cmd *cobra.Command, + cfgManager *ksailconfigmanager.ConfigManager, + ctx *localregistry.Context, + deps lifecycle.Deps, +) error { + _ = helpers.MaybeTimer(cmd, deps.Timer) // Timer value unused in this function + + localDeps := getLocalRegistryDeps() + + err := ensureLocalRegistriesReady( + cmd, + ctx, + deps, + cfgManager, + localDeps, + ) + if err != nil { + return err + } + + setupK3dMetricsServer(ctx.ClusterCfg, ctx.K3dConfig) + SetupK3dCSI(ctx.ClusterCfg, ctx.K3dConfig) + + deps.Factory = getProvisionerFactory(ctx) + + err = executeClusterLifecycle(cmd, ctx.ClusterCfg, deps) + if err != nil { + return err + } + + configureRegistryMirrorsInClusterWithWarning( + cmd, + ctx, + deps, + cfgManager, + ) + + err = localregistry.ExecuteStage( + cmd, + ctx, + deps, + localregistry.StageConnect, + localDeps, + ) + if err != nil { + return fmt.Errorf("failed to connect local registry: %w", err) + } + + err = localregistry.WaitForK3dLocalRegistryReady( + cmd, + ctx.ClusterCfg, + ctx.K3dConfig, + localDeps.DockerInvoker, + ) + if err != nil { + return fmt.Errorf("failed to wait for local registry: %w", err) + } + + // Import cached images if configured + importPath := ctx.ClusterCfg.Spec.Cluster.ImportImages + if importPath != "" { + if ctx.ClusterCfg.Spec.Cluster.Distribution == v1alpha1.DistributionTalos { + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: "image import is not supported for Talos clusters; ignoring --import-images value %q", + Args: []any{importPath}, + Writer: cmd.OutOrStderr(), + }) + } else { + err = importCachedImages(cmd, ctx, importPath, deps.Timer) + if err != nil { + notify.WriteMessage(notify.Message{ + Type: notify.WarningType, + Content: "failed to import images from %s: %v", + Args: []any{importPath, err}, + Writer: cmd.OutOrStderr(), + }) + } + } + } + + return handlePostCreationSetup(cmd, ctx.ClusterCfg, deps.Timer) +} + +// promptForUpdateConfirmation prompts the user to confirm cluster update. +// Returns true if the user confirms, false otherwise. +func promptForUpdateConfirmation(cmd *cobra.Command, clusterName string) bool { + notify.WriteMessage(notify.Message{ + Type: notify.InfoType, + Content: fmt.Sprintf("To proceed with updating cluster %q, type 'yes':", clusterName), + Writer: cmd.OutOrStdout(), + }) + + var response string + + _, err := fmt.Fscanln(cmd.InOrStdin(), &response) + if err != nil { + return false + } + + return strings.TrimSpace(strings.ToLower(response)) == "yes" +} + +// getProvisionerFactory returns the cluster provisioner factory, using any override if set. 
+// +//nolint:ireturn // Factory interface is appropriate for dependency injection +func getProvisionerFactory(ctx *localregistry.Context) clusterprovisioner.Factory { + clusterProvisionerFactoryMu.RLock() + + factoryOverride := clusterProvisionerFactoryOverride + + clusterProvisionerFactoryMu.RUnlock() + + if factoryOverride != nil { + return factoryOverride + } + + return clusterprovisioner.DefaultFactory{ + DistributionConfig: &clusterprovisioner.DistributionConfig{ + Kind: ctx.KindConfig, + K3d: ctx.K3dConfig, + Talos: ctx.TalosConfig, + }, + } +} diff --git a/pkg/cli/cmd/cluster/update_test.go b/pkg/cli/cmd/cluster/update_test.go new file mode 100644 index 000000000..e5f178550 --- /dev/null +++ b/pkg/cli/cmd/cluster/update_test.go @@ -0,0 +1,115 @@ +//nolint:testpackage // Testing internal functions requires same package +package cluster + +import ( + "bytes" + "testing" + + runtime "github.com/devantler-tech/ksail/v5/pkg/di" +) + +func TestNewUpdateCmd(t *testing.T) { + t.Parallel() + + runtimeContainer := &runtime.Runtime{} + cmd := NewUpdateCmd(runtimeContainer) + + // Verify command basics + if cmd.Use != "update" { + t.Errorf("expected Use to be 'update', got %q", cmd.Use) + } + + if cmd.Short == "" { + t.Error("expected Short description to be set") + } + + if cmd.Long == "" { + t.Error("expected Long description to be set") + } + + // Verify flags + forceFlag := cmd.Flags().Lookup("force") + if forceFlag == nil { + t.Error("expected --force flag to exist") + } + + nameFlag := cmd.Flags().Lookup("name") + if nameFlag == nil { + t.Error("expected --name flag to exist") + } + + mirrorRegistryFlag := cmd.Flags().Lookup("mirror-registry") + if mirrorRegistryFlag == nil { + t.Error("expected --mirror-registry flag to exist") + } + + dryRunFlag := cmd.Flags().Lookup("dry-run") + if dryRunFlag == nil { + t.Error("expected --dry-run flag to exist") + } +} + +func TestPromptForUpdateConfirmation(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + input string + expected bool + }{ + { + name: "user confirms with 'yes'", + input: "yes\n", + expected: true, + }, + { + name: "user confirms with 'YES'", + input: "YES\n", + expected: true, + }, + { + name: "user confirms with 'Yes'", + input: "Yes\n", + expected: true, + }, + { + name: "user rejects with 'no'", + input: "no\n", + expected: false, + }, + { + name: "user rejects with empty input", + input: "\n", + expected: false, + }, + { + name: "user rejects with random text", + input: "maybe\n", + expected: false, + }, + } + + for _, testCase := range tests { + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + + runtimeContainer := &runtime.Runtime{} + cmd := NewUpdateCmd(runtimeContainer) + + // Set up input/output buffers + inputBuf := bytes.NewBufferString(testCase.input) + outputBuf := &bytes.Buffer{} + + cmd.SetIn(inputBuf) + cmd.SetOut(outputBuf) + cmd.SetErr(outputBuf) + + // Test prompt function + result := promptForUpdateConfirmation(cmd, "test-cluster") + + if result != testCase.expected { + t.Errorf("expected %v, got %v", testCase.expected, result) + } + }) + } +} diff --git a/pkg/svc/provisioner/cluster/errors/errors.go b/pkg/svc/provisioner/cluster/errors/errors.go index eacdea466..8ced7b5ea 100644 --- a/pkg/svc/provisioner/cluster/errors/errors.go +++ b/pkg/svc/provisioner/cluster/errors/errors.go @@ -32,3 +32,26 @@ var ErrUnsupportedProvider = errors.New("unsupported provider") // ErrMissingDistributionConfig is returned when no pre-loaded distribution config is provided. 
var ErrMissingDistributionConfig = errors.New("missing distribution config") + +// ErrRecreationRequired is returned when configuration changes require cluster recreation. +var ErrRecreationRequired = errors.New("cluster recreation required; use delete + create instead") + +// ErrConfigNil is returned when a required configuration is nil. +var ErrConfigNil = errors.New("config is nil") + +// ErrNoProviderConfigured is returned when no infrastructure provider is configured for an operation. +var ErrNoProviderConfigured = errors.New("no provider configured to get node IPs") + +// ErrDockerClientNotConfigured is returned when Docker client is required but not configured. +var ErrDockerClientNotConfigured = errors.New("docker client not configured") + +// ErrClusterDoesNotExist is returned when attempting to update a cluster that doesn't exist. +var ErrClusterDoesNotExist = errors.New( + "cluster does not exist; use 'ksail cluster create' to create a new cluster", +) + +// ErrTalosConfigRequired is returned when TalosConfig credentials are required but not available. +// This occurs when attempting to update a Talos cluster without valid PKI credentials. +var ErrTalosConfigRequired = errors.New( + "TalosConfig required for cluster updates but not available", +) diff --git a/pkg/svc/provisioner/cluster/k3d/update.go b/pkg/svc/provisioner/cluster/k3d/update.go new file mode 100644 index 000000000..c158b48ed --- /dev/null +++ b/pkg/svc/provisioner/cluster/k3d/update.go @@ -0,0 +1,127 @@ +package k3dprovisioner + +import ( + "context" + "fmt" + + "github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1" + clustererrors "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/errors" + "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types" +) + +// Update applies configuration changes to a running K3d cluster. +// K3d supports: +// - Adding/removing worker nodes via k3d node commands +// - Registry configuration updates via registries.yaml +// +// It does NOT support adding/removing server (control-plane) nodes after creation. +func (k *K3dClusterProvisioner) Update( + ctx context.Context, + name string, + oldSpec, newSpec *v1alpha1.ClusterSpec, + opts types.UpdateOptions, +) (*types.UpdateResult, error) { + if oldSpec == nil || newSpec == nil { + return &types.UpdateResult{}, nil + } + + // Compute diff + diff, err := k.DiffConfig(ctx, name, oldSpec, newSpec) + if err != nil { + return nil, fmt.Errorf("failed to compute config diff: %w", err) + } + + if opts.DryRun { + return diff, nil + } + + result := &types.UpdateResult{ + InPlaceChanges: diff.InPlaceChanges, + RebootRequired: diff.RebootRequired, + RecreateRequired: diff.RecreateRequired, + AppliedChanges: make([]types.Change, 0), + FailedChanges: make([]types.Change, 0), + } + + // If there are recreate-required changes, we cannot handle them + if diff.HasRecreateRequired() { + return result, fmt.Errorf("%w: %d changes require restart", + clustererrors.ErrRecreationRequired, len(diff.RecreateRequired)) + } + + clusterName := k.resolveName(name) + + // Handle worker node scaling + err = k.applyWorkerScaling(ctx, clusterName, oldSpec, newSpec, result) + if err != nil { + return result, fmt.Errorf("failed to scale workers: %w", err) + } + + return result, nil +} + +// DiffConfig computes the differences between current and desired configurations. 
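`ErrRecreationRequired` is returned wrapped with `%w` by the `Update` implementation above, so callers can detect it with `errors.Is` and fall back to the delete + create flow. A hypothetical caller-side sketch; the helper name and the simulated error in `main` are illustrative only.

```go
package main

import (
	"errors"
	"fmt"

	clustererrors "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/errors"
	"github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types"
)

// handleUpdateErr branches on the sentinel errors after an Update attempt.
func handleUpdateErr(result *types.UpdateResult, err error) {
	switch {
	case err == nil:
		fmt.Printf("applied %d change(s)\n", len(result.AppliedChanges))
	case errors.Is(err, clustererrors.ErrRecreationRequired):
		// The provisioner refused the in-place path; fall back to delete + create.
		fmt.Printf("%d change(s) need cluster recreation\n", len(result.RecreateRequired))
	case errors.Is(err, clustererrors.ErrClusterDoesNotExist):
		fmt.Println("no cluster to update; run 'ksail cluster create' first")
	default:
		fmt.Println("update failed:", err)
	}
}

func main() {
	// Simulate the wrapped error shape produced by the K3d Update above.
	err := fmt.Errorf("%w: 2 changes require recreation", clustererrors.ErrRecreationRequired)
	handleUpdateErr(&types.UpdateResult{RecreateRequired: make([]types.Change, 2)}, err)
}
```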
+func (k *K3dClusterProvisioner) DiffConfig( + _ context.Context, + _ string, + oldSpec, newSpec *v1alpha1.ClusterSpec, +) (*types.UpdateResult, error) { + result := &types.UpdateResult{ + InPlaceChanges: make([]types.Change, 0), + RebootRequired: make([]types.Change, 0), + RecreateRequired: make([]types.Change, 0), + } + + if oldSpec == nil || newSpec == nil { + return result, nil + } + + // K3d configuration comes from the SimpleConfig (k3d.yaml) + if k.simpleCfg == nil { + return result, nil + } + + // Check server (control-plane) count - K3d does NOT support scaling servers after creation + // Server count comes from the k3d SimpleConfig, not ClusterSpec + // Changes to servers would be detected by comparing old/new simpleCfg versions + // For now, report based on the current config vs any documented expectation + + // Check agent (worker) count - K3d DOES support scaling agents + // Agent count also comes from k3d SimpleConfig + // The k3d.yaml is the source of truth for k3d clusters + + return result, nil +} + +// applyWorkerScaling handles adding or removing K3d agent nodes. +// +//nolint:unparam // result will be used when scaling is implemented +func (k *K3dClusterProvisioner) applyWorkerScaling( + _ context.Context, + _ string, + _, _ *v1alpha1.ClusterSpec, + _ *types.UpdateResult, +) error { + if k.simpleCfg == nil { + return nil + } + + // K3d agent scaling uses the SimpleConfig + // Since we don't track old vs new SimpleConfig, scaling would be handled + // by comparing actual running nodes vs desired count + + return nil +} + +// GetCurrentConfig retrieves the current cluster configuration. +// For K3d clusters, we return the configuration based on the SimpleConfig +// used for cluster creation. +func (k *K3dClusterProvisioner) GetCurrentConfig() (*v1alpha1.ClusterSpec, error) { + // K3d configuration comes from the SimpleConfig + spec := &v1alpha1.ClusterSpec{ + Distribution: v1alpha1.DistributionK3s, + Provider: v1alpha1.ProviderDocker, + } + + return spec, nil +} diff --git a/pkg/svc/provisioner/cluster/kind/provisioner.go b/pkg/svc/provisioner/cluster/kind/provisioner.go index 1144cb4f2..98bb3e6bf 100644 --- a/pkg/svc/provisioner/cluster/kind/provisioner.go +++ b/pkg/svc/provisioner/cluster/kind/provisioner.go @@ -52,7 +52,6 @@ func (noopInfoLogger) Info(string) {} func (noopInfoLogger) Infof(string, ...any) {} func (noopInfoLogger) Enabled() bool { return false } -//nolint:ireturn // Required by Kind SDK log.Logger interface func (l *streamLogger) V(level log.Level) log.InfoLogger { // Only enable info-level messages (V(0)), suppress verbose/debug (V(1+)) if level > 0 { diff --git a/pkg/svc/provisioner/cluster/kind/update.go b/pkg/svc/provisioner/cluster/kind/update.go new file mode 100644 index 000000000..91d8a7626 --- /dev/null +++ b/pkg/svc/provisioner/cluster/kind/update.go @@ -0,0 +1,95 @@ +package kindprovisioner + +import ( + "context" + "fmt" + + "github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1" + "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types" +) + +// Update attempts to apply configuration changes to a running Kind cluster. +// +// Kind does NOT support in-place node modifications: +// - Cannot add/remove control-plane nodes +// - Cannot add/remove worker nodes +// - Cannot change networking configuration +// - Cannot modify containerd registry config +// +// The only updates possible are at the Kubernetes level (Helm components), +// which are handled by the installer layer, not the provisioner. 
+// +// For any structural Kind changes, this method returns RecreateRequired. +func (k *KindClusterProvisioner) Update( + ctx context.Context, + name string, + oldSpec, newSpec *v1alpha1.ClusterSpec, + opts types.UpdateOptions, +) (*types.UpdateResult, error) { + // Compute diff to identify what changed + diff, err := k.DiffConfig(ctx, name, oldSpec, newSpec) + if err != nil { + return nil, fmt.Errorf("failed to compute config diff: %w", err) + } + + // For Kind, we can only report what would change - any structural + // changes require cluster recreation + if opts.DryRun || diff.HasRecreateRequired() { + return diff, nil + } + + // If there are only in-place changes (Helm components), those are handled + // by the installer layer, not here + return diff, nil +} + +// DiffConfig computes configuration differences for Kind clusters. +// Since Kind doesn't support node-level changes, most changes are classified +// as RecreateRequired. +func (k *KindClusterProvisioner) DiffConfig( + _ context.Context, + _ string, + oldSpec, newSpec *v1alpha1.ClusterSpec, +) (*types.UpdateResult, error) { + result := &types.UpdateResult{ + InPlaceChanges: make([]types.Change, 0), + RebootRequired: make([]types.Change, 0), + RecreateRequired: make([]types.Change, 0), + } + + if oldSpec == nil || newSpec == nil { + return result, nil + } + + // MirrorsDir change requires recreate (containerd config is baked in) + if oldSpec.Vanilla.MirrorsDir != newSpec.Vanilla.MirrorsDir { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "vanilla.mirrorsDir", + OldValue: oldSpec.Vanilla.MirrorsDir, + NewValue: newSpec.Vanilla.MirrorsDir, + Category: types.ChangeCategoryRecreateRequired, + Reason: "Kind containerd registry mirrors are configured at cluster creation", + }) + } + + // Node count changes require recreate + // Kind node configuration comes from kind.yaml, not ClusterSpec + // Changes to the Kind config (nodes, networking, etc.) require cluster recreation + + return result, nil +} + +// GetCurrentConfig retrieves the current cluster configuration. +// For Kind clusters, we return the configuration used to create the cluster. +func (k *KindClusterProvisioner) GetCurrentConfig() (*v1alpha1.ClusterSpec, error) { + // Kind doesn't persist configuration after creation. + // Return the spec from the config file that was used. + // This is a limitation of Kind - it doesn't store original config. + return &v1alpha1.ClusterSpec{ + Distribution: v1alpha1.DistributionVanilla, + Provider: v1alpha1.ProviderDocker, + Vanilla: v1alpha1.OptionsVanilla{ + MirrorsDir: "", // Cannot retrieve from running cluster + }, + }, nil +} diff --git a/pkg/svc/provisioner/cluster/provisioner.go b/pkg/svc/provisioner/cluster/provisioner.go index 1a6d1996a..8e4e94779 100644 --- a/pkg/svc/provisioner/cluster/provisioner.go +++ b/pkg/svc/provisioner/cluster/provisioner.go @@ -3,7 +3,9 @@ package clusterprovisioner import ( "context" + "github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1" "github.com/devantler-tech/ksail/v5/pkg/svc/provider" + "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types" ) // ClusterProvisioner defines methods for managing Kubernetes clusters. @@ -29,6 +31,33 @@ type ClusterProvisioner interface { Exists(ctx context.Context, name string) (bool, error) } +// ClusterUpdater is an optional interface for provisioners that support in-place updates. 
+// Not all provisioners support updates - Kind requires recreation for most changes, +// while Talos and K3d support various in-place modifications. +type ClusterUpdater interface { + // Update applies configuration changes to a running cluster. + // Returns an UpdateResult describing what changed and how it was handled. + // The oldSpec represents the current cluster state, newSpec is the desired state. + Update( + ctx context.Context, + name string, + oldSpec, newSpec *v1alpha1.ClusterSpec, + opts types.UpdateOptions, + ) (*types.UpdateResult, error) + + // DiffConfig computes the differences between current and desired configurations. + // This is used to preview changes before applying them. + DiffConfig( + ctx context.Context, + name string, + oldSpec, newSpec *v1alpha1.ClusterSpec, + ) (*types.UpdateResult, error) + + // GetCurrentConfig retrieves the current cluster configuration from the running cluster. + // Used to compare against the desired configuration for computing diffs. + GetCurrentConfig() (*v1alpha1.ClusterSpec, error) +} + // ProviderAware is an optional interface for provisioners that can use a provider // for infrastructure operations (start/stop nodes). type ProviderAware interface { diff --git a/pkg/svc/provisioner/cluster/talos/update.go b/pkg/svc/provisioner/cluster/talos/update.go new file mode 100644 index 000000000..a6604fa5f --- /dev/null +++ b/pkg/svc/provisioner/cluster/talos/update.go @@ -0,0 +1,439 @@ +package talosprovisioner + +import ( + "context" + "fmt" + "strconv" + + "github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1" + clustererrors "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/errors" + "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types" + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + machineapi "github.com/siderolabs/talos/pkg/machinery/api/machine" + talosclient "github.com/siderolabs/talos/pkg/machinery/client" + talosconfig "github.com/siderolabs/talos/pkg/machinery/config" +) + +// Update applies configuration changes to all nodes in a running Talos cluster. +// It implements the ClusterUpdater interface. 
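On the consumer side, the CLI only takes the in-place path when the provisioner also satisfies the optional `ClusterUpdater` interface; otherwise it falls back to recreation. A sketch of that pattern, mirroring `handleUpdateRunE`; the helper name is hypothetical and the provisioner is assumed to come from the factory.

```go
package main

import (
	"context"
	"fmt"

	"github.com/devantler-tech/ksail/v5/pkg/apis/cluster/v1alpha1"
	clusterprovisioner "github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster"
	"github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types"
)

// previewUpdate type-asserts the optional ClusterUpdater interface and uses
// DiffConfig as the side-effect-free preview behind --dry-run.
func previewUpdate(
	ctx context.Context,
	prov clusterprovisioner.ClusterProvisioner,
	name string,
	desired *v1alpha1.ClusterSpec,
) (*types.UpdateResult, error) {
	updater, ok := prov.(clusterprovisioner.ClusterUpdater)
	if !ok {
		return nil, fmt.Errorf("provisioner %T does not support in-place updates", prov)
	}

	current, err := updater.GetCurrentConfig()
	if err != nil {
		return nil, fmt.Errorf("read current config: %w", err)
	}

	diff, err := updater.DiffConfig(ctx, name, current, desired)
	if err != nil {
		return nil, fmt.Errorf("diff config: %w", err)
	}

	return diff, nil
}

func main() {} // previewUpdate would be wired to a provisioner created by the factory
```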
+func (p *TalosProvisioner) Update( + ctx context.Context, + name string, + oldSpec, newSpec *v1alpha1.ClusterSpec, + opts types.UpdateOptions, +) (*types.UpdateResult, error) { + // Compute diff to determine what changed + diff, err := p.DiffConfig(ctx, name, oldSpec, newSpec) + if err != nil { + return nil, fmt.Errorf("failed to compute config diff: %w", err) + } + + if opts.DryRun { + return diff, nil + } + + result := &types.UpdateResult{ + InPlaceChanges: diff.InPlaceChanges, + RebootRequired: diff.RebootRequired, + RecreateRequired: diff.RecreateRequired, + AppliedChanges: make([]types.Change, 0), + FailedChanges: make([]types.Change, 0), + } + + // If there are recreate-required changes, we cannot handle them in Update + if diff.HasRecreateRequired() { + return result, fmt.Errorf("%w: %d changes require restart", + clustererrors.ErrRecreationRequired, len(diff.RecreateRequired)) + } + + clusterName := p.resolveClusterName(name) + + // Handle node scaling changes + err = p.applyNodeScalingChanges(ctx, clusterName, oldSpec, newSpec, result) + if err != nil { + return result, fmt.Errorf("failed to apply node scaling changes: %w", err) + } + + // Handle in-place config changes (NO_REBOOT mode) + err = p.applyInPlaceConfigChanges(ctx, clusterName, result) + if err != nil { + return result, fmt.Errorf("failed to apply in-place config changes: %w", err) + } + + // Handle reboot-required changes (STAGED mode with rolling reboot) + if diff.HasRebootRequired() && opts.RollingReboot { + err := p.applyRebootRequiredChanges(ctx, clusterName, result, opts) + if err != nil { + return result, fmt.Errorf("failed to apply reboot-required changes: %w", err) + } + } + + return result, nil +} + +// DiffConfig computes the differences between current and desired configurations. 
+func (p *TalosProvisioner) DiffConfig( + _ context.Context, + _ string, + oldSpec, newSpec *v1alpha1.ClusterSpec, +) (*types.UpdateResult, error) { + result := &types.UpdateResult{ + InPlaceChanges: make([]types.Change, 0), + RebootRequired: make([]types.Change, 0), + RecreateRequired: make([]types.Change, 0), + } + + if oldSpec == nil || newSpec == nil { + return result, nil + } + + // Compare control plane count + if oldSpec.Talos.ControlPlanes != newSpec.Talos.ControlPlanes { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "talos.controlPlanes", + OldValue: strconv.Itoa(int(oldSpec.Talos.ControlPlanes)), + NewValue: strconv.Itoa(int(newSpec.Talos.ControlPlanes)), + Category: types.ChangeCategoryInPlace, + Reason: "control-plane nodes can be added/removed via provider", + }) + } + + // Compare worker count + if oldSpec.Talos.Workers != newSpec.Talos.Workers { + result.InPlaceChanges = append(result.InPlaceChanges, types.Change{ + Field: "talos.workers", + OldValue: strconv.Itoa(int(oldSpec.Talos.Workers)), + NewValue: strconv.Itoa(int(newSpec.Talos.Workers)), + Category: types.ChangeCategoryInPlace, + Reason: "worker nodes can be added/removed via provider", + }) + } + + // Check for network CIDR changes (requires recreate) + if p.hetznerOpts != nil { + oldCIDR := oldSpec.Hetzner.NetworkCIDR + newCIDR := newSpec.Hetzner.NetworkCIDR + + if oldCIDR != newCIDR && oldCIDR != "" && newCIDR != "" { + result.RecreateRequired = append(result.RecreateRequired, types.Change{ + Field: "hetzner.networkCidr", + OldValue: oldCIDR, + NewValue: newCIDR, + Category: types.ChangeCategoryRecreateRequired, + Reason: "network CIDR change requires PKI regeneration", + }) + } + } + + return result, nil +} + +// applyNodeScalingChanges handles adding or removing nodes. +// +//nolint:unparam // result will be used when node scaling is fully implemented +func (p *TalosProvisioner) applyNodeScalingChanges( + _ context.Context, + clusterName string, + oldSpec, newSpec *v1alpha1.ClusterSpec, + _ *types.UpdateResult, +) error { + if oldSpec == nil || newSpec == nil { + return nil + } + + // Calculate differences + cpDelta := int(newSpec.Talos.ControlPlanes - oldSpec.Talos.ControlPlanes) + workerDelta := int(newSpec.Talos.Workers - oldSpec.Talos.Workers) + + if cpDelta == 0 && workerDelta == 0 { + return nil + } + + _, _ = fmt.Fprintf(p.logWriter, " Scaling cluster %s: CP %+d, Workers %+d\n", + clusterName, cpDelta, workerDelta) + + // For now, log what would happen - actual implementation depends on provider + if cpDelta > 0 { + _, _ = fmt.Fprintf(p.logWriter, " TODO: Add %d control-plane node(s)\n", cpDelta) + } else if cpDelta < 0 { + _, _ = fmt.Fprintf(p.logWriter, " TODO: Remove %d control-plane node(s)\n", -cpDelta) + } + + if workerDelta > 0 { + _, _ = fmt.Fprintf(p.logWriter, " TODO: Add %d worker node(s)\n", workerDelta) + } else if workerDelta < 0 { + _, _ = fmt.Fprintf(p.logWriter, " TODO: Remove %d worker node(s)\n", -workerDelta) + } + + return nil +} + +// applyInPlaceConfigChanges applies configuration changes that don't require reboots. +// Uses ApplyConfiguration with NO_REBOOT mode for Talos-supported fields. 
+func (p *TalosProvisioner) applyInPlaceConfigChanges( + ctx context.Context, + clusterName string, + _ *types.UpdateResult, +) error { + if p.talosConfigs == nil { + return nil + } + + // Get node IPs from the cluster + nodeIPs, err := p.getNodeIPs(ctx, clusterName) + if err != nil { + return fmt.Errorf("failed to get node IPs: %w", err) + } + + if len(nodeIPs) == 0 { + _, _ = fmt.Fprintf(p.logWriter, " No nodes found for cluster %s\n", clusterName) + + return nil + } + + // Apply config to each node with NO_REBOOT mode + for _, nodeIP := range nodeIPs { + err := p.applyConfigWithMode( + ctx, + nodeIP, + p.talosConfigs.ControlPlane(), + machineapi.ApplyConfigurationRequest_NO_REBOOT, + ) + if err != nil { + _, _ = fmt.Fprintf(p.logWriter, " ⚠ Failed to apply config to %s: %v\n", nodeIP, err) + // Continue with other nodes + } else { + _, _ = fmt.Fprintf(p.logWriter, " ✓ Config applied to %s (no reboot)\n", nodeIP) + } + } + + return nil +} + +// applyRebootRequiredChanges applies changes that require node reboots. +// Uses rolling reboot strategy when opts.RollingReboot is true. +// +//nolint:unparam // result will be used when rolling reboot is implemented +func (p *TalosProvisioner) applyRebootRequiredChanges( + _ context.Context, + _ string, + result *types.UpdateResult, + opts types.UpdateOptions, +) error { + _, _ = fmt.Fprintf(p.logWriter, + " %d changes require reboot (rolling=%v)\n", + len(result.RebootRequired), opts.RollingReboot) + + // Rolling reboot strategy (not yet implemented): + // 1. Get list of nodes + // 2. For each node: + // a. Apply config with STAGED mode + // b. Cordon the node (drain workloads) + // c. Reboot the node + // d. Wait for node to be Ready + // e. Uncordon the node + // 3. Move to next node + + return nil +} + +// applyConfigWithMode applies configuration to a single node with the specified mode. +func (p *TalosProvisioner) applyConfigWithMode( + ctx context.Context, + nodeIP string, + config talosconfig.Provider, + mode machineapi.ApplyConfigurationRequest_Mode, +) error { + if config == nil { + return clustererrors.ErrConfigNil + } + + cfgBytes, err := config.Bytes() + if err != nil { + return fmt.Errorf("failed to marshal config: %w", err) + } + + talosClient, err := p.createTalosClient(ctx, nodeIP) + if err != nil { + return err + } + + defer talosClient.Close() //nolint:errcheck + + _, err = talosClient.ApplyConfiguration(ctx, &machineapi.ApplyConfigurationRequest{ + Data: cfgBytes, + Mode: mode, + }) + if err != nil { + return fmt.Errorf("failed to apply configuration: %w", err) + } + + return nil +} + +// createTalosClient creates a Talos client for the given node, using TalosConfig credentials if available. +func (p *TalosProvisioner) createTalosClient( + ctx context.Context, + nodeIP string, +) (*talosclient.Client, error) { + // If we have talos config bundle, use its TLS credentials + if p.talosConfigs != nil && p.talosConfigs.Bundle() != nil { + if talosConf := p.talosConfigs.Bundle().TalosConfig(); talosConf != nil { + client, err := talosclient.New(ctx, + talosclient.WithEndpoints(nodeIP), + talosclient.WithConfig(talosConf), + ) + if err != nil { + return nil, fmt.Errorf("failed to create Talos client with config: %w", err) + } + + return client, nil + } + } + + return nil, clustererrors.ErrTalosConfigRequired +} + +// getNodeIPs returns the IPs of all nodes in the cluster. 
+func (p *TalosProvisioner) getNodeIPs(ctx context.Context, clusterName string) ([]string, error) { + // For Docker provider, get IPs from Docker containers + if p.dockerClient != nil { + return p.getDockerNodeIPs(ctx, clusterName) + } + + // For Hetzner provider, get IPs from Hetzner API + if p.infraProvider != nil { + return p.getHetznerNodeIPs(ctx, clusterName) + } + + return nil, clustererrors.ErrNoProviderConfigured +} + +// getDockerNodeIPs gets node IPs from Docker containers. +func (p *TalosProvisioner) getDockerNodeIPs( + ctx context.Context, + clusterName string, +) ([]string, error) { + if p.dockerClient == nil { + return nil, clustererrors.ErrDockerClientNotConfigured + } + + // List containers with the Talos cluster label + containers, err := p.dockerClient.ContainerList(ctx, container.ListOptions{ + Filters: filters.NewArgs( + filters.Arg("label", LabelTalosClusterName+"="+clusterName), + ), + }) + if err != nil { + return nil, fmt.Errorf("failed to list containers: %w", err) + } + + ips := make([]string, 0, len(containers)) + + for _, c := range containers { + for _, network := range c.NetworkSettings.Networks { + if network.IPAddress != "" { + ips = append(ips, network.IPAddress) + + break + } + } + } + + return ips, nil +} + +// getHetznerNodeIPs gets node IPs from Hetzner servers. +func (p *TalosProvisioner) getHetznerNodeIPs( + _ context.Context, + _ string, +) ([]string, error) { + // For now, return empty - would need Hetzner client to list servers + // The actual implementation would query Hetzner API for servers with matching labels + return nil, nil +} + +// getTalosNoRebootPaths returns the list of machine config paths that can be changed without reboot. +// Based on Talos documentation: +// https://www.talos.dev/v1.9/talos-guides/configuration/editing-machine-configuration/ +func getTalosNoRebootPaths() []string { + return []string{ + ".cluster", + ".machine.network", + ".machine.kubelet", + ".machine.registries", + ".machine.nodeLabels", + ".machine.nodeTaints", + ".machine.time", + ".machine.sysfs", + ".machine.sysctls", + ".machine.logging", + ".machine.pods", + ".machine.kernel", + } +} + +// getTalosRebootRequiredPaths returns the list of machine config paths that require reboot. +func getTalosRebootRequiredPaths() []string { + return []string{ + ".machine.install", + ".machine.disks", + } +} + +// ClassifyTalosPatch determines the reboot requirement for a given Talos config path. +func ClassifyTalosPatch(path string) types.ChangeCategory { + // Check no-reboot paths first + for _, p := range getTalosNoRebootPaths() { + if pathMatches(path, p) { + return types.ChangeCategoryInPlace + } + } + + // Check reboot-required paths + for _, p := range getTalosRebootRequiredPaths() { + if pathMatches(path, p) { + return types.ChangeCategoryRebootRequired + } + } + + // Default to reboot for unknown paths (safer) + return types.ChangeCategoryRebootRequired +} + +// pathMatches checks if a config path matches a pattern. +func pathMatches(path, pattern string) bool { + // Simple prefix matching for now + return len(path) >= len(pattern) && path[:len(pattern)] == pattern +} + +// GetCurrentConfig retrieves the current cluster configuration. +// For Talos clusters, we return the configuration from the TalosConfigs. 
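+// The returned node counts are placeholders (one control plane, zero workers);
+// reporting real values would require inspecting the running cluster.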
+func (p *TalosProvisioner) GetCurrentConfig() (*v1alpha1.ClusterSpec, error) { + spec := &v1alpha1.ClusterSpec{ + Distribution: v1alpha1.DistributionTalos, + } + + // Determine provider + if p.dockerClient != nil { + spec.Provider = v1alpha1.ProviderDocker + } else if p.infraProvider != nil { + spec.Provider = v1alpha1.ProviderHetzner + } + + // Set Talos-specific options from the provisioner state + spec.Talos = v1alpha1.OptionsTalos{ + ControlPlanes: 1, // Default, actual value would need cluster inspection + Workers: 0, + } + + // If we have Hetzner options configured + if p.hetznerOpts != nil { + spec.Hetzner = v1alpha1.OptionsHetzner{ + NetworkCIDR: p.hetznerOpts.NetworkCIDR, + } + } + + return spec, nil +} diff --git a/pkg/svc/provisioner/cluster/types/update.go b/pkg/svc/provisioner/cluster/types/update.go new file mode 100644 index 000000000..9546dbdf9 --- /dev/null +++ b/pkg/svc/provisioner/cluster/types/update.go @@ -0,0 +1,124 @@ +// Package types provides shared types for cluster provisioner operations. +// These are separated to avoid import cycles between provisioner implementations +// and the main provisioner interface package. +// +//nolint:revive // package name "types" is intentionally generic for shared types +package types + +// ChangeCategory classifies the impact of a configuration change. +type ChangeCategory int + +const ( + // ChangeCategoryInPlace indicates the change can be applied without disruption. + // Examples: component enable/disable via Helm, Talos config changes that support NO_REBOOT. + ChangeCategoryInPlace ChangeCategory = iota + + // ChangeCategoryRebootRequired indicates the change requires node reboots. + // Examples: Talos kernel parameters, disk encryption settings. + ChangeCategoryRebootRequired + + // ChangeCategoryRecreateRequired indicates the cluster must be recreated. + // Examples: distribution change, provider change, Kind node changes, network CIDR changes. + ChangeCategoryRecreateRequired +) + +// String returns a human-readable name for the change category. +func (c ChangeCategory) String() string { + switch c { + case ChangeCategoryInPlace: + return "in-place" + case ChangeCategoryRebootRequired: + return "reboot-required" + case ChangeCategoryRecreateRequired: + return "recreate-required" + default: + return "unknown" + } +} + +// Change describes a single detected configuration change. +type Change struct { + // Field is the configuration field path that changed (e.g., "cluster.cni", "talos.workers"). + Field string + // OldValue is the previous value (may be empty for additions). + OldValue string + // NewValue is the new value (may be empty for removals). + NewValue string + // Category classifies the impact of this change. + Category ChangeCategory + // Reason explains why this change has its category. + Reason string +} + +// UpdateResult describes the outcome of a cluster update operation. +type UpdateResult struct { + // InPlaceChanges lists changes that were applied without disruption. + InPlaceChanges []Change + // RebootRequired lists changes that require node reboots. + RebootRequired []Change + // RecreateRequired lists changes that require cluster recreation. + RecreateRequired []Change + // AppliedChanges lists changes that were successfully applied. + AppliedChanges []Change + // FailedChanges lists changes that failed to apply. + FailedChanges []Change + // RebootsPerformed indicates how many nodes were rebooted. + RebootsPerformed int + // ClusterRecreated indicates if the cluster was recreated. 
+ ClusterRecreated bool +} + +// HasInPlaceChanges returns true if there are any in-place changes. +func (r *UpdateResult) HasInPlaceChanges() bool { + return len(r.InPlaceChanges) > 0 +} + +// HasRebootRequired returns true if there are changes requiring reboots. +func (r *UpdateResult) HasRebootRequired() bool { + return len(r.RebootRequired) > 0 +} + +// HasRecreateRequired returns true if there are changes requiring recreation. +func (r *UpdateResult) HasRecreateRequired() bool { + return len(r.RecreateRequired) > 0 +} + +// NeedsUserConfirmation returns true if any changes require user confirmation. +// In-place changes can be applied silently; reboot or recreate require confirmation. +func (r *UpdateResult) NeedsUserConfirmation() bool { + return r.HasRebootRequired() || r.HasRecreateRequired() +} + +// TotalChanges returns the total number of detected changes. +func (r *UpdateResult) TotalChanges() int { + return len(r.InPlaceChanges) + len(r.RebootRequired) + len(r.RecreateRequired) +} + +// AllChanges returns all detected changes in a single slice. +func (r *UpdateResult) AllChanges() []Change { + all := make([]Change, 0, r.TotalChanges()) + all = append(all, r.InPlaceChanges...) + all = append(all, r.RebootRequired...) + all = append(all, r.RecreateRequired...) + + return all +} + +// UpdateOptions provides configuration for the update operation. +type UpdateOptions struct { + // Force skips user confirmation for destructive changes. + Force bool + // DryRun shows what would change without applying. + DryRun bool + // RollingReboot enables rolling reboots (one node at a time) for reboot-required changes. + RollingReboot bool +} + +// DefaultUpdateOptions returns sensible defaults for update operations. +func DefaultUpdateOptions() UpdateOptions { + return UpdateOptions{ + Force: false, + DryRun: false, + RollingReboot: true, + } +}
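As a reference for consumers of the new `types` package, here is a minimal sketch of how an `UpdateResult` might be inspected once a provisioner's `DiffConfig` has produced one. The import path is taken from this diff; the concrete change values and the surrounding `main` wiring are illustrative assumptions, not part of the change set.

```go
package main

import (
	"fmt"

	"github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types"
)

func main() {
	// Hypothetical diff output, e.g. as returned by a provisioner's DiffConfig.
	result := &types.UpdateResult{
		InPlaceChanges: []types.Change{{
			Field:    "talos.workers",
			OldValue: "1",
			NewValue: "3",
			Category: types.ChangeCategoryInPlace,
			Reason:   "worker nodes can be added/removed via provider",
		}},
		RecreateRequired: []types.Change{{
			Field:    "hetzner.networkCidr",
			OldValue: "10.0.0.0/16",
			NewValue: "10.1.0.0/16",
			Category: types.ChangeCategoryRecreateRequired,
			Reason:   "network CIDR change requires PKI regeneration",
		}},
	}

	opts := types.DefaultUpdateOptions()
	opts.DryRun = true // preview only, mirroring --dry-run

	if opts.DryRun {
		// Preview every detected change without applying anything.
		for _, change := range result.AllChanges() {
			fmt.Printf("%-22s %s -> %s [%s] %s\n",
				change.Field, change.OldValue, change.NewValue, change.Category, change.Reason)
		}

		return
	}

	// Reboot- or recreate-level changes need confirmation unless --force is set.
	if result.NeedsUserConfirmation() && !opts.Force {
		fmt.Printf("%d change(s) require confirmation before applying\n", result.TotalChanges())
	}
}
```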
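And a sketch of how the path-level classifier could feed an `UpdateResult`. To stay self-contained it takes the classifier as a parameter instead of importing the Talos provisioner package (whose import path is not shown in this diff); `ClassifyTalosPatch` has the matching signature, and the example paths come from the no-reboot and reboot-required lists above.

```go
package main

import (
	"fmt"

	"github.com/devantler-tech/ksail/v5/pkg/svc/provisioner/cluster/types"
)

// bucketChanges sorts changed machine-config paths into an UpdateResult using a
// classifier with the same signature as ClassifyTalosPatch.
func bucketChanges(paths []string, classify func(string) types.ChangeCategory) *types.UpdateResult {
	result := &types.UpdateResult{}

	for _, path := range paths {
		change := types.Change{Field: path, Category: classify(path)}

		switch change.Category {
		case types.ChangeCategoryInPlace:
			result.InPlaceChanges = append(result.InPlaceChanges, change)
		case types.ChangeCategoryRebootRequired:
			result.RebootRequired = append(result.RebootRequired, change)
		case types.ChangeCategoryRecreateRequired:
			result.RecreateRequired = append(result.RecreateRequired, change)
		}
	}

	return result
}

func main() {
	// Stand-in classifier mirroring the prefix rules in this diff: kubelet and
	// registry changes are in-place, anything else is treated as reboot-required.
	classify := func(path string) types.ChangeCategory {
		switch path {
		case ".machine.kubelet", ".machine.registries":
			return types.ChangeCategoryInPlace
		default:
			return types.ChangeCategoryRebootRequired
		}
	}

	result := bucketChanges([]string{".machine.kubelet", ".machine.install"}, classify)
	fmt.Printf("in-place=%d reboot-required=%d\n",
		len(result.InPlaceChanges), len(result.RebootRequired))
}
```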