From 240277a1e528f5810e3b8bd2571a7e02a1d3ed9d Mon Sep 17 00:00:00 2001 From: William Batista Date: Thu, 8 Aug 2024 13:17:23 -0400 Subject: [PATCH 1/5] Show errors when trying to check BG health check Ran into an issue where I was unable to tell if a request failed because my code application code was bad, or it was a platform error. Being able to tell the difference let's users know when to retry deploys or fix their code --- internal/command/deploy/strategy_bluegreen.go | 41 ++++++++++++++----- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/internal/command/deploy/strategy_bluegreen.go b/internal/command/deploy/strategy_bluegreen.go index a173702107..c8bde4b52e 100644 --- a/internal/command/deploy/strategy_bluegreen.go +++ b/internal/command/deploy/strategy_bluegreen.go @@ -289,7 +289,7 @@ func (bg *blueGreen) changeDetected(a, b map[string]string) bool { return false } -func (bg *blueGreen) renderMachineHealthchecks(state map[string]*fly.HealthCheckStatus) func() { +func (bg *blueGreen) renderMachineHealthchecks(state map[string]healthCheckStatusResult) func() { firstRun := true previousView := map[string]string{} @@ -300,8 +300,11 @@ func (bg *blueGreen) renderMachineHealthchecks(state map[string]*fly.HealthCheck bg.healthLock.RLock() for id, value := range state { status := "unchecked" - if value.Total != 0 { - status = fmt.Sprintf("%d/%d passing", value.Passing, value.Total) + if value.healthCheckStatus.Total != 0 { + status = fmt.Sprintf("%d/%d passing", value.healthCheckStatus.Passing, value.healthCheckStatus.Total) + } + if value.err != nil { + status = fmt.Sprintf("error: %v", value.err) } currentView[id] = status @@ -324,18 +327,18 @@ func (bg *blueGreen) renderMachineHealthchecks(state map[string]*fly.HealthCheck } } -func (bg *blueGreen) allMachinesHealthy(stateMap map[string]*fly.HealthCheckStatus) bool { +func (bg *blueGreen) allMachinesHealthy(stateMap map[string]healthCheckStatusResult) bool { passed := 0 bg.healthLock.RLock() for _, v := range stateMap { // we initialize all machine ids with an empty struct, so all fields are zero'd on init. // without v.hcs.Total != 0, the first call to this function will pass since 0 == 0 - if v.Total == 0 { + if v.healthCheckStatus.Total == 0 { continue } - if v.Passing == v.Total { + if v.healthCheckStatus.Passing == v.healthCheckStatus.Total { passed += 1 } } @@ -344,18 +347,25 @@ func (bg *blueGreen) allMachinesHealthy(stateMap map[string]*fly.HealthCheckStat return passed == len(stateMap) } +type healthCheckStatusResult struct { + healthCheckStatus *fly.HealthCheckStatus + err error +} + func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error { ctx, span := tracing.GetTracer().Start(ctx, "green_machines_health_wait") defer span.End() wait := time.NewTicker(bg.timeout) - machineIDToHealthStatus := map[string]*fly.HealthCheckStatus{} + machineIDToHealthStatus := map[string]healthCheckStatusResult{} errChan := make(chan error) render := bg.renderMachineHealthchecks(machineIDToHealthStatus) for _, gm := range bg.greenMachines { if gm.launchInput.SkipLaunch { - machineIDToHealthStatus[gm.leasableMachine.FormattedMachineId()] = &fly.HealthCheckStatus{Total: 1, Passing: 1} + machineIDToHealthStatus[gm.leasableMachine.FormattedMachineId()] = healthCheckStatusResult{ + healthCheckStatus: &fly.HealthCheckStatus{Total: 1, Passing: 1}, + } continue } @@ -366,7 +376,9 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error continue } - machineIDToHealthStatus[gm.leasableMachine.FormattedMachineId()] = &fly.HealthCheckStatus{} + machineIDToHealthStatus[gm.leasableMachine.FormattedMachineId()] = healthCheckStatusResult{ + healthCheckStatus: &fly.HealthCheckStatus{}, + } } for _, gm := range bg.greenMachines { @@ -394,16 +406,24 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error switch { case waitCtx.Err() != nil: + machineIDToHealthStatus[m.FormattedMachineId()] = healthCheckStatusResult{ + err: waitCtx.Err(), + } errChan <- waitCtx.Err() return case err != nil: + machineIDToHealthStatus[m.FormattedMachineId()] = healthCheckStatusResult{ + err: waitCtx.Err(), + } errChan <- err return } status := updateMachine.TopLevelChecks() bg.healthLock.Lock() - machineIDToHealthStatus[m.FormattedMachineId()] = status + machineIDToHealthStatus[m.FormattedMachineId()] = healthCheckStatusResult{ + healthCheckStatus: status, + } bg.healthLock.Unlock() if (status.Total == status.Passing) && (status.Total != 0) { @@ -428,6 +448,7 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error select { case err := <-errChan: + span.RecordError(err) return err case <-wait.C: return ErrWaitTimeout From cd7a2d0e730e984f1c5748112f93831fdad297fe Mon Sep 17 00:00:00 2001 From: William Batista Date: Thu, 8 Aug 2024 13:23:10 -0400 Subject: [PATCH 2/5] Retry context deadline exceeded errors context.DeadlineExceeded usually happens when the platform fails to respond to a Get in time. We should retry these errors to smooth over platform issues --- internal/command/deploy/strategy_bluegreen.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/internal/command/deploy/strategy_bluegreen.go b/internal/command/deploy/strategy_bluegreen.go index c8bde4b52e..0131cdc70d 100644 --- a/internal/command/deploy/strategy_bluegreen.go +++ b/internal/command/deploy/strategy_bluegreen.go @@ -401,10 +401,19 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error time.Sleep(gracePeriod) + numRetries := 0 for { updateMachine, err := bg.flaps.Get(waitCtx, m.Machine().ID) switch { + // context.DeadlineExceeded usually happens when the platform fails to respond to a Get in time. We should retry these + case errors.Is(err, context.DeadlineExceeded) && numRetries < 3: + waitCtx = context.WithoutCancel(ctx) + waitCtx, cancel = context.WithTimeout(waitCtx, bg.timeout) + defer cancel() + + numRetries += 1 + continue case waitCtx.Err() != nil: machineIDToHealthStatus[m.FormattedMachineId()] = healthCheckStatusResult{ err: waitCtx.Err(), From 76ef1470118970a985ab336155f2a963b219f896 Mon Sep 17 00:00:00 2001 From: William Batista Date: Fri, 9 Aug 2024 10:31:36 -0400 Subject: [PATCH 3/5] Revert "Retry context deadline exceeded errors" This reverts commit cd7a2d0e730e984f1c5748112f93831fdad297fe. I want to write some proper retry code in fly-go --- internal/command/deploy/strategy_bluegreen.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/internal/command/deploy/strategy_bluegreen.go b/internal/command/deploy/strategy_bluegreen.go index 0131cdc70d..c8bde4b52e 100644 --- a/internal/command/deploy/strategy_bluegreen.go +++ b/internal/command/deploy/strategy_bluegreen.go @@ -401,19 +401,10 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error time.Sleep(gracePeriod) - numRetries := 0 for { updateMachine, err := bg.flaps.Get(waitCtx, m.Machine().ID) switch { - // context.DeadlineExceeded usually happens when the platform fails to respond to a Get in time. We should retry these - case errors.Is(err, context.DeadlineExceeded) && numRetries < 3: - waitCtx = context.WithoutCancel(ctx) - waitCtx, cancel = context.WithTimeout(waitCtx, bg.timeout) - defer cancel() - - numRetries += 1 - continue case waitCtx.Err() != nil: machineIDToHealthStatus[m.FormattedMachineId()] = healthCheckStatusResult{ err: waitCtx.Err(), From a65cac6a0ef7d4ca7d08f2b1a61c6e50f4734e57 Mon Sep 17 00:00:00 2001 From: William Batista Date: Fri, 9 Aug 2024 11:54:50 -0400 Subject: [PATCH 4/5] More tracing for BG health checks --- internal/command/deploy/strategy_bluegreen.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/internal/command/deploy/strategy_bluegreen.go b/internal/command/deploy/strategy_bluegreen.go index c8bde4b52e..205e33f397 100644 --- a/internal/command/deploy/strategy_bluegreen.go +++ b/internal/command/deploy/strategy_bluegreen.go @@ -290,6 +290,8 @@ func (bg *blueGreen) changeDetected(a, b map[string]string) bool { } func (bg *blueGreen) renderMachineHealthchecks(state map[string]healthCheckStatusResult) func() { + _, span := tracing.GetTracer().Start(context.Background(), "render_machine_healthchecks") + defer span.End() firstRun := true previousView := map[string]string{} @@ -328,6 +330,8 @@ func (bg *blueGreen) renderMachineHealthchecks(state map[string]healthCheckStatu } func (bg *blueGreen) allMachinesHealthy(stateMap map[string]healthCheckStatusResult) bool { + _, span := tracing.GetTracer().Start(context.Background(), "all_machines_healthy") + defer span.End() passed := 0 bg.healthLock.RLock() @@ -394,11 +398,17 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error } go func(m machine.LeasableMachine) { + ctx, span := tracing.GetTracer().Start(ctx, "green_machine_health_check", trace.WithAttributes( + attribute.String("machine_id", m.FormattedMachineId()), + )) + waitCtx, cancel := context.WithTimeout(ctx, bg.timeout) defer cancel() interval, gracePeriod := m.GetMinIntervalAndMinGracePeriod() + span.SetAttributes(attribute.Int("interval_ms", int(interval.Milliseconds())), attribute.Int("grace_period_ms", int(gracePeriod.Milliseconds()))) + span.AddEvent("sleeping for grace period") time.Sleep(gracePeriod) for { @@ -406,12 +416,14 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error switch { case waitCtx.Err() != nil: + span.RecordError(waitCtx.Err()) machineIDToHealthStatus[m.FormattedMachineId()] = healthCheckStatusResult{ err: waitCtx.Err(), } errChan <- waitCtx.Err() return case err != nil: + span.RecordError(err) machineIDToHealthStatus[m.FormattedMachineId()] = healthCheckStatusResult{ err: waitCtx.Err(), } @@ -425,11 +437,14 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error healthCheckStatus: status, } bg.healthLock.Unlock() + span.AddEvent(fmt.Sprintf("total health checks: %d", status.Total)) + span.AddEvent(fmt.Sprintf("passing health checks: %d", status.Passing)) if (status.Total == status.Passing) && (status.Total != 0) { return } + span.AddEvent("sleeping for interval") time.Sleep(interval) } }(gm.leasableMachine) From af60580acd987bdd4467b8b6f0c65ab4b64496eb Mon Sep 17 00:00:00 2001 From: William Batista Date: Tue, 13 Aug 2024 12:20:01 -0400 Subject: [PATCH 5/5] even more tracing wowie --- internal/build/imgsrc/depot.go | 11 ++++++++++- internal/command/deploy/strategy_bluegreen.go | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/internal/build/imgsrc/depot.go b/internal/build/imgsrc/depot.go index 685cd096b3..4aa95fdf1d 100644 --- a/internal/build/imgsrc/depot.go +++ b/internal/build/imgsrc/depot.go @@ -151,6 +151,8 @@ func depotBuild(ctx context.Context, streams *iostreams.IOStreams, opts ImageOpt } func initBuilder(ctx context.Context, buildState *build, appName string, streams *iostreams.IOStreams) (*depotmachine.Machine, *depotbuild.Build, error) { + ctx, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() apiClient := flyutil.ClientFromContext(ctx) region := os.Getenv("FLY_REMOTE_BUILDER_REGION") if region != "" { @@ -168,15 +170,22 @@ func initBuilder(ctx context.Context, buildState *build, appName string, streams return nil, nil, err } + fmt.Println("creating a build from an existing one") build, err := depotbuild.FromExistingBuild(ctx, *buildInfo.EnsureDepotRemoteBuilder.BuildId, *buildInfo.EnsureDepotRemoteBuilder.BuildToken) if err != nil { + fmt.Println("failed to create a build from an existing one") + streams.StopProgressIndicator() return nil, nil, err } + fmt.Println("created a build from an existing one") // Set the buildErr to any error that represents the build failing. + fmt.Println("huh") var buildErr error + fmt.Println("wut") var buildkit *depotmachine.Machine + fmt.Println("get") buildkit, buildErr = depotmachine.Acquire(ctx, build.ID, build.Token, "amd64") if buildErr != nil { build.Finish(buildErr) @@ -184,7 +193,7 @@ func initBuilder(ctx context.Context, buildState *build, appName string, streams return nil, nil, buildErr } - return buildkit, &build, err + return buildkit, &build, nil } func buildImage(ctx context.Context, buildkitClient *client.Client, opts ImageOptions, dockerfilePath string) (*client.SolveResponse, error) { diff --git a/internal/command/deploy/strategy_bluegreen.go b/internal/command/deploy/strategy_bluegreen.go index 205e33f397..e229c8b58d 100644 --- a/internal/command/deploy/strategy_bluegreen.go +++ b/internal/command/deploy/strategy_bluegreen.go @@ -400,7 +400,9 @@ func (bg *blueGreen) WaitForGreenMachinesToBeHealthy(ctx context.Context) error go func(m machine.LeasableMachine) { ctx, span := tracing.GetTracer().Start(ctx, "green_machine_health_check", trace.WithAttributes( attribute.String("machine_id", m.FormattedMachineId()), + attribute.Int("bg_timeout_ms", int(bg.timeout.Milliseconds())), )) + defer span.End() waitCtx, cancel := context.WithTimeout(ctx, bg.timeout) defer cancel()