From 30723be5729a7c499779bdb37d4700b52e477e1f Mon Sep 17 00:00:00 2001 From: shaohoukun Date: Fri, 15 Mar 2024 14:57:03 +0800 Subject: [PATCH 1/5] feat(support parallel chunk data): support parallel chunk data --- go.mod | 1 + go.sum | 2 + go/base/context.go | 99 +++++++------ go/cmd/gh-ost/main.go | 2 + go/logic/applier.go | 53 ++++--- go/logic/migrator.go | 86 ++++++++---- vendor/golang.org/x/sync/LICENSE | 27 ++++ vendor/golang.org/x/sync/PATENTS | 22 +++ vendor/golang.org/x/sync/errgroup/errgroup.go | 132 ++++++++++++++++++ vendor/modules.txt | 3 + 10 files changed, 336 insertions(+), 91 deletions(-) create mode 100644 vendor/golang.org/x/sync/LICENSE create mode 100644 vendor/golang.org/x/sync/PATENTS create mode 100644 vendor/golang.org/x/sync/errgroup/errgroup.go diff --git a/go.mod b/go.mod index 9be1453fc..4938b0ba5 100644 --- a/go.mod +++ b/go.mod @@ -11,6 +11,7 @@ require ( golang.org/x/net v0.17.0 golang.org/x/term v0.13.0 golang.org/x/text v0.13.0 + golang.org/x/sync v0.1.0 ) require ( diff --git a/go.sum b/go.sum index 1a540d1cc..d773989c8 100644 --- a/go.sum +++ b/go.sum @@ -94,6 +94,8 @@ golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/go/base/context.go b/go/base/context.go index 300ec1201..acbfc7b12 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -72,6 +72,14 @@ func NewThrottleCheckResult(throttle bool, reason string, reasonHint ThrottleRea } } +type IterationRangeValues struct { + Min *sql.ColumnValues + Max *sql.ColumnValues + Size int64 + IsIncludeMinValues bool + HasFurtherRange bool +} + // MigrationContext has the general, global state of migration. It is used by // all components throughout the migration process. 
type MigrationContext struct { @@ -119,6 +127,7 @@ type MigrationContext struct { HeartbeatIntervalMilliseconds int64 defaultNumRetries int64 ChunkSize int64 + ChunkParallelSize int64 niceRatio float64 MaxLagMillisecondsThrottleThreshold int64 throttleControlReplicaKeys *mysql.InstanceKeyMap @@ -210,25 +219,26 @@ type MigrationContext struct { InCutOverCriticalSectionFlag int64 PanicAbort chan error - OriginalTableColumnsOnApplier *sql.ColumnList - OriginalTableColumns *sql.ColumnList - OriginalTableVirtualColumns *sql.ColumnList - OriginalTableUniqueKeys [](*sql.UniqueKey) - OriginalTableAutoIncrement uint64 - GhostTableColumns *sql.ColumnList - GhostTableVirtualColumns *sql.ColumnList - GhostTableUniqueKeys [](*sql.UniqueKey) - UniqueKey *sql.UniqueKey - SharedColumns *sql.ColumnList - ColumnRenameMap map[string]string - DroppedColumnsMap map[string]bool - MappedSharedColumns *sql.ColumnList - MigrationRangeMinValues *sql.ColumnValues - MigrationRangeMaxValues *sql.ColumnValues - Iteration int64 - MigrationIterationRangeMinValues *sql.ColumnValues - MigrationIterationRangeMaxValues *sql.ColumnValues - ForceTmpTableName string + OriginalTableColumnsOnApplier *sql.ColumnList + OriginalTableColumns *sql.ColumnList + OriginalTableVirtualColumns *sql.ColumnList + OriginalTableUniqueKeys [](*sql.UniqueKey) + OriginalTableAutoIncrement uint64 + GhostTableColumns *sql.ColumnList + GhostTableVirtualColumns *sql.ColumnList + GhostTableUniqueKeys [](*sql.UniqueKey) + UniqueKey *sql.UniqueKey + SharedColumns *sql.ColumnList + ColumnRenameMap map[string]string + DroppedColumnsMap map[string]bool + MappedSharedColumns *sql.ColumnList + MigrationRangeMinValues *sql.ColumnValues + MigrationRangeMaxValues *sql.ColumnValues + Iteration int64 + MigrationIterationRangeMinValues *sql.ColumnValues + MigrationIterationRangeMaxValues *sql.ColumnValues + CalculateNextIterationRangeEndValuesLock *sync.Mutex + ForceTmpTableName string recentBinlogCoordinates mysql.BinlogCoordinates @@ -269,26 +279,27 @@ type ContextConfig struct { func NewMigrationContext() *MigrationContext { return &MigrationContext{ - Uuid: uuid.NewString(), - defaultNumRetries: 60, - ChunkSize: 1000, - InspectorConnectionConfig: mysql.NewConnectionConfig(), - ApplierConnectionConfig: mysql.NewConnectionConfig(), - MaxLagMillisecondsThrottleThreshold: 1500, - CutOverLockTimeoutSeconds: 3, - DMLBatchSize: 10, - etaNanoseonds: ETAUnknown, - maxLoad: NewLoadMap(), - criticalLoad: NewLoadMap(), - throttleMutex: &sync.Mutex{}, - throttleHTTPMutex: &sync.Mutex{}, - throttleControlReplicaKeys: mysql.NewInstanceKeyMap(), - configMutex: &sync.Mutex{}, - pointOfInterestTimeMutex: &sync.Mutex{}, - lastHeartbeatOnChangelogMutex: &sync.Mutex{}, - ColumnRenameMap: make(map[string]string), - PanicAbort: make(chan error), - Log: NewDefaultLogger(), + Uuid: uuid.NewString(), + defaultNumRetries: 60, + ChunkSize: 1000, + InspectorConnectionConfig: mysql.NewConnectionConfig(), + ApplierConnectionConfig: mysql.NewConnectionConfig(), + MaxLagMillisecondsThrottleThreshold: 1500, + CutOverLockTimeoutSeconds: 3, + DMLBatchSize: 10, + etaNanoseonds: ETAUnknown, + maxLoad: NewLoadMap(), + criticalLoad: NewLoadMap(), + throttleMutex: &sync.Mutex{}, + throttleHTTPMutex: &sync.Mutex{}, + throttleControlReplicaKeys: mysql.NewInstanceKeyMap(), + configMutex: &sync.Mutex{}, + pointOfInterestTimeMutex: &sync.Mutex{}, + lastHeartbeatOnChangelogMutex: &sync.Mutex{}, + CalculateNextIterationRangeEndValuesLock: &sync.Mutex{}, + ColumnRenameMap: make(map[string]string), + 
PanicAbort: make(chan error), + Log: NewDefaultLogger(), } } @@ -616,6 +627,16 @@ func (this *MigrationContext) SetChunkSize(chunkSize int64) { atomic.StoreInt64(&this.ChunkSize, chunkSize) } +func (this *MigrationContext) SetChunkParallelSize(chunkParallelSize int64) { + if chunkParallelSize < 1 { + chunkParallelSize = 1 + } + if chunkParallelSize > 10 { + chunkParallelSize = 10 + } + atomic.StoreInt64(&this.ChunkParallelSize, chunkParallelSize) +} + func (this *MigrationContext) SetDMLBatchSize(batchSize int64) { if batchSize < 1 { batchSize = 1 diff --git a/go/cmd/gh-ost/main.go b/go/cmd/gh-ost/main.go index 139703077..df17f51da 100644 --- a/go/cmd/gh-ost/main.go +++ b/go/cmd/gh-ost/main.go @@ -104,6 +104,7 @@ func main() { flag.BoolVar(&migrationContext.CutOverExponentialBackoff, "cut-over-exponential-backoff", false, "Wait exponentially longer intervals between failed cut-over attempts. Wait intervals obey a maximum configurable with 'exponential-backoff-max-interval').") exponentialBackoffMaxInterval := flag.Int64("exponential-backoff-max-interval", 64, "Maximum number of seconds to wait between attempts when performing various operations with exponential backoff.") chunkSize := flag.Int64("chunk-size", 1000, "amount of rows to handle in each iteration (allowed range: 10-100,000)") + chunkParallelSize := flag.Int64("chunk-parallel-size", 1, "number of chunks to process in parallel (allowed range: 1-100)") dmlBatchSize := flag.Int64("dml-batch-size", 10, "batch size for DML events to apply in a single transaction (range 1-100)") defaultRetries := flag.Int64("default-retries", 60, "Default number of retries for various operations before panicking") cutOverLockTimeoutSeconds := flag.Int64("cut-over-lock-timeout-seconds", 3, "Max number of seconds to hold locks on tables while attempting to cut-over (retry attempted when lock exceeds timeout)") @@ -294,6 +295,7 @@ func main() { migrationContext.SetHeartbeatIntervalMilliseconds(*heartbeatIntervalMillis) migrationContext.SetNiceRatio(*niceRatio) migrationContext.SetChunkSize(*chunkSize) + migrationContext.SetChunkParallelSize(*chunkParallelSize) migrationContext.SetDMLBatchSize(*dmlBatchSize) migrationContext.SetMaxLagMillisecondsThrottleThreshold(*maxLagMillis) migrationContext.SetThrottleQuery(*throttleQuery) diff --git a/go/logic/applier.go b/go/logic/applier.go index fa374a70f..6a5fa3a08 100644 --- a/go/logic/applier.go +++ b/go/logic/applier.go @@ -561,11 +561,18 @@ func (this *Applier) ReadMigrationRangeValues() error { // which will be used for copying the next chunk of rows. Ir returns "false" if there is // no further chunk to work through, i.e. 
we're past the last chunk and are done with // iterating the range (and this done with copying row chunks) -func (this *Applier) CalculateNextIterationRangeEndValues() (hasFurtherRange bool, err error) { - this.migrationContext.MigrationIterationRangeMinValues = this.migrationContext.MigrationIterationRangeMaxValues - if this.migrationContext.MigrationIterationRangeMinValues == nil { - this.migrationContext.MigrationIterationRangeMinValues = this.migrationContext.MigrationRangeMinValues +func (this *Applier) CalculateNextIterationRangeEndValues() (values *base.IterationRangeValues, err error) { + this.migrationContext.CalculateNextIterationRangeEndValuesLock.Lock() + defer this.migrationContext.CalculateNextIterationRangeEndValuesLock.Unlock() + + iterationRangeValues := &base.IterationRangeValues{} + + iterationRangeValues.Min = this.migrationContext.MigrationIterationRangeMaxValues + if iterationRangeValues.Min == nil { + iterationRangeValues.Min = this.migrationContext.MigrationRangeMinValues + iterationRangeValues.IsIncludeMinValues = true } + for i := 0; i < 2; i++ { buildFunc := sql.BuildUniqueKeyRangeEndPreparedQueryViaOffset if i == 1 { @@ -575,46 +582,48 @@ func (this *Applier) CalculateNextIterationRangeEndValues() (hasFurtherRange boo this.migrationContext.DatabaseName, this.migrationContext.OriginalTableName, &this.migrationContext.UniqueKey.Columns, - this.migrationContext.MigrationIterationRangeMinValues.AbstractValues(), + iterationRangeValues.Min.AbstractValues(), this.migrationContext.MigrationRangeMaxValues.AbstractValues(), atomic.LoadInt64(&this.migrationContext.ChunkSize), - this.migrationContext.GetIteration() == 0, + iterationRangeValues.IsIncludeMinValues, fmt.Sprintf("iteration:%d", this.migrationContext.GetIteration()), ) if err != nil { - return hasFurtherRange, err + return iterationRangeValues, err } rows, err := this.db.Query(query, explodedArgs...) if err != nil { - return hasFurtherRange, err + return iterationRangeValues, err } defer rows.Close() iterationRangeMaxValues := sql.NewColumnValues(this.migrationContext.UniqueKey.Len()) for rows.Next() { if err = rows.Scan(iterationRangeMaxValues.ValuesPointers...); err != nil { - return hasFurtherRange, err + return iterationRangeValues, err } - hasFurtherRange = true + iterationRangeValues.HasFurtherRange = true } if err = rows.Err(); err != nil { - return hasFurtherRange, err + return iterationRangeValues, err } - if hasFurtherRange { - this.migrationContext.MigrationIterationRangeMaxValues = iterationRangeMaxValues - return hasFurtherRange, nil + if iterationRangeValues.HasFurtherRange { + iterationRangeValues.Max = iterationRangeMaxValues + this.migrationContext.MigrationIterationRangeMinValues = iterationRangeValues.Min + this.migrationContext.MigrationIterationRangeMaxValues = iterationRangeValues.Max + return iterationRangeValues, nil } } this.migrationContext.Log.Debugf("Iteration complete: no further range to iterate") - return hasFurtherRange, nil + return iterationRangeValues, nil } // ApplyIterationInsertQuery issues a chunk-INSERT query on the ghost table. It is where // data actually gets copied from original table. 
-func (this *Applier) ApplyIterationInsertQuery() (chunkSize int64, rowsAffected int64, duration time.Duration, err error) { +func (this *Applier) ApplyIterationInsertQuery(iterationRangeValues *base.IterationRangeValues) (chunkSize int64, rowsAffected int64, duration time.Duration, err error) { startTime := time.Now() - chunkSize = atomic.LoadInt64(&this.migrationContext.ChunkSize) + chunkSize = iterationRangeValues.Size query, explodedArgs, err := sql.BuildRangeInsertPreparedQuery( this.migrationContext.DatabaseName, @@ -624,9 +633,9 @@ func (this *Applier) ApplyIterationInsertQuery() (chunkSize int64, rowsAffected this.migrationContext.MappedSharedColumns.Names(), this.migrationContext.UniqueKey.Name, &this.migrationContext.UniqueKey.Columns, - this.migrationContext.MigrationIterationRangeMinValues.AbstractValues(), - this.migrationContext.MigrationIterationRangeMaxValues.AbstractValues(), - this.migrationContext.GetIteration() == 0, + iterationRangeValues.Min.AbstractValues(), + iterationRangeValues.Max.AbstractValues(), + iterationRangeValues.IsIncludeMinValues, this.migrationContext.IsTransactionalTable(), ) if err != nil { @@ -663,8 +672,8 @@ func (this *Applier) ApplyIterationInsertQuery() (chunkSize int64, rowsAffected duration = time.Since(startTime) this.migrationContext.Log.Debugf( "Issued INSERT on range: [%s]..[%s]; iteration: %d; chunk-size: %d", - this.migrationContext.MigrationIterationRangeMinValues, - this.migrationContext.MigrationIterationRangeMaxValues, + iterationRangeValues.Min, + iterationRangeValues.Max, this.migrationContext.GetIteration(), chunkSize) return chunkSize, rowsAffected, duration, nil diff --git a/go/logic/migrator.go b/go/logic/migrator.go index fed7c944b..1416bb1da 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -9,6 +9,7 @@ import ( "context" "errors" "fmt" + "golang.org/x/sync/errgroup" "io" "math" "os" @@ -1190,41 +1191,66 @@ func (this *Migrator) iterateChunks() error { // When hasFurtherRange is false, original table might be write locked and CalculateNextIterationRangeEndValues would hangs forever - hasFurtherRange := false - if err := this.retryOperation(func() (e error) { - hasFurtherRange, e = this.applier.CalculateNextIterationRangeEndValues() - return e - }); err != nil { - return terminateRowIteration(err) - } - if !hasFurtherRange { - atomic.StoreInt64(&hasNoFurtherRangeFlag, 1) - return terminateRowIteration(nil) + parallelSize := atomic.LoadInt64(&this.migrationContext.ChunkParallelSize) + if parallelSize == 0 { + parallelSize = 1 } - // Copy task: - applyCopyRowsFunc := func() error { - if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 { - // No need for more writes. - // This is the de-facto place where we avoid writing in the event of completed cut-over. - // There could _still_ be a race condition, but that's as close as we can get. - // What about the race condition? Well, there's actually no data integrity issue. - // when rowCopyCompleteFlag==1 that means **guaranteed** all necessary rows have been copied. - // But some are still then collected at the binary log, and these are the ones we're trying to - // not apply here. If the race condition wins over us, then we just attempt to apply onto the - // _ghost_ table, which no longer exists. So, bothering error messages and all, but no damage. 
+ + g, _ := errgroup.WithContext(context.Background()) + g.SetLimit(int(parallelSize)) + + for i := 0; i < int(parallelSize); i++ { + g.Go(func() error { + var iterationRangeValues *base.IterationRangeValues + + if err := this.retryOperation(func() (e error) { + iterationRangeValues, e = this.applier.CalculateNextIterationRangeEndValues() + return e + }); err != nil { + return err + } + + // Copy task: + applyCopyRowsFunc := func() error { + if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 { + // No need for more writes. + // This is the de-facto place where we avoid writing in the event of completed cut-over. + // There could _still_ be a race condition, but that's as close as we can get. + // What about the race condition? Well, there's actually no data integrity issue. + // when rowCopyCompleteFlag==1 that means **guaranteed** all necessary rows have been copied. + // But some are still then collected at the binary log, and these are the ones we're trying to + // not apply here. If the race condition wins over us, then we just attempt to apply onto the + // _ghost_ table, which no longer exists. So, bothering error messages and all, but no damage. + return nil + } + _, rowsAffected, _, err := this.applier.ApplyIterationInsertQuery(iterationRangeValues) + if err != nil { + return err // wrapping call will retry + } + atomic.AddInt64(&this.migrationContext.TotalRowsCopied, rowsAffected) + atomic.AddInt64(&this.migrationContext.Iteration, 1) + return nil + } + + if err := this.retryOperation(applyCopyRowsFunc); err != nil { + return err + } + + if !iterationRangeValues.HasFurtherRange { + atomic.StoreInt64(&hasNoFurtherRangeFlag, 1) + } return nil - } - _, rowsAffected, _, err := this.applier.ApplyIterationInsertQuery() - if err != nil { - return err // wrapping call will retry - } - atomic.AddInt64(&this.migrationContext.TotalRowsCopied, rowsAffected) - atomic.AddInt64(&this.migrationContext.Iteration, 1) - return nil + }) } - if err := this.retryOperation(applyCopyRowsFunc); err != nil { + + if err := g.Wait(); err != nil { return terminateRowIteration(err) } + + if atomic.LoadInt64(&hasNoFurtherRangeFlag) == 1 { + return terminateRowIteration(nil) + } + return nil } // Enqueue copy operation; to be executed by executeWriteFuncs() diff --git a/vendor/golang.org/x/sync/LICENSE b/vendor/golang.org/x/sync/LICENSE new file mode 100644 index 000000000..6a66aea5e --- /dev/null +++ b/vendor/golang.org/x/sync/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/golang.org/x/sync/PATENTS b/vendor/golang.org/x/sync/PATENTS new file mode 100644 index 000000000..733099041 --- /dev/null +++ b/vendor/golang.org/x/sync/PATENTS @@ -0,0 +1,22 @@ +Additional IP Rights Grant (Patents) + +"This implementation" means the copyrightable works distributed by +Google as part of the Go project. + +Google hereby grants to You a perpetual, worldwide, non-exclusive, +no-charge, royalty-free, irrevocable (except as stated in this section) +patent license to make, have made, use, offer to sell, sell, import, +transfer and otherwise run, modify and propagate the contents of this +implementation of Go, where such license applies only to those patent +claims, both currently owned or controlled by Google and acquired in +the future, licensable by Google that are necessarily infringed by this +implementation of Go. This grant does not include claims that would be +infringed only as a consequence of further modification of this +implementation. If you or your agent or exclusive licensee institute or +order or agree to the institution of patent litigation against any +entity (including a cross-claim or counterclaim in a lawsuit) alleging +that this implementation of Go or any code incorporated within this +implementation of Go constitutes direct or contributory patent +infringement, or inducement of patent infringement, then any patent +rights granted to you under this License for this implementation of Go +shall terminate as of the date such litigation is filed. diff --git a/vendor/golang.org/x/sync/errgroup/errgroup.go b/vendor/golang.org/x/sync/errgroup/errgroup.go new file mode 100644 index 000000000..cbee7a4e2 --- /dev/null +++ b/vendor/golang.org/x/sync/errgroup/errgroup.go @@ -0,0 +1,132 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package errgroup provides synchronization, error propagation, and Context +// cancelation for groups of goroutines working on subtasks of a common task. +package errgroup + +import ( + "context" + "fmt" + "sync" +) + +type token struct{} + +// A Group is a collection of goroutines working on subtasks that are part of +// the same overall task. +// +// A zero Group is valid, has no limit on the number of active goroutines, +// and does not cancel on error. +type Group struct { + cancel func() + + wg sync.WaitGroup + + sem chan token + + errOnce sync.Once + err error +} + +func (g *Group) done() { + if g.sem != nil { + <-g.sem + } + g.wg.Done() +} + +// WithContext returns a new Group and an associated Context derived from ctx. +// +// The derived Context is canceled the first time a function passed to Go +// returns a non-nil error or the first time Wait returns, whichever occurs +// first. 
+func WithContext(ctx context.Context) (*Group, context.Context) { + ctx, cancel := context.WithCancel(ctx) + return &Group{cancel: cancel}, ctx +} + +// Wait blocks until all function calls from the Go method have returned, then +// returns the first non-nil error (if any) from them. +func (g *Group) Wait() error { + g.wg.Wait() + if g.cancel != nil { + g.cancel() + } + return g.err +} + +// Go calls the given function in a new goroutine. +// It blocks until the new goroutine can be added without the number of +// active goroutines in the group exceeding the configured limit. +// +// The first call to return a non-nil error cancels the group's context, if the +// group was created by calling WithContext. The error will be returned by Wait. +func (g *Group) Go(f func() error) { + if g.sem != nil { + g.sem <- token{} + } + + g.wg.Add(1) + go func() { + defer g.done() + + if err := f(); err != nil { + g.errOnce.Do(func() { + g.err = err + if g.cancel != nil { + g.cancel() + } + }) + } + }() +} + +// TryGo calls the given function in a new goroutine only if the number of +// active goroutines in the group is currently below the configured limit. +// +// The return value reports whether the goroutine was started. +func (g *Group) TryGo(f func() error) bool { + if g.sem != nil { + select { + case g.sem <- token{}: + // Note: this allows barging iff channels in general allow barging. + default: + return false + } + } + + g.wg.Add(1) + go func() { + defer g.done() + + if err := f(); err != nil { + g.errOnce.Do(func() { + g.err = err + if g.cancel != nil { + g.cancel() + } + }) + } + }() + return true +} + +// SetLimit limits the number of active goroutines in this group to at most n. +// A negative value indicates no limit. +// +// Any subsequent call to the Go method will block until it can add an active +// goroutine without exceeding the configured limit. +// +// The limit must not be modified while any goroutines in the group are active. +func (g *Group) SetLimit(n int) { + if n < 0 { + g.sem = nil + return + } + if len(g.sem) != 0 { + panic(fmt.Errorf("errgroup: modify limit while %v goroutines in the group are still active", len(g.sem))) + } + g.sem = make(chan token, n) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 19cadde30..09ef4f692 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -43,6 +43,9 @@ go.uber.org/atomic # golang.org/x/net v0.17.0 ## explicit; go 1.17 golang.org/x/net/context +# golang.org/x/sync v0.1.0 +## explicit +golang.org/x/sync/errgroup # golang.org/x/sys v0.13.0 ## explicit; go 1.17 golang.org/x/sys/plan9 From f674355f3797c08e13eb5fcacd55219d841fa3cc Mon Sep 17 00:00:00 2001 From: shaohoukun Date: Fri, 15 Mar 2024 15:06:32 +0800 Subject: [PATCH 2/5] docs(feat-parallel-chunk-data): add doc --- doc/command-line-flags.md | 3 +++ go/cmd/gh-ost/main.go | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/command-line-flags.md b/doc/command-line-flags.md index ac491b268..bce8dfe52 100644 --- a/doc/command-line-flags.md +++ b/doc/command-line-flags.md @@ -64,6 +64,9 @@ It is not reliable to parse the `ALTER` statement to determine if it is instant ### binlogsyncer-max-reconnect-attempts `--binlogsyncer-max-reconnect-attempts=0`, the maximum number of attempts to re-establish a broken inspector connection for sync binlog. 
`0` or `negative number` means infinite retry, default `0` +### chunk-parallel-size +`--chunk-parallel-size=1`, the number of goroutines to execute chunks concurrently in each copy time slot, default `1`, allowed range `1`-`10`. + ### conf `--conf=/path/to/my.cnf`: file where credentials are specified. Should be in (or contain) the following format: diff --git a/go/cmd/gh-ost/main.go b/go/cmd/gh-ost/main.go index df17f51da..d61f49629 100644 --- a/go/cmd/gh-ost/main.go +++ b/go/cmd/gh-ost/main.go @@ -104,7 +104,7 @@ func main() { flag.BoolVar(&migrationContext.CutOverExponentialBackoff, "cut-over-exponential-backoff", false, "Wait exponentially longer intervals between failed cut-over attempts. Wait intervals obey a maximum configurable with 'exponential-backoff-max-interval').") exponentialBackoffMaxInterval := flag.Int64("exponential-backoff-max-interval", 64, "Maximum number of seconds to wait between attempts when performing various operations with exponential backoff.") chunkSize := flag.Int64("chunk-size", 1000, "amount of rows to handle in each iteration (allowed range: 10-100,000)") - chunkParallelSize := flag.Int64("chunk-parallel-size", 1, "number of chunks to process in parallel (allowed range: 1-100)") + chunkParallelSize := flag.Int64("chunk-parallel-size", 1, "The number of goroutines to execute chunks concurrently in each copy time slot, default 1 (allowed range: 1-10)") dmlBatchSize := flag.Int64("dml-batch-size", 10, "batch size for DML events to apply in a single transaction (range 1-100)") defaultRetries := flag.Int64("default-retries", 60, "Default number of retries for various operations before panicking") cutOverLockTimeoutSeconds := flag.Int64("cut-over-lock-timeout-seconds", 3, "Max number of seconds to hold locks on tables while attempting to cut-over (retry attempted when lock exceeds timeout)") From 18e0674f1130c13d3732e6786bc0e2b48470847a Mon Sep 17 00:00:00 2001 From: shaohoukun Date: Fri, 15 Mar 2024 15:25:34 +0800 Subject: [PATCH 3/5] fix(feat-parallel-chunk-data): skip the copy task when no further range exists --- go/logic/migrator.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 1416bb1da..3b88c3e6f 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -1210,6 +1210,11 @@ func (this *Migrator) iterateChunks() error { return err } + if !iterationRangeValues.HasFurtherRange { + atomic.StoreInt64(&hasNoFurtherRangeFlag, 1) + return nil + } + // Copy task: applyCopyRowsFunc := func() error { if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 { @@ -1236,9 +1241,6 @@ func (this *Migrator) iterateChunks() error { return err } - if !iterationRangeValues.HasFurtherRange { - atomic.StoreInt64(&hasNoFurtherRangeFlag, 1) - } return nil }) } From c19a177a6b3d5dd6f2b18772675d9eebffc03c2f Mon Sep 17 00:00:00 2001 From: shaohoukun Date: Tue, 19 Mar 2024 11:36:54 +0800 Subject: [PATCH 4/5] fix(feat-parallel-chunk-data): set applier db MaxOpenConns --- go/logic/applier.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/go/logic/applier.go b/go/logic/applier.go index 6a5fa3a08..853c011e8 100644 --- a/go/logic/applier.go +++ b/go/logic/applier.go @@ -76,6 +76,10 @@ func (this *Applier) InitDBConnections() (err error) { if this.db, _, err = mysql.GetDB(this.migrationContext.Uuid, applierUri); err != nil { return err } + chunkParallelSize := atomic.LoadInt64(&this.migrationContext.ChunkParallelSize) + if chunkParallelSize > mysql.MaxDBPoolConnections { + this.db.SetMaxOpenConns(int(chunkParallelSize)) + } singletonApplierUri :=
fmt.Sprintf("%s&timeout=0", applierUri) if this.singletonDB, _, err = mysql.GetDB(this.migrationContext.Uuid, singletonApplierUri); err != nil { return err From c0907d09589ad82a92fc8cfca3363f26ddc4b8ef Mon Sep 17 00:00:00 2001 From: shaohoukun Date: Wed, 20 Mar 2024 11:05:35 +0800 Subject: [PATCH 5/5] style(feat-parallel-chunk-data): change parallel to concurrent --- doc/command-line-flags.md | 4 ++-- go/base/context.go | 14 +++++++------- go/cmd/gh-ost/main.go | 4 ++-- go/logic/applier.go | 6 +++--- go/logic/migrator.go | 10 +++++----- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/doc/command-line-flags.md b/doc/command-line-flags.md index bce8dfe52..a54cb62e0 100644 --- a/doc/command-line-flags.md +++ b/doc/command-line-flags.md @@ -64,8 +64,8 @@ It is not reliable to parse the `ALTER` statement to determine if it is instant ### binlogsyncer-max-reconnect-attempts `--binlogsyncer-max-reconnect-attempts=0`, the maximum number of attempts to re-establish a broken inspector connection for sync binlog. `0` or `negative number` means infinite retry, default `0` -### chunk-parallel-size -`--chunk-parallel-size=1`, The number of goroutines to execute chunks concurrently in each copy time slot, default `1`, allowed range `0`-`10`. +### chunk-concurrent-size +`--chunk-concurrent-size=1`, The number of goroutines to execute chunks concurrently in each copy time slot, default `1`, allowed range `0`-`100`. ### conf diff --git a/go/base/context.go b/go/base/context.go index acbfc7b12..a18194678 100644 --- a/go/base/context.go +++ b/go/base/context.go @@ -127,7 +127,7 @@ type MigrationContext struct { HeartbeatIntervalMilliseconds int64 defaultNumRetries int64 ChunkSize int64 - ChunkParallelSize int64 + ChunkConcurrentSize int64 niceRatio float64 MaxLagMillisecondsThrottleThreshold int64 throttleControlReplicaKeys *mysql.InstanceKeyMap @@ -627,14 +627,14 @@ func (this *MigrationContext) SetChunkSize(chunkSize int64) { atomic.StoreInt64(&this.ChunkSize, chunkSize) } -func (this *MigrationContext) SetChunkParallelSize(chunkParallelSize int64) { - if chunkParallelSize < 1 { - chunkParallelSize = 1 +func (this *MigrationContext) SetChunkConcurrentSize(chunkConcurrentSize int64) { + if chunkConcurrentSize < 1 { + chunkConcurrentSize = 1 } - if chunkParallelSize > 10 { - chunkParallelSize = 10 + if chunkConcurrentSize > 100 { + chunkConcurrentSize = 100 } - atomic.StoreInt64(&this.ChunkParallelSize, chunkParallelSize) + atomic.StoreInt64(&this.ChunkConcurrentSize, chunkConcurrentSize) } func (this *MigrationContext) SetDMLBatchSize(batchSize int64) { diff --git a/go/cmd/gh-ost/main.go b/go/cmd/gh-ost/main.go index d61f49629..6dc7e27e0 100644 --- a/go/cmd/gh-ost/main.go +++ b/go/cmd/gh-ost/main.go @@ -104,7 +104,7 @@ func main() { flag.BoolVar(&migrationContext.CutOverExponentialBackoff, "cut-over-exponential-backoff", false, "Wait exponentially longer intervals between failed cut-over attempts. 
Wait intervals obey a maximum configurable with 'exponential-backoff-max-interval').") exponentialBackoffMaxInterval := flag.Int64("exponential-backoff-max-interval", 64, "Maximum number of seconds to wait between attempts when performing various operations with exponential backoff.") chunkSize := flag.Int64("chunk-size", 1000, "amount of rows to handle in each iteration (allowed range: 10-100,000)") - chunkParallelSize := flag.Int64("chunk-parallel-size", 1, "The number of goroutines to execute chunks concurrently in each copy time slot, default 1 (allowed range: 1-10)") + chunkConcurrentSize := flag.Int64("chunk-concurrent-size", 1, "The number of goroutines to execute chunks concurrently in each copy time slot, default 1 (allowed range: 1-100)") dmlBatchSize := flag.Int64("dml-batch-size", 10, "batch size for DML events to apply in a single transaction (range 1-100)") defaultRetries := flag.Int64("default-retries", 60, "Default number of retries for various operations before panicking") cutOverLockTimeoutSeconds := flag.Int64("cut-over-lock-timeout-seconds", 3, "Max number of seconds to hold locks on tables while attempting to cut-over (retry attempted when lock exceeds timeout)") @@ -295,7 +295,7 @@ func main() { migrationContext.SetHeartbeatIntervalMilliseconds(*heartbeatIntervalMillis) migrationContext.SetNiceRatio(*niceRatio) migrationContext.SetChunkSize(*chunkSize) - migrationContext.SetChunkParallelSize(*chunkParallelSize) + migrationContext.SetChunkConcurrentSize(*chunkConcurrentSize) migrationContext.SetDMLBatchSize(*dmlBatchSize) migrationContext.SetMaxLagMillisecondsThrottleThreshold(*maxLagMillis) migrationContext.SetThrottleQuery(*throttleQuery) diff --git a/go/logic/applier.go b/go/logic/applier.go index 853c011e8..4ae931c67 100644 --- a/go/logic/applier.go +++ b/go/logic/applier.go @@ -76,9 +76,9 @@ func (this *Applier) InitDBConnections() (err error) { if this.db, _, err = mysql.GetDB(this.migrationContext.Uuid, applierUri); err != nil { return err } - chunkParallelSize := atomic.LoadInt64(&this.migrationContext.ChunkParallelSize) - if chunkParallelSize > mysql.MaxDBPoolConnections { - this.db.SetMaxOpenConns(int(chunkParallelSize)) + concurrentSize := atomic.LoadInt64(&this.migrationContext.ChunkConcurrentSize) + if concurrentSize > mysql.MaxDBPoolConnections { + this.db.SetMaxOpenConns(int(concurrentSize)) } singletonApplierUri := fmt.Sprintf("%s&timeout=0", applierUri) if this.singletonDB, _, err = mysql.GetDB(this.migrationContext.Uuid, singletonApplierUri); err != nil { diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 3b88c3e6f..380282cdb 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -1191,15 +1191,15 @@ func (this *Migrator) iterateChunks() error { // When hasFurtherRange is false, original table might be write locked and CalculateNextIterationRangeEndValues would hangs forever - parallelSize := atomic.LoadInt64(&this.migrationContext.ChunkParallelSize) - if parallelSize == 0 { - parallelSize = 1 + concurrentSize := atomic.LoadInt64(&this.migrationContext.ChunkConcurrentSize) + if concurrentSize == 0 { + concurrentSize = 1 } g, _ := errgroup.WithContext(context.Background()) - g.SetLimit(int(parallelSize)) + g.SetLimit(int(concurrentSize)) - for i := 0; i < int(parallelSize); i++ { + for i := 0; i < int(concurrentSize); i++ { g.Go(func() error { var iterationRangeValues *base.IterationRangeValues
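Taken together, the series turns `iterateChunks()` from a strictly serial copy loop into a bounded fan-out: each copy time slot launches up to `chunk-concurrent-size` goroutines through `errgroup`, the next-range calculation is serialized behind `CalculateNextIterationRangeEndValuesLock` so every worker receives a disjoint chunk, and only the chunk INSERTs run concurrently. Below is a minimal, self-contained sketch of that pattern. The names `chunk` and `nextChunk`, and the integer range endpoints standing in for `sql.ColumnValues`, are illustrative, not gh-ost APIs; unlike the patch, which re-enters the fan-out once per copy time slot, the workers here loop until the range is exhausted.

```go
// Minimal sketch of the fan-out pattern the migrator patch introduces,
// assuming integer range endpoints and a hypothetical nextChunk() in place
// of CalculateNextIterationRangeEndValues.
package main

import (
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

type chunk struct {
	min, max   int
	includeMin bool // true only for the very first chunk of the table
	hasRange   bool // false once the range is exhausted
}

func main() {
	const concurrentSize = 4 // the bound --chunk-concurrent-size controls
	const chunkSize = 100
	const tableMax = 1000

	var mu sync.Mutex // plays the role of CalculateNextIterationRangeEndValuesLock
	cursor := 0

	// nextChunk hands out the next disjoint range; serializing this step is
	// what keeps concurrent workers from copying overlapping rows.
	nextChunk := func() chunk {
		mu.Lock()
		defer mu.Unlock()
		if cursor >= tableMax {
			return chunk{} // hasRange=false: no further range
		}
		c := chunk{min: cursor, max: cursor + chunkSize, includeMin: cursor == 0, hasRange: true}
		cursor = c.max
		return c
	}

	var g errgroup.Group
	g.SetLimit(concurrentSize)
	for i := 0; i < concurrentSize; i++ {
		g.Go(func() error {
			for {
				c := nextChunk()
				if !c.hasRange {
					return nil // corresponds to setting hasNoFurtherRangeFlag
				}
				// Stand-in for ApplyIterationInsertQuery on this chunk.
				fmt.Printf("copy chunk (%d..%d], includeMin=%v\n", c.min, c.max, c.includeMin)
			}
		})
	}
	if err := g.Wait(); err != nil { // first non-nil worker error, as in the patch
		fmt.Println("row copy failed:", err)
	}
}
```

Two observations on the patch's use of this pattern: calling `g.SetLimit(int(concurrentSize))` while launching exactly `concurrentSize` goroutines means `Go` never actually blocks, so the limit is a safety net rather than a scheduler; and the derived context is discarded (`g, _ := errgroup.WithContext(...)`), so a failing worker does not cancel its peers mid-query, they finish their current chunk and `Wait` returns the first error.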
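The new `IterationRangeValues.IsIncludeMinValues` field replaces the old `this.migrationContext.GetIteration() == 0` test at both call sites. That test identified the first chunk, whose lower bound must itself be copied; with several goroutines incrementing `Iteration` concurrently it would no longer reliably mark the first chunk, so the patch records the inclusive/exclusive decision inside each range value at the moment the range is claimed. A sketch of the boundary predicate this flag selects between, assuming a single-column unique key (the real query builders in `go/sql` handle composite keys; the helper name is illustrative):

```go
// Sketch of the chunk boundary predicate selected by IsIncludeMinValues,
// assuming a single-column unique key.
package main

import "fmt"

func rangePredicate(col string, includeMin bool) string {
	lower := ">" // later chunks: min is the previous chunk's max, already copied
	if includeMin {
		lower = ">=" // first chunk: the global range minimum itself must be copied
	}
	return fmt.Sprintf("(%s %s ?) and (%s <= ?)", col, lower, col)
}

func main() {
	fmt.Println(rangePredicate("id", true))  // (id >= ?) and (id <= ?)
	fmt.Println(rangePredicate("id", false)) // (id > ?) and (id <= ?)
}
```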
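Patch 4 exists because each concurrent chunk INSERT needs its own applier connection: if the pool ceiling (`mysql.MaxDBPoolConnections`) sits below the requested concurrency, workers serialize on connection checkout and the extra goroutines buy nothing. A sketch of that guard, with `defaultPoolSize` standing in for the real constant, which this sketch does not import:

```go
// Sketch of the pool-sizing guard from patch 4; defaultPoolSize stands in
// for mysql.MaxDBPoolConnections.
package main

import "database/sql"

const defaultPoolSize = 3

// widenPoolForConcurrency raises the connection ceiling only when the
// requested chunk concurrency would otherwise exceed it.
func widenPoolForConcurrency(db *sql.DB, chunkConcurrentSize int64) {
	if chunkConcurrentSize > defaultPoolSize {
		db.SetMaxOpenConns(int(chunkConcurrentSize)) // one connection per concurrent chunk writer
	}
}

func main() {}
```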