From 94c70c77534ae1f99fe64f501b6f9f8c7bdda619 Mon Sep 17 00:00:00 2001
From: coldWater <254244460@qq.com>
Date: Fri, 15 Mar 2024 18:59:11 +0800
Subject: [PATCH] Refactor graceful manager, fix misused WaitGroup (#29738)

Follow #29629

(cherry picked from commit d08f4360c96e130e0454b76ecef9405f2bd312a1)
---
 modules/graceful/manager.go         |  5 ++-
 modules/graceful/manager_common.go  |  5 +--
 modules/graceful/manager_unix.go    | 44 ++++++++++++------------
 modules/graceful/manager_windows.go | 52 +++++++++++++++--------------
 4 files changed, 55 insertions(+), 51 deletions(-)

diff --git a/modules/graceful/manager.go b/modules/graceful/manager.go
index f3f412863a..3f1115066a 100644
--- a/modules/graceful/manager.go
+++ b/modules/graceful/manager.go
@@ -233,7 +233,10 @@ func (g *Manager) setStateTransition(old, new state) bool {
 // At the moment the total number of servers (numberOfServersToCreate) are pre-defined as a const before global init,
 // so this function MUST be called if a server is not used.
 func (g *Manager) InformCleanup() {
-	g.createServerWaitGroup.Done()
+	g.createServerCond.L.Lock()
+	defer g.createServerCond.L.Unlock()
+	g.createdServer++
+	g.createServerCond.Signal()
 }
 
 // Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating
diff --git a/modules/graceful/manager_common.go b/modules/graceful/manager_common.go
index 27196e1531..f6dbcc748d 100644
--- a/modules/graceful/manager_common.go
+++ b/modules/graceful/manager_common.go
@@ -42,8 +42,9 @@ type Manager struct {
 	terminateCtxCancel     context.CancelFunc
 	managerCtxCancel       context.CancelFunc
 	runningServerWaitGroup sync.WaitGroup
-	createServerWaitGroup  sync.WaitGroup
 	terminateWaitGroup     sync.WaitGroup
+	createServerCond       sync.Cond
+	createdServer          int
 	shutdownRequested      chan struct{}
 
 	toRunAtShutdown  []func()
@@ -52,7 +53,7 @@ type Manager struct {
 
 func newGracefulManager(ctx context.Context) *Manager {
 	manager := &Manager{ctx: ctx, shutdownRequested: make(chan struct{})}
-	manager.createServerWaitGroup.Add(numberOfServersToCreate)
+	manager.createServerCond.L = &sync.Mutex{}
 	manager.prepare(ctx)
 	manager.start()
 	return manager
diff --git a/modules/graceful/manager_unix.go b/modules/graceful/manager_unix.go
index edf5fc248f..f49c42650c 100644
--- a/modules/graceful/manager_unix.go
+++ b/modules/graceful/manager_unix.go
@@ -57,20 +57,27 @@ func (g *Manager) start() {
 	// Handle clean up of unused provided listeners and delayed start-up
 	startupDone := make(chan struct{})
 	go func() {
-		defer close(startupDone)
-		// Wait till we're done getting all the listeners and then close the unused ones
-		func() {
-			// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
-			// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
-			// There is no clear solution besides a complete rewriting of the "manager"
-			defer func() {
-				_ = recover()
-			}()
-			g.createServerWaitGroup.Wait()
+		defer func() {
+			close(startupDone)
+			// Close the unused listeners and ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
+			_ = CloseProvidedListeners()
 		}()
-		// Ignore the error here there's not much we can do with it, they're logged in the CloseProvidedListeners function
-		_ = CloseProvidedListeners()
-		g.notify(readyMsg)
+		// Wait for all servers to be created
+		g.createServerCond.L.Lock()
+		for {
+			if g.createdServer >= numberOfServersToCreate {
+				g.createServerCond.L.Unlock()
+				g.notify(readyMsg)
+				return
+			}
+			select {
+			case <-g.IsShutdown():
+				g.createServerCond.L.Unlock()
+				return
+			default:
+			}
+			g.createServerCond.Wait()
+		}
 	}()
 	if setting.StartupTimeout > 0 {
 		go func() {
@@ -78,16 +85,7 @@
 			case <-startupDone:
 				return
 			case <-g.IsShutdown():
-				func() {
-					// When WaitGroup counter goes negative it will panic - we don't care about this so we can just ignore it.
-					defer func() {
-						_ = recover()
-					}()
-					// Ensure that the createServerWaitGroup stops waiting
-					for {
-						g.createServerWaitGroup.Done()
-					}
-				}()
+				g.createServerCond.Signal()
 				return
 			case <-time.After(setting.StartupTimeout):
 				log.Error("Startup took too long! Shutting down")
diff --git a/modules/graceful/manager_windows.go b/modules/graceful/manager_windows.go
index ecf30af3f3..d776e0e9f9 100644
--- a/modules/graceful/manager_windows.go
+++ b/modules/graceful/manager_windows.go
@@ -149,33 +149,35 @@ hammerLoop:
 func (g *Manager) awaitServer(limit time.Duration) bool {
 	c := make(chan struct{})
 	go func() {
-		defer close(c)
-		func() {
-			// FIXME: there is a fundamental design problem of the "manager" and the "wait group".
-			// If nothing has started, the "Wait" just panics: sync: WaitGroup is reused before previous Wait has returned
-			// There is no clear solution besides a complete rewriting of the "manager"
-			defer func() {
-				_ = recover()
-			}()
-			g.createServerWaitGroup.Wait()
-		}()
+		g.createServerCond.L.Lock()
+		for {
+			if g.createdServer >= numberOfServersToCreate {
+				g.createServerCond.L.Unlock()
+				close(c)
+				return
+			}
+			select {
+			case <-g.IsShutdown():
+				g.createServerCond.L.Unlock()
+				return
+			default:
+			}
+			g.createServerCond.Wait()
+		}
 	}()
+
+	var tc <-chan time.Time
 	if limit > 0 {
-		select {
-		case <-c:
-			return true // completed normally
-		case <-time.After(limit):
-			return false // timed out
-		case <-g.IsShutdown():
-			return false
-		}
-	} else {
-		select {
-		case <-c:
-			return true // completed normally
-		case <-g.IsShutdown():
-			return false
-		}
+		tc = time.After(limit)
+	}
+	select {
+	case <-c:
+		return true // completed normally
+	case <-tc:
+		return false // timed out
+	case <-g.IsShutdown():
+		g.createServerCond.Signal()
+		return false
 	}
 }
 
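The patch replaces createServerWaitGroup, whose counter could be reused or driven negative, with a mutex-guarded counter and a sync.Cond: every created server increments the counter and calls Signal(), and waiters re-check the predicate in a loop around Wait(). The sketch below shows that pattern in isolation; it is illustrative only and not code from the patch, and serverTracker, markCreated, and waitForAll are hypothetical names that do not exist in the Gitea codebase.

package main

import (
	"fmt"
	"sync"
)

// serverTracker is a hypothetical stand-in for the manager's createServerCond / createdServer pair.
type serverTracker struct {
	cond    sync.Cond
	created int
}

func newServerTracker() *serverTracker {
	t := &serverTracker{}
	t.cond.L = &sync.Mutex{} // initialize the Locker the same way newGracefulManager does in the patch
	return t
}

// markCreated mirrors InformCleanup: bump the counter under the lock, then wake one waiter.
func (t *serverTracker) markCreated() {
	t.cond.L.Lock()
	defer t.cond.L.Unlock()
	t.created++
	t.cond.Signal()
}

// waitForAll mirrors the wait loops in manager_unix.go / manager_windows.go:
// re-check the predicate after every wake-up and bail out once shutdown is requested.
func (t *serverTracker) waitForAll(target int, shutdown <-chan struct{}) bool {
	t.cond.L.Lock()
	defer t.cond.L.Unlock()
	for {
		if t.created >= target {
			return true
		}
		select {
		case <-shutdown:
			return false
		default:
		}
		t.cond.Wait() // releases the lock while blocked, reacquires it before returning
	}
}

func main() {
	const target = 3
	tracker := newServerTracker()
	shutdown := make(chan struct{})

	done := make(chan bool)
	go func() { done <- tracker.waitForAll(target, shutdown) }()

	for i := 0; i < target; i++ {
		tracker.markCreated() // e.g. one call per created server
	}
	fmt.Println("all servers created:", <-done) // prints: all servers created: true
}

One property the patch also relies on: a goroutine blocked in Wait() only wakes when something calls Signal() or Broadcast(), which is why the shutdown paths in manager_unix.go and manager_windows.go signal createServerCond before returning.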