From 84c8ab9fd109145d04d0c58cadd46160f1ee9263 Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Mon, 29 May 2023 10:52:32 +0800 Subject: [PATCH] Help to recover from corrupted levelqueue (#24912) gitea.com experienced the corrupted LevelQueue bug again. I think the problem is clear now: if the keys in LevelDB went out-of-sync, the LevelQueue itself doesn't have the ability to recover, eg: * LevelQueue.Len() reports 100 * LevelQueue.LPop() reports ErrNotFound = errors.New("no key found") So it needs to dive into the LevelDB to remove all keys to recover the corrupted LevelQueue. More comments are in TestCorruptedLevelQueue. --- modules/queue/base_levelqueue.go | 39 +++++++++++------ modules/queue/base_levelqueue_common.go | 17 ++++---- modules/queue/base_levelqueue_test.go | 55 +++++++++++++++++++++++ modules/queue/base_levelqueue_unique.go | 58 +++++++++++-------------- modules/queue/lqinternal/lqinternal.go | 48 ++++++++++++++++++++ 5 files changed, 162 insertions(+), 55 deletions(-) create mode 100644 modules/queue/lqinternal/lqinternal.go diff --git a/modules/queue/base_levelqueue.go b/modules/queue/base_levelqueue.go index afde502116..efc57c9c9c 100644 --- a/modules/queue/base_levelqueue.go +++ b/modules/queue/base_levelqueue.go @@ -5,16 +5,21 @@ package queue import ( "context" + "sync/atomic" "code.gitea.io/gitea/modules/nosql" + "code.gitea.io/gitea/modules/queue/lqinternal" "gitea.com/lunny/levelqueue" + "github.com/syndtr/goleveldb/leveldb" ) type baseLevelQueue struct { - internal *levelqueue.Queue - conn string - cfg *BaseConfig + internal atomic.Pointer[levelqueue.Queue] + + conn string + cfg *BaseConfig + db *leveldb.DB } var _ baseQueue = (*baseLevelQueue)(nil) @@ -31,21 +36,23 @@ func newBaseLevelQueueSimple(cfg *BaseConfig) (baseQueue, error) { if err != nil { return nil, err } - q := &baseLevelQueue{conn: conn, cfg: cfg} - q.internal, err = levelqueue.NewQueue(db, []byte(cfg.QueueFullName), false) + q := &baseLevelQueue{conn: conn, cfg: cfg, db: db} + lq, err := levelqueue.NewQueue(db, []byte(cfg.QueueFullName), false) if err != nil { return nil, err } - + q.internal.Store(lq) return q, nil } func (q *baseLevelQueue) PushItem(ctx context.Context, data []byte) error { - return baseLevelQueueCommon(q.cfg, q.internal, nil).PushItem(ctx, data) + c := baseLevelQueueCommon(q.cfg, nil, func() baseLevelQueuePushPoper { return q.internal.Load() }) + return c.PushItem(ctx, data) } func (q *baseLevelQueue) PopItem(ctx context.Context) ([]byte, error) { - return baseLevelQueueCommon(q.cfg, q.internal, nil).PopItem(ctx) + c := baseLevelQueueCommon(q.cfg, nil, func() baseLevelQueuePushPoper { return q.internal.Load() }) + return c.PopItem(ctx) } func (q *baseLevelQueue) HasItem(ctx context.Context, data []byte) (bool, error) { @@ -53,20 +60,24 @@ func (q *baseLevelQueue) HasItem(ctx context.Context, data []byte) (bool, error) } func (q *baseLevelQueue) Len(ctx context.Context) (int, error) { - return int(q.internal.Len()), nil + return int(q.internal.Load().Len()), nil } func (q *baseLevelQueue) Close() error { - err := q.internal.Close() + err := q.internal.Load().Close() _ = nosql.GetManager().CloseLevelDB(q.conn) + q.db = nil // the db is not managed by us, it's managed by the nosql manager return err } func (q *baseLevelQueue) RemoveAll(ctx context.Context) error { - for q.internal.Len() > 0 { - if _, err := q.internal.LPop(); err != nil { - return err - } + lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.QueueFullName)) + lq, err := levelqueue.NewQueue(q.db, []byte(q.cfg.QueueFullName), false) + if err != nil { + return err } + old := q.internal.Load() + q.internal.Store(lq) + _ = old.Close() // Not ideal for concurrency. Luckily, the levelqueue only sets its db=nil because it doesn't manage the db, so far so good return nil } diff --git a/modules/queue/base_levelqueue_common.go b/modules/queue/base_levelqueue_common.go index 409a965517..78d3b85a8a 100644 --- a/modules/queue/base_levelqueue_common.go +++ b/modules/queue/base_levelqueue_common.go @@ -17,6 +17,7 @@ import ( "github.com/syndtr/goleveldb/leveldb" ) +// baseLevelQueuePushPoper is the common interface for levelqueue.Queue and levelqueue.UniqueQueue type baseLevelQueuePushPoper interface { RPush(data []byte) error LPop() ([]byte, error) @@ -24,9 +25,9 @@ type baseLevelQueuePushPoper interface { } type baseLevelQueueCommonImpl struct { - length int - internal baseLevelQueuePushPoper - mu *sync.Mutex + length int + internalFunc func() baseLevelQueuePushPoper + mu *sync.Mutex } func (q *baseLevelQueueCommonImpl) PushItem(ctx context.Context, data []byte) error { @@ -36,11 +37,11 @@ func (q *baseLevelQueueCommonImpl) PushItem(ctx context.Context, data []byte) er defer q.mu.Unlock() } - cnt := int(q.internal.Len()) + cnt := int(q.internalFunc().Len()) if cnt >= q.length { return true, nil } - retry, err = false, q.internal.RPush(data) + retry, err = false, q.internalFunc().RPush(data) if err == levelqueue.ErrAlreadyInQueue { err = ErrAlreadyInQueue } @@ -55,7 +56,7 @@ func (q *baseLevelQueueCommonImpl) PopItem(ctx context.Context) ([]byte, error) defer q.mu.Unlock() } - data, err = q.internal.LPop() + data, err = q.internalFunc().LPop() if err == levelqueue.ErrNotFound { return true, nil, nil } @@ -66,8 +67,8 @@ func (q *baseLevelQueueCommonImpl) PopItem(ctx context.Context) ([]byte, error) }) } -func baseLevelQueueCommon(cfg *BaseConfig, internal baseLevelQueuePushPoper, mu *sync.Mutex) *baseLevelQueueCommonImpl { - return &baseLevelQueueCommonImpl{length: cfg.Length, internal: internal} +func baseLevelQueueCommon(cfg *BaseConfig, mu *sync.Mutex, internalFunc func() baseLevelQueuePushPoper) *baseLevelQueueCommonImpl { + return &baseLevelQueueCommonImpl{length: cfg.Length, mu: mu, internalFunc: internalFunc} } func prepareLevelDB(cfg *BaseConfig) (conn string, db *leveldb.DB, err error) { diff --git a/modules/queue/base_levelqueue_test.go b/modules/queue/base_levelqueue_test.go index 712a0892cd..b881802ca2 100644 --- a/modules/queue/base_levelqueue_test.go +++ b/modules/queue/base_levelqueue_test.go @@ -6,9 +6,12 @@ package queue import ( "testing" + "code.gitea.io/gitea/modules/queue/lqinternal" "code.gitea.io/gitea/modules/setting" + "gitea.com/lunny/levelqueue" "github.com/stretchr/testify/assert" + "github.com/syndtr/goleveldb/leveldb" ) func TestBaseLevelDB(t *testing.T) { @@ -21,3 +24,55 @@ func TestBaseLevelDB(t *testing.T) { testQueueBasic(t, newBaseLevelQueueSimple, toBaseConfig("baseLevelQueue", setting.QueueSettings{Datadir: t.TempDir() + "/queue-test", Length: 10}), false) testQueueBasic(t, newBaseLevelQueueUnique, toBaseConfig("baseLevelQueueUnique", setting.QueueSettings{ConnStr: "leveldb://" + t.TempDir() + "/queue-test", Length: 10}), true) } + +func TestCorruptedLevelQueue(t *testing.T) { + // sometimes the levelqueue could be in a corrupted state, this test is to make sure it can recover from it + dbDir := t.TempDir() + "/levelqueue-test" + db, err := leveldb.OpenFile(dbDir, nil) + if !assert.NoError(t, err) { + return + } + defer db.Close() + + assert.NoError(t, db.Put([]byte("other-key"), []byte("other-value"), nil)) + + nameQueuePrefix := []byte("queue_name") + nameSetPrefix := []byte("set_name") + lq, err := levelqueue.NewUniqueQueue(db, nameQueuePrefix, nameSetPrefix, false) + assert.NoError(t, err) + assert.NoError(t, lq.RPush([]byte("item-1"))) + + itemKey := lqinternal.QueueItemKeyBytes(nameQueuePrefix, 1) + itemValue, err := db.Get(itemKey, nil) + assert.NoError(t, err) + assert.Equal(t, []byte("item-1"), itemValue) + + // there should be 5 keys in db: queue low, queue high, 1 queue item, 1 set item, and "other-key" + keys := lqinternal.ListLevelQueueKeys(db) + assert.Len(t, keys, 5) + + // delete the queue item key, to corrupt the queue + assert.NoError(t, db.Delete(itemKey, nil)) + // now the queue is corrupted, it never works again + _, err = lq.LPop() + assert.ErrorIs(t, err, levelqueue.ErrNotFound) + assert.NoError(t, lq.Close()) + + // remove all the queue related keys to reset the queue + lqinternal.RemoveLevelQueueKeys(db, nameQueuePrefix) + lqinternal.RemoveLevelQueueKeys(db, nameSetPrefix) + // now there should be only 1 key in db: "other-key" + keys = lqinternal.ListLevelQueueKeys(db) + assert.Len(t, keys, 1) + assert.Equal(t, []byte("other-key"), keys[0]) + + // re-create a queue from db + lq, err = levelqueue.NewUniqueQueue(db, nameQueuePrefix, nameSetPrefix, false) + assert.NoError(t, err) + assert.NoError(t, lq.RPush([]byte("item-new-1"))) + // now the queue works again + itemValue, err = lq.LPop() + assert.NoError(t, err) + assert.Equal(t, []byte("item-new-1"), itemValue) + assert.NoError(t, lq.Close()) +} diff --git a/modules/queue/base_levelqueue_unique.go b/modules/queue/base_levelqueue_unique.go index 1acd504e32..968a4e98d4 100644 --- a/modules/queue/base_levelqueue_unique.go +++ b/modules/queue/base_levelqueue_unique.go @@ -6,18 +6,21 @@ package queue import ( "context" "sync" - "unsafe" + "sync/atomic" "code.gitea.io/gitea/modules/nosql" + "code.gitea.io/gitea/modules/queue/lqinternal" "gitea.com/lunny/levelqueue" "github.com/syndtr/goleveldb/leveldb" ) type baseLevelQueueUnique struct { - internal *levelqueue.UniqueQueue - conn string - cfg *BaseConfig + internal atomic.Pointer[levelqueue.UniqueQueue] + + conn string + cfg *BaseConfig + db *leveldb.DB mu sync.Mutex // the levelqueue.UniqueQueue is not thread-safe, there is no mutex protecting the underlying queue&set together } @@ -29,39 +32,42 @@ func newBaseLevelQueueUnique(cfg *BaseConfig) (baseQueue, error) { if err != nil { return nil, err } - q := &baseLevelQueueUnique{conn: conn, cfg: cfg} - q.internal, err = levelqueue.NewUniqueQueue(db, []byte(cfg.QueueFullName), []byte(cfg.SetFullName), false) + q := &baseLevelQueueUnique{conn: conn, cfg: cfg, db: db} + lq, err := levelqueue.NewUniqueQueue(db, []byte(cfg.QueueFullName), []byte(cfg.SetFullName), false) if err != nil { return nil, err } - + q.internal.Store(lq) return q, nil } func (q *baseLevelQueueUnique) PushItem(ctx context.Context, data []byte) error { - return baseLevelQueueCommon(q.cfg, q.internal, &q.mu).PushItem(ctx, data) + c := baseLevelQueueCommon(q.cfg, &q.mu, func() baseLevelQueuePushPoper { return q.internal.Load() }) + return c.PushItem(ctx, data) } func (q *baseLevelQueueUnique) PopItem(ctx context.Context) ([]byte, error) { - return baseLevelQueueCommon(q.cfg, q.internal, &q.mu).PopItem(ctx) + c := baseLevelQueueCommon(q.cfg, &q.mu, func() baseLevelQueuePushPoper { return q.internal.Load() }) + return c.PopItem(ctx) } func (q *baseLevelQueueUnique) HasItem(ctx context.Context, data []byte) (bool, error) { q.mu.Lock() defer q.mu.Unlock() - return q.internal.Has(data) + return q.internal.Load().Has(data) } func (q *baseLevelQueueUnique) Len(ctx context.Context) (int, error) { q.mu.Lock() defer q.mu.Unlock() - return int(q.internal.Len()), nil + return int(q.internal.Load().Len()), nil } func (q *baseLevelQueueUnique) Close() error { q.mu.Lock() defer q.mu.Unlock() - err := q.internal.Close() + err := q.internal.Load().Close() + q.db = nil // the db is not managed by us, it's managed by the nosql manager _ = nosql.GetManager().CloseLevelDB(q.conn) return err } @@ -69,28 +75,14 @@ func (q *baseLevelQueueUnique) Close() error { func (q *baseLevelQueueUnique) RemoveAll(ctx context.Context) error { q.mu.Lock() defer q.mu.Unlock() - - type levelUniqueQueue struct { - q *levelqueue.Queue - set *levelqueue.Set - db *leveldb.DB - } - lq := (*levelUniqueQueue)(unsafe.Pointer(q.internal)) - - for lq.q.Len() > 0 { - if _, err := lq.q.LPop(); err != nil { - return err - } - } - - // the "set" must be cleared after the "list" because there is no transaction. - // it's better to have duplicate items than losing items. - members, err := lq.set.Members() + lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.QueueFullName)) + lqinternal.RemoveLevelQueueKeys(q.db, []byte(q.cfg.SetFullName)) + lq, err := levelqueue.NewUniqueQueue(q.db, []byte(q.cfg.QueueFullName), []byte(q.cfg.SetFullName), false) if err != nil { - return err // seriously corrupted - } - for _, v := range members { - _, _ = lq.set.Remove(v) + return err } + old := q.internal.Load() + q.internal.Store(lq) + _ = old.Close() // Not ideal for concurrency. Luckily, the levelqueue only sets its db=nil because it doesn't manage the db, so far so good return nil } diff --git a/modules/queue/lqinternal/lqinternal.go b/modules/queue/lqinternal/lqinternal.go new file mode 100644 index 0000000000..89aa4e5989 --- /dev/null +++ b/modules/queue/lqinternal/lqinternal.go @@ -0,0 +1,48 @@ +// Copyright 2023 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package lqinternal + +import ( + "bytes" + "encoding/binary" + + "github.com/syndtr/goleveldb/leveldb" + "github.com/syndtr/goleveldb/leveldb/opt" +) + +func QueueItemIDBytes(id int64) []byte { + buf := make([]byte, 8) + binary.PutVarint(buf, id) + return buf +} + +func QueueItemKeyBytes(prefix []byte, id int64) []byte { + key := make([]byte, len(prefix), len(prefix)+1+8) + copy(key, prefix) + key = append(key, '-') + return append(key, QueueItemIDBytes(id)...) +} + +func RemoveLevelQueueKeys(db *leveldb.DB, namePrefix []byte) { + keyPrefix := make([]byte, len(namePrefix)+1) + copy(keyPrefix, namePrefix) + keyPrefix[len(namePrefix)] = '-' + + it := db.NewIterator(nil, &opt.ReadOptions{Strict: opt.NoStrict}) + defer it.Release() + for it.Next() { + if bytes.HasPrefix(it.Key(), keyPrefix) { + _ = db.Delete(it.Key(), nil) + } + } +} + +func ListLevelQueueKeys(db *leveldb.DB) (res [][]byte) { + it := db.NewIterator(nil, &opt.ReadOptions{Strict: opt.NoStrict}) + defer it.Release() + for it.Next() { + res = append(res, it.Key()) + } + return res +}