heal: Avoid marking a bucket as done when remote drives are offline (#19587)

This commit is contained in:
Anis Eleuch 2024-04-26 07:32:14 +01:00 committed by GitHub
parent f4f1c42cba
commit 135874ebdc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 24 additions and 22 deletions

View File

@ -44,8 +44,8 @@ const (
healingMetricCheckAbandonedParts
)
func (er erasureObjects) listAndHeal(bucket, prefix string, scanMode madmin.HealScanMode, healEntry func(string, metaCacheEntry, madmin.HealScanMode) error) error {
ctx, cancel := context.WithCancel(context.Background())
func (er erasureObjects) listAndHeal(ctx context.Context, bucket, prefix string, scanMode madmin.HealScanMode, healEntry func(string, metaCacheEntry, madmin.HealScanMode) error) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
disks, _ := er.getOnlineDisksWithHealing(false)

View File

@ -2270,7 +2270,7 @@ func (z *erasureServerPools) HealObjects(ctx context.Context, bucket, prefix str
go func(idx int, set *erasureObjects) {
defer wg.Done()
errs[idx] = set.listAndHeal(bucket, prefix, opts.ScanMode, healEntry)
errs[idx] = set.listAndHeal(ctx, bucket, prefix, opts.ScanMode, healEntry)
}(idx, set)
}
wg.Wait()

View File

@ -272,11 +272,11 @@ func (er erasureObjects) LocalStorageInfo(ctx context.Context, metrics bool) Sto
}
// getOnlineDisksWithHealingAndInfo - returns online disks, their disk info and the number of disks found in healing state.
// Disks are randomly ordered, but in the following groups:
// Disks are ordered in the following groups:
// - Non-scanning disks
// - Non-healing disks
// - Healing disks (if inclHealing is true)
func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (newDisks []StorageAPI, newInfos []DiskInfo, healing bool) {
func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (newDisks []StorageAPI, newInfos []DiskInfo, healing int) {
var wg sync.WaitGroup
disks := er.getDisks()
infos := make([]DiskInfo, len(disks))
@ -315,7 +315,7 @@ func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (new
continue
}
if info.Healing {
healing = true
healing++
if inclHealing {
healingDisks = append(healingDisks, disks[i])
healingInfos = append(healingInfos, infos[i])
@ -343,9 +343,9 @@ func (er erasureObjects) getOnlineDisksWithHealingAndInfo(inclHealing bool) (new
return newDisks, newInfos, healing
}
func (er erasureObjects) getOnlineDisksWithHealing(inclHealing bool) (newDisks []StorageAPI, healing bool) {
newDisks, _, healing = er.getOnlineDisksWithHealingAndInfo(inclHealing)
return
// getOnlineDisksWithHealing returns the disks reported by
// getOnlineDisksWithHealingAndInfo together with a flag that is true
// when at least one disk was counted as healing.
func (er erasureObjects) getOnlineDisksWithHealing(inclHealing bool) ([]StorageAPI, bool) {
	onlineDisks, _, healingCount := er.getOnlineDisksWithHealingAndInfo(inclHealing)
	anyHealing := healingCount > 0
	return onlineDisks, anyHealing
}
// Clean up previously deleted objects from .minio.sys/tmp/.trash/

View File

@ -259,12 +259,17 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
bucket, humanize.Ordinal(er.setIndex+1))
}
disks, _ := er.getOnlineDisksWithHealing(false)
if len(disks) == 0 {
// No object healing necessary
tracker.bucketDone(bucket)
healingLogIf(ctx, tracker.update(ctx))
continue
disks, _, healing := er.getOnlineDisksWithHealingAndInfo(true)
if len(disks) == healing {
// All drives in this erasure set were reformatted for some reason; abort healing and mark it as successful
healingLogIf(ctx, errors.New("all drives are in healing state, aborting.."))
return nil
}
disks = disks[:len(disks)-healing] // healing drives are always at the end of the list
if len(disks) < er.setDriveCount/2 {
return fmt.Errorf("not enough drives (found=%d, healing=%d, total=%d) are available to heal `%s`", len(disks), healing, er.setDriveCount, tracker.disk.String())
}
rand.Shuffle(len(disks), func(i, j int) {
@ -465,27 +470,24 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
waitForLowHTTPReq()
}
actualBucket, prefix := path2BucketObject(bucket)
// How to resolve partial results.
resolver := metadataResolutionParams{
dirQuorum: 1,
objQuorum: 1,
bucket: actualBucket,
bucket: bucket,
}
err = listPathRaw(ctx, listPathRawOptions{
disks: disks,
fallbackDisks: fallbackDisks,
bucket: actualBucket,
path: prefix,
bucket: bucket,
recursive: true,
forwardTo: forwardTo,
minDisks: 1,
reportNotFound: false,
agreed: func(entry metaCacheEntry) {
jt.Take()
go healEntry(actualBucket, entry)
go healEntry(bucket, entry)
},
partial: func(entries metaCacheEntries, _ []error) {
entry, ok := entries.resolve(&resolver)
@ -495,7 +497,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
entry, _ = entries.firstFound()
}
jt.Take()
go healEntry(actualBucket, *entry)
go healEntry(bucket, *entry)
},
finished: nil,
})