heal: calculate the number of workers based on NRRequests (#17945)

Anis Eleuch 2023-09-11 14:48:54 -07:00 committed by GitHub
parent 9878031cfd
commit 41de53996b
10 changed files with 131 additions and 61 deletions

View File

@ -683,13 +683,6 @@ func (h *healSequence) healSequenceStart(objAPI ObjectLayer) {
}
}
func (h *healSequence) logHeal(healType madmin.HealItemType) {
h.mutex.Lock()
h.scannedItemsMap[healType]++
h.lastHealActivity = UTCNow()
h.mutex.Unlock()
}
func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItemType) error {
// Send heal request
task := healTask{

View File

@ -20,6 +20,7 @@ package cmd
import (
"context"
"fmt"
"runtime"
"sort"
"time"
@ -30,6 +31,7 @@ import (
"github.com/minio/minio/internal/logger"
"github.com/minio/pkg/v2/console"
"github.com/minio/pkg/v2/wildcard"
"github.com/minio/pkg/v2/workers"
)
const (
@ -132,30 +134,8 @@ func getLocalBackgroundHealStatus(ctx context.Context, o ObjectLayer) (madmin.Bg
return status, true
}
func mustGetHealSequence(ctx context.Context) *healSequence {
// Get background heal sequence to send elements to heal
for {
globalHealStateLK.RLock()
hstate := globalBackgroundHealState
globalHealStateLK.RUnlock()
if hstate == nil {
time.Sleep(time.Second)
continue
}
bgSeq, ok := hstate.getHealSequenceByToken(bgHealingUUID)
if !ok {
time.Sleep(time.Second)
continue
}
return bgSeq
}
}
// healErasureSet lists and heals all objects in a specific erasure set
func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker *healingTracker) error {
bgSeq := mustGetHealSequence(ctx)
scanMode := madmin.HealNormalScan
// Make sure to copy since `buckets slice`
@ -173,6 +153,30 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
}
}
info, err := tracker.disk.DiskInfo(ctx, false)
if err != nil {
return fmt.Errorf("unable to get disk information before healing it: %w", err)
}
var numHealers uint64
if numCores := uint64(runtime.GOMAXPROCS(0)); info.NRRequests > numCores {
numHealers = numCores / 4
} else {
numHealers = info.NRRequests / 4
}
if numHealers < 4 {
numHealers = 4
}
// allow overriding this value as well..
if v := globalHealConfig.GetWorkers(); v > 0 {
numHealers = uint64(v)
}
logger.Info(fmt.Sprintf("Healing drive '%s' - use %d parallel workers.", tracker.disk.String(), numHealers))
jt, _ := workers.New(int(numHealers))
var retErr error
// Heal all buckets with all objects
for _, bucket := range healBuckets {
@ -267,6 +271,8 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
// Note: updates from healEntry to tracker must be sent on results channel.
healEntry := func(bucket string, entry metaCacheEntry) {
defer jt.Give()
if entry.name == "" && len(entry.metadata) == 0 {
// ignore entries that don't have metadata.
return
@ -291,14 +297,17 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
}
}
// erasureObjects layer needs object names to be encoded
encodedEntryName := encodeDirObject(entry.name)
var result healEntryResult
fivs, err := entry.fileInfoVersions(bucket)
if err != nil {
err := bgSeq.queueHealTask(healSource{
bucket: bucket,
object: entry.name,
versionID: "",
}, madmin.HealItemObject)
_, err := er.HealObject(ctx, bucket, encodedEntryName, "",
madmin.HealOpts{
ScanMode: scanMode,
Remove: healDeleteDangling,
})
if err != nil {
if isErrObjectNotFound(err) {
// queueing happens across namespace, ignore
@ -321,11 +330,11 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
if version.ModTime.After(tracker.Started) {
continue
}
if err := bgSeq.queueHealTask(healSource{
bucket: bucket,
object: version.Name,
versionID: version.VersionID,
}, madmin.HealItemObject); err != nil {
if _, err := er.HealObject(ctx, bucket, encodedEntryName,
version.VersionID, madmin.HealOpts{
ScanMode: scanMode,
Remove: healDeleteDangling,
}); err != nil {
if isErrObjectNotFound(err) {
// queueing happens across namespace, ignore
// objects that are not found.
@ -344,7 +353,6 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
} else {
result = healEntrySuccess(uint64(version.Size))
}
bgSeq.logHeal(madmin.HealItemObject)
if !send(result) {
return
@ -382,7 +390,8 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
minDisks: 1,
reportNotFound: false,
agreed: func(entry metaCacheEntry) {
healEntry(actualBucket, entry)
jt.Take()
go healEntry(actualBucket, entry)
},
partial: func(entries metaCacheEntries, _ []error) {
entry, ok := entries.resolve(&resolver)
@ -391,10 +400,12 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
// proceed to heal nonetheless.
entry, _ = entries.firstFound()
}
healEntry(actualBucket, *entry)
jt.Take()
go healEntry(actualBucket, *entry)
},
finished: nil,
})
jt.Wait() // synchronize all the concurrent heal jobs
close(results)
if err != nil {
// Set this such that when we return this function
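Taken together, the hunks above bound heal concurrency with a worker pool: the listing callbacks take a slot with `jt.Take()` and run `healEntry` in a goroutine, `healEntry` releases the slot with `defer jt.Give()`, and `jt.Wait()` blocks until every in-flight heal finishes before `close(results)`. The sketch below shows the same pattern in isolation, using only the `workers` calls visible in this diff; the `heal` closure and object names are illustrative.

```go
package main

import (
	"fmt"

	"github.com/minio/pkg/v2/workers"
)

func main() {
	// Admit at most 4 concurrent jobs, mirroring `jt, _ := workers.New(int(numHealers))` above.
	jt, err := workers.New(4)
	if err != nil {
		panic(err)
	}

	heal := func(object string) {
		defer jt.Give() // release the slot once this job is done
		fmt.Println("healing", object)
	}

	for _, object := range []string{"obj-1", "obj-2", "obj-3", "obj-4", "obj-5"} {
		jt.Take()       // block until a worker slot is free
		go heal(object) // run concurrently, as `go healEntry(...)` does above
	}

	jt.Wait() // wait for all in-flight jobs, like the jt.Wait() before close(results)
}
```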

View File

@ -46,6 +46,7 @@ type DiskInfo struct {
FreeInodes uint64
Major uint32
Minor uint32
NRRequests uint64
FSType string
RootDisk bool
Healing bool

View File

@ -14,8 +14,8 @@ func (z *DiskInfo) DecodeMsg(dc *msgp.Reader) (err error) {
err = msgp.WrapError(err)
return
}
if zb0001 != 17 {
err = msgp.ArrayError{Wanted: 17, Got: zb0001}
if zb0001 != 18 {
err = msgp.ArrayError{Wanted: 18, Got: zb0001}
return
}
z.Total, err = dc.ReadUint64()
@ -53,6 +53,11 @@ func (z *DiskInfo) DecodeMsg(dc *msgp.Reader) (err error) {
err = msgp.WrapError(err, "Minor")
return
}
z.NRRequests, err = dc.ReadUint64()
if err != nil {
err = msgp.WrapError(err, "NRRequests")
return
}
z.FSType, err = dc.ReadString()
if err != nil {
err = msgp.WrapError(err, "FSType")
@ -108,8 +113,8 @@ func (z *DiskInfo) DecodeMsg(dc *msgp.Reader) (err error) {
// EncodeMsg implements msgp.Encodable
func (z *DiskInfo) EncodeMsg(en *msgp.Writer) (err error) {
// array header, size 17
err = en.Append(0xdc, 0x0, 0x11)
// array header, size 18
err = en.Append(0xdc, 0x0, 0x12)
if err != nil {
return
}
@ -148,6 +153,11 @@ func (z *DiskInfo) EncodeMsg(en *msgp.Writer) (err error) {
err = msgp.WrapError(err, "Minor")
return
}
err = en.WriteUint64(z.NRRequests)
if err != nil {
err = msgp.WrapError(err, "NRRequests")
return
}
err = en.WriteString(z.FSType)
if err != nil {
err = msgp.WrapError(err, "FSType")
@ -204,8 +214,8 @@ func (z *DiskInfo) EncodeMsg(en *msgp.Writer) (err error) {
// MarshalMsg implements msgp.Marshaler
func (z *DiskInfo) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.Require(b, z.Msgsize())
// array header, size 17
o = append(o, 0xdc, 0x0, 0x11)
// array header, size 18
o = append(o, 0xdc, 0x0, 0x12)
o = msgp.AppendUint64(o, z.Total)
o = msgp.AppendUint64(o, z.Free)
o = msgp.AppendUint64(o, z.Used)
@ -213,6 +223,7 @@ func (z *DiskInfo) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.AppendUint64(o, z.FreeInodes)
o = msgp.AppendUint32(o, z.Major)
o = msgp.AppendUint32(o, z.Minor)
o = msgp.AppendUint64(o, z.NRRequests)
o = msgp.AppendString(o, z.FSType)
o = msgp.AppendBool(o, z.RootDisk)
o = msgp.AppendBool(o, z.Healing)
@ -238,8 +249,8 @@ func (z *DiskInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
err = msgp.WrapError(err)
return
}
if zb0001 != 17 {
err = msgp.ArrayError{Wanted: 17, Got: zb0001}
if zb0001 != 18 {
err = msgp.ArrayError{Wanted: 18, Got: zb0001}
return
}
z.Total, bts, err = msgp.ReadUint64Bytes(bts)
@ -277,6 +288,11 @@ func (z *DiskInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
err = msgp.WrapError(err, "Minor")
return
}
z.NRRequests, bts, err = msgp.ReadUint64Bytes(bts)
if err != nil {
err = msgp.WrapError(err, "NRRequests")
return
}
z.FSType, bts, err = msgp.ReadStringBytes(bts)
if err != nil {
err = msgp.WrapError(err, "FSType")
@ -333,7 +349,7 @@ func (z *DiskInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
// Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message
func (z *DiskInfo) Msgsize() (s int) {
s = 3 + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint32Size + msgp.Uint32Size + msgp.StringPrefixSize + len(z.FSType) + msgp.BoolSize + msgp.BoolSize + msgp.BoolSize + msgp.StringPrefixSize + len(z.Endpoint) + msgp.StringPrefixSize + len(z.MountPath) + msgp.StringPrefixSize + len(z.ID) + msgp.BoolSize + z.Metrics.Msgsize() + msgp.StringPrefixSize + len(z.Error)
s = 3 + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint32Size + msgp.Uint32Size + msgp.Uint64Size + msgp.StringPrefixSize + len(z.FSType) + msgp.BoolSize + msgp.BoolSize + msgp.BoolSize + msgp.StringPrefixSize + len(z.Endpoint) + msgp.StringPrefixSize + len(z.MountPath) + msgp.StringPrefixSize + len(z.ID) + msgp.BoolSize + z.Metrics.Msgsize() + msgp.StringPrefixSize + len(z.Error)
return
}

View File

@ -114,6 +114,8 @@ type xlStorage struct {
formatData []byte
nrRequests uint64
// mutex to prevent concurrent read operations overloading walks.
rotational bool
walkMu *sync.Mutex
@ -244,6 +246,11 @@ func newXLStorage(ep Endpoint, cleanUp bool) (s *xlStorage, err error) {
diskIndex: -1,
}
// Sanitize before setting it
if info.NRRequests > 0 {
s.nrRequests = info.NRRequests
}
// We stagger listings only on HDDs.
if info.Rotational == nil || *info.Rotational {
s.rotational = true
@ -658,6 +665,7 @@ func (s *xlStorage) DiskInfo(_ context.Context, _ bool) (info DiskInfo, err erro
dcinfo.UsedInodes = di.Files - di.Ffree
dcinfo.FreeInodes = di.Ffree
dcinfo.FSType = di.FSType
dcinfo.NRRequests = s.nrRequests
dcinfo.Rotational = s.rotational
diskID, err := s.GetDiskID()
// Healing is 'true' when

View File

@ -273,19 +273,23 @@ Once set the scanner settings are automatically applied without the need for ser
### Healing
Healing is enabled by default. The following configuration settings allow for more staggered delay in terms of healing. The healing system by default adapts to the system speed and pauses up to '1sec' per object when the system has `max_io` number of concurrent requests. It is possible to adjust the `max_sleep` and `max_io` values thereby increasing the healing speed. The delays between each operation of the healer can be adjusted by the `mc admin config set alias/ heal max_sleep=1s` and maximum concurrent requests allowed before we start slowing things down can be configured with `mc admin config set alias/ heal max_io=30` . By default the wait delay is `1sec` beyond 10 concurrent operations. This means the healer will sleep *1 second* at max for each heal operation if there are more than *10* concurrent client requests.
Healing is enabled by default. The following configuration settings allow for more staggered delay in terms of healing. The healing system by default adapts to the system speed and pauses up to '250ms' per object when the system has `max_io` number of concurrent requests. It is possible to adjust the `max_sleep` and `max_io` values thereby increasing the healing speed. The delays between each operation of the healer can be adjusted by the `mc admin config set alias/ heal max_sleep=1s` and maximum concurrent requests allowed before we start slowing things down can be configured with `mc admin config set alias/ heal max_io=30` . By default the wait delay is `250ms` beyond 100 concurrent operations. This means the healer will sleep *250 milliseconds* at max for each heal operation if there are more than *100* concurrent client requests.
In most setups this is sufficient to heal the content after drive replacements. Setting `max_sleep` to a *lower* value and setting `max_io` to a *higher* value would make heal go faster.
Each node is responsible for healing its local drives; each drive gets multiple heal workers, a quarter of the node's CPU cores or a quarter of the drive's configured nr_requests (https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt), whichever is smaller, with a minimum of 4. It is also possible to set a custom number of workers with this command: `mc admin config set alias/ heal drive_workers=100`.
```
~ mc admin config set alias/ heal
KEY:
heal manage object healing frequency and bitrot verification checks
ARGS:
bitrotscan (on|off) perform bitrot scan on drives when checking objects during scanner
max_sleep (duration) maximum sleep duration between objects to slow down heal operation. eg. 2s
max_io (int) maximum IO requests allowed between objects to slow down heal operation. eg. 3
drive_workers (int) the number of workers per drive to heal a new disk replacement.
```
Example: The following settings will increase the heal operation speed by allowing heal operations to run without delay for up to `100` concurrent requests, and by setting the maximum delay between each heal operation to `300ms`.
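For reference, the default worker count described in this section follows the heuristic added to `healErasureSet` in this commit. The sketch below condenses it into a standalone helper; the function name and the example values are illustrative.

```go
package main

import (
	"fmt"
	"runtime"
)

// healWorkers condenses the heuristic added to healErasureSet: a quarter of
// min(GOMAXPROCS, nr_requests), never fewer than 4, unless an explicit
// drive_workers value overrides it.
func healWorkers(nrRequests uint64, driveWorkers int) uint64 {
	numCores := uint64(runtime.GOMAXPROCS(0))
	var n uint64
	if nrRequests > numCores {
		n = numCores / 4
	} else {
		n = nrRequests / 4
	}
	if n < 4 {
		n = 4
	}
	if driveWorkers > 0 { // e.g. `mc admin config set alias/ heal drive_workers=N`
		n = uint64(driveWorkers)
	}
	return n
}

func main() {
	// On a node with 32 logical cores and nr_requests=256 this yields
	// min(32, 256)/4 = 8; the actual output depends on this machine's GOMAXPROCS.
	fmt.Println(healWorkers(256, -1))
}
```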

View File

@ -31,13 +31,15 @@ import (
// Compression environment variables
const (
Bitrot = "bitrotscan"
Sleep = "max_sleep"
IOCount = "max_io"
DriveWorkers = "drive_workers"
EnvBitrot = "MINIO_HEAL_BITROTSCAN"
EnvSleep = "MINIO_HEAL_MAX_SLEEP"
EnvIOCount = "MINIO_HEAL_MAX_IO"
EnvDriveWorkers = "MINIO_HEAL_DRIVE_WORKERS"
)
var configMutex sync.RWMutex
@ -51,6 +53,8 @@ type Config struct {
Sleep time.Duration `json:"sleep"`
IOCount int `json:"iocount"`
DriveWorkers int `json:"drive_workers"`
// Cached value from Bitrot field
cache struct {
// -1: bitrot enabled, 0: bitrot disabled, > 0: bitrot cycle
@ -77,6 +81,13 @@ func (opts Config) Clone() (int, time.Duration, string) {
return opts.IOCount, opts.Sleep, opts.Bitrot
}
// GetWorkers returns the configured number of workers, or -1 if none is configured
func (opts Config) GetWorkers() int {
configMutex.RLock()
defer configMutex.RUnlock()
return opts.DriveWorkers
}
// Update updates opts with nopts
func (opts *Config) Update(nopts Config) {
configMutex.Lock()
@ -85,6 +96,7 @@ func (opts *Config) Update(nopts Config) {
opts.Bitrot = nopts.Bitrot
opts.IOCount = nopts.IOCount
opts.Sleep = nopts.Sleep
opts.DriveWorkers = nopts.DriveWorkers
opts.cache.bitrotCycle, _ = parseBitrotConfig(nopts.Bitrot)
}
@ -103,6 +115,10 @@ var DefaultKVS = config.KVS{
Key: IOCount,
Value: "100",
},
config.KV{
Key: DriveWorkers,
Value: "",
},
}
const minimumBitrotCycleInMonths = 1
@ -154,5 +170,18 @@ func LookupConfig(kvs config.KVS) (cfg Config, err error) {
if err != nil {
return cfg, fmt.Errorf("'heal:max_io' value invalid: %w", err)
}
if ws := env.Get(EnvDriveWorkers, kvs.GetWithDefault(DriveWorkers, DefaultKVS)); ws != "" {
w, err := strconv.Atoi(ws)
if err != nil {
return cfg, fmt.Errorf("'heal:drive_workers' value invalid: %w", err)
}
if w < 1 {
return cfg, fmt.Errorf("'heal:drive_workers' value invalid: zero or negative integer unsupported")
}
cfg.DriveWorkers = w
} else {
cfg.DriveWorkers = -1
}
return cfg, nil
}
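A standalone sketch of the precedence implemented above, with a hypothetical `resolveDriveWorkers` helper standing in for the real `env.Get`/`config.KVS` plumbing:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// resolveDriveWorkers sketches the precedence implemented by LookupConfig:
// the MINIO_HEAL_DRIVE_WORKERS environment variable wins over the stored
// `heal drive_workers` value, values below 1 are rejected, and an empty
// setting yields -1, which the healer treats as "use the default heuristic".
func resolveDriveWorkers(configured string) (int, error) {
	ws := os.Getenv("MINIO_HEAL_DRIVE_WORKERS")
	if ws == "" {
		ws = configured
	}
	if ws == "" {
		return -1, nil // not configured
	}
	w, err := strconv.Atoi(ws)
	if err != nil {
		return 0, fmt.Errorf("'heal:drive_workers' value invalid: %w", err)
	}
	if w < 1 {
		return 0, fmt.Errorf("'heal:drive_workers' value invalid: zero or negative integer unsupported")
	}
	return w, nil
}

func main() {
	w, err := resolveDriveWorkers("") // nothing set via `mc admin config set`
	fmt.Println(w, err)               // -1 <nil>
}
```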

View File

@ -45,5 +45,11 @@ var (
Optional: true,
Type: "int",
},
config.HelpKV{
Key: DriveWorkers,
Description: `the number of workers per drive to heal a new disk replacement` + defaultHelpPostfix(DriveWorkers),
Optional: true,
Type: "int",
},
}
)

View File

@ -37,6 +37,7 @@ type Info struct {
Minor uint32
Name string
Rotational *bool
NRRequests uint64
}
// DevID is the drive major and minor ids

View File

@ -98,6 +98,7 @@ func GetInfo(path string, firstTime bool) (info Info, err error) {
}
}
if err == nil {
info.NRRequests = qst.NRRequests
rot := qst.Rotational == 1 // Rotational is '1' if the device is HDD
info.Rotational = &rot
}
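For context, the `qst.NRRequests` value copied here is assumed to reflect the block queue depth the kernel exposes at `/sys/block/<device>/queue/nr_requests` (the queue-sysfs attribute linked from the healing docs). A minimal illustrative reader, assuming direct sysfs access rather than the internal stat code used in this file:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// nrRequests reads the block-layer queue depth for a device such as "sda"
// from sysfs. Illustrative stand-in only: the real code resolves the device
// backing the mount path and fills qst.NRRequests via the internal disk package.
func nrRequests(device string) (uint64, error) {
	b, err := os.ReadFile("/sys/block/" + device + "/queue/nr_requests")
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
}

func main() {
	n, err := nrRequests("sda") // hypothetical device name
	if err != nil {
		fmt.Println("nr_requests unavailable:", err)
		return
	}
	fmt.Println("nr_requests:", n)
}
```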