From 4f660a8eb7addeb84e105db453062a0062e65195 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Mon, 1 Apr 2024 23:48:36 -0700 Subject: [PATCH] fix: missing metrics for healed objects (#19392) all healed successful objects via queueHealTask in a non-blocking heal weren't being reported correctly, this PR fixes this comprehensively. --- cmd/admin-heal-ops.go | 66 +++++++++++++++++++++----------------- cmd/background-heal-ops.go | 16 +++++++-- cmd/metrics-v2.go | 2 +- cmd/metrics.go | 2 +- 4 files changed, 51 insertions(+), 35 deletions(-) diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go index b1d48b3d1..04cdac8f0 100644 --- a/cmd/admin-heal-ops.go +++ b/cmd/admin-heal-ops.go @@ -329,14 +329,18 @@ func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLay // Add heal state and start sequence ahs.healSeqMap[hpath] = h - // Launch top-level background heal go-routine - go h.healSequenceStart(objAPI) - clientToken := h.clientToken if globalIsDistErasure { clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints)) } + if h.clientToken == bgHealingUUID { + // For background heal do nothing, do not spawn an unnecessary goroutine. + } else { + // Launch top-level background heal go-routine + go h.healSequenceStart(objAPI) + } + b, err := json.Marshal(madmin.HealStartSuccess{ ClientToken: clientToken, ClientAddress: h.clientAddress, @@ -537,9 +541,9 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 { return retMap } -// gethealFailedItemsMap - returns map of all items where heal failed against +// getHealFailedItemsMap - returns map of all items where heal failed against // drive endpoint and status -func (h *healSequence) gethealFailedItemsMap() map[string]int64 { +func (h *healSequence) getHealFailedItemsMap() map[string]int64 { h.mutex.RLock() defer h.mutex.RUnlock() @@ -552,6 +556,32 @@ func (h *healSequence) gethealFailedItemsMap() map[string]int64 { return retMap } +func (h *healSequence) countFailed(res madmin.HealResultItem) { + h.mutex.Lock() + defer h.mutex.Unlock() + + for _, d := range res.After.Drives { + // For failed items we report the endpoint and drive state + // This will help users take corrective actions for drives + h.healFailedItemsMap[d.Endpoint+","+d.State]++ + } + + h.lastHealActivity = UTCNow() +} + +func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) { + h.mutex.Lock() + defer h.mutex.Unlock() + + if !healed { + h.scannedItemsMap[healType]++ + } else { + h.healedItemsMap[healType]++ + } + + h.lastHealActivity = UTCNow() +} + // isQuitting - determines if the heal sequence is quitting (due to an // external signal) func (h *healSequence) isQuitting() bool { @@ -704,10 +734,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem task.opts.ScanMode = madmin.HealNormalScan } - h.mutex.Lock() - h.scannedItemsMap[healType]++ - h.lastHealActivity = UTCNow() - h.mutex.Unlock() + h.countHeals(healType, false) if source.noWait { select { @@ -744,32 +771,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem return nil } - h.mutex.Lock() - defer h.mutex.Unlock() - - // Progress is not reported in case of background heal processing. - // Instead we increment relevant counter based on the heal result - // for prometheus reporting. - if res.err != nil { - for _, d := range res.result.After.Drives { - // For failed items we report the endpoint and drive state - // This will help users take corrective actions for drives - h.healFailedItemsMap[d.Endpoint+","+d.State]++ - } - } else { - // Only object type reported for successful healing - h.healedItemsMap[res.result.Type]++ - } - // Report caller of any failure return res.err } res.result.Type = healType if res.err != nil { - // Only report object error - if healType != madmin.HealItemObject { - return res.err - } res.result.Detail = res.err.Error() } return h.pushHealResultItem(res.result) diff --git a/cmd/background-heal-ops.go b/cmd/background-heal-ops.go index f4affaed9..7085a389c 100644 --- a/cmd/background-heal-ops.go +++ b/cmd/background-heal-ops.go @@ -101,16 +101,17 @@ func waitForLowHTTPReq() { } func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) { + bgSeq := newBgHealSequence() // Run the background healer for i := 0; i < globalBackgroundHealRoutine.workers; i++ { - go globalBackgroundHealRoutine.AddWorker(ctx, objAPI) + go globalBackgroundHealRoutine.AddWorker(ctx, objAPI, bgSeq) } - globalBackgroundHealState.LaunchNewHealSequence(newBgHealSequence(), objAPI) + globalBackgroundHealState.LaunchNewHealSequence(bgSeq, objAPI) } // Wait for heal requests and process them -func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) { +func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *healSequence) { for { select { case task, ok := <-h.tasks: @@ -133,6 +134,15 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) { } } + if bgSeq != nil { + // We increment relevant counter based on the heal result for prometheus reporting. + if err != nil { + bgSeq.countFailed(res) + } else { + bgSeq.countHeals(res.Type, false) + } + } + if task.respCh != nil { task.respCh <- healResult{result: res, err: err} } diff --git a/cmd/metrics-v2.go b/cmd/metrics-v2.go index 53be68915..cbbc16a91 100644 --- a/cmd/metrics-v2.go +++ b/cmd/metrics-v2.go @@ -2654,7 +2654,7 @@ func getMinioHealingMetrics(opts MetricsGroupOpts) *MetricsGroupV2 { } func getFailedItems(seq *healSequence) (m []MetricV2) { - items := seq.gethealFailedItemsMap() + items := seq.getHealFailedItemsMap() m = make([]MetricV2, 0, len(items)) for k, v := range items { s := strings.Split(k, ",") diff --git a/cmd/metrics.go b/cmd/metrics.go index a198593b4..b31688d50 100644 --- a/cmd/metrics.go +++ b/cmd/metrics.go @@ -172,7 +172,7 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) { float64(v), string(k), ) } - for k, v := range bgSeq.gethealFailedItemsMap() { + for k, v := range bgSeq.getHealFailedItemsMap() { // healFailedItemsMap stores the endpoint and volume state separated by comma, // split the fields and pass to channel at correct index s := strings.Split(k, ",")