fix: missing metrics for healed objects (#19392)

Objects successfully healed via queueHealTask in a
non-blocking heal were not being reported correctly;
this PR fixes that comprehensively.
Harshavardhana 2024-04-01 23:48:36 -07:00 committed by GitHub
parent ae4fb1b72e
commit 4f660a8eb7
4 changed files with 51 additions and 35 deletions
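
Before the diff, a minimal sketch of the pattern this change moves to: the worker that executes heal tasks counts every result itself, so a result is counted even when the submitter fires the task without waiting on a response channel (the non-blocking path whose metrics were previously lost). All names and types below are illustrative only, not MinIO's actual ones.

```go
// heal_counting_sketch.go - a minimal, hypothetical illustration of counting
// heal results in the worker instead of in the response-channel reader.
package main

import (
	"errors"
	"fmt"
	"sync"
)

// counters stands in for the healSequence counter maps; names are made up.
type counters struct {
	mu     sync.Mutex
	healed int
	failed int
}

func (c *counters) count(err error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if err != nil {
		c.failed++
		return
	}
	c.healed++
}

// task mimics a heal task: respCh is nil for non-blocking (noWait) heals.
type task struct {
	err    error
	respCh chan error
}

// worker counts every finished task, whether or not anyone waits on respCh.
// Counting only on the respCh receive side is exactly what loses metrics
// for non-blocking heals.
func worker(tasks <-chan task, c *counters, done chan<- struct{}) {
	for t := range tasks {
		c.count(t.err)
		if t.respCh != nil {
			t.respCh <- t.err
		}
	}
	close(done)
}

func main() {
	tasks := make(chan task, 4)
	done := make(chan struct{})
	c := &counters{}
	go worker(tasks, c, done)

	// Non-blocking heal: nobody reads a response, but it is still counted.
	tasks <- task{}

	// Blocking heal: the caller waits for the result.
	resp := make(chan error, 1)
	tasks <- task{err: errors.New("drive offline"), respCh: resp}
	<-resp

	close(tasks)
	<-done
	fmt.Printf("healed=%d failed=%d\n", c.healed, c.failed)
}
```

In the actual change, the background heal worker takes on this role through bgSeq.countHeals and bgSeq.countFailed, as the diff below shows.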


@@ -329,14 +329,18 @@ func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLay
// Add heal state and start sequence
ahs.healSeqMap[hpath] = h
// Launch top-level background heal go-routine
go h.healSequenceStart(objAPI)
clientToken := h.clientToken
if globalIsDistErasure {
clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints))
}
if h.clientToken == bgHealingUUID {
// For background heal do nothing, do not spawn an unnecessary goroutine.
} else {
// Launch top-level background heal go-routine
go h.healSequenceStart(objAPI)
}
b, err := json.Marshal(madmin.HealStartSuccess{
ClientToken: clientToken,
ClientAddress: h.clientAddress,
@@ -537,9 +541,9 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
return retMap
}
// gethealFailedItemsMap - returns map of all items where heal failed against
// getHealFailedItemsMap - returns map of all items where heal failed against
// drive endpoint and status
func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
@@ -552,6 +556,32 @@ func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
return retMap
}
func (h *healSequence) countFailed(res madmin.HealResultItem) {
h.mutex.Lock()
defer h.mutex.Unlock()
for _, d := range res.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
h.lastHealActivity = UTCNow()
}
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
h.mutex.Lock()
defer h.mutex.Unlock()
if !healed {
h.scannedItemsMap[healType]++
} else {
h.healedItemsMap[healType]++
}
h.lastHealActivity = UTCNow()
}
// isQuitting - determines if the heal sequence is quitting (due to an
// external signal)
func (h *healSequence) isQuitting() bool {
@@ -704,10 +734,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
task.opts.ScanMode = madmin.HealNormalScan
}
h.mutex.Lock()
h.scannedItemsMap[healType]++
h.lastHealActivity = UTCNow()
h.mutex.Unlock()
h.countHeals(healType, false)
if source.noWait {
select {
@@ -744,32 +771,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
return nil
}
h.mutex.Lock()
defer h.mutex.Unlock()
// Progress is not reported in case of background heal processing.
// Instead we increment relevant counter based on the heal result
// for prometheus reporting.
if res.err != nil {
for _, d := range res.result.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
} else {
// Only object type reported for successful healing
h.healedItemsMap[res.result.Type]++
}
// Report caller of any failure
return res.err
}
res.result.Type = healType
if res.err != nil {
// Only report object error
if healType != madmin.HealItemObject {
return res.err
}
res.result.Detail = res.err.Error()
}
return h.pushHealResultItem(res.result)
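
The countFailed and countHeals helpers added above, together with accessors like getHealFailedItemsMap, follow the common Go pattern of a counter map guarded by a sync.RWMutex: writers take the write lock, readers take the read lock and hand back a snapshot. A compact, hypothetical sketch of that pattern (not the actual healSequence type):

```go
// rwmutex_counters_sketch.go - illustrative only; not MinIO's healSequence.
package main

import (
	"fmt"
	"sync"
)

type failedCounters struct {
	mu sync.RWMutex
	m  map[string]int64 // keyed by "endpoint,state", like healFailedItemsMap
}

func newFailedCounters() *failedCounters {
	return &failedCounters{m: make(map[string]int64)}
}

// add takes the write lock, mirroring how countFailed updates the map.
func (f *failedCounters) add(endpoint, state string) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.m[endpoint+","+state]++
}

// snapshot takes the read lock and returns a copy, so callers (for example a
// metrics scraper) never observe the map while it is being written.
func (f *failedCounters) snapshot() map[string]int64 {
	f.mu.RLock()
	defer f.mu.RUnlock()
	out := make(map[string]int64, len(f.m))
	for k, v := range f.m {
		out[k] = v
	}
	return out
}

func main() {
	fc := newFailedCounters()
	fc.add("http://node1:9000/disk1", "offline")
	fc.add("http://node1:9000/disk1", "offline")
	fmt.Println(fc.snapshot()) // map[http://node1:9000/disk1,offline:2]
}
```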


@@ -101,16 +101,17 @@ func waitForLowHTTPReq() {
}
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
bgSeq := newBgHealSequence()
// Run the background healer
for i := 0; i < globalBackgroundHealRoutine.workers; i++ {
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI)
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI, bgSeq)
}
globalBackgroundHealState.LaunchNewHealSequence(newBgHealSequence(), objAPI)
globalBackgroundHealState.LaunchNewHealSequence(bgSeq, objAPI)
}
// Wait for heal requests and process them
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *healSequence) {
for {
select {
case task, ok := <-h.tasks:
@@ -133,6 +134,15 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
}
}
if bgSeq != nil {
// We increment relevant counter based on the heal result for prometheus reporting.
if err != nil {
bgSeq.countFailed(res)
} else {
bgSeq.countHeals(res.Type, false)
}
}
if task.respCh != nil {
task.respCh <- healResult{result: res, err: err}
}


@@ -2654,7 +2654,7 @@ func getMinioHealingMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
}
func getFailedItems(seq *healSequence) (m []MetricV2) {
items := seq.gethealFailedItemsMap()
items := seq.getHealFailedItemsMap()
m = make([]MetricV2, 0, len(items))
for k, v := range items {
s := strings.Split(k, ",")


@@ -172,7 +172,7 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
float64(v), string(k),
)
}
for k, v := range bgSeq.gethealFailedItemsMap() {
for k, v := range bgSeq.getHealFailedItemsMap() {
// healFailedItemsMap stores the endpoint and volume state separated by comma,
// split the fields and pass to channel at correct index
s := strings.Split(k, ",")
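
For context on how these maps end up in Prometheus, here is a hedged sketch of a collector that turns an "endpoint,state"-keyed map, such as the one returned by getHealFailedItemsMap, into labeled metrics by splitting the key the same way the code above does. The metric name, labels, and wiring are assumptions for illustration, not MinIO's actual exporter code.

```go
// heal_failed_collector_sketch.go - hypothetical Prometheus collector over an
// "endpoint,state" counter map; names are illustrative only.
package main

import (
	"strings"

	"github.com/prometheus/client_golang/prometheus"
)

var healFailedDesc = prometheus.NewDesc(
	prometheus.BuildFQName("minio", "heal", "objects_failed_total"),
	"Objects for which heal failed, by drive endpoint and drive state",
	[]string{"endpoint", "state"}, nil,
)

// healFailedCollector pulls fresh counters on every scrape.
type healFailedCollector struct {
	failed func() map[string]int64 // e.g. a getHealFailedItemsMap-like snapshot
}

func (c healFailedCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- healFailedDesc
}

func (c healFailedCollector) Collect(ch chan<- prometheus.Metric) {
	for k, v := range c.failed() {
		// Keys store the endpoint and drive state separated by a comma;
		// split them into the two label values.
		parts := strings.SplitN(k, ",", 2)
		if len(parts) != 2 {
			continue
		}
		ch <- prometheus.MustNewConstMetric(
			healFailedDesc, prometheus.GaugeValue, float64(v), parts[0], parts[1],
		)
	}
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(healFailedCollector{
		failed: func() map[string]int64 {
			return map[string]int64{"http://node1:9000/disk1,offline": 2}
		},
	})
	// reg would normally be exposed via promhttp.HandlerFor(reg, ...).
}
```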