fix: missing metrics for healed objects (#19392)

Objects successfully healed via queueHealTask in a
non-blocking heal were not being reported correctly;
this PR fixes that comprehensively.
Harshavardhana 2024-04-01 23:48:36 -07:00 committed by GitHub
parent ae4fb1b72e
commit 4f660a8eb7
4 changed files with 51 additions and 35 deletions
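
Before the diff, a minimal sketch of the pattern this change moves to: the worker that executes heal tasks counts every result itself, so a result is counted even when the submitter fires the task without waiting on a response channel (the non-blocking path whose metrics were previously lost). All names and types below are illustrative only, not MinIO's actual ones.

```go
// heal_counting_sketch.go - a minimal, hypothetical illustration of counting
// heal results in the worker instead of in the response-channel reader.
package main

import (
	"errors"
	"fmt"
	"sync"
)

// counters stands in for the healSequence counter maps; names are made up.
type counters struct {
	mu     sync.Mutex
	healed int
	failed int
}

func (c *counters) count(err error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if err != nil {
		c.failed++
		return
	}
	c.healed++
}

// task mimics a heal task: respCh is nil for non-blocking (noWait) heals.
type task struct {
	err    error
	respCh chan error
}

// worker counts every finished task, whether or not anyone waits on respCh.
// Counting only on the respCh receive side is exactly what loses metrics
// for non-blocking heals.
func worker(tasks <-chan task, c *counters, done chan<- struct{}) {
	for t := range tasks {
		c.count(t.err)
		if t.respCh != nil {
			t.respCh <- t.err
		}
	}
	close(done)
}

func main() {
	tasks := make(chan task, 4)
	done := make(chan struct{})
	c := &counters{}
	go worker(tasks, c, done)

	// Non-blocking heal: nobody reads a response, but it is still counted.
	tasks <- task{}

	// Blocking heal: the caller waits for the result.
	resp := make(chan error, 1)
	tasks <- task{err: errors.New("drive offline"), respCh: resp}
	<-resp

	close(tasks)
	<-done
	fmt.Printf("healed=%d failed=%d\n", c.healed, c.failed)
}
```

In the actual change, the background heal worker takes on this role through bgSeq.countHeals and bgSeq.countFailed, as the diff below shows.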


@@ -329,14 +329,18 @@ func (ahs *allHealState) LaunchNewHealSequence(h *healSequence, objAPI ObjectLay
// Add heal state and start sequence
ahs.healSeqMap[hpath] = h
// Launch top-level background heal go-routine
go h.healSequenceStart(objAPI)
clientToken := h.clientToken
if globalIsDistErasure {
clientToken = fmt.Sprintf("%s:%d", h.clientToken, GetProxyEndpointLocalIndex(globalProxyEndpoints))
}
if h.clientToken == bgHealingUUID {
// For background heal do nothing, do not spawn an unnecessary goroutine.
} else {
// Launch top-level background heal go-routine
go h.healSequenceStart(objAPI)
}
b, err := json.Marshal(madmin.HealStartSuccess{
ClientToken: clientToken,
ClientAddress: h.clientAddress,
@@ -537,9 +541,9 @@ func (h *healSequence) getHealedItemsMap() map[madmin.HealItemType]int64 {
return retMap
}
// gethealFailedItemsMap - returns map of all items where heal failed against
// getHealFailedItemsMap - returns map of all items where heal failed against
// drive endpoint and status
func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
func (h *healSequence) getHealFailedItemsMap() map[string]int64 {
h.mutex.RLock()
defer h.mutex.RUnlock()
@@ -552,6 +556,32 @@ func (h *healSequence) gethealFailedItemsMap() map[string]int64 {
return retMap
}
func (h *healSequence) countFailed(res madmin.HealResultItem) {
h.mutex.Lock()
defer h.mutex.Unlock()
for _, d := range res.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
h.lastHealActivity = UTCNow()
}
func (h *healSequence) countHeals(healType madmin.HealItemType, healed bool) {
h.mutex.Lock()
defer h.mutex.Unlock()
if !healed {
h.scannedItemsMap[healType]++
} else {
h.healedItemsMap[healType]++
}
h.lastHealActivity = UTCNow()
}
// isQuitting - determines if the heal sequence is quitting (due to an
// external signal)
func (h *healSequence) isQuitting() bool {
@@ -704,10 +734,7 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
task.opts.ScanMode = madmin.HealNormalScan
}
h.mutex.Lock()
h.scannedItemsMap[healType]++
h.lastHealActivity = UTCNow()
h.mutex.Unlock()
h.countHeals(healType, false)
if source.noWait {
select {
@@ -744,32 +771,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
return nil
}
h.mutex.Lock()
defer h.mutex.Unlock()
// Progress is not reported in case of background heal processing.
// Instead we increment relevant counter based on the heal result
// for prometheus reporting.
if res.err != nil {
for _, d := range res.result.After.Drives {
// For failed items we report the endpoint and drive state
// This will help users take corrective actions for drives
h.healFailedItemsMap[d.Endpoint+","+d.State]++
}
} else {
// Only object type reported for successful healing
h.healedItemsMap[res.result.Type]++
}
// Report caller of any failure
return res.err
}
res.result.Type = healType
if res.err != nil {
// Only report object error
if healType != madmin.HealItemObject {
return res.err
}
res.result.Detail = res.err.Error()
}
return h.pushHealResultItem(res.result)
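
The countFailed and countHeals helpers added above, together with accessors like getHealFailedItemsMap, follow the common Go pattern of a counter map guarded by a sync.RWMutex: writers take the write lock, readers take the read lock and hand back a snapshot. A compact, hypothetical sketch of that pattern (not the actual healSequence type):

```go
// rwmutex_counters_sketch.go - illustrative only; not MinIO's healSequence.
package main

import (
	"fmt"
	"sync"
)

type failedCounters struct {
	mu sync.RWMutex
	m  map[string]int64 // keyed by "endpoint,state", like healFailedItemsMap
}

func newFailedCounters() *failedCounters {
	return &failedCounters{m: make(map[string]int64)}
}

// add takes the write lock, mirroring how countFailed updates the map.
func (f *failedCounters) add(endpoint, state string) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.m[endpoint+","+state]++
}

// snapshot takes the read lock and returns a copy, so callers (for example a
// metrics scraper) never observe the map while it is being written.
func (f *failedCounters) snapshot() map[string]int64 {
	f.mu.RLock()
	defer f.mu.RUnlock()
	out := make(map[string]int64, len(f.m))
	for k, v := range f.m {
		out[k] = v
	}
	return out
}

func main() {
	fc := newFailedCounters()
	fc.add("http://node1:9000/disk1", "offline")
	fc.add("http://node1:9000/disk1", "offline")
	fmt.Println(fc.snapshot()) // map[http://node1:9000/disk1,offline:2]
}
```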


@@ -101,16 +101,17 @@ func waitForLowHTTPReq() {
}
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
bgSeq := newBgHealSequence()
// Run the background healer
for i := 0; i < globalBackgroundHealRoutine.workers; i++ {
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI)
go globalBackgroundHealRoutine.AddWorker(ctx, objAPI, bgSeq)
}
globalBackgroundHealState.LaunchNewHealSequence(newBgHealSequence(), objAPI)
globalBackgroundHealState.LaunchNewHealSequence(bgSeq, objAPI)
}
// Wait for heal requests and process them
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer, bgSeq *healSequence) {
for {
select {
case task, ok := <-h.tasks:
@@ -133,6 +134,15 @@ func (h *healRoutine) AddWorker(ctx context.Context, objAPI ObjectLayer) {
}
}
if bgSeq != nil {
// We increment relevant counter based on the heal result for prometheus reporting.
if err != nil {
bgSeq.countFailed(res)
} else {
bgSeq.countHeals(res.Type, false)
}
}
if task.respCh != nil {
task.respCh <- healResult{result: res, err: err}
}


@@ -2654,7 +2654,7 @@ func getMinioHealingMetrics(opts MetricsGroupOpts) *MetricsGroupV2 {
}
func getFailedItems(seq *healSequence) (m []MetricV2) {
items := seq.gethealFailedItemsMap()
items := seq.getHealFailedItemsMap()
m = make([]MetricV2, 0, len(items))
for k, v := range items {
s := strings.Split(k, ",")


@@ -172,7 +172,7 @@ func healingMetricsPrometheus(ch chan<- prometheus.Metric) {
float64(v), string(k),
)
}
for k, v := range bgSeq.gethealFailedItemsMap() {
for k, v := range bgSeq.getHealFailedItemsMap() {
// healFailedItemsMap stores the endpoint and volume state separated by comma,
// split the fields and pass to channel at correct index
s := strings.Split(k, ",")
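
For context on how these maps end up in Prometheus, here is a hedged sketch of a collector that turns an "endpoint,state"-keyed map, such as the one returned by getHealFailedItemsMap, into labeled metrics by splitting the key the same way the code above does. The metric name, labels, and wiring are assumptions for illustration, not MinIO's actual exporter code.

```go
// heal_failed_collector_sketch.go - hypothetical Prometheus collector over an
// "endpoint,state" counter map; names are illustrative only.
package main

import (
	"strings"

	"github.com/prometheus/client_golang/prometheus"
)

var healFailedDesc = prometheus.NewDesc(
	prometheus.BuildFQName("minio", "heal", "objects_failed_total"),
	"Objects for which heal failed, by drive endpoint and drive state",
	[]string{"endpoint", "state"}, nil,
)

// healFailedCollector pulls fresh counters on every scrape.
type healFailedCollector struct {
	failed func() map[string]int64 // e.g. a getHealFailedItemsMap-like snapshot
}

func (c healFailedCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- healFailedDesc
}

func (c healFailedCollector) Collect(ch chan<- prometheus.Metric) {
	for k, v := range c.failed() {
		// Keys store the endpoint and drive state separated by a comma;
		// split them into the two label values.
		parts := strings.SplitN(k, ",", 2)
		if len(parts) != 2 {
			continue
		}
		ch <- prometheus.MustNewConstMetric(
			healFailedDesc, prometheus.GaugeValue, float64(v), parts[0], parts[1],
		)
	}
}

func main() {
	reg := prometheus.NewRegistry()
	reg.MustRegister(healFailedCollector{
		failed: func() map[string]int64 {
			return map[string]int64{"http://node1:9000/disk1,offline": 2}
		},
	})
	// reg would normally be exposed via promhttp.HandlerFor(reg, ...).
}
```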