Add system CPU metrics to metrics-v3 (#19560)

endpoint: /minio/metrics/v3/system/cpu

metrics:
- minio_system_cpu_avg_idle
- minio_system_cpu_avg_iowait
- minio_system_cpu_load
- minio_system_cpu_load_perc
- minio_system_cpu_nice
- minio_system_cpu_steal
- minio_system_cpu_system
- minio_system_cpu_user
This commit is contained in:
Shireesh Anjal 2024-04-24 05:26:12 +05:30 committed by GitHub
parent 9693c382a8
commit f7b665347e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 150 additions and 9 deletions

View File

@ -162,14 +162,7 @@ func init() {
resourceCollector = newMinioResourceCollector(resourceMetricsGroups)
}
func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64, labels map[string]string, isCumulative bool) {
resourceMetricsMapMu.Lock()
defer resourceMetricsMapMu.Unlock()
subsysMetrics, found := resourceMetricsMap[subSys]
if !found {
subsysMetrics = ResourceMetrics{}
}
func getResourceKey(name MetricName, labels map[string]string) string {
// labels are used to uniquely identify a metric
// e.g. reads_per_sec_{drive} inside the map
sfx := ""
@ -180,7 +173,18 @@ func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64,
sfx += v
}
key := string(name) + "_" + sfx
return string(name) + "_" + sfx
}
func updateResourceMetrics(subSys MetricSubsystem, name MetricName, val float64, labels map[string]string, isCumulative bool) {
resourceMetricsMapMu.Lock()
defer resourceMetricsMapMu.Unlock()
subsysMetrics, found := resourceMetricsMap[subSys]
if !found {
subsysMetrics = ResourceMetrics{}
}
key := getResourceKey(name, labels)
metric, found := subsysMetrics[key]
if !found {
metric = ResourceMetric{

View File

@ -35,6 +35,7 @@ type metricsCache struct {
esetHealthResult *cachevalue.Cache[HealthResult]
driveMetrics *cachevalue.Cache[storageMetrics]
memoryMetrics *cachevalue.Cache[madmin.MemInfo]
cpuMetrics *cachevalue.Cache[madmin.CPUMetrics]
clusterDriveMetrics *cachevalue.Cache[storageMetrics]
nodesUpDown *cachevalue.Cache[nodesOnline]
}
@ -45,6 +46,7 @@ func newMetricsCache() *metricsCache {
esetHealthResult: newESetHealthResultCache(),
driveMetrics: newDriveMetricsCache(),
memoryMetrics: newMemoryMetricsCache(),
cpuMetrics: newCPUMetricsCache(),
clusterDriveMetrics: newClusterStorageInfoCache(),
nodesUpDown: newNodesUpDownCache(),
}
@ -200,6 +202,31 @@ func newDriveMetricsCache() *cachevalue.Cache[storageMetrics] {
loadDriveMetrics)
}
// newCPUMetricsCache builds the cache backing the system CPU metrics.
//
// The loader collects CPU metrics restricted to the host named by
// globalLocalNodeName and yields the first per-host entry that carries CPU
// data. When no entry has CPU data, the zero-value madmin.CPUMetrics is
// returned; the loader itself never reports an error.
//
// The cache is created with a one minute interval and ReturnLastGood enabled.
func newCPUMetricsCache() *cachevalue.Cache[madmin.CPUMetrics] {
	loader := func() (madmin.CPUMetrics, error) {
		localOnly := collectMetricsOpts{
			hosts: map[string]struct{}{
				globalLocalNodeName: {},
			},
		}
		collected := collectLocalMetrics(madmin.MetricsCPU, localOnly)
		for _, hostMetrics := range collected.ByHost {
			if hostMetrics.CPU == nil {
				continue
			}
			return *hostMetrics.CPU, nil
		}
		return madmin.CPUMetrics{}, nil
	}

	cacheOpts := cachevalue.Opts{ReturnLastGood: true}
	return cachevalue.NewFromFunc(1*time.Minute, cacheOpts, loader)
}
func newMemoryMetricsCache() *cachevalue.Cache[madmin.MemInfo] {
loadMemoryMetrics := func() (v madmin.MemInfo, err error) {
var types madmin.MetricType = madmin.MetricsMem

View File

@ -0,0 +1,82 @@
// Copyright (c) 2015-2024 MinIO, Inc.
//
// # This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import (
"context"
"math"
)
// Names of the metrics in the `/system/cpu` metrics group. Each one is passed
// both to NewGaugeMD (to build its descriptor) and to MetricValues.Set in
// loadCPUMetrics, so the two must use the same constant.
const (
sysCPUAvgIdle = "avg_idle"
sysCPUAvgIOWait = "avg_iowait"
sysCPULoad = "load"
sysCPULoadPerc = "load_perc"
sysCPUNice = "nice"
sysCPUSteal = "steal"
sysCPUSystem = "system"
sysCPUUser = "user"
)
// Gauge descriptors (metric name + help text) for the system CPU metrics.
//
// NOTE(review): the help texts for nice/steal/system/user say "time", but
// loadCPUMetrics publishes each of them as a percentage of the summed CPU
// times, rounded to two decimals.
var (
sysCPUAvgIdleMD = NewGaugeMD(sysCPUAvgIdle, "Average CPU idle time")
sysCPUAvgIOWaitMD = NewGaugeMD(sysCPUAvgIOWait, "Average CPU IOWait time")
sysCPULoadMD = NewGaugeMD(sysCPULoad, "CPU load average 1min")
sysCPULoadPercMD = NewGaugeMD(sysCPULoadPerc, "CPU load average 1min (percentage)")
sysCPUNiceMD = NewGaugeMD(sysCPUNice, "CPU nice time")
sysCPUStealMD = NewGaugeMD(sysCPUSteal, "CPU steal time")
sysCPUSystemMD = NewGaugeMD(sysCPUSystem, "CPU system time")
sysCPUUserMD = NewGaugeMD(sysCPUUser, "CPU user time")
)
// loadCPUMetrics - `MetricsLoaderFn` for system CPU metrics.
//
// It reads the cached CPU metrics from c and publishes into m:
//   - the 1 minute load average, and the same value as a percentage of the
//     CPU count;
//   - user/system/nice/steal CPU times, each as a percentage of the summed
//     user+system+idle+iowait+nice+steal times;
//   - the average idle and iowait values tracked in resourceMetricsMap.
//
// All percentages and averages are rounded to two decimals. A value whose
// inputs are unavailable (nil stats, zero CPU count, zero total time, missing
// resource entry) is left unset instead of being published as NaN/Inf.
// The function always returns nil.
func loadCPUMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
	// Best effort: the error is deliberately ignored. On failure the returned
	// value may be the zero value, which the nil checks below handle.
	cpuMetrics, _ := c.cpuMetrics.Get()

	if ls := cpuMetrics.LoadStat; ls != nil {
		m.Set(sysCPULoad, ls.Load1)
		// Without a positive CPU count the percentage would be +Inf or NaN.
		if cpuMetrics.CPUCount > 0 {
			perc := ls.Load1 * 100 / float64(cpuMetrics.CPUCount)
			m.Set(sysCPULoadPerc, math.Round(perc*100)/100)
		}
	}

	// TimesStat is nil when no CPU metrics were collected (the cache loader
	// then yields a zero-value struct); dereferencing it would panic.
	if ts := cpuMetrics.TimesStat; ts != nil {
		tot := ts.User + ts.System + ts.Idle + ts.Iowait + ts.Nice + ts.Steal
		// A zero total would turn every share into NaN.
		if tot > 0 {
			share := func(v float64) float64 {
				return math.Round(v/tot*100*100) / 100
			}
			m.Set(sysCPUUser, share(ts.User))
			m.Set(sysCPUSystem, share(ts.System))
			m.Set(sysCPUNice, share(ts.Nice))
			m.Set(sysCPUSteal, share(ts.Steal))
		}
	}

	// metrics-resource.go runs a job to collect resource metrics including their Avg values and
	// stores them in resourceMetricsMap. We can use it to get the Avg values of CPU idle and IOWait.
	//
	// updateResourceMetrics mutates that map while holding resourceMetricsMapMu,
	// so the lookup must hold the same mutex. The values are copied out and the
	// mutex released before calling into m.
	var (
		avgIdle, avgIOWait   float64
		haveIdle, haveIOWait bool
	)
	resourceMetricsMapMu.Lock()
	if cpuResourceMetrics, found := resourceMetricsMap[cpuSubsystem]; found {
		if metric, ok := cpuResourceMetrics[getResourceKey(cpuIdle, nil)]; ok {
			avgIdle, haveIdle = metric.Avg, true
		}
		if metric, ok := cpuResourceMetrics[getResourceKey(cpuIOWait, nil)]; ok {
			avgIOWait, haveIOWait = metric.Avg, true
		}
	}
	resourceMetricsMapMu.Unlock()

	if haveIdle {
		m.Set(sysCPUAvgIdle, math.Round(avgIdle*100)/100)
	}
	if haveIOWait {
		m.Set(sysCPUAvgIOWait, math.Round(avgIOWait*100)/100)
	}

	return nil
}

View File

@ -36,6 +36,7 @@ const (
systemNetworkInternodeCollectorPath collectorPath = "/system/network/internode"
systemDriveCollectorPath collectorPath = "/system/drive"
systemMemoryCollectorPath collectorPath = "/system/memory"
systemCPUCollectorPath collectorPath = "/system/cpu"
systemProcessCollectorPath collectorPath = "/system/process"
systemGoCollectorPath collectorPath = "/system/go"
@ -128,6 +129,20 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
loadMemoryMetrics,
)
systemCPUMG := NewMetricsGroup(systemCPUCollectorPath,
[]MetricDescriptor{
sysCPUAvgIdleMD,
sysCPUAvgIOWaitMD,
sysCPULoadMD,
sysCPULoadPercMD,
sysCPUNiceMD,
sysCPUStealMD,
sysCPUSystemMD,
sysCPUUserMD,
},
loadCPUMetrics,
)
systemDriveMG := NewMetricsGroup(systemDriveCollectorPath,
[]MetricDescriptor{
driveUsedBytesMD,
@ -235,6 +250,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
systemNetworkInternodeMG,
systemDriveMG,
systemMemoryMG,
systemCPUMG,
clusterHealthMG,
clusterUsageObjectsMG,

View File

@ -139,6 +139,18 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
| `minio_system_memory_shared` | `gauge` | Shared memory on the node | `server` |
| `minio_system_memory_available` | `gauge` | Available memory on the node | `server` |
### `/system/cpu`
| Name | Type | Help | Labels |
|-------------------------------|---------|------------------------------------|----------|
| `minio_system_cpu_avg_idle` | `gauge` | Average CPU idle time | `server` |
| `minio_system_cpu_avg_iowait` | `gauge` | Average CPU IOWait time | `server` |
| `minio_system_cpu_load` | `gauge` | CPU load average 1min | `server` |
| `minio_system_cpu_load_perc` | `gauge` | CPU load average 1min (percentage) | `server` |
| `minio_system_cpu_nice` | `gauge` | CPU nice time | `server` |
| `minio_system_cpu_steal` | `gauge` | CPU steal time | `server` |
| `minio_system_cpu_system` | `gauge` | CPU system time | `server` |
| `minio_system_cpu_user` | `gauge` | CPU user time | `server` |
### `/system/network/internode`