Capture percentage of cpu load and memory used (#18596)

By default the CPU load is the cumulative load across all cores. Also capture
the percentage load (load * 100 / cpu-count).

Also capture the percentage memory used (used * 100 / total)
This commit is contained in:
Shireesh Anjal 2023-12-07 02:49:59 +05:30 committed by GitHub
parent 5cc2c62c66
commit 7350a29fec
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 102 additions and 65 deletions

View File

@ -91,7 +91,7 @@ func collectLocalMetrics(types madmin.MetricType, opts collectMetricsOpts) (m ma
}
cm, err := c.Times(false)
if err != nil {
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cputimes)", globalMinioAddr, err.Error()))
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cpuTimes)", globalMinioAddr, err.Error()))
} else {
// not collecting per-cpu stats, so there will be only one element
if len(cm) == 1 {
@ -100,6 +100,13 @@ func collectLocalMetrics(types madmin.MetricType, opts collectMetricsOpts) (m ma
m.Errors = append(m.Errors, fmt.Sprintf("%s: Expected one CPU stat, got %d", globalMinioAddr, len(cm)))
}
}
cpuCount, err := c.Counts(true)
if err != nil {
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (cpuCount)", globalMinioAddr, err.Error()))
} else {
m.Aggregated.CPU.CPUCount = cpuCount
}
loadStat, err := load.Avg()
if err != nil {
m.Errors = append(m.Errors, fmt.Sprintf("%s: %v (loadStat)", globalMinioAddr, err.Error()))

View File

@ -53,6 +53,7 @@ const (
// memory stats
memUsed MetricName = "used"
memUsedPerc MetricName = "used_perc"
memFree MetricName = "free"
memShared MetricName = "shared"
memBuffers MetricName = "buffers"
@ -60,15 +61,18 @@ const (
memAvailable MetricName = "available"
// cpu stats
cpuUser MetricName = "user"
cpuSystem MetricName = "system"
cpuIOWait MetricName = "iowait"
cpuIdle MetricName = "idle"
cpuNice MetricName = "nice"
cpuSteal MetricName = "steal"
cpuLoad1 MetricName = "load1"
cpuLoad5 MetricName = "load5"
cpuLoad15 MetricName = "load15"
cpuUser MetricName = "user"
cpuSystem MetricName = "system"
cpuIOWait MetricName = "iowait"
cpuIdle MetricName = "idle"
cpuNice MetricName = "nice"
cpuSteal MetricName = "steal"
cpuLoad1 MetricName = "load1"
cpuLoad5 MetricName = "load5"
cpuLoad15 MetricName = "load15"
cpuLoad1Perc MetricName = "load1_perc"
cpuLoad5Perc MetricName = "load5_perc"
cpuLoad15Perc MetricName = "load15_perc"
)
var (
@ -126,6 +130,7 @@ func init() {
interfaceTxErrors: "Transmit errors in " + interval,
total: "Total memory on the node",
memUsed: "Used memory on the node",
memUsedPerc: "Used memory percentage on the node",
memFree: "Free memory on the node",
memShared: "Shared memory on the node",
memBuffers: "Buffers memory on the node",
@ -151,6 +156,9 @@ func init() {
cpuLoad1: "CPU load average 1min",
cpuLoad5: "CPU load average 5min",
cpuLoad15: "CPU load average 15min",
cpuLoad1Perc: "CPU load average 1min (percentage)",
cpuLoad5Perc: "CPU load average 5min (percentage)",
cpuLoad15Perc: "CPU load average 15min (percentage)",
}
resourceMetricsGroups = []*MetricsGroup{
getResourceMetrics(),
@ -283,6 +291,8 @@ func collectLocalResourceMetrics() {
stats := hm.Mem.Info
updateResourceMetrics(memSubsystem, total, float64(stats.Total), labels, false)
updateResourceMetrics(memSubsystem, memUsed, float64(stats.Used), labels, false)
perc := math.Round(float64(stats.Used*100*100)/float64(stats.Total)) / 100
updateResourceMetrics(memSubsystem, memUsedPerc, perc, labels, false)
updateResourceMetrics(memSubsystem, memFree, float64(stats.Free), labels, false)
updateResourceMetrics(memSubsystem, memShared, float64(stats.Shared), labels, false)
updateResourceMetrics(memSubsystem, memBuffers, float64(stats.Buffers), labels, false)
@ -312,6 +322,14 @@ func collectLocalResourceMetrics() {
updateResourceMetrics(cpuSubsystem, cpuLoad1, ls.Load1, labels, false)
updateResourceMetrics(cpuSubsystem, cpuLoad5, ls.Load5, labels, false)
updateResourceMetrics(cpuSubsystem, cpuLoad15, ls.Load15, labels, false)
if hm.CPU.CPUCount > 0 {
perc := math.Round(ls.Load1*100*100/float64(hm.CPU.CPUCount)) / 100
updateResourceMetrics(cpuSubsystem, cpuLoad1Perc, perc, labels, false)
perc = math.Round(ls.Load5*100*100/float64(hm.CPU.CPUCount)) / 100
updateResourceMetrics(cpuSubsystem, cpuLoad5Perc, perc, labels, false)
perc = math.Round(ls.Load15*100*100/float64(hm.CPU.CPUCount)) / 100
updateResourceMetrics(cpuSubsystem, cpuLoad15Perc, perc, labels, false)
}
}
}
break // only one host expected

View File

@ -345,58 +345,70 @@ For deployments behind a load balancer, use the load balancer hostname instead o
## CPU Metrics
| Name | Description |
|:--------------------------------|:------------------------------|
| `minio_node_cpu_avg_user` | CPU user time. |
| `minio_node_cpu_avg_user_avg` | CPU user time (avg). |
| `minio_node_cpu_avg_user_max` | CPU user time (max). |
| `minio_node_cpu_avg_system` | CPU system time. |
| `minio_node_cpu_avg_system_avg` | CPU system time (avg). |
| `minio_node_cpu_avg_system_max` | CPU system time (max). |
| `minio_node_cpu_avg_idle` | CPU idle time. |
| `minio_node_cpu_avg_idle_avg` | CPU idle time (avg). |
| `minio_node_cpu_avg_idle_max` | CPU idle time (max). |
| `minio_node_cpu_avg_iowait` | CPU ioWait time. |
| `minio_node_cpu_avg_iowait_avg` | CPU ioWait time (avg). |
| `minio_node_cpu_avg_iowait_max` | CPU ioWait time (max). |
| `minio_node_cpu_avg_nice` | CPU nice time. |
| `minio_node_cpu_avg_nice_avg` | CPU nice time (avg). |
| `minio_node_cpu_avg_nice_max` | CPU nice time (max). |
| `minio_node_cpu_avg_steal` | CPU steal time. |
| `minio_node_cpu_avg_steal_avg` | CPU steal time (avg). |
| `minio_node_cpu_avg_steal_max` | CPU steal time (max). |
| `minio_node_cpu_avg_load1` | CPU load average 1min. |
| `minio_node_cpu_avg_load1_avg` | CPU load average 1min (avg). |
| `minio_node_cpu_avg_load1_max` | CPU load average 1min (max). |
| `minio_node_cpu_avg_load5` | CPU load average 5min. |
| `minio_node_cpu_avg_load5_avg` | CPU load average 5min (avg). |
| `minio_node_cpu_avg_load5_max` | CPU load average 5min (max). |
| `minio_node_cpu_avg_load15` | CPU load average 15min. |
| `minio_node_cpu_avg_load15_avg` | CPU load average 15min (avg). |
| `minio_node_cpu_avg_load15_max` | CPU load average 15min (max). |
| Name | Description |
|:-------------------------------------|:-------------------------------------------|
| `minio_node_cpu_avg_user` | CPU user time. |
| `minio_node_cpu_avg_user_avg` | CPU user time (avg). |
| `minio_node_cpu_avg_user_max` | CPU user time (max). |
| `minio_node_cpu_avg_system` | CPU system time. |
| `minio_node_cpu_avg_system_avg` | CPU system time (avg). |
| `minio_node_cpu_avg_system_max` | CPU system time (max). |
| `minio_node_cpu_avg_idle` | CPU idle time. |
| `minio_node_cpu_avg_idle_avg` | CPU idle time (avg). |
| `minio_node_cpu_avg_idle_max` | CPU idle time (max). |
| `minio_node_cpu_avg_iowait` | CPU ioWait time. |
| `minio_node_cpu_avg_iowait_avg` | CPU ioWait time (avg). |
| `minio_node_cpu_avg_iowait_max` | CPU ioWait time (max). |
| `minio_node_cpu_avg_nice` | CPU nice time. |
| `minio_node_cpu_avg_nice_avg` | CPU nice time (avg). |
| `minio_node_cpu_avg_nice_max` | CPU nice time (max). |
| `minio_node_cpu_avg_steal` | CPU steal time. |
| `minio_node_cpu_avg_steal_avg` | CPU steal time (avg). |
| `minio_node_cpu_avg_steal_max` | CPU steal time (max). |
| `minio_node_cpu_avg_load1` | CPU load average 1min. |
| `minio_node_cpu_avg_load1_avg` | CPU load average 1min (avg). |
| `minio_node_cpu_avg_load1_max` | CPU load average 1min (max). |
| `minio_node_cpu_avg_load1_perc` | CPU load average 1min (percentage). |
| `minio_node_cpu_avg_load1_perc_avg` | CPU load average 1min (percentage) (avg). |
| `minio_node_cpu_avg_load1_perc_max` | CPU load average 1min (percentage) (max). |
| `minio_node_cpu_avg_load5` | CPU load average 5min. |
| `minio_node_cpu_avg_load5_avg` | CPU load average 5min (avg). |
| `minio_node_cpu_avg_load5_max` | CPU load average 5min (max). |
| `minio_node_cpu_avg_load5_perc` | CPU load average 5min (percentage). |
| `minio_node_cpu_avg_load5_perc_avg` | CPU load average 5min (percentage) (avg). |
| `minio_node_cpu_avg_load5_perc_max` | CPU load average 5min (percentage) (max). |
| `minio_node_cpu_avg_load15` | CPU load average 15min. |
| `minio_node_cpu_avg_load15_avg` | CPU load average 15min (avg). |
| `minio_node_cpu_avg_load15_max` | CPU load average 15min (max). |
| `minio_node_cpu_avg_load15_perc` | CPU load average 15min (percentage). |
| `minio_node_cpu_avg_load15_perc_avg` | CPU load average 15min (percentage) (avg). |
| `minio_node_cpu_avg_load15_perc_max` | CPU load average 15min (percentage) (max). |
## Memory Metrics
| Name | Description |
|:-------------------------------|:------------------------------------|
| `minio_node_mem_available` | Available memory on the node. |
| `minio_node_mem_available_avg` | Available memory on the node (avg). |
| `minio_node_mem_available_max` | Available memory on the node (max). |
| `minio_node_mem_buffers` | Buffers memory on the node. |
| `minio_node_mem_buffers_avg` | Buffers memory on the node (avg). |
| `minio_node_mem_buffers_max` | Buffers memory on the node (max). |
| `minio_node_mem_cache` | Cache memory on the node. |
| `minio_node_mem_cache_avg` | Cache memory on the node (avg). |
| `minio_node_mem_cache_max` | Cache memory on the node (max). |
| `minio_node_mem_free` | Free memory on the node. |
| `minio_node_mem_free_avg` | Free memory on the node (avg). |
| `minio_node_mem_free_max` | Free memory on the node (max). |
| `minio_node_mem_shared` | Shared memory on the node. |
| `minio_node_mem_shared_avg` | Shared memory on the node (avg). |
| `minio_node_mem_shared_max` | Shared memory on the node (max). |
| `minio_node_mem_total` | Total memory on the node. |
| `minio_node_mem_total_avg` | Total memory on the node (avg). |
| `minio_node_mem_total_max` | Total memory on the node (max). |
| `minio_node_mem_used` | Used memory on the node. |
| `minio_node_mem_used_avg` | Used memory on the node (avg). |
| `minio_node_mem_used_max` | Used memory on the node (max). |
| Name | Description |
|:-------------------------------|:------------------------------------------|
| `minio_node_mem_available` | Available memory on the node. |
| `minio_node_mem_available_avg` | Available memory on the node (avg). |
| `minio_node_mem_available_max` | Available memory on the node (max). |
| `minio_node_mem_buffers` | Buffers memory on the node. |
| `minio_node_mem_buffers_avg` | Buffers memory on the node (avg). |
| `minio_node_mem_buffers_max` | Buffers memory on the node (max). |
| `minio_node_mem_cache` | Cache memory on the node. |
| `minio_node_mem_cache_avg` | Cache memory on the node (avg). |
| `minio_node_mem_cache_max` | Cache memory on the node (max). |
| `minio_node_mem_free` | Free memory on the node. |
| `minio_node_mem_free_avg` | Free memory on the node (avg). |
| `minio_node_mem_free_max` | Free memory on the node (max). |
| `minio_node_mem_shared` | Shared memory on the node. |
| `minio_node_mem_shared_avg` | Shared memory on the node (avg). |
| `minio_node_mem_shared_max` | Shared memory on the node (max). |
| `minio_node_mem_total` | Total memory on the node. |
| `minio_node_mem_total_avg` | Total memory on the node (avg). |
| `minio_node_mem_total_max` | Total memory on the node (max). |
| `minio_node_mem_used` | Used memory on the node. |
| `minio_node_mem_used_avg` | Used memory on the node (avg). |
| `minio_node_mem_used_max` | Used memory on the node (max). |
| `minio_node_mem_used_perc` | Used memory percentage on the node. |
| `minio_node_mem_used_perc_avg` | Used memory percentage on the node (avg). |
| `minio_node_mem_used_perc_max` | Used memory percentage on the node (max). |

2
go.mod
View File

@ -49,7 +49,7 @@ require (
github.com/minio/dperf v0.5.2
github.com/minio/highwayhash v1.0.2
github.com/minio/kes-go v0.2.0
github.com/minio/madmin-go/v3 v3.0.35-0.20231130082526-199918d0ff20
github.com/minio/madmin-go/v3 v3.0.36
github.com/minio/minio-go/v7 v7.0.65-0.20231122233251-1f7dd6b7e3e1
github.com/minio/mux v1.9.0
github.com/minio/pkg/v2 v2.0.4

4
go.sum
View File

@ -446,8 +446,8 @@ github.com/minio/highwayhash v1.0.2 h1:Aak5U0nElisjDCfPSG79Tgzkn2gl66NxOMspRrKnA
github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLTk+kldvVxY=
github.com/minio/kes-go v0.2.0 h1:HA33arq9s3MErbsj3PAXFVfFo4U4yw7lTKQ5kWFrpCA=
github.com/minio/kes-go v0.2.0/go.mod h1:VorHLaIYis9/MxAHAtXN4d8PUMNKhIxTIlvFt0hBOEo=
github.com/minio/madmin-go/v3 v3.0.35-0.20231130082526-199918d0ff20 h1:5kfjAypPN18QOOQaZjR3jfGzXyIwzLdKMS7d/cPY3Wc=
github.com/minio/madmin-go/v3 v3.0.35-0.20231130082526-199918d0ff20/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8=
github.com/minio/madmin-go/v3 v3.0.36 h1:Ewu/Rt7WVSs9slWW+SZHRc5RPQdYAGIdNZnRr+gyN4k=
github.com/minio/madmin-go/v3 v3.0.36/go.mod h1:4QN2NftLSV7MdlT50dkrenOMmNVHluxTvlqJou3hte8=
github.com/minio/mc v0.0.0-20231127112613-5e6ae2172e25 h1:8jT9Tz4opgrX6mnyFWW+TQ90AnrJqJ0mzeFXUWDHNGo=
github.com/minio/mc v0.0.0-20231127112613-5e6ae2172e25/go.mod h1:8kat72LmpzZ2/xykDcq64tcRRJkkWo1Kd/Z5coC6t0w=
github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=