Add process metrics in `metrics-v3` (#19612)

endpoint: /minio/metrics/v3/system/process
metrics:
- locks_read_total
- locks_write_total
- cpu_total_seconds
- go_routine_total
- io_rchar_bytes
- io_read_bytes
- io_wchar_bytes
- io_write_bytes
- start_time_seconds
- uptime_seconds
- file_descriptor_limit_total
- file_descriptor_open_total
- syscall_read_total
- syscall_write_total
- resident_memory_bytes
- virtual_memory_bytes
- virtual_memory_max_bytes

Since the standard process collector implements only a subset of these
metrics, remove it and implement our own custom process collector that
captures all the process metrics we need.
This commit is contained in:
Shireesh Anjal 2024-04-26 21:37:23 +05:30 committed by GitHub
parent a658b976f5
commit 4caa3422bd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 220 additions and 5 deletions

View File

@ -0,0 +1,172 @@
// Copyright (c) 2015-2024 MinIO, Inc.
//
// # This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package cmd
import (
"context"
"runtime"
"time"
"github.com/prometheus/procfs"
)
// Names of the metrics published by the process metrics group. Each name is
// paired with a descriptor of the same stem (suffix "MD") declared in this
// file.
const (
	// Lock counts.
	processLocksReadTotal  = "locks_read_total"
	processLocksWriteTotal = "locks_write_total"

	// Go runtime and process lifetime.
	processGoRoutineTotal   = "go_routine_total"
	processStartTimeSeconds = "start_time_seconds"
	processUptimeSeconds    = "uptime_seconds"

	// CPU and memory usage.
	processCPUTotalSeconds       = "cpu_total_seconds"
	processResidentMemoryBytes   = "resident_memory_bytes"
	processVirtualMemoryBytes    = "virtual_memory_bytes"
	processVirtualMemoryMaxBytes = "virtual_memory_max_bytes"

	// I/O byte and syscall counters.
	processIORCharBytes      = "io_rchar_bytes"
	processIOReadBytes       = "io_read_bytes"
	processIOWCharBytes      = "io_wchar_bytes"
	processIOWriteBytes      = "io_write_bytes"
	processSyscallReadTotal  = "syscall_read_total"
	processSyscallWriteTotal = "syscall_write_total"

	// File descriptor usage and limit.
	processFileDescriptorLimitTotal = "file_descriptor_limit_total"
	processFileDescriptorOpenTotal  = "file_descriptor_open_total"
)
// Descriptors for the process metrics group. Each one binds a metric name
// constant to its kind (gauge via NewGaugeMD, counter via NewCounterMD) and
// the help text shown to scrapers.
//
// NOTE: processStarttimeSecondsMD keeps its lowercase "time" spelling because
// the identifier is referenced where the metrics group is assembled.
var (
	processLocksReadTotalMD           = NewGaugeMD(processLocksReadTotal, "Number of current READ locks on this peer")
	processLocksWriteTotalMD          = NewGaugeMD(processLocksWriteTotal, "Number of current WRITE locks on this peer")
	processCPUTotalSecondsMD          = NewCounterMD(processCPUTotalSeconds, "Total user and system CPU time spent in seconds")
	processGoRoutineTotalMD           = NewGaugeMD(processGoRoutineTotal, "Total number of go routines running")
	processIORCharBytesMD             = NewCounterMD(processIORCharBytes, "Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar")
	processIOReadBytesMD              = NewCounterMD(processIOReadBytes, "Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes")
	processIOWCharBytesMD             = NewCounterMD(processIOWCharBytes, "Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar")
	processIOWriteBytesMD             = NewCounterMD(processIOWriteBytes, "Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes")
	processStarttimeSecondsMD         = NewGaugeMD(processStartTimeSeconds, "Start time for MinIO process in seconds since Unix epoch")
	processUptimeSecondsMD            = NewGaugeMD(processUptimeSeconds, "Uptime for MinIO process in seconds")
	processFileDescriptorLimitTotalMD = NewGaugeMD(processFileDescriptorLimitTotal, "Limit on total number of open file descriptors for the MinIO Server process")
	processFileDescriptorOpenTotalMD  = NewGaugeMD(processFileDescriptorOpenTotal, "Total number of open file descriptors by the MinIO Server process")
	processSyscallReadTotalMD         = NewCounterMD(processSyscallReadTotal, "Total read SysCalls to the kernel. /proc/[pid]/io syscr")
	processSyscallWriteTotalMD        = NewCounterMD(processSyscallWriteTotal, "Total write SysCalls to the kernel. /proc/[pid]/io syscw")
	processResidentMemoryBytesMD      = NewGaugeMD(processResidentMemoryBytes, "Resident memory size in bytes")
	processVirtualMemoryBytesMD       = NewGaugeMD(processVirtualMemoryBytes, "Virtual memory size in bytes")
	processVirtualMemoryMaxBytesMD    = NewGaugeMD(processVirtualMemoryMaxBytes, "Maximum virtual memory size in bytes")
)
// loadProcStatMetrics sets the CPU time, resident memory, virtual memory and
// start time metrics on m from the given procfs.ProcStat.
//
// Only values greater than zero are set. If the start time cannot be
// determined the error is passed to metricsLogIf and that metric is skipped;
// the other metrics are unaffected.
func loadProcStatMetrics(ctx context.Context, stat procfs.ProcStat, m MetricValues) {
	if cpuSeconds := stat.CPUTime(); cpuSeconds > 0 {
		m.Set(processCPUTotalSeconds, float64(cpuSeconds))
	}
	if rss := stat.ResidentMemory(); rss > 0 {
		m.Set(processResidentMemoryBytes, float64(rss))
	}
	if vsize := stat.VirtualMemory(); vsize > 0 {
		m.Set(processVirtualMemoryBytes, float64(vsize))
	}

	switch startedAt, err := stat.StartTime(); {
	case err != nil:
		metricsLogIf(ctx, err)
	case startedAt > 0:
		m.Set(processStartTimeSeconds, float64(startedAt))
	}
}
// loadProcIOMetrics sets the I/O byte counters and read/write syscall
// counters on m from the given procfs.ProcIO.
//
// A field is only reported when it is greater than zero. ctx is accepted for
// signature symmetry with the other loaders and is not used here.
func loadProcIOMetrics(ctx context.Context, io procfs.ProcIO, m MetricValues) {
	// Bytes read (rchar, read_bytes).
	if n := io.RChar; n > 0 {
		m.Set(processIORCharBytes, float64(n))
	}
	if n := io.ReadBytes; n > 0 {
		m.Set(processIOReadBytes, float64(n))
	}

	// Bytes written (wchar, write_bytes).
	if n := io.WChar; n > 0 {
		m.Set(processIOWCharBytes, float64(n))
	}
	if n := io.WriteBytes; n > 0 {
		m.Set(processIOWriteBytes, float64(n))
	}

	// Read and write syscall counts (syscr, syscw).
	if n := io.SyscR; n > 0 {
		m.Set(processSyscallReadTotal, float64(n))
	}
	if n := io.SyscW; n > 0 {
		m.Set(processSyscallWriteTotal, float64(n))
	}
}
// loadProcFSMetrics gathers everything this file reads through procfs for the
// process p and sets the corresponding metrics on m: stat, I/O counters,
// resource limits and the open file descriptor count.
//
// The four sources are queried independently. When one of them fails, the
// error is handed to metricsLogIf and the remaining sources are still tried.
// Limit and descriptor values are only reported when greater than zero.
func loadProcFSMetrics(ctx context.Context, p procfs.Proc, m MetricValues) {
	if stat, err := p.Stat(); err != nil {
		metricsLogIf(ctx, err)
	} else {
		loadProcStatMetrics(ctx, stat, m)
	}

	if ioStats, err := p.IO(); err != nil {
		metricsLogIf(ctx, err)
	} else {
		loadProcIOMetrics(ctx, ioStats, m)
	}

	if limits, err := p.Limits(); err != nil {
		metricsLogIf(ctx, err)
	} else {
		if limits.OpenFiles > 0 {
			m.Set(processFileDescriptorLimitTotal, float64(limits.OpenFiles))
		}
		if limits.AddressSpace > 0 {
			m.Set(processVirtualMemoryMaxBytes, float64(limits.AddressSpace))
		}
	}

	switch fdCount, err := p.FileDescriptorsLen(); {
	case err != nil:
		metricsLogIf(ctx, err)
	case fdCount > 0:
		m.Set(processFileDescriptorOpenTotal, float64(fdCount))
	}
}
// loadProcessMetrics - `MetricsLoaderFn` for process metrics.
//
// It sets, in order: the current goroutine count, the uptime (only when
// globalBootTime is non-zero), the procfs-derived metrics for the current
// process, and the read/write lock counts (only when globalIsDistErasure is
// set and globalLockServer is non-nil).
//
// A procfs failure is passed to metricsLogIf rather than returned; the
// function always returns nil. The cache argument c is not used.
func loadProcessMetrics(ctx context.Context, m MetricValues, c *metricsCache) error {
	goroutines := runtime.NumGoroutine()
	m.Set(processGoRoutineTotal, float64(goroutines))

	if !globalBootTime.IsZero() {
		uptime := time.Since(globalBootTime)
		m.Set(processUptimeSeconds, uptime.Seconds())
	}

	if self, err := procfs.Self(); err != nil {
		metricsLogIf(ctx, err)
	} else {
		loadProcFSMetrics(ctx, self, m)
	}

	// Lock statistics are only reported when both conditions hold.
	if !globalIsDistErasure || globalLockServer == nil {
		return nil
	}
	lockStats := globalLockServer.stats()
	m.Set(processLocksReadTotal, float64(lockStats.Reads))
	m.Set(processLocksWriteTotal, float64(lockStats.Writes))
	return nil
}

View File

@ -144,6 +144,29 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
loadCPUMetrics,
)
systemProcessMG := NewMetricsGroup(systemProcessCollectorPath,
[]MetricDescriptor{
processLocksReadTotalMD,
processLocksWriteTotalMD,
processCPUTotalSecondsMD,
processGoRoutineTotalMD,
processIORCharBytesMD,
processIOReadBytesMD,
processIOWCharBytesMD,
processIOWriteBytesMD,
processStarttimeSecondsMD,
processUptimeSecondsMD,
processFileDescriptorLimitTotalMD,
processFileDescriptorOpenTotalMD,
processSyscallReadTotalMD,
processSyscallWriteTotalMD,
processResidentMemoryBytesMD,
processVirtualMemoryBytesMD,
processVirtualMemoryMaxBytesMD,
},
loadProcessMetrics,
)
systemDriveMG := NewMetricsGroup(systemDriveCollectorPath,
[]MetricDescriptor{
driveUsedBytesMD,
@ -263,6 +286,7 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
systemDriveMG,
systemMemoryMG,
systemCPUMG,
systemProcessMG,
clusterHealthMG,
clusterUsageObjectsMG,
@ -299,13 +323,10 @@ func newMetricGroups(r *prometheus.Registry) *metricsV3Collection {
}
// Prepare to register the collectors. Other than `MetricGroup` collectors,
// we also have standard collectors like `ProcessCollector` and `GoCollector`.
// we also have standard collectors like `GoCollector`.
// Create all Non-`MetricGroup` collectors here.
collectors := map[collectorPath]prometheus.Collector{
systemProcessCollectorPath: collectors.NewProcessCollector(collectors.ProcessCollectorOpts{
ReportErrors: true,
}),
systemGoCollectorPath: collectors.NewGoCollector(),
}

View File

@ -64,7 +64,7 @@ These present metrics about the whole MinIO cluster.
Each of the following sub-sections list metrics returned by each of the endpoints.
The standard metrics groups for ProcessCollector and GoCollector are not shown below.
The standard metrics group for GoCollector is not shown below.
### `/api/requests`
@ -163,6 +163,28 @@ The standard metrics groups for ProcessCollector and GoCollector are not shown b
| `minio_system_network_internode_sent_bytes_total` | `counter` | Total number of bytes sent to other peer nodes | `server,pool_index` |
| `minio_system_network_internode_recv_bytes_total` | `counter` | Total number of bytes received from other peer nodes | `server,pool_index` |
### `/system/process`
| Name | Type | Help | Labels |
|-------------------------------|-----------|----------------------------------------------------------------------------------------------------------------|----------|
| `locks_read_total` | `gauge` | Number of current READ locks on this peer | `server` |
| `locks_write_total` | `gauge` | Number of current WRITE locks on this peer | `server` |
| `cpu_total_seconds` | `counter` | Total user and system CPU time spent in seconds | `server` |
| `go_routine_total` | `gauge` | Total number of go routines running | `server` |
| `io_rchar_bytes` | `counter` | Total bytes read by the process from the underlying storage system including cache, /proc/[pid]/io rchar | `server` |
| `io_read_bytes` | `counter` | Total bytes read by the process from the underlying storage system, /proc/[pid]/io read_bytes | `server` |
| `io_wchar_bytes` | `counter` | Total bytes written by the process to the underlying storage system including page cache, /proc/[pid]/io wchar | `server` |
| `io_write_bytes` | `counter` | Total bytes written by the process to the underlying storage system, /proc/[pid]/io write_bytes | `server` |
| `start_time_seconds` | `gauge` | Start time for MinIO process in seconds since Unix epoch | `server` |
| `uptime_seconds` | `gauge` | Uptime for MinIO process in seconds | `server` |
| `file_descriptor_limit_total` | `gauge` | Limit on total number of open file descriptors for the MinIO Server process | `server` |
| `file_descriptor_open_total` | `gauge` | Total number of open file descriptors by the MinIO Server process | `server` |
| `syscall_read_total` | `counter` | Total read SysCalls to the kernel. /proc/[pid]/io syscr | `server` |
| `syscall_write_total` | `counter` | Total write SysCalls to the kernel. /proc/[pid]/io syscw | `server` |
| `resident_memory_bytes` | `gauge` | Resident memory size in bytes | `server` |
| `virtual_memory_bytes` | `gauge` | Virtual memory size in bytes | `server` |
| `virtual_memory_max_bytes` | `gauge` | Maximum virtual memory size in bytes | `server` |
### `/cluster/health`
| Name | Type | Help | Labels |