honor replaced disk properly by updating globalLocalDrives (#19038)

globalLocalDrives seem to be not updated during the
HealFormat() leads to a requirement where the server
needs to be restarted for the healing to continue.
This commit is contained in:
Harshavardhana 2024-02-12 13:00:20 -08:00 committed by GitHub
parent 6d381f7c0a
commit eac4e4b279
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 25 additions and 59 deletions

View File

@ -1155,9 +1155,18 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
s.erasureDisks[m][n] = disk
if disk.IsLocal() && globalIsDistErasure {
if disk.IsLocal() {
globalLocalDrivesMu.Lock()
globalLocalSetDrives[s.poolIndex][m][n] = disk
if globalIsDistErasure {
globalLocalSetDrives[s.poolIndex][m][n] = disk
}
for i, ldisk := range globalLocalDrives {
_, k, l := ldisk.GetDiskLoc()
if k == m && l == n {
globalLocalDrives[i] = disk
break
}
}
globalLocalDrivesMu.Unlock()
}
}

View File

@ -203,7 +203,6 @@ func getDisksInfo(disks []StorageAPI, endpoints []Endpoint, metrics bool) (disks
APICalls: make(map[string]uint64, len(info.Metrics.APICalls)),
TotalErrorsAvailability: info.Metrics.TotalErrorsAvailability,
TotalErrorsTimeout: info.Metrics.TotalErrorsTimeout,
TotalTokens: info.Metrics.TotalTokens,
TotalWaiting: info.Metrics.TotalWaiting,
}
for k, v := range info.Metrics.LastMinute {

View File

@ -419,7 +419,7 @@ var (
globalServiceFreezeMu sync.Mutex // Updates.
// List of local drives to this node, this is only set during server startup,
// and should never be mutated. Hold globalLocalDrivesMu to access.
// and is only mutated by HealFormat. Hold globalLocalDrivesMu to access.
globalLocalDrives []StorageAPI
globalLocalDrivesMu sync.RWMutex

View File

@ -556,16 +556,6 @@ func getNodeDriveWaitingIOMD() MetricDescription {
}
}
func getNodeDriveTokensIOMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
Subsystem: driveSubsystem,
Name: "io_tokens",
Help: "Total number concurrent I/O operations configured on drive",
Type: counterMetric,
}
}
func getNodeDriveFreeBytesMD() MetricDescription {
return MetricDescription{
Namespace: nodeMetricNamespace,
@ -3485,12 +3475,6 @@ func getLocalStorageMetrics(opts MetricsGroupOpts) *MetricsGroup {
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
metrics = append(metrics, Metric{
Description: getNodeDriveTokensIOMD(),
Value: float64(disk.Metrics.TotalTokens),
VariableLabels: map[string]string{"drive": disk.DrivePath},
})
for apiName, latency := range disk.Metrics.LastMinute {
metrics = append(metrics, Metric{
Description: getNodeDriveAPILatencyMD(),

View File

@ -833,16 +833,19 @@ func (s *peerRESTServer) CommitBinaryHandler(w http.ResponseWriter, r *http.Requ
var errUnsupportedSignal = fmt.Errorf("unsupported signal")
func waitingDrivesNode() map[string]madmin.DiskMetrics {
errs := make([]error, len(globalLocalDrives))
infos := make([]DiskInfo, len(globalLocalDrives))
for i, drive := range globalLocalDrives {
globalLocalDrivesMu.RLock()
localDrives := globalLocalDrives
globalLocalDrivesMu.RUnlock()
errs := make([]error, len(localDrives))
infos := make([]DiskInfo, len(localDrives))
for i, drive := range localDrives {
infos[i], errs[i] = drive.DiskInfo(GlobalContext, DiskInfoOptions{})
}
infoMaps := make(map[string]madmin.DiskMetrics)
for i := range infos {
if infos[i].Metrics.TotalWaiting >= 1 && errors.Is(errs[i], errFaultyDisk) {
infoMaps[infos[i].Endpoint] = madmin.DiskMetrics{
TotalTokens: infos[i].Metrics.TotalTokens,
TotalWaiting: infos[i].Metrics.TotalWaiting,
}
}

View File

@ -80,7 +80,6 @@ type DiskInfo struct {
type DiskMetrics struct {
LastMinute map[string]AccElem `json:"apiLatencies,omitempty"`
APICalls map[string]uint64 `json:"apiCalls,omitempty"`
TotalTokens uint32 `json:"totalTokens,omitempty"`
TotalWaiting uint32 `json:"totalWaiting,omitempty"`
TotalErrorsAvailability uint64 `json:"totalErrsAvailability"`
TotalErrorsTimeout uint64 `json:"totalErrsTimeout"`

View File

@ -1480,12 +1480,6 @@ func (z *DiskMetrics) DecodeMsg(dc *msgp.Reader) (err error) {
}
z.APICalls[za0003] = za0004
}
case "TotalTokens":
z.TotalTokens, err = dc.ReadUint32()
if err != nil {
err = msgp.WrapError(err, "TotalTokens")
return
}
case "TotalWaiting":
z.TotalWaiting, err = dc.ReadUint32()
if err != nil {
@ -1529,9 +1523,9 @@ func (z *DiskMetrics) DecodeMsg(dc *msgp.Reader) (err error) {
// EncodeMsg implements msgp.Encodable
func (z *DiskMetrics) EncodeMsg(en *msgp.Writer) (err error) {
// map header, size 8
// map header, size 7
// write "LastMinute"
err = en.Append(0x88, 0xaa, 0x4c, 0x61, 0x73, 0x74, 0x4d, 0x69, 0x6e, 0x75, 0x74, 0x65)
err = en.Append(0x87, 0xaa, 0x4c, 0x61, 0x73, 0x74, 0x4d, 0x69, 0x6e, 0x75, 0x74, 0x65)
if err != nil {
return
}
@ -1574,16 +1568,6 @@ func (z *DiskMetrics) EncodeMsg(en *msgp.Writer) (err error) {
return
}
}
// write "TotalTokens"
err = en.Append(0xab, 0x54, 0x6f, 0x74, 0x61, 0x6c, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x73)
if err != nil {
return
}
err = en.WriteUint32(z.TotalTokens)
if err != nil {
err = msgp.WrapError(err, "TotalTokens")
return
}
// write "TotalWaiting"
err = en.Append(0xac, 0x54, 0x6f, 0x74, 0x61, 0x6c, 0x57, 0x61, 0x69, 0x74, 0x69, 0x6e, 0x67)
if err != nil {
@ -1640,9 +1624,9 @@ func (z *DiskMetrics) EncodeMsg(en *msgp.Writer) (err error) {
// MarshalMsg implements msgp.Marshaler
func (z *DiskMetrics) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.Require(b, z.Msgsize())
// map header, size 8
// map header, size 7
// string "LastMinute"
o = append(o, 0x88, 0xaa, 0x4c, 0x61, 0x73, 0x74, 0x4d, 0x69, 0x6e, 0x75, 0x74, 0x65)
o = append(o, 0x87, 0xaa, 0x4c, 0x61, 0x73, 0x74, 0x4d, 0x69, 0x6e, 0x75, 0x74, 0x65)
o = msgp.AppendMapHeader(o, uint32(len(z.LastMinute)))
for za0001, za0002 := range z.LastMinute {
o = msgp.AppendString(o, za0001)
@ -1659,9 +1643,6 @@ func (z *DiskMetrics) MarshalMsg(b []byte) (o []byte, err error) {
o = msgp.AppendString(o, za0003)
o = msgp.AppendUint64(o, za0004)
}
// string "TotalTokens"
o = append(o, 0xab, 0x54, 0x6f, 0x74, 0x61, 0x6c, 0x54, 0x6f, 0x6b, 0x65, 0x6e, 0x73)
o = msgp.AppendUint32(o, z.TotalTokens)
// string "TotalWaiting"
o = append(o, 0xac, 0x54, 0x6f, 0x74, 0x61, 0x6c, 0x57, 0x61, 0x69, 0x74, 0x69, 0x6e, 0x67)
o = msgp.AppendUint32(o, z.TotalWaiting)
@ -1758,12 +1739,6 @@ func (z *DiskMetrics) UnmarshalMsg(bts []byte) (o []byte, err error) {
}
z.APICalls[za0003] = za0004
}
case "TotalTokens":
z.TotalTokens, bts, err = msgp.ReadUint32Bytes(bts)
if err != nil {
err = msgp.WrapError(err, "TotalTokens")
return
}
case "TotalWaiting":
z.TotalWaiting, bts, err = msgp.ReadUint32Bytes(bts)
if err != nil {
@ -1822,7 +1797,7 @@ func (z *DiskMetrics) Msgsize() (s int) {
s += msgp.StringPrefixSize + len(za0003) + msgp.Uint64Size
}
}
s += 12 + msgp.Uint32Size + 13 + msgp.Uint32Size + 24 + msgp.Uint64Size + 19 + msgp.Uint64Size + 12 + msgp.Uint64Size + 13 + msgp.Uint64Size
s += 13 + msgp.Uint32Size + 24 + msgp.Uint64Size + 19 + msgp.Uint64Size + 12 + msgp.Uint64Size + 13 + msgp.Uint64Size
return
}

View File

@ -20,9 +20,7 @@ package cmd
//go:generate msgp -file $GOFILE -unexported
const (
// Added orig-volume support for CreateFile, WriteMetadata, ReadVersion, ListDir
// this is needed for performance optimization on bucket checks.
storageRESTVersion = "v56"
storageRESTVersion = "v57" // Remove TotalTokens from DiskMetrics
storageRESTVersionPrefix = SlashSeparator + storageRESTVersion
storageRESTPrefix = minioReservedBucketPath + "/storage"
)

View File

@ -194,7 +194,6 @@ For deployments with [bucket](https://min.io/docs/minio/linux/administration/buc
| `minio_node_drive_errors_timeout` | Total number of drive timeout errors since server start |
| `minio_node_drive_errors_availability` | Total number of drive I/O errors, permission denied and timeouts since server start |
| `minio_node_drive_io_waiting` | Total number I/O operations waiting on drive |
| `minio_node_drive_io_tokens` | Total number concurrent I/O operations configured on drive |
## Identity and Access Management (IAM) Metrics