From 9a3c992d7a2729e865940e7031f158fcf831144a Mon Sep 17 00:00:00 2001 From: Anis Eleuch Date: Thu, 25 Apr 2024 22:55:41 +0100 Subject: [PATCH] heal: Fix regression in healing a new fresh drive (#19615) --- Makefile | 1 + .../verify-healing-empty-erasure-set.sh | 140 ++++++++++++++++++ buildscripts/verify-healing.sh | 55 ++++++- cmd/global-heal.go | 64 ++++---- 4 files changed, 225 insertions(+), 35 deletions(-) create mode 100755 buildscripts/verify-healing-empty-erasure-set.sh diff --git a/Makefile b/Makefile index 51d9dd710..7347b7a35 100644 --- a/Makefile +++ b/Makefile @@ -131,6 +131,7 @@ verify-healing: ## verify healing and replacing disks with minio binary @echo "Verify healing build with race" @GORACE=history_size=7 CGO_ENABLED=1 go build -race -tags kqueue -trimpath --ldflags "$(LDFLAGS)" -o $(PWD)/minio 1>/dev/null @(env bash $(PWD)/buildscripts/verify-healing.sh) + @(env bash $(PWD)/buildscripts/verify-healing-empty-erasure-set.sh) @(env bash $(PWD)/buildscripts/heal-inconsistent-versions.sh) verify-healing-with-root-disks: ## verify healing root disks diff --git a/buildscripts/verify-healing-empty-erasure-set.sh b/buildscripts/verify-healing-empty-erasure-set.sh new file mode 100755 index 000000000..4a7be9e48 --- /dev/null +++ b/buildscripts/verify-healing-empty-erasure-set.sh @@ -0,0 +1,140 @@ +#!/bin/bash -e +# + +set -E +set -o pipefail + +if [ ! -x "$PWD/minio" ]; then + echo "minio executable binary not found in current directory" + exit 1 +fi + +WORK_DIR="$PWD/.verify-$RANDOM" +MINIO_CONFIG_DIR="$WORK_DIR/.minio" +MINIO=("$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server) + +function start_minio_3_node() { + export MINIO_ROOT_USER=minio + export MINIO_ROOT_PASSWORD=minio123 + export MINIO_ERASURE_SET_DRIVE_COUNT=6 + export MINIO_CI_CD=1 + + start_port=$2 + args="" + for i in $(seq 1 3); do + args="$args http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/1/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/2/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/3/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/4/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/5/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/6/" + done + + "${MINIO[@]}" --address ":$((start_port + 1))" $args >"${WORK_DIR}/dist-minio-server1.log" 2>&1 & + pid1=$! + disown ${pid1} + + "${MINIO[@]}" --address ":$((start_port + 2))" $args >"${WORK_DIR}/dist-minio-server2.log" 2>&1 & + pid2=$! + disown $pid2 + + "${MINIO[@]}" --address ":$((start_port + 3))" $args >"${WORK_DIR}/dist-minio-server3.log" 2>&1 & + pid3=$! + disown $pid3 + + sleep "$1" + + if ! ps -p $pid1 1>&2 >/dev/null; then + echo "server1 log:" + cat "${WORK_DIR}/dist-minio-server1.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + if ! ps -p $pid2 1>&2 >/dev/null; then + echo "server2 log:" + cat "${WORK_DIR}/dist-minio-server2.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + if ! ps -p $pid3 1>&2 >/dev/null; then + echo "server3 log:" + cat "${WORK_DIR}/dist-minio-server3.log" + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + if ! pkill minio; then + for i in $(seq 1 3); do + echo "server$i log:" + cat "${WORK_DIR}/dist-minio-server$i.log" + done + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi + + sleep 1 + if pgrep minio; then + # forcibly killing, to proceed further properly. + if ! pkill -9 minio; then + echo "no minio process running anymore, proceed." + fi + fi +} + +function check_online() { + if ! grep -q 'Status:' ${WORK_DIR}/dist-minio-*.log; then + echo "1" + fi +} + +function purge() { + rm -rf "$1" +} + +function __init__() { + echo "Initializing environment" + mkdir -p "$WORK_DIR" + mkdir -p "$MINIO_CONFIG_DIR" + + ## version is purposefully set to '3' for minio to migrate configuration file + echo '{"version": "3", "credential": {"accessKey": "minio", "secretKey": "minio123"}, "region": "us-east-1"}' >"$MINIO_CONFIG_DIR/config.json" +} + +function perform_test() { + start_minio_3_node 120 $2 + + echo "Testing Distributed Erasure setup healing of drives" + echo "Remove the contents of the disks belonging to '${1}' erasure set" + + rm -rf ${WORK_DIR}/${1}/*/ + + set -x + start_minio_3_node 120 $2 + + rv=$(check_online) + if [ "$rv" == "1" ]; then + for i in $(seq 1 3); do + echo "server$i log:" + cat "${WORK_DIR}/dist-minio-server$i.log" + done + pkill -9 minio + echo "FAILED" + purge "$WORK_DIR" + exit 1 + fi +} + +function main() { + # use same ports for all tests + start_port=$(shuf -i 10000-65000 -n 1) + + perform_test "2" ${start_port} + perform_test "1" ${start_port} + perform_test "3" ${start_port} +} + +(__init__ "$@" && main "$@") +rv=$? +purge "$WORK_DIR" +exit "$rv" diff --git a/buildscripts/verify-healing.sh b/buildscripts/verify-healing.sh index 4a7be9e48..45056b5b6 100755 --- a/buildscripts/verify-healing.sh +++ b/buildscripts/verify-healing.sh @@ -12,6 +12,7 @@ fi WORK_DIR="$PWD/.verify-$RANDOM" MINIO_CONFIG_DIR="$WORK_DIR/.minio" MINIO=("$PWD/minio" --config-dir "$MINIO_CONFIG_DIR" server) +GOPATH=/tmp/gopath function start_minio_3_node() { export MINIO_ROOT_USER=minio @@ -19,10 +20,14 @@ function start_minio_3_node() { export MINIO_ERASURE_SET_DRIVE_COUNT=6 export MINIO_CI_CD=1 + first_time=$(find ${WORK_DIR}/ | grep format.json | wc -l) + start_port=$2 args="" - for i in $(seq 1 3); do - args="$args http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/1/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/2/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/3/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/4/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/5/ http://127.0.0.1:$((start_port + i))${WORK_DIR}/$i/6/" + for d in $(seq 1 3 5); do + args="$args http://127.0.0.1:$((start_port + 1))${WORK_DIR}/1/${d}/ http://127.0.0.1:$((start_port + 2))${WORK_DIR}/2/${d}/ http://127.0.0.1:$((start_port + 3))${WORK_DIR}/3/${d}/ " + d=$((d + 1)) + args="$args http://127.0.0.1:$((start_port + 1))${WORK_DIR}/1/${d}/ http://127.0.0.1:$((start_port + 2))${WORK_DIR}/2/${d}/ http://127.0.0.1:$((start_port + 3))${WORK_DIR}/3/${d}/ " done "${MINIO[@]}" --address ":$((start_port + 1))" $args >"${WORK_DIR}/dist-minio-server1.log" 2>&1 & @@ -39,6 +44,8 @@ function start_minio_3_node() { sleep "$1" + [ ${first_time} -eq 0 ] && upload_objects $start_port + if ! ps -p $pid1 1>&2 >/dev/null; then echo "server1 log:" cat "${WORK_DIR}/dist-minio-server1.log" @@ -82,10 +89,23 @@ function start_minio_3_node() { fi } -function check_online() { +function check_heal() { if ! grep -q 'Status:' ${WORK_DIR}/dist-minio-*.log; then - echo "1" + return 1 fi + + for ((i = 0; i < 20; i++)); do + test -f ${WORK_DIR}/$1/1/.minio.sys/format.json + v1=$? + nextInES=$(($1 + 1)) && [ $nextInES -gt 3 ] && nextInES=1 + foundFiles1=$(find ${WORK_DIR}/$1/1/ | grep -v .minio.sys | grep xl.meta | wc -l) + foundFiles2=$(find ${WORK_DIR}/$nextInES/1/ | grep -v .minio.sys | grep xl.meta | wc -l) + test $foundFiles1 -eq $foundFiles2 + v2=$? + [ $v1 == 0 -a $v2 == 0 ] && return 0 + sleep 10 + done + return 1 } function purge() { @@ -99,20 +119,39 @@ function __init__() { ## version is purposefully set to '3' for minio to migrate configuration file echo '{"version": "3", "credential": {"accessKey": "minio", "secretKey": "minio123"}, "region": "us-east-1"}' >"$MINIO_CONFIG_DIR/config.json" + + if [ ! -f /tmp/mc ]; then + wget --quiet -O /tmp/mc https://dl.minio.io/client/mc/release/linux-amd64/mc && + chmod +x /tmp/mc + fi +} + +function upload_objects() { + start_port=$1 + + /tmp/mc alias set myminio http://127.0.0.1:$((start_port + 1)) minio minio123 --api=s3v4 + /tmp/mc ready myminio + /tmp/mc mb myminio/testbucket/ + for ((i = 0; i < 20; i++)); do + echo "my content" | /tmp/mc pipe myminio/testbucket/file-$i + done } function perform_test() { - start_minio_3_node 120 $2 + start_port=$2 + + start_minio_3_node 120 $start_port echo "Testing Distributed Erasure setup healing of drives" - echo "Remove the contents of the disks belonging to '${1}' erasure set" + echo "Remove the contents of the disks belonging to '${1}' node" rm -rf ${WORK_DIR}/${1}/*/ set -x - start_minio_3_node 120 $2 + start_minio_3_node 120 $start_port - rv=$(check_online) + check_heal ${1} + rv=$? if [ "$rv" == "1" ]; then for i in $(seq 1 3); do echo "server$i log:" diff --git a/cmd/global-heal.go b/cmd/global-heal.go index 74dd6d070..f27b0f27a 100644 --- a/cmd/global-heal.go +++ b/cmd/global-heal.go @@ -28,6 +28,10 @@ import ( "github.com/dustin/go-humanize" "github.com/minio/madmin-go/v3" + "github.com/minio/minio/internal/bucket/lifecycle" + objectlock "github.com/minio/minio/internal/bucket/object/lock" + "github.com/minio/minio/internal/bucket/replication" + "github.com/minio/minio/internal/bucket/versioning" "github.com/minio/minio/internal/color" "github.com/minio/minio/internal/config/storageclass" xioutil "github.com/minio/minio/internal/ioutil" @@ -214,34 +218,40 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, continue } - vc, err := globalBucketVersioningSys.Get(bucket) - if err != nil { - retErr = err - healingLogIf(ctx, err) - continue - } + var ( + vc *versioning.Versioning + lc *lifecycle.Lifecycle + lr objectlock.Retention + rcfg *replication.Config + ) - // Check if the current bucket has a configured lifecycle policy - lc, err := globalLifecycleSys.Get(bucket) - if err != nil && !errors.Is(err, BucketLifecycleNotFound{Bucket: bucket}) { - retErr = err - healingLogIf(ctx, err) - continue - } - - // Check if bucket is object locked. - lr, err := globalBucketObjectLockSys.Get(bucket) - if err != nil { - retErr = err - healingLogIf(ctx, err) - continue - } - - rcfg, err := getReplicationConfig(ctx, bucket) - if err != nil { - retErr = err - healingLogIf(ctx, err) - continue + if !isMinioMetaBucketName(bucket) { + vc, err = globalBucketVersioningSys.Get(bucket) + if err != nil { + retErr = err + healingLogIf(ctx, err) + continue + } + // Check if the current bucket has a configured lifecycle policy + lc, err = globalLifecycleSys.Get(bucket) + if err != nil && !errors.Is(err, BucketLifecycleNotFound{Bucket: bucket}) { + retErr = err + healingLogIf(ctx, err) + continue + } + // Check if bucket is object locked. + lr, err = globalBucketObjectLockSys.Get(bucket) + if err != nil { + retErr = err + healingLogIf(ctx, err) + continue + } + rcfg, err = getReplicationConfig(ctx, bucket) + if err != nil { + retErr = err + healingLogIf(ctx, err) + continue + } } if serverDebugLog {