author     Paul Buetow <paul@buetow.org>    2026-05-10 10:37:53 +0300
committer  Paul Buetow <paul@buetow.org>    2026-05-10 10:37:53 +0300
commit     965e61016751d132fe83a8f44c6a1bf87d92b1a8
tree       c95a7067681512ff5a7a05a03c99e40d6db6ad3c
parent     3964965c8ad5eeee16d3338ded718bbd34e1c69d
nfs-mount-monitor: escalate to reboot after N consecutive fix_mount failures
Persist a consecutive-failure counter to /var/lib/nfs-mount-monitor/fail-count.
Increment on every fix_mount failure; reset to 0 on any successful repair or
when all three probes pass cleanly. After NFS_FAIL_THRESHOLD (default 5, ~50s)
consecutive failures, the node is cordoned via kubectl and rebooted with
'systemctl reboot' so the cluster stops routing pods to a silently broken node.
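On a node the failure streak and any escalation can be inspected directly.
The paths, unit name, and kubeconfig below are the ones introduced by this
change; the exact invocations are only an illustrative sketch:

    # current consecutive-failure count (0 or absent on a healthy node)
    cat /var/lib/nfs-mount-monitor/fail-count

    # recent monitor output, including the "consecutive failures: N / M" lines
    journalctl -u nfs-mount-monitor -n 50

    # after an auto-reboot the node comes back cordoned (SchedulingDisabled);
    # nothing in this change uncordons it, so that remains a manual step
    export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
    kubectl get node "$(hostname)"
    kubectl uncordon "$(hostname)"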
NFS_FAIL_THRESHOLD is configurable via /etc/default/nfs-mount-monitor (deployed
as an EnvironmentFile in the .service unit) without touching the script.
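Because the file is managed by Rex, a local edit on a node would be reverted
the next time the task runs; a workflow consistent with that is to change the
repo copy and redeploy. Sketch only, with the value 12 chosen purely as an
example:

    # f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
    NFS_FAIL_THRESHOLD=12   # 12 x 10s timer interval, i.e. ~2 minutes of failures

    rex -f f3s/r-nodes/Rexfile nfs_mount_monitor

The new value applies on the next timer firing, since the script sources the
file on every invocation.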
Also fix Rexfile path resolution: __FILE__ inside a Rex task resolves to the
internal Rex loader path, not the Rexfile itself; use realpath($::rexfile)
instead.
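A quick sanity check of the path fix (node name is a placeholder; root SSH to
the r-nodes is assumed, as set up via authorized_keys): the deploy should now
resolve its source files from the Rexfile's real location regardless of how
-f is given:

    # -f may be a relative or an absolute path
    rex -f f3s/r-nodes/Rexfile nfs_mount_monitor

    # spot-check one node afterwards
    ssh root@r0 'systemctl status nfs-mount-monitor.timer; cat /etc/default/nfs-mount-monitor'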
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--  f3s/r-nodes/Rexfile                                      |  35
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh         | 118
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default  |  13
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service  |   4
4 files changed, 163 insertions, 7 deletions
diff --git a/f3s/r-nodes/Rexfile b/f3s/r-nodes/Rexfile
index 846b539..0dc2aea 100644
--- a/f3s/r-nodes/Rexfile
+++ b/f3s/r-nodes/Rexfile
@@ -10,8 +10,14 @@ use Rex -feature => [ '1.14', 'exec_autodie' ];
 use Rex::Logger;
 use File::Basename qw(dirname);
 use File::Spec::Functions qw(catfile rel2abs);
+use Cwd qw(realpath);
 
-my $RNODES_DIR = dirname( rel2abs(__FILE__) );
+# Rex loads the Rexfile as a synthetic module (__Rexfile__.pm) via @INC, so
+# __FILE__ resolves to the internal Rex loader path rather than this file.
+# $::rexfile is set to $0 (the -f argument) in Rex::CLI before any tasks run;
+# realpath() resolves any relative component against the CWD at load time so
+# the path remains valid even when Rex forks worker processes for parallelism.
+my $RNODES_DIR = dirname( realpath($::rexfile) );
 
 # All three k3s Rocky Linux VMs; root SSH is configured via authorized_keys.
 group r_nodes => qw(
@@ -26,14 +32,16 @@ sudo FALSE;
 # Deploy in parallel — tasks are idempotent and independent per node.
 parallelism 3;
 
-# Deploy the NFS mount health-monitor script and its systemd units to
-# all three r-nodes, then reload systemd and restart the timer so the
-# new files take effect immediately.
+# Deploy the NFS mount health-monitor script, its systemd units, and the
+# tunable configuration file to all three r-nodes, then reload systemd and
+# restart the timer so the new files take effect immediately.
 #
 # Files managed:
 #   /usr/local/bin/check-nfs-mount.sh (monitor + auto-repair script)
+#   /etc/default/nfs-mount-monitor (tunable: NFS_FAIL_THRESHOLD)
 #   /etc/systemd/system/nfs-mount-monitor.service
 #   /etc/systemd/system/nfs-mount-monitor.timer
+#   /var/lib/nfs-mount-monitor/ (state dir for fail-count file)
 #
 # Idempotent: Rex only writes the file when content changes; the
 # on_change handler reloads systemd and restarts the timer only when
@@ -47,6 +55,14 @@ task 'nfs_mount_monitor',
     # Reload flag — set to 1 if any file changed, so we only reload once.
     my $changed = 0;
 
+    # Ensure the state directory for the fail counter exists with tight
+    # permissions (only root should read/write the counter).
+    file '/var/lib/nfs-mount-monitor',
+      ensure => 'directory',
+      owner  => 'root',
+      group  => 'root',
+      mode   => '700';
+
     # Deploy the health-monitor script.
     file '/usr/local/bin/check-nfs-mount.sh',
       source    => catfile( $monitor_dir, 'check-nfs-mount.sh' ),
@@ -55,6 +71,17 @@
       mode      => '755',
       on_change => sub { $changed = 1 };
 
+    # Deploy the tunable configuration (NFS_FAIL_THRESHOLD).
+    # The leading '-' in EnvironmentFile=-/etc/default/... means systemd
+    # tolerates the file being absent, but we deploy it so the threshold
+    # is explicitly documented on each node.
+    file '/etc/default/nfs-mount-monitor',
+      source    => catfile( $monitor_dir, 'nfs-mount-monitor.default' ),
+      owner     => 'root',
+      group     => 'root',
+      mode      => '644',
+      on_change => sub { $changed = 1 };
+
     # Deploy the systemd service unit.
     file '/etc/systemd/system/nfs-mount-monitor.service',
       source    => catfile( $monitor_dir, 'nfs-mount-monitor.service' ),
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
index bc6dcd8..dd71a4d 100644
--- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
+++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
@@ -28,11 +28,33 @@
 # A hard 60-second deadline is enforced so the function can never outlast
 # its own timer interval (10s) by more than 6x, preventing timer pile-up.
 #
+# Consecutive-failure escalation:
+# Each fix_mount failure increments a counter persisted to
+# /var/lib/nfs-mount-monitor/fail-count. A successful repair resets
+# the counter to 0. When the counter reaches NFS_FAIL_THRESHOLD (default
+# 5, configurable via /etc/default/nfs-mount-monitor), the node is cordoned
+# via kubectl so the scheduler stops placing new pods here, a loud message
+# is written to the journal, and 'systemctl reboot' is issued.
+# With the timer firing every 10s, threshold=5 means ~50s of continuously
+# broken NFS before an auto-reboot — safe because r0/r1/r2 form an HA
+# cluster and a Rocky Linux VM reboots in ~30s.
+#
 # Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
 
 MOUNT_POINT="/data/nfs/k3svolumes"
 LOCK_FILE="/var/run/nfs-mount-check.lock"
 
+# State directory for the fail counter; created if absent.
+STATE_DIR="/var/lib/nfs-mount-monitor"
+FAIL_COUNT_FILE="$STATE_DIR/fail-count"
+
+# Load tunable configuration (NFS_FAIL_THRESHOLD) from the EnvironmentFile
+# deployed alongside this script. Defaults are defined here so the script
+# works even if the file is absent.
+NFS_FAIL_THRESHOLD=5
+# shellcheck source=/etc/default/nfs-mount-monitor
+[ -f /etc/default/nfs-mount-monitor ] && . /etc/default/nfs-mount-monitor
+
 # Use a lock file to prevent concurrent runs (timer fires every 10 s)
 if [ -f "$LOCK_FILE" ]; then
   exit 0
@@ -42,6 +64,26 @@
 trap "rm -f $LOCK_FILE" EXIT
 MOUNT_FIXED=0
 
+# read_fail_count — return the current consecutive-failure counter.
+# Returns 0 if the file is absent or contains a non-integer.
+read_fail_count() {
+  local count=0
+  if [ -f "$FAIL_COUNT_FILE" ]; then
+    count=$(< "$FAIL_COUNT_FILE")
+    # Guard against corrupt file contents
+    [[ "$count" =~ ^[0-9]+$ ]] || count=0
+  fi
+  echo "$count"
+}
+
+# write_fail_count — persist COUNT to the state file, creating the
+# directory if it does not yet exist.
+write_fail_count() {
+  local count="$1"
+  mkdir -p "$STATE_DIR"
+  echo "$count" > "$FAIL_COUNT_FILE"
+}
+
 # kill_pinning_processes — send SIGKILL to any process whose wchan starts
 # with "nfs_" AND whose open file descriptors or cwd point into MOUNT_POINT.
 # This unblocks D-state processes so that umount can detach the filesystem.
@@ -174,14 +216,68 @@ fix_mount () {
   return 1
 }
 
+# escalate_reboot — cordon the k3s node so the scheduler stops placing new
+# pods here, log loudly to the journal, then trigger a clean reboot.
+# Called only after NFS_FAIL_THRESHOLD consecutive fix_mount failures.
+escalate_reboot() {
+  local node
+  node=$(hostname)
+  export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
+
+  echo "CRITICAL: NFS mount $MOUNT_POINT has failed $NFS_FAIL_THRESHOLD" \
+    "consecutive repair attempts — escalating to reboot"
+
+  # Cordon the node so the scheduler will not place new pods here while
+  # the reboot is in progress. Failure to cordon is non-fatal: we still
+  # reboot because a broken NFS node is worse than an uncordoned one.
+  if kubectl cordon "$node" 2>&1; then
+    echo "Node $node cordoned successfully"
+  else
+    echo "kubectl cordon failed (will reboot anyway)"
+  fi
+
+  # systemd-journald flushes on SIGTERM, which systemctl reboot sends to
+  # all services before the node goes down — the message above will survive.
+  echo "Initiating systemctl reboot to recover broken NFS mount"
+  systemctl reboot
+}
+
+# run_fix_mount_with_counter — call fix_mount and update the consecutive-
+# failure counter. On success, counter is reset to 0. On failure, counter
+# is incremented; if it reaches NFS_FAIL_THRESHOLD, escalate_reboot is called.
+run_fix_mount_with_counter() {
+  if fix_mount; then
+    # Repair succeeded — reset the failure streak.
+    write_fail_count 0
+    echo "NFS repair succeeded; consecutive-failure counter reset to 0"
+  else
+    # Repair failed — increment the counter and check the threshold.
+    local count
+    count=$(read_fail_count)
+    (( count++ ))
+    write_fail_count "$count"
+    echo "NFS repair failed; consecutive failures: $count / $NFS_FAIL_THRESHOLD"
+
+    if (( count >= NFS_FAIL_THRESHOLD )); then
+      escalate_reboot
+    fi
+  fi
+}
+
+# PROBE_FAILED tracks whether any probe fired run_fix_mount_with_counter.
+# If no probe fires, all checks passed cleanly and we can reset the counter.
+PROBE_FAILED=0
+
 if ! mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then
   echo "NFS mount $MOUNT_POINT not found"
-  fix_mount
+  run_fix_mount_with_counter
+  PROBE_FAILED=1
 fi
 
 if ! timeout 2s stat "$MOUNT_POINT" >/dev/null 2>&1; then
   echo "NFS mount $MOUNT_POINT appears to be unresponsive"
-  fix_mount
+  run_fix_mount_with_counter
+  PROBE_FAILED=1
 fi
 
 # Write-probe: detect the "reads OK, writes hang" failure mode.
@@ -191,7 +287,19 @@ fi
 HEALTHCHECK_FILE="$MOUNT_POINT/.healthcheck.$(hostname)"
 if ! timeout 5s sh -c "echo \$\$ > '$HEALTHCHECK_FILE' && rm -f '$HEALTHCHECK_FILE'" 2>/dev/null; then
   echo "NFS writes hanging on $MOUNT_POINT"
-  fix_mount
+  run_fix_mount_with_counter
+  PROBE_FAILED=1
+fi
+
+# If all three probes passed cleanly (no repair attempt needed), reset the
+# consecutive-failure counter so a previous partial failure streak does not
+# lower the effective reboot threshold. We only write the file when the
+# counter is non-zero to avoid unnecessary writes on every healthy run.
+if [ "$PROBE_FAILED" -eq 0 ]; then
+  if [ "$(read_fail_count)" -ne 0 ]; then
+    write_fail_count 0
+    echo "All probes passed; consecutive-failure counter reset to 0"
+  fi
 fi
 
 # After a successful remount, delete pods stuck on this node
@@ -212,4 +320,8 @@ if [ "$MOUNT_FIXED" -eq 1 ]; then
     echo "Deleting stuck pod $ns/$pod"
     kubectl delete pod -n "$ns" "$pod" --grace-period=0 --force 2>&1
   done
+
+  # On a healthy remount, also ensure the fail counter is reset.
+  write_fail_count 0
+  echo "Stuck-pod cleanup done; consecutive-failure counter reset to 0"
 fi
diff --git a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
new file mode 100644
index 0000000..e8a27c7
--- /dev/null
+++ b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
@@ -0,0 +1,13 @@
+# /etc/default/nfs-mount-monitor
+# Configuration for the NFS mount health monitor.
+# Sourced by /usr/local/bin/check-nfs-mount.sh on each invocation.
+# Changes take effect on the next timer firing (within 10 seconds).
+#
+# Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
+
+# NFS_FAIL_THRESHOLD — number of consecutive fix_mount failures before the
+# node is cordoned and rebooted. The timer fires every 10 seconds, so the
+# default of 5 means ~50 seconds of continuously broken NFS before escalation.
+# A clean Rocky Linux VM reboot takes ~30s; the cluster is HA across r0/r1/r2
+# so losing one node for that window is safe.
+NFS_FAIL_THRESHOLD=5
diff --git a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
index 6077e0c..e31dbe1 100644
--- a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
+++ b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
@@ -2,6 +2,9 @@
 # Triggered by nfs-mount-monitor.timer (every 10 seconds).
 # Logs to the journal: journalctl -u nfs-mount-monitor
 #
+# NFS_FAIL_THRESHOLD (consecutive failures before auto-reboot) is tunable
+# via /etc/default/nfs-mount-monitor without touching this unit or the script.
+#
 # Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
 
 [Unit]
@@ -10,6 +13,7 @@ After=network-online.target
 
 [Service]
 Type=oneshot
+EnvironmentFile=-/etc/default/nfs-mount-monitor
 ExecStart=/usr/local/bin/check-nfs-mount.sh
 StandardOutput=journal
 StandardError=journal
