author     Paul Buetow <paul@buetow.org>    2026-05-10 10:37:53 +0300
committer  Paul Buetow <paul@buetow.org>    2026-05-10 10:37:53 +0300
commit     965e61016751d132fe83a8f44c6a1bf87d92b1a8 (patch)
tree       c95a7067681512ff5a7a05a03c99e40d6db6ad3c
parent     3964965c8ad5eeee16d3338ded718bbd34e1c69d (diff)
nfs-mount-monitor: escalate to reboot after N consecutive fix_mount failures
Persist a consecutive-failure counter to /var/lib/nfs-mount-monitor/fail-count.
Increment on every fix_mount failure; reset to 0 on any successful repair or
when all three probes pass cleanly. After NFS_FAIL_THRESHOLD (default 5, ~50s)
consecutive failures the node is cordoned via kubectl and rebooted with
'systemctl reboot' so the cluster stops routing pods to a silently broken node.

NFS_FAIL_THRESHOLD is configurable via /etc/default/nfs-mount-monitor
(deployed as EnvironmentFile in the .service unit) without touching the script.

Also fix Rexfile path resolution: __FILE__ inside a Rex task resolves to the
internal Rex loader path, not the Rexfile itself; use realpath($::rexfile)
instead.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--  f3s/r-nodes/Rexfile                                       35
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh         118
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default   13
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service    4
4 files changed, 163 insertions, 7 deletions
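
The mechanism is easy to observe in place before reading the diff. A quick
sketch for inspecting the counter and the monitor's journal on one node
(illustrative only; the node name r0 comes from the HA notes in the script
comments below, and root SSH access is assumed per the Rexfile):

  # Illustrative: inspect escalation state on one node.
  ssh root@r0 'cat /var/lib/nfs-mount-monitor/fail-count 2>/dev/null || echo 0'
  ssh root@r0 'journalctl -u nfs-mount-monitor --since "-10min" --no-pager'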
diff --git a/f3s/r-nodes/Rexfile b/f3s/r-nodes/Rexfile
index 846b539..0dc2aea 100644
--- a/f3s/r-nodes/Rexfile
+++ b/f3s/r-nodes/Rexfile
@@ -10,8 +10,14 @@ use Rex -feature => [ '1.14', 'exec_autodie' ];
use Rex::Logger;
use File::Basename qw(dirname);
use File::Spec::Functions qw(catfile rel2abs);
+use Cwd qw(realpath);
-my $RNODES_DIR = dirname( rel2abs(__FILE__) );
+# Rex loads the Rexfile as a synthetic module (__Rexfile__.pm) via @INC, so
+# __FILE__ resolves to the internal Rex loader path rather than this file.
+# $::rexfile is set to $0 (the -f argument) in Rex::CLI before any tasks run;
+# realpath() resolves any relative component against the CWD at load time so
+# the path remains valid even when Rex forks worker processes for parallelism.
+my $RNODES_DIR = dirname( realpath($::rexfile) );
# All three k3s Rocky Linux VMs; root SSH is configured via authorized_keys.
group r_nodes => qw(
@@ -26,14 +32,16 @@ sudo FALSE;
# Deploy in parallel — tasks are idempotent and independent per node.
parallelism 3;
-# Deploy the NFS mount health-monitor script and its systemd units to
-# all three r-nodes, then reload systemd and restart the timer so the
-# new files take effect immediately.
+# Deploy the NFS mount health-monitor script, its systemd units, and the
+# tunable configuration file to all three r-nodes, then reload systemd and
+# restart the timer so the new files take effect immediately.
#
# Files managed:
# /usr/local/bin/check-nfs-mount.sh (monitor + auto-repair script)
+# /etc/default/nfs-mount-monitor (tunable: NFS_FAIL_THRESHOLD)
# /etc/systemd/system/nfs-mount-monitor.service
# /etc/systemd/system/nfs-mount-monitor.timer
+# /var/lib/nfs-mount-monitor/ (state dir for fail-count file)
#
# Idempotent: Rex only writes the file when content changes; the
# on_change handler reloads systemd and restarts the timer only when
@@ -47,6 +55,14 @@ task 'nfs_mount_monitor',
# Reload flag — set to 1 if any file changed, so we only reload once.
my $changed = 0;
+ # Ensure the state directory for the fail counter exists with tight
+ # permissions (only root should read/write the counter).
+ file '/var/lib/nfs-mount-monitor',
+ ensure => 'directory',
+ owner => 'root',
+ group => 'root',
+ mode => '700';
+
# Deploy the health-monitor script.
file '/usr/local/bin/check-nfs-mount.sh',
source => catfile( $monitor_dir, 'check-nfs-mount.sh' ),
@@ -55,6 +71,17 @@ task 'nfs_mount_monitor',
mode => '755',
on_change => sub { $changed = 1 };
+ # Deploy the tunable configuration (NFS_FAIL_THRESHOLD).
+ # The leading '-' in EnvironmentFile=-/etc/default/... means systemd
+ # tolerates the file being absent, but we deploy it so the threshold
+ # is explicitly documented on each node.
+ file '/etc/default/nfs-mount-monitor',
+ source => catfile( $monitor_dir, 'nfs-mount-monitor.default' ),
+ owner => 'root',
+ group => 'root',
+ mode => '644',
+ on_change => sub { $changed = 1 };
+
# Deploy the systemd service unit.
file '/etc/systemd/system/nfs-mount-monitor.service',
source => catfile( $monitor_dir, 'nfs-mount-monitor.service' ),
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
index bc6dcd8..dd71a4d 100644
--- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
+++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
@@ -28,11 +28,33 @@
# A hard 60-second deadline is enforced so the function can never outlast
# its own timer interval (10s) by more than 6x, preventing timer pile-up.
#
+# Consecutive-failure escalation:
+# Each fix_mount failure increments a counter persisted to
+# /var/lib/nfs-mount-monitor/fail-count. A successful repair resets
+# the counter to 0. When the counter reaches NFS_FAIL_THRESHOLD (default
+# 5, configurable via /etc/default/nfs-mount-monitor), the node is cordoned
+# via kubectl so the scheduler stops placing new pods here, a loud message
+# is written to the journal, and 'systemctl reboot' is issued.
+# With the timer firing every 10s, threshold=5 means ~50s of continuously
+# broken NFS before an auto-reboot — safe because r0/r1/r2 form an HA
+# cluster and a Rocky Linux VM reboots in ~30s.
+#
# Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
MOUNT_POINT="/data/nfs/k3svolumes"
LOCK_FILE="/var/run/nfs-mount-check.lock"
+# State directory for the fail counter; created if absent.
+STATE_DIR="/var/lib/nfs-mount-monitor"
+FAIL_COUNT_FILE="$STATE_DIR/fail-count"
+
+# Load tunable configuration (NFS_FAIL_THRESHOLD) from the EnvironmentFile
+# deployed alongside this script. Defaults are defined here so the script
+# works even if the file is absent.
+NFS_FAIL_THRESHOLD=5
+# shellcheck source=/etc/default/nfs-mount-monitor
+[ -f /etc/default/nfs-mount-monitor ] && . /etc/default/nfs-mount-monitor
+
# Use a lock file to prevent concurrent runs (timer fires every 10 s)
if [ -f "$LOCK_FILE" ]; then
exit 0
@@ -42,6 +64,26 @@ trap "rm -f $LOCK_FILE" EXIT
MOUNT_FIXED=0
+# read_fail_count — print the current consecutive-failure counter on stdout.
+# Prints 0 if the file is absent or contains a non-integer.
+read_fail_count() {
+ local count=0
+ if [ -f "$FAIL_COUNT_FILE" ]; then
+ count=$(< "$FAIL_COUNT_FILE")
+ # Guard against corrupt file contents
+ [[ "$count" =~ ^[0-9]+$ ]] || count=0
+ fi
+ echo "$count"
+}
+
+# write_fail_count — persist COUNT to the state file, creating the
+# directory if it does not yet exist.
+write_fail_count() {
+ local count="$1"
+ mkdir -p "$STATE_DIR"
+ echo "$count" > "$FAIL_COUNT_FILE"
+}
+
# kill_pinning_processes — send SIGKILL to any process whose wchan starts
# with "nfs_" AND whose open file descriptors or cwd point into MOUNT_POINT.
# This unblocks D-state processes so that umount can detach the filesystem.
@@ -174,14 +216,68 @@ fix_mount () {
return 1
}
+# escalate_reboot — cordon the k3s node so the scheduler stops placing new
+# pods here, log loudly to the journal, then trigger a clean reboot.
+# Called only after NFS_FAIL_THRESHOLD consecutive fix_mount failures.
+escalate_reboot() {
+ local node
+ node=$(hostname)
+ export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
+
+ echo "CRITICAL: NFS mount $MOUNT_POINT has failed $NFS_FAIL_THRESHOLD" \
+ "consecutive repair attempts — escalating to reboot"
+
+ # Cordon the node so the scheduler will not place new pods here while
+ # the reboot is in progress. Failure to cordon is non-fatal: we still
+ # reboot because a broken NFS node is worse than an uncordoned one.
+ if kubectl cordon "$node" 2>&1; then
+ echo "Node $node cordoned successfully"
+ else
+ echo "kubectl cordon failed (will reboot anyway)"
+ fi
+
+    # An orderly 'systemctl reboot' stops units cleanly and flushes journald
+    # on the way down, so with persistent journal storage the messages above
+    # survive the reboot.
+ echo "Initiating systemctl reboot to recover broken NFS mount"
+ systemctl reboot
+}
+
+# run_fix_mount_with_counter — call fix_mount and update the consecutive-
+# failure counter. On success, counter is reset to 0. On failure, counter
+# is incremented; if it reaches NFS_FAIL_THRESHOLD, escalate_reboot is called.
+run_fix_mount_with_counter() {
+ if fix_mount; then
+ # Repair succeeded — reset the failure streak.
+ write_fail_count 0
+ echo "NFS repair succeeded; consecutive-failure counter reset to 0"
+ else
+ # Repair failed — increment the counter and check the threshold.
+ local count
+ count=$(read_fail_count)
+ (( count++ ))
+ write_fail_count "$count"
+ echo "NFS repair failed; consecutive failures: $count / $NFS_FAIL_THRESHOLD"
+
+ if (( count >= NFS_FAIL_THRESHOLD )); then
+ escalate_reboot
+ fi
+ fi
+}
+
+# PROBE_FAILED tracks whether any probe triggered run_fix_mount_with_counter.
+# If no probe fires, all checks passed cleanly and we can reset the counter.
+PROBE_FAILED=0
+
if ! mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then
echo "NFS mount $MOUNT_POINT not found"
- fix_mount
+ run_fix_mount_with_counter
+ PROBE_FAILED=1
fi
if ! timeout 2s stat "$MOUNT_POINT" >/dev/null 2>&1; then
echo "NFS mount $MOUNT_POINT appears to be unresponsive"
- fix_mount
+ run_fix_mount_with_counter
+ PROBE_FAILED=1
fi
# Write-probe: detect the "reads OK, writes hang" failure mode.
@@ -191,7 +287,19 @@ fi
HEALTHCHECK_FILE="$MOUNT_POINT/.healthcheck.$(hostname)"
if ! timeout 5s sh -c "echo \$\$ > '$HEALTHCHECK_FILE' && rm -f '$HEALTHCHECK_FILE'" 2>/dev/null; then
echo "NFS writes hanging on $MOUNT_POINT"
- fix_mount
+ run_fix_mount_with_counter
+ PROBE_FAILED=1
+fi
+
+# If all three probes passed cleanly (no repair attempt needed), reset the
+# consecutive-failure counter so a previous partial failure streak does not
+# lower the effective reboot threshold. We only write the file when the
+# counter is non-zero to avoid unnecessary writes on every healthy run.
+if [ "$PROBE_FAILED" -eq 0 ]; then
+ if [ "$(read_fail_count)" -ne 0 ]; then
+ write_fail_count 0
+ echo "All probes passed; consecutive-failure counter reset to 0"
+ fi
fi
# After a successful remount, delete pods stuck on this node
@@ -212,4 +320,8 @@ if [ "$MOUNT_FIXED" -eq 1 ]; then
echo "Deleting stuck pod $ns/$pod"
kubectl delete pod -n "$ns" "$pod" --grace-period=0 --force 2>&1
done
+
+ # On a healthy remount, also ensure the fail counter is reset.
+ write_fail_count 0
+ echo "Stuck-pod cleanup done; consecutive-failure counter reset to 0"
fi
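
The counter round-trip is self-contained enough to exercise off-node. A
minimal sketch using a temp directory in place of /var/lib/nfs-mount-monitor
(a hypothetical test harness, not part of the commit):

  #!/usr/bin/env bash
  # Hypothetical standalone test of the read/write counter logic above.
  STATE_DIR=$(mktemp -d)
  FAIL_COUNT_FILE="$STATE_DIR/fail-count"
  read_fail_count() {
      local count=0
      [ -f "$FAIL_COUNT_FILE" ] && count=$(< "$FAIL_COUNT_FILE")
      [[ "$count" =~ ^[0-9]+$ ]] || count=0
      echo "$count"
  }
  write_fail_count() { echo "$1" > "$FAIL_COUNT_FILE"; }
  write_fail_count 4
  count=$(read_fail_count); (( count++ )); write_fail_count "$count"
  echo "count is now $(read_fail_count) (threshold 5 would fire here)"
  echo garbage > "$FAIL_COUNT_FILE"
  echo "corrupt file reads as $(read_fail_count)"   # prints 0
  rm -rf "$STATE_DIR"

This also shows why the regex guard matters: a truncated or corrupted write
degrades to a counter of 0 rather than to an immediate reboot.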
diff --git a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
new file mode 100644
index 0000000..e8a27c7
--- /dev/null
+++ b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
@@ -0,0 +1,13 @@
+# /etc/default/nfs-mount-monitor
+# Configuration for the NFS mount health monitor.
+# Sourced by /usr/local/bin/check-nfs-mount.sh on each invocation.
+# Changes take effect on the next timer firing (within 10 seconds).
+#
+# Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
+
+# NFS_FAIL_THRESHOLD — number of consecutive fix_mount failures before the
+# node is cordoned and rebooted. The timer fires every 10 seconds, so the
+# default of 5 means ~50 seconds of continuously broken NFS before escalation.
+# A clean Rocky Linux VM reboot takes ~30s; the cluster is HA across r0/r1/r2
+# so losing one node for that window is safe.
+NFS_FAIL_THRESHOLD=5
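
Tuning is then a repo edit plus a redeploy rather than per-node surgery
(a sketch, assuming the repo layout shown in the diffstat):

  # Illustrative: stretch the window to ~80s (8 failures x 10s timer), then
  # let Rex push it out; the on_change handler restarts the timer on change.
  sed -i 's/^NFS_FAIL_THRESHOLD=.*/NFS_FAIL_THRESHOLD=8/' \
      f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
  rex -f f3s/r-nodes/Rexfile nfs_mount_monitor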
diff --git a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
index 6077e0c..e31dbe1 100644
--- a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
+++ b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
@@ -2,6 +2,9 @@
# Triggered by nfs-mount-monitor.timer (every 10 seconds).
# Logs to the journal: journalctl -u nfs-mount-monitor
#
+# NFS_FAIL_THRESHOLD (consecutive failures before auto-reboot) is tunable
+# via /etc/default/nfs-mount-monitor without touching this unit or the script.
+#
# Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
[Unit]
@@ -10,6 +13,7 @@ After=network-online.target
[Service]
Type=oneshot
+EnvironmentFile=-/etc/default/nfs-mount-monitor
ExecStart=/usr/local/bin/check-nfs-mount.sh
StandardOutput=journal
StandardError=journal
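
To confirm the unit actually picks the file up, something like the following
on a node should show the EnvironmentFile registration, the live timer, and
recent runs (illustrative commands):

  # Illustrative: verify the tunable is wired into the unit (run on any r-node).
  systemctl show nfs-mount-monitor.service -p EnvironmentFiles
  systemctl list-timers --no-pager 'nfs-mount-monitor*'
  journalctl -u nfs-mount-monitor -n 20 --no-pager

Note the threshold is consumed twice by design: systemd injects it via
EnvironmentFile, and the script also sources /etc/default/nfs-mount-monitor
directly, so a hand-run of check-nfs-mount.sh sees the same value.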