author     Paul Buetow <paul@buetow.org>    2026-05-10 10:37:53 +0300
committer  Paul Buetow <paul@buetow.org>    2026-05-10 10:37:53 +0300
commit     965e61016751d132fe83a8f44c6a1bf87d92b1a8
tree       c95a7067681512ff5a7a05a03c99e40d6db6ad3c
parent     3964965c8ad5eeee16d3338ded718bbd34e1c69d
nfs-mount-monitor: escalate to reboot after N consecutive fix_mount failures
Persist a consecutive-failure counter to /var/lib/nfs-mount-monitor/fail-count.
Increment on every fix_mount failure; reset to 0 on any successful repair or
when all three probes pass cleanly. After NFS_FAIL_THRESHOLD (default 5, ~50s)
consecutive failures, the node is cordoned via kubectl and rebooted with
'systemctl reboot' so the cluster stops routing pods to a silently broken node.
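On a node the failure streak and any escalation can be inspected directly.
The paths, unit name, and kubeconfig below are the ones introduced by this
change; the exact invocations are only an illustrative sketch:

    # current consecutive-failure count (0 or absent on a healthy node)
    cat /var/lib/nfs-mount-monitor/fail-count

    # recent monitor output, including the "consecutive failures: N / M" lines
    journalctl -u nfs-mount-monitor -n 50

    # after an auto-reboot the node comes back cordoned (SchedulingDisabled);
    # nothing in this change uncordons it, so that remains a manual step
    export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
    kubectl get node "$(hostname)"
    kubectl uncordon "$(hostname)"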
NFS_FAIL_THRESHOLD is configurable via /etc/default/nfs-mount-monitor (deployed
as an EnvironmentFile in the .service unit) without touching the script.
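Because the file is managed by Rex, a local edit on a node would be reverted
the next time the task runs; a workflow consistent with that is to change the
repo copy and redeploy. Sketch only, with the value 12 chosen purely as an
example:

    # f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
    NFS_FAIL_THRESHOLD=12   # 12 x 10s timer interval, i.e. ~2 minutes of failures

    rex -f f3s/r-nodes/Rexfile nfs_mount_monitor

The new value applies on the next timer firing, since the script sources the
file on every invocation.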
Also fix Rexfile path resolution: __FILE__ inside a Rex task resolves to the
internal Rex loader path, not the Rexfile itself; use realpath($::rexfile)
instead.
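A quick sanity check of the path fix (node name is a placeholder; root SSH to
the r-nodes is assumed, as set up via authorized_keys): the deploy should now
resolve its source files from the Rexfile's real location regardless of how
-f is given:

    # -f may be a relative or an absolute path
    rex -f f3s/r-nodes/Rexfile nfs_mount_monitor

    # spot-check one node afterwards
    ssh root@r0 'systemctl status nfs-mount-monitor.timer; cat /etc/default/nfs-mount-monitor'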
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
-rw-r--r--  f3s/r-nodes/Rexfile                                      |  35
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh         | 118
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default  |  13
-rw-r--r--  f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service  |   4
4 files changed, 163 insertions, 7 deletions
diff --git a/f3s/r-nodes/Rexfile b/f3s/r-nodes/Rexfile
index 846b539..0dc2aea 100644
--- a/f3s/r-nodes/Rexfile
+++ b/f3s/r-nodes/Rexfile
@@ -10,8 +10,14 @@ use Rex -feature => [ '1.14', 'exec_autodie' ];
 use Rex::Logger;
 use File::Basename qw(dirname);
 use File::Spec::Functions qw(catfile rel2abs);
+use Cwd qw(realpath);
 
-my $RNODES_DIR = dirname( rel2abs(__FILE__) );
+# Rex loads the Rexfile as a synthetic module (__Rexfile__.pm) via @INC, so
+# __FILE__ resolves to the internal Rex loader path rather than this file.
+# $::rexfile is set to $0 (the -f argument) in Rex::CLI before any tasks run;
+# realpath() resolves any relative component against the CWD at load time so
+# the path remains valid even when Rex forks worker processes for parallelism.
+my $RNODES_DIR = dirname( realpath($::rexfile) );
 
 # All three k3s Rocky Linux VMs; root SSH is configured via authorized_keys.
 group r_nodes => qw(
@@ -26,14 +32,16 @@ sudo FALSE;
 # Deploy in parallel — tasks are idempotent and independent per node.
 parallelism 3;
 
-# Deploy the NFS mount health-monitor script and its systemd units to
-# all three r-nodes, then reload systemd and restart the timer so the
-# new files take effect immediately.
+# Deploy the NFS mount health-monitor script, its systemd units, and the
+# tunable configuration file to all three r-nodes, then reload systemd and
+# restart the timer so the new files take effect immediately.
 #
 # Files managed:
 #   /usr/local/bin/check-nfs-mount.sh (monitor + auto-repair script)
+#   /etc/default/nfs-mount-monitor (tunable: NFS_FAIL_THRESHOLD)
 #   /etc/systemd/system/nfs-mount-monitor.service
 #   /etc/systemd/system/nfs-mount-monitor.timer
+#   /var/lib/nfs-mount-monitor/ (state dir for fail-count file)
 #
 # Idempotent: Rex only writes the file when content changes; the
 # on_change handler reloads systemd and restarts the timer only when
@@ -47,6 +55,14 @@ task 'nfs_mount_monitor',
     # Reload flag — set to 1 if any file changed, so we only reload once.
     my $changed = 0;
 
+    # Ensure the state directory for the fail counter exists with tight
+    # permissions (only root should read/write the counter).
+    file '/var/lib/nfs-mount-monitor',
+      ensure => 'directory',
+      owner  => 'root',
+      group  => 'root',
+      mode   => '700';
+
     # Deploy the health-monitor script.
     file '/usr/local/bin/check-nfs-mount.sh',
       source    => catfile( $monitor_dir, 'check-nfs-mount.sh' ),
@@ -55,6 +71,17 @@
       mode      => '755',
       on_change => sub { $changed = 1 };
 
+    # Deploy the tunable configuration (NFS_FAIL_THRESHOLD).
+    # The leading '-' in EnvironmentFile=-/etc/default/... means systemd
+    # tolerates the file being absent, but we deploy it so the threshold
+    # is explicitly documented on each node.
+    file '/etc/default/nfs-mount-monitor',
+      source    => catfile( $monitor_dir, 'nfs-mount-monitor.default' ),
+      owner     => 'root',
+      group     => 'root',
+      mode      => '644',
+      on_change => sub { $changed = 1 };
+
     # Deploy the systemd service unit.
     file '/etc/systemd/system/nfs-mount-monitor.service',
       source    => catfile( $monitor_dir, 'nfs-mount-monitor.service' ),
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
index bc6dcd8..dd71a4d 100644
--- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
+++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh
@@ -28,11 +28,33 @@
 # A hard 60-second deadline is enforced so the function can never outlast
 # its own timer interval (10s) by more than 6x, preventing timer pile-up.
 #
+# Consecutive-failure escalation:
+# Each fix_mount failure increments a counter persisted to
+# /var/lib/nfs-mount-monitor/fail-count. A successful repair resets
+# the counter to 0. When the counter reaches NFS_FAIL_THRESHOLD (default
+# 5, configurable via /etc/default/nfs-mount-monitor), the node is cordoned
+# via kubectl so the scheduler stops placing new pods here, a loud message
+# is written to the journal, and 'systemctl reboot' is issued.
+# With the timer firing every 10s, threshold=5 means ~50s of continuously
+# broken NFS before an auto-reboot — safe because r0/r1/r2 form an HA
+# cluster and a Rocky Linux VM reboots in ~30s.
+#
 # Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
 
 MOUNT_POINT="/data/nfs/k3svolumes"
 LOCK_FILE="/var/run/nfs-mount-check.lock"
 
+# State directory for the fail counter; created if absent.
+STATE_DIR="/var/lib/nfs-mount-monitor"
+FAIL_COUNT_FILE="$STATE_DIR/fail-count"
+
+# Load tunable configuration (NFS_FAIL_THRESHOLD) from the EnvironmentFile
+# deployed alongside this script. Defaults are defined here so the script
+# works even if the file is absent.
+NFS_FAIL_THRESHOLD=5
+# shellcheck source=/etc/default/nfs-mount-monitor
+[ -f /etc/default/nfs-mount-monitor ] && . /etc/default/nfs-mount-monitor
+
 # Use a lock file to prevent concurrent runs (timer fires every 10 s)
 if [ -f "$LOCK_FILE" ]; then
   exit 0
@@ -42,6 +64,26 @@
 trap "rm -f $LOCK_FILE" EXIT
 MOUNT_FIXED=0
 
+# read_fail_count — return the current consecutive-failure counter.
+# Returns 0 if the file is absent or contains a non-integer.
+read_fail_count() {
+  local count=0
+  if [ -f "$FAIL_COUNT_FILE" ]; then
+    count=$(< "$FAIL_COUNT_FILE")
+    # Guard against corrupt file contents
+    [[ "$count" =~ ^[0-9]+$ ]] || count=0
+  fi
+  echo "$count"
+}
+
+# write_fail_count — persist COUNT to the state file, creating the
+# directory if it does not yet exist.
+write_fail_count() {
+  local count="$1"
+  mkdir -p "$STATE_DIR"
+  echo "$count" > "$FAIL_COUNT_FILE"
+}
+
 # kill_pinning_processes — send SIGKILL to any process whose wchan starts
 # with "nfs_" AND whose open file descriptors or cwd point into MOUNT_POINT.
 # This unblocks D-state processes so that umount can detach the filesystem.
@@ -174,14 +216,68 @@ fix_mount () {
   return 1
 }
 
+# escalate_reboot — cordon the k3s node so the scheduler stops placing new
+# pods here, log loudly to the journal, then trigger a clean reboot.
+# Called only after NFS_FAIL_THRESHOLD consecutive fix_mount failures.
+escalate_reboot() {
+  local node
+  node=$(hostname)
+  export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
+
+  echo "CRITICAL: NFS mount $MOUNT_POINT has failed $NFS_FAIL_THRESHOLD" \
+    "consecutive repair attempts — escalating to reboot"
+
+  # Cordon the node so the scheduler will not place new pods here while
+  # the reboot is in progress. Failure to cordon is non-fatal: we still
+  # reboot because a broken NFS node is worse than an uncordoned one.
+  if kubectl cordon "$node" 2>&1; then
+    echo "Node $node cordoned successfully"
+  else
+    echo "kubectl cordon failed (will reboot anyway)"
+  fi
+
+  # systemd-journald flushes on SIGTERM, which systemctl reboot sends to
+  # all services before the node goes down — the message above will survive.
+  echo "Initiating systemctl reboot to recover broken NFS mount"
+  systemctl reboot
+}
+
+# run_fix_mount_with_counter — call fix_mount and update the consecutive-
+# failure counter. On success, counter is reset to 0. On failure, counter
+# is incremented; if it reaches NFS_FAIL_THRESHOLD, escalate_reboot is called.
+run_fix_mount_with_counter() {
+  if fix_mount; then
+    # Repair succeeded — reset the failure streak.
+    write_fail_count 0
+    echo "NFS repair succeeded; consecutive-failure counter reset to 0"
+  else
+    # Repair failed — increment the counter and check the threshold.
+    local count
+    count=$(read_fail_count)
+    (( count++ ))
+    write_fail_count "$count"
+    echo "NFS repair failed; consecutive failures: $count / $NFS_FAIL_THRESHOLD"
+
+    if (( count >= NFS_FAIL_THRESHOLD )); then
+      escalate_reboot
+    fi
+  fi
+}
+
+# PROBE_FAILED tracks whether any probe fired run_fix_mount_with_counter.
+# If no probe fires, all checks passed cleanly and we can reset the counter.
+PROBE_FAILED=0
+
 if ! mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then
   echo "NFS mount $MOUNT_POINT not found"
-  fix_mount
+  run_fix_mount_with_counter
+  PROBE_FAILED=1
 fi
 
 if ! timeout 2s stat "$MOUNT_POINT" >/dev/null 2>&1; then
   echo "NFS mount $MOUNT_POINT appears to be unresponsive"
-  fix_mount
+  run_fix_mount_with_counter
+  PROBE_FAILED=1
 fi
 
 # Write-probe: detect the "reads OK, writes hang" failure mode.
@@ -191,7 +287,19 @@ fi
 HEALTHCHECK_FILE="$MOUNT_POINT/.healthcheck.$(hostname)"
 if ! timeout 5s sh -c "echo \$\$ > '$HEALTHCHECK_FILE' && rm -f '$HEALTHCHECK_FILE'" 2>/dev/null; then
   echo "NFS writes hanging on $MOUNT_POINT"
-  fix_mount
+  run_fix_mount_with_counter
+  PROBE_FAILED=1
+fi
+
+# If all three probes passed cleanly (no repair attempt needed), reset the
+# consecutive-failure counter so a previous partial failure streak does not
+# lower the effective reboot threshold. We only write the file when the
+# counter is non-zero to avoid unnecessary writes on every healthy run.
+if [ "$PROBE_FAILED" -eq 0 ]; then
+  if [ "$(read_fail_count)" -ne 0 ]; then
+    write_fail_count 0
+    echo "All probes passed; consecutive-failure counter reset to 0"
+  fi
 fi
 
 # After a successful remount, delete pods stuck on this node
@@ -212,4 +320,8 @@ if [ "$MOUNT_FIXED" -eq 1 ]; then
     echo "Deleting stuck pod $ns/$pod"
     kubectl delete pod -n "$ns" "$pod" --grace-period=0 --force 2>&1
   done
+
+  # On a healthy remount, also ensure the fail counter is reset.
+  write_fail_count 0
+  echo "Stuck-pod cleanup done; consecutive-failure counter reset to 0"
 fi
diff --git a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
new file mode 100644
index 0000000..e8a27c7
--- /dev/null
+++ b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.default
@@ -0,0 +1,13 @@
+# /etc/default/nfs-mount-monitor
+# Configuration for the NFS mount health monitor.
+# Sourced by /usr/local/bin/check-nfs-mount.sh on each invocation.
+# Changes take effect on the next timer firing (within 10 seconds).
+#
+# Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
+
+# NFS_FAIL_THRESHOLD — number of consecutive fix_mount failures before the
+# node is cordoned and rebooted. The timer fires every 10 seconds, so the
+# default of 5 means ~50 seconds of continuously broken NFS before escalation.
+# A clean Rocky Linux VM reboot takes ~30s; the cluster is HA across r0/r1/r2
+# so losing one node for that window is safe.
+NFS_FAIL_THRESHOLD=5
diff --git a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
index 6077e0c..e31dbe1 100644
--- a/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
+++ b/f3s/r-nodes/nfs-mount-monitor/nfs-mount-monitor.service
@@ -2,6 +2,9 @@
 # Triggered by nfs-mount-monitor.timer (every 10 seconds).
 # Logs to the journal: journalctl -u nfs-mount-monitor
 #
+# NFS_FAIL_THRESHOLD (consecutive failures before auto-reboot) is tunable
+# via /etc/default/nfs-mount-monitor without touching this unit or the script.
+#
 # Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
 
 [Unit]
@@ -10,6 +13,7 @@ After=network-online.target
 
 [Service]
 Type=oneshot
+EnvironmentFile=-/etc/default/nfs-mount-monitor
 ExecStart=/usr/local/bin/check-nfs-mount.sh
 StandardOutput=journal
 StandardError=journal
