diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-10 10:30:55 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-10 10:30:55 +0300 |
| commit | 3964965c8ad5eeee16d3338ded718bbd34e1c69d (patch) | |
| tree | a1d2e6ef050b8d132cf127800851492a98da14cc | |
| parent | d6b8e0fab3777d887e0abc7b152580a169579785 (diff) | |
nfs-mount-monitor: strengthen fix_mount recovery sequence
Add lazy umount fallback, D-state process killer, stunnel restart, and
60-second hard deadline to prevent fix_mount from looping forever when
processes are stuck in D state on a stale NFSv4-over-stunnel mount.
Recovery sequence is now:
1. mount -o remount -f (cheap, no disruption)
2. kill_pinning_processes (SIGKILL D-state procs with nfs_ wchan)
3. umount -f (force unmount)
4. umount -l (lazy detach VFS node if -f failed)
5. systemctl restart stunnel + 2s sleep (refresh TLS transport)
6. mount (fresh mount)
The 60s deadline uses bash $SECONDS so fix_mount can never outlast its
own 10-second timer interval by an unbounded amount. Deployed to all
three r-nodes (r0/r1/r2) via rex nfs_mount_monitor.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
| -rw-r--r-- | f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh | 135 |
1 file changed, 118 insertions, 17 deletions
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh index 3593fb7..bc6dcd8 100644 --- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh +++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh @@ -17,6 +17,17 @@ # any pods on this node that are stuck in Unknown/Pending/ContainerCreating, # allowing the kubelet to reschedule them against the now-healthy volume. # +# fix_mount recovery sequence: +# 1. mount -o remount -f (cheapest — no disruption if mount is stale) +# 2. kill D-state processes pinning the mount (so umount can succeed) +# 3. umount -f (force unmount) +# 4. umount -l (lazy detach VFS node if -f failed) +# 5. systemctl restart stunnel + 2s sleep (refresh the TLS transport) +# 6. mount (fresh mount via stunnel) +# +# A hard 60-second deadline is enforced so the function can never outlast +# its own timer interval (10s) by more than 6x, preventing timer pile-up. +# # Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor MOUNT_POINT="/data/nfs/k3svolumes" @@ -31,46 +42,136 @@ trap "rm -f $LOCK_FILE" EXIT MOUNT_FIXED=0 +# kill_pinning_processes — send SIGKILL to any process whose wchan starts +# with "nfs_" AND whose open file descriptors or cwd point into MOUNT_POINT. +# This unblocks D-state processes so that umount can detach the filesystem. +# Kubelet/containerd will restart the affected pods automatically. +kill_pinning_processes() { + echo "Scanning for processes pinning $MOUNT_POINT..." 
+ local killed=0 + for pid_dir in /proc/[0-9]*; do + local pid + pid=$(basename "$pid_dir") + + # Skip non-existent pids that vanished while we iterate + [ -d "$pid_dir" ] || continue + + # Check whether this process is stuck in an NFS kernel wait state + local wchan + wchan=$(cat "$pid_dir/wchan" 2>/dev/null) || continue + [[ "$wchan" == nfs_* ]] || continue + + # Verify the process is actually using our mount point (cwd or fds) + local cwd_link + cwd_link=$(readlink "$pid_dir/cwd" 2>/dev/null) || true + if [[ "$cwd_link" == "$MOUNT_POINT"* ]]; then + echo "Killing pid $pid (wchan=$wchan, cwd=$cwd_link)" + kill -9 "$pid" 2>/dev/null && (( killed++ )) || true + continue + fi + + # Also check open file descriptors + local fd + for fd in "$pid_dir/fd"/*; do + local fd_target + fd_target=$(readlink "$fd" 2>/dev/null) || continue + if [[ "$fd_target" == "$MOUNT_POINT"* ]]; then + echo "Killing pid $pid (wchan=$wchan, fd=$fd_target)" + kill -9 "$pid" 2>/dev/null && (( killed++ )) || true + break + fi + done + done + echo "Killed $killed process(es) pinning $MOUNT_POINT" +} + fix_mount () { + # Hard deadline: fix_mount must complete within 60 seconds so the + # 10-second timer cannot accumulate an unbounded backlog of instances. + local deadline=$(( SECONDS + 60 )) + + check_deadline() { + if (( SECONDS >= deadline )); then + echo "fix_mount: 60-second deadline exceeded — giving up" + return 1 + fi + return 0 + } + echo "Attempting to remount NFS mount $MOUNT_POINT" + + # --- Step 1: cheap remount (no disruption if the mount is merely stale) --- if mount -o remount -f "$MOUNT_POINT" 2>/dev/null; then - echo "Remount command issued for $MOUNT_POINT" + echo "Remount succeeded for $MOUNT_POINT" else - echo "Failed to remount NFS mount $MOUNT_POINT" + echo "Remount failed for $MOUNT_POINT — proceeding to full cycle" fi - echo "Checking if $MOUNT_POINT is a mountpoint" + check_deadline || return 1 + + # If the path is already a healthy mountpoint after remount, we are done. 
if mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then - echo "$MOUNT_POINT is a valid mountpoint" + echo "$MOUNT_POINT is still a valid mountpoint after remount; trying fresh mount" else - echo "$MOUNT_POINT is not a valid mountpoint, attempting mount" - if mount "$MOUNT_POINT"; then + echo "$MOUNT_POINT is not a valid mountpoint — attempting direct mount" + if mount "$MOUNT_POINT" 2>/dev/null; then echo "Successfully mounted $MOUNT_POINT" MOUNT_FIXED=1 - return - else - echo "Failed to mount $MOUNT_POINT" + return 0 fi + echo "Direct mount failed — proceeding to umount+remount cycle" fi - echo "Attempting to unmount $MOUNT_POINT" + check_deadline || return 1 + + # --- Step 2: kill D-state processes so umount can detach cleanly --- + kill_pinning_processes + + check_deadline || return 1 + + # --- Step 3: force unmount --- + echo "Attempting forced umount of $MOUNT_POINT" if umount -f "$MOUNT_POINT" 2>/dev/null; then - echo "Successfully unmounted $MOUNT_POINT" + echo "Force umount succeeded for $MOUNT_POINT" else - echo "Failed to unmount $MOUNT_POINT (it might not be mounted)" + echo "Force umount failed for $MOUNT_POINT — trying lazy umount" + # --- Step 4: lazy umount detaches the VFS node even when processes + # are still stuck, allowing a fresh mount to bind to a clean path --- + if umount -l "$MOUNT_POINT" 2>/dev/null; then + echo "Lazy umount succeeded for $MOUNT_POINT" + else + echo "Lazy umount also failed for $MOUNT_POINT — will still attempt mount" + fi fi + check_deadline || return 1 + + # --- Step 5: restart stunnel to refresh the TLS transport --- + # The most common root cause of mount hangs is a stale stunnel client + # session (e.g. after a cluster-wide reboot or CARP failover). Restarting + # stunnel tears down the old TCP connection and forces a fresh TLS + # handshake before the mount call below. 
+ echo "Restarting stunnel to refresh TLS transport" + if systemctl restart stunnel 2>/dev/null; then + echo "stunnel restarted successfully" + else + echo "stunnel restart failed — mount may fail too" + fi + # Give stunnel two seconds to establish the new connection before mounting. + sleep 2 + + check_deadline || return 1 + + # --- Step 6: fresh mount --- echo "Attempting to mount $MOUNT_POINT" - if mount "$MOUNT_POINT"; then + if mount "$MOUNT_POINT" 2>/dev/null; then echo "NFS mount $MOUNT_POINT mounted successfully" MOUNT_FIXED=1 - return - else - echo "Failed to mount NFS mount $MOUNT_POINT" + return 0 fi echo "Failed to fix NFS mount $MOUNT_POINT" - exit 1 + return 1 } if ! mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then |
