diff options
| author | Paul Buetow <paul@buetow.org> | 2026-05-10 10:30:55 +0300 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-05-10 10:30:55 +0300 |
| commit | 3964965c8ad5eeee16d3338ded718bbd34e1c69d (patch) | |
| tree | a1d2e6ef050b8d132cf127800851492a98da14cc | |
| parent | d6b8e0fab3777d887e0abc7b152580a169579785 (diff) | |
nfs-mount-monitor: strengthen fix_mount recovery sequence
Add lazy umount fallback, D-state process killer, stunnel restart, and
60-second hard deadline to prevent fix_mount from looping forever when
processes are stuck in D state on a stale NFSv4-over-stunnel mount.
Recovery sequence is now:
1. mount -o remount -f (cheap, no disruption)
2. kill_pinning_processes (SIGKILL D-state procs with nfs_ wchan)
3. umount -f (force unmount)
4. umount -l (lazy detach VFS node if -f failed)
5. systemctl restart stunnel + 2s sleep (refresh TLS transport)
6. mount (fresh mount)
The 60s deadline uses bash $SECONDS so fix_mount can never outlast its
own 10-second timer interval by an unbounded amount. Deployed to all
three r-nodes (r0/r1/r2) via rex nfs_mount_monitor.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
| -rw-r--r-- | f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh | 135 |
1 file changed, 118 insertions, 17 deletions
diff --git a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh index 3593fb7..bc6dcd8 100644 --- a/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh +++ b/f3s/r-nodes/nfs-mount-monitor/check-nfs-mount.sh @@ -17,6 +17,17 @@ # any pods on this node that are stuck in Unknown/Pending/ContainerCreating, # allowing the kubelet to reschedule them against the now-healthy volume. # +# fix_mount recovery sequence: +# 1. mount -o remount -f (cheapest — no disruption if mount is stale) +# 2. kill D-state processes pinning the mount (so umount can succeed) +# 3. umount -f (force unmount) +# 4. umount -l (lazy detach VFS node if -f failed) +# 5. systemctl restart stunnel + 2s sleep (refresh the TLS transport) +# 6. mount (fresh mount via stunnel) +# +# A hard 60-second deadline is enforced so the function can never outlast +# its own timer interval (10s) by more than 6x, preventing timer pile-up. +# # Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor MOUNT_POINT="/data/nfs/k3svolumes" @@ -31,46 +42,136 @@ trap "rm -f $LOCK_FILE" EXIT MOUNT_FIXED=0 +# kill_pinning_processes — send SIGKILL to any process whose wchan starts +# with "nfs_" AND whose open file descriptors or cwd point into MOUNT_POINT. +# This unblocks D-state processes so that umount can detach the filesystem. +# Kubelet/containerd will restart the affected pods automatically. +kill_pinning_processes() { + echo "Scanning for processes pinning $MOUNT_POINT..." 
+ local killed=0 + for pid_dir in /proc/[0-9]*; do + local pid + pid=$(basename "$pid_dir") + + # Skip non-existent pids that vanished while we iterate + [ -d "$pid_dir" ] || continue + + # Check whether this process is stuck in an NFS kernel wait state + local wchan + wchan=$(cat "$pid_dir/wchan" 2>/dev/null) || continue + [[ "$wchan" == nfs_* ]] || continue + + # Verify the process is actually using our mount point (cwd or fds) + local cwd_link + cwd_link=$(readlink "$pid_dir/cwd" 2>/dev/null) || true + if [[ "$cwd_link" == "$MOUNT_POINT"* ]]; then + echo "Killing pid $pid (wchan=$wchan, cwd=$cwd_link)" + kill -9 "$pid" 2>/dev/null && (( killed++ )) || true + continue + fi + + # Also check open file descriptors + local fd + for fd in "$pid_dir/fd"/*; do + local fd_target + fd_target=$(readlink "$fd" 2>/dev/null) || continue + if [[ "$fd_target" == "$MOUNT_POINT"* ]]; then + echo "Killing pid $pid (wchan=$wchan, fd=$fd_target)" + kill -9 "$pid" 2>/dev/null && (( killed++ )) || true + break + fi + done + done + echo "Killed $killed process(es) pinning $MOUNT_POINT" +} + fix_mount () { + # Hard deadline: fix_mount must complete within 60 seconds so the + # 10-second timer cannot accumulate an unbounded backlog of instances. + local deadline=$(( SECONDS + 60 )) + + check_deadline() { + if (( SECONDS >= deadline )); then + echo "fix_mount: 60-second deadline exceeded — giving up" + return 1 + fi + return 0 + } + echo "Attempting to remount NFS mount $MOUNT_POINT" + + # --- Step 1: cheap remount (no disruption if the mount is merely stale) --- if mount -o remount -f "$MOUNT_POINT" 2>/dev/null; then - echo "Remount command issued for $MOUNT_POINT" + echo "Remount succeeded for $MOUNT_POINT" else - echo "Failed to remount NFS mount $MOUNT_POINT" + echo "Remount failed for $MOUNT_POINT — proceeding to full cycle" fi - echo "Checking if $MOUNT_POINT is a mountpoint" + check_deadline || return 1 + + # If the path is already a healthy mountpoint after remount, we are done. 
if mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then - echo "$MOUNT_POINT is a valid mountpoint" + echo "$MOUNT_POINT is still a valid mountpoint after remount; trying fresh mount" else - echo "$MOUNT_POINT is not a valid mountpoint, attempting mount" - if mount "$MOUNT_POINT"; then + echo "$MOUNT_POINT is not a valid mountpoint — attempting direct mount" + if mount "$MOUNT_POINT" 2>/dev/null; then echo "Successfully mounted $MOUNT_POINT" MOUNT_FIXED=1 - return - else - echo "Failed to mount $MOUNT_POINT" + return 0 fi + echo "Direct mount failed — proceeding to umount+remount cycle" fi - echo "Attempting to unmount $MOUNT_POINT" + check_deadline || return 1 + + # --- Step 2: kill D-state processes so umount can detach cleanly --- + kill_pinning_processes + + check_deadline || return 1 + + # --- Step 3: force unmount --- + echo "Attempting forced umount of $MOUNT_POINT" if umount -f "$MOUNT_POINT" 2>/dev/null; then - echo "Successfully unmounted $MOUNT_POINT" + echo "Force umount succeeded for $MOUNT_POINT" else - echo "Failed to unmount $MOUNT_POINT (it might not be mounted)" + echo "Force umount failed for $MOUNT_POINT — trying lazy umount" + # --- Step 4: lazy umount detaches the VFS node even when processes + # are still stuck, allowing a fresh mount to bind to a clean path --- + if umount -l "$MOUNT_POINT" 2>/dev/null; then + echo "Lazy umount succeeded for $MOUNT_POINT" + else + echo "Lazy umount also failed for $MOUNT_POINT — will still attempt mount" + fi fi + check_deadline || return 1 + + # --- Step 5: restart stunnel to refresh the TLS transport --- + # The most common root cause of mount hangs is a stale stunnel client + # session (e.g. after a cluster-wide reboot or CARP failover). Restarting + # stunnel tears down the old TCP connection and forces a fresh TLS + # handshake before the mount call below. 
+ echo "Restarting stunnel to refresh TLS transport" + if systemctl restart stunnel 2>/dev/null; then + echo "stunnel restarted successfully" + else + echo "stunnel restart failed — mount may fail too" + fi + # Give stunnel two seconds to establish the new connection before mounting. + sleep 2 + + check_deadline || return 1 + + # --- Step 6: fresh mount --- echo "Attempting to mount $MOUNT_POINT" - if mount "$MOUNT_POINT"; then + if mount "$MOUNT_POINT" 2>/dev/null; then echo "NFS mount $MOUNT_POINT mounted successfully" MOUNT_FIXED=1 - return - else - echo "Failed to mount NFS mount $MOUNT_POINT" + return 0 fi echo "Failed to fix NFS mount $MOUNT_POINT" - exit 1 + return 1 } if ! mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then |
