#!/bin/bash
# NFS mount health monitor — runs every 10 seconds via systemd timer
# (nfs-mount-monitor.timer / nfs-mount-monitor.service)
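#
# Minimal sketch of the timer unit (these unit contents are an assumption
# for illustration; only the unit names above come from this setup):
#
#   [Unit]
#   Description=NFS mount health monitor timer
#
#   [Timer]
#   OnBootSec=30
#   OnUnitActiveSec=10
#
#   [Install]
#   WantedBy=timers.target
#
# The service should be Type=oneshot: systemd will not re-trigger a unit
# that is still activating, which complements the lock file used below.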
#
# Checks whether /data/nfs/k3svolumes is mounted and responsive.
# Three probes are run in order:
#   1. mountpoint  — detects completely missing mounts
#   2. stat        — detects read hangs / stale cache misses
#   3. write-probe — detects the "reads OK, writes hang" failure mode
#      (stunnel-wrapped NFSv4 can enter a state where stat returns from
#      cache but ALL writes block indefinitely; only the write probe
#      catches this — mount timeo=10 deciseconds = 1s, so 5s gives one
#      full retransmit window plus margin)
#
# If any probe fails, fix_mount is called to attempt a remount, then a
# fresh umount+mount cycle. On a successful repair the script force-deletes
# any pods on this node that are stuck in Unknown/Pending/ContainerCreating
# or not Ready, so that their controllers can recreate them against the
# now-healthy volume.
#
# fix_mount recovery sequence:
#   1. mount -o remount (cheapest — no disruption if mount is stale)
#   2. kill D-state processes pinning the mount (so umount can succeed)
#   3. umount -f (force unmount)
#   4. umount -l (lazy detach VFS node if -f failed)
#   5. systemctl restart stunnel + 2s sleep (refresh the TLS transport)
#   6. mount (fresh mount via stunnel)
#
# A hard 60-second deadline is enforced between steps, and each mount/umount
# call is wrapped in timeout(1) so a hung syscall cannot defeat that deadline.
# The function therefore never outlasts its own timer interval (10s) by much
# more than 6x, preventing timer pile-up.
#
# Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
MOUNT_POINT="/data/nfs/k3svolumes"
LOCK_FILE="/var/run/nfs-mount-check.lock"
# Use a lock file to prevent concurrent runs (timer fires every 10 s).
# noclobber makes the creation atomic, closing the race between the
# "does it exist" check and the touch in the naive version.
if ! (set -o noclobber; : > "$LOCK_FILE") 2>/dev/null; then
    exit 0
fi
trap 'rm -f "$LOCK_FILE"' EXIT
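# Set to 1 by fix_mount on success; gates the stuck-pod cleanup at the
# bottom of the script.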
MOUNT_FIXED=0
# kill_pinning_processes — send SIGKILL to any process whose wchan starts
# with "nfs_" AND whose open file descriptors or cwd point into MOUNT_POINT.
# This unblocks D-state processes so that umount can detach the filesystem.
# Kubelet/containerd will restart the affected pods automatically.
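# SIGKILL works on these tasks because modern NFS waits are TASK_KILLABLE
# (e.g. wchan nfs_wait_bit_killable) rather than plain uninterruptible sleep.
# Tasks blocked one layer down in the sunrpc code report rpc_* wchan values
# and are not matched by the nfs_* filter.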
kill_pinning_processes() {
    echo "Scanning for processes pinning $MOUNT_POINT..."
    local killed=0
    for pid_dir in /proc/[0-9]*; do
        local pid
        pid=$(basename "$pid_dir")
        # Skip pids that vanished while we were iterating
        [ -d "$pid_dir" ] || continue
        # Check whether this process is stuck in an NFS kernel wait state
        local wchan
        wchan=$(cat "$pid_dir/wchan" 2>/dev/null) || continue
        [[ "$wchan" == nfs_* ]] || continue
        # Verify the process is actually using our mount point (cwd or fds)
        local cwd_link
        cwd_link=$(readlink "$pid_dir/cwd" 2>/dev/null) || true
        if [[ "$cwd_link" == "$MOUNT_POINT"* ]]; then
            echo "Killing pid $pid (wchan=$wchan, cwd=$cwd_link)"
            kill -9 "$pid" 2>/dev/null && (( killed++ )) || true
            continue
        fi
        # Also check open file descriptors
        local fd
        for fd in "$pid_dir/fd"/*; do
            local fd_target
            fd_target=$(readlink "$fd" 2>/dev/null) || continue
            if [[ "$fd_target" == "$MOUNT_POINT"* ]]; then
                echo "Killing pid $pid (wchan=$wchan, fd=$fd_target)"
                kill -9 "$pid" 2>/dev/null && (( killed++ )) || true
                break
            fi
        done
    done
    echo "Killed $killed process(es) pinning $MOUNT_POINT"
}
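# Note: fuser -km "$MOUNT_POINT" would do roughly the same job, but fuser
# (like lsof) can itself hang while inspecting a dead NFS mount, which is
# why this walks /proc directly with per-entry readlink calls instead.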
fix_mount () {
    # Hard deadline: fix_mount must complete within 60 seconds so the
    # 10-second timer cannot accumulate an unbounded backlog of instances.
    local deadline=$(( SECONDS + 60 ))
    # Nested definition: check_deadline sees fix_mount's local deadline
    # through bash's dynamic scoping, since it is only called from here.
    check_deadline() {
        if (( SECONDS >= deadline )); then
            echo "fix_mount: 60-second deadline exceeded — giving up"
            return 1
        fi
        return 0
    }
    echo "Attempting to remount NFS mount $MOUNT_POINT"
    # --- Step 1: cheap remount (no disruption if the mount is merely stale) ---
    # timeout(1) bounds the call: mount can block indefinitely on a dead NFS
    # server, and the deadline is only checked between steps.
    if timeout 10 mount -o remount "$MOUNT_POINT" 2>/dev/null; then
        echo "Remount succeeded for $MOUNT_POINT"
    else
        echo "Remount failed for $MOUNT_POINT — proceeding to full cycle"
    fi
    check_deadline || return 1
    # The remount alone does not prove health: if the path is still a
    # mountpoint we fall through to the full umount+mount cycle below;
    # if it is not mounted at all, a direct mount may be enough.
    if mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then
        echo "$MOUNT_POINT is still a valid mountpoint after remount; continuing with full umount+mount cycle"
    else
        echo "$MOUNT_POINT is not a valid mountpoint — attempting direct mount"
        if timeout 10 mount "$MOUNT_POINT" 2>/dev/null; then
            echo "Successfully mounted $MOUNT_POINT"
            MOUNT_FIXED=1
            return 0
        fi
        echo "Direct mount failed — proceeding to umount+remount cycle"
    fi
    check_deadline || return 1
    # --- Step 2: kill D-state processes so umount can detach cleanly ---
    kill_pinning_processes
    check_deadline || return 1
    # --- Step 3: force unmount ---
    echo "Attempting forced umount of $MOUNT_POINT"
    if timeout 10 umount -f "$MOUNT_POINT" 2>/dev/null; then
        echo "Force umount succeeded for $MOUNT_POINT"
    else
        echo "Force umount failed for $MOUNT_POINT — trying lazy umount"
        # --- Step 4: lazy umount detaches the VFS node even when processes
        # are still stuck, allowing a fresh mount to bind to a clean path ---
        if timeout 10 umount -l "$MOUNT_POINT" 2>/dev/null; then
            echo "Lazy umount succeeded for $MOUNT_POINT"
        else
            echo "Lazy umount also failed for $MOUNT_POINT — will still attempt mount"
        fi
    fi
    check_deadline || return 1
    # --- Step 5: restart stunnel to refresh the TLS transport ---
    # The most common root cause of mount hangs is a stale stunnel client
    # session (e.g. after a cluster-wide reboot or CARP failover). Restarting
    # stunnel tears down the old TCP connection and forces a fresh TLS
    # handshake before the mount call below.
    echo "Restarting stunnel to refresh TLS transport"
    if systemctl restart stunnel 2>/dev/null; then
        echo "stunnel restarted successfully"
    else
        echo "stunnel restart failed — mount may fail too"
    fi
    # Give stunnel two seconds to establish the new connection before mounting.
    sleep 2
    check_deadline || return 1
    # --- Step 6: fresh mount ---
    echo "Attempting to mount $MOUNT_POINT"
    if timeout 10 mount "$MOUNT_POINT" 2>/dev/null; then
        echo "NFS mount $MOUNT_POINT mounted successfully"
        MOUNT_FIXED=1
        return 0
    fi
    echo "Failed to fix NFS mount $MOUNT_POINT"
    return 1
}
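# --- Top-level health probes, cheapest first; any failure hands off to
# fix_mount, and MOUNT_FIXED records whether a repair succeeded ---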
if ! mountpoint "$MOUNT_POINT" >/dev/null 2>&1; then
    echo "NFS mount $MOUNT_POINT not found"
    fix_mount
fi
if ! timeout 2s stat "$MOUNT_POINT" >/dev/null 2>&1; then
    echo "NFS mount $MOUNT_POINT appears to be unresponsive"
    fix_mount
fi
# Write-probe: detect the "reads OK, writes hang" failure mode.
# A per-host filename prevents r0/r1/r2 from racing on the same file.
# Timeout of 5s covers one full NFS retransmit window (timeo=10 = 1s,
# retrans=2) plus margin, without making the 10-second timer run too long.
HEALTHCHECK_FILE="$MOUNT_POINT/.healthcheck.$(hostname)"
if ! timeout 5s sh -c "echo \$\$ > '$HEALTHCHECK_FILE' && rm -f '$HEALTHCHECK_FILE'" 2>/dev/null; then
    echo "NFS writes hanging on $MOUNT_POINT"
    fix_mount
fi
# After a successful repair, force-delete pods stuck on this node so their
# controllers can recreate them against the healthy mount
if [ "$MOUNT_FIXED" -eq 1 ]; then
    echo "Mount was fixed, checking for stuck pods on this node..."
    NODE=$(hostname)
    export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
    kubectl get pods --all-namespaces --field-selector="spec.nodeName=$NODE" \
        -o json 2>/dev/null | jq -r '
        .items[] |
        select(
            .status.phase == "Unknown" or
            .status.phase == "Pending" or
            (.status.conditions // [] | any(.type == "Ready" and .status == "False")) or
            (.status.containerStatuses // [] | any(.state.waiting.reason == "ContainerCreating"))
        ) | "\(.metadata.namespace) \(.metadata.name)"' | \
    while read -r ns pod; do
        echo "Deleting stuck pod $ns/$pod"
        kubectl delete pod -n "$ns" "$pod" --grace-period=0 --force 2>&1
    done
fi
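# Manual smoke test after deploying (unit name from the header above):
#   systemctl start nfs-mount-monitor.service    # single one-shot run
#   journalctl -u nfs-mount-monitor.service -n 50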