blob: 7fabed8a685942211819927abf30e451a885a3cd (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
#!/bin/bash
# NFS mount health monitor — runs every 10 seconds via systemd timer
# (nfs-mount-monitor.timer / nfs-mount-monitor.service)
#
# Checks whether /data/nfs/k3svolumes is mounted and responsive.
# If the mount is stale or missing it attempts a remount, then a
# fresh umount+mount cycle. On a successful repair it force-deletes
# any pods on this node that are Unknown, Pending, not Ready, or stuck in
# ContainerCreating, so that they can be recreated against the
# now-healthy volume.
#
# Deploy via Rex: rex -f f3s/r-nodes/Rexfile nfs_mount_monitor
MOUNT_POINT="/data/nfs/k3svolumes"
LOCK_FILE="/var/run/nfs-mount-check.lock"

# Prevent concurrent runs (the timer fires every 10 s).
#
# The lock is a kernel flock held on fd 9 for the lifetime of this process,
# not a "check -f, then touch" marker file:
#   * acquiring it is atomic, so two instances cannot both pass the check;
#   * the kernel drops it when the process dies for any reason (including
#     SIGKILL), so a crashed run can never leave a stale lock behind that
#     would silently disable the monitor.
# The lock file itself is deliberately never removed: unlinking a file that
# other instances may be about to open and lock re-introduces a race.
if exec 9>>"$LOCK_FILE"; then
  flock -n 9
  lock_rc=$?
  if [ "$lock_rc" -eq 1 ]; then
    # flock -n exits 1 when another instance already holds the lock.
    exit 0
  elif [ "$lock_rc" -ne 0 ]; then
    # Any other status (e.g. flock not installed) is not lock contention;
    # keep monitoring rather than silently doing nothing.
    echo "Warning: flock failed (exit $lock_rc); continuing without lock" >&2
  fi
else
  echo "Warning: cannot open lock file $LOCK_FILE; continuing without lock" >&2
fi

# Set to 1 by fix_mount once a mount attempt succeeds; read at the end of
# the script to decide whether stuck pods need to be cleaned up.
MOUNT_FIXED=0
# fix_mount: try to bring $MOUNT_POINT back to a working state.
#
# Steps, in order:
#   1. Forced remount of the existing mount (result is only logged).
#   2. If the path is not currently a mountpoint, a plain mount; success
#      here ends the repair.
#   3. Otherwise (or if step 2 failed), a forced umount followed by a
#      fresh mount.
#
# Globals:  MOUNT_POINT (read), MOUNT_FIXED (set to 1 on success)
# Outputs:  progress messages on stdout
# Returns:  0 when a mount succeeded; terminates the script with exit
#           status 1 when every attempt failed.
fix_mount () {
  local target="$MOUNT_POINT"

  # Step 1: in-place remount. stderr is suppressed; only the outcome is logged.
  echo "Attempting to remount NFS mount $target"
  if ! mount -o remount -f "$target" 2>/dev/null; then
    echo "Failed to remount NFS mount $target"
  else
    echo "Remount command issued for $target"
  fi

  # Step 2: if nothing is mounted there at all, a plain mount may be enough.
  echo "Checking if $target is a mountpoint"
  if ! mountpoint "$target" >/dev/null 2>&1; then
    echo "$target is not a valid mountpoint, attempting mount"
    if mount "$target"; then
      echo "Successfully mounted $target"
      MOUNT_FIXED=1
      return
    fi
    echo "Failed to mount $target"
  else
    echo "$target is a valid mountpoint"
  fi

  # Step 3: full cycle. A failing umount is tolerated because the path may
  # simply not be mounted at this point.
  echo "Attempting to unmount $target"
  if ! umount -f "$target" 2>/dev/null; then
    echo "Failed to unmount $target (it might not be mounted)"
  else
    echo "Successfully unmounted $target"
  fi

  echo "Attempting to mount $target"
  if ! mount "$target"; then
    echo "Failed to mount NFS mount $target"
    echo "Failed to fix NFS mount $target"
    exit 1
  fi

  echo "NFS mount $target mounted successfully"
  MOUNT_FIXED=1
  return
}
# Probe 1: the path must be an active mountpoint; otherwise attempt a repair.
mountpoint "$MOUNT_POINT" >/dev/null 2>&1 || {
  echo "NFS mount $MOUNT_POINT not found"
  fix_mount
}

# Probe 2: the mount must answer a stat within 2 seconds; a timeout or error
# is treated as an unresponsive mount and triggers a repair.
timeout 2s stat "$MOUNT_POINT" >/dev/null 2>&1 || {
  echo "NFS mount $MOUNT_POINT appears to be unresponsive"
  fix_mount
}
# After a successful remount, force-delete pods on this node that are
# Unknown, Pending, not Ready, or waiting in ContainerCreating.
#
# Each stage (list, filter, delete) is run separately so that a failing
# kubectl or jq is reported instead of being silently swallowed by a
# pipeline; a failure here is logged but does not abort the script, since
# the mount itself has already been repaired.
if [ "$MOUNT_FIXED" -eq 1 ]; then
  echo "Mount was fixed, checking for stuck pods on this node..."
  NODE=$(hostname)
  export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

  # kubectl's stderr is left attached so API errors end up in the log.
  if ! pods_json=$(kubectl get pods --all-namespaces \
      --field-selector="spec.nodeName=$NODE" -o json); then
    echo "Failed to list pods on node $NODE, skipping stuck pod cleanup" >&2
  elif ! stuck_pods=$(jq -r '
.items[] |
select(
.status.phase == "Unknown" or
.status.phase == "Pending" or
(.status.conditions // [] | any(.type == "Ready" and .status == "False")) or
(.status.containerStatuses // [] | any(.state.waiting.reason == "ContainerCreating"))
) | "\(.metadata.namespace) \(.metadata.name)"' <<<"$pods_json"); then
    echo "Failed to parse pod list for node $NODE, skipping stuck pod cleanup" >&2
  else
    # One "namespace name" pair per line; -r keeps read from mangling input.
    while read -r ns pod; do
      # The here-string yields a single empty line when nothing matched.
      if [ -z "$ns" ] || [ -z "$pod" ]; then
        continue
      fi
      echo "Deleting stuck pod $ns/$pod"
      # stdin is detached so kubectl can never consume the loop's input.
      kubectl delete pod -n "$ns" "$pod" --grace-period=0 --force 2>&1 </dev/null
    done <<<"$stuck_pods"
  fi
fi
|