K3s+NFS+Longhorn全套部署脚本

Posted by Forgus on 2026-03-18

部署脚本

1. 基础配置脚本(all_nodes_prepare.sh,所有节点执行)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash
# all_nodes_prepare.sh - base preparation for every cluster node (run as root).
set -e

# ========== 1. APT mirror configuration ==========
echo "===== 配置国内软件源 ====="

# Back up a file exactly once.
# Arguments: $1 - path of the file to back up
# A missing source file is tolerated (a bare `mv` would abort under `set -e`),
# and an existing backup is never overwritten, so re-running the script keeps
# the pristine copy instead of replacing it with the already-rewritten file.
backup_once() {
  local src="$1"
  if [[ -f "$src" && ! -e "${src}.bak" ]]; then
    cp -p -- "$src" "${src}.bak"
  fi
}

# Write the Debian 12 (bookworm) USTC mirror list used by both architectures.
# `non-free-firmware` is included because Debian 12 ships firmware packages in
# that separate component.
write_debian_sources() {
  backup_once /etc/apt/sources.list
  cat > /etc/apt/sources.list << EOF
deb https://mirrors.ustc.edu.cn/debian/ bookworm main contrib non-free non-free-firmware
deb https://mirrors.ustc.edu.cn/debian/ bookworm-updates main contrib non-free non-free-firmware
deb https://mirrors.ustc.edu.cn/debian-security bookworm-security main contrib non-free non-free-firmware
EOF
}

# Write the Raspberry Pi specific repository list.
write_raspi_sources() {
  backup_once /etc/apt/sources.list.d/raspi.list
  cat > /etc/apt/sources.list.d/raspi.list << EOF
deb http://mirrors.ustc.edu.cn/archive.raspberrypi.org/debian/ bookworm main
EOF
}

# x86_64 = Debian VM, aarch64 = Raspberry Pi 4B; other architectures are left untouched.
case "$(uname -m)" in
  x86_64)
    write_debian_sources
    ;;
  aarch64)
    write_debian_sources
    write_raspi_sources
    ;;
esac

# ========== 2. Disable the firewall ==========
echo "===== 关闭UFW防火墙 ====="
# Best effort: a failure of `ufw disable` is deliberately ignored.
ufw disable || :

# ========== 3. Raspberry Pi cgroup flags ==========
case "$(uname -m)" in
  aarch64)
    echo "===== 配置树莓派CGroup ====="
    cmdline_file=/boot/firmware/cmdline.txt
    # Append the flags to the last line of cmdline.txt only when they are not there yet.
    if grep -q "cgroup_memory=1 cgroup_enable=memory" "$cmdline_file"; then
      :
    else
      sed -i '$ s/$/ cgroup_memory=1 cgroup_enable=memory/' "$cmdline_file"
      echo "CGroup配置已添加,重启后生效!"
    fi
    ;;
esac

# ========== 4. System update and base packages ==========
echo "===== 更新系统并安装基础依赖 ====="
# Run each step as its own command: in `a && b`, a failure of `a` does not
# trigger `set -e`, so a broken `update` used to be silently ignored.
# apt-get is used because `apt` does not promise a stable scripting interface.
apt-get update
apt-get upgrade -y
apt-get install -y curl wget vim git nfs-common open-iscsi chrony apt-transport-https ca-certificates software-properties-common

# ========== 5. Time synchronisation ==========
echo "===== 配置时间同步 ====="
timedatectl set-timezone Asia/Shanghai
systemctl enable --now chrony

# ========== 6. Disable swap ==========
echo "===== 关闭swap ====="
swapoff -a
# Comment out only active (not yet commented) fstab lines that have a `swap`
# field, so re-running the script does not keep stacking '#' characters.
sed -i -E '/^[[:space:]]*#/! { /[[:space:]]swap[[:space:]]/ s/^/#/; }' /etc/fstab
# Raspberry Pi OS manages its swap file through dphys-swapfile rather than
# fstab; disable it (best effort) so swap stays off after the reboot.
if command -v dphys-swapfile > /dev/null 2>&1; then
  dphys-swapfile swapoff || true
  systemctl disable --now dphys-swapfile || true
fi

# ========== 7. Kernel modules ==========
# The modules are loaded BEFORE the sysctl step: the net.bridge.* keys only
# exist once br_netfilter is loaded, so applying sysctl first cannot set them.
echo "===== 加载内核模块 ====="
# Persist the modules so they are loaded again on every boot.
cat > /etc/modules-load.d/k8s.conf << EOF
br_netfilter
overlay
EOF
modprobe br_netfilter
modprobe overlay

# ========== 8. Kernel parameters ==========
echo "===== 配置内核参数 ====="
cat > /etc/sysctl.d/k8s.conf << EOF
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_forward = 1
EOF
sysctl --system

# ========== 9. Open-iSCSI ==========
echo "===== 配置open-iscsi ====="
systemctl enable --now iscsid

# ========== 10. Hostname ==========
echo "===== 配置主机名和Hosts ====="
# First address reported by `hostname -I`.
IP_ADDR=$(hostname -I | awk '{print $1}')
# `case` patterns match the whole string and treat '.' literally, unlike the
# previous unanchored regex. .40 is the master; .50-.54 map to worker-01..05.
case "$IP_ADDR" in
  192.168.2.40)
    hostnamectl set-hostname k3s-master
    ;;
  192.168.2.5[0-4])
    NUM=${IP_ADDR##*.}
    hostnamectl set-hostname "k3s-worker-0$((NUM - 49))"
    ;;
esac

# ========== 11. /etc/hosts ==========
# Append each entry only if that exact line is not present yet, so running
# the script again does not duplicate the block.
while IFS= read -r hosts_entry; do
  grep -qxF -- "$hosts_entry" /etc/hosts || printf '%s\n' "$hosts_entry" >> /etc/hosts
done << EOF
192.168.2.40 k3s-master
192.168.2.50 k3s-worker-01
192.168.2.51 k3s-worker-02
192.168.2.52 k3s-worker-03
192.168.2.53 k3s-worker-04
192.168.2.54 k3s-worker-05
192.168.2.20 nfs-server
EOF

echo "===== 基础配置完成 ====="
echo "注意:树莓派需重启以生效CGroup配置!"

2. K3s Master部署脚本(master_deploy.sh,192.168.2.40执行)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/bin/bash
# master_deploy.sh - installs the K3s server on the master node (run as root).
set -e

MASTER_IP="192.168.2.40"
NFS_SERVER="192.168.2.20"
NFS_PATH="/volume3/nfs-share"
# Network interface handed to --flannel-iface.
# (Was written `IFACE-"enp16s16"`, which is executed as a command and aborts the script.)
IFACE="enp16s16"

# ========== 0. Registry mirror configuration (v1.34+) ==========

echo "===== 配置镜像加速 ====="
mkdir -p /etc/rancher/k3s
# The nesting below is significant: `docker.io` must sit under `mirrors`,
# and the endpoint list under `endpoint`.
cat > /etc/rancher/k3s/registries.yaml << 'EOF'
mirrors:
  docker.io:
    endpoint:
      - "https://docker.m.daocloud.io"
      - "https://docker.1ms.run"
EOF
echo "registries.yaml 创建完成"

# ========== 1. Install the K3s server (China mirror) ==========
echo "===== 安装K3s Master(国内镜像) ====="
# Arguments are kept in an array so every value stays one word; `:?` aborts
# with a clear message if a required setting is empty or unset.
k3s_server_args=(
  server
  --node-ip "${MASTER_IP:?MASTER_IP must be set}"
  --bind-address "$MASTER_IP"
  --advertise-address "$MASTER_IP"
  --disable traefik
  --disable servicelb
  --disable local-storage
  --flannel-iface "${IFACE:?IFACE must be set}"
  --cluster-cidr 10.42.0.0/16
  --service-cidr 10.43.0.0/16
  --kubelet-arg "cgroup-driver=systemd"
)
# Download first, then execute: with `curl | sh` a failed download only
# feeds an empty script to sh, which exits 0 and hides the failure.
k3s_installer=$(mktemp)
curl -sfL https://rancher-mirror.rancher.cn/k3s/k3s-install.sh -o "$k3s_installer"
INSTALL_K3S_MIRROR=cn sh "$k3s_installer" "${k3s_server_args[@]}"
rm -f -- "$k3s_installer"

# ========== 2. Wait for K3s to come up ==========
echo "===== 等待K3s Master启动 ====="
# Bounded wait (60 x 5 s) instead of looping forever when the server never starts.
wait_attempts=60
until kubectl get nodes; do
  wait_attempts=$((wait_attempts - 1))
  if [[ "$wait_attempts" -le 0 ]]; then
    echo "K3s Master 启动超时,请检查:journalctl -u k3s" >&2
    exit 1
  fi
  sleep 5
done

# ========== 3. kubectl configuration ==========
echo "===== 配置kubectl ====="
mkdir -p ~/.kube
cp /etc/rancher/k3s/k3s.yaml ~/.kube/config
chmod 600 ~/.kube/config

# ========== 4. Worker join token ==========
echo "===== 生成Worker加入信息 ====="
TOKEN=$(cat /var/lib/rancher/k3s/server/node-token)
echo "Worker节点加入命令:"
echo "curl -sfL https://rancher-mirror.rancher.cn/k3s/k3s-install.sh | INSTALL_K3S_MIRROR=cn K3S_URL=https://$MASTER_IP:6443 K3S_TOKEN=$TOKEN sh -"

# ========== 5. NFS mount on the master ==========
echo "===== 挂载NFS共享 ====="
NFS_MOUNT_POINT="/mnt/nfs-share"
NFS_FSTAB_ENTRY="$NFS_SERVER:$NFS_PATH $NFS_MOUNT_POINT nfs defaults 0 0"
mkdir -p "$NFS_MOUNT_POINT"
# Mount only when nothing is mounted there yet, so the script can be re-run.
if mountpoint -q "$NFS_MOUNT_POINT"; then
  echo "NFS共享已挂载,跳过挂载"
else
  mount -t nfs "$NFS_SERVER:$NFS_PATH" "$NFS_MOUNT_POINT"
fi
# Mount at boot; the line is appended only if it is not already in fstab.
if ! grep -qxF -- "$NFS_FSTAB_ENTRY" /etc/fstab; then
  echo "$NFS_FSTAB_ENTRY" >> /etc/fstab
fi

# ========== 6. Longhorn backup directory ==========
echo "===== 创建Longhorn备份目录 ====="
mkdir -p "$NFS_MOUNT_POINT/longhorn-backup"
chmod 777 "$NFS_MOUNT_POINT/longhorn-backup"

echo "===== K3s Master部署完成 ====="
echo "K3s配置文件路径:~/.kube/config"
echo "Token: $TOKEN"
echo "NFS挂载路径:/mnt/nfs-share"
echo "Longhorn备份路径:/mnt/nfs-share/longhorn-backup"

3. K3s Worker部署脚本(worker_deploy.sh,192.168.2.50~54执行)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/bin/bash
set -e

# ==================== Core settings (adjust to your environment) ====================
MASTER_IP="192.168.2.40" # K3s server address
TOKEN="替换实际token值" # token generated by the server
FLANNEL_IFACE="eth0" # cluster network interface
LONGHORN_MOUNT="/var/lib/longhorn" # mount point for Longhorn local storage
# =====================================================================================

# ========== Derive the node name from the hostname ==========
# Strip every character that is not a letter, digit, backslash or hyphen
# (inside the bracket expression the backslash is a literal member).
NODE_NAME=$(sed 's/[^a-zA-Z0-9\-]//g' <<< "$(hostname)")
[[ -n "$NODE_NAME" ]] || {
  echo "❌ 无法获取主机名,节点名不能为空!"
  exit 1
}
echo "📌 自动识别节点名:$NODE_NAME"

# ========== 通用工具函数(Agent 本地专用) ==========
# Report whether a command name can be resolved by the shell.
# Arguments: $1 - command name
# Returns:   0 if `command -v` finds it, 1 otherwise
command_exists() {
  local cmd_name="$1"
  if command -v "$cmd_name" > /dev/null 2>&1; then
    return 0
  fi
  return 1
}

# Report whether a systemd unit is currently active.
# Arguments: $1 - unit name
# Returns:   the exit status of `systemctl is-active --quiet`
service_running() {
  local unit="$1"
  local rc=0
  systemctl is-active --quiet "$unit" || rc=$?
  return "$rc"
}

# Report whether a directory is a mount point.
# Arguments: $1 - directory path
# Returns:   0 if the path is a mount point, non-zero otherwise
# `mountpoint` tests the exact path. The previous `mount | grep "$1"` was a
# substring/regex match over the whole mount table, so any unrelated mount
# whose line merely contained the text counted as "mounted".
is_mounted() {
  mountpoint -q "$1"
}

# ========== 1. Registry mirror configuration (idempotent) ==========
echo -e "\n===== [Agent 本地] 配置镜像加速 ====="
mkdir -p /etc/rancher/k3s
REGISTRY_FILE="/etc/rancher/k3s/registries.yaml"

# Desired file content. The nesting is significant: `docker.io` must sit
# under `mirrors`, and the endpoint list under `endpoint`.
desired_registry_config=$(
  cat << 'EOF'
mirrors:
  docker.io:
    endpoint:
      - "https://docker.m.daocloud.io"
      - "https://docker.1ms.run"
EOF
)

# Rewrite the file when it is missing or its content differs from the desired
# one (a hostname grep alone would keep an outdated or malformed file).
if [[ ! -f "$REGISTRY_FILE" ]] || [[ "$(cat "$REGISTRY_FILE")" != "$desired_registry_config" ]]; then
  printf '%s\n' "$desired_registry_config" > "$REGISTRY_FILE"
  echo "✅ 镜像加速配置已创建/更新"
else
  echo "✅ 镜像加速配置已存在且有效,跳过"
fi

# ========== 2. Install / verify the K3s agent (idempotent, explicit node name) ==========
echo -e "\n===== [Agent 本地] 安装/校验 K3s Agent ====="
# First IPv4 address configured on the cluster interface.
WORKER_IP=$(ip addr show "$FLANNEL_IFACE" | grep -oP '(?<=inet\s)\d+(\.\d+){3}' | head -1)
if [[ -z "$WORKER_IP" ]]; then
  echo "❌ 未找到 $FLANNEL_IFACE 网卡的IP地址,请检查网卡配置!"
  exit 1
fi

# "Installed" means the k3s binary exists AND a k3s-agent unit is defined.
# The previous check looked for a command named `k3s-agent`; this script only
# ever uses `k3s-agent` as a service name, so that check never detected an
# existing installation.
if command_exists k3s && systemctl cat k3s-agent.service > /dev/null 2>&1; then
  echo "✅ K3s Agent 已安装,跳过安装步骤"
else
  echo "🔧 开始安装 K3s Agent(国内镜像源)..."
  # The pipeline runs in a subshell with pipefail so that a failing curl or
  # installer is not masked by `tee`; curl -f turns HTTP errors into failures.
  # umask 077 keeps a newly created log private because the `bash -x` trace
  # may contain the join token.
  if ! (
    set -o pipefail
    umask 077
    curl -fvL https://rancher-mirror.rancher.cn/k3s/k3s-install.sh \
      | INSTALL_K3S_MIRROR=cn \
        K3S_URL="https://$MASTER_IP:6443" \
        K3S_TOKEN="$TOKEN" \
        bash -x -s -- agent \
        --node-ip "$WORKER_IP" \
        --node-name "$NODE_NAME" \
        --flannel-iface "$FLANNEL_IFACE" \
        --kubelet-arg "cgroup-driver=systemd" 2>&1 \
      | tee /tmp/k3s-agent-install.log
  ); then
    echo "❌ K3s Agent 安装失败,请检查日志:/tmp/k3s-agent-install.log"
    exit 1
  fi
  echo "✅ K3s Agent 安装完成(节点名:$NODE_NAME)"
fi

# Make sure the agent service is running: report if it is, start it if not.
if service_running k3s-agent; then
  echo "✅ K3s Agent 服务已正常运行"
else
  echo "🔧 启动 K3s Agent 服务..."
  systemctl enable --now k3s-agent
  # Verify the start result
  if service_running k3s-agent; then
    echo "✅ K3s Agent 服务启动成功"
  else
    echo "❌ K3s Agent 服务启动失败,请检查日志:journalctl -u k3s-agent"
    exit 1
  fi
fi

# ========== 3. Longhorn local storage (idempotent) ==========
echo -e "\n===== [Agent 本地] 配置/校验 Longhorn 存储 ====="
if is_mounted "$LONGHORN_MOUNT"; then
  # Checked first so a re-run on a configured node does not report "no SSD found".
  echo "✅ $LONGHORN_MOUNT 已挂载,跳过挂载步骤"
else
  # Pick the first sdX disk on which nothing is mounted, neither the disk
  # itself nor any of its partitions. Looking only at the disk row would also
  # select a disk whose partitions are mounted (e.g. a USB boot disk) and
  # hand it to mkfs below.
  SSD_DEVICE=""
  while read -r candidate_disk; do
    if [[ -z "$(lsblk -n -o MOUNTPOINT "/dev/${candidate_disk}" | tr -d '[:space:]')" ]]; then
      SSD_DEVICE="/dev/${candidate_disk}"
      break
    fi
  done < <(lsblk -d -n -o NAME,TYPE | awk '$2 == "disk" && $1 ~ /^sd[a-z]$/ { print $1 }')

  if [[ -z "$SSD_DEVICE" ]]; then
    echo "⚠️ 未找到USB SSD设备(sd开头),跳过Longhorn存储配置"
  else
    # Format only when the device is not already ext4 (keeps existing data).
    ssd_fs_type=$(blkid -s TYPE -o value "$SSD_DEVICE" 2> /dev/null || true)
    if [[ "$ssd_fs_type" != "ext4" ]]; then
      echo "🔧 格式化 SSD 设备 $SSD_DEVICE 为ext4(注意:会清空设备数据!)"
      mkfs.ext4 -F "$SSD_DEVICE"
    else
      echo "✅ $SSD_DEVICE 已为ext4格式,跳过格式化"
    fi

    # Create the mount point and mount
    mkdir -p "$LONGHORN_MOUNT"
    mount "$SSD_DEVICE" "$LONGHORN_MOUNT"
    echo "✅ SSD 已挂载到 $LONGHORN_MOUNT"

    # Prefer UUID= over /dev/sdX in fstab: sdX names can change between boots.
    # `nofail` lets the node boot even when the disk is absent.
    ssd_uuid=$(blkid -s UUID -o value "$SSD_DEVICE" 2> /dev/null || true)
    if [[ -n "$ssd_uuid" ]]; then
      fstab_source="UUID=${ssd_uuid}"
    else
      fstab_source="$SSD_DEVICE"
    fi
    # Idempotency check on the mount-point field. The previous grep used
    # `\s+` in a basic regex, where `+` is literal, so it never matched.
    if awk -v mp="$LONGHORN_MOUNT" '$1 !~ /^#/ && $2 == mp { found = 1 } END { exit !found }' /etc/fstab; then
      echo "✅ /etc/fstab 中已存在挂载配置,跳过写入"
    else
      echo "${fstab_source} ${LONGHORN_MOUNT} ext4 defaults,nofail 0 0" >> /etc/fstab
      echo "✅ 已添加到/etc/fstab,开机自动挂载"
    fi

    # Verify the mount state
    if is_mounted "$LONGHORN_MOUNT"; then
      echo "✅ Longhorn 存储配置完成,挂载路径:$LONGHORN_MOUNT"
    else
      echo "❌ Longhorn 存储挂载失败,请手动检查!"
    fi
  fi
fi

# ========== 4. Local status summary ==========
# `|| true` keeps `set -e` from aborting when the unit is not active.
agent_state=$(systemctl is-active k3s-agent) || true
if is_mounted "$LONGHORN_MOUNT"; then
  mount_state=$(echo "已挂载")
else
  mount_state=$(echo "未挂载")
fi

printf '\n%s\n' "===== [Agent 本地] 部署状态汇总 ====="
printf '%s\n' "📌 节点名:$NODE_NAME"
printf '%s\n' "📌 节点IP:$WORKER_IP"
printf '%s\n' "📌 K3s Agent 状态:${agent_state}"
printf '%s\n' "📌 Longhorn 存储挂载:${mount_state}"
printf '\n%s\n' "✅ K3s Agent 幂等部署完成!"
printf '%s\n' "ℹ️ 节点标签请在 Master 节点执行以下命令添加:"
printf '%s\n' "kubectl label node $NODE_NAME node-type=raspberrypi --overwrite"
printf '%s\n' "kubectl label node $NODE_NAME storage=longhorn --overwrite"

配置国内源(v1.34+):

所有节点修改配置

1
cp /var/lib/rancher/k3s/agent/etc/containerd/config.toml /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl

vim /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl

添加以下配置:

1
2
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
endpoint = ["https://docker.m.daocloud.io", "https://docker.1ms.run"]

systemctl restart k3s-agent

pod重建:

kubectl delete pods -n longhorn-system --all

树莓派UAS驱动兼容性问题

芯片型号 兼容性 备注
JMS578/JMS580 ⭐⭐⭐⭐⭐ 推荐,稳定性最好
ASM1153E ⭐⭐⭐⭐ 良好,注意固件版本
VL716 ⭐⭐⭐ 一般,可能有休眠问题
NS1066/NS1068 ⭐⭐ 不推荐,兼容性问题多

常见芯片VID:PID参考:

芯片型号 VID:PID
JMS578 152d:0578
JMS580 152d:0580
ASM1153E 174c:55aa
VL716 2109:0716
NS1066 2537:1066
1
2
3
# Look up the USB adapter's VID:PID in the kernel log.
# NOTE(review): "usb 2-1" is one specific bus/port path — adjust it to where the adapter is plugged in.
journalctl -k | grep -iE "usb.*device|idvendor|idproduct" | tail -30
dmesg | grep "usb 2-1" | head -20
1
2
3
4
5
6
7
# Disable UAS for the adapter
vim /boot/firmware/cmdline.txt
# Add the following at the very beginning (it must be the first parameter, separated from the rest by a space).
# 152d:0578 is the JMS578 VID:PID from the table above — replace it with your adapter's value.
modprobe.blacklist=uas usb-storage.quirks=152d:0578:u
# Lift the USB current limit
vim /boot/firmware/config.txt
max_usb_current=1
1
2
3
4
5
6
7
8
9
# Also turn off USB power saving: the udev rule sets power/control to "on" whenever a USB device is added
echo 'ACTION=="add", SUBSYSTEM=="usb", ATTR{power/control}="on"' | sudo tee /etc/udev/rules.d/50-usb-power-save.rules
update-initramfs -u
udevadm control --reload-rules
reboot
# After the reboot, verify the driver in use
lsusb -t | grep Driver # should show usb-storage
# Stress test: direct-I/O read of 10000 blocks of 1 MiB from /dev/sda
dd if=/dev/sda of=/dev/null bs=1M count=10000 iflag=direct status=progress

批量打标(k3s master执行)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/bash
# Run on the master: label every agent node automatically (no node names to type).
set -e

echo "📌 开始为所有树莓派Agent节点打标签..."

# Option 1: take every node whose name does not contain "master".
AGENT_NODE_NAMES=$(kubectl get nodes -o name | grep -v "master" | sed 's/node\///g')

# Option 2: if the Raspberry Pi hostnames share a prefix (e.g. rpi-), filter on it instead.
# AGENT_NODE_NAMES=$(kubectl get nodes -o name | grep "rpi-" | sed 's/node\///g')

if [[ -z "$AGENT_NODE_NAMES" ]]; then
  echo "⚠️ 未找到Agent节点,请先部署Agent并确保加入集群!"
  exit 1
fi

# Label each node; --overwrite makes repeated runs safe.
# Word splitting of the list is intentional (one node name per word).
for node in $AGENT_NODE_NAMES; do
  kubectl label node "$node" node-type=raspberrypi --overwrite
  kubectl label node "$node" storage=longhorn --overwrite

  # Total memory in GiB as reported by the node itself. LC_ALL=C keeps the
  # "Mem:" row label stable regardless of the remote locale; ssh -n stops
  # ssh from reading this script's stdin. A failing ssh skips only this node
  # instead of aborting the whole run under `set -e`.
  if ! MEM_TOTAL=$(ssh -n "root@${node}" "LC_ALL=C free -g | awk '/^Mem:/ {print \$2}'"); then
    echo "⚠️ 节点 $node 无法通过SSH获取内存信息,跳过内存标签"
    continue
  fi
  if ! [[ "$MEM_TOTAL" =~ ^[0-9]+$ ]]; then
    echo "⚠️ 节点 $node 内存识别异常(获取到 $MEM_TOTAL GB),请手动检查!"
    continue
  fi

  # 7 or more GiB is classed as an 8G node, 3 to 6 as a 4G node
  # (the thresholds allow for the rounded-down figure `free -g` prints).
  if [[ "$MEM_TOTAL" -ge 7 ]]; then
    MEM_LABEL="8g"
  elif [[ "$MEM_TOTAL" -ge 3 ]]; then
    MEM_LABEL="4g"
  else
    # Report the node being processed (was the undefined $NODE_NAME).
    echo "⚠️ 节点 $node 内存识别异常(获取到 $MEM_TOTAL GB),请手动检查!"
    continue
  fi
  kubectl label node "$node" "memory=${MEM_LABEL}" --overwrite
  echo "✅ 节点 $node 标签已更新"
done

# Show the resulting labels
echo -e "\n📌 节点标签验证:"
kubectl get nodes -o custom-columns=NAME:.metadata.name,NODE_TYPE:.metadata.labels.node-type,MEMORY:.metadata.labels.memory,STORAGE:.metadata.labels.storage

4.Longhorn部署脚本(deploy_longhorn.sh,Master执行)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/bin/bash
# Strict mode: abort on errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail
# set +e # enable temporarily to ignore single-step errors (not recommended)

# ========== Settings ==========
# NFS server IP or hostname; the first script argument overrides the default.
NFS_SERVER="${1:-192.168.2.20}"
NFS_SHARE_PATH="/volume3/nfs-share/longhorn-backup"
HELM_VERSION="v3.15.2"
# Maximum number of attempts made by retry()
RETRY_MAX=3
# Seconds to wait between attempts
RETRY_INTERVAL=5
# Overall timeout in seconds
TIMEOUT=300

# ========== Helper functions (idempotent + retry) ==========
#######################################
# Run a command until it succeeds, at most RETRY_MAX times.
# Globals:   RETRY_MAX (read), RETRY_INTERVAL (read)
# Arguments: the command and its arguments
# Outputs:   progress lines on stdout, the final failure line on stderr
# Returns:   0 as soon as the command succeeds, 1 after the last failed attempt
#
# The function RETURNS 1 instead of calling `exit 1`. Under `set -e` a plain
# `retry cmd` still stops the script, but `retry cmd || fallback` can now
# actually reach its fallback. No sleep is done after the final attempt.
#######################################
retry() {
  local -r max_attempts=$RETRY_MAX
  local attempt
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] 执行命令:$* (尝试 ${attempt}/${max_attempts})"
    if "$@"; then
      return 0
    fi
    if ((attempt < max_attempts)); then
      echo "命令执行失败,${RETRY_INTERVAL}秒后重试..."
      sleep "$RETRY_INTERVAL"
    fi
  done
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] 命令重试${max_attempts}次失败:$*" >&2
  return 1
}

# Report whether a Kubernetes namespace exists.
# Arguments: $1 - namespace name
# Returns:   exit status of `kubectl get namespace` (all output discarded)
namespace_exists() {
  local ns_name="$1"
  kubectl get namespace "$ns_name" > /dev/null 2>&1
}

# Report whether a Helm release exists in a namespace.
# Arguments: $1 - release name, $2 - namespace
# Returns:   0 if `helm status` finds the release, non-zero otherwise
# `helm status` looks the release up by exact name and needs no pipe. The
# previous `helm list | grep -q` could fail under `set -o pipefail` when grep
# exits early, and only saw the releases that `helm list` prints by default.
helm_release_exists() {
  helm status "$1" -n "$2" &> /dev/null
}

#######################################
# Report whether all pods matching a label are Ready.
# Arguments: $1 - namespace, $2 - label selector
# Returns:   0 only if at least one pod matches AND every matching pod has
#            Ready=True; 1 otherwise (including when kubectl itself fails)
#
# Each pod is printed as "<name>=<Ready status>". A pod with no Ready
# condition yet prints "<name>=" and therefore counts as not ready; the old
# check only searched for the word "False" and let such pods through.
#######################################
pods_ready() {
  local ns="$1"
  local label="$2"
  local listing line
  listing=$(kubectl get pods -n "$ns" -l "$label" \
    -o jsonpath='{range .items[*]}{.metadata.name}{"="}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' \
    2> /dev/null) || return 1
  # No output means no pod has been created yet.
  [[ -n "$listing" ]] || return 1
  while IFS= read -r line; do
    [[ "$line" == *=True ]] || return 1
  done <<< "$listing"
  return 0
}

# ========== 1. Pre-flight check ==========
echo "===== 1. 前置校验 ====="
# One ping with a 3-second wait. An unreachable server only produces a
# warning; the deployment carries on either way.
if ping -c 1 -W 3 "${NFS_SERVER}" > /dev/null 2>&1; then
  echo "✅ NFS服务器 ${NFS_SERVER} 可达"
else
  echo "⚠️ 警告:NFS服务器 ${NFS_SERVER} 暂时无法访问,继续部署但备份功能可能失效!"
fi

# ========== 2. Install Helm (idempotent) ==========
echo -e "\n===== 2. 安装Helm ${HELM_VERSION} ====="
if command -v helm &> /dev/null; then
  CURRENT_HELM_VERSION=$(helm version --short | awk -F'v' '{print $2}' | awk -F'+' '{print $1}')
  echo "✅ Helm已安装,当前版本:${CURRENT_HELM_VERSION} (目标版本:${HELM_VERSION#v})"
else
  echo "📥 开始安装Helm..."
  # Select the tarball for this machine's architecture instead of assuming amd64.
  case "$(uname -m)" in
    x86_64) helm_arch="amd64" ;;
    aarch64 | arm64) helm_arch="arm64" ;;
    armv7l) helm_arch="arm" ;;
    *)
      echo "❌ 不支持的CPU架构:$(uname -m)"
      exit 1
      ;;
  esac
  helm_tarball="helm-${HELM_VERSION}-linux-${helm_arch}.tar.gz"

  # Work in a private temporary directory rather than the current one.
  helm_workdir=$(mktemp -d)
  helm_downloaded=false
  # Try the official source first, then the Tencent Cloud mirror. wget does
  # its own retries here, so the fallback works no matter how retry() reports
  # failure (the old `retry wget ... || { ... }` never reached its fallback).
  for helm_url in \
    "https://get.helm.sh/${helm_tarball}" \
    "https://mirrors.cloud.tencent.com/helm/${HELM_VERSION}/${helm_tarball}"; do
    if wget --tries="$RETRY_MAX" --waitretry="$RETRY_INTERVAL" -O "${helm_workdir}/helm.tar.gz" "$helm_url"; then
      helm_downloaded=true
      break
    fi
    echo "🔄 下载失败:${helm_url},尝试下一个源..."
  done
  if [[ "$helm_downloaded" != true ]]; then
    rm -rf -- "$helm_workdir"
    echo "❌ Helm安装失败!"
    exit 1
  fi

  # Unpack and install (overwrites an existing binary)
  tar -zxf "${helm_workdir}/helm.tar.gz" -C "$helm_workdir"
  mv -f "${helm_workdir}/linux-${helm_arch}/helm" /usr/local/bin/
  rm -rf -- "$helm_workdir"

  # Verify the installation
  if ! command -v helm &> /dev/null; then
    echo "❌ Helm安装失败!"
    exit 1
  fi
  echo "✅ Helm ${HELM_VERSION} 安装成功"
fi

# Helm repositories (--force-update replaces an existing entry)
retry helm repo add longhorn https://charts.longhorn.io --force-update
retry helm repo add stable https://mirror.azure.cn/kubernetes/charts/ --force-update
retry helm repo update

# ========== 3. Create the namespace (idempotent) ==========
echo -e "\n===== 3. 创建Longhorn命名空间 ====="
# Create longhorn-system only when it is not there yet.
if ! namespace_exists "longhorn-system"; then
  retry kubectl create namespace longhorn-system
  echo "✅ 命名空间 longhorn-system 创建成功"
else
  echo "✅ 命名空间 longhorn-system 已存在,跳过创建"
fi

# ========== 4. Deploy Longhorn (idempotent) ==========
echo -e "\n===== 4. 部署Longhorn ====="
# Chart values shared by install and upgrade, kept in ONE place so the two
# paths cannot drift apart.
# - defaultSettings.defaultReplicaCount replaces the former
#   defaultSettings.replicaCount, which is not the chart's key name.
# NOTE(review): defaultSettings.minimalAvailableReplicaCount is kept as it
#   was; confirm it is a key of the chart version in use.
# NOTE(review): newer chart versions may expect the backup target under
#   defaultBackupStore.* instead of defaultSettings.* — the chart version is
#   not pinned here, so verify against the version being installed.
LONGHORN_HELM_ARGS=(
  --namespace longhorn-system
  --set defaultSettings.defaultDataPath=/var/lib/longhorn
  --set defaultSettings.defaultReplicaCount=2
  --set defaultSettings.minimalAvailableReplicaCount=1
  --set csi.kubeletRootDir=/var/lib/kubelet
  --set persistence.defaultClass=true
  --set persistence.defaultClassReplicaCount=2
  --set "defaultSettings.backupTarget=nfs://${NFS_SERVER}${NFS_SHARE_PATH}"
  --set defaultSettings.backupTargetCredentialSecret=""
)

if helm_release_exists "longhorn" "longhorn-system"; then
  echo "⚠️ Longhorn已部署,执行升级(而非重新安装)以保证幂等..."
  retry helm upgrade longhorn longhorn/longhorn "${LONGHORN_HELM_ARGS[@]}"
else
  echo "📦 首次部署Longhorn..."
  retry helm install longhorn longhorn/longhorn "${LONGHORN_HELM_ARGS[@]}"
fi

# ========== 5. Wait for the pods (bounded by TIMEOUT) ==========
echo -e "\n===== 5. 等待Longhorn组件启动(超时${TIMEOUT}秒) ====="
wait_started=$(date +%s)
while :; do
  # Give up once TIMEOUT seconds have passed; the script continues afterwards.
  waited=$(($(date +%s) - wait_started))
  if ((waited >= TIMEOUT)); then
    echo "❌ 等待Pod就绪超时(${TIMEOUT}秒)!请手动检查:kubectl get pods -n longhorn-system"
    break
  fi

  # Not ready yet (manager or UI): show the current pod states and poll again in 10 s.
  if ! pods_ready "longhorn-system" "app=longhorn-manager" || ! pods_ready "longhorn-system" "app=longhorn-ui"; then
    echo "⌛ 等待Pod就绪中(已耗时${waited}秒)... 当前状态:"
    kubectl get pods -n longhorn-system -l "app in (longhorn-manager, longhorn-ui)" --no-headers | awk '{print " " $1 " -> " $3}'
    sleep 10
    continue
  fi

  echo "✅ 所有Longhorn Pod已就绪"
  break
done

# ========== 6. Expose the Longhorn UI (idempotent) ==========
echo -e "\n===== 6. 暴露Longhorn UI ====="
# Current Service type; empty when the lookup fails.
SVC_TYPE=$(kubectl get svc longhorn-frontend -n longhorn-system -o jsonpath='{.spec.type}' 2>/dev/null || echo "")
case "$SVC_TYPE" in
  NodePort)
    echo "✅ Longhorn UI已为NodePort类型,跳过配置"
    ;;
  *)
    retry kubectl patch svc longhorn-frontend -n longhorn-system -p '{"spec":{"type":"NodePort","ports":[{"port":80,"targetPort":8000,"nodePort":30000}]}}'
    echo "✅ Longhorn UI已暴露为NodePort: 30000"
    ;;
esac

# ========== 7. Final summary ==========
echo -e "\n===== 部署完成 ===== 🎉"
cat << EOF
📌 关键信息:
 - Longhorn命名空间:longhorn-system
 - UI访问地址:http://任意节点IP:30000
 - 备份目标:nfs://${NFS_SERVER}${NFS_SHARE_PATH}
 - 检查Pod状态:kubectl get pods -n longhorn-system
 - 重新部署:直接再次运行本脚本即可(已做幂等处理)
EOF

4.5 Longhorn 清理磁盘元数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
NODE_NAME=$(hostname)

# Keep a copy of the Longhorn node object before deleting it.
kubectl -n longhorn-system get nodes.longhorn.io "$NODE_NAME" -o yaml > /tmp/node.yaml

# 1. Delete the node record (clears the old disk metadata)
kubectl -n longhorn-system delete nodes.longhorn.io "$NODE_NAME" --force --grace-period=0

# 2. Delete this node's manager pod so the node registers again.
# The pod is selected by label and by exact node name; grepping the wide
# listing matched substrings (worker-1 also matched worker-10) and could
# yield an empty name, which made the delete fail.
MANAGER_POD=$(kubectl -n longhorn-system get pods -l app=longhorn-manager \
  --field-selector "spec.nodeName=${NODE_NAME}" -o name | head -n 1)
if [[ -n "$MANAGER_POD" ]]; then
  kubectl -n longhorn-system delete "$MANAGER_POD"
else
  echo "未找到节点 ${NODE_NAME} 上的 longhorn-manager Pod,跳过删除"
fi

# 3. Wait 90 seconds
echo "等待 Longhorn 重新发现节点..."
sleep 90

# 4. Enable scheduling
kubectl -n longhorn-system patch nodes.longhorn.io "$NODE_NAME" --type=merge -p '{"spec":{"allowScheduling":true}}'

# 5. Show the result
kubectl -n longhorn-system get nodes.longhorn.io


# Export the node's current Longhorn object ($NODE_NAME must already be set, as in the commands above)
kubectl -n longhorn-system get nodes.longhorn.io $NODE_NAME -o yaml > /tmp/node.yaml

# Write a modified copy in which every "type: Ready" becomes "type: NotReady".
# NOTE(review): the original note said to change the status of the Ready condition to "False";
# this sed renames the condition type instead — confirm which of the two is intended.
cat /tmp/node.yaml | sed 's/type: Ready/type: NotReady/' > /tmp/node-fake.yaml

# Delete, then re-create from the modified manifest (intended to bypass the webhook);
# errors from both commands are suppressed and ignored.
kubectl -n longhorn-system delete nodes.longhorn.io $NODE_NAME --force 2>/dev/null || true
kubectl apply -f /tmp/node-fake.yaml 2>/dev/null || true

# Then delete it again after a short pause
sleep 2
kubectl -n longhorn-system delete nodes.longhorn.io $NODE_NAME

5. NFS存储集成脚本(deploy_nfs.sh,Master执行)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/bin/bash
# Strict mode: stop on errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# ===================== Settings (adjust to your environment) =====================
# NFS export that backs the volume
NFS_SERVER="192.168.2.20"
NFS_PATH="/volume3/nfs-share"
# Names of the Kubernetes objects this script creates
STORAGE_CLASS_NAME="nfs-storage"
PV_NAME="nfs-pv-01"
PVC_NAME="nfs-pvc"
NAMESPACE="default"
# Capacity declared on the PV and requested by the PVC
STORAGE_SIZE="450Gi"
# =================================================================================

#######################################
# Pre-flight checks: the NFS server answers ping and exports NFS_PATH.
# Globals:   NFS_SERVER (read), NFS_PATH (read)
# Outputs:   progress and error messages on stdout
# Returns:   0 on success; exits the script with status 1 on any failed check
#######################################
function pre_check() {
  echo "===== 执行前置检查 ====="

  # NFS server reachability
  if ! ping -c 2 -W 3 "${NFS_SERVER}" > /dev/null 2>&1; then
    echo "错误:NFS 服务器 ${NFS_SERVER} 无法 ping 通,请检查网络!"
    exit 1
  fi

  # Fetch the export list first, then look for an EXACT match on the export
  # path. Piping into `grep -q` matched substrings (/volume3/nfs-share also
  # matched /volume3/nfs-share-old) and could fail under `pipefail` when
  # grep exits before showmount has finished writing.
  local exports
  if ! exports=$(showmount -e "${NFS_SERVER}"); then
    echo "错误:无法获取 NFS 服务器 ${NFS_SERVER} 的共享列表,请检查!"
    exit 1
  fi
  if ! awk -v path="${NFS_PATH}" '$1 == path { found = 1 } END { exit !found }' <<< "${exports}"; then
    echo "错误:NFS 服务器 ${NFS_SERVER} 未共享 ${NFS_PATH} 目录,请检查!"
    exit 1
  fi

  echo "===== 前置检查通过 ====="
}

#######################################
# Create or update the NFS StorageClass.
# Globals:   STORAGE_CLASS_NAME (read)
# Outputs:   writes nfs-storageclass.yaml to the current directory and applies it
#######################################
function create_storageclass() {
  echo -e "\n===== 开始创建/更新 NFS 存储类 ====="
  # The manifest nesting is significant: `name` must sit under `metadata`.
  cat > nfs-storageclass.yaml << EOF
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ${STORAGE_CLASS_NAME}
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: Immediate
EOF

  # `kubectl apply` is idempotent: it updates the object if present, creates it otherwise.
  kubectl apply -f nfs-storageclass.yaml
  echo "===== NFS 存储类 ${STORAGE_CLASS_NAME} 处理完成 ====="
}

#######################################
# Create or update the NFS-backed PersistentVolume.
# Globals:   PV_NAME, STORAGE_SIZE, STORAGE_CLASS_NAME, NFS_PATH, NFS_SERVER (read)
# Outputs:   writes nfs-pv.yaml to the current directory and applies it
#######################################
function create_pv() {
  echo -e "\n===== 开始创建/更新 NFS PV ====="
  # The manifest nesting is significant (capacity, accessModes and nfs are
  # children of spec).
  cat > nfs-pv.yaml << EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: ${PV_NAME}
spec:
  capacity:
    storage: ${STORAGE_SIZE}
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  storageClassName: ${STORAGE_CLASS_NAME}
  nfs:
    path: ${NFS_PATH}
    server: ${NFS_SERVER}
EOF

  kubectl apply -f nfs-pv.yaml
  echo "===== NFS PV ${PV_NAME} 处理完成 ====="
}

#######################################
# Create or update the PersistentVolumeClaim for the NFS volume.
# Globals:   PVC_NAME, NAMESPACE, STORAGE_SIZE, STORAGE_CLASS_NAME (read)
# Outputs:   writes nfs-pvc.yaml to the current directory and applies it
#######################################
function create_pvc() {
  echo -e "\n===== 开始创建/更新 NFS PVC ====="
  # The manifest nesting is significant (resources.requests.storage is a
  # three-level path under spec).
  cat > nfs-pvc.yaml << EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ${PVC_NAME}
  namespace: ${NAMESPACE}
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: ${STORAGE_SIZE}
  storageClassName: ${STORAGE_CLASS_NAME}
EOF

  kubectl apply -f nfs-pvc.yaml
  echo "===== NFS PVC ${PVC_NAME} (${NAMESPACE}) 处理完成 ====="
}

#######################################
# Poll the PVC until it reports phase Bound.
# Globals:   NAMESPACE, PVC_NAME (read); retries, interval, pvc_status (written)
# Returns:   0 once the PVC is Bound; after 10 polls 5 seconds apart it prints
#            `kubectl describe` output and exits the script with status 1
#######################################
function verify_pvc() {
  echo -e "\n===== 验证 NFS PVC 状态 ====="
  interval=5
  for ((retries = 10; retries > 0; retries--)); do
    # A failed lookup is reported as the pseudo-phase "NotFound".
    pvc_status=$(kubectl get pvc -n "${NAMESPACE}" "${PVC_NAME}" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound")
    case "${pvc_status}" in
      Bound)
        echo "✅ PVC ${PVC_NAME} 状态正常(Bound)"
        kubectl get pvc -n "${NAMESPACE}" "${PVC_NAME}"
        return 0
        ;;
      NotFound)
        echo "❌ PVC ${PVC_NAME} 不存在,重试中..."
        ;;
      *)
        echo "⚠️ PVC ${PVC_NAME} 当前状态:${pvc_status},等待绑定(剩余重试次数:${retries})"
        ;;
    esac
    sleep ${interval}
  done

  echo "❌ 超时!PVC ${PVC_NAME} 未绑定成功,请检查 PV/PVC 配置或 NFS 服务"
  kubectl describe pvc -n "${NAMESPACE}" "${PVC_NAME}"
  exit 1
}

#######################################
# Print a completion message and a sample Pod manifest that mounts the PVC.
# Globals:   PVC_NAME, NAMESPACE (read)
# Outputs:   message and YAML example on stdout
#######################################
function print_example() {
  echo -e "\n===== NFS 存储集成完成 ====="
  # The opening parenthesis was missing, which glued the PVC name and the
  # namespace together in the output.
  echo "NFS PVC 信息:${PVC_NAME}(${NAMESPACE} 命名空间)"
  echo "使用示例(测试 Pod YAML):"
  # The example is a properly nested manifest. `namespace` sits in the Pod
  # metadata: a persistentVolumeClaim volume source takes no namespace and
  # always refers to a claim in the Pod's own namespace.
  cat << EOF
apiVersion: v1
kind: Pod
metadata:
  name: nfs-test-pod
  namespace: ${NAMESPACE}
spec:
  containers:
    - name: test-container
      image: busybox
      command: ["/bin/sh", "-c", "sleep 3600"]
      volumeMounts:
        - name: nfs-volume
          mountPath: /mnt/nfs
  volumes:
    - name: nfs-volume
      persistentVolumeClaim:
        claimName: ${PVC_NAME}
EOF
}

#######################################
# Entry point: run every deployment step in order, then remove the generated
# manifest files.
#######################################
function main() {
  local step
  for step in pre_check create_storageclass create_pv create_pvc verify_pvc print_example; do
    "${step}"
  done

  # Optional cleanup of the generated YAML files (comment out to keep them)
  rm -f nfs-storageclass.yaml nfs-pv.yaml nfs-pvc.yaml
  echo -e "\n===== 所有操作完成 ====="
}

# Run the main flow
main

部署执行步骤

1. 基础配置阶段

1
2
3
4
5
6
7
# 1. Run the base preparation script on every node
chmod +x all_nodes_prepare.sh
./all_nodes_prepare.sh
# 2. Raspberry Pi nodes must be rebooted afterwards (so the cgroup flags take effect)
reboot
# 3. After the reboot, check the cgroup flags on the Raspberry Pi
cat /boot/firmware/cmdline.txt | grep cgroup # the flags should be listed

2. K3s集群部署阶段

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# 1. Run on the master node (192.168.2.40)
chmod +x master_deploy.sh
./master_deploy.sh
# 2. Note the printed token, put it into TOKEN in the worker script, then run that script on every worker node
chmod +x worker_deploy.sh
./worker_deploy.sh
# 3. Label the worker nodes (run on the master)
kubectl label node k3s-worker-01 node-type=raspberrypi storage=longhorn
kubectl label node k3s-worker-02 node-type=raspberrypi storage=longhorn
kubectl label node k3s-worker-03 node-type=raspberrypi storage=longhorn
kubectl label node k3s-worker-04 node-type=raspberrypi storage=longhorn
kubectl label node k3s-worker-05 node-type=raspberrypi storage=longhorn
# 4. Check the node status
kubectl get nodes # every node should be Ready

3. 存储与工具部署阶段

1
2
3
4
5
6
# 1. Deploy Longhorn
chmod +x deploy_longhorn.sh
./deploy_longhorn.sh
# 2. Deploy the NFS storage objects
chmod +x deploy_nfs.sh
./deploy_nfs.sh

NFS挂载的实际使用示例

1. 作为应用存储使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# nfs-app.yaml
# Demo Deployment: two nginx replicas serving static files from the shared
# NFS-backed claim `nfs-pvc`.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nfs-demo-app
spec:
  replicas: 2
  selector:
    matchLabels:
      app: nfs-demo
  template:
    metadata:
      labels:
        app: nfs-demo
    spec:
      containers:
        - name: nginx
          image: nginx:alpine
          ports:
            - containerPort: 80
          volumeMounts:
            - name: nfs-data
              mountPath: /usr/share/nginx/html
      volumes:
        - name: nfs-data
          persistentVolumeClaim:
            claimName: nfs-pvc

执行部署:

1
2
3
kubectl apply -f nfs-app.yaml
kubectl expose deployment nfs-demo-app --port=80 --type=NodePort
# Open http://<any-node-IP>:<NodePort> to reach the static files stored on NFS

2. Longhorn备份使用NFS

  1. 访问Longhorn UI(http://任意节点IP:30000)

  2. 进入「Backup」页面,可看到已配置的NFS备份目标

  3. 对任意Volume创建备份,数据会自动存储到NFS的/volume3/nfs-share/longhorn-backup目录

    总结

  4. 核心优化点:替换K3s国内镜像源、配置国内系统源、关闭UFW、开启树莓派CGroup,确保国内环境可正常部署。

  5. NFS挂载意义:Master挂载NFS主要用于Longhorn备份、集群配置存储和应用静态数据共享,是集群存储架构的重要补充。

  6. 关键操作:树莓派执行基础配置后必须重启以生效CGroup,Worker节点需正确挂载SSD并标记节点标签,Longhorn需配置NFS作为备份目标。
    所有脚本已适配国内网络环境和硬件架构,执行前需确保NFS服务端可访问、树莓派SSD已正确连接,部署完成后可通过Longhorn UI和hiclaw Dashboard管理集群存储和容器。