[root@master mysql]# kubectl get pod -A NAMESPACE NAME READY STATUS RESTARTS AGE default nfs-client-provisioner-674f955bf4-zppg6 1/1 Running 6 (82d ago) 82d kube-system calico-kube-controllers-7fc4577899-cjcmj 1/1 Running 0 20h kube-system calico-node-c5w87 1/1 Running 1 (82d ago) 21h kube-system calico-node-fgpbz 1/1 Running 0 82d kube-system calico-node-kgprx 1/1 Running 0 82d kube-system calico-node-z9p2n 1/1 Running 3 (11m ago) 82d kube-system coredns-74586cf9b6-lb4l9 1/1 Running 0 82d kube-system coredns-74586cf9b6-t67fg 1/1 Running 0 36m kube-system etcd-master 1/1 Running 5 (82d ago) 21h kube-system etcd-node1 1/1 Running 0 82d kube-system etcd-node2 0/1 CrashLoopBackOff 21 (5m10s ago) 7m11s kube-system kube-apiserver-master 1/1 Running 3 (82d ago) 20h kube-system kube-apiserver-node1 1/1 Running 1 (82d ago) 20h kube-system kube-apiserver-node2 0/1 CrashLoopBackOff 2 (39s ago) 5m58s
查看日志显示是因为etcd数据损坏了,导致数据不一致,所以需要把etcd这个节点从集群中删除掉,然后重新加入集群,这样etcd就会自动同步,先看一下etcd的日志内容:
[root@master mysql]# kubectl logs -n kube-system etcd-node2 ................................... {"level":"panic","ts":"2024-03-07T02:12:24.095Z","logger":"raft","caller":"etcdserver/zap_raft.go:101","msg":"tocommit(145280) is out of range [lastIndex(19301)]. Was the raft log corrupted, truncated, or lost?","stacktrace":"go.etcd.io/etcd/server/v3/etcdserver.(*zapRaftLogger).Panicf\n\t/go/src/go.etcd.io/etcd/release/etcd/server/etcdserver/zap_raft.go:101\ngo.etcd.io/etcd/raft/v3.(*raftLog).commitTo\n\t/go/src/go.etcd.io/etcd/release/etcd/raft/log.go:237\ngo.etcd.io/etcd/raft/v3.(*raft).handleHeartbeat\n\t/go/src/go.etcd.io/etcd/release/etcd/raft/raft.go:1508\ngo.etcd.io/etcd/raft/v3.stepFollower\n\t/go/src/go.etcd.io/etcd/release/etcd/raft/raft.go:1434\ngo.etcd.io/etcd/raft/v3.(*raft).Step\n\t/go/src/go.etcd.io/etcd/release/etcd/raft/raft.go:975\ngo.etcd.io/etcd/raft/v3.(*node).run\n\t/go/src/go.etcd.io/etcd/release/etcd/raft/node.go:356"} panic: tocommit(145280) is out of range [lastIndex(19301)]. Was the raft log corrupted, truncated, or lost? goroutine 128 [running]: go.uber.org/zap/zapcore.(*CheckedEntry).Write(0xc0001186c0, 0x0, 0x0, 0x0) /go/pkg/mod/go.uber.org/zap@v1.17.0/zapcore/entry.go:234 +0x58d go.uber.org/zap.(*SugaredLogger).log(0xc00000e028, 0x4, 0x124ecb9, 0x5d, 0xc0008a0ac0, 0x2, 0x2, 0x0, 0x0, 0x0) /go/pkg/mod/go.uber.org/zap@v1.17.0/sugar.go:227 +0x111 go.uber.org/zap.(*SugaredLogger).Panicf(...) /go/pkg/mod/go.uber.org/zap@v1.17.0/sugar.go:159 go.etcd.io/etcd/server/v3/etcdserver.(*zapRaftLogger).Panicf(0xc0004c2170, 0x124ecb9, 0x5d, 0xc0008a0ac0, 0x2, 0x2) /go/src/go.etcd.io/etcd/release/etcd/server/etcdserver/zap_raft.go:101 +0x7d go.etcd.io/etcd/raft/v3.(*raftLog).commitTo(0xc0005540e0, 0x23780) /go/src/go.etcd.io/etcd/release/etcd/raft/log.go:237 +0x135 go.etcd.io/etcd/raft/v3.(*raft).handleHeartbeat(0xc0003fac60, 0x8, 0x6a4909c7fc179c2e, 0x605fa7f2c9d111ed, 0x15, 0x0, 0x0, 0x0, 0x0, 0x0, ...) 
/go/src/go.etcd.io/etcd/release/etcd/raft/raft.go:1508 +0x54 go.etcd.io/etcd/raft/v3.stepFollower(0xc0003fac60, 0x8, 0x6a4909c7fc179c2e, 0x605fa7f2c9d111ed, 0x15, 0x0, 0x0, 0x0, 0x0, 0x0, ...) /go/src/go.etcd.io/etcd/release/etcd/raft/raft.go:1434 +0x478 go.etcd.io/etcd/raft/v3.(*raft).Step(0xc0003fac60, 0x8, 0x6a4909c7fc179c2e, 0x605fa7f2c9d111ed, 0x15, 0x0, 0x0, 0x0, 0x0, 0x0, ...) /go/src/go.etcd.io/etcd/release/etcd/raft/raft.go:975 +0xa55 go.etcd.io/etcd/raft/v3.(*node).run(0xc0003d8180) /go/src/go.etcd.io/etcd/release/etcd/raft/node.go:356 +0x798 created by go.etcd.io/etcd/raft/v3.RestartNode /go/src/go.etcd.io/etcd/release/etcd/raft/node.go:244 +0x330
我这里直接把这个节点从集群中删除掉,然后重新加入集群,也可以直接在etcd集群中删除这个节点,然后重启一下etcd节点也可以,我这里使用的是前者,看一下我的操作。
[root@master mysql]# kubectl delete node node2 node "node2" deleted [root@master mysql]# kubeadm init phase upload-certs --upload-certs I0307 10:21:07.278186 44502 version.go:255] remote version is much newer: v1.29.2; falling back to: stable-1.24 [upload-certs] Storing the certificates in Secret "kubeadm-certs" in the "kube-system" Namespace [upload-certs] Using certificate key: 881dd439e2c8b7decc7eb931319139f7382d5f56d481d814a834e5d1a74ce047 [root@master mysql]# kubeadm token create --print-join-command --config kubeadm.yaml kubeadm join 10.211.55.245:6443 --token 8j5iew.mahismlzx0wy2ivp --discovery-token-ca-cert-hash sha256:d977466bfde41aaf663ca0694f531b778f7604a389c7fa11645ec34d44a94579
这里把node2从集群中删除了,然后再创建一个密钥,等下把node2节点初始化一下再加入到集群中。
[root@node2 manifests]# kubeadm reset [root@node2 manifests]# kubeadm join 10.211.55.245:6443 --token 8j5iew.mahismlzx0wy2ivp --discovery-token-ca-cert-hash sha256:d977466bfde41aaf663ca0694f531b778f7604a389c7fa11645ec34d44a94579 --control-plane --certificate-key 881dd439e2c8b7decc7eb931319139f7382d5f56d481d814a834e5d1a74ce047 [preflight] Running pre-flight checks [WARNING FileExisting-tc]: tc not found in system path [preflight] Reading configuration from the cluster... ................................ [control-plane] Using manifest folder "/etc/kubernetes/manifests" [control-plane] Creating static Pod manifest for "kube-apiserver" [control-plane] Creating static Pod manifest for "kube-controller-manager" [control-plane] Creating static Pod manifest for "kube-scheduler" [check-etcd] Checking that the etcd cluster is healthy error execution phase check-etcd: etcd cluster is not healthy: failed to dial endpoint https://10.211.55.33:2379 with maintenance client: context deadline exceeded To see the stack trace of this error execute with --v=5 or higher
因为在etcd集群中没有把node2节点删除,所以加入不了集群,我们需要在etcd集群中删除一下node2节点。
[root@master ~]# kubectl exec -it -n kube-system etcd-node1 sh kubectl exec [POD] [COMMAND] is DEPRECATED and will be removed in a future version. Use kubectl exec [POD] -- [COMMAND] instead. sh-5.1# sh-5.1# export ETCDCTL_API=3 #etcdctl --cacert="/etc/kubernetes/pki/etcd/ca.crt" --cert="/etc/kubernetes/pki/etcd/server.crt" --key="/etc/kubernetes/pki/etcd/server.key" member list sh-5.1# <"/etc/kubernetes/pki/etcd/server.crt" --key="/etc/kubernetes/pki/etcd/server.key" member list 605fa7f2c9d111ed, started, master, https://10.211.55.31:2380, https://10.211.55.31:2379, false 6a4909c7fc179c2e, started, node2, https://10.211.55.33:2380, https://10.211.55.33:2379, false cabac9d105002911, started, node1, https://10.211.55.32:2380, https://10.211.55.32:2379, false #etcdctl --cacert="/etc/kubernetes/pki/etcd/ca.crt" --cert="/etc/kubernetes/pki/etcd/server.crt" --key="/etc/kubernetes/pki/etcd/server.key" member remove 6a4909c7fc179c2e sh-5.1# <i/etcd/server.crt" --key="/etc/kubernetes/pki/etcd/server.key" member remove 6a4909c7fc179c2e Member 6a4909c7fc179c2e removed from cluster 53524c1833db54da
从集群中删除了,然后重新初始化一下,就正常了。后来才想起来,如果我直接在etcd集群中把这个节点删除,然后重启一下etcd,正常情况下它应该会自动重新加入到集群,这样就不需要重新初始化node节点了,解决起来更简单点。好了,没有了,看些其他的吧。
您可以选择一种方式赞助本站
支付宝扫一扫赞助
微信钱包扫描赞助
赏