一、环境准备
1.1 服务器规划
┌─────────────────────────────────────────────────────────────────┐
│ 服务器角色规划 │
├──────────────┬──────────────┬──────────────┬────────────────────┤
│ 角色 │ 配置要求 │ 数量 │ 说明 │
├──────────────┼──────────────┼──────────────┼────────────────────┤
│ Master节点 │ 8核16GB 200G │ 3台 │ K8s控制平面 │
│ Worker节点 │ 16核32GB 500G│ 3台+ │ 运行观测云组件 │
│ 存储节点 │ 16核32GB 1T+ │ 3台 │ 数据持久化存储 │
│ 镜像仓库 │ 4核8GB 500G │ 1台 │ 私有镜像仓库 │
└──────────────┴──────────────┴──────────────┴────────────────────┘
1.2 操作系统配置
# Run on ALL nodes: baseline OS preparation for Kubernetes.
# 1. Disable the firewall (kube-proxy/CNI manage their own iptables rules;
#    in hardened environments open the required ports instead).
systemctl stop firewalld
systemctl disable firewalld
# 2. Disable SELinux — setenforce takes effect immediately, the sed makes it
#    permanent across reboots.
#    NOTE(review): setenforce 0 exits non-zero if SELinux is already disabled.
setenforce 0
sed -i 's/^SELINUX=enforcing$/SELINUX=disabled/' /etc/selinux/config
# 3. Disable swap (required by kubelet) — now, and in /etc/fstab for reboots.
swapoff -a
sed -i '/swap/d' /etc/fstab
# 4. Kernel parameters: bridged traffic visibility for iptables, IP
#    forwarding, file-handle and conntrack limits.
#    NOTE(review): net.ipv4.tcp_tw_recycle was removed in kernel 4.12+, so
#    'sysctl --system' may warn about it on newer kernels — confirm target OS.
#    NOTE(review): the net.bridge.* keys need the br_netfilter module, which
#    this guide only loads in a later step — load it first to avoid
#    "No such file or directory" errors here.
cat > /etc/sysctl.d/k8s.conf <<EOF
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
net.ipv4.tcp_tw_recycle = 0
vm.swappiness = 0
vm.overcommit_memory = 1
vm.panic_on_oom = 0
fs.inotify.max_user_watches = 89100
fs.file-max = 52706963
fs.nr_open = 52706963
net.netfilter.nf_conntrack_max = 2310720
EOF
sysctl --system
# 5. Static name resolution for all cluster nodes (example addresses —
#    adjust to the real environment).
cat >> /etc/hosts <<EOF
192.168.1.10 master1
192.168.1.11 master2
192.168.1.12 master3
192.168.1.20 worker1
192.168.1.21 worker2
192.168.1.22 worker3
192.168.1.30 registry
EOF
# 6. Install Docker Engine 20.10.x from the upstream repository.
yum install -y yum-utils device-mapper-persistent-data lvm2
yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
yum install -y docker-ce-20.10.* docker-ce-cli-20.10.*
# Configure the Docker daemon:
#  - systemd cgroup driver (must match the kubelet configuration),
#  - bounded json-file logs (3 x 100m per container),
#  - overlay2 storage driver,
#  - allow the plain-HTTP private registry at 192.168.1.30:5000.
mkdir -p /etc/docker
cat > /etc/docker/daemon.json <<EOF
{
  "exec-opts": ["native.cgroupdriver=systemd"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "3"
  },
  "storage-driver": "overlay2",
  "insecure-registries": ["192.168.1.30:5000"]
}
EOF
systemctl daemon-reload
systemctl enable docker
systemctl start docker
# FIX: Kubernetes 1.28 has no dockershim, and this guide points kubelet and
# crictl at the containerd socket that ships with docker-ce. That containerd
# is packaged with its CRI plugin disabled, so kubeadm would fail without the
# steps below: regenerate a full config, enable the systemd cgroup driver to
# match kubelet, and restart containerd.
containerd config default > /etc/containerd/config.toml
sed -i 's/SystemdCgroup = false/SystemdCgroup = true/' /etc/containerd/config.toml
systemctl restart containerd
systemctl enable containerd
1.3 安装Kubernetes
# Run on ALL nodes: install the Kubernetes packages.
# 1. Configure the Kubernetes yum repository (Aliyun mirror).
#    NOTE(review): gpgcheck=0 disables package signature verification —
#    acceptable on an isolated network; otherwise enable GPG checking.
cat > /etc/yum.repos.d/kubernetes.repo <<EOF
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
enabled=1
gpgcheck=0
EOF
# 2. Install kubeadm, kubelet and kubectl, pinned to the 1.28 minor release.
yum install -y kubelet-1.28.* kubeadm-1.28.* kubectl-1.28.*
# 3. Configure kubelet — the cgroup driver must match the container runtime
#    (daemon.json sets native.cgroupdriver=systemd).
#    BUG FIX: the original line started with a bare '=', dropping the
#    KUBELET_EXTRA_ARGS variable name; the sysconfig entry was invalid and
#    the flag was silently ignored.
cat > /etc/sysconfig/kubelet <<EOF
KUBELET_EXTRA_ARGS="--cgroup-driver=systemd"
EOF
systemctl enable kubelet
# 4. Load the kernel modules needed for bridge filtering (br_netfilter) and
#    kube-proxy IPVS mode (ip_vs*).
modprobe br_netfilter
modprobe ip_vs
modprobe ip_vs_rr
modprobe ip_vs_wrr
modprobe ip_vs_sh
# FIX: modprobe alone does not survive a reboot — persist the module list so
# the nodes come back healthy after maintenance.
cat > /etc/modules-load.d/k8s.conf <<EOF
br_netfilter
ip_vs
ip_vs_rr
ip_vs_wrr
ip_vs_sh
EOF
# 5. Point crictl at containerd's CRI socket so 'crictl ps' etc. work.
cat > /etc/crictl.yaml <<EOF
runtime-endpoint: unix:///run/containerd/containerd.sock
image-endpoint: unix:///run/containerd/containerd.sock
timeout: 10
debug: false
EOF
二、Kubernetes集群部署
2.1 初始化Master节点
# Run on master1 only.
# 1. Generate the stock defaults for reference (the file is overwritten by
#    the tailored configuration below).
kubeadm config print init-defaults > kubeadm-config.yaml
# 2. Write the actual init configuration.
#    FIX: restored the YAML indentation that was lost in the document — the
#    flattened heredoc was not valid YAML and kubeadm would reject it.
#    NOTE(review): the plan calls for 3 masters, but no controlPlaneEndpoint
#    (VIP / load balancer address) is set here; without it the control plane
#    cannot be extended to HA later without reissuing certificates — confirm.
cat > kubeadm-config.yaml <<EOF
apiVersion: kubeadm.k8s.io/v1beta3
bootstrapTokens:
- groups:
  - system:bootstrappers:kubeadm:default-node-token
  token: abcdef.0123456789abcdef
  ttl: 24h0m0s
  usages:
  - signing
  - authentication
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: 192.168.1.10
  bindPort: 6443
nodeRegistration:
  criSocket: unix:///run/containerd/containerd.sock
  imagePullPolicy: IfNotPresent
  taints: null
---
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
imageRepository: registry.aliyuncs.com/google_containers
kind: ClusterConfiguration
kubernetesVersion: 1.28.0
networking:
  dnsDomain: cluster.local
  serviceSubnet: 10.96.0.0/12
  podSubnet: 10.244.0.0/16
EOF
# 3. Pre-pull the control-plane images from the Aliyun mirror.
kubeadm config images pull --config kubeadm-config.yaml
# 4. Initialize the cluster.
kubeadm init --config kubeadm-config.yaml
# 5. Set up kubectl for the current user.
mkdir -p "$HOME/.kube"
cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
chown "$(id -u):$(id -g)" "$HOME/.kube/config"
2.2 加入其他节点
# Print a fresh join command (run on master1); bootstrap tokens expire
# after 24h, so generate a new one if the original has aged out.
kubeadm token create --print-join-command
# Run the printed command on each worker node. The token and the
# 'sha256:xxx' hash below are placeholders — use the real values printed
# by the command above.
kubeadm join 192.168.1.10:6443 --token abcdef.0123456789abcdef \
--discovery-token-ca-cert-hash sha256:xxx
2.3 部署网络插件(Calico)
# Run on master1.
# 1. Download the Calico manifest (v3.26.0).
wget https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/calico.yaml
# 2. Switch images to a domestic mirror and align the pod CIDR with
#    podSubnet (10.244.0.0/16) from kubeadm-config.yaml.
#    NOTE(review): in recent Calico manifests CALICO_IPV4POOL_CIDR is
#    commented out by default — verify the second sed actually changed
#    anything before relying on it.
sed -i 's|docker.io/calico/|registry.aliyuncs.com/calico/|g' calico.yaml
sed -i 's|192.168.0.0/16|10.244.0.0/16|g' calico.yaml
# 3. Deploy Calico.
kubectl apply -f calico.yaml
# 4. Verify: calico-node pods Running on every node, nodes report Ready.
kubectl get pods -n kube-system -l k8s-app=calico-node
kubectl get nodes
三、存储配置
3.1 部署Rook-Ceph(推荐)
# 1. Deploy the Rook operator (v1.12.0): CRDs, RBAC/common resources, operator.
git clone --single-branch --branch v1.12.0 https://github.com/rook/rook.git
cd rook/deploy/examples
kubectl create -f crds.yaml
kubectl create -f common.yaml
kubectl create -f operator.yaml
# 2. Create the Ceph cluster: 3 mons, 2 mgrs, dashboard enabled, OSDs on the
#    raw /dev/sdb device of worker1..3 (the devices must be empty and
#    unformatted — verify before applying).
#    FIX: restored the YAML indentation that was lost in the document — the
#    flattened heredoc produced invalid YAML that kubectl would reject.
cat > cluster.yaml <<EOF
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: rook-ceph
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v17.2.6
  dataDirHostPath: /var/lib/rook
  mon:
    count: 3
  mgr:
    count: 2
  dashboard:
    enabled: true
  storage:
    useAllNodes: false
    useAllDevices: false
    nodes:
    - name: "worker1"
      devices:
      - name: "sdb"
    - name: "worker2"
      devices:
      - name: "sdb"
    - name: "worker3"
      devices:
      - name: "sdb"
EOF
kubectl apply -f cluster.yaml
3.2 创建StorageClass
# CephBlockPool (3-way replication, one replica per host) plus a StorageClass
# backed by the Rook RBD CSI driver.
# FIX: restored the YAML indentation lost in the document, and added the
# csi.storage.k8s.io/* secret parameters from the Rook example — without
# them the RBD provisioner cannot authenticate to Ceph and every PVC using
# this class stays Pending.
cat > storageclass.yaml <<EOF
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
  name: replicapool
  namespace: rook-ceph
spec:
  failureDomain: host
  replicated:
    size: 3
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: rook-ceph-block
provisioner: rook-ceph.rbd.csi.ceph.com
parameters:
  clusterID: rook-ceph
  pool: replicapool
  imageFormat: "2"
  imageFeatures: layering
  csi.storage.k8s.io/provisioner-secret-name: rook-csi-rbd-provisioner
  csi.storage.k8s.io/provisioner-secret-namespace: rook-ceph
  csi.storage.k8s.io/controller-expand-secret-name: rook-csi-rbd-provisioner
  csi.storage.k8s.io/controller-expand-secret-namespace: rook-ceph
  csi.storage.k8s.io/node-stage-secret-name: rook-csi-rbd-node
  csi.storage.k8s.io/node-stage-secret-namespace: rook-ceph
  csi.storage.k8s.io/fstype: ext4
reclaimPolicy: Delete
allowVolumeExpansion: true
volumeBindingMode: Immediate
EOF
kubectl apply -f storageclass.yaml
kubectl get sc
四、观测云平台部署
4.1 准备Helm Chart
# 1. Install Helm 3.
#    NOTE(review): piping a remote script straight into bash is convenient
#    but unauditable — consider downloading and inspecting it first.
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
# 2. Add the Guance Helm chart repository and refresh the index.
helm repo add guance https://charts.guance.com
helm repo update
4.2 配置values.yaml
# Helm values for the Guance stack: private registry, Rook storage class,
# 3-replica data tier (GuanceDB / OpenSearch / Kafka), 2-replica console
# exposed through an Ingress.
# FIX: restored the YAML indentation lost in the document — the flattened
# heredoc was not valid YAML and Helm would fail to parse it.
# NOTE(review): imageRegistry "192.168.1.30/guance" has no port, while
# Docker's insecure-registries entry is "192.168.1.30:5000" — confirm which
# address the registry actually serves on and make the two consistent.
cat > guance-values.yaml <<EOF
global:
  imageRegistry: "192.168.1.30/guance"
  storageClass: "rook-ceph-block"
  timezone: "Asia/Shanghai"
guancedb:
  enabled: true
  replicas: 3
  persistence:
    enabled: true
    size: 500Gi
  resources:
    requests:
      cpu: 4000m
      memory: 8Gi
opensearch:
  enabled: true
  replicas: 3
  persistence:
    size: 500Gi
kafka:
  enabled: true
  replicas: 3
  persistence:
    size: 100Gi
console:
  enabled: true
  replicas: 2
  ingress:
    enabled: true
    host: guance.company.com
EOF
4.3 部署观测云
# 1. Create the namespace.
kubectl create namespace guance
# 2. Install the chart with the values prepared above. --wait blocks until
#    all resources are ready or the 30-minute timeout expires.
helm install guance guance/guance-stack \
-n guance \
-f guance-values.yaml \
--timeout 30m \
--wait
# 3. Check the rollout: all pods Running, services with ClusterIPs assigned.
kubectl get pods -n guance
kubectl get svc -n guance
部署成功标志
所有Pod状态为Running,Service显示正确的ClusterIP,Ingress配置生效后即可通过域名访问控制台。
五、DataKit采集器部署
5.1 在K8s中部署DataKit
# datakit-daemonset.yaml — runs one DataKit collector pod on every node.
# FIX: restored the YAML indentation that was lost in the document; the
# flattened manifest was not valid YAML and could not be applied.
# Replace <YOUR_TOKEN> with the workspace token from the Guance console.
# hostNetwork/hostPID give the collector a host-level view of the node.
# NOTE(review): no tolerations are set, so this DaemonSet will not run on
# tainted control-plane nodes — confirm whether masters should be covered.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: datakit
  namespace: guance
spec:
  selector:
    matchLabels:
      app: datakit
  template:
    metadata:
      labels:
        app: datakit
    spec:
      hostNetwork: true
      hostPID: true
      containers:
      - name: datakit
        image: 192.168.1.30/guance/datakit:v1.0.0
        env:
        - name: ENV_DATAWAY
          value: "https://guance.company.com/v1/write/metrics?token=<YOUR_TOKEN>"
5.2 在主机上安装DataKit
# Download the DataKit installer script.
curl -fsSL https://static.guance.com/datakit/install.sh -o install.sh
# Install, pointing DataKit at the platform's DataWay endpoint.
# Replace <YOUR_TOKEN> with the workspace token from the console.
# NOTE(review): --insecure disables TLS certificate verification — only
# acceptable with self-signed certificates on a trusted network.
chmod +x install.sh
./install.sh \
--dataway "https://guance.company.com/v1/write/metrics?token=<YOUR_TOKEN>" \
--insecure
# Verify: interactive status monitor, then installed version.
datakit monitor
datakit --version
六、高可用与备份
6.1 高可用配置
# High-availability overrides for the Guance Helm values: 3-replica data tier
# with hard anti-affinity for GuanceDB, OpenSearch quorum of 2, and Kafka
# min.insync.replicas=2 so writes survive a single broker failure.
# FIX: restored the YAML indentation that was lost in the document — the
# flattened form was not valid YAML.
highAvailability:
  enabled: true
  guancedb:
    replicas: 3
    antiAffinity: hard
  opensearch:
    replicas: 3
    minimumMasterNodes: 2
  kafka:
    replicas: 3
    minInsyncReplicas: 2
6.2 备份脚本
#!/bin/bash
# backup.sh - full backup of the Guance deployment: GuanceDB, an OpenSearch
# snapshot, and a PostgreSQL dump. Run from a host with kubectl access to
# the 'guance' namespace. The OpenSearch snapshot repository 'backup' must
# already be registered.
set -euo pipefail

BACKUP_DIR="/backup/guance-$(date +%Y%m%d-%H%M%S)"
readonly BACKUP_DIR
mkdir -p -- "$BACKUP_DIR"

# 1. Back up GuanceDB.
#    FIX: dropped '-t' from kubectl exec — allocating a TTY fails in
#    non-interactive runs (e.g. cron) and can corrupt streamed output
#    with carriage returns.
kubectl exec -i guancedb-0 -n guance -- \
  guance-backup --output /tmp/guancedb-backup.tar.gz
kubectl cp guance/guancedb-0:/tmp/guancedb-backup.tar.gz \
  "$BACKUP_DIR/guancedb.tar.gz"

# 2. Trigger an OpenSearch snapshot; -f makes HTTP errors fail the script.
curl -fsS -X PUT "http://opensearch-master:9200/_snapshot/backup/snapshot-$(date +%Y%m%d)"

# 3. Back up PostgreSQL — pg_dump writes to stdout, captured locally.
kubectl exec -i postgresql-0 -n guance -- \
  pg_dump -U guance guance > "$BACKUP_DIR/postgres.sql"

# FIX: the original script announced "$BACKUP_DIR.tar.gz" but never created
# it — actually package the backup directory before reporting success.
tar -czf "$BACKUP_DIR.tar.gz" -C "$(dirname "$BACKUP_DIR")" "$(basename "$BACKUP_DIR")"
echo "Backup completed: $BACKUP_DIR.tar.gz"
七、故障排查
7.1 常见问题排查
# 1. Pod fails to start: inspect events, then the previous container's logs
#    (useful for CrashLoopBackOff).
kubectl describe pod <pod-name> -n guance
kubectl logs <pod-name> -n guance --previous
# 2. Storage problems: check PVC binding status and provisioning events.
kubectl get pvc -n guance
kubectl describe pvc <pvc-name> -n guance
# 3. Network problems: confirm each Service has matching Endpoints
#    (an empty endpoints list means the selector matches no ready pods).
kubectl get svc -n guance
kubectl get endpoints -n guance
# 4. Resource pressure (requires metrics-server to be installed).
kubectl top nodes
kubectl top pods -n guance
注意事项
生产环境部署前,请务必在测试环境验证所有配置。建议先使用最小部署模式验证功能,再逐步扩展到生产配置。
联系支持
- 官方文档:https://docs.guance.com
- 技术支持:support@guance.com
- 社区论坛:https://bbs.guance.com