观测云私有化部署完整指南

包含所有步骤、命令和配置代码,无疏漏版本

一、环境准备

1.1 服务器规划

┌─────────────────────────────────────────────────────────────────┐
│                        服务器角色规划                            │
├──────────────┬──────────────┬──────────────┬────────────────────┤
│ 角色         │ 配置要求     │ 数量         │ 说明               │
├──────────────┼──────────────┼──────────────┼────────────────────┤
│ Master节点   │ 8核16GB 200G │ 3台          │ K8s控制平面        │
│ Worker节点   │ 16核32GB 500G│ 3台+         │ 运行观测云组件     │
│ 存储节点     │ 16核32GB 1T+ │ 3台          │ 数据持久化存储     │
│ 镜像仓库     │ 4核8GB 500G  │ 1台          │ 私有镜像仓库       │
└──────────────┴──────────────┴──────────────┴────────────────────┘

1.2 操作系统配置

# Run on ALL nodes.

# 1. Disable the firewall (K8s manages its own iptables rules)
systemctl stop firewalld
systemctl disable firewalld

# 2. Disable SELinux
setenforce 0
sed -i 's/^SELINUX=enforcing$/SELINUX=disabled/' /etc/selinux/config

# 3. Disable swap (kubelet refuses to start while swap is active)
swapoff -a
sed -i '/swap/d' /etc/fstab

# 4. Kernel parameters required by Kubernetes networking.
# FIX: net.ipv4.tcp_tw_recycle was removed in Linux 4.12+, so setting it
# makes `sysctl --system` report an error on modern kernels — it has been
# dropped from this file.
cat > /etc/sysctl.d/k8s.conf <<'EOF'
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
vm.swappiness = 0
vm.overcommit_memory = 1
vm.panic_on_oom = 0
fs.inotify.max_user_watches = 89100
fs.file-max = 52706963
fs.nr_open = 52706963
net.netfilter.nf_conntrack_max = 2310720
EOF

# br_netfilter must be loaded before the bridge-nf sysctls can apply
modprobe br_netfilter
sysctl --system

# 5. Hosts entries (example — adjust to your addressing plan)
cat >> /etc/hosts <<'EOF'
192.168.1.10 master1
192.168.1.11 master2
192.168.1.12 master3
192.168.1.20 worker1
192.168.1.21 worker2
192.168.1.22 worker3
192.168.1.30 registry
EOF

# 6. Install Docker (the containerd.io package installed with it provides
# the actual CRI runtime — dockershim was removed in Kubernetes 1.24, and
# every kubeadm/crictl config below points at the containerd socket).
yum install -y yum-utils device-mapper-persistent-data lvm2
yum-config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
yum install -y docker-ce-20.10.* docker-ce-cli-20.10.*

# Configure the Docker daemon (used for image builds / local tooling)
mkdir -p /etc/docker
cat > /etc/docker/daemon.json <<'EOF'
{
  "exec-opts": ["native.cgroupdriver=systemd"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m",
    "max-file": "3"
  },
  "storage-driver": "overlay2",
  "insecure-registries": ["192.168.1.30:5000"]
}
EOF

systemctl daemon-reload
systemctl enable docker
systemctl start docker

# FIX: configure containerd as the CRI runtime for kubelet. The stock
# containerd.io config disables the CRI plugin and uses the cgroupfs
# driver; regenerate it and switch to the systemd cgroup driver to match
# kubelet. NOTE(review): the Docker "insecure-registries" setting does NOT
# apply to containerd — mirror/insecure registry access for pod images
# must be configured in /etc/containerd/config.toml as well.
mkdir -p /etc/containerd
containerd config default > /etc/containerd/config.toml
sed -i 's/SystemdCgroup = false/SystemdCgroup = true/' /etc/containerd/config.toml
systemctl enable --now containerd
systemctl restart containerd

1.3 安装Kubernetes

# Run on ALL nodes.

# 1. Kubernetes yum repository (Aliyun mirror)
cat > /etc/yum.repos.d/kubernetes.repo <<'EOF'
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
enabled=1
gpgcheck=0
EOF

# 2. Install kubeadm / kubelet / kubectl, pinned to the 1.28 series
yum install -y kubelet-1.28.* kubeadm-1.28.* kubectl-1.28.*

# 3. kubelet extra arguments.
# BUG FIX: the original wrote `="--cgroup-driver=systemd"` with no
# variable name, producing an invalid environment file that kubelet
# silently ignores. The variable kubelet's unit file reads is
# KUBELET_EXTRA_ARGS.
cat > /etc/sysconfig/kubelet <<'EOF'
KUBELET_EXTRA_ARGS="--cgroup-driver=systemd"
EOF

systemctl enable kubelet

# 4. Kernel modules for bridged traffic and the IPVS kube-proxy mode
modprobe br_netfilter
modprobe ip_vs
modprobe ip_vs_rr
modprobe ip_vs_wrr
modprobe ip_vs_sh

# 5. Point crictl at containerd's CRI socket
cat > /etc/crictl.yaml <<'EOF'
runtime-endpoint: unix:///run/containerd/containerd.sock
image-endpoint: unix:///run/containerd/containerd.sock
timeout: 10
debug: false
EOF

二、Kubernetes集群部署

2.1 初始化Master节点

# Run on master1 only.

# 1. (Informational) print kubeadm defaults — the file written here is
# immediately overwritten by the heredoc below.
kubeadm config print init-defaults > kubeadm-config.yaml

# 2. Write the cluster configuration.
# FIX: added controlPlaneEndpoint — without it, master2/master3 cannot
# join as control-plane nodes, yet section 1.1 plans a 3-master cluster.
# In production this should point at a VIP or load balancer in front of
# all three API servers, not directly at master1.
cat > kubeadm-config.yaml <<'EOF'
apiVersion: kubeadm.k8s.io/v1beta3
bootstrapTokens:
- groups:
  - system:bootstrappers:kubeadm:default-node-token
  token: abcdef.0123456789abcdef
  ttl: 24h0m0s
  usages:
  - signing
  - authentication
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: 192.168.1.10
  bindPort: 6443
nodeRegistration:
  criSocket: unix:///run/containerd/containerd.sock
  imagePullPolicy: IfNotPresent
  taints: null
---
apiVersion: kubeadm.k8s.io/v1beta3
certificatesDir: /etc/kubernetes/pki
clusterName: kubernetes
controlPlaneEndpoint: "192.168.1.10:6443"
imageRepository: registry.aliyuncs.com/google_containers
kind: ClusterConfiguration
kubernetesVersion: 1.28.0
networking:
  dnsDomain: cluster.local
  serviceSubnet: 10.96.0.0/12
  podSubnet: 10.244.0.0/16
EOF

# 3. Pre-pull control-plane images
kubeadm config images pull --config kubeadm-config.yaml

# 4. Initialize the cluster. FIX: --upload-certs stores the control-plane
# certificates as a cluster secret so master2/master3 can retrieve them
# when they join with --control-plane.
kubeadm init --config kubeadm-config.yaml --upload-certs

# 5. Configure kubectl for the current user
mkdir -p "$HOME/.kube"
cp -i /etc/kubernetes/admin.conf "$HOME/.kube/config"
chown "$(id -u):$(id -g)" "$HOME/.kube/config"

2.2 加入其他节点

# Print the worker join command (run on master1).
kubeadm token create --print-join-command
# Run the printed command on each worker node. The example below uses a
# placeholder hash — always use the value printed by the command above.
# NOTE(review): master2/master3 must additionally pass --control-plane
# (plus a --certificate-key from `kubeadm init phase upload-certs`) to
# join as control-plane nodes rather than workers.
kubeadm join 192.168.1.10:6443 --token abcdef.0123456789abcdef \
  --discovery-token-ca-cert-hash sha256:xxx

2.3 部署网络插件(Calico)

# Run on master1.

# 1. Download the Calico manifest
wget https://raw.githubusercontent.com/projectcalico/calico/v3.26.0/manifests/calico.yaml

# 2. Rewrite image references to a domestic mirror.
# NOTE(review): verify that registry.aliyuncs.com actually mirrors the
# calico/* repositories before relying on this rewrite.
sed -i 's|docker.io/calico/|registry.aliyuncs.com/calico/|g' calico.yaml

# FIX: in the upstream manifest the CALICO_IPV4POOL_CIDR env var is
# commented out, so rewriting only the value string leaves it inactive
# and Calico falls back to its built-in default pool. Uncomment the env
# var and set it to the pod CIDR used at `kubeadm init` (10.244.0.0/16).
# NOTE(review): confirm the comment indentation matches the downloaded
# manifest version before applying.
sed -i 's|# - name: CALICO_IPV4POOL_CIDR|- name: CALICO_IPV4POOL_CIDR|' calico.yaml
sed -i 's|#   value: "192.168.0.0/16"|  value: "10.244.0.0/16"|' calico.yaml

# 3. Deploy Calico
kubectl apply -f calico.yaml

# 4. Verify: calico-node pods Running, all nodes Ready
kubectl get pods -n kube-system -l k8s-app=calico-node
kubectl get nodes

三、存储配置

3.1 部署Rook-Ceph(推荐)

# 1. Deploy the Rook operator, pinned to the v1.12.0 release branch
git clone --single-branch --branch v1.12.0 https://github.com/rook/rook.git
cd rook/deploy/examples

kubectl create -f crds.yaml
kubectl create -f common.yaml
kubectl create -f operator.yaml

# 2. Create the Ceph cluster: 3 mons for quorum, 2 mgrs for failover,
# one raw device (sdb) on each of the three storage workers.
# NOTE(review): the "sdb" device name assumes an identical disk layout
# on worker1-3 and the device must be empty/unformatted — verify with
# `lsblk` on each node before applying.
cat > cluster.yaml <<EOF
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: rook-ceph
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v17.2.6
  dataDirHostPath: /var/lib/rook
  mon:
    count: 3
  mgr:
    count: 2
  dashboard:
    enabled: true
  storage:
    useAllNodes: false
    useAllDevices: false
    nodes:
      - name: "worker1"
        devices:
          - name: "sdb"
      - name: "worker2"
        devices:
          - name: "sdb"
      - name: "worker3"
        devices:
          - name: "sdb"
EOF

kubectl apply -f cluster.yaml

3.2 创建StorageClass

# CephBlockPool: 3-way replication with host as the failure domain, so
# each replica lands on a different storage node. The StorageClass wires
# that pool to the RBD CSI provisioner and is referenced by name
# ("rook-ceph-block") in the Guance values file later in this guide.
cat > storageclass.yaml <<EOF
apiVersion: ceph.rook.io/v1
kind: CephBlockPool
metadata:
  name: replicapool
  namespace: rook-ceph
spec:
  failureDomain: host
  replicated:
    size: 3
---
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: rook-ceph-block
provisioner: rook-ceph.rbd.csi.ceph.com
parameters:
  clusterID: rook-ceph
  pool: replicapool
  imageFormat: "2"
  imageFeatures: layering
reclaimPolicy: Delete
allowVolumeExpansion: true
volumeBindingMode: Immediate
EOF

kubectl apply -f storageclass.yaml
# Verify the StorageClass is registered
kubectl get sc

四、观测云平台部署

4.1 准备Helm Chart

# 1. Install Helm 3.
# NOTE(review): piping a remote script straight into bash executes
# unreviewed code as root — in production, download the script, inspect
# it, then run it.
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash

# 2. Register the Guance Helm chart repository
helm repo add guance https://charts.guance.com
helm repo update

4.2 配置values.yaml

# Deployment values: all images are pulled from the private registry
# (192.168.1.30), every persistent volume uses the rook-ceph-block
# StorageClass created above, and replica counts / volume sizes follow
# the capacity plan in section 1.1.
cat > guance-values.yaml <<EOF
global:
  imageRegistry: "192.168.1.30/guance"
  storageClass: "rook-ceph-block"
  timezone: "Asia/Shanghai"

guancedb:
  enabled: true
  replicas: 3
  persistence:
    enabled: true
    size: 500Gi
  resources:
    requests:
      cpu: 4000m
      memory: 8Gi

opensearch:
  enabled: true
  replicas: 3
  persistence:
    size: 500Gi

kafka:
  enabled: true
  replicas: 3
  persistence:
    size: 100Gi

console:
  enabled: true
  replicas: 2
  ingress:
    enabled: true
    host: guance.company.com
EOF

4.3 部署观测云

# 1. Namespace for all Guance components
kubectl create namespace guance

# 2. Install the chart. --wait blocks until all resources are Ready (or
# the 30-minute timeout expires), so a failed rollout surfaces here
# instead of silently later.
helm install guance guance/guance-stack \
  -n guance \
  -f guance-values.yaml \
  --timeout 30m \
  --wait

# 3. Check rollout status
kubectl get pods -n guance
kubectl get svc -n guance
部署成功标志

所有Pod状态为Running,Service显示正确的ClusterIP,Ingress配置生效后即可通过域名访问控制台。

五、DataKit采集器部署

5.1 在K8s中部署DataKit

# datakit-daemonset.yaml — runs one DataKit collector pod on every node.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: datakit
  namespace: guance
spec:
  selector:
    matchLabels:
      app: datakit
  template:
    metadata:
      labels:
        app: datakit
    spec:
      # Host network/PID namespaces give DataKit direct visibility into
      # node-level network metrics and host processes.
      hostNetwork: true
      hostPID: true
      containers:
        - name: datakit
          image: 192.168.1.30/guance/datakit:v1.0.0
          env:
            # Dataway endpoint that DataKit ships collected data to.
            # NOTE(review): the token is a credential — prefer storing it
            # in a Secret and injecting via valueFrom.secretKeyRef rather
            # than a plain-text env value in the manifest.
            - name: ENV_DATAWAY
              value: "https://guance.company.com/v1/write/metrics?token=<YOUR_TOKEN>"

5.2 在主机上安装DataKit

# Download the DataKit installer script
curl -fsSL https://static.guance.com/datakit/install.sh -o install.sh

# Install. --insecure skips TLS certificate verification — only keep it
# while the console is still using a self-signed certificate.
chmod +x install.sh
./install.sh \
  --dataway "https://guance.company.com/v1/write/metrics?token=<YOUR_TOKEN>" \
  --insecure

# Verify: live collector dashboard, then the installed version
datakit monitor
datakit --version

六、高可用与备份

6.1 高可用配置

# High-availability Helm values: hard anti-affinity forces the three
# GuanceDB replicas onto different nodes; 3 OpenSearch nodes with a
# 2-node master quorum tolerate one node loss; Kafka requires 2 in-sync
# replicas so an ack'd write survives a single broker failure.
highAvailability:
  enabled: true
  guancedb:
    replicas: 3
    antiAffinity: hard
  opensearch:
    replicas: 3
    minimumMasterNodes: 2
  kafka:
    replicas: 3
    minInsyncReplicas: 2

6.2 备份脚本

#!/bin/bash
# backup.sh — back up GuanceDB, OpenSearch and PostgreSQL into a
# timestamped directory, then pack it into a single tar.gz.
set -euo pipefail

BACKUP_DIR="/backup/guance-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$BACKUP_DIR"

# 1. GuanceDB.
# FIX: dropped `-it` — allocating a TTY in a non-interactive script
# fails under cron and injects terminal control sequences into output.
kubectl exec guancedb-0 -n guance -- \
  guance-backup --output /tmp/guancedb-backup.tar.gz
kubectl cp guance/guancedb-0:/tmp/guancedb-backup.tar.gz \
  "$BACKUP_DIR/guancedb.tar.gz"

# 2. OpenSearch snapshot.
# NOTE(review): assumes a snapshot repository named "backup" is already
# registered. -f makes HTTP errors fail the script under set -e;
# wait_for_completion surfaces snapshot failures synchronously.
curl -fsS -X PUT \
  "http://opensearch-master:9200/_snapshot/backup/snapshot-$(date +%Y%m%d)?wait_for_completion=true"

# 3. PostgreSQL dump.
# FIX: `-t` would mix TTY artifacts into the redirected dump file; only
# `exec` without a TTY produces a clean stream for redirection.
kubectl exec postgresql-0 -n guance -- \
  pg_dump -U guance guance > "$BACKUP_DIR/postgres.sql"

# FIX: the original announced "$BACKUP_DIR.tar.gz" without ever creating
# it — pack the backup directory so the message is true.
tar -czf "$BACKUP_DIR.tar.gz" -C "$(dirname "$BACKUP_DIR")" "$(basename "$BACKUP_DIR")"

echo "Backup completed: $BACKUP_DIR.tar.gz"

七、故障排查

7.1 常见问题排查

# 1. Pod fails to start: `describe` shows scheduling/image/probe events;
# --previous prints logs from the last crashed container instance.
kubectl describe pod <pod-name> -n guance
kubectl logs <pod-name> -n guance --previous

# 2. Storage: a PVC stuck in Pending usually means the StorageClass is
# missing or the Ceph cluster is unhealthy.
kubectl get pvc -n guance
kubectl describe pvc <pvc-name> -n guance

# 3. Networking: a Service with an empty Endpoints list means its
# selector matches no Ready pods.
kubectl get svc -n guance
kubectl get endpoints -n guance

# 4. Resource pressure (requires metrics-server to be installed)
kubectl top nodes
kubectl top pods -n guance
注意事项

生产环境部署前,请务必在测试环境验证所有配置。建议先使用最小部署模式验证功能,再逐步扩展到生产配置。

联系支持