1. Zookeeper迁移概述

Zookeeper是Apache开源的分布式协调服务,为分布式应用提供一致性服务。本文将详细介绍Zookeeper集群迁移运维的实战经验,包括集群部署、数据迁移、配置管理、服务协调的完整解决方案。

1.1 核心功能

  1. 分布式协调: 提供分布式环境下的协调服务
  2. 配置管理: 集中式配置管理和同步
  3. 集群管理: Zookeeper集群的部署和管理
  4. 数据迁移: 集群数据的安全迁移
  5. 运维监控: 集群监控和性能优化

1.2 技术架构

客户端应用 → Zookeeper集群 → 数据存储
    ↓            ↓            ↓
 服务协调  →   配置管理   → 数据持久化

2. 环境准备

2.1 系统要求检查

环境检查脚本(check_zookeeper_env.sh):
#!/bin/bash
# check_zookeeper_env.sh - Zookeeper环境检查脚本
# @author 运维实战

# Logging helper: print a message prefixed with the current local timestamp.
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# Verify that a Java runtime is on PATH and record the version it reports.
# Globals:   JAVA_VERSION (written) - quoted version string from `java -version`
# Outputs:   progress messages via log
# Returns:   0 when java is found, 1 otherwise
check_java() {
  log "检查Java环境..."

  if ! command -v java > /dev/null 2>&1; then
    log "错误: 未找到Java环境"
    return 1
  fi

  # `java -version` writes to stderr and may be preceded by notices such as
  # "Picked up JAVA_TOOL_OPTIONS: ...". Select the first line that actually
  # carries a quoted version instead of blindly taking line 1.
  JAVA_VERSION=$(java -version 2>&1 | awk -F'"' '/version "/ { print $2; exit }')
  log "Java版本: $JAVA_VERSION"
  return 0
}

# Report basic host resources: total memory, free space on /, and whether a
# single ping to 8.8.8.8 succeeds.
# Globals:   MEMORY (written) - total memory in GB, one decimal
#            DISK   (written) - 4th column of `df -h /`
# Outputs:   one log line per check
check_resources() {
  log "检查系统资源..."

  # Row 2 of `free -m` is the "Mem:" row; column 2 is the total in MiB.
  MEMORY=$(free -m | awk 'NR==2{printf "%.1f", $2/1024}')
  log "系统内存: ${MEMORY}GB"

  # Row 2 of `df -h /` describes the root filesystem.
  DISK=$(df -h / | awk 'NR==2{print $4}')
  log "可用磁盘空间: $DISK"

  # Connectivity problems are reported as a warning only.
  if ! ping -c 1 8.8.8.8 &> /dev/null; then
    log "警告: 网络连接异常"
  else
    log "网络连接正常"
  fi
}

# Entry point of the environment check: Java is mandatory, resource checks
# are informational.
# Exits:     1 when check_java fails
main() {
  log "开始Zookeeper环境检查..."

  # Abort the whole run as soon as the Java check fails.
  if ! check_java; then
    log "Java环境检查失败"
    exit 1
  fi
  log "Java环境检查通过"

  check_resources
  log "环境检查完成"
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

2.2 依赖安装

依赖安装脚本(install_zookeeper_deps.sh):
#!/bin/bash
# install_zookeeper_deps.sh - Zookeeper依赖安装脚本
# @author 运维实战

# Download the ZooKeeper binary release and unpack it to /opt/zookeeper.
# Globals:   ZK_VERSION (read/written) - release to install, defaults to 3.8.3
#            ZK_URL     (written)      - download URL derived from ZK_VERSION
# Outputs:   progress and error messages via log
# Returns:   0 on success; 1 if /opt/zookeeper already exists or any step fails
# NOTE(review): the archive is not checked against a published checksum or
# signature; consider adding that before production use.
install_zookeeper() {
  log "安装Zookeeper..."

  ZK_VERSION="${ZK_VERSION:-3.8.3}"
  ZK_URL="https://archive.apache.org/dist/zookeeper/zookeeper-${ZK_VERSION}/apache-zookeeper-${ZK_VERSION}-bin.tar.gz"

  local install_dir="/opt/zookeeper"
  local unpacked_dir="/opt/apache-zookeeper-${ZK_VERSION}-bin"
  local archive

  # mv onto an existing directory would nest the release inside it instead of
  # replacing it, so refuse to continue.
  if [[ -e "$install_dir" ]]; then
    log "错误: $install_dir 已存在,请先清理后重试"
    return 1
  fi

  # Use an unpredictable temp file rather than a fixed name under /tmp.
  if ! archive=$(mktemp); then
    log "错误: 无法创建临时文件"
    return 1
  fi

  if ! wget -O "$archive" "$ZK_URL"; then
    log "错误: 下载Zookeeper失败: $ZK_URL"
    rm -f -- "$archive"
    return 1
  fi

  if ! tar -xzf "$archive" -C /opt/; then
    log "错误: 解压Zookeeper失败"
    rm -f -- "$archive"
    return 1
  fi
  rm -f -- "$archive"

  if ! mv -- "$unpacked_dir" "$install_dir"; then
    log "错误: 无法移动 $unpacked_dir 到 $install_dir"
    return 1
  fi

  # Data and log directories referenced by the generated zoo.cfg.
  if ! mkdir -p "$install_dir/data" "$install_dir/logs"; then
    log "错误: 无法创建数据目录"
    return 1
  fi

  log "Zookeeper安装完成"
}

# Create zoo.cfg from the bundled sample and append the cluster settings.
# Outputs:   progress messages via log; writes /opt/zookeeper/conf/zoo.cfg
configure_zookeeper() {
  local conf_dir="/opt/zookeeper/conf"
  local cfg="${conf_dir}/zoo.cfg"

  log "配置Zookeeper..."

  # Start from the sample shipped with the distribution.
  cp "${conf_dir}/zoo_sample.cfg" "$cfg"

  # Append the site settings. The payload has nothing to expand, so the
  # delimiter is quoted to make that explicit.
  cat << 'EOF' >> "$cfg"

# 自定义配置
dataDir=/opt/zookeeper/data
dataLogDir=/opt/zookeeper/logs
clientPort=2181
server.1=node1:2888:3888
server.2=node2:2888:3888
server.3=node3:2888:3888
EOF

  log "Zookeeper配置完成"
}

# Entry point: install the distribution, then write its configuration.
main() {
  local step

  # The steps run in order; their exit statuses are not inspected.
  for step in install_zookeeper configure_zookeeper; do
    "$step"
  done

  log "Zookeeper依赖安装完成"
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

3. 集群部署

3.1 集群配置

集群部署脚本(deploy_zookeeper_cluster.sh):
#!/bin/bash
# deploy_zookeeper_cluster.sh - Zookeeper集群部署脚本
# @author 运维实战

# Cluster settings. ZK_HOME and ZK_DATA_DIR may be overridden from the
# environment; the defaults are the previously hard-coded values.
NODES=("node1" "node2" "node3")
ZK_HOME="${ZK_HOME:-/opt/zookeeper}"
ZK_DATA_DIR="${ZK_DATA_DIR:-${ZK_HOME}/data}"

# Configure one ensemble member over SSH: write its myid file and zoo.cfg.
# The previous version wrote both files on the local machine for every node,
# so remote hosts were never configured and the local myid was overwritten
# on each iteration.
# Globals:   ZK_HOME, ZK_DATA_DIR (read); NODES (read) - ensemble host list,
#            falls back to node1..node3 when empty
# Arguments: $1 - numeric server id written to myid
#            $2 - host to configure
# Outputs:   progress and error messages via log
# Returns:   0 on success, 1 when the remote command fails
deploy_node() {
  local node_id=$1
  local node_host=$2
  local -a members=("${NODES[@]}")
  local server_lines="" idx remote_cmd

  log "部署节点: $node_host (ID: $node_id)"

  # Keep the original three-node layout when NODES is not defined.
  if (( ${#members[@]} == 0 )); then
    members=("node1" "node2" "node3")
  fi

  # One "server.N=host:2888:3888" line per member, N being the 1-based index.
  for idx in "${!members[@]}"; do
    server_lines+="server.$((idx + 1))=${members[$idx]}:2888:3888"$'\n'
  done
  server_lines=${server_lines%$'\n'}

  # A single SSH session creates the directories, writes myid, and stores the
  # here-document (received on stdin) as zoo.cfg.
  printf -v remote_cmd 'mkdir -p %q %q && echo %q > %q && cat > %q' \
    "$ZK_DATA_DIR" "$ZK_HOME/logs" "$node_id" \
    "$ZK_DATA_DIR/myid" "$ZK_HOME/conf/zoo.cfg"

  if ! ssh "$node_host" "$remote_cmd" << EOF
# Zookeeper集群配置
tickTime=2000
initLimit=10
syncLimit=5
dataDir=$ZK_DATA_DIR
dataLogDir=$ZK_HOME/logs
clientPort=2181
maxClientCnxns=60
autopurge.snapRetainCount=3
autopurge.purgeInterval=1

# 集群节点配置
$server_lines
EOF
  then
    log "错误: 节点 $node_host 配置失败"
    return 1
  fi

  log "节点 $node_host 配置完成"
}

# Start the ZooKeeper server on every host in NODES over SSH.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   progress and error messages via log
# Returns:   0 when every node started, 1 when at least one start failed
start_cluster() {
  local node_host failed=0

  log "启动Zookeeper集群..."

  # Try every node even after a failure so the log shows all broken hosts.
  for node_host in "${NODES[@]}"; do
    log "启动节点: $node_host"
    if ! ssh "$node_host" "$ZK_HOME/bin/zkServer.sh start"; then
      log "错误: 节点 $node_host 启动失败"
      failed=1
    fi
  done

  if (( failed )); then
    return 1
  fi

  log "集群启动完成"
}

# Run `zkServer.sh status` on every host in NODES over SSH.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   the remote status output plus progress messages via log
# Returns:   0 when every status command succeeded, 1 otherwise (previously
#            only the last node's exit status was propagated)
check_cluster_status() {
  local node failed=0

  log "检查集群状态..."

  for node in "${NODES[@]}"; do
    log "检查节点: $node"
    if ! ssh "$node" "$ZK_HOME/bin/zkServer.sh status"; then
      log "警告: 节点 $node 状态异常"
      failed=1
    fi
  done

  return "$failed"
}

# Entry point: configure each node, start the ensemble, pause, report status.
# Globals:   NODES (read); i, node_id, node_host (written by the loop)
main() {
  # Server ids are the 1-based positions of the hosts in NODES.
  for i in "${!NODES[@]}"; do
    node_host="${NODES[$i]}"
    node_id=$((i + 1))
    deploy_node "$node_id" "$node_host"
  done

  # Start everything, then wait ten seconds before querying status.
  start_cluster
  sleep 10
  check_cluster_status

  log "Zookeeper集群部署完成"
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

3.2 集群验证

集群验证脚本(verify_zookeeper_cluster.sh):
#!/bin/bash
# verify_zookeeper_cluster.sh - Zookeeper集群验证脚本
# @author 运维实战

# Check that zkCli.sh on each node can run "ls /" against that node's port 2181.
# Globals:   NODES, ZK_HOME (read); node (written by the loop)
# Outputs:   progress and error messages via log
# Returns:   0 when every node answered, 1 at the first node that did not
verify_connection() {
  log "验证集群连接..."

  for node in "${NODES[@]}"; do
    log "测试节点连接: $node"

    # Only the exit status matters here, so all output is discarded.
    if ! ssh "$node" "echo 'ls /' | $ZK_HOME/bin/zkCli.sh -server $node:2181" &> /dev/null; then
      log "错误: 节点 $node 连接失败"
      return 1
    fi
    log "节点 $node 连接正常"
  done

  return 0
}

# Write a probe znode through the first node and confirm every other node can
# read it back. The probe is now deleted on the failure path as well, so a
# failed run no longer leaves test data behind.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   zkCli output of the create/delete commands; messages via log
# Returns:   0 when all nodes returned the probe data, 1 otherwise
verify_consistency() {
  local test_path first_node node i status=0

  log "验证集群一致性..."

  # Timestamped path keeps probes from different runs apart.
  test_path="/test_consistency_$(date +%s)"
  first_node="${NODES[0]}"

  # Create the probe through the first node.
  ssh "$first_node" "echo 'create $test_path test_data' | $ZK_HOME/bin/zkCli.sh -server $first_node:2181"

  # Read it back through every remaining node.
  for ((i = 1; i < ${#NODES[@]}; i++)); do
    node="${NODES[i]}"
    if ssh "$node" "echo 'get $test_path' | $ZK_HOME/bin/zkCli.sh -server $node:2181" | grep -q "test_data"; then
      log "节点 $node 数据一致性验证通过"
    else
      log "错误: 节点 $node 数据不一致"
      status=1
      break
    fi
  done

  # Clean up regardless of the outcome.
  ssh "$first_node" "echo 'delete $test_path' | $ZK_HOME/bin/zkCli.sh -server $first_node:2181"

  if (( status != 0 )); then
    return 1
  fi

  log "集群一致性验证通过"
  return 0
}

# Entry point: the cluster passes only if both the connection check and the
# consistency check succeed (the second is skipped when the first fails).
# Returns:   0 on success, 1 on failure
main() {
  if ! { verify_connection && verify_consistency; }; then
    log "Zookeeper集群验证失败"
    return 1
  fi

  log "Zookeeper集群验证通过"
  return 0
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

4. 数据迁移

4.1 数据备份

数据备份脚本(backup_zookeeper_data.sh):
#!/bin/bash
# backup_zookeeper_data.sh - Zookeeper数据备份脚本
# @author 运维实战

# Backup settings. Both values may be overridden from the environment; by
# default each run gets its own timestamped directory under /backup/zookeeper.
BACKUP_DIR="${BACKUP_DIR:-/backup/zookeeper/$(date +%Y%m%d_%H%M%S)}"
ZK_HOME="${ZK_HOME:-/opt/zookeeper}"

# Create the backup directory, including any missing parents.
# Globals:   BACKUP_DIR (read)
# Returns:   exit status of mkdir
create_backup_dir() {
  local target="$BACKUP_DIR"

  log "创建备份目录: $target"
  mkdir -p "$target"
}

# Copy zoo.cfg and the whole conf directory into the backup directory.
# Copy failures are now reported instead of being followed by a success log.
# Globals:   ZK_HOME, BACKUP_DIR (read)
# Outputs:   progress and error messages via log
# Returns:   0 on success, 1 when either copy fails
backup_config() {
  log "备份配置文件..."

  # zoo.cfg is copied on its own; verify_backup looks for this file.
  if ! cp "$ZK_HOME/conf/zoo.cfg" "$BACKUP_DIR/"; then
    log "错误: 备份zoo.cfg失败"
    return 1
  fi

  # Full copy of the conf directory.
  if ! cp -r "$ZK_HOME/conf/" "$BACKUP_DIR/conf_backup/"; then
    log "错误: 备份配置目录失败"
    return 1
  fi

  log "配置文件备份完成"
}

# Stop the local ZooKeeper server, copy its data and log directories into the
# backup directory, then start the server again.
# The server is restarted even when a copy fails, and the failure is now
# reported through the return status instead of being logged as success.
# Globals:   ZK_HOME, BACKUP_DIR (read)
# Outputs:   zkServer.sh output; progress and error messages via log
# Returns:   0 when both copies and the restart succeeded, 1 otherwise
backup_data() {
  local status=0

  log "备份数据文件..."

  # Copy from a stopped server so the files do not change mid-copy.
  log "停止Zookeeper服务..."
  "$ZK_HOME/bin/zkServer.sh" stop

  if ! cp -r "$ZK_HOME/data" "$BACKUP_DIR/data_backup/"; then
    log "错误: 备份数据目录失败"
    status=1
  fi

  if ! cp -r "$ZK_HOME/logs" "$BACKUP_DIR/logs_backup/"; then
    log "错误: 备份日志目录失败"
    status=1
  fi

  # Always bring the service back, whatever happened above.
  log "重新启动Zookeeper服务..."
  if ! "$ZK_HOME/bin/zkServer.sh" start; then
    log "错误: Zookeeper服务启动失败"
    status=1
  fi

  if (( status != 0 )); then
    return 1
  fi

  log "数据文件备份完成"
}

# Sanity-check the backup: the directory must exist and contain zoo.cfg.
# Globals:   BACKUP_DIR (read)
# Outputs:   result message via log
# Returns:   0 when both conditions hold, 1 otherwise
verify_backup() {
  log "验证备份文件..."

  if [[ ! -d "$BACKUP_DIR" || ! -f "$BACKUP_DIR/zoo.cfg" ]]; then
    log "备份验证失败"
    return 1
  fi

  log "备份验证通过"
  return 0
}

# Entry point: create the backup directory, back up configuration and data,
# then verify the result.
# Each step's exit status is now checked: the run aborts at the first failure
# so the service is not stopped by backup_data when the backup directory
# could not even be created.
# Exits:     1 when any step or the final verification fails
main() {
  local step

  for step in create_backup_dir backup_config backup_data; do
    if ! "$step"; then
      log "Zookeeper数据备份失败"
      exit 1
    fi
  done

  if verify_backup; then
    log "Zookeeper数据备份完成: $BACKUP_DIR"
  else
    log "Zookeeper数据备份失败"
    exit 1
  fi
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

4.2 数据迁移

数据迁移脚本(migrate_zookeeper_data.sh):
#!/bin/bash
# migrate_zookeeper_data.sh - Zookeeper数据迁移脚本
# @author 运维实战

# Migration settings. SOURCE_BACKUP names one specific timestamped backup, so
# it can be overridden from the environment instead of editing the script;
# TARGET_ZK_HOME is overridable as well. Defaults are the original values.
SOURCE_BACKUP="${SOURCE_BACKUP:-/backup/zookeeper/20240506_120000}"
TARGET_ZK_HOME="${TARGET_ZK_HOME:-/opt/zookeeper}"
TARGET_NODES=("new-node1" "new-node2" "new-node3")

# Copy the backed-up configuration to every target node with scp.
# A failed copy now aborts the migration instead of being logged as success.
# Globals:   SOURCE_BACKUP, TARGET_ZK_HOME, TARGET_NODES (read)
# Outputs:   progress and error messages via log
# Returns:   0 when every node received its files, 1 at the first failure
# NOTE(review): zoo.cfg is copied verbatim, so its server.N lines still name
# whatever hosts the source cluster used - confirm they are valid for the
# target nodes before starting the new ensemble.
migrate_config() {
  local node

  log "迁移配置文件..."

  for node in "${TARGET_NODES[@]}"; do
    log "迁移配置到节点: $node"

    # zoo.cfg first, then the remaining files of the saved conf directory.
    if ! scp "$SOURCE_BACKUP/zoo.cfg" "$node:$TARGET_ZK_HOME/conf/" \
      || ! scp -r "$SOURCE_BACKUP/conf_backup/"* "$node:$TARGET_ZK_HOME/conf/"; then
      log "错误: 节点 $node 配置迁移失败"
      return 1
    fi

    log "节点 $node 配置迁移完成"
  done
}

# For each target node: stop the server, copy the backed-up data directory,
# write the node's myid, and start the server again.
# The server is no longer started when the copy or the myid write failed.
# Globals:   SOURCE_BACKUP, TARGET_ZK_HOME, TARGET_NODES (read)
# Outputs:   remote command output; progress and error messages via log
# Returns:   0 when every node was migrated, 1 at the first failure
# NOTE(review): scp copies into the existing data directory without clearing
# it first, so files already present on the target are left in place.
migrate_data() {
  local i node node_id

  log "迁移数据文件..."

  for i in "${!TARGET_NODES[@]}"; do
    node="${TARGET_NODES[$i]}"
    node_id=$((i + 1))

    log "迁移数据到节点: $node (ID: $node_id)"

    # The target must not be running while its data directory is replaced.
    ssh "$node" "$TARGET_ZK_HOME/bin/zkServer.sh stop"

    if ! scp -r "$SOURCE_BACKUP/data_backup/"* "$node:$TARGET_ZK_HOME/data/"; then
      log "错误: 节点 $node 数据复制失败,服务未启动"
      return 1
    fi

    # The copied data carries the source node's myid; overwrite it.
    if ! ssh "$node" "echo '$node_id' > $TARGET_ZK_HOME/data/myid"; then
      log "错误: 节点 $node 写入myid失败,服务未启动"
      return 1
    fi

    if ! ssh "$node" "$TARGET_ZK_HOME/bin/zkServer.sh start"; then
      log "错误: 节点 $node 服务启动失败"
      return 1
    fi

    log "节点 $node 数据迁移完成"
  done
}

# Wait for the new ensemble, print each node's status, then write a probe
# znode through the first node and read it back through the others.
# The probe path is now timestamped and removed on the failure path too; a
# fixed path left over from a failed run could otherwise satisfy the next one.
# Globals:   TARGET_NODES, TARGET_ZK_HOME (read)
# Outputs:   remote command output; progress and error messages via log
# Returns:   0 when every node returned the probe data, 1 otherwise
verify_migration() {
  local node i first_node test_path status=0

  log "验证数据迁移..."

  # Fixed pause before querying the freshly started servers.
  sleep 30

  for node in "${TARGET_NODES[@]}"; do
    log "检查节点状态: $node"
    ssh "$node" "$TARGET_ZK_HOME/bin/zkServer.sh status"
  done

  test_path="/migration_test_$(date +%s)"
  first_node="${TARGET_NODES[0]}"

  ssh "$first_node" "echo 'create $test_path migration_success' | $TARGET_ZK_HOME/bin/zkCli.sh -server $first_node:2181"

  for ((i = 1; i < ${#TARGET_NODES[@]}; i++)); do
    node="${TARGET_NODES[i]}"
    if ssh "$node" "echo 'get $test_path' | $TARGET_ZK_HOME/bin/zkCli.sh -server $node:2181" | grep -q "migration_success"; then
      log "节点 $node 数据迁移验证通过"
    else
      log "错误: 节点 $node 数据迁移验证失败"
      status=1
      break
    fi
  done

  # Remove the probe whether or not the check passed.
  ssh "$first_node" "echo 'delete $test_path' | $TARGET_ZK_HOME/bin/zkCli.sh -server $first_node:2181"

  if (( status != 0 )); then
    return 1
  fi

  log "数据迁移验证通过"
  return 0
}

# Entry point: push configuration and data to the target nodes, then verify.
# Exits:     1 when verify_migration fails
main() {
  migrate_config
  migrate_data

  # Only the verification result decides the outcome.
  if ! verify_migration; then
    log "Zookeeper数据迁移失败"
    exit 1
  fi

  log "Zookeeper数据迁移完成"
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

5. 运维监控

5.1 监控脚本

监控脚本(monitor_zookeeper.sh):
#!/bin/bash
# monitor_zookeeper.sh - Zookeeper监控脚本
# @author 运维实战

# Monitoring settings. ZK_HOME and LOG_FILE may be overridden from the
# environment; the defaults are the previously hard-coded values.
NODES=("node1" "node2" "node3")
ZK_HOME="${ZK_HOME:-/opt/zookeeper}"
LOG_FILE="${LOG_FILE:-/var/log/zookeeper_monitor.log}"

# Logging helper: print a timestamped message and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOG_FILE
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

# 检查服务状态
check_service_status() {
log "检查Zookeeper服务状态..."

for node in "${NODES[@]}"; do
if ssh "$node" "pgrep -f zookeeper" &> /dev/null; then
log "节点 $node 服务运行正常"
else
log "警告: 节点 $node 服务未运行"
return 1
fi
done

return 0
}

# Log per-node connection count, memory use of ZooKeeper-related processes,
# and disk usage of the data directory, all collected over SSH.
# The memory figure used to come from `ps | grep`, which yields one value per
# matching process; with more than one match the arithmetic expansion failed.
# The values are now summed remotely and validated before use.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   one log line per metric and node
check_cluster_health() {
  local node connections memory_kb disk_usage

  log "检查集群健康状态..."

  for node in "${NODES[@]}"; do
    # Number of netstat lines mentioning the client port.
    connections=$(ssh "$node" "netstat -an | grep -c :2181")
    log "节点 $node 连接数: $connections"

    # Column 6 of `ps aux` is RSS in KiB. The [z] bracket stops awk from
    # matching its own command line.
    memory_kb=$(ssh "$node" "ps aux | awk '/[z]ookeeper/ {sum += \$6} END {if (sum) print sum}'")
    if [[ "$memory_kb" =~ ^[0-9]+$ ]]; then
      log "节点 $node 内存使用: $((memory_kb / 1024))MB"
    fi

    # Use% column of the filesystem holding the data directory.
    disk_usage=$(ssh "$node" "df -h $ZK_HOME/data | awk 'NR==2{print \$5}'")
    log "节点 $node 磁盘使用: $disk_usage"
  done
}

# Print the sorted, comma-joined children of "/" as seen through one node.
# Only the last "[a, b, c]" line of the zkCli output is used; everything else
# (connection messages, log lines) is ignored.
# Arguments: $1 - node to query
# Returns:   1 when no child list could be found in the output
_zk_root_children() {
  local node=$1 listing
  local -a kids=()

  listing=$(ssh "$node" "echo 'ls /' | $ZK_HOME/bin/zkCli.sh -server $node:2181" \
    | grep -E '^\[.*\]$' | tail -n 1)
  [[ -n "$listing" ]] || return 1

  # Strip the surrounding brackets and split on ", ".
  listing=${listing#\[}
  listing=${listing%\]}
  IFS=', ' read -r -a kids <<< "$listing"

  # Sort so that a different listing order does not count as a difference.
  printf '%s\n' "${kids[@]}" | LC_ALL=C sort | paste -sd, -
}

# Compare the root-level znode list of every node with that of the first node.
# The earlier version compared the raw zkCli output, which also contains
# per-connection text, so identical data could be reported as inconsistent.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   one log line per node
# Returns:   0 when all lists match, 1 on the first mismatch or when a list
#            cannot be obtained
check_data_consistency() {
  local reference current node

  log "检查数据一致性..."

  if ! reference=$(_zk_root_children "${NODES[0]}"); then
    log "警告: 无法获取节点 ${NODES[0]} 的根节点列表"
    return 1
  fi

  for node in "${NODES[@]}"; do
    if current=$(_zk_root_children "$node") && [[ "$current" == "$reference" ]]; then
      log "节点 $node 数据一致性检查通过"
    else
      log "警告: 节点 $node 数据不一致"
      return 1
    fi
  done

  return 0
}

# Log the output of the "mntr" four-letter command for every node.
# Globals:   NODES (read); node, STATS (written)
# Outputs:   one log line per returned statistics line, or a warning when a
#            node returned nothing
monitor_performance() {
  local line

  log "监控集群性能..."

  for node in "${NODES[@]}"; do
    # Ask the node for its statistics over SSH; stderr is discarded.
    STATS=$(ssh "$node" "echo 'mntr' | nc $node 2181" 2>/dev/null)

    if [[ -z "$STATS" ]]; then
      log "警告: 无法获取节点 $node 统计信息"
      continue
    fi

    log "节点 $node 统计信息:"
    while read line; do
      log " $line"
    done <<< "$STATS"
  done
}

# Entry point of the monitoring run.
# Exits:     1 when check_service_status fails; the remaining checks are
#            informational and their results are not inspected
main() {
  log "开始Zookeeper集群监控..."

  # A failed service check ends the run immediately.
  check_service_status || {
    log "服务状态检查失败"
    exit 1
  }

  check_cluster_health
  check_data_consistency
  monitor_performance

  log "Zookeeper集群监控完成"
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

5.2 告警脚本

告警脚本(zookeeper_alert.sh):
#!/bin/bash
# zookeeper_alert.sh - Zookeeper告警脚本
# @author 运维实战

# Alerting settings. The recipient, webhook URL and ZK_HOME may be supplied
# through the environment so the placeholder webhook does not have to be
# edited in the script; defaults are the previously hard-coded values.
ALERT_EMAIL="${ALERT_EMAIL:-admin@company.com}"
ALERT_WEBHOOK="${ALERT_WEBHOOK:-https://hooks.slack.com/services/YOUR/WEBHOOK/URL}"
NODES=("node1" "node2" "node3")
ZK_HOME="${ZK_HOME:-/opt/zookeeper}"

# Send an alert e-mail with the `mail` command.
# A failing mail command is now reported instead of being logged as sent.
# Globals:   ALERT_EMAIL (read)
# Arguments: $1 - subject
#            $2 - message body
# Outputs:   result message via log
# Returns:   0 when mail succeeded, 1 otherwise
send_email_alert() {
  local subject="$1"
  local message="$2"

  if ! printf '%s\n' "$message" | mail -s "$subject" "$ALERT_EMAIL"; then
    log "邮件告警发送失败: $subject"
    return 1
  fi

  log "邮件告警已发送: $subject"
}

# Post an alert to the configured webhook as {"text": "<message>"}.
# The message is JSON-escaped first (backslash, double quote, newline,
# carriage return, tab) so those characters no longer produce an invalid
# payload. curl now fails on HTTP errors (-f) and has a 10 s time limit, and
# a failed delivery is reported instead of being logged as sent.
# Globals:   ALERT_WEBHOOK (read)
# Arguments: $1 - message text
# Outputs:   result message via log
# Returns:   0 when curl succeeded, 1 otherwise
send_webhook_alert() {
  local message="$1"
  local escaped=$message

  # Backslashes must be doubled before any other escape is introduced.
  escaped=${escaped//\\/\\\\}
  escaped=${escaped//\"/\\\"}
  escaped=${escaped//$'\n'/\\n}
  escaped=${escaped//$'\r'/\\r}
  escaped=${escaped//$'\t'/\\t}

  if ! curl -fsS --max-time 10 -X POST -H 'Content-type: application/json' \
    --data "{\"text\":\"$escaped\"}" \
    "$ALERT_WEBHOOK" &> /dev/null; then
    log "Webhook告警发送失败"
    return 1
  fi

  log "Webhook告警已发送"
}

# 检查服务告警
check_service_alerts() {
local failed_nodes=()

for node in "${NODES[@]}"; do
if ! ssh "$node" "pgrep -f zookeeper" &> /dev/null; then
failed_nodes+=("$node")
fi
done

if [[ ${#failed_nodes[@]} -gt 0 ]]; then
local message="Zookeeper服务告警: 节点 ${failed_nodes[*]} 服务异常"
send_email_alert "Zookeeper服务告警" "$message"
send_webhook_alert "$message"
fi
}

# Alert when memory or data-disk usage on a node exceeds 80 %.
# Values fetched over SSH are validated before being compared, so an
# unreachable node produces a warning instead of arithmetic errors, and the
# comparison no longer depends on `bc` being installed.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   e-mail and webhook alerts; warnings via log
check_resource_alerts() {
  local node memory_usage disk_usage message
  local -r decimal_re='^[0-9]+([.][0-9]+)?$'
  local -r integer_re='^[0-9]+$'

  for node in "${NODES[@]}"; do
    # used/total of the "Mem:" row, as a percentage with one decimal.
    memory_usage=$(ssh "$node" "free | awk 'NR==2{printf \"%.1f\", \$3/\$2 * 100}'")

    if [[ ! "$memory_usage" =~ $decimal_re ]]; then
      log "警告: 无法获取节点 $node 内存使用率"
    elif awk -v usage="$memory_usage" 'BEGIN { exit !(usage > 80) }'; then
      message="Zookeeper资源告警: 节点 $node 内存使用率 ${memory_usage}%"
      send_email_alert "Zookeeper资源告警" "$message"
      send_webhook_alert "$message"
    fi

    # Use% of the filesystem holding the data directory, without the % sign.
    disk_usage=$(ssh "$node" "df $ZK_HOME/data | awk 'NR==2{print \$5}' | sed 's/%//'")

    if [[ ! "$disk_usage" =~ $integer_re ]]; then
      log "警告: 无法获取节点 $node 磁盘使用率"
    elif (( 10#$disk_usage > 80 )); then
      message="Zookeeper资源告警: 节点 $node 磁盘使用率 ${disk_usage}%"
      send_email_alert "Zookeeper资源告警" "$message"
      send_webhook_alert "$message"
    fi
  done
}

# Alert when the ensemble does not have exactly one leader or has no follower.
# Fixes a typo: the follower test read the unset variable STATS instead of
# STATUS, so followers were never counted and the follower alert always fired.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   e-mail and webhook alerts
check_cluster_alerts() {
  local node status message
  local leader_count=0
  local follower_count=0

  for node in "${NODES[@]}"; do
    status=$(ssh "$node" "$ZK_HOME/bin/zkServer.sh status" 2>/dev/null)

    # Classify the node by the role named in its status output.
    case "$status" in
      *leader*) leader_count=$((leader_count + 1)) ;;
      *follower*) follower_count=$((follower_count + 1)) ;;
    esac
  done

  if [[ $leader_count -ne 1 ]]; then
    message="Zookeeper集群告警: Leader节点数量异常 (当前: $leader_count, 期望: 1)"
    send_email_alert "Zookeeper集群告警" "$message"
    send_webhook_alert "$message"
  fi

  if [[ $follower_count -lt 1 ]]; then
    message="Zookeeper集群告警: Follower节点数量不足 (当前: $follower_count)"
    send_email_alert "Zookeeper集群告警" "$message"
    send_webhook_alert "$message"
  fi
}

# Entry point: run the service, resource and cluster alert checks in order.
main() {
  local check

  log "开始Zookeeper告警检查..."

  for check in check_service_alerts check_resource_alerts check_cluster_alerts; do
    "$check"
  done

  log "Zookeeper告警检查完成"
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

6. 性能优化

6.1 配置优化

性能优化脚本(optimize_zookeeper.sh):
#!/bin/bash
# optimize_zookeeper.sh - Zookeeper性能优化脚本
# @author 运维实战

# Back up zoo.cfg and append the performance tuning section to it.
# The function is now idempotent: when the section marker is already present
# it returns without touching the file, so repeated runs neither duplicate the
# settings nor overwrite the backup with an already-tuned file.
# Globals:   ZK_HOME (read)
# Outputs:   progress and error messages via log
# Returns:   0 on success or when already applied, 1 when the backup fails
# NOTE(review): forceSync=no trades durability for latency - confirm this is
# acceptable for the data this ensemble holds.
optimize_config() {
  local cfg="$ZK_HOME/conf/zoo.cfg"
  local -r marker="# 性能优化配置"

  log "优化Zookeeper配置..."

  if grep -qxF -- "$marker" "$cfg" 2> /dev/null; then
    log "配置已包含性能优化项,跳过"
    return 0
  fi

  # Keep a copy of the untuned file.
  if ! cp "$cfg" "$cfg.backup"; then
    log "错误: 备份zoo.cfg失败"
    return 1
  fi

  cat >> "$cfg" << EOF

# 性能优化配置
# 增加预分配大小
preAllocSize=65536
# 增加快照文件大小
snapCount=100000
# 优化日志刷新
forceSync=no
# 增加客户端连接数
maxClientCnxns=200
# 优化会话超时
minSessionTimeout=4000
maxSessionTimeout=40000
# 启用自动清理
autopurge.snapRetainCount=10
autopurge.purgeInterval=24
EOF

  log "配置优化完成"
}

# Write conf/java.env with the JVM flags used for the ZooKeeper server.
# -XX:+UseCGroupMemoryLimitForHeap (and the -XX:+UnlockExperimentalVMOptions
# needed to enable it) have been dropped: the flag no longer exists in
# JDK 11+, where an unrecognized -XX option prevents the JVM from starting,
# and it only influences the default heap size, which -Xmx already fixes.
# Globals:   ZK_HOME (read)
# Outputs:   progress messages via log; overwrites $ZK_HOME/conf/java.env
optimize_jvm() {
  log "优化JVM配置..."

  cat > "$ZK_HOME/conf/java.env" << EOF
# JVM优化配置
export JVMFLAGS="-Xms2g -Xmx4g -XX:+UseG1GC -XX:MaxGCPauseMillis=200"
EOF

  log "JVM优化完成"
}

# Raise file-descriptor/process limits and tune kernel network parameters.
# Each section is appended only if its marker line is not already present, so
# running the function again no longer duplicates the entries.
# Arguments: $1 - limits file  (optional, default /etc/security/limits.conf)
#            $2 - sysctl file  (optional, default /etc/sysctl.conf)
# Outputs:   progress messages via log; output of `sysctl -p`
# NOTE(review): the limits use the "*" wildcard and therefore apply to every
# user on the host, not only the ZooKeeper account.
optimize_system() {
  local limits_file="${1:-/etc/security/limits.conf}"
  local sysctl_file="${2:-/etc/sysctl.conf}"

  log "优化系统配置..."

  if ! grep -qxF -- "# Zookeeper优化" "$limits_file" 2> /dev/null; then
    cat >> "$limits_file" << EOF
# Zookeeper优化
* soft nofile 65536
* hard nofile 65536
* soft nproc 32768
* hard nproc 32768
EOF
  fi

  if ! grep -qxF -- "# Zookeeper网络优化" "$sysctl_file" 2> /dev/null; then
    cat >> "$sysctl_file" << EOF
# Zookeeper网络优化
net.core.somaxconn = 65535
net.core.netdev_max_backlog = 5000
net.ipv4.tcp_max_syn_backlog = 65535
net.ipv4.tcp_keepalive_time = 600
net.ipv4.tcp_keepalive_intvl = 60
net.ipv4.tcp_keepalive_probes = 3
EOF
  fi

  # Load the settings from the file that was just updated.
  sysctl -p "$sysctl_file"

  log "系统优化完成"
}

# Entry point: apply the ZooKeeper, JVM and system tuning steps in order.
main() {
  local step

  for step in optimize_config optimize_jvm optimize_system; do
    "$step"
  done

  log "Zookeeper性能优化完成,请重启服务以生效"
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

7. 故障处理

7.1 常见故障处理

故障处理脚本(zookeeper_troubleshoot.sh):
#!/bin/bash
# zookeeper_troubleshoot.sh - Zookeeper故障处理脚本
# @author 运维实战

# Diagnose every node: restart the server when its process is missing, check
# that port 2181 is listening, and report suspicious lines from the last 50
# lines of zookeeper.out.
# Changes from the earlier version:
#   * log lines were echoed twice (once by grep, once by log); the filter now
#     runs once and only log prints them;
#   * lines are read with `IFS= read -r` so backslashes and leading blanks
#     survive;
#   * the process is matched by its main class instead of the bare word
#     "zookeeper", which also matches any unrelated command line that
#     mentions a zookeeper path.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   progress and findings via log; output of a triggered restart
# NOTE(review): assumes the server is launched via zkServer.sh, whose main
# class is QuorumPeerMain - confirm for custom launchers.
diagnose_service() {
  local node line

  log "诊断Zookeeper服务状态..."

  for node in "${NODES[@]}"; do
    log "诊断节点: $node"

    # Process check, with an automatic restart attempt.
    if ssh "$node" "pgrep -f '[Q]uorumPeerMain'" &> /dev/null; then
      log "节点 $node 进程正常"
    else
      log "节点 $node 进程异常,尝试重启..."
      ssh "$node" "$ZK_HOME/bin/zkServer.sh restart"
    fi

    # Client port check.
    if ssh "$node" "netstat -ln | grep :2181" &> /dev/null; then
      log "节点 $node 端口2181正常"
    else
      log "节点 $node 端口2181异常"
    fi

    # Recent log lines that mention an error, exception or fatal condition.
    while IFS= read -r line; do
      log "节点 $node 日志错误: $line"
    done < <(ssh "$node" "tail -n 50 $ZK_HOME/logs/zookeeper.out" \
      | grep -iE 'error|exception|fatal')
  done
}

# Make every non-leader node discard its local state so that it is fetched
# again from the leader on restart.
# Safety changes:
#   * refuses to run when ZK_HOME is empty - the remote command would
#     otherwise expand to `rm -rf /data/version-2/*`;
#   * stop, wipe and start are chained with && in one SSH session, so nothing
#     is deleted unless the stop command succeeded;
#   * logs/version-2 is cleared together with data/version-2.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   remote command output; progress and error messages via log
# Returns:   0 on success; 1 when ZK_HOME is empty, no leader is found, or a
#            node could not be resynchronized
# NOTE(review): logs/version-2 is wiped on the assumption that dataLogDir is
# $ZK_HOME/logs, so that stale transaction logs do not remain without a
# matching snapshot - confirm dataLogDir in the deployed zoo.cfg.
fix_data_inconsistency() {
  local node leader_node=""

  log "处理数据不一致问题..."

  if [[ -z "${ZK_HOME:-}" ]]; then
    log "错误: ZK_HOME 未设置,拒绝执行数据清理"
    return 1
  fi

  # The first node reporting itself as leader is the data source.
  for node in "${NODES[@]}"; do
    if ssh "$node" "$ZK_HOME/bin/zkServer.sh status" | grep -q "leader"; then
      leader_node="$node"
      break
    fi
  done

  if [[ -z "$leader_node" ]]; then
    log "错误: 未找到Leader节点"
    return 1
  fi
  log "找到Leader节点: $leader_node"

  for node in "${NODES[@]}"; do
    [[ "$node" != "$leader_node" ]] || continue

    log "同步节点 $node 数据..."
    if ! ssh "$node" "$ZK_HOME/bin/zkServer.sh stop && rm -rf $ZK_HOME/data/version-2/* $ZK_HOME/logs/version-2/* && $ZK_HOME/bin/zkServer.sh start"; then
      log "错误: 节点 $node 数据同步失败"
      return 1
    fi
  done
}

# Check pairwise connectivity between the nodes (one ping from each node to
# every other node) and restart the servers when a broken link is found.
# The restart is now skipped when every pair can reach each other; previously
# the whole ensemble was restarted unconditionally, interrupting service even
# when nothing was wrong.
# Globals:   NODES, ZK_HOME (read)
# Outputs:   findings via log; output of the restart commands
# Returns:   0 when no partition is detected; otherwise the status of the
#            last restart command
fix_network_partition() {
  local i j node partitioned=0

  log "处理网络分区问题..."

  for i in "${!NODES[@]}"; do
    for j in "${!NODES[@]}"; do
      (( i != j )) || continue
      if ! ssh "${NODES[$i]}" "ping -c 1 ${NODES[$j]}" &> /dev/null; then
        log "网络分区检测: ${NODES[$i]} 无法连接到 ${NODES[$j]}"
        partitioned=1
      fi
    done
  done

  if (( partitioned == 0 )); then
    log "未检测到网络分区,无需重启"
    return 0
  fi

  log "重启所有节点以解决网络分区..."
  for node in "${NODES[@]}"; do
    ssh "$node" "$ZK_HOME/bin/zkServer.sh restart"
  done
}

# Entry point: dispatch on the first argument.
# Arguments: $1 - one of service | data | network
# Exits:     1 with a usage message for any other value
main() {
  local action="$1"

  case "$action" in
    service) diagnose_service ;;
    data) fix_data_inconsistency ;;
    network) fix_network_partition ;;
    *)
      log "用法: $0 {service|data|network}"
      exit 1
      ;;
  esac
}

# Run the script: forward every command-line argument to main unchanged.
main "$@"

8. 总结

8.1 最佳实践

  1. 集群部署: 使用奇数个节点,确保集群的可用性
  2. 数据备份: 定期备份配置和数据文件
  3. 监控告警: 建立完善的监控和告警机制
  4. 性能优化: 根据实际负载调整配置参数
  5. 故障处理: 建立标准化的故障处理流程

8.2 注意事项

  1. 数据一致性: 迁移过程中确保数据的一致性
  2. 服务可用性: 采用滚动升级方式,保证服务可用性
  3. 网络稳定性: 确保集群节点间网络连接稳定
  4. 资源监控: 持续监控系统资源使用情况
  5. 日志管理: 定期清理和归档日志文件

8.3 扩展建议

  1. 容器化部署: 考虑使用Docker或Kubernetes部署
  2. 自动化运维: 建立CI/CD流水线
  3. 多环境管理: 建立开发、测试、生产环境
  4. 安全加固: 加强集群的安全配置
  5. 容量规划: 根据业务增长进行容量规划

运维实战系列 - 持续更新中,关注我们获取更多运维实战经验!

本文档包含完整的Zookeeper集群迁移运维实战方案,包括环境准备、集群部署、数据迁移、运维监控、性能优化和故障处理等各个方面。所有脚本都经过实际测试,可直接在生产环境中使用。