1. Redis主备架构概述

Redis主备架构是构建高可用缓存服务的重要方案,通过主从复制实现数据冗余和读写分离,通过哨兵模式实现自动故障转移。本文将详细介绍Redis主备架构的运维实战经验,包括主从配置、哨兵部署、故障转移、数据同步的完整解决方案。

1.1 核心功能

  1. 主从复制: 实现数据从主节点到从节点的实时同步
  2. 读写分离: 主节点处理写操作,从节点处理读操作
  3. 故障转移: 自动检测主节点故障并切换到从节点
  4. 数据一致性: 确保主备数据的一致性和完整性
  5. 高可用保障: 提供7x24小时不间断服务

1.2 技术架构

客户端应用 → Redis主节点 → Redis从节点
    ↓            ↓            ↓
写操作处理 →  数据同步  → 读操作处理
    ↓            ↓            ↓
哨兵监控  →  故障检测  →  自动切换

2. 环境准备

2.1 系统环境检查

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/bin/bash
# check_redis_master_slave_env.sh - Redis主备环境检查脚本
# @author 运维实战

# 日志函数
# Print a message to stdout prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
# Arguments: $1 - message text
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$stamp" "$1"
}

# 检查系统环境
# Report the OS name, memory, CPU count and free space on "/", then run the
# network connectivity checks.
# Globals written: OS_VERSION, TOTAL_MEM, AVAILABLE_MEM, CPU_CORES, DISK_SPACE
# Exits the script with status 1 when OSTYPE is not linux-gnu*.
check_system_environment() {
  log "开始检查Redis主备系统环境..."

  # Bail out early on anything that is not a GNU/Linux platform.
  if [[ "$OSTYPE" != "linux-gnu"* ]]; then
    log "错误: 不支持的操作系统 $OSTYPE"
    exit 1
  fi

  OS_VERSION=$(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)
  log "操作系统: $OS_VERSION"

  # Columns 2 and 7 of the "Mem:" row of `free -h` are reported as
  # total and available memory.
  TOTAL_MEM=$(free -h | awk '/Mem:/ {print $2}')
  AVAILABLE_MEM=$(free -h | awk '/Mem:/ {print $7}')
  CPU_CORES=$(nproc)
  # Column 4 of the last `df -h /` row is reported as free disk space.
  DISK_SPACE=$(df -h / | tail -n 1 | awk '{print $4}')

  log "系统资源检查:"
  log " 总内存: $TOTAL_MEM"
  log " 可用内存: $AVAILABLE_MEM"
  log " CPU核心数: $CPU_CORES"
  log " 可用磁盘空间: $DISK_SPACE"

  check_network_connectivity
}

# 检查网络连通性
# Ping the master and replica hosts and log whether each is reachable,
# then continue with the port availability check.
# Globals: MASTER_IP, SLAVE_IP - used when already set; otherwise they are
#          initialised to the article's defaults (192.168.1.10 / 192.168.1.11).
check_network_connectivity() {
  log "检查网络连通性..."

  # Keep caller-provided addresses instead of overwriting them.
  MASTER_IP="${MASTER_IP:-192.168.1.10}"
  SLAVE_IP="${SLAVE_IP:-192.168.1.11}"

  local role addr
  for role in 主节点 从节点; do
    if [[ "$role" == "主节点" ]]; then
      addr=$MASTER_IP
    else
      addr=$SLAVE_IP
    fi

    if ping -c 3 "$addr" > /dev/null 2>&1; then
      log "${role}网络连通正常: $addr"
    else
      log "警告: ${role}网络连通异常: $addr"
    fi
  done

  check_port_availability
}

# 检查端口可用性
# Log whether the Redis (6379) and Sentinel (26379) ports already have a
# TCP listener on this host.
# Globals written: REDIS_PORTS
# Returns: 1 when neither `ss` nor `netstat` is available (previously a
#          missing netstat made every port look free), otherwise 0.
check_port_availability() {
  log "检查Redis端口可用性..."

  REDIS_PORTS=(6379 26379) # Redis service port and Sentinel port

  # Take one snapshot of the listening sockets; prefer ss, fall back to netstat.
  local listeners
  if command -v ss > /dev/null 2>&1; then
    listeners=$(ss -tln 2> /dev/null)
  elif command -v netstat > /dev/null 2>&1; then
    listeners=$(netstat -tln 2> /dev/null)
  else
    log "警告: 未找到 ss 或 netstat,无法检查端口占用"
    return 1
  fi

  local port
  for port in "${REDIS_PORTS[@]}"; do
    # The trailing space keeps ":6379 " from matching a longer port number.
    if grep -q ":$port " <<< "$listeners"; then
      log "警告: 端口 $port 已被占用"
    else
      log "端口 $port 可用"
    fi
  done
  return 0
}

# 检查Redis安装
# Report the installed redis-server version and whether the "redis" systemd
# unit is active; when redis-server is not on PATH, hand over to install_redis.
# Globals written: REDIS_VERSION
check_redis_installation() {
  log "检查Redis安装状态..."

  # Not installed: install and return whatever the installer returns.
  if ! command -v redis-server > /dev/null 2>&1; then
    log "Redis未安装,开始安装..."
    install_redis
    return
  fi

  REDIS_VERSION=$(redis-server --version | head -1)
  log "Redis已安装: $REDIS_VERSION"

  if systemctl is-active --quiet redis; then
    log "Redis服务运行正常"
  else
    log "Redis服务未运行"
  fi
}

# 安装Redis
# Install Redis with apt-get or yum, then start and enable the "redis" unit.
# Exits the script with status 1 when no supported package manager is found
# or when the package installation itself fails.
install_redis() {
  log "开始安装Redis..."

  if command -v apt-get > /dev/null 2>&1; then
    # Refresh the package index only; do not upgrade unrelated packages.
    sudo apt-get update
    if ! sudo apt-get install -y redis-server redis-tools; then
      log "错误: Redis安装失败"
      exit 1
    fi
  elif command -v yum > /dev/null 2>&1; then
    # The previous `yum update -y` upgraded every package on the host as a
    # side effect of installing Redis; install only what is needed.
    if ! sudo yum install -y redis; then
      log "错误: Redis安装失败"
      exit 1
    fi
  else
    log "错误: 不支持的包管理器"
    exit 1
  fi

  # Start now and enable at boot; a start failure is reported but not fatal,
  # matching the original best-effort behaviour.
  if ! sudo systemctl start redis; then
    log "警告: Redis服务启动失败"
  fi
  sudo systemctl enable redis

  log "Redis安装完成"
}

# 主函数
# Run the environment checks in order, framed by start/finish banners.
main() {
  log "=== Redis主备环境检查开始 ==="

  local step
  for step in check_system_environment check_redis_installation; do
    "$step"
  done

  log "=== Redis主备环境检查完成 ==="
}

# Entry point: run main with the script's command-line arguments.
main "$@"

2.2 Redis主备配置优化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
#!/bin/bash
# redis_master_slave_config.sh - Redis主备配置脚本
# @author 运维实战

# 配置参数
# Deployment parameters. Each value may be overridden from the environment;
# the defaults are the addresses and ports used throughout the article.
MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
SENTINEL_PORT="${SENTINEL_PORT:-26379}"

# Timestamped logger. Every function in this script calls log, but the
# script never defined it, so each call failed with "command not found".
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 备份配置文件
# Copy the existing Redis and Sentinel configuration files (when present)
# to timestamped "<file>.backup.<YYYYmmdd_HHMMSS>" siblings using sudo.
backup_config_files() {
  log "备份Redis配置文件..."

  local conf label
  for conf in /etc/redis/redis.conf /etc/redis/sentinel.conf; do
    # Nothing to back up when the file does not exist.
    [ -f "$conf" ] || continue

    sudo cp $conf $conf.backup.$(date +%Y%m%d_%H%M%S)

    case "$conf" in
      */redis.conf) label="主节点" ;;
      *) label="哨兵" ;;
    esac
    log "${label}配置文件已备份"
  done
}

# 配置主节点
# Write the Redis configuration for the master node.
# Arguments: $1 - target file (optional, default /etc/redis/redis.conf)
# Globals:   REDIS_PORT (default 6379), REDIS_PASSWORD (default redis123)
# Returns:   1 when the configuration file cannot be written.
configure_master_node() {
  local conf_file="${1:-/etc/redis/redis.conf}"
  local port="${REDIS_PORT:-6379}"
  # The password is no longer baked in; the old literal is only the default.
  local password="${REDIS_PASSWORD:-redis123}"

  log "配置Redis主节点..."

  if ! cat > "$conf_file" << EOF
# Redis主节点配置
bind 0.0.0.0
port $port
timeout 300
tcp-keepalive 300

# 内存配置
maxmemory 2gb
maxmemory-policy allkeys-lru

# 持久化配置
save 900 1
save 300 10
save 60 10000
appendonly yes
appendfsync everysec
appendfilename "appendonly.aof"

# 主从复制配置
replica-read-only yes
replica-serve-stale-data yes

# 安全配置
requirepass $password
masterauth $password

# 日志配置
loglevel notice
logfile /var/log/redis/redis-server.log

# 其他配置
daemonize yes
pidfile /var/run/redis/redis-server.pid
dir /var/lib/redis
EOF
  then
    log "错误: 无法写入主节点配置文件 $conf_file"
    return 1
  fi

  log "主节点配置完成"
}

# 配置从节点
# Write the Redis configuration for a replica that follows MASTER_IP.
# Arguments: $1 - target file (optional, default /etc/redis/redis.conf)
# Globals:   MASTER_IP, REDIS_PORT (default 6379),
#            REDIS_PASSWORD (default redis123)
# Returns:   1 when the configuration file cannot be written.
configure_slave_node() {
  local conf_file="${1:-/etc/redis/redis.conf}"
  local port="${REDIS_PORT:-6379}"
  # The password is no longer baked in; the old literal is only the default.
  local password="${REDIS_PASSWORD:-redis123}"

  log "配置Redis从节点..."

  if ! cat > "$conf_file" << EOF
# Redis从节点配置
bind 0.0.0.0
port $port
timeout 300
tcp-keepalive 300

# 内存配置
maxmemory 2gb
maxmemory-policy allkeys-lru

# 持久化配置
save 900 1
save 300 10
save 60 10000
appendonly yes
appendfsync everysec
appendfilename "appendonly.aof"

# 主从复制配置
replicaof $MASTER_IP $port
replica-read-only yes
replica-serve-stale-data yes
replica-priority 100

# 安全配置
requirepass $password
masterauth $password

# 日志配置
loglevel notice
logfile /var/log/redis/redis-server.log

# 其他配置
daemonize yes
pidfile /var/run/redis/redis-server.pid
dir /var/lib/redis
EOF
  then
    log "错误: 无法写入从节点配置文件 $conf_file"
    return 1
  fi

  log "从节点配置完成"
}

# 配置哨兵
# Write the Sentinel configuration that monitors "mymaster".
# Arguments: $1 - target file (optional, default /etc/redis/sentinel.conf)
# Globals:   MASTER_IP, REDIS_PORT (default 6379),
#            SENTINEL_PORT (default 26379), SENTINEL_QUORUM (default 2),
#            REDIS_PASSWORD (default redis123)
# Returns:   1 when the configuration file cannot be written.
configure_sentinel() {
  local conf_file="${1:-/etc/redis/sentinel.conf}"
  local redis_port="${REDIS_PORT:-6379}"
  local sentinel_port="${SENTINEL_PORT:-26379}"
  local quorum="${SENTINEL_QUORUM:-2}"
  # The password is no longer baked in; the old literal is only the default.
  local password="${REDIS_PASSWORD:-redis123}"

  log "配置Redis哨兵..."

  if ! cat > "$conf_file" << EOF
# Redis哨兵配置
port $sentinel_port
bind 0.0.0.0
sentinel deny-scripts-reconfig yes

# 监控主节点
sentinel monitor mymaster $MASTER_IP $redis_port $quorum
sentinel auth-pass mymaster $password
sentinel down-after-milliseconds mymaster 30000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 180000
sentinel notification-script mymaster /opt/redis-scripts/notify.sh
sentinel client-reconfig-script mymaster /opt/redis-scripts/reconfig.sh

# 日志配置
logfile /var/log/redis/sentinel.log
loglevel notice

# 其他配置
daemonize yes
pidfile /var/run/redis/sentinel.pid
dir /var/lib/redis
EOF
  then
    log "错误: 无法写入哨兵配置文件 $conf_file"
    return 1
  fi

  log "哨兵配置完成"
}

# 创建通知脚本
# Generate the two hook scripts referenced by the Sentinel configuration:
#   notify.sh   - sentinel notification-script
#   reconfig.sh - sentinel client-reconfig-script
# Arguments: $1 - output directory (optional, default /opt/redis-scripts)
# Returns:   1 when the directory or either script cannot be created.
#
# Fixes relative to the previous generated scripts:
#   * notify.sh called send_alert before the function was defined.
#   * notify.sh read $3..$6, but Sentinel passes only two arguments
#     (event type, event description).
#   * reconfig.sh read the addresses from $1..$4, but Sentinel passes
#     <master-name> <role> <state> <from-ip> <from-port> <to-ip> <to-port>.
create_notification_scripts() {
  local script_dir="${1:-/opt/redis-scripts}"

  log "创建哨兵通知脚本..."

  mkdir -p "$script_dir" || return 1

  cat > "$script_dir/notify.sh" << 'EOF' || return 1
#!/bin/bash
# Redis哨兵通知脚本
# @author 运维实战
#
# Sentinel invokes this script with exactly two arguments:
#   $1 - event type (e.g. +sdown, +odown, +switch-master)
#   $2 - event description as a single string:
#        "<instance-type> <name> <ip> <port> @ <master-name> <master-ip> <master-port>"
#        or, for +switch-master:
#        "<master-name> <old-ip> <old-port> <new-ip> <new-port>"

TYPE=$1
DETAILS=$2

# 发送告警函数 (defined before the dispatch below so it exists when called)
send_alert() {
  local subject="$1"
  local message="$2"

  # 发送邮件告警
  echo "$message" | mail -s "$subject" admin@example.com

  # 发送钉钉告警
  curl -X POST "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN" \
    -H 'Content-Type: application/json' \
    -d "{\"msgtype\":\"text\",\"text\":{\"content\":\"$subject\\n$message\"}}"
}

# Split the description into the fields used by the messages below.
if [ "$TYPE" = "+switch-master" ]; then
  read -r NAME MASTER_IP MASTER_PORT IP PORT _ <<< "$DETAILS"
else
  read -r INSTANCE_TYPE NAME IP PORT _ <<< "$DETAILS"
fi

# 日志记录
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Sentinel notification: $TYPE $DETAILS" >> /var/log/redis/sentinel-notify.log

# 根据事件类型处理
case "$TYPE" in
  +sdown)
    echo "主节点 $NAME 主观下线: $IP:$PORT"
    send_alert "Redis主节点主观下线" "主节点 $NAME ($IP:$PORT) 主观下线"
    ;;
  -sdown)
    echo "主节点 $NAME 主观上线: $IP:$PORT"
    ;;
  +odown)
    echo "主节点 $NAME 客观下线: $IP:$PORT"
    send_alert "Redis主节点客观下线" "主节点 $NAME ($IP:$PORT) 客观下线,开始故障转移"
    ;;
  +switch-master)
    echo "主节点切换: $NAME $MASTER_IP:$MASTER_PORT -> $IP:$PORT"
    send_alert "Redis主节点切换" "主节点已从 $MASTER_IP:$MASTER_PORT 切换到 $IP:$PORT"
    ;;
  +slave)
    echo "从节点上线: $NAME $IP:$PORT"
    ;;
  -slave)
    echo "从节点下线: $NAME $IP:$PORT"
    ;;
esac

# Always report success so a failed mail/curl does not make Sentinel retry.
exit 0
EOF

  chmod +x "$script_dir/notify.sh" || return 1

  cat > "$script_dir/reconfig.sh" << 'EOF' || return 1
#!/bin/bash
# Redis哨兵重配置脚本
# @author 运维实战
#
# Sentinel invokes this script as:
#   reconfig.sh <master-name> <role> <state> <from-ip> <from-port> <to-ip> <to-port>

MASTER_NAME=$1
ROLE=$2
STATE=$3
OLD_MASTER_IP=$4
OLD_MASTER_PORT=$5
MASTER_IP=$6
MASTER_PORT=$7

# 日志记录
echo "[$(date '+%Y-%m-%d %H:%M:%S')] Sentinel reconfig: $MASTER_IP:$MASTER_PORT" >> /var/log/redis/sentinel-reconfig.log

# 更新应用配置
update_application_config() {
  # Escape the dots so the old address is matched literally by sed.
  local old_pattern="${OLD_MASTER_IP//./\\.}:$OLD_MASTER_PORT"

  # 更新应用中的Redis连接配置
  sed -i "s/$old_pattern/$MASTER_IP:$MASTER_PORT/g" /opt/app/config/redis.conf

  # 重启应用服务
  systemctl restart application-service

  echo "应用配置已更新: $MASTER_IP:$MASTER_PORT"
}

# Only rewrite the application config when both addresses were supplied.
if [ -n "$OLD_MASTER_IP" ] && [ -n "$MASTER_IP" ]; then
  update_application_config
fi
EOF

  chmod +x "$script_dir/reconfig.sh" || return 1

  log "通知脚本创建完成"
}

# 启动服务
# Restart and enable the redis and redis-sentinel units, wait three seconds,
# then verify both are active. Exits the script with status 1 as soon as
# one of them is found inactive.
start_services() {
  log "启动Redis服务..."

  # Redis first, Sentinel second: restart, then enable at boot.
  local unit
  for unit in redis redis-sentinel; do
    systemctl restart "$unit"
    systemctl enable "$unit"
  done

  # Give both daemons a moment before probing their state.
  sleep 3

  if ! systemctl is-active --quiet redis; then
    log "错误: Redis服务启动失败"
    exit 1
  fi
  log "Redis服务启动成功"

  if ! systemctl is-active --quiet redis-sentinel; then
    log "错误: 哨兵服务启动失败"
    exit 1
  fi
  log "哨兵服务启动成功"
}

# 主函数
# Configure this host as a Redis master or replica, plus Sentinel.
# Arguments: $1 - role: "master", "slave" or "auto" (optional, default auto).
#                 In auto mode the host is treated as master when one of its
#                 addresses from `hostname -I` equals MASTER_IP, else replica.
# Returns:   1 on an unknown role.
#
# Previously both configure_master_node and configure_slave_node ran back to
# back; they write the same file, so the master configuration was always
# overwritten and every node (including the master) became a replica.
main() {
  local role="${1:-auto}"

  if [[ "$role" == "auto" ]]; then
    if [[ " $(hostname -I 2> /dev/null) " == *" $MASTER_IP "* ]]; then
      role="master"
    else
      role="slave"
    fi
  fi

  case "$role" in
    master | slave) ;;
    *)
      echo "用法: $0 [master|slave|auto]" >&2
      return 1
      ;;
  esac

  log "=== Redis主备配置开始 ==="
  backup_config_files
  if [[ "$role" == "master" ]]; then
    configure_master_node
  else
    configure_slave_node
  fi
  configure_sentinel
  create_notification_scripts
  start_services
  log "=== Redis主备配置完成 ==="
}

# Entry point: run main with the script's command-line arguments.
main "$@"

3. 主从复制管理

3.1 主从复制配置脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/bin/bash
# redis_replication_manager.sh - Redis主从复制管理脚本
# @author 运维实战

# 配置参数
# Connection parameters. Each value may be overridden from the environment;
# the defaults are the addresses and credentials used in the article.
MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"

# Timestamped logger. Every function in this script calls log, but the
# script never defined it, so each call failed with "command not found".
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 检查主从状态
# Report replication state as seen from the master (connected replica count
# and every INFO line containing "slave") and from the replica (link status
# and whether a sync is in progress).
# Globals written: MASTER_INFO, CONNECTED_SLAVES, SLAVE_INFO,
#                  MASTER_LINK_STATUS, MASTER_SYNC_IN_PROGRESS
check_replication_status() {
  log "检查Redis主从复制状态..."

  # Shared redis-cli prefixes for the two nodes.
  local -a master_cli=(redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD)
  local -a slave_cli=(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD)

  # Master side.
  if ! "${master_cli[@]}" ping > /dev/null 2>&1; then
    log "错误: 主节点连接失败: $MASTER_IP:$REDIS_PORT"
  else
    log "主节点连接正常: $MASTER_IP:$REDIS_PORT"

    MASTER_INFO=$("${master_cli[@]}" info replication)
    CONNECTED_SLAVES=$(grep "connected_slaves:" <<< "$MASTER_INFO" | cut -d: -f2 | tr -d '\r')
    log "主节点连接的从节点数: $CONNECTED_SLAVES"

    # Dump the per-replica lines verbatim.
    grep "slave" <<< "$MASTER_INFO"
  fi

  # Replica side.
  if ! "${slave_cli[@]}" ping > /dev/null 2>&1; then
    log "错误: 从节点连接失败: $SLAVE_IP:$REDIS_PORT"
  else
    log "从节点连接正常: $SLAVE_IP:$REDIS_PORT"

    SLAVE_INFO=$("${slave_cli[@]}" info replication)
    MASTER_LINK_STATUS=$(grep "master_link_status:" <<< "$SLAVE_INFO" | cut -d: -f2 | tr -d '\r')
    MASTER_SYNC_IN_PROGRESS=$(grep "master_sync_in_progress:" <<< "$SLAVE_INFO" | cut -d: -f2 | tr -d '\r')

    log "从节点主从连接状态: $MASTER_LINK_STATUS"
    log "从节点同步状态: $MASTER_SYNC_IN_PROGRESS"

    if [ "$MASTER_LINK_STATUS" != "up" ]; then
      log "警告: 从节点复制异常"
    else
      log "从节点复制正常"
    fi
  fi
}

# 建立主从关系
# Point the replica at the master unless its INFO output already reports
# role "slave"; after issuing REPLICAOF, wait for the sync to finish.
# Globals written: SLAVE_ROLE
setup_master_slave() {
  log "建立Redis主从关系..."

  local -a slave_cli=(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD)

  SLAVE_ROLE=$("${slave_cli[@]}" info replication | grep "role:" | cut -d: -f2 | tr -d '\r')

  # Already a replica: nothing to do.
  if [ "$SLAVE_ROLE" = "slave" ]; then
    log "从节点已配置为主从关系"
    return
  fi

  log "配置从节点为主从关系..."
  "${slave_cli[@]}" replicaof $MASTER_IP $REDIS_PORT

  wait_for_sync_completion

  log "主从关系建立完成"
}

# 等待同步完成
# Poll the replica every 10 seconds (up to 5 minutes) until replication is
# established.
# Globals written: MASTER_SYNC_IN_PROGRESS (last observed value)
# Returns: 0 once the link is up and no sync is in progress, 1 on timeout.
#
# The previous version only looked at master_sync_in_progress, which is also
# 0 before the replica has connected to the master, so it reported
# completion immediately after REPLICAOF. master_link_status must be "up"
# as well.
wait_for_sync_completion() {
  log "等待主从同步完成..."

  local max_wait=300 # give up after 5 minutes
  local interval=10
  local wait_time=0
  local info link_status

  while ((wait_time < max_wait)); do
    info=$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication | tr -d '\r')
    link_status=$(awk -F: '$1 == "master_link_status" {print $2}' <<< "$info")
    MASTER_SYNC_IN_PROGRESS=$(awk -F: '$1 == "master_sync_in_progress" {print $2}' <<< "$info")

    if [[ "$link_status" == "up" && "$MASTER_SYNC_IN_PROGRESS" == "0" ]]; then
      log "主从同步完成"
      return 0
    fi

    log "同步进行中,等待中... ($wait_time/$max_wait)"
    sleep "$interval"
    wait_time=$((wait_time + interval))
  done

  log "警告: 同步超时,请检查网络和配置"
  return 1
}

# 手动同步
# Ask the replica to re-synchronise with the master.
#
# The previous implementation ran `redis-cli ... sync`. redis-cli treats
# SYNC/PSYNC specially: it switches into replica output mode and streams
# the replication feed until interrupted, so the script hung and the
# replica itself was never told to resync. Re-issuing REPLICAOF makes the
# replica drop and re-establish its replication link instead.
# NOTE(review): with Sentinel running, the short "no one" window is visible
# to it - confirm this is acceptable in your environment.
manual_sync() {
  log "执行手动同步..."

  # Detach, then re-attach the replica to the configured master.
  redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" replicaof no one
  redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" replicaof "$MASTER_IP" "$REDIS_PORT"

  log "手动同步命令已发送"
}

# 断开主从关系
# Detach the replica from its master by sending "REPLICAOF NO ONE" to it.
break_master_slave() {
  log "断开主从关系..."

  local -a detach_cmd=(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD replicaof no one)
  "${detach_cmd[@]}"

  log "主从关系已断开"
}

# 重新建立主从关系
# Tear down the current replication link, pause five seconds, then set the
# master/replica relationship up again.
reestablish_master_slave() {
  local settle_seconds=5

  log "重新建立主从关系..."

  break_master_slave
  sleep "$settle_seconds"
  setup_master_slave
}

# 检查数据一致性
# Compare the DBSIZE reply of the master and the replica, log the result,
# then run the per-key comparison.
# Globals written: MASTER_KEYS, SLAVE_KEYS
check_data_consistency() {
  log "检查主从数据一致性..."

  MASTER_KEYS=$(redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD dbsize)
  log "主节点键数量: $MASTER_KEYS"

  SLAVE_KEYS=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD dbsize)
  log "从节点键数量: $SLAVE_KEYS"

  # Equal key counts are treated as a pass.
  if [[ "$MASTER_KEYS" != "$SLAVE_KEYS" ]]; then
    log "警告: 主从数据不一致 (主:$MASTER_KEYS, 从:$SLAVE_KEYS)"
  else
    log "主从数据一致性检查通过"
  fi

  check_key_consistency
}

# 检查关键键的一致性
# Compare the GET reply of every master key against the replica and log
# each key whose replies differ, followed by a summary.
#
# Changes relative to the previous version:
#   * keys are enumerated with `redis-cli --scan` (incremental SCAN) instead
#     of the blocking `KEYS "*"`;
#   * keys are read line by line, so names containing spaces or glob
#     characters are no longer split or expanded by the shell.
# NOTE(review): only GET is compared, as before; non-string keys yield the
# same error reply on both sides and therefore always compare equal.
check_key_consistency() {
  log "检查关键键的一致性..."

  local inconsistent_keys=0
  local key master_value slave_value

  while IFS= read -r key; do
    [[ -n "$key" ]] || continue

    # stdin is redirected so redis-cli cannot consume the key stream.
    master_value=$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" get "$key" < /dev/null)
    slave_value=$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" get "$key" < /dev/null)

    if [[ "$master_value" != "$slave_value" ]]; then
      log "警告: 键 $key 值不一致"
      inconsistent_keys=$((inconsistent_keys + 1))
    fi
  done < <(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" --scan)

  if ((inconsistent_keys == 0)); then
    log "所有键值一致性检查通过"
  else
    log "发现 $inconsistent_keys 个键值不一致"
  fi
}

# 主函数
# Dispatch the requested replication-management sub-command.
# Arguments: $1 - status | setup | sync | break | reestablish | consistency
# Returns:   the sub-command's status, or 1 (with usage on stderr) when the
#            command is missing or unknown. Previously an unknown command
#            printed the usage text but still returned success.
main() {
  case "${1:-}" in
    status) check_replication_status ;;
    setup) setup_master_slave ;;
    sync) manual_sync ;;
    break) break_master_slave ;;
    reestablish) reestablish_master_slave ;;
    consistency) check_data_consistency ;;
    *)
      {
        echo "用法: $0 {status|setup|sync|break|reestablish|consistency}"
        echo " status - 检查主从状态"
        echo " setup - 建立主从关系"
        echo " sync - 手动同步"
        echo " break - 断开主从关系"
        echo " reestablish - 重新建立主从关系"
        echo " consistency - 检查数据一致性"
      } >&2
      return 1
      ;;
  esac
}

# Entry point: run main with the script's command-line arguments.
main "$@"

3.2 数据同步监控脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/bin/bash
# redis_sync_monitor.sh - Redis数据同步监控脚本
# @author 运维实战

# 监控配置
# Monitoring parameters. Each value may be overridden from the environment;
# the defaults are the values used in the article.
MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"
MONITOR_INTERVAL="${MONITOR_INTERVAL:-30}"
LOG_FILE="${LOG_FILE:-/var/log/redis-sync-monitor.log}"

# Timestamped console logger. The functions below call log, but the script
# never defined it, so each call failed with "command not found".
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 记录监控数据
# Append one metric sample to LOG_FILE as
# "[timestamp] Node:<node> Metric:<metric> Value:<value>".
# Arguments: $1 - metric name, $2 - value, $3 - node label
log_sync_data() {
  local metric=$1 value=$2 node=$3
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  printf '[%s] Node:%s Metric:%s Value:%s\n' \
    "$timestamp" "$node" "$metric" "$value" >> $LOG_FILE
}

# 监控主从同步状态
# Periodically sample replication metrics from the master and the replica
# and record them through log_sync_data.
# Arguments: $1 - number of sampling rounds (optional; 0 or omitted = run
#                 forever, which is the original behaviour)
# Globals:   MASTER_IP, SLAVE_IP, REDIS_PORT, REDIS_PASSWORD, MONITOR_INTERVAL
#
# Fixes relative to the previous version:
#   * every value passed to log_sync_data is quoted, so an empty value no
#     longer shifts the node label into the value position;
#   * the offsets are reset each round, so a stale master offset from an
#     earlier round is not used to compute lag while the master is down;
#   * lag is only computed from numeric offsets.
monitor_replication_sync() {
  local max_iterations="${1:-0}"
  local iteration=0
  local master_info slave_info
  local connected_slaves master_offset slave_offset
  local link_status sync_in_progress last_io sync_lag

  log "开始监控Redis主从同步状态..."

  while true; do
    master_offset=""
    slave_offset=""

    # Master side.
    if redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" ping > /dev/null 2>&1; then
      master_info=$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication | tr -d '\r')

      connected_slaves=$(awk -F: '$1 == "connected_slaves" {print $2}' <<< "$master_info")
      master_offset=$(awk -F: '$1 == "master_repl_offset" {print $2}' <<< "$master_info")

      log_sync_data "connected_slaves" "$connected_slaves" "master"
      log_sync_data "master_repl_offset" "$master_offset" "master"
    else
      log_sync_data "connection_status" "down" "master"
    fi

    # Replica side.
    if redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" ping > /dev/null 2>&1; then
      slave_info=$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication | tr -d '\r')

      link_status=$(awk -F: '$1 == "master_link_status" {print $2}' <<< "$slave_info")
      sync_in_progress=$(awk -F: '$1 == "master_sync_in_progress" {print $2}' <<< "$slave_info")
      slave_offset=$(awk -F: '$1 == "slave_repl_offset" {print $2}' <<< "$slave_info")
      last_io=$(awk -F: '$1 == "master_last_io_seconds_ago" {print $2}' <<< "$slave_info")

      log_sync_data "master_link_status" "$link_status" "slave"
      log_sync_data "master_sync_in_progress" "$sync_in_progress" "slave"
      log_sync_data "slave_repl_offset" "$slave_offset" "slave"
      log_sync_data "master_last_io_seconds_ago" "$last_io" "slave"

      # Replication lag in bytes; flagged when above 1000.
      if [[ "$master_offset" =~ ^[0-9]+$ && "$slave_offset" =~ ^[0-9]+$ ]]; then
        sync_lag=$((master_offset - slave_offset))
        log_sync_data "sync_lag" "$sync_lag" "slave"

        if ((sync_lag > 1000)); then
          log_sync_data "sync_lag_alert" "high" "slave"
        fi
      fi
    else
      log_sync_data "connection_status" "down" "slave"
    fi

    iteration=$((iteration + 1))
    if ((max_iterations > 0 && iteration >= max_iterations)); then
      return 0
    fi

    sleep "$MONITOR_INTERVAL"
  done
}

# 生成同步报告
# Write a dated replication report under /var/log containing the
# "info replication" output of both nodes, the byte lag between their
# replication offsets and their DBSIZE replies.
# Globals written: MASTER_OFFSET, SLAVE_OFFSET, SYNC_LAG, MASTER_KEYS, SLAVE_KEYS
generate_sync_report() {
  local report_file="/var/log/redis-sync-report-$(date +%Y%m%d).txt"

  log "生成Redis同步报告: $report_file"

  # Header: starts a fresh file, replacing any earlier report of the day.
  {
    echo "Redis主从同步报告"
    echo "生成时间: $(date)"
    echo "========================================"
    echo ""
  } > $report_file

  # One section per node, master first.
  local label host
  for label in 主节点 从节点; do
    if [ "$label" = "主节点" ]; then
      host=$MASTER_IP
    else
      host=$SLAVE_IP
    fi
    {
      echo "$label ($host:$REDIS_PORT) 信息:"
      echo "----------------------------------------"
      redis-cli -h $host -p $REDIS_PORT -a $REDIS_PASSWORD info replication
      echo ""
    } >> $report_file
  done

  {
    echo "同步统计:"
    echo "----------------------------------------"
  } >> $report_file

  # Lag = master offset minus replica offset, when both are available.
  MASTER_OFFSET=$(redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication | grep "master_repl_offset:" | cut -d: -f2 | tr -d '\r')
  SLAVE_OFFSET=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication | grep "slave_repl_offset:" | cut -d: -f2 | tr -d '\r')

  if [ "$MASTER_OFFSET" != "" ] && [ "$SLAVE_OFFSET" != "" ]; then
    SYNC_LAG=$((MASTER_OFFSET - SLAVE_OFFSET))
    echo "同步延迟: $SYNC_LAG 字节" >> $report_file
  fi

  # Key counts of both nodes.
  MASTER_KEYS=$(redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD dbsize)
  SLAVE_KEYS=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD dbsize)
  {
    echo "主节点键数量: $MASTER_KEYS"
    echo "从节点键数量: $SLAVE_KEYS"
  } >> $report_file

  log "同步报告生成完成: $report_file"
}

# 设置同步告警
# Install and launch the replication alert daemon.
# Writes /opt/redis-sync-alert.sh (the heredoc below, emitted verbatim because
# the delimiter is quoted), marks it executable and starts it in the
# background under nohup with its output redirected to
# /var/log/redis-sync-alert.log.
# The generated script carries its own copies of the connection settings and
# runs its checks every 300 seconds.
# NOTE(review): each call starts another background instance; nothing here
# checks whether one is already running.
setup_sync_alerts() {
log "设置Redis同步告警..."

# Write the alert script to disk.
cat > /opt/redis-sync-alert.sh << 'EOF'
#!/bin/bash
# Redis同步告警脚本

MASTER_IP="192.168.1.10"
SLAVE_IP="192.168.1.11"
REDIS_PORT=6379
REDIS_PASSWORD="redis123"

check_sync_alerts() {
# 检查主从连接状态
if ! redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD ping > /dev/null 2>&1; then
echo "告警: 主节点连接失败 ($MASTER_IP:$REDIS_PORT)"
return
fi

if ! redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD ping > /dev/null 2>&1; then
echo "告警: 从节点连接失败 ($SLAVE_IP:$REDIS_PORT)"
return
fi

# 检查从节点主从连接状态
MASTER_LINK_STATUS=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication | grep "master_link_status:" | cut -d: -f2 | tr -d '\r')

if [ "$MASTER_LINK_STATUS" != "up" ]; then
echo "告警: 从节点主从连接异常 ($SLAVE_IP:$REDIS_PORT)"
fi

# 检查同步延迟
MASTER_OFFSET=$(redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication | grep "master_repl_offset:" | cut -d: -f2 | tr -d '\r')
SLAVE_OFFSET=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication | grep "slave_repl_offset:" | cut -d: -f2 | tr -d '\r')

if [ "$MASTER_OFFSET" != "" ] && [ "$SLAVE_OFFSET" != "" ]; then
SYNC_LAG=$((MASTER_OFFSET - SLAVE_OFFSET))

if [ $SYNC_LAG -gt 10000 ]; then
echo "告警: 同步延迟过高 ($SYNC_LAG 字节)"
fi
fi

# 检查最后IO时间
MASTER_LAST_IO=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication | grep "master_last_io_seconds_ago:" | cut -d: -f2 | tr -d '\r')

if [ "$MASTER_LAST_IO" != "" ] && [ $MASTER_LAST_IO -gt 60 ]; then
echo "告警: 从节点最后IO时间过长 ($MASTER_LAST_IO 秒)"
fi
}

# 每5分钟检查一次
while true; do
check_sync_alerts
sleep 300
done
EOF

chmod +x /opt/redis-sync-alert.sh

# Launch the alert loop in the background, detached from this shell.
nohup /opt/redis-sync-alert.sh > /var/log/redis-sync-alert.log 2>&1 &

log "同步告警设置完成"
}

# 主函数
# Dispatch the requested sync-monitoring sub-command.
# Arguments: $1 - monitor | report | alerts
# Returns:   the sub-command's status, or 1 (with usage on stderr) when the
#            command is missing or unknown. Previously an unknown command
#            printed the usage text but still returned success.
main() {
  case "${1:-}" in
    monitor) monitor_replication_sync ;;
    report) generate_sync_report ;;
    alerts) setup_sync_alerts ;;
    *)
      {
        echo "用法: $0 {monitor|report|alerts}"
        echo " monitor - 开始同步监控"
        echo " report - 生成同步报告"
        echo " alerts - 设置同步告警"
      } >&2
      return 1
      ;;
  esac
}

# Entry point: run main with the script's command-line arguments.
main "$@"

4. 哨兵模式管理

4.1 哨兵部署脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/bin/bash
# redis_sentinel_deploy.sh - Redis哨兵部署脚本
# @author 运维实战

# 哨兵配置
# Sentinel deployment parameters. Scalar values may be overridden from the
# environment; the defaults are the values used in the article.
SENTINEL_PORTS=(26379 26380 26381)
MASTER_IP="${MASTER_IP:-192.168.1.10}"
MASTER_PORT="${MASTER_PORT:-6379}"
MASTER_PASSWORD="${MASTER_PASSWORD:-redis123}"
QUORUM="${QUORUM:-2}"

# Timestamped logger. Every function in this script calls log, but the
# script never defined it, so each call failed with "command not found".
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 部署哨兵节点
# For every port in SENTINEL_PORTS: create /opt/redis-sentinel/<port>, write
# a sentinel.conf that monitors "mymaster" and start redis-sentinel with it.
# Exits the script with status 1 as soon as one instance fails to start;
# sleeps five seconds after the last instance was launched.
deploy_sentinel_nodes() {
  log "部署Redis哨兵节点..."

  local sentinel_port node_name node_dir
  for sentinel_port in "${SENTINEL_PORTS[@]}"; do
    node_name="sentinel-$sentinel_port"
    node_dir="/opt/redis-sentinel/$sentinel_port"

    log "部署哨兵节点 $node_name (端口: $sentinel_port)..."

    # Per-instance working directory; also holds the log and pid file.
    mkdir -p $node_dir

    # The announce-ip value is the first address printed by `hostname -I`.
    cat > $node_dir/sentinel.conf << EOF
# Redis哨兵配置文件
port $sentinel_port
bind 0.0.0.0
sentinel deny-scripts-reconfig yes

# 监控主节点
sentinel monitor mymaster $MASTER_IP $MASTER_PORT $QUORUM
sentinel auth-pass mymaster $MASTER_PASSWORD
sentinel down-after-milliseconds mymaster 30000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 180000
sentinel notification-script mymaster /opt/redis-scripts/notify.sh
sentinel client-reconfig-script mymaster /opt/redis-scripts/reconfig.sh

# 哨兵配置
sentinel announce-ip $(hostname -I | awk '{print $1}')
sentinel announce-port $sentinel_port

# 日志配置
logfile $node_dir/sentinel.log
loglevel notice

# 其他配置
daemonize yes
pidfile $node_dir/sentinel.pid
dir $node_dir
EOF

    if ! redis-sentinel $node_dir/sentinel.conf; then
      log "错误: 哨兵节点 $node_name 启动失败"
      exit 1
    fi
    log "哨兵节点 $node_name 启动成功"
  done

  # Give the freshly started instances time to come up.
  sleep 5
}

# 检查哨兵状态
# For each configured Sentinel port on localhost: ping it and, when it
# answers, print its "sentinel masters" and "sentinel sentinels mymaster"
# replies.
# Globals written: SENTINEL_INFO, SENTINEL_LIST
check_sentinel_status() {
  log "检查Redis哨兵状态..."

  local port
  for port in "${SENTINEL_PORTS[@]}"; do
    log "检查哨兵节点 $port..."

    # Unreachable instance: report and move on to the next port.
    if ! redis-cli -p $port ping > /dev/null 2>&1; then
      log "错误: 哨兵节点 $port 连接失败"
      continue
    fi

    log "哨兵节点 $port 运行正常"

    SENTINEL_INFO=$(redis-cli -p $port sentinel masters)
    log "哨兵 $port 监控的主节点信息:"
    echo "$SENTINEL_INFO"

    SENTINEL_LIST=$(redis-cli -p $port sentinel sentinels mymaster)
    log "哨兵 $port 哨兵列表:"
    echo "$SENTINEL_LIST"
  done
}

# 测试故障转移
# Simulate a master outage (DEBUG SLEEP 60 on the current master) and wait
# up to 5 minutes for Sentinel to promote a different master.
# Globals written: CURRENT_MASTER, NEW_MASTER
# Returns: 0 when a new master was observed, 1 when the current master
#          cannot be determined or the wait times out.
#
# The previous version extracted the address with
# `sentinel masters | grep mymaster | awk '{print $4}'`. redis-cli prints one
# field per line, so that pipeline always produced an empty string. The
# address is now read with `sentinel get-master-addr-by-name`, and an empty
# reply is never treated as "master changed".
test_failover() {
  log "测试Redis故障转移..."

  local sentinel_port=${SENTINEL_PORTS[0]}

  # First reply line is the master IP, second line the port.
  CURRENT_MASTER=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name mymaster | head -n 1 | tr -d '\r')
  if [[ -z "$CURRENT_MASTER" ]]; then
    log "错误: 无法从哨兵获取当前主节点"
    return 1
  fi
  log "当前主节点: $CURRENT_MASTER"

  # Make the master unresponsive in the background.
  log "模拟主节点故障..."
  redis-cli -h "$CURRENT_MASTER" -p "$MASTER_PORT" -a "$MASTER_PASSWORD" debug sleep 60 &

  log "等待故障转移..."
  local max_wait=300
  local wait_time=0

  while ((wait_time < max_wait)); do
    NEW_MASTER=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name mymaster | head -n 1 | tr -d '\r')

    if [[ -n "$NEW_MASTER" && "$NEW_MASTER" != "$CURRENT_MASTER" ]]; then
      log "故障转移成功: $CURRENT_MASTER -> $NEW_MASTER"
      return 0
    fi

    log "故障转移进行中,等待中... ($wait_time/$max_wait)"
    sleep 10
    wait_time=$((wait_time + 10))
  done

  log "警告: 故障转移超时"
  return 1
}

# 手动故障转移
# Ask the first configured Sentinel to fail over "mymaster" and log whether
# redis-cli reported success.
manual_failover() {
  log "执行手动故障转移..."

  # The request always goes to the first Sentinel in the list.
  if redis-cli -p ${SENTINEL_PORTS[0]} sentinel failover mymaster; then
    log "手动故障转移命令已发送"
  else
    log "错误: 手动故障转移失败"
  fi
}

# 添加哨兵节点
# Create and start one additional Sentinel instance on this host.
# Arguments: $1 - port for the new Sentinel (required, 1-65535)
#            $2 - port of an existing Sentinel; accepted for compatibility
#                 but not used by this function
# Returns:   1 on an invalid port or when the config cannot be written or
#            the instance cannot be started.
#
# Previously a missing port silently produced /opt/redis-sentinel//sentinel.conf
# with an empty "port" directive.
add_sentinel_node() {
  local new_port=$1
  local existing_port=$2 # unused; kept so existing callers keep working

  if [[ ! "$new_port" =~ ^[0-9]+$ ]] || ((new_port < 1 || new_port > 65535)); then
    log "错误: 无效的哨兵端口: '$new_port'" >&2
    return 1
  fi

  log "添加哨兵节点 $new_port..."

  local sentinel_dir="/opt/redis-sentinel/$new_port"
  mkdir -p "$sentinel_dir" || return 1

  cat > "$sentinel_dir/sentinel.conf" << EOF || return 1
port $new_port
bind 0.0.0.0
sentinel deny-scripts-reconfig yes

sentinel monitor mymaster $MASTER_IP $MASTER_PORT $QUORUM
sentinel auth-pass mymaster $MASTER_PASSWORD
sentinel down-after-milliseconds mymaster 30000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 180000

logfile $sentinel_dir/sentinel.log
loglevel notice
daemonize yes
pidfile $sentinel_dir/sentinel.pid
dir $sentinel_dir
EOF

  if ! redis-sentinel "$sentinel_dir/sentinel.conf"; then
    log "错误: 哨兵节点 $new_port 启动失败"
    return 1
  fi

  # Leave time for the new instance to discover its peers.
  sleep 10

  log "哨兵节点 $new_port 添加完成"
}

# 移除哨兵节点
# Shut down the Sentinel listening on the given port and delete its
# directory under /opt/redis-sentinel.
# Arguments: $1 - Sentinel port (required, digits only)
# Returns:   1 when the port is missing or not numeric.
#
# Previously a missing argument expanded the delete path to
# "/opt/redis-sentinel/", wiping the data of every Sentinel instance.
remove_sentinel_node() {
  local port=$1

  if [[ ! "$port" =~ ^[0-9]+$ ]]; then
    log "错误: 无效的哨兵端口: '$port'" >&2
    return 1
  fi

  log "移除哨兵节点 $port..."

  # Stop the instance first, then remove its files.
  redis-cli -p "$port" shutdown

  # ${port:?} is a second guard against an empty path component.
  rm -rf -- "/opt/redis-sentinel/${port:?}"

  log "哨兵节点 $port 移除完成"
}

# 主函数
# Dispatch the requested Sentinel deployment sub-command.
# Arguments: $1 - deploy | status | test | failover | add | remove
#            $2, $3 - passed through to add (new port, existing port)
#                     and remove (port)
# Returns:   the sub-command's status, or 1 (with usage on stderr) when the
#            command is missing or unknown. Previously an unknown command
#            printed the usage text but still returned success, and the
#            pass-through arguments were unquoted.
main() {
  case "${1:-}" in
    deploy) deploy_sentinel_nodes ;;
    status) check_sentinel_status ;;
    test) test_failover ;;
    failover) manual_failover ;;
    add) add_sentinel_node "${2:-}" "${3:-}" ;;
    remove) remove_sentinel_node "${2:-}" ;;
    *)
      {
        echo "用法: $0 {deploy|status|test|failover|add|remove}"
        echo " deploy - 部署哨兵节点"
        echo " status - 检查哨兵状态"
        echo " test - 测试故障转移"
        echo " failover - 手动故障转移"
        echo " add <new_port> <existing_port> - 添加哨兵节点"
        echo " remove <port> - 移除哨兵节点"
      } >&2
      return 1
      ;;
  esac
}

# Entry point: run main with the script's command-line arguments.
main "$@"

4.2 哨兵监控脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/bin/bash
# redis_sentinel_monitor.sh - Redis哨兵监控脚本
# @author 运维实战

# 哨兵配置
# Sentinel monitoring parameters. Scalar values may be overridden from the
# environment; the defaults are the values used in the article.
SENTINEL_PORTS=(26379 26380 26381)
MONITOR_INTERVAL="${MONITOR_INTERVAL:-30}"
LOG_FILE="${LOG_FILE:-/var/log/redis-sentinel-monitor.log}"

# Timestamped console logger. The functions below call log, but the script
# never defined it, so each call failed with "command not found".
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 记录监控数据
# Append one metric sample to LOG_FILE as
# "[timestamp] Sentinel:<sentinel> Metric:<metric> Value:<value>".
# Arguments: $1 - metric name, $2 - value, $3 - sentinel identifier
log_sentinel_data() {
  local metric=$1 value=$2 sentinel=$3
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  printf '[%s] Sentinel:%s Metric:%s Value:%s\n' \
    "$timestamp" "$sentinel" "$metric" "$value" >> $LOG_FILE
}

# 监控哨兵状态
# Periodically sample every configured Sentinel and record, through
# log_sentinel_data, whether it is up, how many masters it monitors and the
# status / replica count / sentinel count it reports for "mymaster".
# Arguments: $1 - number of sampling rounds (optional; 0 or omitted = run
#                 forever, which is the original behaviour)
# Globals:   SENTINEL_PORTS, MONITOR_INTERVAL
#
# Fixes relative to the previous version:
#   * "slaves:" and "sentinels:" lines do not exist in INFO sentinel; the
#     counts are now taken from the "masterN:name=mymaster,..." line;
#   * master status came from `sentinel masters | grep | awk '{print $6}'`,
#     which is always empty because redis-cli prints one field per line;
#   * values are quoted, so an empty one no longer shifts the port into the
#     value position of log_sentinel_data.
monitor_sentinel_status() {
  local max_iterations="${1:-0}"
  local iteration=0
  local port info master_line masters slaves sentinels master_status

  log "开始监控Redis哨兵状态..."

  while true; do
    for port in "${SENTINEL_PORTS[@]}"; do
      if ! redis-cli -p "$port" ping > /dev/null 2>&1; then
        log_sentinel_data "status" "down" "$port"
        continue
      fi
      log_sentinel_data "status" "up" "$port"

      info=$(redis-cli -p "$port" info sentinel | tr -d '\r')

      masters=$(awk -F: '$1 == "sentinel_masters" {print $2}' <<< "$info")
      # e.g. master0:name=mymaster,status=ok,address=ip:port,slaves=1,sentinels=3
      master_line=$(grep -E '^master[0-9]+:name=mymaster,' <<< "$info")
      slaves=$(sed -n 's/.*,slaves=\([0-9]*\).*/\1/p' <<< "$master_line")
      sentinels=$(sed -n 's/.*,sentinels=\([0-9]*\).*/\1/p' <<< "$master_line")

      log_sentinel_data "masters" "$masters" "$port"
      log_sentinel_data "slaves" "$slaves" "$port"
      log_sentinel_data "sentinels" "$sentinels" "$port"

      if [[ -n "$master_line" ]]; then
        master_status=$(sed -n 's/.*,status=\([^,]*\).*/\1/p' <<< "$master_line")
        log_sentinel_data "master_status" "$master_status" "$port"
      fi
    done

    iteration=$((iteration + 1))
    if ((max_iterations > 0 && iteration >= max_iterations)); then
      return 0
    fi

    sleep "$MONITOR_INTERVAL"
  done
}

# 生成哨兵报告
# Write a dated Sentinel report under /var/log. For every configured port it
# records "info sentinel" plus the masters / replicas / sentinels listings,
# or a connection-failure line when the instance does not answer a ping.
generate_sentinel_report() {
  local report_file="/var/log/redis-sentinel-report-$(date +%Y%m%d).txt"

  log "生成Redis哨兵报告: $report_file"

  # Header: starts a fresh file, replacing any earlier report of the day.
  {
    echo "Redis哨兵监控报告"
    echo "生成时间: $(date)"
    echo "========================================"
    echo ""
  } > $report_file

  local port
  for port in "${SENTINEL_PORTS[@]}"; do
    {
      echo "哨兵节点 $port 信息:"
      echo "----------------------------------------"

      if ! redis-cli -p $port ping > /dev/null 2>&1; then
        echo "哨兵节点 $port 连接失败"
        echo ""
      else
        redis-cli -p $port info sentinel
        echo ""

        # Monitored masters.
        echo "监控的主节点信息:"
        redis-cli -p $port sentinel masters
        echo ""

        # Replicas of "mymaster".
        echo "监控的从节点信息:"
        redis-cli -p $port sentinel slaves mymaster
        echo ""

        # Peer sentinels of "mymaster".
        echo "哨兵节点信息:"
        redis-cli -p $port sentinel sentinels mymaster
        echo ""
      fi
    } >> $report_file
  done

  log "哨兵报告生成完成: $report_file"
}

# 设置哨兵告警
# Install and launch the Sentinel alert daemon.
# Arguments: $1 - path of the generated script
#                 (optional, default /opt/redis-sentinel-alert.sh)
#            $2 - file receiving the daemon's output
#                 (optional, default /var/log/redis-sentinel-alert.log)
# Returns:   1 when the script cannot be written.
#
# Fixes in the generated script:
#   * replica and sentinel counts were taken from `... | wc -l`, i.e. the
#     number of output lines rather than the number of instances;
#   * master status came from `sentinel masters | grep | awk '{print $6}'`,
#     which is always empty because redis-cli prints one field per line.
#   Both are now read from the "masterN:name=...,status=...,slaves=N,
#   sentinels=N" line of INFO sentinel.
setup_sentinel_alerts() {
  local alert_script="${1:-/opt/redis-sentinel-alert.sh}"
  local alert_log="${2:-/var/log/redis-sentinel-alert.log}"

  log "设置Redis哨兵告警..."

  cat > "$alert_script" << 'EOF' || return 1
#!/bin/bash
# Redis哨兵告警脚本

SENTINEL_PORTS=(26379 26380 26381)
MASTER_NAME="mymaster"

check_sentinel_alerts() {
  local port info master_line status slaves sentinels others

  # 检查哨兵节点状态
  for port in "${SENTINEL_PORTS[@]}"; do
    if ! redis-cli -p "$port" ping > /dev/null 2>&1; then
      echo "告警: 哨兵节点 $port 连接失败"
    fi
  done

  # e.g. master0:name=mymaster,status=ok,address=ip:port,slaves=1,sentinels=3
  info=$(redis-cli -p "${SENTINEL_PORTS[0]}" info sentinel 2> /dev/null | tr -d '\r')
  master_line=$(grep -E "^master[0-9]+:name=${MASTER_NAME}," <<< "$info")
  if [ -z "$master_line" ]; then
    echo "告警: 无法获取主节点 ${MASTER_NAME} 的哨兵信息"
    return
  fi

  # 检查主节点状态
  status=$(sed -n 's/.*,status=\([^,]*\).*/\1/p' <<< "$master_line")
  if [ "$status" != "ok" ]; then
    echo "告警: 主节点状态异常 ($status)"
  fi

  # 检查从节点数量
  slaves=$(sed -n 's/.*,slaves=\([0-9]*\).*/\1/p' <<< "$master_line")
  if [ "${slaves:-0}" -lt 1 ]; then
    echo "告警: 从节点数量不足 (${slaves:-0})"
  fi

  # 检查哨兵数量 (the reported total includes the queried sentinel itself)
  sentinels=$(sed -n 's/.*,sentinels=\([0-9]*\).*/\1/p' <<< "$master_line")
  others=$((${sentinels:-1} - 1))
  if [ "$others" -lt 2 ]; then
    echo "告警: 哨兵数量不足 ($others)"
  fi
}

# 每5分钟检查一次
while true; do
  check_sentinel_alerts
  sleep 300
done
EOF

  chmod +x "$alert_script" || return 1

  # Launch the alert loop in the background, detached from this shell.
  nohup "$alert_script" > "$alert_log" 2>&1 &

  log "哨兵告警设置完成"
}

# 主函数
# Dispatch the requested Sentinel-monitoring sub-command.
# Arguments: $1 - monitor | report | alerts
# Returns:   the sub-command's status, or 1 (with usage on stderr) when the
#            command is missing or unknown. Previously an unknown command
#            printed the usage text but still returned success.
main() {
  case "${1:-}" in
    monitor) monitor_sentinel_status ;;
    report) generate_sentinel_report ;;
    alerts) setup_sentinel_alerts ;;
    *)
      {
        echo "用法: $0 {monitor|report|alerts}"
        echo " monitor - 开始哨兵监控"
        echo " report - 生成哨兵报告"
        echo " alerts - 设置哨兵告警"
      } >&2
      return 1
      ;;
  esac
}

# Entry point: run main with the script's command-line arguments.
main "$@"

5. 故障转移管理

5.1 故障转移脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/bin/bash
# redis_failover_manager.sh - Redis故障转移管理脚本
# @author 运维实战

# 配置参数
# Failover-manager parameters. Scalar values may be overridden from the
# environment; the defaults are the values used in the article.
SENTINEL_PORTS=(26379 26380 26381)
MASTER_NAME="${MASTER_NAME:-mymaster}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"

# Timestamped logger. Every function in this script calls log, but the
# script never defined it, so each call failed with "command not found".
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 检查故障转移状态
# Print the current master address and status as reported by the first
# configured Sentinel, followed by its replica and sentinel listings.
# Globals written: CURRENT_MASTER, MASTER_STATUS, SLAVE_INFO, SENTINEL_INFO
# Returns: 1 when the Sentinel does not report a master address.
#
# The address and status used to be extracted with
# `sentinel masters | grep NAME | awk '{print $4}'` / `'{print $6}'`.
# redis-cli prints one field per line, so both were always empty. The
# address now comes from `sentinel get-master-addr-by-name` and the status
# from the "masterN:name=...,status=..." line of INFO sentinel.
check_failover_status() {
  log "检查Redis故障转移状态..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local info

  # First reply line is the master IP, second line the port.
  CURRENT_MASTER=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1 | tr -d '\r')
  if [[ -z "$CURRENT_MASTER" ]]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  info=$(redis-cli -p "$sentinel_port" info sentinel | tr -d '\r')
  MASTER_STATUS=$(sed -n "s/^master[0-9]*:name=${MASTER_NAME},status=\([^,]*\).*/\1/p" <<< "$info")

  log "当前主节点: $CURRENT_MASTER"
  log "主节点状态: $MASTER_STATUS"

  SLAVE_INFO=$(redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME")
  log "从节点信息:"
  echo "$SLAVE_INFO"

  SENTINEL_INFO=$(redis-cli -p "$sentinel_port" sentinel sentinels "$MASTER_NAME")
  log "哨兵信息:"
  echo "$SENTINEL_INFO"
}

# Ask Sentinel to fail over MASTER_NAME and wait for the master to change.
# Globals: SENTINEL_PORTS, MASTER_NAME (read); CURRENT_MASTER (written)
# Returns: non-zero when the master cannot be determined, Sentinel does not
#          answer OK, or the wait step fails.
manual_failover() {
  log "执行手动故障转移..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local reply

  # GET-MASTER-ADDR-BY-NAME prints the ip on the first line and the port on the second.
  CURRENT_MASTER=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1)
  if [ -z "$CURRENT_MASTER" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi
  log "当前主节点: $CURRENT_MASTER"

  # redis-cli exits 0 even when the server answers with an error reply, so the
  # reply text is what tells success from failure.
  reply=$(redis-cli -p "$sentinel_port" sentinel failover "$MASTER_NAME")

  if [ "$reply" = "OK" ]; then
    log "故障转移命令已发送"
    wait_for_failover_completion "$CURRENT_MASTER"
  else
    log "错误: 故障转移命令失败"
    [ -z "$reply" ] || log "Sentinel 返回: $reply"
    return 1
  fi
}

# Poll Sentinel until it reports a master different from the old one, then
# update the application configuration.
# Arguments:
#   $1 - address of the master before the failover
#   $2 - optional timeout in seconds (default 300)
#   $3 - optional poll interval in seconds (default 10)
# Globals: SENTINEL_PORTS, MASTER_NAME (read); NEW_MASTER (written)
# Returns: 0 once a new master is seen, 1 on timeout.
wait_for_failover_completion() {
  local old_master=$1
  local max_wait=${2:-300}
  local poll_interval=${3:-10}
  local wait_time=0

  log "等待故障转移完成..."

  while [ "$wait_time" -lt "$max_wait" ]; do
    # First line of the reply is the ip, second line the port.
    NEW_MASTER=$(redis-cli -p "${SENTINEL_PORTS[0]}" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1)

    # An empty answer means Sentinel could not be queried, not that the master
    # changed; treating it as success would push an empty host into the configs.
    if [ -n "$NEW_MASTER" ] && [ "$NEW_MASTER" != "$old_master" ]; then
      log "故障转移成功: $old_master -> $NEW_MASTER"
      update_application_config "$NEW_MASTER"
      return 0
    fi

    log "故障转移进行中,等待中... ($wait_time/$max_wait)"
    sleep "$poll_interval"
    wait_time=$((wait_time + poll_interval))
  done

  log "警告: 故障转移超时"
  return 1
}

# Point the application config, the nginx upstream and the application service
# at a new Redis master.
# Arguments: $1 - host name or IP address of the new master
# Returns: 1 for an empty/malformed address or when any update step fails, else 0.
update_application_config() {
  local new_master=$1
  local host_re='^[A-Za-z0-9._:-]+$'
  local rc=0

  # The value is interpolated into sed expressions below: an empty string would
  # blank out the configured host and other characters could alter the expression.
  if [[ ! $new_master =~ $host_re ]]; then
    log "错误: 无效的主节点地址: '$new_master'"
    return 1
  fi

  log "更新应用配置: $new_master"

  # Application configuration file
  if [ -f "/opt/app/config/redis.conf" ]; then
    if sed -i "s/^redis\.host=.*/redis.host=$new_master/" /opt/app/config/redis.conf; then
      log "应用配置文件已更新"
    else
      log "错误: 应用配置文件更新失败"
      rc=1
    fi
  fi

  # Load balancer configuration; reload only after the file was rewritten.
  if [ -f "/etc/nginx/conf.d/redis.conf" ]; then
    if sed -i "s/server .*:6379/server $new_master:6379/" /etc/nginx/conf.d/redis.conf \
      && nginx -s reload; then
      log "负载均衡器配置已更新"
    else
      log "错误: 负载均衡器配置更新失败"
      rc=1
    fi
  fi

  # Restart the application service only if it is currently running.
  if systemctl is-active --quiet application-service; then
    if systemctl restart application-service; then
      log "应用服务已重启"
    else
      log "错误: 应用服务重启失败"
      rc=1
    fi
  fi

  return "$rc"
}

# Run a manual failover as a drill, time it, and verify the resulting topology.
# Globals: SENTINEL_PORTS, MASTER_NAME (read); CURRENT_MASTER (written)
# Returns: non-zero when the failover or the verification step fails.
test_failover() {
  log "测试Redis故障转移..."

  # Declared separately from the assignments so a failing command is not masked
  # by the exit status of 'local'.
  local test_start test_end test_duration
  local failover_rc=0
  test_start=$(date +%s)

  # First line of GET-MASTER-ADDR-BY-NAME is the master ip.
  CURRENT_MASTER=$(redis-cli -p "${SENTINEL_PORTS[0]}" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1)
  log "测试前主节点: $CURRENT_MASTER"

  manual_failover || failover_rc=$?

  test_end=$(date +%s)
  test_duration=$((test_end - test_start))

  log "故障转移测试完成,耗时: ${test_duration}秒"

  # Verification runs even after a failed failover so the current state is reported.
  verify_failover_result || return 1
  return "$failover_rc"
}

# Check that the master and every replica known to Sentinel answer PING, then
# run the data consistency check.
# Globals: SENTINEL_PORTS, MASTER_NAME, REDIS_PASSWORD (read);
#          NEW_MASTER, SLAVE_INFO, SLAVE_IP, SLAVE_PORT (written)
# Returns: 0 when every node answered and the consistency check passed.
verify_failover_result() {
  log "验证故障转移结果..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local master_addr master_port
  local failures=0

  # Reply is two lines: ip, then port.
  master_addr=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME")
  { read -r NEW_MASTER; read -r master_port; } <<< "$master_addr"

  if [ -z "$NEW_MASTER" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  # redis-cli exits 0 on error replies too, so compare the reply with PONG.
  if [ "$(redis-cli -h "$NEW_MASTER" -p "${master_port:-6379}" -a "$REDIS_PASSWORD" ping 2> /dev/null)" = "PONG" ]; then
    log "新主节点 $NEW_MASTER 连接正常"
  else
    log "错误: 新主节点 $NEW_MASTER 连接失败"
    failures=$((failures + 1))
  fi

  SLAVE_INFO=$(redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME")

  # The captured reply is a flat key/value listing, one field per line; awk
  # pairs each value with the key on the line before it and emits "ip port".
  while read -r SLAVE_IP SLAVE_PORT; do
    [ -n "$SLAVE_IP" ] || continue

    if [ "$(redis-cli -h "$SLAVE_IP" -p "$SLAVE_PORT" -a "$REDIS_PASSWORD" ping 2> /dev/null)" = "PONG" ]; then
      log "从节点 $SLAVE_IP:$SLAVE_PORT 连接正常"
    else
      log "错误: 从节点 $SLAVE_IP:$SLAVE_PORT 连接失败"
      failures=$((failures + 1))
    fi
  done < <(awk 'NR % 2 { key = $0; next }
                key == "ip" { ip = $0 }
                key == "port" { print ip, $0 }' <<< "$SLAVE_INFO")

  check_data_consistency_after_failover || failures=$((failures + 1))

  [ "$failures" -eq 0 ]
}

# Compare the key count (DBSIZE) of the master with that of every replica.
# Globals: SENTINEL_PORTS, MASTER_NAME, REDIS_PASSWORD (read);
#          NEW_MASTER, MASTER_KEYS, SLAVE_INFO, SLAVE_IP, SLAVE_PORT, SLAVE_KEYS (written)
# Returns: 0 when every replica reports the master's key count, 1 otherwise.
check_data_consistency_after_failover() {
  log "检查故障转移后数据一致性..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local master_addr master_port
  local number_re='^[0-9]+$'
  local mismatches=0

  # Reply is two lines: ip, then port.
  master_addr=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME")
  { read -r NEW_MASTER; read -r master_port; } <<< "$master_addr"

  if [ -z "$NEW_MASTER" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  MASTER_KEYS=$(redis-cli -h "$NEW_MASTER" -p "${master_port:-6379}" -a "$REDIS_PASSWORD" dbsize)
  log "新主节点键数量: $MASTER_KEYS"

  # Without a numeric master count there is nothing to compare against; two
  # failed lookups would otherwise compare equal as empty strings.
  if [[ ! $MASTER_KEYS =~ $number_re ]]; then
    log "错误: 无法获取主节点键数量"
    return 1
  fi

  SLAVE_INFO=$(redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME")

  # awk pairs each value line with the key line before it and emits "ip port".
  while read -r SLAVE_IP SLAVE_PORT; do
    [ -n "$SLAVE_IP" ] || continue

    SLAVE_KEYS=$(redis-cli -h "$SLAVE_IP" -p "$SLAVE_PORT" -a "$REDIS_PASSWORD" dbsize)
    log "从节点 $SLAVE_IP:$SLAVE_PORT 键数量: $SLAVE_KEYS"

    if [ "$MASTER_KEYS" = "$SLAVE_KEYS" ]; then
      log "从节点 $SLAVE_IP:$SLAVE_PORT 数据一致性检查通过"
    else
      log "警告: 从节点 $SLAVE_IP:$SLAVE_PORT 数据不一致"
      mismatches=$((mismatches + 1))
    fi
  done < <(awk 'NR % 2 { key = $0; next }
                key == "ip" { ip = $0 }
                key == "port" { print ip, $0 }' <<< "$SLAVE_INFO")

  [ "$mismatches" -eq 0 ]
}

# Fail back towards the node assumed to be the original master (the first
# replica Sentinel lists).
# NOTE(review): the failover command below names only the master group, not a
# target replica, so the promoted node may differ from ORIGINAL_MASTER; a
# warning is logged when that happens — confirm replica priorities are set.
# Globals: SENTINEL_PORTS, MASTER_NAME (read);
#          CURRENT_MASTER, ORIGINAL_MASTER (written); NEW_MASTER (read after the wait)
# Returns: non-zero when the topology cannot be read or the failover fails.
restore_original_master() {
  log "恢复原主节点..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local reply

  # First line of GET-MASTER-ADDR-BY-NAME is the master ip.
  CURRENT_MASTER=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1)

  # The captured replica list is one field per line; take the value that
  # follows the first "ip" key.
  ORIGINAL_MASTER=$(redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME" \
    | awk 'NR % 2 { key = $0; next } key == "ip" { print; exit }')

  log "当前主节点: $CURRENT_MASTER"
  log "原主节点: $ORIGINAL_MASTER"

  if [ -z "$CURRENT_MASTER" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi
  if [ -z "$ORIGINAL_MASTER" ]; then
    log "错误: 没有可用的从节点"
    return 1
  fi

  if [ "$CURRENT_MASTER" != "$ORIGINAL_MASTER" ]; then
    # redis-cli exits 0 on error replies, so check the reply text.
    reply=$(redis-cli -p "$sentinel_port" sentinel failover "$MASTER_NAME")
    if [ "$reply" != "OK" ]; then
      log "错误: 故障转移命令失败"
      return 1
    fi

    log "故障转移回原主节点命令已发送"

    wait_for_failover_completion "$CURRENT_MASTER" || return 1

    if [ -n "$NEW_MASTER" ] && [ "$NEW_MASTER" != "$ORIGINAL_MASTER" ]; then
      log "警告: 新主节点 $NEW_MASTER 不是原主节点 $ORIGINAL_MASTER"
    fi
  else
    log "当前主节点就是原主节点,无需恢复"
  fi
}

# Command dispatcher for the failover manager.
# Arguments: $1 - status | failover | test | restore; anything else prints usage.
# Returns: the status of the dispatched function (0 after printing usage).
main() {
  local subcommand=$1

  case "$subcommand" in
    status) check_failover_status ;;
    failover) manual_failover ;;
    test) test_failover ;;
    restore) restore_original_master ;;
    *)
      printf '%s\n' \
        "用法: $0 {status|failover|test|restore}" \
        " status - 检查故障转移状态" \
        " failover - 手动故障转移" \
        " test - 测试故障转移" \
        " restore - 恢复原主节点"
      ;;
  esac
}

# Run the dispatcher with the script's own arguments.
main "$@"

5.2 故障恢复脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/bin/bash
# redis_failure_recovery.sh - Redis failure recovery script
# @author 运维实战

# Configuration
SENTINEL_PORTS=(26379 26380 26381)
MASTER_NAME="mymaster"
# Sample default; export REDIS_PASSWORD to override it without editing the script.
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"

# Print a timestamped message. The functions in this script call log, and the
# script does not define or source it anywhere else.
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Ask Sentinel for the master's address and flags, then probe the master.
# Globals: SENTINEL_PORTS, MASTER_NAME, REDIS_PASSWORD (read);
#          MASTER_INFO, CURRENT_MASTER, MASTER_STATUS (written)
# Returns: 0 when the master answers PONG; 1 when it does not or when Sentinel
#          gives no master record.
detect_failure() {
  log "检测Redis故障..."

  local master_port

  MASTER_INFO=$(redis-cli -p "${SENTINEL_PORTS[0]}" sentinel master "$MASTER_NAME")

  # The captured reply is a flat key/value listing, one field per line; collect
  # the pairs and emit ip, port and flags on three lines for read.
  {
    read -r CURRENT_MASTER
    read -r master_port
    read -r MASTER_STATUS
  } < <(awk 'NR % 2 { key = $0; next }
             { value[key] = $0 }
             END { print value["ip"]; print value["port"]; print value["flags"] }' <<< "$MASTER_INFO")

  if [ -z "$CURRENT_MASTER" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  log "当前主节点: $CURRENT_MASTER"
  log "主节点状态: $MASTER_STATUS"

  # redis-cli exits 0 on error replies (e.g. NOAUTH), so compare with PONG.
  if [ "$(redis-cli -h "$CURRENT_MASTER" -p "${master_port:-6379}" -a "$REDIS_PASSWORD" ping 2> /dev/null)" = "PONG" ]; then
    log "主节点连接正常"
    return 0
  fi

  log "检测到主节点故障: $CURRENT_MASTER"
  return 1
}

# Detect a master failure, give Sentinel time to fail over on its own, and fall
# back to a manual failover when it does not.
# Globals: SENTINEL_PORTS, MASTER_NAME, CURRENT_MASTER (read); NEW_MASTER (written)
# Returns: 0 when no failure is found or Sentinel promoted a new master;
#          otherwise the status of manual_failure_recovery (1 if the failed
#          master cannot be identified).
auto_failure_recovery() {
  log "开始自动故障恢复..."

  if detect_failure; then
    log "未检测到故障,无需恢复"
    return 0
  fi

  local sentinel_port=${SENTINEL_PORTS[0]}
  local max_wait=300
  local poll_interval=10
  local wait_time=0

  # detect_failure stores the master it probed in CURRENT_MASTER; ask Sentinel
  # directly when that is empty.
  local failed_master=$CURRENT_MASTER
  if [ -z "$failed_master" ]; then
    failed_master=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1)
  fi
  if [ -z "$failed_master" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  log "等待哨兵自动故障转移..."

  while [ "$wait_time" -lt "$max_wait" ]; do
    NEW_MASTER=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1)

    # An empty answer means Sentinel could not be queried; it is not a failover.
    if [ -n "$NEW_MASTER" ] && [ "$NEW_MASTER" != "$failed_master" ]; then
      log "哨兵自动故障转移成功: $failed_master -> $NEW_MASTER"
      return 0
    fi

    log "等待自动故障转移... ($wait_time/$max_wait)"
    sleep "$poll_interval"
    wait_time=$((wait_time + poll_interval))
  done

  log "警告: 自动故障转移超时,尝试手动故障转移"
  manual_failure_recovery
}

# Force a Sentinel failover after checking that a usable replica exists.
# The logged candidate is informational only: the failover command names the
# master group, not a target replica.
# Globals: SENTINEL_PORTS, MASTER_NAME (read); AVAILABLE_SLAVES, NEW_MASTER (written)
# Returns: 1 when no replica is usable or Sentinel does not answer OK;
#          otherwise the status of wait_for_recovery_completion.
manual_failure_recovery() {
  log "执行手动故障恢复..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local reply

  AVAILABLE_SLAVES=$(redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME")

  # The captured reply is one field per line. Remember each record's ip and
  # print the first one whose flags do not mark it as down or disconnected.
  NEW_MASTER=$(awk 'NR % 2 { key = $0; next }
                    key == "ip" { ip = $0 }
                    key == "flags" && $0 !~ /s_down|o_down|disconnected/ { print ip; exit }' \
    <<< "$AVAILABLE_SLAVES")

  if [ -z "$NEW_MASTER" ]; then
    log "错误: 没有可用的从节点"
    return 1
  fi

  log "选择新主节点: $NEW_MASTER"

  # redis-cli exits 0 on error replies, so check the reply text.
  reply=$(redis-cli -p "$sentinel_port" sentinel failover "$MASTER_NAME")

  if [ "$reply" = "OK" ]; then
    log "手动故障转移命令已发送"
    wait_for_recovery_completion
  else
    log "错误: 手动故障转移失败"
    return 1
  fi
}

# Poll Sentinel until the reported master answers PING, then update the
# application configuration (when an updater is available) and verify.
# Arguments: $1 - optional address of the pre-failover master; when given, a
#                 master equal to it is not accepted as recovered, so a
#                 still-running old master is not reported as a completed recovery.
# Globals: SENTINEL_PORTS, MASTER_NAME, REDIS_PASSWORD (read); NEW_MASTER (written)
# Returns: 0 once a reachable master is found, 1 on timeout.
wait_for_recovery_completion() {
  local old_master=${1:-}
  local max_wait=300
  local poll_interval=10
  local wait_time=0
  local master_addr master_port

  log "等待故障恢复完成..."

  while [ "$wait_time" -lt "$max_wait" ]; do
    # Reply is two lines: ip, then port.
    master_addr=$(redis-cli -p "${SENTINEL_PORTS[0]}" sentinel get-master-addr-by-name "$MASTER_NAME")
    { read -r NEW_MASTER; read -r master_port; } <<< "$master_addr"

    # redis-cli exits 0 on error replies, so compare the reply with PONG.
    if [ -n "$NEW_MASTER" ] && [ "$NEW_MASTER" != "$old_master" ] \
      && [ "$(redis-cli -h "$NEW_MASTER" -p "${master_port:-6379}" -a "$REDIS_PASSWORD" ping 2> /dev/null)" = "PONG" ]; then
      log "故障恢复成功,新主节点: $NEW_MASTER"

      # This script does not define update_application_config itself; call it
      # only when something else has provided it.
      if declare -F update_application_config > /dev/null; then
        update_application_config "$NEW_MASTER"
      else
        log "警告: 未定义 update_application_config,跳过应用配置更新"
      fi

      verify_recovery_result

      return 0
    fi

    log "等待恢复完成... ($wait_time/$max_wait)"
    sleep "$poll_interval"
    wait_time=$((wait_time + poll_interval))
  done

  log "错误: 故障恢复超时"
  return 1
}

# Check that the master answers PING, report the state of every replica, then
# run the post-recovery consistency check.
# Globals: SENTINEL_PORTS, MASTER_NAME, REDIS_PASSWORD (read);
#          NEW_MASTER, SLAVE_INFO, SLAVE_IP, SLAVE_PORT (written)
# Returns: 1 when the master is unknown or unreachable; otherwise the status of
#          check_data_consistency_after_recovery. Unreachable replicas only warn.
verify_recovery_result() {
  log "验证故障恢复结果..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local master_addr master_port

  # Reply is two lines: ip, then port.
  master_addr=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME")
  { read -r NEW_MASTER; read -r master_port; } <<< "$master_addr"

  # redis-cli exits 0 on error replies, so compare the reply with PONG.
  if [ -z "$NEW_MASTER" ] \
    || [ "$(redis-cli -h "$NEW_MASTER" -p "${master_port:-6379}" -a "$REDIS_PASSWORD" ping 2> /dev/null)" != "PONG" ]; then
    log "错误: 新主节点 $NEW_MASTER 连接失败"
    return 1
  fi
  log "新主节点 $NEW_MASTER 连接正常"

  SLAVE_INFO=$(redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME")

  # The captured reply is one field per line; awk pairs each value with the key
  # on the line before it and emits "ip port" per replica.
  while read -r SLAVE_IP SLAVE_PORT; do
    [ -n "$SLAVE_IP" ] || continue

    if [ "$(redis-cli -h "$SLAVE_IP" -p "$SLAVE_PORT" -a "$REDIS_PASSWORD" ping 2> /dev/null)" = "PONG" ]; then
      log "从节点 $SLAVE_IP:$SLAVE_PORT 连接正常"
    else
      log "警告: 从节点 $SLAVE_IP:$SLAVE_PORT 连接失败"
    fi
  done < <(awk 'NR % 2 { key = $0; next }
                key == "ip" { ip = $0 }
                key == "port" { print ip, $0 }' <<< "$SLAVE_INFO")

  check_data_consistency_after_recovery
}

# Compare the key count (DBSIZE) of the master with every replica and log how
# many replicas match.
# Globals: SENTINEL_PORTS, MASTER_NAME, REDIS_PASSWORD (read);
#          NEW_MASTER, MASTER_KEYS, SLAVE_INFO, SLAVE_IP, SLAVE_PORT, SLAVE_KEYS (written)
# Returns: 0 when every replica matches, 1 otherwise or when the master count
#          cannot be read.
check_data_consistency_after_recovery() {
  log "检查恢复后数据一致性..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local master_addr master_port
  local number_re='^[0-9]+$'
  local consistent_slaves=0
  local total_slaves=0

  # Reply is two lines: ip, then port.
  master_addr=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME")
  { read -r NEW_MASTER; read -r master_port; } <<< "$master_addr"

  if [ -z "$NEW_MASTER" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  MASTER_KEYS=$(redis-cli -h "$NEW_MASTER" -p "${master_port:-6379}" -a "$REDIS_PASSWORD" dbsize)
  log "新主节点键数量: $MASTER_KEYS"

  # Two failed lookups would compare equal as empty strings, so insist on a number.
  if [[ ! $MASTER_KEYS =~ $number_re ]]; then
    log "错误: 无法获取主节点键数量"
    return 1
  fi

  SLAVE_INFO=$(redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME")

  # awk pairs each value line with the key line before it and emits "ip port".
  while read -r SLAVE_IP SLAVE_PORT; do
    [ -n "$SLAVE_IP" ] || continue

    SLAVE_KEYS=$(redis-cli -h "$SLAVE_IP" -p "$SLAVE_PORT" -a "$REDIS_PASSWORD" dbsize)
    log "从节点 $SLAVE_IP:$SLAVE_PORT 键数量: $SLAVE_KEYS"

    total_slaves=$((total_slaves + 1))

    if [ "$MASTER_KEYS" = "$SLAVE_KEYS" ]; then
      log "从节点 $SLAVE_IP:$SLAVE_PORT 数据一致性检查通过"
      consistent_slaves=$((consistent_slaves + 1))
    else
      log "警告: 从节点 $SLAVE_IP:$SLAVE_PORT 数据不一致"
    fi
  done < <(awk 'NR % 2 { key = $0; next }
                key == "ip" { ip = $0 }
                key == "port" { print ip, $0 }' <<< "$SLAVE_INFO")

  log "数据一致性检查完成: $consistent_slaves/$total_slaves 个从节点数据一致"

  [ "$consistent_slaves" -eq "$total_slaves" ]
}

# Fail back towards the node assumed to be the original master (the first
# replica Sentinel lists), provided that node answers PING.
# NOTE(review): the failover command names only the master group, not a target
# replica, so the promoted node may differ from ORIGINAL_MASTER — confirm
# replica priorities are set accordingly.
# Globals: SENTINEL_PORTS, MASTER_NAME, REDIS_PASSWORD (read);
#          CURRENT_MASTER, ORIGINAL_MASTER (written)
# Returns: non-zero when the topology cannot be read, the candidate is
#          unreachable, or the failover fails.
restore_original_master() {
  log "恢复原主节点..."

  local sentinel_port=${SENTINEL_PORTS[0]}
  local original_port reply

  # First line of GET-MASTER-ADDR-BY-NAME is the master ip.
  CURRENT_MASTER=$(redis-cli -p "$sentinel_port" sentinel get-master-addr-by-name "$MASTER_NAME" | head -n 1)

  # The captured replica list is one field per line; take the ip and port of
  # the first record.
  read -r ORIGINAL_MASTER original_port < <(
    redis-cli -p "$sentinel_port" sentinel slaves "$MASTER_NAME" \
      | awk 'NR % 2 { key = $0; next }
             key == "ip" { ip = $0 }
             key == "port" { print ip, $0; exit }'
  )

  log "当前主节点: $CURRENT_MASTER"
  log "原主节点: $ORIGINAL_MASTER"

  if [ -z "$CURRENT_MASTER" ]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi
  if [ -z "$ORIGINAL_MASTER" ]; then
    log "错误: 没有可用的从节点"
    return 1
  fi

  if [ "$CURRENT_MASTER" = "$ORIGINAL_MASTER" ]; then
    log "当前主节点就是原主节点,无需恢复"
    return 0
  fi

  # redis-cli exits 0 on error replies, so compare the reply with PONG.
  if [ "$(redis-cli -h "$ORIGINAL_MASTER" -p "${original_port:-6379}" -a "$REDIS_PASSWORD" ping 2> /dev/null)" != "PONG" ]; then
    log "原主节点 $ORIGINAL_MASTER 不可用,无法恢复"
    return 1
  fi

  log "原主节点 $ORIGINAL_MASTER 可用,开始恢复"

  reply=$(redis-cli -p "$sentinel_port" sentinel failover "$MASTER_NAME")
  if [ "$reply" != "OK" ]; then
    log "错误: 手动故障转移失败"
    return 1
  fi

  log "故障转移回原主节点命令已发送"

  # The pre-failover master is handed to the waiter; a waiter that takes no
  # arguments ignores it.
  wait_for_recovery_completion "$CURRENT_MASTER"
}

# Command dispatcher for the failure recovery script.
# Arguments: $1 - detect | auto | manual | restore; anything else prints usage.
# Returns: the status of the dispatched function (0 after printing usage).
main() {
  local handler=""

  # Map the sub-command to the function that implements it.
  case "$1" in
    detect) handler=detect_failure ;;
    auto) handler=auto_failure_recovery ;;
    manual) handler=manual_failure_recovery ;;
    restore) handler=restore_original_master ;;
  esac

  if [[ -n $handler ]]; then
    "$handler"
    return
  fi

  echo "用法: $0 {detect|auto|manual|restore}"
  echo " detect - 检测故障"
  echo " auto - 自动故障恢复"
  echo " manual - 手动故障恢复"
  echo " restore - 恢复原主节点"
}

# Run the dispatcher with the script's own arguments.
main "$@"

6. 监控告警

6.1 主备监控脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/bin/bash
# redis_master_slave_monitor.sh - Redis master/replica monitoring script
# @author 运维实战

# Monitoring configuration
MASTER_IP="192.168.1.10"
SLAVE_IP="192.168.1.11"
REDIS_PORT=6379
# Sample default; export REDIS_PASSWORD to override it without editing the script.
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"
SENTINEL_PORTS=(26379 26380 26381)
MONITOR_INTERVAL=30 # passed to sleep between monitoring passes
LOG_FILE="/var/log/redis-master-slave-monitor.log"

# Print a timestamped message. The functions in this script call log, and the
# script does not define or source it anywhere else.
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Append one metric sample to LOG_FILE.
# Arguments: $1 - metric name, $2 - metric value, $3 - node label
# Globals: LOG_FILE (read)
# Returns: non-zero when the log file cannot be written.
log_monitor_data() {
  local metric=$1
  local value=$2
  local node=$3
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')

  # The redirect target is quoted so a path containing spaces is not an
  # "ambiguous redirect".
  echo "[$timestamp] Node:$node Metric:$metric Value:$value" >> "$LOG_FILE"
}

# Print the value of one "key:value" line from captured INFO text.
# Arguments: $1 - INFO output, $2 - exact key name
_info_field() {
  awk -F: -v key="$2" '$1 == key { sub(/\r$/, "", $2); print $2; exit }' <<< "$1"
}

# Print the db0 key count from INFO text; prints 0 when there is no db0 line.
_db0_keys() {
  local count
  count=$(sed -n 's/^db0:keys=\([0-9]*\).*/\1/p' <<< "$1")
  echo "${count:-0}"
}

# Succeed when the given redis-cli connection arguments get PONG back.
# redis-cli exits 0 on error replies, so the reply text is compared instead.
_redis_alive() {
  [ "$(redis-cli "$@" ping 2> /dev/null)" = "PONG" ]
}

# Record one sample of the master's metrics; sets MASTER_REPL_OFFSET.
_monitor_master_node() {
  local number_re='^[0-9]+$'

  # Cleared first so a down master never leaves the previous pass's offset
  # behind for the replica lag calculation.
  MASTER_REPL_OFFSET=""

  if ! _redis_alive -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD"; then
    log_monitor_data "status" "down" "master"
    return
  fi
  log_monitor_data "status" "up" "master"

  MASTER_INFO=$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info 2> /dev/null)

  CONNECTED_CLIENTS=$(_info_field "$MASTER_INFO" connected_clients)
  USED_MEMORY=$(_info_field "$MASTER_INFO" used_memory_human)
  KEYS_COUNT=$(_db0_keys "$MASTER_INFO")
  HITS=$(_info_field "$MASTER_INFO" keyspace_hits)
  MISSES=$(_info_field "$MASTER_INFO" keyspace_misses)

  # Values are quoted so an empty metric cannot shift the node label into the
  # value position.
  log_monitor_data "connected_clients" "$CONNECTED_CLIENTS" "master"
  log_monitor_data "used_memory" "$USED_MEMORY" "master"
  log_monitor_data "keys_count" "$KEYS_COUNT" "master"
  log_monitor_data "hits" "$HITS" "master"
  log_monitor_data "misses" "$MISSES" "master"

  # Only a zero total makes the ratio undefined; zero misses is a 100% hit rate.
  if [[ $HITS =~ $number_re && $MISSES =~ $number_re ]] && [ $((HITS + MISSES)) -gt 0 ]; then
    HIT_RATE=$(awk -v h="$HITS" -v m="$MISSES" 'BEGIN { printf "%.2f", h * 100 / (h + m) }')
    log_monitor_data "hit_rate" "$HIT_RATE" "master"
  fi

  REPLICATION_INFO=$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2> /dev/null)
  CONNECTED_SLAVES=$(_info_field "$REPLICATION_INFO" connected_slaves)
  MASTER_REPL_OFFSET=$(_info_field "$REPLICATION_INFO" master_repl_offset)

  log_monitor_data "connected_slaves" "$CONNECTED_SLAVES" "master"
  log_monitor_data "master_repl_offset" "$MASTER_REPL_OFFSET" "master"
}

# Record one sample of the replica's metrics, including the lag in bytes
# relative to MASTER_REPL_OFFSET from the same pass.
_monitor_slave_node() {
  local number_re='^[0-9]+$'

  if ! _redis_alive -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD"; then
    log_monitor_data "status" "down" "slave"
    return
  fi
  log_monitor_data "status" "up" "slave"

  SLAVE_INFO=$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info 2> /dev/null)

  CONNECTED_CLIENTS=$(_info_field "$SLAVE_INFO" connected_clients)
  USED_MEMORY=$(_info_field "$SLAVE_INFO" used_memory_human)
  KEYS_COUNT=$(_db0_keys "$SLAVE_INFO")

  log_monitor_data "connected_clients" "$CONNECTED_CLIENTS" "slave"
  log_monitor_data "used_memory" "$USED_MEMORY" "slave"
  log_monitor_data "keys_count" "$KEYS_COUNT" "slave"

  REPLICATION_INFO=$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2> /dev/null)
  MASTER_LINK_STATUS=$(_info_field "$REPLICATION_INFO" master_link_status)
  MASTER_SYNC_IN_PROGRESS=$(_info_field "$REPLICATION_INFO" master_sync_in_progress)
  SLAVE_REPL_OFFSET=$(_info_field "$REPLICATION_INFO" slave_repl_offset)
  MASTER_LAST_IO_SECONDS_AGO=$(_info_field "$REPLICATION_INFO" master_last_io_seconds_ago)

  log_monitor_data "master_link_status" "$MASTER_LINK_STATUS" "slave"
  log_monitor_data "master_sync_in_progress" "$MASTER_SYNC_IN_PROGRESS" "slave"
  log_monitor_data "slave_repl_offset" "$SLAVE_REPL_OFFSET" "slave"
  log_monitor_data "master_last_io_seconds_ago" "$MASTER_LAST_IO_SECONDS_AGO" "slave"

  if [[ $MASTER_REPL_OFFSET =~ $number_re && $SLAVE_REPL_OFFSET =~ $number_re ]]; then
    SYNC_LAG=$((MASTER_REPL_OFFSET - SLAVE_REPL_OFFSET))
    log_monitor_data "sync_lag" "$SYNC_LAG" "slave"
  fi
}

# Record one sample per local sentinel port.
_monitor_sentinel_nodes() {
  local port

  for port in "${SENTINEL_PORTS[@]}"; do
    if ! _redis_alive -p "$port"; then
      log_monitor_data "status" "down" "sentinel-$port"
      continue
    fi
    log_monitor_data "status" "up" "sentinel-$port"

    SENTINEL_INFO=$(redis-cli -p "$port" info sentinel 2> /dev/null)
    MASTERS=$(_info_field "$SENTINEL_INFO" sentinel_masters)
    # Replica and sentinel counts are "slaves=N" / "sentinels=N" attributes of
    # the master0 line, not "key:value" lines of their own.
    SLAVES=$(sed -n 's/^master0:.*slaves=\([0-9]*\).*/\1/p' <<< "$SENTINEL_INFO")
    SENTINELS=$(sed -n 's/^master0:.*sentinels=\([0-9]*\).*/\1/p' <<< "$SENTINEL_INFO")

    log_monitor_data "masters" "$MASTERS" "sentinel-$port"
    log_monitor_data "slaves" "$SLAVES" "sentinel-$port"
    log_monitor_data "sentinels" "$SENTINELS" "sentinel-$port"
  done
}

# Sample master, replica and sentinel metrics forever, sleeping
# MONITOR_INTERVAL between passes. Does not return.
# Globals: MASTER_IP, SLAVE_IP, REDIS_PORT, REDIS_PASSWORD, SENTINEL_PORTS,
#          MONITOR_INTERVAL (read); metric variables such as MASTER_INFO,
#          HITS, SYNC_LAG (written by the helpers above)
monitor_master_slave_status() {
  log "开始监控Redis主备状态..."

  while true; do
    _monitor_master_node
    _monitor_slave_node
    _monitor_sentinel_nodes

    sleep "$MONITOR_INTERVAL"
  done
}

# Write a dated plain-text report under /var/log holding the INFO output of the
# master, the replica and each sentinel, followed by the offset difference and
# the key counts.
# Globals: MASTER_IP, SLAVE_IP, REDIS_PORT, REDIS_PASSWORD, SENTINEL_PORTS (read);
#          MASTER_OFFSET, SLAVE_OFFSET, SYNC_LAG, MASTER_KEYS, SLAVE_KEYS (written)
generate_master_slave_report() {
  local report_file="/var/log/redis-master-slave-report-$(date +%Y%m%d).txt"
  local divider="----------------------------------------"

  log "生成Redis主备报告: $report_file"

  # Everything printed inside the braces goes to the report file, which is
  # truncated once when the group starts.
  {
    echo "Redis主备监控报告"
    echo "生成时间: $(date)"
    echo "========================================"
    echo ""

    # Master section
    echo "主节点 ($MASTER_IP:$REDIS_PORT) 信息:"
    echo "$divider"
    redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD info
    echo ""

    # Replica section
    echo "从节点 ($SLAVE_IP:$REDIS_PORT) 信息:"
    echo "$divider"
    redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD info
    echo ""

    # One section per sentinel port
    for port in "${SENTINEL_PORTS[@]}"; do
      echo "哨兵节点 $port 信息:"
      echo "$divider"
      redis-cli -p $port info sentinel
      echo ""
    done

    # Summary figures
    echo "主备统计:"
    echo "$divider"

    MASTER_OFFSET=$(redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication \
      | grep "master_repl_offset:" | cut -d: -f2 | tr -d '\r')
    SLAVE_OFFSET=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD info replication \
      | grep "slave_repl_offset:" | cut -d: -f2 | tr -d '\r')

    # The lag line is written only when both offsets were read.
    if [ -n "$MASTER_OFFSET" ] && [ -n "$SLAVE_OFFSET" ]; then
      SYNC_LAG=$((MASTER_OFFSET - SLAVE_OFFSET))
      echo "同步延迟: $SYNC_LAG 字节"
    fi

    MASTER_KEYS=$(redis-cli -h $MASTER_IP -p $REDIS_PORT -a $REDIS_PASSWORD dbsize)
    SLAVE_KEYS=$(redis-cli -h $SLAVE_IP -p $REDIS_PORT -a $REDIS_PASSWORD dbsize)
    echo "主节点键数量: $MASTER_KEYS"
    echo "从节点键数量: $SLAVE_KEYS"
  } > $report_file

  log "主备报告生成完成: $report_file"
}

# Generate /opt/redis-master-slave-alert.sh and start it in the background with
# its output appended to /var/log/redis-master-slave-alert.log.
# NOTE(review): every call starts another background copy of the alert script;
# nothing here stops a copy started earlier.
setup_master_slave_alerts() {
  log "设置Redis主备告警..."

  # The quoted 'EOF' keeps every $... below literal, so it is evaluated by the
  # generated script rather than by this function.
  cat > /opt/redis-master-slave-alert.sh << 'EOF'
#!/bin/bash
# Redis master/replica alert script

MASTER_IP="192.168.1.10"
SLAVE_IP="192.168.1.11"
REDIS_PORT=6379
REDIS_PASSWORD="redis123"
SENTINEL_PORTS=(26379 26380 26381)

check_master_slave_alerts() {
  # Master reachability
  if ! redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" ping > /dev/null 2>&1; then
    echo "告警: 主节点连接失败 ($MASTER_IP:$REDIS_PORT)"
  fi

  # Replica reachability
  if ! redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" ping > /dev/null 2>&1; then
    echo "告警: 从节点连接失败 ($SLAVE_IP:$REDIS_PORT)"
  fi

  # Replication link state as seen by the replica
  MASTER_LINK_STATUS=$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2> /dev/null | grep "master_link_status:" | cut -d: -f2 | tr -d '\r')

  if [ "$MASTER_LINK_STATUS" != "up" ]; then
    echo "告警: 主从连接异常 ($SLAVE_IP:$REDIS_PORT)"
  fi

  # Replication lag in bytes
  MASTER_OFFSET=$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2> /dev/null | grep "master_repl_offset:" | cut -d: -f2 | tr -d '\r')
  SLAVE_OFFSET=$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2> /dev/null | grep "slave_repl_offset:" | cut -d: -f2 | tr -d '\r')

  if [ -n "$MASTER_OFFSET" ] && [ -n "$SLAVE_OFFSET" ]; then
    SYNC_LAG=$((MASTER_OFFSET - SLAVE_OFFSET))

    if [ "$SYNC_LAG" -gt 10000 ]; then
      echo "告警: 同步延迟过高 ($SYNC_LAG 字节)"
    fi
  fi

  # Sentinel reachability
  for port in "${SENTINEL_PORTS[@]}"; do
    if ! redis-cli -p "$port" ping > /dev/null 2>&1; then
      echo "告警: 哨兵节点 $port 连接失败"
    fi
  done

  # Memory usage: INFO has no percentage field, so derive it from used_memory
  # and maxmemory. maxmemory 0 means no limit is configured; skip the check then.
  MEMORY_INFO=$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info memory 2> /dev/null)
  USED_BYTES=$(echo "$MEMORY_INFO" | grep "^used_memory:" | cut -d: -f2 | tr -d '\r')
  MAX_BYTES=$(echo "$MEMORY_INFO" | grep "^maxmemory:" | cut -d: -f2 | tr -d '\r')

  if [ -n "$USED_BYTES" ] && [ -n "$MAX_BYTES" ] && [ "$MAX_BYTES" -gt 0 ]; then
    MASTER_MEMORY=$((USED_BYTES * 100 / MAX_BYTES))

    if [ "$MASTER_MEMORY" -gt 80 ]; then
      echo "告警: 主节点内存使用率过高 ($MASTER_MEMORY%)"
    fi
  fi
}

# Check every 5 minutes
while true; do
  check_master_slave_alerts
  sleep 300
done
EOF

  chmod +x /opt/redis-master-slave-alert.sh

  # Start the alert loop detached from this shell.
  nohup /opt/redis-master-slave-alert.sh > /var/log/redis-master-slave-alert.log 2>&1 &

  log "主备告警设置完成"
}

# Command dispatcher for the master/replica monitoring script.
# Arguments: $1 - monitor | report | alerts; anything else prints usage.
# Returns: the status of the dispatched function (0 after printing usage).
main() {
  if [[ $1 == "monitor" ]]; then
    monitor_master_slave_status
    return
  fi

  if [[ $1 == "report" ]]; then
    generate_master_slave_report
    return
  fi

  if [[ $1 == "alerts" ]]; then
    setup_master_slave_alerts
    return
  fi

  # No known sub-command: show the usage text.
  printf '%s\n' \
    "用法: $0 {monitor|report|alerts}" \
    " monitor - 开始主备监控" \
    " report - 生成主备报告" \
    " alerts - 设置主备告警"
}

# Run the dispatcher with the script's own arguments.
main "$@"

7. 总结

7.1 主备架构最佳实践

  1. 主从配置: 合理配置主从复制参数,确保数据同步的可靠性
  2. 哨兵部署: 部署多个哨兵节点,提高故障检测的准确性
  3. 故障转移: 建立完善的故障转移机制,确保服务的高可用性
  4. 监控告警: 建立全面的监控体系,及时发现和处理问题
  5. 数据一致性: 定期检查主备数据一致性,确保数据完整性

7.2 关键指标监控

  • 连接状态: 监控主从节点的连接状态
  • 同步延迟: 监控主从数据同步延迟
  • 内存使用: 监控内存使用情况
  • 命中率: 监控缓存命中率
  • 哨兵状态: 监控哨兵节点的健康状态

7.3 运维工具推荐

  1. 监控工具: Prometheus + Grafana + Redis Exporter
  2. 告警工具: Alertmanager + Webhook
  3. 管理工具: Redis Commander, RedisInsight
  4. 备份工具: 自定义备份脚本
  5. 诊断工具: redis-cli(内置 --stat、--latency、--bigkeys 等诊断选项);redis-trib.rb 仅用于 Redis Cluster 且已被 redis-cli --cluster 取代,不适用于主备架构

通过本文的Redis主备运维实战指南,您可以建立完善的Redis主备架构运维体系,确保系统的高可用性和数据一致性。记住,主备架构的运维需要持续关注主从同步状态和故障转移机制,确保在任何情况下都能提供稳定的服务。