第128集分布式缓存服务Redis主备运维实战 | 字数总计: 11.4k | 阅读时长: 55分钟 | 阅读量:
1. Redis主备架构概述

Redis主备架构是构建高可用缓存服务的重要方案:通过主从复制实现数据冗余和读写分离,通过哨兵模式实现自动故障转移。本文将详细介绍Redis主备架构的运维实战经验,包括主从配置、哨兵部署、故障转移和数据同步的完整解决方案。
1.1 核心功能
主从复制 : 实现数据从主节点到从节点的实时同步
读写分离 : 主节点处理写操作,从节点处理读操作
故障转移 : 自动检测主节点故障并切换到从节点
数据一致性 : 确保主备数据的一致性和完整性
高可用保障 : 提供7x24小时不间断服务
1.2 技术架构

客户端应用 → Redis主节点 → Redis从节点
    ↓            ↓            ↓
写操作处理 →  数据同步   → 读操作处理
    ↓            ↓            ↓
 哨兵监控  →  故障检测   →  自动切换
#!/bin/bash
# Section 2.1 of the article (original heading: "2. 环境准备 / 2.1 系统环境检查").
#
# Pre-flight check for a host that will run Redis in a master/replica setup:
# reports OS and resource figures, pings both nodes, checks whether the Redis
# and Sentinel ports are already bound, and installs Redis when it is missing.
#
# Environment overrides: MASTER_IP, SLAVE_IP.

set -uo pipefail

MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORTS=(6379 26379)

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Succeed when a TCP listener is bound to port $1.
# Prefers ss and falls back to netstat; both print the local address in
# column 4 when invoked with -tln.
port_in_use() {
  local port=$1 lister=netstat
  if command -v ss > /dev/null 2>&1; then
    lister=ss
  fi
  "$lister" -tln 2> /dev/null \
    | awk -v re=":${port}\$" '$4 ~ re { found = 1 } END { exit !found }'
}

# Print the name of the systemd unit that runs Redis on this host.
# Debian/Ubuntu packages call it redis-server, RHEL-family packages redis.
redis_unit() {
  if systemctl cat redis-server.service > /dev/null 2>&1; then
    echo "redis-server"
  else
    echo "redis"
  fi
}

# Report OS and resource figures, then continue with the network checks.
# Exits 1 on a non-Linux system.
check_system_environment() {
  log "开始检查Redis主备系统环境..."

  if [[ "$OSTYPE" == linux-gnu* ]]; then
    local os_version
    os_version=$(grep '^PRETTY_NAME=' /etc/os-release 2> /dev/null | cut -d'"' -f2)
    log "操作系统: $os_version"
  else
    log "错误: 不支持的操作系统 $OSTYPE"
    exit 1
  fi

  local total_mem available_mem cpu_cores disk_space
  # One free(1) call; column 2 is total, column 7 is available.
  read -r total_mem available_mem < <(free -h | awk '/^Mem:/ { print $2, $7 }')
  cpu_cores=$(nproc)
  disk_space=$(df -h / | awk 'END { print $4 }')

  log "系统资源检查:"
  log "  总内存: $total_mem"
  log "  可用内存: $available_mem"
  log "  CPU核心数: $cpu_cores"
  log "  可用磁盘空间: $disk_space"

  check_network_connectivity
}

# Ping both nodes (3 probes, 2 s per-reply timeout so an unreachable host
# cannot stall the script), then continue with the port checks.
check_network_connectivity() {
  log "检查网络连通性..."

  if ping -c 3 -W 2 "$MASTER_IP" > /dev/null 2>&1; then
    log "主节点网络连通正常: $MASTER_IP"
  else
    log "警告: 主节点网络连通异常: $MASTER_IP"
  fi

  if ping -c 3 -W 2 "$SLAVE_IP" > /dev/null 2>&1; then
    log "从节点网络连通正常: $SLAVE_IP"
  else
    log "警告: 从节点网络连通异常: $SLAVE_IP"
  fi

  check_port_availability
}

# Report whether each port in REDIS_PORTS already has a listener.
check_port_availability() {
  log "检查Redis端口可用性..."

  local port
  for port in "${REDIS_PORTS[@]}"; do
    if port_in_use "$port"; then
      log "警告: 端口 $port 已被占用"
    else
      log "端口 $port 可用"
    fi
  done
}

# Report the installed Redis version and service state; install when absent.
check_redis_installation() {
  log "检查Redis安装状态..."

  if command -v redis-server > /dev/null 2>&1; then
    local redis_version
    redis_version=$(redis-server --version | head -1)
    log "Redis已安装: $redis_version"

    if systemctl is-active --quiet "$(redis_unit)"; then
      log "Redis服务运行正常"
    else
      log "Redis服务未运行"
    fi
  else
    log "Redis未安装,开始安装..."
    install_redis
  fi
}

# Install Redis with apt-get or yum, then start and enable the service.
# Exits 1 when neither package manager exists or the installation fails.
install_redis() {
  log "开始安装Redis..."

  if command -v apt-get > /dev/null 2>&1; then
    sudo apt-get update
    sudo apt-get install -y redis-server redis-tools || {
      log "错误: Redis安装失败"
      exit 1
    }
  elif command -v yum > /dev/null 2>&1; then
    # Refresh metadata only; a full "yum update" would upgrade every package
    # on the host as a side effect of an environment check.
    sudo yum makecache
    sudo yum install -y redis || {
      log "错误: Redis安装失败"
      exit 1
    }
  else
    log "错误: 不支持的包管理器"
    exit 1
  fi

  local unit
  unit=$(redis_unit)
  sudo systemctl start "$unit"
  sudo systemctl enable "$unit"

  log "Redis安装完成"
}

main() {
  log "=== Redis主备环境检查开始 ==="
  check_system_environment
  check_redis_installation
  log "=== Redis主备环境检查完成 ==="
}

main "$@"
#!/bin/bash
# Section 2.2 of the article (original heading: "2.2 Redis主备配置优化").
#
# Writes redis.conf for ONE role (master or slave), writes sentinel.conf,
# installs the two Sentinel hook scripts and restarts both services.
#
# Usage: $0 [master|slave]
#   Without an argument the role is detected by comparing this host's
#   addresses (hostname -I) with MASTER_IP.
# Environment overrides: MASTER_IP, SLAVE_IP, REDIS_PORT, SENTINEL_PORT,
#   REDIS_PASSWORD, SENTINEL_QUORUM.
#
# NOTE(review): the generated files keep "daemonize yes". Whether that suits
# the packaged systemd unit depends on the distribution's unit type; confirm
# against the unit file before using this in production.

set -uo pipefail

MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
SENTINEL_PORT="${SENTINEL_PORT:-26379}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"
SENTINEL_QUORUM="${SENTINEL_QUORUM:-2}"

REDIS_CONF=/etc/redis/redis.conf
SENTINEL_CONF=/etc/redis/sentinel.conf
SCRIPT_DIR=/opt/redis-scripts

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Hand a generated config file to the redis user and make it private.
# Both Redis (CONFIG REWRITE) and Sentinel rewrite their own config, so the
# service user must own it. Permissions are tightened only when the chown
# worked; otherwise the service could no longer read its file.
secure_conf() {
  if chown redis:redis "$1" 2> /dev/null; then
    chmod 640 "$1"
  fi
}

# Print "master" when one of this host's addresses equals MASTER_IP,
# otherwise "slave".
detect_role() {
  local ip
  for ip in $(hostname -I 2> /dev/null); do
    if [[ "$ip" == "$MASTER_IP" ]]; then
      echo "master"
      return 0
    fi
  done
  echo "slave"
}

# Copy the existing Redis and Sentinel configs to timestamped backups.
backup_config_files() {
  log "备份Redis配置文件..."

  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)

  if [[ -f "$REDIS_CONF" ]]; then
    cp -p "$REDIS_CONF" "$REDIS_CONF.backup.$stamp"
    log "主节点配置文件已备份"
  fi

  if [[ -f "$SENTINEL_CONF" ]]; then
    cp -p "$SENTINEL_CONF" "$SENTINEL_CONF.backup.$stamp"
    log "哨兵配置文件已备份"
  fi
}

# Write redis.conf for role $1 ("master" or "slave"). The two roles differ
# only in the title line and the replication section.
write_redis_conf() {
  local role=$1 title replication

  if [[ "$role" == "slave" ]]; then
    title="Redis从节点配置"
    replication="replicaof $MASTER_IP $REDIS_PORT
replica-read-only yes
replica-serve-stale-data yes
replica-priority 100"
  else
    title="Redis主节点配置"
    replication="replica-read-only yes
replica-serve-stale-data yes"
  fi

  cat > "$REDIS_CONF" << EOF
# $title
bind 0.0.0.0
port $REDIS_PORT
timeout 300
tcp-keepalive 300

# 内存配置
maxmemory 2gb
maxmemory-policy allkeys-lru

# 持久化配置
save 900 1
save 300 10
save 60 10000
appendonly yes
appendfsync everysec
appendfilename "appendonly.aof"

# 主从复制配置
$replication

# 安全配置
requirepass $REDIS_PASSWORD
masterauth $REDIS_PASSWORD

# 日志配置
loglevel notice
logfile /var/log/redis/redis-server.log

# 其他配置
daemonize yes
pidfile /var/run/redis/redis-server.pid
dir /var/lib/redis
EOF
  secure_conf "$REDIS_CONF"
}

# Write the master's redis.conf.
configure_master_node() {
  log "配置Redis主节点..."
  write_redis_conf master
  log "主节点配置完成"
}

# Write the replica's redis.conf (adds replicaof and replica-priority).
configure_slave_node() {
  log "配置Redis从节点..."
  write_redis_conf slave
  log "从节点配置完成"
}

# Write sentinel.conf monitoring MASTER_IP:REDIS_PORT under the name mymaster.
configure_sentinel() {
  log "配置Redis哨兵..."

  cat > "$SENTINEL_CONF" << EOF
# Redis哨兵配置
port $SENTINEL_PORT
bind 0.0.0.0
sentinel deny-scripts-reconfig yes

# 监控主节点
sentinel monitor mymaster $MASTER_IP $REDIS_PORT $SENTINEL_QUORUM
sentinel auth-pass mymaster $REDIS_PASSWORD
sentinel down-after-milliseconds mymaster 30000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 180000
sentinel notification-script mymaster $SCRIPT_DIR/notify.sh
sentinel client-reconfig-script mymaster $SCRIPT_DIR/reconfig.sh

# 日志配置
logfile /var/log/redis/sentinel.log
loglevel notice

# 其他配置
daemonize yes
pidfile /var/run/redis/sentinel.pid
dir /var/lib/redis
EOF
  secure_conf "$SENTINEL_CONF"

  log "哨兵配置完成"
}

# Install notify.sh and reconfig.sh, the hooks referenced by sentinel.conf.
# Both heredocs are quoted so nothing is expanded while generating them.
create_notification_scripts() {
  log "创建哨兵通知脚本..."

  mkdir -p "$SCRIPT_DIR"

  cat > "$SCRIPT_DIR/notify.sh" << 'EOF'
#!/bin/bash
# Sentinel notification hook.
# Sentinel passes exactly two arguments: <event-type> <event-description>.
EVENT_TYPE=${1:-}
EVENT_DESC=${2:-}
LOG_FILE=/var/log/redis/sentinel-notify.log

# Send a mail (when mail(1) exists) and a DingTalk message.
# Defined before the case statement because bash resolves functions at call
# time. curl is time-limited because Sentinel kills slow hook scripts.
send_alert() {
  local subject="$1"
  local message="$2"

  if command -v mail > /dev/null 2>&1; then
    echo "$message" | mail -s "$subject" admin@example.com
  fi

  curl -s --max-time 10 -X POST \
    "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN" \
    -H 'Content-Type: application/json' \
    -d "{\"msgtype\":\"text\",\"text\":{\"content\":\"$subject\\n$message\"}}" \
    > /dev/null
}

echo "[$(date '+%Y-%m-%d %H:%M:%S')] Sentinel notification: $EVENT_TYPE $EVENT_DESC" >> "$LOG_FILE"

case $EVENT_TYPE in
  +switch-master)
    # Description: <master-name> <old-ip> <old-port> <new-ip> <new-port>
    read -r NAME OLD_IP OLD_PORT NEW_IP NEW_PORT <<< "$EVENT_DESC"
    echo "主节点切换: $NAME $OLD_IP:$OLD_PORT -> $NEW_IP:$NEW_PORT"
    send_alert "Redis主节点切换" "主节点已从 $OLD_IP:$OLD_PORT 切换到 $NEW_IP:$NEW_PORT"
    ;;
  *)
    # Description: <instance-type> <name> <ip> <port> [@ <master> <ip> <port>]
    read -r INSTANCE NAME IP PORT _ <<< "$EVENT_DESC"
    case "$EVENT_TYPE:$INSTANCE" in
      +sdown:master)
        echo "主节点 $NAME 主观下线: $IP:$PORT"
        send_alert "Redis主节点主观下线" "主节点 $NAME ($IP:$PORT) 主观下线"
        ;;
      -sdown:master)
        echo "主节点 $NAME 主观上线: $IP:$PORT"
        ;;
      +odown:master)
        echo "主节点 $NAME 客观下线: $IP:$PORT"
        send_alert "Redis主节点客观下线" "主节点 $NAME ($IP:$PORT) 客观下线,开始故障转移"
        ;;
      +slave:*)
        echo "从节点上线: $NAME $IP:$PORT"
        ;;
      -slave:*)
        echo "从节点下线: $NAME $IP:$PORT"
        ;;
    esac
    ;;
esac

# Exit 0 unconditionally: Sentinel re-runs a hook that exits with status 1.
exit 0
EOF

  chmod +x "$SCRIPT_DIR/notify.sh"

  cat > "$SCRIPT_DIR/reconfig.sh" << 'EOF'
#!/bin/bash
# Sentinel client-reconfig hook. Arguments passed by Sentinel:
#   <master-name> <role> <state> <from-ip> <from-port> <to-ip> <to-port>
FROM_IP=${4:-}
FROM_PORT=${5:-}
TO_IP=${6:-}
TO_PORT=${7:-}
APP_CONF=/opt/app/config/redis.conf

echo "[$(date '+%Y-%m-%d %H:%M:%S')] Sentinel reconfig: $TO_IP:$TO_PORT" >> /var/log/redis/sentinel-reconfig.log

# Replace old-master address with the new one in the application config and
# restart the application. Does nothing when addresses or file are missing.
update_application_config() {
  [ -n "$FROM_IP" ] && [ -n "$TO_IP" ] || return 0
  [ -f "$APP_CONF" ] || return 0

  # Escape the dots so the IP is matched literally by sed.
  local from_re=${FROM_IP//./\\.}
  sed -i "s/${from_re}:${FROM_PORT}/${TO_IP}:${TO_PORT}/g" "$APP_CONF"

  systemctl restart application-service

  echo "应用配置已更新: $TO_IP:$TO_PORT"
}

update_application_config
exit 0
EOF

  chmod +x "$SCRIPT_DIR/reconfig.sh"

  log "通知脚本创建完成"
}

# Restart and enable redis and redis-sentinel; exit 1 if either is not
# active three seconds later.
start_services() {
  log "启动Redis服务..."

  systemctl restart redis
  systemctl enable redis

  systemctl restart redis-sentinel
  systemctl enable redis-sentinel

  sleep 3

  if systemctl is-active --quiet redis; then
    log "Redis服务启动成功"
  else
    log "错误: Redis服务启动失败"
    exit 1
  fi

  if systemctl is-active --quiet redis-sentinel; then
    log "哨兵服务启动成功"
  else
    log "错误: 哨兵服务启动失败"
    exit 1
  fi
}

# Entry point. $1 (optional) is the role of this host: master or slave.
main() {
  local role=${1:-}
  if [[ -z "$role" ]]; then
    role=$(detect_role)
  fi

  case "$role" in
    master | slave) ;;
    *)
      echo "用法: $0 [master|slave]"
      exit 2
      ;;
  esac

  if [[ $EUID -ne 0 ]]; then
    log "错误: 需要root权限"
    exit 1
  fi

  log "=== Redis主备配置开始 ==="

  backup_config_files
  # Only one role's config is written. Writing both would leave the replica
  # config on every host, since they share the same path.
  if [[ "$role" == "master" ]]; then
    configure_master_node
  else
    configure_slave_node
  fi
  configure_sentinel
  create_notification_scripts
  start_services

  log "=== Redis主备配置完成 ==="
}

main "$@"
#!/bin/bash
# Section 3.1 of the article (original heading: "3. 主从复制管理 / 3.1 主从复制配置脚本").
#
# Inspect and manage the replication link between one master and one replica.
#
# Usage: $0 {status|setup|sync|break|reestablish|consistency}
# Environment overrides: MASTER_IP, SLAVE_IP, REDIS_PORT, REDIS_PASSWORD,
#   SYNC_MAX_WAIT, SYNC_POLL_INTERVAL, KEY_SAMPLE_LIMIT.

set -uo pipefail

MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"
SYNC_MAX_WAIT="${SYNC_MAX_WAIT:-300}"
SYNC_POLL_INTERVAL="${SYNC_POLL_INTERVAL:-10}"
KEY_SAMPLE_LIMIT="${KEY_SAMPLE_LIMIT:-1000}"

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Run redis-cli against host $1 with the remaining arguments.
# The password travels in REDISCLI_AUTH rather than "-a" so it does not
# appear in the process list.
rcli() {
  local host=$1
  shift
  REDISCLI_AUTH="$REDIS_PASSWORD" redis-cli -h "$host" -p "$REDIS_PORT" "$@"
}

# Succeed when host $1 answers PING with PONG.
is_alive() {
  [[ "$(rcli "$1" ping 2> /dev/null)" == "PONG" ]]
}

# Read INFO text on stdin and print the value of field $1 (CR stripped).
info_field() {
  awk -F: -v key="$1" '$1 == key { sub(/\r$/, "", $2); print $2; exit }'
}

# Print field $2 of "INFO replication" from host $1.
repl_field() {
  rcli "$1" info replication | info_field "$2"
}

# Report connectivity and replication state of both nodes.
check_replication_status() {
  log "检查Redis主从复制状态..."

  if is_alive "$MASTER_IP"; then
    log "主节点连接正常: $MASTER_IP:$REDIS_PORT"

    local master_info connected_slaves
    master_info=$(rcli "$MASTER_IP" info replication)
    connected_slaves=$(info_field connected_slaves <<< "$master_info")

    log "主节点连接的从节点数: $connected_slaves"
    grep "slave" <<< "$master_info"
  else
    log "错误: 主节点连接失败: $MASTER_IP:$REDIS_PORT"
  fi

  if is_alive "$SLAVE_IP"; then
    log "从节点连接正常: $SLAVE_IP:$REDIS_PORT"

    local slave_info link_status sync_in_progress
    slave_info=$(rcli "$SLAVE_IP" info replication)
    link_status=$(info_field master_link_status <<< "$slave_info")
    sync_in_progress=$(info_field master_sync_in_progress <<< "$slave_info")

    log "从节点主从连接状态: $link_status"
    log "从节点同步状态: $sync_in_progress"

    if [[ "$link_status" == "up" ]]; then
      log "从节点复制正常"
    else
      log "警告: 从节点复制异常"
    fi
  else
    log "错误: 从节点连接失败: $SLAVE_IP:$REDIS_PORT"
  fi
}

# Make SLAVE_IP a replica of MASTER_IP unless it already is one.
# Returns non-zero when the initial sync does not finish in time.
setup_master_slave() {
  log "建立Redis主从关系..."

  local slave_info role master_host
  slave_info=$(rcli "$SLAVE_IP" info replication)
  role=$(info_field role <<< "$slave_info")
  master_host=$(info_field master_host <<< "$slave_info")

  # A replica of some other master still has to be re-pointed.
  if [[ "$role" == "slave" && "$master_host" == "$MASTER_IP" ]]; then
    log "从节点已配置为主从关系"
    return 0
  fi

  log "配置从节点为主从关系..."
  rcli "$SLAVE_IP" replicaof "$MASTER_IP" "$REDIS_PORT"

  wait_for_sync_completion || return 1
  log "主从关系建立完成"
}

# Poll the replica until the link is up and no sync is in progress.
# Checking master_sync_in_progress alone is not enough: it is also 0 before
# the replica has connected to the master at all.
# Returns 1 after SYNC_MAX_WAIT seconds.
wait_for_sync_completion() {
  log "等待主从同步完成..."

  local waited=0 link_status sync_in_progress step
  # Guard against a zero interval turning the loop into a busy spin.
  step=$((SYNC_POLL_INTERVAL > 0 ? SYNC_POLL_INTERVAL : 1))

  while ((waited < SYNC_MAX_WAIT)); do
    link_status=$(repl_field "$SLAVE_IP" master_link_status)
    sync_in_progress=$(repl_field "$SLAVE_IP" master_sync_in_progress)

    if [[ "$link_status" == "up" && "$sync_in_progress" == "0" ]]; then
      log "主从同步完成"
      return 0
    fi

    log "同步进行中,等待中... ($waited/$SYNC_MAX_WAIT)"
    sleep "$SYNC_POLL_INTERVAL"
    waited=$((waited + step))
  done

  log "警告: 同步超时,请检查网络和配置"
  return 1
}

# Force the replica to reconnect to the master.
# Sending "sync" through redis-cli does not do this: SYNC switches the client
# itself into replication-stream mode and blocks. Detaching and re-attaching
# the replica makes it negotiate a new (partial or full) resync.
manual_sync() {
  log "执行手动同步..."

  rcli "$SLAVE_IP" replicaof no one > /dev/null
  rcli "$SLAVE_IP" replicaof "$MASTER_IP" "$REDIS_PORT" > /dev/null

  log "手动同步命令已发送"
}

# Detach the replica so it becomes a standalone master.
break_master_slave() {
  log "断开主从关系..."

  rcli "$SLAVE_IP" replicaof no one

  log "主从关系已断开"
}

# Detach the replica, wait five seconds, and attach it again.
reestablish_master_slave() {
  log "重新建立主从关系..."

  break_master_slave
  sleep 5
  setup_master_slave
}

# Compare DBSIZE on both nodes, then compare sampled keys.
check_data_consistency() {
  log "检查主从数据一致性..."

  local master_keys slave_keys
  master_keys=$(rcli "$MASTER_IP" dbsize)
  log "主节点键数量: $master_keys"

  slave_keys=$(rcli "$SLAVE_IP" dbsize)
  log "从节点键数量: $slave_keys"

  if [[ "$master_keys" == "$slave_keys" ]]; then
    log "主从数据一致性检查通过"
  else
    log "警告: 主从数据不一致 (主:$master_keys, 从:$slave_keys)"
  fi

  check_key_consistency
}

# Compare up to KEY_SAMPLE_LIMIT keys between master and replica.
# Keys come from SCAN (redis-cli --scan) instead of KEYS *, which blocks the
# server while it walks the whole keyspace. The key type is compared for every
# key; the value is compared only for strings, because GET fails on other
# types.
check_key_consistency() {
  log "检查关键键的一致性..."

  local key checked=0 inconsistent_keys=0
  local master_type slave_type master_value slave_value

  while IFS= read -r key; do
    if ((checked >= KEY_SAMPLE_LIMIT)); then
      break
    fi
    checked=$((checked + 1))

    master_type=$(rcli "$MASTER_IP" type "$key")
    slave_type=$(rcli "$SLAVE_IP" type "$key")

    if [[ "$master_type" != "$slave_type" ]]; then
      log "警告: 键 $key 值不一致"
      inconsistent_keys=$((inconsistent_keys + 1))
      continue
    fi

    if [[ "$master_type" == "string" ]]; then
      master_value=$(rcli "$MASTER_IP" get "$key")
      slave_value=$(rcli "$SLAVE_IP" get "$key")
      if [[ "$master_value" != "$slave_value" ]]; then
        log "警告: 键 $key 值不一致"
        inconsistent_keys=$((inconsistent_keys + 1))
      fi
    fi
  done < <(rcli "$MASTER_IP" --scan)

  if ((inconsistent_keys == 0)); then
    log "所有键值一致性检查通过"
  else
    log "发现 $inconsistent_keys 个键值不一致"
  fi
}

main() {
  case "${1:-}" in
    "status")
      check_replication_status
      ;;
    "setup")
      setup_master_slave
      ;;
    "sync")
      manual_sync
      ;;
    "break")
      break_master_slave
      ;;
    "reestablish")
      reestablish_master_slave
      ;;
    "consistency")
      check_data_consistency
      ;;
    *)
      echo "用法: $0 {status|setup|sync|break|reestablish|consistency}"
      echo "  status - 检查主从状态"
      echo "  setup - 建立主从关系"
      echo "  sync - 手动同步"
      echo "  break - 断开主从关系"
      echo "  reestablish - 重新建立主从关系"
      echo "  consistency - 检查数据一致性"
      ;;
  esac
}

main "$@"
#!/bin/bash
# Section 3.2 of the article (original heading: "3.2 数据同步监控脚本").
#
# Records replication metrics of one master and one replica, writes a
# point-in-time report, and installs a background alert loop.
#
# Usage: $0 {monitor|report|alerts}
# Environment overrides: MASTER_IP, SLAVE_IP, REDIS_PORT, REDIS_PASSWORD,
#   MONITOR_INTERVAL, LOG_FILE, REPORT_DIR, LAG_ALERT_BYTES.

set -uo pipefail

MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"
MONITOR_INTERVAL="${MONITOR_INTERVAL:-30}"
LOG_FILE="${LOG_FILE:-/var/log/redis-sync-monitor.log}"
REPORT_DIR="${REPORT_DIR:-/var/log}"
LAG_ALERT_BYTES="${LAG_ALERT_BYTES:-1000}"
ALERT_SCRIPT=/opt/redis-sync-alert.sh
ALERT_PIDFILE=/var/run/redis-sync-alert.pid

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Append one metric line to LOG_FILE.
# Arguments: $1 metric name, $2 value (may be empty), $3 node label.
log_sync_data() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local metric=$1
  local value=$2
  local node=$3

  echo "[$timestamp] Node:$node Metric:$metric Value:$value" >> "$LOG_FILE"
}

# Run redis-cli against host $1; the password is passed via REDISCLI_AUTH so
# it does not show up in the process list.
rcli() {
  local host=$1
  shift
  REDISCLI_AUTH="$REDIS_PASSWORD" redis-cli -h "$host" -p "$REDIS_PORT" "$@"
}

# Succeed when host $1 answers PING with PONG.
is_alive() {
  [[ "$(rcli "$1" ping 2> /dev/null)" == "PONG" ]]
}

# Read INFO text on stdin and print the value of field $1 (CR stripped).
info_field() {
  awk -F: -v key="$1" '$1 == key { sub(/\r$/, "", $2); print $2; exit }'
}

# Take one sample from both nodes and log it.
# Offsets are local to the sample, so a lag is never computed from a master
# offset left over from an earlier iteration.
collect_sync_sample() {
  local info master_offset="" slave_offset="" sync_lag

  if is_alive "$MASTER_IP"; then
    info=$(rcli "$MASTER_IP" info replication)
    master_offset=$(info_field master_repl_offset <<< "$info")

    # Values are quoted: an empty value must not shift the node argument.
    log_sync_data "connected_slaves" "$(info_field connected_slaves <<< "$info")" "master"
    log_sync_data "master_repl_offset" "$master_offset" "master"
  else
    log_sync_data "connection_status" "down" "master"
  fi

  if is_alive "$SLAVE_IP"; then
    info=$(rcli "$SLAVE_IP" info replication)
    slave_offset=$(info_field slave_repl_offset <<< "$info")

    log_sync_data "master_link_status" "$(info_field master_link_status <<< "$info")" "slave"
    log_sync_data "master_sync_in_progress" "$(info_field master_sync_in_progress <<< "$info")" "slave"
    log_sync_data "slave_repl_offset" "$slave_offset" "slave"
    log_sync_data "master_last_io_seconds_ago" "$(info_field master_last_io_seconds_ago <<< "$info")" "slave"

    if [[ -n "$master_offset" && -n "$slave_offset" ]]; then
      sync_lag=$((master_offset - slave_offset))
      log_sync_data "sync_lag" "$sync_lag" "slave"

      if ((sync_lag > LAG_ALERT_BYTES)); then
        log_sync_data "sync_lag_alert" "high" "slave"
      fi
    fi
  else
    log_sync_data "connection_status" "down" "slave"
  fi
}

# Sample both nodes every MONITOR_INTERVAL seconds.
# Arguments: $1 (optional) number of samples; 0 or absent means run forever.
monitor_replication_sync() {
  local iterations=${1:-0} taken=0

  log "开始监控Redis主从同步状态..."

  while true; do
    collect_sync_sample
    taken=$((taken + 1))
    if ((iterations > 0 && taken >= iterations)); then
      break
    fi
    sleep "$MONITOR_INTERVAL"
  done
}

# Write INFO replication of both nodes plus lag and key counts to a dated
# report file in REPORT_DIR.
generate_sync_report() {
  local report_file
  report_file="$REPORT_DIR/redis-sync-report-$(date +%Y%m%d).txt"

  log "生成Redis同步报告: $report_file"

  local master_info slave_info master_offset slave_offset
  master_info=$(rcli "$MASTER_IP" info replication)
  slave_info=$(rcli "$SLAVE_IP" info replication)
  master_offset=$(info_field master_repl_offset <<< "$master_info")
  slave_offset=$(info_field slave_repl_offset <<< "$slave_info")

  {
    echo "Redis主从同步报告"
    echo "生成时间: $(date)"
    echo "========================================"
    echo ""
    echo "主节点 ($MASTER_IP:$REDIS_PORT) 信息:"
    echo "----------------------------------------"
    echo "$master_info"
    echo ""
    echo "从节点 ($SLAVE_IP:$REDIS_PORT) 信息:"
    echo "----------------------------------------"
    echo "$slave_info"
    echo ""
    echo "同步统计:"
    echo "----------------------------------------"
    if [[ -n "$master_offset" && -n "$slave_offset" ]]; then
      echo "同步延迟: $((master_offset - slave_offset)) 字节"
    fi
    echo "主节点键数量: $(rcli "$MASTER_IP" dbsize)"
    echo "从节点键数量: $(rcli "$SLAVE_IP" dbsize)"
  } > "$report_file"

  log "同步报告生成完成: $report_file"
}

# Install the alert loop script and (re)start it in the background.
# A pid file keeps repeated calls from piling up duplicate loops.
setup_sync_alerts() {
  log "设置Redis同步告警..."

  # Quoted heredoc: the generated script resolves its settings at run time
  # from the environment exported when it is launched below.
  cat > "$ALERT_SCRIPT" << 'EOF'
#!/bin/bash
# Generated replication alert loop. Settings come from the environment.
MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
export REDISCLI_AUTH="${REDISCLI_AUTH:-redis123}"

# Print field $2 of INFO replication from host $1.
repl_field() {
  redis-cli -h "$1" -p "$REDIS_PORT" info replication \
    | awk -F: -v key="$2" '$1 == key { sub(/\r$/, "", $2); print $2; exit }'
}

check_sync_alerts() {
  if ! redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" ping > /dev/null 2>&1; then
    echo "告警: 主节点连接失败 ($MASTER_IP:$REDIS_PORT)"
    return
  fi

  if ! redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" ping > /dev/null 2>&1; then
    echo "告警: 从节点连接失败 ($SLAVE_IP:$REDIS_PORT)"
    return
  fi

  local link_status master_offset slave_offset sync_lag last_io
  link_status=$(repl_field "$SLAVE_IP" master_link_status)
  if [ "$link_status" != "up" ]; then
    echo "告警: 从节点主从连接异常 ($SLAVE_IP:$REDIS_PORT)"
  fi

  master_offset=$(repl_field "$MASTER_IP" master_repl_offset)
  slave_offset=$(repl_field "$SLAVE_IP" slave_repl_offset)
  if [ -n "$master_offset" ] && [ -n "$slave_offset" ]; then
    sync_lag=$((master_offset - slave_offset))
    if [ "$sync_lag" -gt 10000 ]; then
      echo "告警: 同步延迟过高 ($sync_lag 字节)"
    fi
  fi

  last_io=$(repl_field "$SLAVE_IP" master_last_io_seconds_ago)
  if [ -n "$last_io" ] && [ "$last_io" -gt 60 ]; then
    echo "告警: 从节点最后IO时间过长 ($last_io 秒)"
  fi
}

while true; do
  check_sync_alerts
  sleep 300
done
EOF

  chmod +x "$ALERT_SCRIPT"

  # Stop a loop started by an earlier call before launching the new one.
  local old_pid
  if [[ -f "$ALERT_PIDFILE" ]]; then
    old_pid=$(< "$ALERT_PIDFILE")
    if [[ -n "$old_pid" ]] && kill -0 "$old_pid" 2> /dev/null; then
      kill "$old_pid"
    fi
  fi

  MASTER_IP="$MASTER_IP" SLAVE_IP="$SLAVE_IP" REDIS_PORT="$REDIS_PORT" \
    REDISCLI_AUTH="$REDIS_PASSWORD" \
    nohup "$ALERT_SCRIPT" > /var/log/redis-sync-alert.log 2>&1 &
  echo $! > "$ALERT_PIDFILE"

  log "同步告警设置完成"
}

main() {
  case "${1:-}" in
    "monitor")
      monitor_replication_sync
      ;;
    "report")
      generate_sync_report
      ;;
    "alerts")
      setup_sync_alerts
      ;;
    *)
      echo "用法: $0 {monitor|report|alerts}"
      echo "  monitor - 开始同步监控"
      echo "  report - 生成同步报告"
      echo "  alerts - 设置同步告警"
      ;;
  esac
}

main "$@"
#!/bin/bash
# Section 4.1 of the article (original heading: "4. 哨兵模式管理 / 4.1 哨兵部署脚本").
#
# Deploy, inspect, extend and shrink a set of Sentinel processes on this
# host, and exercise a failover.
#
# Usage: $0 {deploy|status|test|failover|add|remove}
# Environment overrides: MASTER_IP, MASTER_PORT, MASTER_PASSWORD, MASTER_NAME,
#   QUORUM, SENTINEL_BASE_DIR, FAILOVER_MAX_WAIT, FAILOVER_POLL_INTERVAL,
#   SENTINEL_RESET_WAIT.

set -uo pipefail

SENTINEL_PORTS=(26379 26380 26381)
MASTER_IP="${MASTER_IP:-192.168.1.10}"
MASTER_PORT="${MASTER_PORT:-6379}"
MASTER_PASSWORD="${MASTER_PASSWORD:-redis123}"
MASTER_NAME="${MASTER_NAME:-mymaster}"
QUORUM="${QUORUM:-2}"
SENTINEL_BASE_DIR="${SENTINEL_BASE_DIR:-/opt/redis-sentinel}"
NOTIFY_SCRIPT=/opt/redis-scripts/notify.sh
RECONFIG_SCRIPT=/opt/redis-scripts/reconfig.sh
FAILOVER_MAX_WAIT="${FAILOVER_MAX_WAIT:-300}"
FAILOVER_POLL_INTERVAL="${FAILOVER_POLL_INTERVAL:-10}"
SENTINEL_RESET_WAIT="${SENTINEL_RESET_WAIT:-30}"

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Succeed when $1 is a decimal TCP port number (1-65535).
is_valid_port() {
  [[ "${1:-}" =~ ^[0-9]+$ ]] && ((10#$1 >= 1 && 10#$1 <= 65535))
}

# Print "<ip> <port>" of the current master as seen by the Sentinel on port
# $1 (default: first configured port). Prints nothing when unknown.
# "SENTINEL MASTERS" is not used because redis-cli prints one field per line
# when piped, so column-based parsing of it yields nothing.
master_addr() {
  local port=${1:-${SENTINEL_PORTS[0]}}
  redis-cli -p "$port" sentinel get-master-addr-by-name "$MASTER_NAME" 2> /dev/null \
    | awk 'NF { sub(/\r$/, ""); a[++n] = $0 } END { if (n == 2) print a[1], a[2] }'
}

# Write $SENTINEL_BASE_DIR/<port>/sentinel.conf for port $1.
# Hook scripts are referenced only when they exist and are executable,
# because Sentinel refuses to start with a missing script path.
write_sentinel_conf() {
  local port=$1
  local sentinel_dir="$SENTINEL_BASE_DIR/$port"
  local announce_ip
  announce_ip=$(hostname -I 2> /dev/null | awk '{print $1}')

  mkdir -p "$sentinel_dir" || return 1

  {
    cat << EOF
# Redis哨兵配置文件
port $port
bind 0.0.0.0
sentinel deny-scripts-reconfig yes

# 监控主节点
sentinel monitor $MASTER_NAME $MASTER_IP $MASTER_PORT $QUORUM
sentinel auth-pass $MASTER_NAME $MASTER_PASSWORD
sentinel down-after-milliseconds $MASTER_NAME 30000
sentinel parallel-syncs $MASTER_NAME 1
sentinel failover-timeout $MASTER_NAME 180000
EOF
    if [[ -x "$NOTIFY_SCRIPT" ]]; then
      echo "sentinel notification-script $MASTER_NAME $NOTIFY_SCRIPT"
    fi
    if [[ -x "$RECONFIG_SCRIPT" ]]; then
      echo "sentinel client-reconfig-script $MASTER_NAME $RECONFIG_SCRIPT"
    fi
    echo ""
    echo "# 哨兵配置"
    if [[ -n "$announce_ip" ]]; then
      echo "sentinel announce-ip $announce_ip"
    fi
    cat << EOF
sentinel announce-port $port

# 日志配置
logfile $sentinel_dir/sentinel.log
loglevel notice

# 其他配置
daemonize yes
pidfile $sentinel_dir/sentinel.pid
dir $sentinel_dir
EOF
  } > "$sentinel_dir/sentinel.conf"
}

# Create and start one Sentinel per entry of SENTINEL_PORTS.
# A port that already answers PING is left alone: rewriting the config of a
# live Sentinel would discard the state it has written into that file.
# Exits 1 when a Sentinel cannot be started.
deploy_sentinel_nodes() {
  log "部署Redis哨兵节点..."

  local port sentinel_id
  for port in "${SENTINEL_PORTS[@]}"; do
    sentinel_id="sentinel-$port"

    log "部署哨兵节点 $sentinel_id (端口: $port)..."

    if redis-cli -p "$port" ping > /dev/null 2>&1; then
      log "哨兵节点 $sentinel_id 已在运行,跳过"
      continue
    fi

    if write_sentinel_conf "$port" \
      && redis-sentinel "$SENTINEL_BASE_DIR/$port/sentinel.conf"; then
      log "哨兵节点 $sentinel_id 启动成功"
    else
      log "错误: 哨兵节点 $sentinel_id 启动失败"
      exit 1
    fi
  done

  sleep 5
}

# Print master and peer information from every configured Sentinel.
check_sentinel_status() {
  log "检查Redis哨兵状态..."

  local port
  for port in "${SENTINEL_PORTS[@]}"; do
    log "检查哨兵节点 $port..."

    if redis-cli -p "$port" ping > /dev/null 2>&1; then
      log "哨兵节点 $port 运行正常"

      log "哨兵 $port 监控的主节点信息:"
      redis-cli -p "$port" sentinel masters

      log "哨兵 $port 哨兵列表:"
      redis-cli -p "$port" sentinel sentinels "$MASTER_NAME"
    else
      log "错误: 哨兵节点 $port 连接失败"
    fi
  done
}

# Poll Sentinel until the master address differs from "$1" ("<ip> <port>").
# An empty answer (Sentinel unreachable) is not treated as a change.
# Returns 1 after FAILOVER_MAX_WAIT seconds.
wait_for_new_master() {
  local old_addr=$1 new_addr waited=0 step
  step=$((FAILOVER_POLL_INTERVAL > 0 ? FAILOVER_POLL_INTERVAL : 1))

  while ((waited < FAILOVER_MAX_WAIT)); do
    new_addr=$(master_addr)
    if [[ -n "$new_addr" && "$new_addr" != "$old_addr" ]]; then
      log "故障转移成功: ${old_addr/ /:} -> ${new_addr/ /:}"
      return 0
    fi

    log "故障转移进行中,等待中... ($waited/$FAILOVER_MAX_WAIT)"
    sleep "$FAILOVER_POLL_INTERVAL"
    waited=$((waited + step))
  done

  log "警告: 故障转移超时"
  return 1
}

# Freeze the current master with DEBUG SLEEP 60 and wait for Sentinel to
# promote a replica.
# NOTE(review): DEBUG may be disabled on the target server — confirm.
test_failover() {
  log "测试Redis故障转移..."

  local old_addr old_ip old_port
  old_addr=$(master_addr)
  if [[ -z "$old_addr" ]]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi
  read -r old_ip old_port <<< "$old_addr"

  log "当前主节点: $old_ip:$old_port"

  log "模拟主节点故障..."
  REDISCLI_AUTH="$MASTER_PASSWORD" \
    redis-cli -h "$old_ip" -p "$old_port" debug sleep 60 > /dev/null 2>&1 &

  log "等待故障转移..."
  wait_for_new_master "$old_addr"
}

# Ask the first Sentinel to fail over MASTER_NAME.
# The reply text is checked instead of the exit status, because an error
# reply is not guaranteed to produce a non-zero exit.
manual_failover() {
  log "执行手动故障转移..."

  local reply
  reply=$(redis-cli -p "${SENTINEL_PORTS[0]}" sentinel failover "$MASTER_NAME" 2>&1)

  if [[ "$reply" == OK* ]]; then
    log "手动故障转移命令已发送"
  else
    log "错误: 手动故障转移失败"
    return 1
  fi
}

# Create and start an extra Sentinel.
# Arguments: $1 new port; $2 (optional) port of an existing Sentinel, used to
# confirm the new one was discovered.
add_sentinel_node() {
  local new_port=${1:-}
  local existing_port=${2:-}

  if ! is_valid_port "$new_port"; then
    log "错误: 无效的端口: $new_port"
    return 1
  fi

  log "添加哨兵节点 $new_port..."

  write_sentinel_conf "$new_port" || return 1
  redis-sentinel "$SENTINEL_BASE_DIR/$new_port/sentinel.conf" || return 1

  sleep 10

  if is_valid_port "$existing_port"; then
    # "SENTINEL SENTINELS" prints alternating key/value lines when piped.
    if ! redis-cli -p "$existing_port" sentinel sentinels "$MASTER_NAME" \
      | awk -v p="$new_port" '
          { sub(/\r$/, "") }
          prev == "port" && $0 == p { found = 1 }
          { prev = $0 }
          END { exit !found }'; then
      log "警告: 哨兵 $existing_port 尚未发现新节点 $new_port"
    fi
  fi

  log "哨兵节点 $new_port 添加完成"
}

# Stop the Sentinel on port $1, delete its directory, and make the remaining
# configured Sentinels forget it.
# The port is validated first: an empty value would otherwise turn the rm
# below into a removal of the whole base directory.
remove_sentinel_node() {
  local port=${1:-}

  if ! is_valid_port "$port"; then
    log "错误: 无效的端口: $port"
    return 1
  fi

  log "移除哨兵节点 $port..."

  redis-cli -p "$port" shutdown

  rm -rf -- "${SENTINEL_BASE_DIR:?}/${port:?}"

  # Sentinels never forget a peer on their own; each remaining instance is
  # reset in turn, with a pause so they can rediscover each other.
  local other
  for other in "${SENTINEL_PORTS[@]}"; do
    if [[ "$other" == "$port" ]]; then
      continue
    fi
    if redis-cli -p "$other" ping > /dev/null 2>&1; then
      redis-cli -p "$other" sentinel reset "$MASTER_NAME" > /dev/null
      sleep "$SENTINEL_RESET_WAIT"
    fi
  done

  log "哨兵节点 $port 移除完成"
}

main() {
  case "${1:-}" in
    "deploy")
      deploy_sentinel_nodes
      ;;
    "status")
      check_sentinel_status
      ;;
    "test")
      test_failover
      ;;
    "failover")
      manual_failover
      ;;
    "add")
      add_sentinel_node "${2:-}" "${3:-}"
      ;;
    "remove")
      remove_sentinel_node "${2:-}"
      ;;
    *)
      echo "用法: $0 {deploy|status|test|failover|add|remove}"
      echo "  deploy - 部署哨兵节点"
      echo "  status - 检查哨兵状态"
      echo "  test - 测试故障转移"
      echo "  failover - 手动故障转移"
      echo "  add <new_port> <existing_port> - 添加哨兵节点"
      echo "  remove <port> - 移除哨兵节点"
      ;;
  esac
}

main "$@"
#!/bin/bash
# Section 4.2 of the article (original heading: "4.2 哨兵监控脚本").
#
# Records Sentinel metrics, writes a point-in-time report, and installs a
# background alert loop.
#
# Usage: $0 {monitor|report|alerts}
# Environment overrides: MASTER_NAME, MONITOR_INTERVAL, LOG_FILE, REPORT_DIR.

set -uo pipefail

SENTINEL_PORTS=(26379 26380 26381)
MASTER_NAME="${MASTER_NAME:-mymaster}"
MONITOR_INTERVAL="${MONITOR_INTERVAL:-30}"
LOG_FILE="${LOG_FILE:-/var/log/redis-sentinel-monitor.log}"
REPORT_DIR="${REPORT_DIR:-/var/log}"
ALERT_SCRIPT=/opt/redis-sentinel-alert.sh
ALERT_PIDFILE=/var/run/redis-sentinel-alert.pid

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Append one metric line to LOG_FILE.
# Arguments: $1 metric name, $2 value (may be empty), $3 sentinel port.
log_sentinel_data() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local metric=$1
  local value=$2
  local sentinel=$3

  echo "[$timestamp] Sentinel:$sentinel Metric:$metric Value:$value" >> "$LOG_FILE"
}

# Print one attribute of MASTER_NAME from "INFO sentinel" of the Sentinel on
# port $1. $2 is the attribute: status, address, slaves or sentinels.
# The data lives on a line of the form
#   master0:name=mymaster,status=ok,address=ip:port,slaves=1,sentinels=3
# so there are no "slaves:" or "sentinels:" fields to grep for.
sentinel_master_field() {
  local port=$1 field=$2
  redis-cli -p "$port" info sentinel 2> /dev/null \
    | awk -v name="$MASTER_NAME" -v field="$field" '
        /^master[0-9]+:/ {
          sub(/\r$/, "")
          sub(/^master[0-9]+:/, "")
          n = split($0, pairs, ",")
          for (i = 1; i <= n; i++) {
            eq = index(pairs[i], "=")
            vals[substr(pairs[i], 1, eq - 1)] = substr(pairs[i], eq + 1)
          }
          if (vals["name"] == name) { print vals[field]; exit }
        }'
}

# Log one sample for the Sentinel on port $1.
sample_sentinel() {
  local port=$1 masters

  if ! redis-cli -p "$port" ping > /dev/null 2>&1; then
    log_sentinel_data "status" "down" "$port"
    return
  fi

  log_sentinel_data "status" "up" "$port"

  masters=$(redis-cli -p "$port" info sentinel \
    | awk -F: '$1 == "sentinel_masters" { sub(/\r$/, "", $2); print $2; exit }')

  log_sentinel_data "masters" "$masters" "$port"
  log_sentinel_data "slaves" "$(sentinel_master_field "$port" slaves)" "$port"
  log_sentinel_data "sentinels" "$(sentinel_master_field "$port" sentinels)" "$port"
  log_sentinel_data "master_status" "$(sentinel_master_field "$port" status)" "$port"
}

# Sample every Sentinel each MONITOR_INTERVAL seconds.
# Arguments: $1 (optional) number of rounds; 0 or absent means run forever.
monitor_sentinel_status() {
  local iterations=${1:-0} rounds=0 port

  log "开始监控Redis哨兵状态..."

  while true; do
    for port in "${SENTINEL_PORTS[@]}"; do
      sample_sentinel "$port"
    done

    rounds=$((rounds + 1))
    if ((iterations > 0 && rounds >= iterations)); then
      break
    fi
    sleep "$MONITOR_INTERVAL"
  done
}

# Write Sentinel, master, replica and peer details of every Sentinel to a
# dated report file in REPORT_DIR.
generate_sentinel_report() {
  local report_file
  report_file="$REPORT_DIR/redis-sentinel-report-$(date +%Y%m%d).txt"

  log "生成Redis哨兵报告: $report_file"

  local port
  {
    echo "Redis哨兵监控报告"
    echo "生成时间: $(date)"
    echo "========================================"
    echo ""

    for port in "${SENTINEL_PORTS[@]}"; do
      echo "哨兵节点 $port 信息:"
      echo "----------------------------------------"

      if redis-cli -p "$port" ping > /dev/null 2>&1; then
        redis-cli -p "$port" info sentinel
        echo ""

        echo "监控的主节点信息:"
        redis-cli -p "$port" sentinel masters
        echo ""

        echo "监控的从节点信息:"
        redis-cli -p "$port" sentinel slaves "$MASTER_NAME"
        echo ""

        echo "哨兵节点信息:"
        redis-cli -p "$port" sentinel sentinels "$MASTER_NAME"
        echo ""
      else
        echo "哨兵节点 $port 连接失败"
        echo ""
      fi
    done
  } > "$report_file"

  log "哨兵报告生成完成: $report_file"
}

# Install the alert loop script and (re)start it in the background.
# A pid file keeps repeated calls from piling up duplicate loops.
setup_sentinel_alerts() {
  log "设置Redis哨兵告警..."

  cat > "$ALERT_SCRIPT" << 'EOF'
#!/bin/bash
# Generated Sentinel alert loop.
SENTINEL_PORTS=(26379 26380 26381)
MASTER_NAME="${MASTER_NAME:-mymaster}"

# Print attribute $1 (status|slaves|sentinels) of MASTER_NAME taken from the
# "masterN:name=...,status=...,slaves=...,sentinels=..." line of INFO sentinel.
master_field() {
  redis-cli -p "${SENTINEL_PORTS[0]}" info sentinel 2> /dev/null \
    | awk -v name="$MASTER_NAME" -v field="$1" '
        /^master[0-9]+:/ {
          sub(/\r$/, "")
          sub(/^master[0-9]+:/, "")
          n = split($0, pairs, ",")
          for (i = 1; i <= n; i++) {
            eq = index(pairs[i], "=")
            vals[substr(pairs[i], 1, eq - 1)] = substr(pairs[i], eq + 1)
          }
          if (vals["name"] == name) { print vals[field]; exit }
        }'
}

check_sentinel_alerts() {
  local port
  for port in "${SENTINEL_PORTS[@]}"; do
    if ! redis-cli -p "$port" ping > /dev/null 2>&1; then
      echo "告警: 哨兵节点 $port 连接失败"
    fi
  done

  local master_status slave_count sentinel_count
  master_status=$(master_field status)
  if [ "$master_status" != "ok" ]; then
    echo "告警: 主节点状态异常 ($master_status)"
  fi

  slave_count=$(master_field slaves)
  if [ "${slave_count:-0}" -lt 1 ]; then
    echo "告警: 从节点数量不足 (${slave_count:-0})"
  fi

  sentinel_count=$(master_field sentinels)
  if [ "${sentinel_count:-0}" -lt 2 ]; then
    echo "告警: 哨兵数量不足 (${sentinel_count:-0})"
  fi
}

while true; do
  check_sentinel_alerts
  sleep 300
done
EOF

  chmod +x "$ALERT_SCRIPT"

  # Stop a loop started by an earlier call before launching the new one.
  local old_pid
  if [[ -f "$ALERT_PIDFILE" ]]; then
    old_pid=$(< "$ALERT_PIDFILE")
    if [[ -n "$old_pid" ]] && kill -0 "$old_pid" 2> /dev/null; then
      kill "$old_pid"
    fi
  fi

  MASTER_NAME="$MASTER_NAME" \
    nohup "$ALERT_SCRIPT" > /var/log/redis-sentinel-alert.log 2>&1 &
  echo $! > "$ALERT_PIDFILE"

  log "哨兵告警设置完成"
}

main() {
  case "${1:-}" in
    "monitor")
      monitor_sentinel_status
      ;;
    "report")
      generate_sentinel_report
      ;;
    "alerts")
      setup_sentinel_alerts
      ;;
    *)
      echo "用法: $0 {monitor|report|alerts}"
      echo "  monitor - 开始哨兵监控"
      echo "  report - 生成哨兵报告"
      echo "  alerts - 设置哨兵告警"
      ;;
  esac
}

main "$@"
#!/bin/bash
# Section 5.1 of the article (original heading: "5. 故障转移管理 / 5.1 故障转移脚本").
#
# Inspect, trigger, test and revert a Sentinel-driven failover.
#
# Usage: $0 {status|failover|test|restore}
# Environment overrides: MASTER_NAME, REDIS_PASSWORD, ORIGINAL_MASTER_IP,
#   FAILOVER_MAX_WAIT, FAILOVER_POLL_INTERVAL.

set -uo pipefail

SENTINEL_PORTS=(26379 26380 26381)
MASTER_NAME="${MASTER_NAME:-mymaster}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"
ORIGINAL_MASTER_IP="${ORIGINAL_MASTER_IP:-192.168.1.10}"
FAILOVER_MAX_WAIT="${FAILOVER_MAX_WAIT:-300}"
FAILOVER_POLL_INTERVAL="${FAILOVER_POLL_INTERVAL:-10}"

# Print a timestamped message on stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Run a command against the first configured Sentinel.
sentinel_cli() {
  redis-cli -p "${SENTINEL_PORTS[0]}" "$@"
}

# Run a command against the Redis node $1:$2; the password is passed via
# REDISCLI_AUTH so it does not show up in the process list.
node_cli() {
  local host=$1 port=$2
  shift 2
  REDISCLI_AUTH="$REDIS_PASSWORD" redis-cli -h "$host" -p "$port" "$@"
}

# Read alternating key/value lines on stdin (the shape redis-cli gives
# Sentinel replies when piped) and print the value that follows key $1.
kv_value() {
  awk -v want="$1" '
    { sub(/\r$/, "") }
    NR % 2 == 1 { key = $0; next }
    key == want { print; exit }'
}

# Print "<ip> <port>" of the current master, or nothing when unknown.
master_addr() {
  sentinel_cli sentinel get-master-addr-by-name "$MASTER_NAME" 2> /dev/null \
    | awk 'NF { sub(/\r$/, ""); a[++n] = $0 } END { if (n == 2) print a[1], a[2] }'
}

# Print one "<ip> <port>" line per replica known to Sentinel.
# In each replica record the ip pair precedes the port pair, so the line is
# emitted when the port value is seen.
list_replicas() {
  sentinel_cli sentinel slaves "$MASTER_NAME" 2> /dev/null \
    | awk '
        { sub(/\r$/, "") }
        NR % 2 == 1 { key = $0; next }
        key == "ip" { ip = $0 }
        key == "port" { print ip, $0 }'
}

# Print master address and flags plus the raw replica and Sentinel lists.
check_failover_status() {
  log "检查Redis故障转移状态..."

  local master_info current_master master_status
  master_info=$(sentinel_cli sentinel master "$MASTER_NAME")

  if [[ -n "$master_info" ]]; then
    current_master=$(kv_value ip <<< "$master_info")
    master_status=$(kv_value flags <<< "$master_info")

    log "当前主节点: $current_master"
    log "主节点状态: $master_status"

    log "从节点信息:"
    sentinel_cli sentinel slaves "$MASTER_NAME"

    log "哨兵信息:"
    sentinel_cli sentinel sentinels "$MASTER_NAME"
  else
    log "错误: 无法获取主节点信息"
  fi
}

# Ask Sentinel to fail over and wait until a different master is reported.
# The reply text is checked instead of the exit status, because an error
# reply is not guaranteed to produce a non-zero exit.
manual_failover() {
  log "执行手动故障转移..."

  local current_addr current_ip reply
  current_addr=$(master_addr)
  current_ip=${current_addr%% *}
  log "当前主节点: $current_ip"

  reply=$(sentinel_cli sentinel failover "$MASTER_NAME" 2>&1)

  if [[ "$reply" == OK* ]]; then
    log "故障转移命令已发送"
    wait_for_failover_completion "$current_ip"
  else
    log "错误: 故障转移命令失败"
    return 1
  fi
}

# Poll Sentinel until the master IP differs from $1, then update the
# application configuration. An empty answer (Sentinel unreachable) is not
# treated as a change. Returns 1 after FAILOVER_MAX_WAIT seconds.
wait_for_failover_completion() {
  local old_master=${1:-}
  local waited=0 new_master step
  step=$((FAILOVER_POLL_INTERVAL > 0 ? FAILOVER_POLL_INTERVAL : 1))

  log "等待故障转移完成..."

  while ((waited < FAILOVER_MAX_WAIT)); do
    new_master=$(master_addr)
    new_master=${new_master%% *}

    if [[ -n "$new_master" && "$new_master" != "$old_master" ]]; then
      log "故障转移成功: $old_master -> $new_master"
      update_application_config "$new_master"
      return 0
    fi

    log "故障转移进行中,等待中... ($waited/$FAILOVER_MAX_WAIT)"
    sleep "$FAILOVER_POLL_INTERVAL"
    waited=$((waited + step))
  done

  log "警告: 故障转移超时"
  return 1
}

# Point the application and nginx at the new master IP ($1) and restart the
# application service when it is running.
update_application_config() {
  local new_master=$1

  log "更新应用配置: $new_master"

  if [[ -f "/opt/app/config/redis.conf" ]]; then
    sed -i "s/^redis\.host=.*/redis.host=$new_master/" /opt/app/config/redis.conf
    log "应用配置文件已更新"
  fi

  if [[ -f "/etc/nginx/conf.d/redis.conf" ]]; then
    sed -i "s/server .*:6379/server $new_master:6379/" /etc/nginx/conf.d/redis.conf
    # Reload only a configuration nginx itself accepts.
    if nginx -t > /dev/null 2>&1; then
      nginx -s reload
      log "负载均衡器配置已更新"
    fi
  fi

  if systemctl is-active --quiet application-service; then
    systemctl restart application-service
    log "应用服务已重启"
  fi
}

# Run a manual failover, report how long it took, and verify the outcome.
test_failover() {
  log "测试Redis故障转移..."

  local test_start test_end current_addr
  test_start=$(date +%s)

  current_addr=$(master_addr)
  log "测试前主节点: ${current_addr%% *}"

  manual_failover

  test_end=$(date +%s)
  log "故障转移测试完成,耗时: $((test_end - test_start))秒"

  verify_failover_result
}

# Ping the new master and every replica, then compare key counts.
verify_failover_result() {
  log "验证故障转移结果..."

  local new_ip new_port
  read -r new_ip new_port <<< "$(master_addr)"

  if [[ -n "$new_ip" ]] && node_cli "$new_ip" "$new_port" ping > /dev/null 2>&1; then
    log "新主节点 $new_ip 连接正常"
  else
    log "错误: 新主节点 $new_ip 连接失败"
  fi

  local slave_ip slave_port
  while read -r slave_ip slave_port; do
    if node_cli "$slave_ip" "$slave_port" ping > /dev/null 2>&1; then
      log "从节点 $slave_ip:$slave_port 连接正常"
    else
      log "错误: 从节点 $slave_ip:$slave_port 连接失败"
    fi
  done < <(list_replicas)

  check_data_consistency_after_failover
}

# Compare DBSIZE of the new master with DBSIZE of every replica.
check_data_consistency_after_failover() {
  log "检查故障转移后数据一致性..."

  local new_ip new_port master_keys
  read -r new_ip new_port <<< "$(master_addr)"
  if [[ -z "$new_ip" ]]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  master_keys=$(node_cli "$new_ip" "$new_port" dbsize)
  log "新主节点键数量: $master_keys"

  local slave_ip slave_port slave_keys
  while read -r slave_ip slave_port; do
    slave_keys=$(node_cli "$slave_ip" "$slave_port" dbsize)
    log "从节点 $slave_ip:$slave_port 键数量: $slave_keys"

    if [[ "$master_keys" == "$slave_keys" ]]; then
      log "从节点 $slave_ip:$slave_port 数据一致性检查通过"
    else
      log "警告: 从节点 $slave_ip:$slave_port 数据不一致"
    fi
  done < <(list_replicas)
}

# Fail back to ORIGINAL_MASTER_IP.
# "SENTINEL FAILOVER" cannot name its target; Sentinel picks the replica
# itself. With more than one replica the result is therefore checked after
# the failover instead of being assumed.
restore_original_master() {
  log "恢复原主节点..."

  local current_addr current_master
  current_addr=$(master_addr)
  current_master=${current_addr%% *}

  log "当前主节点: $current_master"
  log "原主节点: $ORIGINAL_MASTER_IP"

  if [[ "$current_master" == "$ORIGINAL_MASTER_IP" ]]; then
    log "当前主节点就是原主节点,无需恢复"
    return 0
  fi

  if ! list_replicas | awk -v ip="$ORIGINAL_MASTER_IP" \
    '$1 == ip { found = 1 } END { exit !found }'; then
    log "错误: 原主节点 $ORIGINAL_MASTER_IP 不在从节点列表中"
    return 1
  fi

  sentinel_cli sentinel failover "$MASTER_NAME"
  log "故障转移回原主节点命令已发送"
  wait_for_failover_completion "$current_master" || return 1

  current_addr=$(master_addr)
  if [[ "${current_addr%% *}" != "$ORIGINAL_MASTER_IP" ]]; then
    log "警告: 新主节点不是原主节点,请调整 replica-priority 后重试"
    return 1
  fi
}

main() {
  case "${1:-}" in
    "status")
      check_failover_status
      ;;
    "failover")
      manual_failover
      ;;
    "test")
      test_failover
      ;;
    "restore")
      restore_original_master
      ;;
    *)
      echo "用法: $0 {status|failover|test|restore}"
      echo "  status - 检查故障转移状态"
      echo "  failover - 手动故障转移"
      echo "  test - 测试故障转移"
      echo "  restore - 恢复原主节点"
      ;;
  esac
}

main "$@"
#!/bin/bash
# 5.2 故障恢复脚本 — Redis Sentinel failure detection and recovery.
#
# Usage: <script> {detect|auto|manual|restore}
# MASTER_NAME and REDIS_PASSWORD may be overridden from the environment.

SENTINEL_PORTS=(26379 26380 26381)
MASTER_NAME="${MASTER_NAME:-mymaster}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

#######################################
# Print the IP of the master currently elected for $MASTER_NAME, asking the
# first sentinel; prints nothing when no answer is available.
# SENTINEL GET-MASTER-ADDR-BY-NAME replies with two lines (ip, then port).
#######################################
_sentinel_master_ip() {
  redis-cli -p "${SENTINEL_PORTS[0]}" \
    sentinel get-master-addr-by-name "$MASTER_NAME" 2>/dev/null </dev/null \
    | head -n 1 | tr -d '\r'
}

#######################################
# Print the value of one field from a key/value reply whose keys and values
# sit on alternating lines (the piped redis-cli format of SENTINEL MASTER).
# Arguments: $1 - reply text, $2 - field name
#######################################
_kv_field() {
  printf '%s\n' "$1" \
    | awk -v want="$2" 'NR % 2 == 1 { key = $0; next } key == want { print; exit }'
}

# Print one "ip port" line per replica of $MASTER_NAME known to the sentinel.
_sentinel_replicas() {
  redis-cli -p "${SENTINEL_PORTS[0]}" \
    sentinel slaves "$MASTER_NAME" 2>/dev/null </dev/null \
    | tr -d '\r' \
    | awk 'NR % 2 == 1 { key = $0; next }
           key == "ip" { ip = $0 }
           key == "port" { print ip, $0 }'
}

# Succeed when the node at $1 (port $2, default 6379) answers PING.
_node_alive() {
  [[ -n "$1" ]] \
    && redis-cli -h "$1" -p "${2:-6379}" -a "$REDIS_PASSWORD" ping >/dev/null 2>&1 </dev/null
}

#######################################
# Point the application config and the nginx upstream at the new master,
# then reload nginx and restart the application service if it is active.
# Arguments: $1 - IP of the new master
# Returns:   1 if the address is empty (nothing is modified), else 0
#######################################
update_application_config() {
  local new_master=$1

  # Refuse to blank out the configured host when no address was resolved.
  if [[ -z "$new_master" ]]; then
    log "错误: 新主节点地址为空,跳过应用配置更新"
    return 1
  fi
  log "更新应用配置: $new_master"

  if [[ -f /opt/app/config/redis.conf ]]; then
    sed -i "s/^redis\.host=.*/redis.host=$new_master/" /opt/app/config/redis.conf
    log "应用配置文件已更新"
  fi
  if [[ -f /etc/nginx/conf.d/redis.conf ]]; then
    sed -i "s/server .*:6379/server $new_master:6379/" /etc/nginx/conf.d/redis.conf
    if nginx -s reload; then
      log "负载均衡器配置已更新"
    else
      log "错误: nginx 重新加载失败"
    fi
  fi
  if systemctl is-active --quiet application-service; then
    systemctl restart application-service
    log "应用服务已重启"
  fi
  return 0
}

#######################################
# Ask the sentinel for the current master and PING it.
# Globals: MASTER_INFO, CURRENT_MASTER, MASTER_STATUS (written)
# Returns: 0 when the master answers, 1 when it does not or is unknown
#######################################
detect_failure() {
  log "检测Redis故障..."

  MASTER_INFO=$(redis-cli -p "${SENTINEL_PORTS[0]}" \
    sentinel master "$MASTER_NAME" 2>/dev/null </dev/null | tr -d '\r')
  CURRENT_MASTER=$(_kv_field "$MASTER_INFO" ip)
  MASTER_STATUS=$(_kv_field "$MASTER_INFO" flags)

  if [[ -z "$CURRENT_MASTER" ]]; then
    log "错误: 无法获取主节点信息"
    return 1
  fi

  log "当前主节点: $CURRENT_MASTER"
  log "主节点状态: $MASTER_STATUS"

  if _node_alive "$CURRENT_MASTER"; then
    log "主节点连接正常"
    return 0
  fi
  log "检测到主节点故障: $CURRENT_MASTER"
  return 1
}

#######################################
# If the master is down, wait up to 300 seconds for the sentinels to promote
# a replica; fall back to a manually requested failover on timeout.
# Globals: CURRENT_MASTER (read), NEW_MASTER (written)
#######################################
auto_failure_recovery() {
  local max_wait=300
  local wait_time=0
  local candidate

  log "开始自动故障恢复..."
  if detect_failure; then
    log "未检测到故障,无需恢复"
    return 0
  fi

  log "等待哨兵自动故障转移..."
  while (( wait_time < max_wait )); do
    candidate=$(_sentinel_master_ip)
    # An empty answer means the sentinel did not reply, not that it failed over.
    if [[ -n "$candidate" && "$candidate" != "$CURRENT_MASTER" ]]; then
      NEW_MASTER=$candidate
      log "哨兵自动故障转移成功: $CURRENT_MASTER -> $NEW_MASTER"
      return 0
    fi
    log "等待自动故障转移... ($wait_time/$max_wait)"
    sleep 10
    wait_time=$((wait_time + 10))
  done

  log "警告: 自动故障转移超时,尝试手动故障转移"
  manual_failure_recovery
}

#######################################
# Send SENTINEL FAILOVER and wait for the new master to come up.
# The logged candidate is the first replica listed; SENTINEL FAILOVER takes
# no target, so the sentinel may promote a different replica.
# Globals: AVAILABLE_SLAVES, NEW_MASTER (written)
# Returns: 1 when there is no replica or the command is rejected
#######################################
manual_failure_recovery() {
  local previous_master reply

  log "执行手动故障恢复..."
  previous_master=$(_sentinel_master_ip)

  AVAILABLE_SLAVES=$(_sentinel_replicas)
  if [[ -z "$AVAILABLE_SLAVES" ]]; then
    log "错误: 没有可用的从节点"
    return 1
  fi

  NEW_MASTER=$(head -n 1 <<<"$AVAILABLE_SLAVES" | cut -d' ' -f1)
  log "选择新主节点: $NEW_MASTER"

  # Check the reply text: the exit status of redis-cli alone does not
  # reliably distinguish an error reply from OK.
  reply=$(redis-cli -p "${SENTINEL_PORTS[0]}" sentinel failover "$MASTER_NAME" 2>&1 </dev/null)
  if [[ "$reply" == OK* ]]; then
    log "手动故障转移命令已发送"
    wait_for_recovery_completion "$previous_master"
  else
    log "错误: 手动故障转移失败"
    return 1
  fi
}

#######################################
# Poll until the sentinel reports a reachable master.
# Arguments: $1 - (optional) IP of the master before the failover; when
#                 given, that address is not accepted as "recovered", so a
#                 still-healthy old master does not end the wait early.
# Globals:   NEW_MASTER (written)
# Returns:   0 on recovery, 1 after 300 seconds
#######################################
wait_for_recovery_completion() {
  local old_master=${1:-}
  local max_wait=300
  local wait_time=0
  local candidate

  log "等待故障恢复完成..."
  while (( wait_time < max_wait )); do
    candidate=$(_sentinel_master_ip)
    if [[ -n "$candidate" && "$candidate" != "$old_master" ]] && _node_alive "$candidate"; then
      NEW_MASTER=$candidate
      log "故障恢复成功,新主节点: $NEW_MASTER"
      update_application_config "$NEW_MASTER"
      verify_recovery_result
      return 0
    fi
    log "等待恢复完成... ($wait_time/$max_wait)"
    sleep 10
    wait_time=$((wait_time + 10))
  done

  log "错误: 故障恢复超时"
  return 1
}

#######################################
# PING the master and every replica, then run the consistency check.
# Globals: NEW_MASTER (written)
# Returns: 1 when the master does not answer
#######################################
verify_recovery_result() {
  local replica_ip replica_port

  log "验证故障恢复结果..."
  NEW_MASTER=$(_sentinel_master_ip)

  if _node_alive "$NEW_MASTER"; then
    log "新主节点 $NEW_MASTER 连接正常"
  else
    log "错误: 新主节点 $NEW_MASTER 连接失败"
    return 1
  fi

  while read -r replica_ip replica_port; do
    [[ -n "$replica_ip" ]] || continue
    if _node_alive "$replica_ip" "$replica_port"; then
      log "从节点 $replica_ip:$replica_port 连接正常"
    else
      log "警告: 从节点 $replica_ip:$replica_port 连接失败"
    fi
  done < <(_sentinel_replicas)

  check_data_consistency_after_recovery
}

#######################################
# Compare DBSIZE of the master with DBSIZE of every replica and log a tally.
# Equal key counts are only a coarse indicator; values are not compared.
# Globals: NEW_MASTER, MASTER_KEYS (written)
#######################################
check_data_consistency_after_recovery() {
  local replica_ip replica_port replica_keys
  local consistent_slaves=0
  local total_slaves=0

  log "检查恢复后数据一致性..."
  NEW_MASTER=$(_sentinel_master_ip)
  MASTER_KEYS=$(redis-cli -h "$NEW_MASTER" -p 6379 -a "$REDIS_PASSWORD" dbsize 2>/dev/null </dev/null)
  log "新主节点键数量: $MASTER_KEYS"

  while read -r replica_ip replica_port; do
    [[ -n "$replica_ip" ]] || continue
    replica_keys=$(redis-cli -h "$replica_ip" -p "$replica_port" -a "$REDIS_PASSWORD" dbsize 2>/dev/null </dev/null)
    log "从节点 $replica_ip:$replica_port 键数量: $replica_keys"
    total_slaves=$((total_slaves + 1))
    # Two empty answers (both nodes unreachable) must not count as consistent.
    if [[ -n "$MASTER_KEYS" && "$MASTER_KEYS" == "$replica_keys" ]]; then
      log "从节点 $replica_ip:$replica_port 数据一致性检查通过"
      consistent_slaves=$((consistent_slaves + 1))
    else
      log "警告: 从节点 $replica_ip:$replica_port 数据不一致"
    fi
  done < <(_sentinel_replicas)

  log "数据一致性检查完成: $consistent_slaves/$total_slaves 个从节点数据一致"
}

#######################################
# Request another failover so that a replica takes over again.
# NOTE(review): the "original master" is taken to be the first replica the
# sentinel lists, and SENTINEL FAILOVER takes no target argument, so the
# node that ends up promoted may differ — confirm this matches the intent.
# Globals: CURRENT_MASTER, ORIGINAL_MASTER (written)
#######################################
restore_original_master() {
  local first_replica original_port

  log "恢复原主节点..."
  CURRENT_MASTER=$(_sentinel_master_ip)
  first_replica=$(_sentinel_replicas | head -n 1)
  ORIGINAL_MASTER=${first_replica%% *}
  original_port=${first_replica##* }

  log "当前主节点: $CURRENT_MASTER"
  log "原主节点: $ORIGINAL_MASTER"

  if [[ -z "$ORIGINAL_MASTER" ]]; then
    log "错误: 未找到可切换的从节点"
    return 1
  fi

  if [[ "$CURRENT_MASTER" == "$ORIGINAL_MASTER" ]]; then
    log "当前主节点就是原主节点,无需恢复"
    return 0
  fi

  if _node_alive "$ORIGINAL_MASTER" "$original_port"; then
    log "原主节点 $ORIGINAL_MASTER 可用,开始恢复"
    redis-cli -p "${SENTINEL_PORTS[0]}" sentinel failover "$MASTER_NAME" </dev/null
    log "故障转移回原主节点命令已发送"
    wait_for_recovery_completion "$CURRENT_MASTER"
  else
    log "原主节点 $ORIGINAL_MASTER 不可用,无法恢复"
  fi
}

# Dispatch on the first command-line argument; print usage otherwise.
main() {
  case "${1:-}" in
    detect) detect_failure ;;
    auto) auto_failure_recovery ;;
    manual) manual_failure_recovery ;;
    restore) restore_original_master ;;
    *)
      echo "用法: $0 {detect|auto|manual|restore}"
      echo " detect - 检测故障"
      echo " auto - 自动故障恢复"
      echo " manual - 手动故障恢复"
      echo " restore - 恢复原主节点"
      ;;
  esac
}

main "$@"
#!/bin/bash
# 6. 监控告警 / 6.1 主备监控脚本 — Redis master, replica and sentinel monitoring.
#
# Usage: <script> {monitor|report|alerts}
# Scalar settings below may be overridden from the environment.

MASTER_IP="${MASTER_IP:-192.168.1.10}"
SLAVE_IP="${SLAVE_IP:-192.168.1.11}"
REDIS_PORT="${REDIS_PORT:-6379}"
REDIS_PASSWORD="${REDIS_PASSWORD:-redis123}"
SENTINEL_PORTS=(26379 26380 26381)
MONITOR_INTERVAL="${MONITOR_INTERVAL:-30}"
LOG_FILE="${LOG_FILE:-/var/log/redis-master-slave-monitor.log}"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

#######################################
# Append one metric sample to $LOG_FILE.
# Arguments: $1 - metric name, $2 - value, $3 - node label
#######################################
log_monitor_data() {
  local metric=$1
  local value=$2
  local node=$3
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] Node:%s Metric:%s Value:%s\n' "$timestamp" "$node" "$metric" "$value" >>"$LOG_FILE"
}

# Run a redis-cli command against host $1 on $REDIS_PORT; stderr is dropped.
_redis_node() {
  local host=$1
  shift
  redis-cli -h "$host" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" "$@" 2>/dev/null </dev/null
}

# Print the value of field $2 from INFO text $1 (lines of "field:value\r").
_info_field() {
  printf '%s\n' "$1" | tr -d '\r' \
    | awk -F: -v want="$2" '$1 == want { print $2; exit }'
}

# Succeed when $1 is a non-empty string of decimal digits.
_is_uint() {
  [[ "$1" =~ ^[0-9]+$ ]]
}

#######################################
# Sample the master node and log its metrics.
# Globals: MASTER_REPL_OFFSET (written; empty when the master is down)
#######################################
_monitor_master() {
  local info repl clients memory keys hits misses hit_rate slaves

  # Reset first so the replica lag is never computed from a stale offset.
  MASTER_REPL_OFFSET=""

  if ! _redis_node "$MASTER_IP" ping >/dev/null; then
    log_monitor_data "status" "down" "master"
    return
  fi
  log_monitor_data "status" "up" "master"

  info=$(_redis_node "$MASTER_IP" info)
  clients=$(_info_field "$info" connected_clients)
  memory=$(_info_field "$info" used_memory_human)
  keys=$(printf '%s\n' "$info" | sed -n 's/^db0:keys=\([0-9]*\).*/\1/p')
  hits=$(_info_field "$info" keyspace_hits)
  misses=$(_info_field "$info" keyspace_misses)

  log_monitor_data "connected_clients" "$clients" "master"
  log_monitor_data "used_memory" "$memory" "master"
  log_monitor_data "keys_count" "$keys" "master"
  log_monitor_data "hits" "$hits" "master"
  log_monitor_data "misses" "$misses" "master"

  # Only the total must be non-zero: zero misses is a valid 100% hit rate.
  if _is_uint "$hits" && _is_uint "$misses" && (( hits + misses > 0 )); then
    hit_rate=$(awk -v h="$hits" -v m="$misses" 'BEGIN { printf "%.2f", h * 100 / (h + m) }')
    log_monitor_data "hit_rate" "$hit_rate" "master"
  fi

  repl=$(_redis_node "$MASTER_IP" info replication)
  slaves=$(_info_field "$repl" connected_slaves)
  MASTER_REPL_OFFSET=$(_info_field "$repl" master_repl_offset)
  log_monitor_data "connected_slaves" "$slaves" "master"
  log_monitor_data "master_repl_offset" "$MASTER_REPL_OFFSET" "master"
}

#######################################
# Sample the replica node and log its metrics, including the offset lag
# relative to the master offset taken in the same round.
# Globals: MASTER_REPL_OFFSET (read)
#######################################
_monitor_slave() {
  local info repl clients memory keys link syncing offset last_io

  if ! _redis_node "$SLAVE_IP" ping >/dev/null; then
    log_monitor_data "status" "down" "slave"
    return
  fi
  log_monitor_data "status" "up" "slave"

  info=$(_redis_node "$SLAVE_IP" info)
  clients=$(_info_field "$info" connected_clients)
  memory=$(_info_field "$info" used_memory_human)
  keys=$(printf '%s\n' "$info" | sed -n 's/^db0:keys=\([0-9]*\).*/\1/p')
  log_monitor_data "connected_clients" "$clients" "slave"
  log_monitor_data "used_memory" "$memory" "slave"
  log_monitor_data "keys_count" "$keys" "slave"

  repl=$(_redis_node "$SLAVE_IP" info replication)
  link=$(_info_field "$repl" master_link_status)
  syncing=$(_info_field "$repl" master_sync_in_progress)
  offset=$(_info_field "$repl" slave_repl_offset)
  last_io=$(_info_field "$repl" master_last_io_seconds_ago)
  log_monitor_data "master_link_status" "$link" "slave"
  log_monitor_data "master_sync_in_progress" "$syncing" "slave"
  log_monitor_data "slave_repl_offset" "$offset" "slave"
  log_monitor_data "master_last_io_seconds_ago" "$last_io" "slave"

  if _is_uint "${MASTER_REPL_OFFSET:-}" && _is_uint "$offset"; then
    log_monitor_data "sync_lag" "$((MASTER_REPL_OFFSET - offset))" "slave"
  fi
}

#######################################
# Sample every local sentinel. Replica and sentinel counts are read from
# the "master0:" line of INFO sentinel (the first monitored master);
# the section has no standalone "slaves:" or "sentinels:" fields.
#######################################
_monitor_sentinels() {
  local port info master_line masters slaves sentinels

  for port in "${SENTINEL_PORTS[@]}"; do
    if ! redis-cli -p "$port" ping >/dev/null 2>&1 </dev/null; then
      log_monitor_data "status" "down" "sentinel-$port"
      continue
    fi
    log_monitor_data "status" "up" "sentinel-$port"

    info=$(redis-cli -p "$port" info sentinel 2>/dev/null </dev/null | tr -d '\r')
    masters=$(_info_field "$info" sentinel_masters)
    master_line=$(printf '%s\n' "$info" | grep -m 1 '^master0:')
    slaves=$(printf '%s\n' "$master_line" | sed -n 's/.*[,:]slaves=\([0-9]*\).*/\1/p')
    sentinels=$(printf '%s\n' "$master_line" | sed -n 's/.*[,:]sentinels=\([0-9]*\).*/\1/p')

    log_monitor_data "masters" "$masters" "sentinel-$port"
    log_monitor_data "slaves" "$slaves" "sentinel-$port"
    log_monitor_data "sentinels" "$sentinels" "sentinel-$port"
  done
}

# Take one sample of master, replica and sentinels.
_monitor_once() {
  _monitor_master
  _monitor_slave
  _monitor_sentinels
}

# Sample all nodes every $MONITOR_INTERVAL seconds until interrupted.
monitor_master_slave_status() {
  log "开始监控Redis主备状态..."
  while true; do
    _monitor_once
    sleep "$MONITOR_INTERVAL"
  done
}

#######################################
# Write a dated report with the full INFO output of the master, the replica
# and every sentinel, followed by the offset lag and the key counts.
# Output file: /var/log/redis-master-slave-report-YYYYMMDD.txt
#######################################
generate_master_slave_report() {
  local report_file port master_offset slave_offset master_keys slave_keys
  report_file="/var/log/redis-master-slave-report-$(date +%Y%m%d).txt"

  log "生成Redis主备报告: $report_file"

  {
    cat <<EOF
Redis主备监控报告
生成时间: $(date)
========================================

EOF
    echo "主节点 ($MASTER_IP:$REDIS_PORT) 信息:"
    echo "----------------------------------------"
    _redis_node "$MASTER_IP" info
    echo ""

    echo "从节点 ($SLAVE_IP:$REDIS_PORT) 信息:"
    echo "----------------------------------------"
    _redis_node "$SLAVE_IP" info
    echo ""

    for port in "${SENTINEL_PORTS[@]}"; do
      echo "哨兵节点 $port 信息:"
      echo "----------------------------------------"
      redis-cli -p "$port" info sentinel 2>/dev/null </dev/null
      echo ""
    done

    echo "主备统计:"
    echo "----------------------------------------"
    master_offset=$(_info_field "$(_redis_node "$MASTER_IP" info replication)" master_repl_offset)
    slave_offset=$(_info_field "$(_redis_node "$SLAVE_IP" info replication)" slave_repl_offset)
    if _is_uint "$master_offset" && _is_uint "$slave_offset"; then
      echo "同步延迟: $((master_offset - slave_offset)) 字节"
    fi

    master_keys=$(_redis_node "$MASTER_IP" dbsize)
    slave_keys=$(_redis_node "$SLAVE_IP" dbsize)
    echo "主节点键数量: $master_keys"
    echo "从节点键数量: $slave_keys"
  } >"$report_file"

  log "主备报告生成完成: $report_file"
}

#######################################
# Install /opt/redis-master-slave-alert.sh and start it in the background.
# The generated script checks the nodes every 300 seconds and prints alert
# lines, which end up in /var/log/redis-master-slave-alert.log.
# Each call starts another background loop; nothing stops a previous one.
#######################################
setup_master_slave_alerts() {
  log "设置Redis主备告警..."

  # Quoted delimiter: the text below is written literally, nothing expands
  # here. The shebang is required because the script uses bash arrays.
  cat >/opt/redis-master-slave-alert.sh <<'EOF'
#!/bin/bash
MASTER_IP="192.168.1.10"
SLAVE_IP="192.168.1.11"
REDIS_PORT=6379
REDIS_PASSWORD="redis123"
SENTINEL_PORTS=(26379 26380 26381)

# Print the value of field $2 from INFO text $1.
info_field() {
  printf '%s\n' "$1" | tr -d '\r' | awk -F: -v want="$2" '$1 == want { print $2; exit }'
}

check_master_slave_alerts() {
  local link master_offset slave_offset sync_lag port mem_info used max pct

  if ! redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" ping >/dev/null 2>&1; then
    echo "告警: 主节点连接失败 ($MASTER_IP:$REDIS_PORT)"
  fi

  if ! redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" ping >/dev/null 2>&1; then
    echo "告警: 从节点连接失败 ($SLAVE_IP:$REDIS_PORT)"
  fi

  link=$(info_field "$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2>/dev/null)" master_link_status)
  if [[ "$link" != "up" ]]; then
    echo "告警: 主从连接异常 ($SLAVE_IP:$REDIS_PORT)"
  fi

  master_offset=$(info_field "$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2>/dev/null)" master_repl_offset)
  slave_offset=$(info_field "$(redis-cli -h "$SLAVE_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info replication 2>/dev/null)" slave_repl_offset)
  if [[ "$master_offset" =~ ^[0-9]+$ && "$slave_offset" =~ ^[0-9]+$ ]]; then
    sync_lag=$((master_offset - slave_offset))
    if (( sync_lag > 10000 )); then
      echo "告警: 同步延迟过高 ($sync_lag 字节)"
    fi
  fi

  for port in "${SENTINEL_PORTS[@]}"; do
    if ! redis-cli -p "$port" ping >/dev/null 2>&1; then
      echo "告警: 哨兵节点 $port 连接失败"
    fi
  done

  # INFO memory has no percentage field; derive it from used_memory and
  # maxmemory, and skip the check when maxmemory is 0 (no limit set).
  mem_info=$(redis-cli -h "$MASTER_IP" -p "$REDIS_PORT" -a "$REDIS_PASSWORD" info memory 2>/dev/null)
  used=$(info_field "$mem_info" used_memory)
  max=$(info_field "$mem_info" maxmemory)
  if [[ "$used" =~ ^[0-9]+$ && "$max" =~ ^[0-9]+$ ]] && (( max > 0 )); then
    pct=$((used * 100 / max))
    if (( pct > 80 )); then
      echo "告警: 主节点内存使用率过高 ($pct%)"
    fi
  fi
}

while true; do
  check_master_slave_alerts
  sleep 300
done
EOF

  chmod +x /opt/redis-master-slave-alert.sh
  nohup /opt/redis-master-slave-alert.sh >/var/log/redis-master-slave-alert.log 2>&1 &

  log "主备告警设置完成"
}

# Dispatch on the first command-line argument; print usage otherwise.
main() {
  case "${1:-}" in
    monitor) monitor_master_slave_status ;;
    report) generate_master_slave_report ;;
    alerts) setup_master_slave_alerts ;;
    *)
      echo "用法: $0 {monitor|report|alerts}"
      echo " monitor - 开始主备监控"
      echo " report - 生成主备报告"
      echo " alerts - 设置主备告警"
      ;;
  esac
}

main "$@"
7. 总结 7.1 主备架构最佳实践
主从配置 : 合理配置主从复制参数,确保数据同步的可靠性
哨兵部署 : 部署多个哨兵节点,提高故障检测的准确性
故障转移 : 建立完善的故障转移机制,确保服务的高可用性
监控告警 : 建立全面的监控体系,及时发现和处理问题
数据一致性 : 定期检查主备数据一致性,确保数据完整性
7.2 关键指标监控
连接状态 : 监控主从节点的连接状态
同步延迟 : 监控主从数据同步延迟
内存使用 : 监控内存使用情况
命中率 : 监控缓存命中率
哨兵状态 : 监控哨兵节点的健康状态
7.3 运维工具推荐
监控工具 : Prometheus + Grafana + Redis Exporter
告警工具 : Alertmanager + Webhook
管理工具 : Redis Commander, RedisInsight
备份工具 : 自定义备份脚本
诊断工具 : redis-cli, redis-trib.rb(自 Redis 5.0 起,其集群管理功能已并入 redis-cli --cluster)
通过本文的Redis主备运维实战指南,您可以建立完善的Redis主备架构运维体系,确保系统的高可用性和数据一致性。记住,主备架构的运维需要持续关注主从同步状态和故障转移机制,确保在任何情况下都能提供稳定的服务。