1. Redis运维概述

Redis 是一款高性能的内存数据库,广泛应用于分布式缓存、会话存储、消息队列等场景。本文将结合实战经验,详细介绍Redis分布式缓存服务的运维方案,涵盖集群部署、性能优化、数据持久化、监控告警以及故障处理的完整流程。

1.1 核心功能

  1. 分布式缓存: 提供高性能的分布式缓存服务
  2. 数据持久化: RDB和AOF两种持久化机制 (最小配置示例见本列表之后)
  3. 集群管理: Redis Cluster集群部署和管理
  4. 性能优化: 内存优化和性能调优
  5. 监控告警: 实时监控和故障告警
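
上面第2点提到的两种持久化机制,可以用如下最小化配置片段来说明 (取值与后文集群节点配置一致,实际应结合数据量与恢复目标调整):

# RDB: 按时间窗口内的写入次数触发快照
save 900 1
save 300 10
save 60 10000

# AOF: 追加写日志,每秒刷盘一次,在性能与数据安全之间折中
appendonly yes
appendfsync everysec

RDB适合定期全量备份和快速重启恢复,AOF则能把数据丢失窗口控制在秒级,生产环境通常两者同时开启。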

1.2 技术架构

客户端应用 → Redis集群 → 内存存储
↓ ↓ ↓
缓存服务 → 数据持久化 → 磁盘存储
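
下面的命令演示客户端与集群交互的基本方式 (假设集群已按第3节部署在本机7000~7005端口):使用 -c 开启集群模式后,redis-cli 会根据键所在槽位自动重定向到对应节点。

redis-cli -c -p 7000 set user:1001 "hello"
# 可能输出: -> Redirected to slot [...] located at 127.0.0.1:7002 (实际取决于槽位分布)
redis-cli -c -p 7000 get user:1001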

2. 环境准备

2.1 系统要求检查

#!/bin/bash
# check_redis_env.sh - Redis环境检查脚本
# @author 运维实战

# 日志函数
log() {
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 检查系统环境
check_system() {
log "开始检查系统环境..."

# 检查操作系统
if [[ "$OSTYPE" == "linux-gnu"* ]]; then
log "操作系统: Linux"
OS_VERSION=$(cat /etc/os-release | grep PRETTY_NAME | cut -d'"' -f2)
log "系统版本: $OS_VERSION"
else
log "错误: 不支持的操作系统 $OSTYPE"
exit 1
fi

# 检查内存
TOTAL_MEM=$(free -h | grep "Mem:" | awk '{print $2}')
AVAILABLE_MEM=$(free -h | grep "Mem:" | awk '{print $7}')
log "总内存: $TOTAL_MEM, 可用内存: $AVAILABLE_MEM"

# 检查磁盘空间
DISK_USAGE=$(df -h / | tail -1 | awk '{print $5}' | sed 's/%//')
if [ $DISK_USAGE -gt 80 ]; then
log "警告: 磁盘使用率过高 ($DISK_USAGE%)"
else
log "磁盘使用率: $DISK_USAGE%"
fi

# 检查网络连接
if ping -c 1 8.8.8.8 > /dev/null 2>&1; then
log "网络连接正常"
else
log "警告: 网络连接异常"
fi
}

# 检查Redis安装
check_redis_installation() {
log "检查Redis安装状态..."

if command -v redis-server > /dev/null 2>&1; then
REDIS_VERSION=$(redis-server --version | head -1)
log "Redis已安装: $REDIS_VERSION"

# 检查Redis服务状态
if systemctl is-active --quiet redis 2>/dev/null || systemctl is-active --quiet redis-server 2>/dev/null; then
log "Redis服务运行正常"
else
log "Redis服务未运行"
fi
else
log "Redis未安装,开始安装..."
install_redis
fi
}

# 安装Redis
install_redis() {
log "开始安装Redis..."

# 更新包管理器
if command -v apt-get > /dev/null 2>&1; then
sudo apt-get update
sudo apt-get install -y redis-server
elif command -v yum > /dev/null 2>&1; then
sudo yum update -y
sudo yum install -y redis
else
log "错误: 不支持的包管理器"
exit 1
fi

# 启动Redis服务 (服务名在不同发行版可能为 redis 或 redis-server)
sudo systemctl enable --now redis 2>/dev/null || sudo systemctl enable --now redis-server

log "Redis安装完成"
}

# 主函数
main() {
log "=== Redis环境检查开始 ==="
check_system
check_redis_installation
log "=== Redis环境检查完成 ==="
}

# 执行主函数
main "$@"

2.2 Redis配置优化

#!/bin/bash
# redis_config_optimize.sh - Redis配置优化脚本
# @author 运维实战

# Redis配置文件路径
REDIS_CONF="/etc/redis/redis.conf"

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 备份原配置
backup_config() {
log "备份Redis配置文件..."
sudo cp $REDIS_CONF ${REDIS_CONF}.backup.$(date +%Y%m%d_%H%M%S)
log "配置文件已备份"
}

# 优化Redis配置
optimize_redis_config() {
log "开始优化Redis配置..."

# 内存优化
sudo sed -i 's/^# maxmemory <bytes>/maxmemory 2gb/' $REDIS_CONF
sudo sed -i 's/^# maxmemory-policy noeviction/maxmemory-policy allkeys-lru/' $REDIS_CONF

# 持久化优化
sudo sed -i 's/^save 900 1/save 900 1/' $REDIS_CONF
sudo sed -i 's/^save 300 10/save 300 10/' $REDIS_CONF
sudo sed -i 's/^save 60 10000/save 60 10000/' $REDIS_CONF

# AOF配置
sudo sed -i 's/^appendonly no/appendonly yes/' $REDIS_CONF
sudo sed -i 's/^# appendfsync everysec/appendfsync everysec/' $REDIS_CONF

# 网络优化
sudo sed -i 's/^# tcp-keepalive 0/tcp-keepalive 300/' $REDIS_CONF
sudo sed -i 's/^timeout 0/timeout 300/' $REDIS_CONF

# 日志配置
sudo sed -i 's/^loglevel notice/loglevel warning/' $REDIS_CONF
sudo sed -i 's/^# logfile \/var\/log\/redis\/redis-server.log/logfile \/var\/log\/redis\/redis-server.log/' $REDIS_CONF

log "Redis配置优化完成"
}

# 重启Redis服务
restart_redis() {
log "重启Redis服务..."
sudo systemctl restart redis

# 检查服务状态
sleep 3
if systemctl is-active --quiet redis; then
log "Redis服务重启成功"
else
log "错误: Redis服务重启失败"
exit 1
fi
}

# 主函数
main() {
log "=== Redis配置优化开始 ==="
backup_config
optimize_redis_config
restart_redis
log "=== Redis配置优化完成 ==="
}

# 执行主函数
main "$@"

3. Redis集群部署

3.1 Redis Cluster部署脚本

#!/bin/bash
# redis_cluster_deploy.sh - Redis Cluster集群部署脚本
# @author 运维实战

# 集群配置
CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)
CLUSTER_DIR="/opt/redis-cluster"

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 创建集群目录
create_cluster_dirs() {
log "创建Redis集群目录..."

for port in "${CLUSTER_PORTS[@]}"; do
NODE_DIR="$CLUSTER_DIR/$port"
mkdir -p $NODE_DIR

# 创建Redis配置文件
cat > $NODE_DIR/redis.conf << EOF
# Redis Cluster节点配置
port $port
cluster-enabled yes
cluster-config-file nodes-$port.conf
cluster-node-timeout 5000
appendonly yes
appendfsync everysec
save 900 1
save 300 10
save 60 10000
maxmemory 1gb
maxmemory-policy allkeys-lru
dir $NODE_DIR
logfile $NODE_DIR/redis.log
daemonize yes
pidfile $NODE_DIR/redis.pid
EOF

log "节点 $port 配置完成"
done
}

# 启动Redis节点
start_redis_nodes() {
log "启动Redis集群节点..."

for port in "${CLUSTER_PORTS[@]}"; do
NODE_DIR="$CLUSTER_DIR/$port"
redis-server $NODE_DIR/redis.conf

if [ $? -eq 0 ]; then
log "节点 $port 启动成功"
else
log "错误: 节点 $port 启动失败"
exit 1
fi
done

# 等待节点启动
sleep 5
}

# 创建集群
create_cluster() {
log "创建Redis集群..."

# 构建集群节点列表
CLUSTER_NODES_LIST=""
for port in "${CLUSTER_PORTS[@]}"; do
CLUSTER_NODES_LIST="$CLUSTER_NODES_LIST 127.0.0.1:$port"
done

# 创建集群
redis-cli --cluster create $CLUSTER_NODES_LIST --cluster-replicas 1 --cluster-yes

if [ $? -eq 0 ]; then
log "Redis集群创建成功"
else
log "错误: Redis集群创建失败"
exit 1
fi
}

# 验证集群状态
verify_cluster() {
log "验证Redis集群状态..."

# 检查集群节点
redis-cli -p 7000 cluster nodes

# 检查集群信息
redis-cli -p 7000 cluster info

# 测试集群功能
redis-cli -p 7000 set test_key "cluster_test"
redis-cli -p 7000 get test_key

log "集群验证完成"
}

# 主函数
main() {
log "=== Redis集群部署开始 ==="
create_cluster_dirs
start_redis_nodes
create_cluster
verify_cluster
log "=== Redis集群部署完成 ==="
}

# 执行主函数
main "$@"

3.2 集群管理脚本

#!/bin/bash
# redis_cluster_manager.sh - Redis集群管理脚本
# @author 运维实战

# 集群节点配置
CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 检查集群状态
check_cluster_status() {
log "检查Redis集群状态..."

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
log "节点 $port: 运行正常"

# 获取节点信息
NODE_INFO=$(redis-cli -p $port cluster nodes | grep ":$port")
NODE_ID=$(echo $NODE_INFO | awk '{print $1}')
NODE_ROLE=$(echo $NODE_INFO | awk '{print $3}')

log "节点 $port ID: $NODE_ID, 角色: $NODE_ROLE"
else
log "节点 $port: 连接失败"
fi
done

# 集群整体状态
redis-cli -p 7000 cluster info | grep cluster_state
}

# 添加新节点
add_node() {
local new_port=$1
local existing_port=$2

log "添加新节点 $new_port 到集群..."

# 启动新节点
NEW_NODE_DIR="/opt/redis-cluster/$new_port"
mkdir -p $NEW_NODE_DIR

cat > $NEW_NODE_DIR/redis.conf << EOF
port $new_port
cluster-enabled yes
cluster-config-file nodes-$new_port.conf
cluster-node-timeout 5000
appendonly yes
dir $NEW_NODE_DIR
daemonize yes
pidfile $NEW_NODE_DIR/redis.pid
EOF

redis-server $NEW_NODE_DIR/redis.conf

# 添加节点到集群
redis-cli --cluster add-node 127.0.0.1:$new_port 127.0.0.1:$existing_port

log "新节点 $new_port 添加完成"
}

# 移除节点
remove_node() {
local node_id=$1
local port=$2

log "移除节点 $node_id..."

# 从集群中移除节点
redis-cli --cluster del-node 127.0.0.1:$port $node_id

# 停止节点服务
redis-cli -p $port shutdown

log "节点 $node_id 移除完成"
}

# 重新分片
reshard_cluster() {
log "开始重新分片..."

# 获取集群信息
redis-cli -p 7000 cluster nodes

# 执行重新分片
redis-cli --cluster reshard 127.0.0.1:7000

log "重新分片完成"
}

# 备份集群数据
backup_cluster() {
local backup_dir="/opt/redis-backup/$(date +%Y%m%d_%H%M%S)"
mkdir -p $backup_dir

log "备份集群数据到 $backup_dir..."

for port in "${CLUSTER_PORTS[@]}"; do
NODE_DIR="/opt/redis-cluster/$port"

# 创建RDB快照
redis-cli -p $port bgsave

# 等待后台快照完成 (rdb_bgsave_in_progress 变为0)
while redis-cli -p $port info persistence | grep -q "rdb_bgsave_in_progress:1"; do
sleep 1
done

# 复制数据文件
cp $NODE_DIR/dump.rdb $backup_dir/dump_$port.rdb
cp $NODE_DIR/appendonly.aof $backup_dir/appendonly_$port.aof 2>/dev/null || true

log "节点 $port 数据备份完成"
done

log "集群数据备份完成: $backup_dir"
}

# 主函数
main() {
case $1 in
"status")
check_cluster_status
;;
"add")
add_node $2 $3
;;
"remove")
remove_node $2 $3
;;
"reshard")
reshard_cluster
;;
"backup")
backup_cluster
;;
*)
echo "用法: $0 {status|add|remove|reshard|backup}"
echo " status - 检查集群状态"
echo " add <new_port> <existing_port> - 添加新节点"
echo " remove <node_id> <port> - 移除节点"
echo " reshard - 重新分片"
echo " backup - 备份集群数据"
;;
esac
}

# 执行主函数
main "$@"

4. 性能优化

4.1 内存优化脚本

#!/bin/bash
# redis_memory_optimize.sh - Redis内存优化脚本
# @author 运维实战

# 集群节点端口与日志函数 (与前文集群部署脚本保持一致)
CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 检查内存使用情况
check_memory_usage() {
log "检查Redis内存使用情况..."

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
MEMORY_INFO=$(redis-cli -p $port info memory)

USED_MEMORY=$(echo "$MEMORY_INFO" | grep "used_memory:" | cut -d: -f2 | tr -d '\r')
USED_MEMORY_HUMAN=$(echo "$MEMORY_INFO" | grep "used_memory_human:" | cut -d: -f2 | tr -d '\r')
MAX_MEMORY=$(echo "$MEMORY_INFO" | grep "maxmemory:" | cut -d: -f2 | tr -d '\r')

log "节点 $port 内存使用: $USED_MEMORY_HUMAN (最大: $MAX_MEMORY)"

# 计算内存使用率
if [ "$MAX_MEMORY" != "0" ]; then
USAGE_RATE=$(echo "scale=2; $USED_MEMORY * 100 / $MAX_MEMORY" | bc)
log "节点 $port 内存使用率: ${USAGE_RATE}%"

if (( $(echo "$USAGE_RATE > 80" | bc -l) )); then
log "警告: 节点 $port 内存使用率过高"
fi
fi
fi
done
}

# 清理过期键
cleanup_expired_keys() {
log "清理过期键..."

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
# 执行内存回收
redis-cli -p $port memory purge

# 清理过期键
redis-cli -p $port --scan --pattern "*" | while read key; do
TTL=$(redis-cli -p $port ttl "$key")
if [ "$TTL" = "-2" ]; then
redis-cli -p $port del "$key"
fi
done

log "节点 $port 过期键清理完成"
fi
done
}

# 优化内存配置
optimize_memory_config() {
log "优化内存配置..."

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
# 设置内存淘汰策略
redis-cli -p $port config set maxmemory-policy allkeys-lru

# 设置内存限制
redis-cli -p $port config set maxmemory 1gb

# 调整小对象的紧凑编码阈值 (ziplist/listpack),降低内存占用
redis-cli -p $port config set hash-max-ziplist-entries 512
redis-cli -p $port config set hash-max-ziplist-value 64
redis-cli -p $port config set list-max-ziplist-size -2
redis-cli -p $port config set list-compress-depth 0

log "节点 $port 内存配置优化完成"
fi
done
}

# 分析大键
analyze_big_keys() {
log "分析大键..."

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
log "节点 $port 大键分析:"

# 使用redis-cli分析大键
redis-cli -p $port --bigkeys | head -20

# 按单键内存占用抽样分析 (redis-cli --memkeys 需要 Redis 6.0+)
redis-cli -p $port --memkeys 2>/dev/null | tail -20
fi
done
}

# 主函数
main() {
case $1 in
"check")
check_memory_usage
;;
"cleanup")
cleanup_expired_keys
;;
"optimize")
optimize_memory_config
;;
"analyze")
analyze_big_keys
;;
*)
echo "用法: $0 {check|cleanup|optimize|analyze}"
echo " check - 检查内存使用情况"
echo " cleanup - 清理过期键"
echo " optimize - 优化内存配置"
echo " analyze - 分析大键"
;;
esac
}

# 执行主函数
main "$@"

4.2 性能监控脚本

#!/bin/bash
# redis_performance_monitor.sh - Redis性能监控脚本
# @author 运维实战

# 监控配置
MONITOR_INTERVAL=60
LOG_FILE="/var/log/redis-monitor.log"
CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 记录监控数据
log_monitor_data() {
local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
local port=$1
local metric=$2
local value=$3

echo "[$timestamp] Node:$port Metric:$metric Value:$value" >> $LOG_FILE
}

# 监控Redis性能指标
monitor_redis_performance() {
log "开始监控Redis性能指标..."

while true; do
for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
# 获取性能信息
INFO=$(redis-cli -p $port info)

# 监控关键指标
CONNECTED_CLIENTS=$(echo "$INFO" | grep "connected_clients:" | cut -d: -f2 | tr -d '\r')
USED_MEMORY=$(echo "$INFO" | grep "used_memory_human:" | cut -d: -f2 | tr -d '\r')
KEYS_COUNT=$(echo "$INFO" | grep "db0:keys=" | cut -d= -f2 | cut -d, -f1)
HITS=$(echo "$INFO" | grep "keyspace_hits:" | cut -d: -f2 | tr -d '\r')
MISSES=$(echo "$INFO" | grep "keyspace_misses:" | cut -d: -f2 | tr -d '\r')

# 记录监控数据
log_monitor_data $port "connected_clients" $CONNECTED_CLIENTS
log_monitor_data $port "used_memory" $USED_MEMORY
log_monitor_data $port "keys_count" $KEYS_COUNT
log_monitor_data $port "hits" $HITS
log_monitor_data $port "misses" $MISSES

# 计算命中率 (有访问记录时)
if [ $(( ${HITS:-0} + ${MISSES:-0} )) -gt 0 ]; then
HIT_RATE=$(echo "scale=2; ${HITS:-0} * 100 / (${HITS:-0} + ${MISSES:-0})" | bc)
log_monitor_data $port "hit_rate" $HIT_RATE
fi
fi
done

sleep $MONITOR_INTERVAL
done
}

# 生成性能报告
generate_performance_report() {
local report_file="/var/log/redis-performance-report-$(date +%Y%m%d).txt"

log "生成性能报告: $report_file"

cat > $report_file << EOF
Redis集群性能报告
生成时间: $(date)
========================================

EOF

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
echo "节点 $port 性能统计:" >> $report_file
echo "----------------------------------------" >> $report_file

# 获取统计信息
redis-cli -p $port info stats >> $report_file
echo "" >> $report_file

# 获取内存信息
redis-cli -p $port info memory >> $report_file
echo "" >> $report_file
fi
done

log "性能报告生成完成: $report_file"
}

# 设置性能告警
setup_performance_alerts() {
log "设置性能告警..."

# 创建告警脚本
cat > /opt/redis-alert.sh << 'EOF'
#!/bin/bash
# redis性能告警脚本

# 告警阈值 (可按需调整)
MAX_CLIENTS=1000      # 连接数上限
MAX_MEMORY_PCT=80     # 内存使用率上限 (%)
MIN_HIT_RATE=90       # 缓存命中率下限 (%)

check_alerts() {
    for port in 7000 7001 7002 7003 7004 7005; do
        redis-cli -p $port ping > /dev/null 2>&1 || continue
        INFO=$(redis-cli -p $port info)

        # 连接数
        CLIENTS=$(echo "$INFO" | grep "^connected_clients:" | cut -d: -f2 | tr -d '\r')
        if [ "${CLIENTS:-0}" -gt "$MAX_CLIENTS" ]; then
            echo "告警: 节点 $port 连接数超过阈值 ($CLIENTS > $MAX_CLIENTS)"
        fi

        # 内存使用率 (仅在设置了maxmemory时计算)
        USED=$(echo "$INFO" | grep "^used_memory:" | cut -d: -f2 | tr -d '\r')
        MAXMEM=$(echo "$INFO" | grep "^maxmemory:" | cut -d: -f2 | tr -d '\r')
        if [ "${MAXMEM:-0}" -gt 0 ]; then
            MEM_PCT=$(( USED * 100 / MAXMEM ))
            if [ "$MEM_PCT" -gt "$MAX_MEMORY_PCT" ]; then
                echo "告警: 节点 $port 内存使用率超过阈值 (${MEM_PCT}% > ${MAX_MEMORY_PCT}%)"
            fi
        fi

        # 缓存命中率
        HITS=$(echo "$INFO" | grep "^keyspace_hits:" | cut -d: -f2 | tr -d '\r')
        MISSES=$(echo "$INFO" | grep "^keyspace_misses:" | cut -d: -f2 | tr -d '\r')
        if [ $(( ${HITS:-0} + ${MISSES:-0} )) -gt 0 ]; then
            HIT_RATE=$(( HITS * 100 / (HITS + MISSES) ))
            if [ "$HIT_RATE" -lt "$MIN_HIT_RATE" ]; then
                echo "告警: 节点 $port 缓存命中率低于阈值 (${HIT_RATE}% < ${MIN_HIT_RATE}%)"
            fi
        fi
    done
}

# 每5分钟检查一次
while true; do
check_alerts
sleep 300
done
EOF

chmod +x /opt/redis-alert.sh

# 启动告警服务
nohup /opt/redis-alert.sh > /var/log/redis-alert.log 2>&1 &

log "性能告警设置完成"
}

# 主函数
main() {
case $1 in
"monitor")
monitor_redis_performance
;;
"report")
generate_performance_report
;;
"alerts")
setup_performance_alerts
;;
*)
echo "用法: $0 {monitor|report|alerts}"
echo " monitor - 开始性能监控"
echo " report - 生成性能报告"
echo " alerts - 设置性能告警"
;;
esac
}

# 执行主函数
main "$@"

5. 数据持久化

5.1 数据备份脚本

#!/bin/bash
# redis_backup.sh - Redis数据备份脚本
# @author 运维实战

# 备份配置
BACKUP_DIR="/opt/redis-backup"
RETENTION_DAYS=7
CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 创建备份目录
create_backup_dir() {
local backup_date=$(date +%Y%m%d_%H%M%S)
local backup_path="$BACKUP_DIR/$backup_date"

mkdir -p $backup_path
echo $backup_path
}

# 备份单个节点
backup_single_node() {
local port=$1
local backup_path=$2

log "备份节点 $port 数据..."

if redis-cli -p $port ping > /dev/null 2>&1; then
NODE_DIR="/opt/redis-cluster/$port"

# 创建RDB快照
redis-cli -p $port bgsave

# 等待后台快照完成 (rdb_bgsave_in_progress 变为0)
while redis-cli -p $port info persistence | grep -q "rdb_bgsave_in_progress:1"; do
sleep 1
done

# 复制数据文件
cp $NODE_DIR/dump.rdb $backup_path/dump_$port.rdb

# 备份AOF文件
if [ -f "$NODE_DIR/appendonly.aof" ]; then
cp $NODE_DIR/appendonly.aof $backup_path/appendonly_$port.aof
fi

# 备份配置文件
cp $NODE_DIR/redis.conf $backup_path/redis_$port.conf

log "节点 $port 备份完成"
else
log "错误: 节点 $port 连接失败"
fi
}

# 备份整个集群
backup_cluster() {
log "开始备份Redis集群..."

local backup_path=$(create_backup_dir)
log "备份路径: $backup_path"

# 备份所有节点
for port in "${CLUSTER_PORTS[@]}"; do
backup_single_node $port $backup_path
done

# 备份集群配置
redis-cli -p 7000 cluster nodes > $backup_path/cluster_nodes.txt
redis-cli -p 7000 cluster info > $backup_path/cluster_info.txt

# 创建备份信息文件
cat > $backup_path/backup_info.txt << EOF
备份时间: $(date)
集群节点: ${CLUSTER_PORTS[@]}
备份类型: 全量备份
EOF

# 压缩备份文件
cd $BACKUP_DIR
tar -czf "$(basename $backup_path).tar.gz" "$(basename $backup_path)"
rm -rf $backup_path

log "集群备份完成: $backup_path.tar.gz"
}

# 增量备份
incremental_backup() {
log "开始增量备份..."

local backup_path=$(create_backup_dir)
log "增量备份路径: $backup_path"

# 只备份AOF文件
for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
NODE_DIR="/opt/redis-cluster/$port"

if [ -f "$NODE_DIR/appendonly.aof" ]; then
cp $NODE_DIR/appendonly.aof $backup_path/appendonly_$port.aof
log "节点 $port AOF文件备份完成"
fi
fi
done

# 压缩备份文件
cd $BACKUP_DIR
tar -czf "incremental_$(basename $backup_path).tar.gz" "$(basename $backup_path)"
rm -rf $backup_path

log "增量备份完成"
}

# 恢复数据
restore_data() {
local backup_file=$1
local target_port=$2

log "从 $backup_file 恢复数据到节点 $target_port..."

if [ ! -f "$backup_file" ]; then
log "错误: 备份文件不存在 $backup_file"
exit 1
fi

# 解压备份文件
local temp_dir="/tmp/redis_restore_$(date +%s)"
mkdir -p $temp_dir
tar -xzf $backup_file -C $temp_dir

# 停止目标节点
redis-cli -p $target_port shutdown

# 恢复数据文件
local target_dir="/opt/redis-cluster/$target_port"
cp $temp_dir/dump_$target_port.rdb $target_dir/dump.rdb

if [ -f "$temp_dir/appendonly_$target_port.aof" ]; then
cp $temp_dir/appendonly_$target_port.aof $target_dir/appendonly.aof
fi

# 启动目标节点
redis-server $target_dir/redis.conf

# 清理临时文件
rm -rf $temp_dir

log "数据恢复完成"
}

# 清理过期备份
cleanup_old_backups() {
log "清理过期备份文件..."

find $BACKUP_DIR -name "*.tar.gz" -mtime +$RETENTION_DAYS -delete

log "过期备份清理完成"
}

# 主函数
main() {
case $1 in
"backup")
backup_cluster
;;
"incremental")
incremental_backup
;;
"restore")
restore_data $2 $3
;;
"cleanup")
cleanup_old_backups
;;
*)
echo "用法: $0 {backup|incremental|restore|cleanup}"
echo " backup - 全量备份集群"
echo " incremental - 增量备份"
echo " restore <backup_file> <target_port> - 恢复数据"
echo " cleanup - 清理过期备份"
;;
esac
}

# 执行主函数
main "$@"

5.2 数据同步脚本

#!/bin/bash
# redis_sync.sh - Redis数据同步脚本
# @author 运维实战

# 主从同步配置
MASTER_PORT=7000
SLAVE_PORTS=(7001 7002)

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 设置主从关系
setup_master_slave() {
log "设置Redis主从关系..."

for slave_port in "${SLAVE_PORTS[@]}"; do
log "设置节点 $slave_port$MASTER_PORT 的从节点..."

# 停止从节点
redis-cli -p $slave_port shutdown

# 修改从节点配置
SLAVE_DIR="/opt/redis-cluster/$slave_port"
sed -i "s/^# replicaof <masterip> <masterport>/replicaof 127.0.0.1 $MASTER_PORT/" $SLAVE_DIR/redis.conf

# 启动从节点
redis-server $SLAVE_DIR/redis.conf

# 等待同步完成
sleep 5

# 检查同步状态
REPLICA_INFO=$(redis-cli -p $slave_port info replication)
REPLICA_STATE=$(echo "$REPLICA_INFO" | grep "master_link_status:" | cut -d: -f2 | tr -d '\r')

if [ "$REPLICA_STATE" = "up" ]; then
log "节点 $slave_port 主从同步成功"
else
log "警告: 节点 $slave_port 主从同步失败"
fi
done
}

# 检查同步状态
check_sync_status() {
log "检查Redis同步状态..."

# 检查主节点
MASTER_INFO=$(redis-cli -p $MASTER_PORT info replication)
CONNECTED_SLAVES=$(echo "$MASTER_INFO" | grep "connected_slaves:" | cut -d: -f2 | tr -d '\r')
log "主节点 $MASTER_PORT 连接的从节点数: $CONNECTED_SLAVES"

# 检查从节点
for slave_port in "${SLAVE_PORTS[@]}"; do
SLAVE_INFO=$(redis-cli -p $slave_port info replication)
MASTER_LINK_STATUS=$(echo "$SLAVE_INFO" | grep "master_link_status:" | cut -d: -f2 | tr -d '\r')
REPL_OFFSET=$(echo "$SLAVE_INFO" | grep "slave_repl_offset:" | cut -d: -f2 | tr -d '\r')

log "从节点 $slave_port 状态: $MASTER_LINK_STATUS, 复制偏移量: $REPL_OFFSET"
done
}

# 手动同步
manual_sync() {
log "执行手动同步..."

for slave_port in "${SLAVE_PORTS[@]}"; do
# 先断开再重新指向主节点,强制触发一次重新同步
redis-cli -p $slave_port replicaof no one
redis-cli -p $slave_port replicaof 127.0.0.1 $MASTER_PORT

log "节点 $slave_port 手动同步完成"
done
}

# 故障转移
failover() {
log "执行故障转移..."

# 检查主节点状态
if ! redis-cli -p $MASTER_PORT ping > /dev/null 2>&1; then
log "主节点 $MASTER_PORT 故障,开始故障转移..."

# 提升第一个从节点为主节点
NEW_MASTER=${SLAVE_PORTS[0]}

# 停止从节点服务
redis-cli -p $NEW_MASTER shutdown

# 修改配置为主节点
NEW_MASTER_DIR="/opt/redis-cluster/$NEW_MASTER"
sed -i 's/^replicaof/#replicaof/' $NEW_MASTER_DIR/redis.conf

# 启动新主节点
redis-server $NEW_MASTER_DIR/redis.conf

# 更新其他从节点指向新主节点
for slave_port in "${SLAVE_PORTS[@]:1}"; do
redis-cli -p $slave_port replicaof 127.0.0.1 $NEW_MASTER
done

log "故障转移完成,新主节点: $NEW_MASTER"
else
log "主节点运行正常,无需故障转移"
fi
}

# 主函数
main() {
case $1 in
"setup")
setup_master_slave
;;
"status")
check_sync_status
;;
"sync")
manual_sync
;;
"failover")
failover
;;
*)
echo "用法: $0 {setup|status|sync|failover}"
echo " setup - 设置主从关系"
echo " status - 检查同步状态"
echo " sync - 手动同步"
echo " failover - 故障转移"
;;
esac
}

# 执行主函数
main "$@"

6. 监控告警

6.1 监控系统部署

#!/bin/bash
# redis_monitor_deploy.sh - Redis监控系统部署脚本
# @author 运维实战

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 安装Prometheus
install_prometheus() {
log "安装Prometheus..."

# 下载Prometheus
wget https://github.com/prometheus/prometheus/releases/download/v2.40.0/prometheus-2.40.0.linux-amd64.tar.gz
tar -xzf prometheus-2.40.0.linux-amd64.tar.gz
mv prometheus-2.40.0.linux-amd64 /opt/prometheus

# 创建Prometheus配置
cat > /opt/prometheus/prometheus.yml << EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "redis_rules.yml"

# 告警发送到本机Alertmanager (部署见6.2节,默认端口9093)
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

scrape_configs:
  - job_name: 'redis'
    static_configs:
      - targets: ['localhost:9121']
    scrape_interval: 5s
EOF

# 创建Redis规则文件
cat > /opt/prometheus/redis_rules.yml << EOF
groups:
  - name: redis
    rules:
      - alert: RedisDown
        expr: redis_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance is down"
          description: "Redis instance {{ \$labels.instance }} has been down for more than 0 minutes."

      - alert: RedisMemoryHigh
        expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Redis memory usage is high"
          description: "Redis memory usage is above 80% for more than 2 minutes."

      - alert: RedisSlowLog
        expr: increase(redis_slowlog_length[5m]) > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Redis slow log entries"
          description: "Redis instance {{ \$labels.instance }} has more than 10 slow log entries in the last 5 minutes."
EOF

# 创建systemd服务
cat > /etc/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=prometheus
ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/opt/prometheus/data
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# 创建prometheus用户
useradd --no-create-home --shell /bin/false prometheus
chown -R prometheus:prometheus /opt/prometheus

# 启动服务
systemctl daemon-reload
systemctl start prometheus
systemctl enable prometheus

log "Prometheus安装完成"
}

# 安装Redis Exporter
install_redis_exporter() {
log "安装Redis Exporter..."

# 下载Redis Exporter
wget https://github.com/oliver006/redis_exporter/releases/download/v1.45.0/redis_exporter-v1.45.0.linux-amd64.tar.gz
tar -xzf redis_exporter-v1.45.0.linux-amd64.tar.gz
mv redis_exporter-v1.45.0.linux-amd64/redis_exporter /usr/local/bin/

# 创建systemd服务
cat > /etc/systemd/system/redis-exporter.service << EOF
[Unit]
Description=Redis Exporter
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=redis
# --redis.addr 只接受单个实例;其余节点可为每个端口各运行一个exporter,或使用exporter的多目标抓取方式
ExecStart=/usr/local/bin/redis_exporter --redis.addr=redis://localhost:7000
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# 启动服务
systemctl daemon-reload
systemctl start redis-exporter
systemctl enable redis-exporter

log "Redis Exporter安装完成"
}

# 安装Grafana
install_grafana() {
log "安装Grafana..."

# 添加Grafana仓库
wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add -
echo "deb https://packages.grafana.com/oss/deb stable main" | sudo tee /etc/apt/sources.list.d/grafana.list

# 安装Grafana
apt-get update
apt-get install -y grafana

# 启动服务
systemctl start grafana-server
systemctl enable grafana-server

log "Grafana安装完成"
}

# 配置Grafana仪表板
configure_grafana_dashboard() {
log "配置Grafana仪表板..."

# 等待Grafana启动
sleep 10

# 创建数据源
curl -X POST \
http://admin:admin@localhost:3000/api/datasources \
-H 'Content-Type: application/json' \
-d '{
"name": "Prometheus",
"type": "prometheus",
"url": "http://localhost:9090",
"access": "proxy",
"isDefault": true
}'

# 导入Redis仪表板
curl -X POST \
http://admin:admin@localhost:3000/api/dashboards/db \
-H 'Content-Type: application/json' \
-d @/opt/grafana-dashboard.json

log "Grafana仪表板配置完成"
}

# 主函数
main() {
log "=== Redis监控系统部署开始 ==="
install_prometheus
install_redis_exporter
install_grafana
configure_grafana_dashboard
log "=== Redis监控系统部署完成 ==="
}

# 执行主函数
main "$@"

6.2 告警配置脚本

#!/bin/bash
# redis_alert_config.sh - Redis告警配置脚本
# @author 运维实战

# 日志函数
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 安装Alertmanager
install_alertmanager() {
log "安装Alertmanager..."

# 下载Alertmanager
wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz
tar -xzf alertmanager-0.25.0.linux-amd64.tar.gz
mv alertmanager-0.25.0.linux-amd64 /opt/alertmanager

# 创建Alertmanager配置
cat > /opt/alertmanager/alertmanager.yml << EOF
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@example.com'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'
        send_resolved: true

  - name: 'email'
    email_configs:
      - to: 'admin@example.com'
        headers:
          Subject: 'Redis Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
EOF

# 创建systemd服务
cat > /etc/systemd/system/alertmanager.service << EOF
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=alertmanager
ExecStart=/opt/alertmanager/alertmanager --config.file=/opt/alertmanager/alertmanager.yml --storage.path=/opt/alertmanager/data
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# 创建alertmanager用户
useradd --no-create-home --shell /bin/false alertmanager
chown -R alertmanager:alertmanager /opt/alertmanager

# 启动服务
systemctl daemon-reload
systemctl start alertmanager
systemctl enable alertmanager

log "Alertmanager安装完成"
}

# 配置邮件告警
configure_email_alerts() {
log "配置邮件告警..."

# 安装邮件服务
apt-get install -y postfix mailutils

# 配置邮件服务
cat > /etc/postfix/main.cf << EOF
myhostname = redis-monitor.example.com
mydomain = example.com
myorigin = \$mydomain
inet_interfaces = loopback-only
mydestination = \$myhostname, localhost.\$mydomain, localhost, \$mydomain
relayhost = [smtp.gmail.com]:587
smtp_use_tls = yes
smtp_sasl_auth_enable = yes
smtp_sasl_password_maps = hash:/etc/postfix/sasl_passwd
smtp_sasl_security_options = noanonymous
EOF

# 配置SMTP认证
cat > /etc/postfix/sasl_passwd << EOF
[smtp.gmail.com]:587 username@gmail.com:password
EOF

chmod 600 /etc/postfix/sasl_passwd
postmap /etc/postfix/sasl_passwd

# 重启邮件服务
systemctl restart postfix

log "邮件告警配置完成"
}

# 配置Webhook告警
configure_webhook_alerts() {
log "配置Webhook告警..."

# 创建Webhook接收器
cat > /opt/webhook-receiver.py << 'EOF'
#!/usr/bin/env python3
import json
import requests
from flask import Flask, request

app = Flask(__name__)

@app.route('/', methods=['POST'])
def webhook():
    data = request.get_json()

    # 处理告警数据
    for alert in data.get('alerts', []):
        alert_name = alert.get('labels', {}).get('alertname')
        alert_status = alert.get('status')
        alert_summary = alert.get('annotations', {}).get('summary', '')

        # 发送到钉钉
        send_to_dingtalk(alert_name, alert_status, alert_summary)

        # 发送到企业微信
        send_to_wechat(alert_name, alert_status, alert_summary)

    return 'OK'

def send_to_dingtalk(alert_name, status, summary):
    webhook_url = "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN"

    message = {
        "msgtype": "text",
        "text": {
            "content": f"Redis告警\n告警名称: {alert_name}\n状态: {status}\n描述: {summary}"
        }
    }

    requests.post(webhook_url, json=message)

def send_to_wechat(alert_name, status, summary):
    webhook_url = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=YOUR_KEY"

    message = {
        "msgtype": "text",
        "text": {
            "content": f"Redis告警\n告警名称: {alert_name}\n状态: {status}\n描述: {summary}"
        }
    }

    requests.post(webhook_url, json=message)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5001)
EOF

chmod +x /opt/webhook-receiver.py

# 安装Python依赖
pip3 install flask requests

# 创建systemd服务
cat > /etc/systemd/system/webhook-receiver.service << EOF
[Unit]
Description=Webhook Receiver
After=network.target

[Service]
Type=simple
User=root
ExecStart=/usr/bin/python3 /opt/webhook-receiver.py
Restart=always

[Install]
WantedBy=multi-user.target
EOF

# 启动服务
systemctl daemon-reload
systemctl start webhook-receiver
systemctl enable webhook-receiver

log "Webhook告警配置完成"
}

# 主函数
main() {
log "=== Redis告警配置开始 ==="
install_alertmanager
configure_email_alerts
configure_webhook_alerts
log "=== Redis告警配置完成 ==="
}

# 执行主函数
main "$@"

7. 故障处理

7.1 故障诊断脚本

#!/bin/bash
# redis_troubleshoot.sh - Redis故障诊断脚本
# @author 运维实战

# 集群节点端口与日志函数 (与前文集群部署脚本保持一致)
CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# 诊断Redis连接问题
diagnose_connection() {
log "诊断Redis连接问题..."

for port in "${CLUSTER_PORTS[@]}"; do
log "检查节点 $port 连接状态..."

# 检查端口是否监听
if netstat -tlnp | grep ":$port " > /dev/null; then
log "节点 $port 端口监听正常"
else
log "错误: 节点 $port 端口未监听"
fi

# 检查Redis服务状态
if redis-cli -p $port ping > /dev/null 2>&1; then
log "节点 $port Redis服务正常"
else
log "错误: 节点 $port Redis服务异常"

# 检查Redis进程
REDIS_PID=$(ps aux | grep "redis-server.*:$port" | grep -v grep | awk '{print $2}')
if [ -n "$REDIS_PID" ]; then
log "Redis进程存在: $REDIS_PID"
else
log "Redis进程不存在"
fi

# 检查Redis日志
REDIS_LOG="/opt/redis-cluster/$port/redis.log"
if [ -f "$REDIS_LOG" ]; then
log "最近的Redis日志:"
tail -20 $REDIS_LOG
fi
fi
done
}

# 诊断内存问题
diagnose_memory() {
log "诊断Redis内存问题..."

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
log "节点 $port 内存诊断:"

# 获取内存信息
MEMORY_INFO=$(redis-cli -p $port info memory)
USED_MEMORY=$(echo "$MEMORY_INFO" | grep "used_memory:" | cut -d: -f2 | tr -d '\r')
USED_MEMORY_HUMAN=$(echo "$MEMORY_INFO" | grep "used_memory_human:" | cut -d: -f2 | tr -d '\r')
MAX_MEMORY=$(echo "$MEMORY_INFO" | grep "maxmemory:" | cut -d: -f2 | tr -d '\r')

log "已使用内存: $USED_MEMORY_HUMAN"
log "最大内存: $MAX_MEMORY"

# 检查内存碎片
MEMORY_FRAGMENTATION=$(echo "$MEMORY_INFO" | grep "mem_fragmentation_ratio:" | cut -d: -f2 | tr -d '\r')
log "内存碎片率: $MEMORY_FRAGMENTATION"

if (( $(echo "$MEMORY_FRAGMENTATION > 1.5" | bc -l) )); then
log "警告: 内存碎片率过高"
fi

# 检查大键
log "分析大键..."
redis-cli -p $port --bigkeys | head -10
fi
done
}

# 诊断性能问题
diagnose_performance() {
log "诊断Redis性能问题..."

for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
log "节点 $port 性能诊断:"

# 获取统计信息
STATS_INFO=$(redis-cli -p $port info stats)

# 检查连接数 (connected_clients 位于clients段,而非stats段)
CONNECTED_CLIENTS=$(redis-cli -p $port info clients | grep "connected_clients:" | cut -d: -f2 | tr -d '\r')
log "连接客户端数: $CONNECTED_CLIENTS"

# 检查命令统计
TOTAL_COMMANDS=$(echo "$STATS_INFO" | grep "total_commands_processed:" | cut -d: -f2 | tr -d '\r')
log "总命令数: $TOTAL_COMMANDS"

# 检查慢查询
SLOWLOG_LEN=$(redis-cli -p $port slowlog len)
log "慢查询数量: $SLOWLOG_LEN"

if [ "$SLOWLOG_LEN" -gt 0 ]; then
log "最近的慢查询:"
redis-cli -p $port slowlog get 5
fi

# 检查键空间命中率
HITS=$(echo "$STATS_INFO" | grep "keyspace_hits:" | cut -d: -f2 | tr -d '\r')
MISSES=$(echo "$STATS_INFO" | grep "keyspace_misses:" | cut -d: -f2 | tr -d '\r')

if [ "$HITS" != "0" ] && [ "$MISSES" != "0" ]; then
HIT_RATE=$(echo "scale=2; $HITS * 100 / ($HITS + $MISSES)" | bc)
log "键空间命中率: ${HIT_RATE}%"

if (( $(echo "$HIT_RATE < 90" | bc -l) )); then
log "警告: 键空间命中率过低"
fi
fi
fi
done
}

# 诊断集群问题
diagnose_cluster() {
log "诊断Redis集群问题..."

# 检查集群状态
CLUSTER_INFO=$(redis-cli -p 7000 cluster info)
CLUSTER_STATE=$(echo "$CLUSTER_INFO" | grep "cluster_state:" | cut -d: -f2 | tr -d '\r')
log "集群状态: $CLUSTER_STATE"

if [ "$CLUSTER_STATE" != "ok" ]; then
log "错误: 集群状态异常"
fi

# 检查集群节点
CLUSTER_NODES=$(redis-cli -p 7000 cluster nodes)
log "集群节点信息:"
echo "$CLUSTER_NODES"

# 检查故障节点
FAILED_NODES=$(echo "$CLUSTER_NODES" | grep "fail")
if [ -n "$FAILED_NODES" ]; then
log "发现故障节点:"
echo "$FAILED_NODES"
fi

# 检查主从关系
MASTER_NODES=$(echo "$CLUSTER_NODES" | grep "master")
SLAVE_NODES=$(echo "$CLUSTER_NODES" | grep "slave")

log "主节点数量: $(echo "$MASTER_NODES" | wc -l)"
log "从节点数量: $(echo "$SLAVE_NODES" | wc -l)"
}

# 自动修复
auto_fix() {
log "开始自动修复..."

# 修复连接问题
for port in "${CLUSTER_PORTS[@]}"; do
if ! redis-cli -p $port ping > /dev/null 2>&1; then
log "尝试重启节点 $port..."

# 停止节点
redis-cli -p $port shutdown 2>/dev/null || true

# 启动节点
NODE_DIR="/opt/redis-cluster/$port"
redis-server $NODE_DIR/redis.conf

sleep 3

if redis-cli -p $port ping > /dev/null 2>&1; then
log "节点 $port 修复成功"
else
log "节点 $port 修复失败"
fi
fi
done

# 修复内存问题
for port in "${CLUSTER_PORTS[@]}"; do
if redis-cli -p $port ping > /dev/null 2>&1; then
# 执行内存回收
redis-cli -p $port memory purge

# 清理过期键
redis-cli -p $port --scan --pattern "*" | while read key; do
TTL=$(redis-cli -p $port ttl "$key")
if [ "$TTL" = "-2" ]; then
redis-cli -p $port del "$key"
fi
done

log "节点 $port 内存清理完成"
fi
done
}

# 主函数
main() {
case $1 in
"connection")
diagnose_connection
;;
"memory")
diagnose_memory
;;
"performance")
diagnose_performance
;;
"cluster")
diagnose_cluster
;;
"fix")
auto_fix
;;
"all")
diagnose_connection
diagnose_memory
diagnose_performance
diagnose_cluster
;;
*)
echo "用法: $0 {connection|memory|performance|cluster|fix|all}"
echo " connection - 诊断连接问题"
echo " memory - 诊断内存问题"
echo " performance - 诊断性能问题"
echo " cluster - 诊断集群问题"
echo " fix - 自动修复"
echo " all - 全面诊断"
;;
esac
}

# 执行主函数
main "$@"

8. 总结

8.1 运维最佳实践

  1. 集群部署: 使用Redis Cluster实现高可用和水平扩展
  2. 性能优化: 合理配置内存策略和持久化机制
  3. 监控告警: 建立完善的监控体系和告警机制
  4. 数据备份: 定期备份数据,确保数据安全 (定时备份任务示例见本列表之后)
  5. 故障处理: 建立故障诊断和自动修复机制
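
针对第4点的定期备份,下面给出一个示例 crontab 配置 (假设第5.1节的备份脚本保存为 /opt/scripts/redis_backup.sh,路径与时间点仅供参考):

# 每天凌晨2点全量备份
0 2 * * * /opt/scripts/redis_backup.sh backup >> /var/log/redis-backup.log 2>&1
# 每小时执行一次增量备份
0 * * * * /opt/scripts/redis_backup.sh incremental >> /var/log/redis-backup.log 2>&1
# 每天凌晨4点清理过期备份
0 4 * * * /opt/scripts/redis_backup.sh cleanup >> /var/log/redis-backup.log 2>&1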

8.2 关键指标监控

  • 内存使用率: 监控内存使用情况,避免内存溢出 (各项指标的查询命令见本列表之后的示例)
  • 连接数: 监控客户端连接数,防止连接耗尽
  • 命中率: 监控缓存命中率,优化缓存策略
  • 响应时间: 监控命令响应时间,确保性能
  • 集群状态: 监控集群健康状态,及时发现问题
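
这些指标都可以用 redis-cli 直接查询,以下以7000端口节点为例 (端口可按需替换):

redis-cli -p 7000 info memory | grep -E "used_memory_human|maxmemory_human"    # 内存使用
redis-cli -p 7000 info clients | grep connected_clients                        # 连接数
redis-cli -p 7000 info stats | grep -E "keyspace_hits|keyspace_misses"         # 命中/未命中次数
redis-cli -p 7000 --latency                                                    # 响应延迟测试 (Ctrl+C 结束)
redis-cli -p 7000 cluster info | grep cluster_state                            # 集群状态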

8.3 运维工具推荐

  1. 监控工具: Prometheus + Grafana + Redis Exporter
  2. 告警工具: Alertmanager + Webhook
  3. 管理工具: Redis Commander, RedisInsight
  4. 备份工具: 自定义备份脚本
  5. 诊断工具: redis-cli (自Redis 5起其 --cluster 子命令已取代早期的 redis-trib.rb)

通过本文的Redis运维实战指南,您可以建立完善的Redis分布式缓存服务运维体系,确保系统的高可用性和高性能。记住,运维是一个持续改进的过程,需要根据业务需求和技术发展不断优化和完善。