1. Redis运维概述

Redis是高性能的内存数据库,广泛应用于分布式缓存、会话存储、消息队列等场景。本文将详细介绍Redis分布式缓存服务的运维实战经验,包括集群部署、性能优化、数据持久化、监控告警的完整解决方案。
1.1 核心功能
分布式缓存 : 提供高性能的分布式缓存服务
数据持久化 : RDB和AOF两种持久化机制
集群管理 : Redis Cluster集群部署和管理
性能优化 : 内存优化和性能调优
监控告警 : 实时监控和故障告警
1.2 技术架构

客户端应用 → Redis集群   → 内存存储
    ↓           ↓            ↓
 缓存服务  → 数据持久化 → 磁盘存储
#!/bin/bash
#
# Section 2 "Environment preparation" / 2.1 "System requirements check".
#
# Verifies that the host is Linux, reports memory and root-disk usage, probes
# outbound connectivity, and installs Redis through apt-get or yum when the
# redis-server binary is missing.
#
# Environment:
#   REDIS_SERVICE  systemd unit to query/start (default: redis)

REDIS_SERVICE="${REDIS_SERVICE:-redis}"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Report OS, memory, disk and network state. Exits 1 on a non-Linux host.
check_system() {
  local os_version total_mem available_mem disk_usage

  log "开始检查系统环境..."

  if [[ "$OSTYPE" == linux-gnu* ]]; then
    log "操作系统: Linux"
    os_version=$(grep '^PRETTY_NAME=' /etc/os-release 2>/dev/null | cut -d'"' -f2)
    log "系统版本: $os_version"
  else
    log "错误: 不支持的操作系统 $OSTYPE"
    exit 1
  fi

  # Single pass over `free`: column 2 is total, column 7 is available.
  read -r total_mem available_mem < <(free -h | awk '/^Mem:/ {print $2, $7}')
  log "总内存: $total_mem, 可用内存: $available_mem"

  # -P keeps every filesystem on one line even when the device name is long.
  disk_usage=$(df -P / | awk 'NR == 2 {gsub(/%/, "", $5); print $5}')
  if [[ "$disk_usage" =~ ^[0-9]+$ ]] && (( disk_usage > 80 )); then
    log "警告: 磁盘使用率过高 (${disk_usage}%)"
  else
    log "磁盘使用率: ${disk_usage}%"
  fi

  # -W bounds the wait so an unreachable network cannot stall the check.
  if ping -c 1 -W 3 8.8.8.8 >/dev/null 2>&1; then
    log "网络连接正常"
  else
    log "警告: 网络连接异常"
  fi
}

# Install Redis with the available package manager, then start and enable it.
# Exits 1 when no supported package manager exists or installation fails.
install_redis() {
  log "开始安装Redis..."

  if command -v apt-get >/dev/null 2>&1; then
    sudo apt-get update
    if ! sudo apt-get install -y redis-server; then
      log "错误: Redis安装失败"
      exit 1
    fi
  elif command -v yum >/dev/null 2>&1; then
    # Install only the package: a blanket `yum update -y` would upgrade
    # every package on the host as a side effect of an environment check.
    if ! sudo yum install -y redis; then
      log "错误: Redis安装失败"
      exit 1
    fi
  else
    log "错误: 不支持的包管理器"
    exit 1
  fi

  sudo systemctl start "$REDIS_SERVICE"
  sudo systemctl enable "$REDIS_SERVICE"
  log "Redis安装完成"
}

# Report the installed Redis version and service state; install when absent.
check_redis_installation() {
  local redis_version

  log "检查Redis安装状态..."

  if ! command -v redis-server >/dev/null 2>&1; then
    log "Redis未安装,开始安装..."
    install_redis
    return
  fi

  redis_version=$(redis-server --version | head -1)
  log "Redis已安装: $redis_version"

  if systemctl is-active --quiet "$REDIS_SERVICE"; then
    log "Redis服务运行正常"
  else
    log "Redis服务未运行"
  fi
}

main() {
  log "=== Redis环境检查开始 ==="
  check_system
  check_redis_installation
  log "=== Redis环境检查完成 ==="
}

main "$@"
#!/bin/bash
#
# Section 2.2 "Redis configuration tuning".
#
# Backs up redis.conf, applies memory / persistence / network / logging
# settings, and restarts the service.
#
# Environment:
#   REDIS_CONF     configuration file to edit (default: /etc/redis/redis.conf)
#   REDIS_SERVICE  systemd unit to restart   (default: redis)

REDIS_CONF="${REDIS_CONF:-/etc/redis/redis.conf}"
REDIS_SERVICE="${REDIS_SERVICE:-redis}"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Copy the configuration next to itself with a timestamp suffix.
# Exits 1 when the copy fails so the file is never edited without a backup.
backup_config() {
  log "备份Redis配置文件..."
  if ! sudo cp -- "$REDIS_CONF" "${REDIS_CONF}.backup.$(date +%Y%m%d_%H%M%S)"; then
    log "错误: 配置文件备份失败"
    exit 1
  fi
  log "配置文件已备份"
}

# Set a single-valued directive.
# Arguments: $1 - directive name, $2 - value
# An active directive is rewritten in place; otherwise the directive is
# appended. Commented template lines are left alone, because uncommenting one
# that sits above an active line would be overridden by that later line.
set_conf() {
  local key=$1 value=$2

  if sudo grep -Eq "^${key}[[:space:]]" "$REDIS_CONF"; then
    sudo sed -i -E "s|^${key}[[:space:]].*|${key} ${value}|" "$REDIS_CONF"
  else
    printf '%s %s\n' "$key" "$value" | sudo tee -a "$REDIS_CONF" >/dev/null
  fi
}

# Append a literal line unless an identical line is already present.
# Used for `save`, which may legitimately appear several times.
ensure_line() {
  local line=$1

  if ! sudo grep -qxF -- "$line" "$REDIS_CONF"; then
    printf '%s\n' "$line" | sudo tee -a "$REDIS_CONF" >/dev/null
  fi
}

# Apply the tuning profile to REDIS_CONF.
optimize_redis_config() {
  log "开始优化Redis配置..."

  # Memory
  set_conf maxmemory 2gb
  set_conf maxmemory-policy allkeys-lru

  # RDB snapshots
  ensure_line "save 900 1"
  ensure_line "save 300 10"
  ensure_line "save 60 10000"

  # AOF
  set_conf appendonly yes
  set_conf appendfsync everysec

  # Network
  set_conf tcp-keepalive 300
  set_conf timeout 300

  # Logging
  set_conf loglevel warning
  set_conf logfile /var/log/redis/redis-server.log

  log "Redis配置优化完成"
}

# Restart the service and confirm it came back. Exits 1 when it did not.
restart_redis() {
  log "重启Redis服务..."
  sudo systemctl restart "$REDIS_SERVICE"
  sleep 3

  if systemctl is-active --quiet "$REDIS_SERVICE"; then
    log "Redis服务重启成功"
  else
    log "错误: Redis服务重启失败"
    exit 1
  fi
}

main() {
  log "=== Redis配置优化开始 ==="
  backup_config
  optimize_redis_config
  restart_redis
  log "=== Redis配置优化完成 ==="
}

main "$@"
#!/bin/bash
#
# Section 3 "Redis cluster deployment" / 3.1 "Redis Cluster deployment script".
#
# Generates one configuration directory per port under CLUSTER_DIR, starts
# the nodes, joins them into a cluster with one replica per master, and runs
# a smoke test.

CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)
CLUSTER_NODES=${#CLUSTER_PORTS[@]}
CLUSTER_DIR="/opt/redis-cluster"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Poll PING until the node answers.
# Arguments: $1 - port, $2 - maximum seconds to wait (default 10)
# Returns:   0 once the node replies, 1 on timeout
wait_for_node() {
  local port=$1 tries=${2:-10}

  while (( tries-- > 0 )); do
    if redis-cli -p "$port" ping >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}

# Create each node directory and write its redis.conf.
create_cluster_dirs() {
  local port node_dir

  log "创建Redis集群目录..."

  for port in "${CLUSTER_PORTS[@]}"; do
    node_dir="$CLUSTER_DIR/$port"
    if ! mkdir -p -- "$node_dir"; then
      log "错误: 无法创建目录 $node_dir"
      exit 1
    fi

    cat > "$node_dir/redis.conf" << EOF
# Redis Cluster节点配置
port $port
cluster-enabled yes
cluster-config-file nodes-$port.conf
cluster-node-timeout 5000
appendonly yes
appendfsync everysec
save 900 1
save 300 10
save 60 10000
maxmemory 1gb
maxmemory-policy allkeys-lru
dir $node_dir
logfile $node_dir/redis.log
daemonize yes
pidfile $node_dir/redis.pid
EOF

    log "节点 $port 配置完成"
  done
}

# Start every node and wait until it answers PING. Exits 1 on failure.
# With `daemonize yes` the launcher returns before the server is ready, so
# its exit status alone does not prove the node is up.
start_redis_nodes() {
  local port

  log "启动Redis集群节点..."

  for port in "${CLUSTER_PORTS[@]}"; do
    if redis-server "$CLUSTER_DIR/$port/redis.conf" && wait_for_node "$port"; then
      log "节点 $port 启动成功"
    else
      log "错误: 节点 $port 启动失败"
      exit 1
    fi
  done
}

# Join all nodes into one cluster (one replica per master). Exits 1 on error.
create_cluster() {
  local port
  local -a endpoints=()

  log "创建Redis集群..."

  # An array keeps each endpoint a separate argument without relying on
  # word splitting of a concatenated string.
  for port in "${CLUSTER_PORTS[@]}"; do
    endpoints+=("127.0.0.1:$port")
  done

  if redis-cli --cluster create "${endpoints[@]}" --cluster-replicas 1 --cluster-yes; then
    log "Redis集群创建成功"
  else
    log "错误: Redis集群创建失败"
    exit 1
  fi
}

# Print topology and state, then write and read back a test key.
verify_cluster() {
  local entry_port=${CLUSTER_PORTS[0]}

  log "验证Redis集群状态..."

  redis-cli -p "$entry_port" cluster nodes
  redis-cli -p "$entry_port" cluster info

  # -c follows MOVED redirections; without it the command fails whenever the
  # key's slot belongs to another master.
  redis-cli -c -p "$entry_port" set test_key "cluster_test"
  redis-cli -c -p "$entry_port" get test_key

  log "集群验证完成"
}

main() {
  log "=== Redis集群部署开始 ==="
  create_cluster_dirs
  start_redis_nodes
  create_cluster
  verify_cluster
  log "=== Redis集群部署完成 ==="
}

main "$@"
#!/bin/bash
#
# Section 3.2 "Cluster management script".
#
# Usage: $0 {status|add|remove|reshard|backup}
# Day-to-day cluster operations: status report, adding and removing nodes,
# interactive resharding, and a per-node RDB/AOF backup.

CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)
CLUSTER_DIR="/opt/redis-cluster"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Run BGSAVE on a node and wait for it to finish.
# Arguments: $1 - port, $2 - maximum seconds to wait (default 300)
# Returns:   0 when the snapshot completed with status ok, 1 otherwise
# Waiting for dump.rdb to exist is not enough: a file left by an earlier
# save satisfies that test before the new snapshot has been written.
snapshot_node() {
  local port=$1 timeout=${2:-300} persistence

  redis-cli -p "$port" bgsave >/dev/null || return 1

  while (( timeout-- > 0 )); do
    persistence=$(redis-cli -p "$port" info persistence | tr -d '\r')
    if grep -q '^rdb_bgsave_in_progress:0$' <<<"$persistence"; then
      grep -q '^rdb_last_bgsave_status:ok$' <<<"$persistence"
      return
    fi
    sleep 1
  done
  return 1
}

# Log reachability, node id and role of every node, then the cluster state.
check_cluster_status() {
  local port node_id node_role

  log "检查Redis集群状态..."

  for port in "${CLUSTER_PORTS[@]}"; do
    if redis-cli -p "$port" ping >/dev/null 2>&1; then
      log "节点 $port : 运行正常"

      # The queried node flags its own line with "myself". Matching on the
      # port text fails because addresses are printed as ip:port@busport.
      read -r node_id node_role < <(
        redis-cli -p "$port" cluster nodes | awk '$3 ~ /myself/ {print $1, $3; exit}'
      )
      log "节点 $port ID: $node_id, 角色: $node_role"
    else
      log "节点 $port : 连接失败"
    fi
  done

  redis-cli -p "${CLUSTER_PORTS[0]}" cluster info | grep cluster_state
}

# Create, start and join a new node.
# Arguments: $1 - port of the new node, $2 - port of a current cluster member
add_node() {
  local new_port=$1 existing_port=$2 new_node_dir

  if [[ -z "$new_port" || -z "$existing_port" ]]; then
    log "错误: 用法 add <new_port> <existing_port>"
    return 1
  fi

  log "添加新节点 $new_port 到集群..."

  new_node_dir="$CLUSTER_DIR/$new_port"
  mkdir -p -- "$new_node_dir" || return 1

  cat > "$new_node_dir/redis.conf" << EOF
port $new_port
cluster-enabled yes
cluster-config-file nodes-$new_port.conf
cluster-node-timeout 5000
appendonly yes
dir $new_node_dir
daemonize yes
pidfile $new_node_dir/redis.pid
EOF

  if ! redis-server "$new_node_dir/redis.conf"; then
    log "错误: 节点 $new_port 启动失败"
    return 1
  fi

  if ! redis-cli --cluster add-node "127.0.0.1:$new_port" "127.0.0.1:$existing_port"; then
    log "错误: 节点 $new_port 加入集群失败"
    return 1
  fi

  log "新节点 $new_port 添加完成"
}

# Remove a node from the cluster and shut it down.
# Arguments: $1 - node id, $2 - port of the node being removed (that same
#            port is shut down afterwards, so it must not be another member)
remove_node() {
  local node_id=$1 port=$2

  if [[ -z "$node_id" || -z "$port" ]]; then
    log "错误: 用法 remove <node_id> <port>"
    return 1
  fi

  log "移除节点 $node_id ..."

  # Skip the shutdown when removal failed, so a node that is still a
  # cluster member is not taken offline.
  if ! redis-cli --cluster del-node "127.0.0.1:$port" "$node_id"; then
    log "错误: 节点 $node_id 移除失败"
    return 1
  fi

  redis-cli -p "$port" shutdown
  log "节点 $node_id 移除完成"
}

# Show the topology and start the interactive resharding wizard.
reshard_cluster() {
  local entry_port=${CLUSTER_PORTS[0]}

  log "开始重新分片..."
  redis-cli -p "$entry_port" cluster nodes
  redis-cli --cluster reshard "127.0.0.1:$entry_port"
  log "重新分片完成"
}

# Snapshot every node and copy its RDB (and AOF, when present) into a
# timestamped directory under /opt/redis-backup.
backup_cluster() {
  local backup_dir port node_dir

  backup_dir="/opt/redis-backup/$(date +%Y%m%d_%H%M%S)"
  if ! mkdir -p -- "$backup_dir"; then
    log "错误: 无法创建备份目录 $backup_dir"
    return 1
  fi

  log "备份集群数据到 $backup_dir ..."

  for port in "${CLUSTER_PORTS[@]}"; do
    node_dir="$CLUSTER_DIR/$port"

    if ! snapshot_node "$port"; then
      log "错误: 节点 $port BGSAVE失败"
      continue
    fi

    cp -- "$node_dir/dump.rdb" "$backup_dir/dump_$port.rdb"

    # The AOF is optional: copy whichever layout is present, if any.
    if [[ -f "$node_dir/appendonly.aof" ]]; then
      cp -- "$node_dir/appendonly.aof" "$backup_dir/appendonly_$port.aof"
    elif [[ -d "$node_dir/appendonlydir" ]]; then
      cp -r -- "$node_dir/appendonlydir" "$backup_dir/appendonlydir_$port"
    fi

    log "节点 $port 数据备份完成"
  done

  log "集群数据备份完成: $backup_dir"
}

main() {
  case "${1:-}" in
    status) check_cluster_status ;;
    add) add_node "${2:-}" "${3:-}" ;;
    remove) remove_node "${2:-}" "${3:-}" ;;
    reshard) reshard_cluster ;;
    backup) backup_cluster ;;
    *)
      echo "用法: $0 {status|add|remove|reshard|backup}"
      echo "  status - 检查集群状态"
      echo "  add <new_port> <existing_port> - 添加新节点"
      echo "  remove <node_id> <port> - 移除节点"
      echo "  reshard - 重新分片"
      echo "  backup - 备份集群数据"
      ;;
  esac
}

main "$@"
#!/bin/bash
#
# Section 4 "Performance tuning" / 4.1 "Memory optimization script".
#
# Usage: $0 {check|cleanup|optimize|analyze}
#
# Environment:
#   REDIS_MAXMEMORY  value applied by `optimize` (default: 1gb)

CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)
REDIS_MAXMEMORY="${REDIS_MAXMEMORY:-1gb}"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Extract one "key:value" field from INFO output.
# Arguments: $1 - exact field name, $2 - INFO text
# Outputs:   the value with the trailing carriage return removed
info_field() {
  awk -F: -v key="$1" '$1 == key { sub(/\r$/, "", $2); print $2; exit }' <<<"$2"
}

# Log used/max memory per node and warn above 80% of maxmemory.
check_memory_usage() {
  local port memory_info used used_human max rate

  log "检查Redis内存使用情况..."

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue

    memory_info=$(redis-cli -p "$port" info memory)
    used=$(info_field used_memory "$memory_info")
    used_human=$(info_field used_memory_human "$memory_info")
    max=$(info_field maxmemory "$memory_info")

    log "节点 $port 内存使用: $used_human (最大: $max)"

    # maxmemory 0 means "no limit", so a usage ratio is undefined.
    if [[ "$used" =~ ^[0-9]+$ && "$max" =~ ^[0-9]+$ ]] && (( max > 0 )); then
      rate=$(awk -v u="$used" -v m="$max" 'BEGIN { printf "%.2f", u * 100 / m }')
      log "节点 $port 内存使用率: ${rate}%"

      if (( used * 100 > max * 80 )); then
        log "警告: 节点 $port 内存使用率过高"
      fi
    fi
  done
}

# Release allocator memory and visit every key on each node.
cleanup_expired_keys() {
  local port key

  log "清理过期键..."

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue

    redis-cli -p "$port" memory purge

    # TTL -2 means the key no longer exists, so a follow-up DEL would do
    # nothing. Each scanned key is only queried for its TTL.
    while IFS= read -r key; do
      redis-cli -p "$port" ttl "$key" >/dev/null
    done < <(redis-cli -p "$port" --scan --pattern '*')

    log "节点 $port 过期键清理完成"
  done
}

# Apply eviction policy, memory cap and compact-encoding thresholds at runtime.
optimize_memory_config() {
  local port setting ok
  local -a settings=(
    "maxmemory-policy allkeys-lru"
    "maxmemory $REDIS_MAXMEMORY"
    "hash-max-ziplist-entries 512"
    "hash-max-ziplist-value 64"
    "list-max-ziplist-size -2"
    "list-compress-depth 0"
  )

  log "优化内存配置..."

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue

    ok=1
    for setting in "${settings[@]}"; do
      # Intentional word splitting: "name value" becomes two arguments.
      # shellcheck disable=SC2086
      if ! redis-cli -p "$port" config set $setting; then
        ok=0
      fi
    done

    if (( ok )); then
      log "节点 $port 内存配置优化完成"
    else
      log "警告: 节点 $port 部分内存配置未生效"
    fi
  done
}

# Show the --bigkeys summary and the ten largest keys from a sample.
# MEMORY USAGE needs a key argument, so the sample comes from SCAN and each
# key is measured individually. At most 200 keys are sampled per node.
analyze_big_keys() {
  local port key

  log "分析大键..."

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue

    log "节点 $port 大键分析:"
    redis-cli -p "$port" --bigkeys | head -20

    redis-cli -p "$port" --scan --count 100 | head -n 200 |
      while IFS= read -r key; do
        printf '%s %s\n' "$(redis-cli -p "$port" memory usage "$key" samples 10)" "$key"
      done | sort -nr | head -10
  done
}

main() {
  case "${1:-}" in
    check) check_memory_usage ;;
    cleanup) cleanup_expired_keys ;;
    optimize) optimize_memory_config ;;
    analyze) analyze_big_keys ;;
    *)
      echo "用法: $0 {check|cleanup|optimize|analyze}"
      echo "  check - 检查内存使用情况"
      echo "  cleanup - 清理过期键"
      echo "  optimize - 优化内存配置"
      echo "  analyze - 分析大键"
      ;;
  esac
}

main "$@"
#!/bin/bash
#
# Section 4.2 "Performance monitoring script".
#
# Usage: $0 {monitor|report|alerts}
#   monitor - sample every node each MONITOR_INTERVAL seconds into LOG_FILE
#   report  - dump INFO stats/memory of every node into a dated report
#   alerts  - install and launch /opt/redis-alert.sh in the background

CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)
MONITOR_INTERVAL=60
LOG_FILE="/var/log/redis-monitor.log"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Extract one "key:value" field from INFO output.
# Arguments: $1 - exact field name, $2 - INFO text
info_field() {
  awk -F: -v key="$1" '$1 == key { sub(/\r$/, "", $2); print $2; exit }' <<<"$2"
}

# Append one sample line to LOG_FILE.
# Arguments: $1 - port, $2 - metric name, $3 - value
log_monitor_data() {
  local port=$1 metric=$2 value=$3 timestamp

  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] Node:%s Metric:%s Value:%s\n' \
    "$timestamp" "$port" "$metric" "$value" >>"$LOG_FILE"
}

# Sample one node and record its metrics.
# Arguments: $1 - port
collect_node_metrics() {
  local port=$1 info clients memory keys hits misses hit_rate

  info=$(redis-cli -p "$port" info)
  clients=$(info_field connected_clients "$info")
  memory=$(info_field used_memory_human "$info")
  # db0 is absent from INFO while the database is empty.
  keys=$(awk -F'[=,]' '/^db0:keys=/ {print $2; exit}' <<<"$info")
  hits=$(info_field keyspace_hits "$info")
  misses=$(info_field keyspace_misses "$info")

  log_monitor_data "$port" "connected_clients" "$clients"
  log_monitor_data "$port" "used_memory" "$memory"
  log_monitor_data "$port" "keys_count" "${keys:-0}"
  log_monitor_data "$port" "hits" "$hits"
  log_monitor_data "$port" "misses" "$misses"

  # The rate is defined whenever at least one lookup happened. Requiring both
  # counters to be non-zero would hide a perfect 100% (or 0%) hit rate.
  if [[ "$hits" =~ ^[0-9]+$ && "$misses" =~ ^[0-9]+$ ]] && (( hits + misses > 0 )); then
    hit_rate=$(awk -v h="$hits" -v m="$misses" 'BEGIN { printf "%.2f", h * 100 / (h + m) }')
    log_monitor_data "$port" "hit_rate" "$hit_rate"
  fi
}

# Sample all reachable nodes forever, pausing MONITOR_INTERVAL between rounds.
monitor_redis_performance() {
  local port

  log "开始监控Redis性能指标..."

  while true; do
    for port in "${CLUSTER_PORTS[@]}"; do
      if redis-cli -p "$port" ping >/dev/null 2>&1; then
        collect_node_metrics "$port"
      fi
    done
    sleep "$MONITOR_INTERVAL"
  done
}

# Write INFO stats and INFO memory of every reachable node to a dated file.
generate_performance_report() {
  local report_file port

  report_file="/var/log/redis-performance-report-$(date +%Y%m%d).txt"
  log "生成性能报告: $report_file"

  cat > "$report_file" << EOF
Redis集群性能报告
生成时间: $(date)
========================================

EOF

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue
    {
      echo "节点 $port 性能统计:"
      echo "----------------------------------------"
      redis-cli -p "$port" info stats
      echo ""
      redis-cli -p "$port" info memory
      echo ""
    } >>"$report_file"
  done

  log "性能报告生成完成: $report_file"
}

# Install /opt/redis-alert.sh and start it in the background.
# The generated script derives memory usage and hit rate itself, because
# INFO exposes no "hit_rate" field and no percentage of maxmemory.
setup_performance_alerts() {
  log "设置性能告警..."

  cat > /opt/redis-alert.sh << 'EOF'
#!/bin/bash
# Generated by the performance monitoring script: periodic threshold checks.

PORTS=(7000 7001 7002 7003 7004 7005)
MAX_CLIENTS=1000
MAX_MEMORY_PERCENT=80
MIN_HIT_RATE=90

info_field() {
  awk -F: -v key="$1" '$1 == key { sub(/\r$/, "", $2); print $2; exit }' <<<"$2"
}

check_alerts() {
  local port info clients used max hits misses percent

  for port in "${PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue
    info=$(redis-cli -p "$port" info)

    clients=$(info_field connected_clients "$info")
    used=$(info_field used_memory "$info")
    max=$(info_field maxmemory "$info")
    hits=$(info_field keyspace_hits "$info")
    misses=$(info_field keyspace_misses "$info")

    if (( ${clients:-0} > MAX_CLIENTS )); then
      echo "告警: 节点 $port connected_clients 超过阈值 ($clients > $MAX_CLIENTS)"
    fi

    if (( ${max:-0} > 0 )); then
      percent=$(( ${used:-0} * 100 / max ))
      if (( percent > MAX_MEMORY_PERCENT )); then
        echo "告警: 节点 $port used_memory 超过阈值 (${percent}% > ${MAX_MEMORY_PERCENT}%)"
      fi
    fi

    if (( ${hits:-0} + ${misses:-0} > 0 )); then
      percent=$(( ${hits:-0} * 100 / (${hits:-0} + ${misses:-0}) ))
      if (( percent < MIN_HIT_RATE )); then
        echo "告警: 节点 $port hit_rate 低于阈值 ($percent < $MIN_HIT_RATE)"
      fi
    fi
  done
}

while true; do
  check_alerts
  sleep 300
done
EOF

  chmod +x /opt/redis-alert.sh
  nohup /opt/redis-alert.sh > /var/log/redis-alert.log 2>&1 &

  log "性能告警设置完成"
}

main() {
  case "${1:-}" in
    monitor) monitor_redis_performance ;;
    report) generate_performance_report ;;
    alerts) setup_performance_alerts ;;
    *)
      echo "用法: $0 {monitor|report|alerts}"
      echo "  monitor - 开始性能监控"
      echo "  report - 生成性能报告"
      echo "  alerts - 设置性能告警"
      ;;
  esac
}

main "$@"
#!/bin/bash
#
# Section 5 "Data persistence" / 5.1 "Data backup script".
#
# Usage: $0 {backup|incremental|restore|cleanup}
#   backup                              - RDB + AOF + config of every node
#   incremental                         - AOF files only
#   restore <backup_file> <target_port> - restore one node from an archive
#   cleanup                             - delete archives older than RETENTION_DAYS

CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)
CLUSTER_DIR="/opt/redis-cluster"
BACKUP_DIR="/opt/redis-backup"
RETENTION_DAYS=7

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Create a timestamped directory under BACKUP_DIR.
# Outputs: the directory path (and nothing else) on stdout
create_backup_dir() {
  local backup_path

  backup_path="$BACKUP_DIR/$(date +%Y%m%d_%H%M%S)"
  mkdir -p -- "$backup_path" || return 1
  printf '%s\n' "$backup_path"
}

# Run BGSAVE on a node and wait for it to finish.
# Arguments: $1 - port, $2 - maximum seconds to wait (default 300)
# Returns:   0 when the snapshot completed with status ok, 1 otherwise
# Testing only that dump.rdb exists would accept a file from an earlier save.
snapshot_node() {
  local port=$1 timeout=${2:-300} persistence

  redis-cli -p "$port" bgsave >/dev/null || return 1

  while (( timeout-- > 0 )); do
    persistence=$(redis-cli -p "$port" info persistence | tr -d '\r')
    if grep -q '^rdb_bgsave_in_progress:0$' <<<"$persistence"; then
      grep -q '^rdb_last_bgsave_status:ok$' <<<"$persistence"
      return
    fi
    sleep 1
  done
  return 1
}

# Compress a backup directory into BACKUP_DIR and remove the directory.
# Arguments: $1 - backup directory, $2 - archive name prefix (may be empty)
# Returns:   0 on success; on failure the directory is kept untouched
archive_backup_dir() {
  local backup_path=$1 prefix=${2:-} name

  name=${backup_path##*/}
  # -C avoids changing the working directory. The source is removed only
  # after tar succeeded, so a failed archive never costs the backup.
  tar -czf "$BACKUP_DIR/${prefix}${name}.tar.gz" -C "$BACKUP_DIR" "$name" || return 1
  rm -rf -- "${backup_path:?}"
}

# Back up RDB, AOF and redis.conf of one node.
# Arguments: $1 - port, $2 - destination directory
backup_single_node() {
  local port=$1 backup_path=$2 node_dir

  log "备份节点 $port 数据..."

  if ! redis-cli -p "$port" ping >/dev/null 2>&1; then
    log "错误: 节点 $port 连接失败"
    return 1
  fi

  node_dir="$CLUSTER_DIR/$port"

  if ! snapshot_node "$port"; then
    log "错误: 节点 $port BGSAVE失败"
    return 1
  fi

  cp -- "$node_dir/dump.rdb" "$backup_path/dump_$port.rdb" || return 1

  if [[ -f "$node_dir/appendonly.aof" ]]; then
    cp -- "$node_dir/appendonly.aof" "$backup_path/appendonly_$port.aof"
  fi

  cp -- "$node_dir/redis.conf" "$backup_path/redis_$port.conf"

  log "节点 $port 备份完成"
}

# Full backup of every node plus cluster metadata, packed as a .tar.gz.
backup_cluster() {
  local backup_path port

  log "开始备份Redis集群..."

  if ! backup_path=$(create_backup_dir); then
    log "错误: 无法创建备份目录"
    return 1
  fi
  log "备份路径: $backup_path"

  for port in "${CLUSTER_PORTS[@]}"; do
    backup_single_node "$port" "$backup_path"
  done

  redis-cli -p "${CLUSTER_PORTS[0]}" cluster nodes > "$backup_path/cluster_nodes.txt"
  redis-cli -p "${CLUSTER_PORTS[0]}" cluster info > "$backup_path/cluster_info.txt"

  cat > "$backup_path/backup_info.txt" << EOF
备份时间: $(date)
集群节点: ${CLUSTER_PORTS[*]}
备份类型: 全量备份
EOF

  if ! archive_backup_dir "$backup_path"; then
    log "错误: 备份压缩失败,保留目录 $backup_path"
    return 1
  fi

  log "集群备份完成: $backup_path.tar.gz"
}

# Copy only the AOF of every reachable node and pack them as
# incremental_<timestamp>.tar.gz.
incremental_backup() {
  local backup_path port node_dir

  log "开始增量备份..."

  if ! backup_path=$(create_backup_dir); then
    log "错误: 无法创建备份目录"
    return 1
  fi
  log "增量备份路径: $backup_path"

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue
    node_dir="$CLUSTER_DIR/$port"

    if [[ -f "$node_dir/appendonly.aof" ]]; then
      cp -- "$node_dir/appendonly.aof" "$backup_path/appendonly_$port.aof"
      log "节点 $port AOF文件备份完成"
    fi
  done

  if ! archive_backup_dir "$backup_path" "incremental_"; then
    log "错误: 备份压缩失败,保留目录 $backup_path"
    return 1
  fi

  log "增量备份完成"
}

# Restore one node from an archive produced by this script.
# Arguments: $1 - archive path, $2 - port of the node to restore
# Exits 1 when the archive is missing or holds no data for that port.
restore_data() {
  local backup_file=${1:-} target_port=${2:-} temp_dir target_dir rdb aof

  if [[ -z "$backup_file" || -z "$target_port" ]]; then
    log "错误: 用法 restore <backup_file> <target_port>"
    exit 1
  fi

  log "从 $backup_file 恢复数据到节点 $target_port ..."

  if [[ ! -f "$backup_file" ]]; then
    log "错误: 备份文件不存在 $backup_file"
    exit 1
  fi

  if ! temp_dir=$(mktemp -d); then
    log "错误: 无法创建临时目录"
    exit 1
  fi

  # Archives contain one top-level directory named after the timestamp.
  # Strip it so the files land directly in temp_dir.
  if ! tar -xzf "$backup_file" -C "$temp_dir" --strip-components=1; then
    log "错误: 备份文件解压失败 $backup_file"
    rm -rf -- "${temp_dir:?}"
    exit 1
  fi

  rdb="$temp_dir/dump_$target_port.rdb"
  aof="$temp_dir/appendonly_$target_port.aof"

  # Validate before the node is stopped, so an unusable archive never
  # causes downtime.
  if [[ ! -f "$rdb" && ! -f "$aof" ]]; then
    log "错误: 备份中没有节点 $target_port 的数据"
    rm -rf -- "${temp_dir:?}"
    exit 1
  fi

  redis-cli -p "$target_port" shutdown

  target_dir="$CLUSTER_DIR/$target_port"
  if [[ -f "$rdb" ]]; then
    cp -- "$rdb" "$target_dir/dump.rdb"
  fi
  if [[ -f "$aof" ]]; then
    cp -- "$aof" "$target_dir/appendonly.aof"
  fi

  redis-server "$target_dir/redis.conf"
  rm -rf -- "${temp_dir:?}"

  log "数据恢复完成"
}

# Delete archives older than RETENTION_DAYS.
cleanup_old_backups() {
  log "清理过期备份文件..."
  find "$BACKUP_DIR" -name "*.tar.gz" -mtime +"$RETENTION_DAYS" -delete
  log "过期备份清理完成"
}

main() {
  case "${1:-}" in
    backup) backup_cluster ;;
    incremental) incremental_backup ;;
    restore) restore_data "${2:-}" "${3:-}" ;;
    cleanup) cleanup_old_backups ;;
    *)
      echo "用法: $0 {backup|incremental|restore|cleanup}"
      echo "  backup - 全量备份集群"
      echo "  incremental - 增量备份"
      echo "  restore <backup_file> <target_port> - 恢复数据"
      echo "  cleanup - 清理过期备份"
      ;;
  esac
}

main "$@"
#!/bin/bash
#
# Section 5.2 "Data synchronization script".
#
# Usage: $0 {setup|status|sync|failover}
# Master/replica management for the nodes under /opt/redis-cluster.
#
# NOTE(review): the configurations generated by the deployment script enable
# cluster mode; confirm that `replicaof` is accepted by the nodes this script
# targets before relying on it.

MASTER_PORT=7000
SLAVE_PORTS=(7001 7002)
CLUSTER_DIR="/opt/redis-cluster"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Extract one "key:value" field from INFO output.
# Arguments: $1 - exact field name, $2 - INFO text
info_field() {
  awk -F: -v key="$1" '$1 == key { sub(/\r$/, "", $2); print $2; exit }' <<<"$2"
}

# Make a config file replicate from 127.0.0.1:<port>.
# Arguments: $1 - config file, $2 - master port
# An existing directive is rewritten; otherwise one is appended. This does not
# depend on a commented "# replicaof <masterip> <masterport>" template, which
# generated configs do not contain.
set_replicaof() {
  local conf=$1 master_port=$2

  if grep -Eq '^[[:space:]]*replicaof[[:space:]]' "$conf"; then
    sed -i -E "s|^[[:space:]]*replicaof[[:space:]].*|replicaof 127.0.0.1 ${master_port}|" "$conf"
  else
    printf 'replicaof 127.0.0.1 %s\n' "$master_port" >>"$conf"
  fi
}

# Wait until a replica reports master_link_status:up.
# Arguments: $1 - port, $2 - maximum seconds to wait (default 10)
wait_for_link() {
  local port=$1 tries=${2:-10} state

  while (( tries-- > 0 )); do
    state=$(info_field master_link_status "$(redis-cli -p "$port" info replication 2>/dev/null)")
    [[ "$state" == "up" ]] && return 0
    sleep 1
  done
  return 1
}

# Point every port in SLAVE_PORTS at MASTER_PORT and restart it.
setup_master_slave() {
  local slave_port slave_dir

  log "设置Redis主从关系..."

  for slave_port in "${SLAVE_PORTS[@]}"; do
    log "设置节点 $slave_port 为 $MASTER_PORT 的从节点..."

    redis-cli -p "$slave_port" shutdown

    slave_dir="$CLUSTER_DIR/$slave_port"
    set_replicaof "$slave_dir/redis.conf" "$MASTER_PORT"

    redis-server "$slave_dir/redis.conf"

    if wait_for_link "$slave_port"; then
      log "节点 $slave_port 主从同步成功"
    else
      log "警告: 节点 $slave_port 主从同步失败"
    fi
  done
}

# Log connected replica count, then link status and lag of each replica.
# Lag is the master's replication offset minus the replica's own offset; the
# replica's offset on its own says nothing about how far behind it is.
check_sync_status() {
  local master_info master_offset slave_port slave_info link offset lag

  log "检查Redis同步状态..."

  master_info=$(redis-cli -p "$MASTER_PORT" info replication)
  master_offset=$(info_field master_repl_offset "$master_info")
  log "主节点 $MASTER_PORT 连接的从节点数: $(info_field connected_slaves "$master_info")"

  for slave_port in "${SLAVE_PORTS[@]}"; do
    slave_info=$(redis-cli -p "$slave_port" info replication)
    link=$(info_field master_link_status "$slave_info")
    offset=$(info_field slave_repl_offset "$slave_info")

    if [[ "$master_offset" =~ ^[0-9]+$ && "$offset" =~ ^[0-9]+$ ]]; then
      lag=$(( master_offset - offset ))
    else
      lag="unknown"
    fi

    log "从节点 $slave_port 状态: $link, 延迟: $lag"
  done
}

# Force each replica to reconnect to the master.
# Detaching and re-attaching is used because sending SYNC from redis-cli
# starts the raw replication stream instead of returning a reply.
manual_sync() {
  local slave_port

  log "执行手动同步..."

  for slave_port in "${SLAVE_PORTS[@]}"; do
    redis-cli -p "$slave_port" replicaof no one >/dev/null
    if redis-cli -p "$slave_port" replicaof 127.0.0.1 "$MASTER_PORT" >/dev/null &&
      wait_for_link "$slave_port"; then
      log "节点 $slave_port 手动同步完成"
    else
      log "警告: 节点 $slave_port 主从同步失败"
    fi
  done
}

# Promote the first replica when the master does not answer PING and point
# the remaining replicas at it.
failover() {
  local new_master slave_port

  log "执行故障转移..."

  if redis-cli -p "$MASTER_PORT" ping >/dev/null 2>&1; then
    log "主节点运行正常,无需故障转移"
    return 0
  fi

  log "主节点 $MASTER_PORT 故障,开始故障转移..."

  new_master=${SLAVE_PORTS[0]}

  # Promote in place so the node keeps serving: a stop/start cycle would add
  # downtime while the master is already gone.
  if ! redis-cli -p "$new_master" replicaof no one >/dev/null; then
    log "错误: 节点 $new_master 提升为主节点失败"
    return 1
  fi
  # Persist the promotion so a later restart does not re-attach the node.
  sed -i 's/^replicaof/#replicaof/' "$CLUSTER_DIR/$new_master/redis.conf"

  for slave_port in "${SLAVE_PORTS[@]:1}"; do
    redis-cli -p "$slave_port" replicaof 127.0.0.1 "$new_master"
  done

  log "故障转移完成,新主节点: $new_master"
}

main() {
  case "${1:-}" in
    setup) setup_master_slave ;;
    status) check_sync_status ;;
    sync) manual_sync ;;
    failover) failover ;;
    *)
      echo "用法: $0 {setup|status|sync|failover}"
      echo "  setup - 设置主从关系"
      echo "  status - 检查同步状态"
      echo "  sync - 手动同步"
      echo "  failover - 故障转移"
      ;;
  esac
}

main "$@"
#!/bin/bash
#
# Section 6 "Monitoring and alerting" / 6.1 "Monitoring stack deployment".
#
# Installs Prometheus, redis_exporter and Grafana, then registers the
# Prometheus data source and an optional dashboard. Must run as root.
#
# Environment:
#   PROMETHEUS_VERSION      (default: 2.40.0)
#   REDIS_EXPORTER_VERSION  (default: 1.45.0)
#   GRAFANA_USER / GRAFANA_PASSWORD  API credentials (default: admin/admin)

PROMETHEUS_VERSION="${PROMETHEUS_VERSION:-2.40.0}"
REDIS_EXPORTER_VERSION="${REDIS_EXPORTER_VERSION:-1.45.0}"
GRAFANA_USER="${GRAFANA_USER:-admin}"
GRAFANA_PASSWORD="${GRAFANA_PASSWORD:-admin}"
GRAFANA_DASHBOARD="/opt/grafana-dashboard.json"
REDIS_PORTS=(7000 7001 7002)

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Log an error and abort the deployment.
die() {
  log "错误: $*"
  exit 1
}

# Create a system account unless it already exists (safe to re-run).
ensure_system_user() {
  id -u "$1" >/dev/null 2>&1 ||
    useradd --system --no-create-home --shell /bin/false "$1"
}

# Download a release tarball into a directory and unpack it there.
# Arguments: $1 - URL, $2 - working directory
fetch_and_unpack() {
  local url=$1 workdir=$2

  wget -q -P "$workdir" "$url" || return 1
  tar -xzf "$workdir/${url##*/}" -C "$workdir"
}

# Install Prometheus under /opt/prometheus with Redis scrape jobs and rules.
install_prometheus() {
  local release workdir port targets=""

  log "安装Prometheus..."

  release="prometheus-${PROMETHEUS_VERSION}.linux-amd64"
  workdir=$(mktemp -d) || die "无法创建临时目录"

  if ! fetch_and_unpack \
    "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${release}.tar.gz" \
    "$workdir"; then
    rm -rf -- "${workdir:?}"
    die "Prometheus下载失败"
  fi

  # Copy into place instead of `mv`, so re-running does not nest the release
  # inside an existing /opt/prometheus and stored data survives.
  mkdir -p /opt/prometheus
  cp -a "$workdir/$release/." /opt/prometheus/
  rm -rf -- "${workdir:?}"

  for port in "${REDIS_PORTS[@]}"; do
    targets+="          - redis://localhost:${port}"$'\n'
  done
  targets=${targets%$'\n'}

  # One exporter process serves several Redis instances through its /scrape
  # endpoint; the relabeling passes each target as ?target= and keeps it as
  # the instance label.
  cat > /opt/prometheus/prometheus.yml << EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "redis_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']

scrape_configs:
  - job_name: 'redis'
    scrape_interval: 5s
    metrics_path: /scrape
    static_configs:
      - targets:
${targets}
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: localhost:9121

  - job_name: 'redis_exporter'
    static_configs:
      - targets: ['localhost:9121']
EOF

  # Quoted delimiter: the {{ $labels }} templates must reach the file as-is.
  cat > /opt/prometheus/redis_rules.yml << 'EOF'
groups:
  - name: redis
    rules:
      - alert: RedisDown
        expr: redis_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance is down"
          description: "Redis instance {{ $labels.instance }} has been down for more than 0 minutes."

      - alert: RedisMemoryHigh
        # The max_bytes guard avoids a permanent alert (division by zero)
        # on instances that run without a maxmemory limit.
        expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80 and redis_memory_max_bytes > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Redis memory usage is high"
          description: "Redis memory usage is above 80% for more than 2 minutes."

      - alert: RedisSlowLog
        # slowlog length is a gauge, so delta() is used rather than increase().
        expr: delta(redis_slowlog_length[5m]) > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Redis slow log entries"
          description: "Redis instance {{ $labels.instance }} has more than 10 slow log entries in the last 5 minutes."
EOF

  cat > /etc/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=prometheus
ExecStart=/opt/prometheus/prometheus --config.file=/opt/prometheus/prometheus.yml --storage.tsdb.path=/opt/prometheus/data
Restart=always

[Install]
WantedBy=multi-user.target
EOF

  ensure_system_user prometheus || die "无法创建用户 prometheus"
  chown -R prometheus:prometheus /opt/prometheus

  systemctl daemon-reload
  systemctl start prometheus
  systemctl enable prometheus

  log "Prometheus安装完成"
}

# Install redis_exporter as a systemd service in multi-target mode.
install_redis_exporter() {
  local release workdir

  log "安装Redis Exporter..."

  release="redis_exporter-v${REDIS_EXPORTER_VERSION}.linux-amd64"
  workdir=$(mktemp -d) || die "无法创建临时目录"

  if ! fetch_and_unpack \
    "https://github.com/oliver006/redis_exporter/releases/download/v${REDIS_EXPORTER_VERSION}/${release}.tar.gz" \
    "$workdir"; then
    rm -rf -- "${workdir:?}"
    die "Redis Exporter下载失败"
  fi

  install -m 0755 "$workdir/$release/redis_exporter" /usr/local/bin/redis_exporter
  rm -rf -- "${workdir:?}"

  ensure_system_user redis || die "无法创建用户 redis"

  # An empty --redis.addr disables the default local probe; the Redis
  # instances are supplied per scrape by the Prometheus job.
  cat > /etc/systemd/system/redis-exporter.service << 'EOF'
[Unit]
Description=Redis Exporter
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=redis
ExecStart=/usr/local/bin/redis_exporter --redis.addr=
Restart=always

[Install]
WantedBy=multi-user.target
EOF

  systemctl daemon-reload
  systemctl start redis-exporter
  systemctl enable redis-exporter

  log "Redis Exporter安装完成"
}

# Install Grafana from the official APT repository.
install_grafana() {
  log "安装Grafana..."

  # A dedicated keyring with signed-by replaces the deprecated `apt-key add`.
  mkdir -p /etc/apt/keyrings
  if ! wget -q -O - https://apt.grafana.com/gpg.key |
    gpg --dearmor > /etc/apt/keyrings/grafana.gpg; then
    die "Grafana签名密钥下载失败"
  fi
  echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" \
    > /etc/apt/sources.list.d/grafana.list

  apt-get update
  apt-get install -y grafana || die "Grafana安装失败"

  systemctl start grafana-server
  systemctl enable grafana-server

  log "Grafana安装完成"
}

# Register the Prometheus data source and, when present, import the dashboard.
configure_grafana_dashboard() {
  local tries=30

  log "配置Grafana仪表板..."

  # Poll the health endpoint rather than sleeping for a fixed time.
  until curl -sf http://localhost:3000/api/health >/dev/null; do
    (( --tries > 0 )) || die "Grafana未就绪"
    sleep 1
  done

  curl -sS -X POST \
    -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
    http://localhost:3000/api/datasources \
    -H 'Content-Type: application/json' \
    -d '{
      "name": "Prometheus",
      "type": "prometheus",
      "url": "http://localhost:9090",
      "access": "proxy",
      "isDefault": true
    }'

  if [[ -f "$GRAFANA_DASHBOARD" ]]; then
    curl -sS -X POST \
      -u "$GRAFANA_USER:$GRAFANA_PASSWORD" \
      http://localhost:3000/api/dashboards/db \
      -H 'Content-Type: application/json' \
      -d @"$GRAFANA_DASHBOARD"
  else
    log "警告: 未找到仪表板文件 $GRAFANA_DASHBOARD,跳过导入"
  fi

  log "Grafana仪表板配置完成"
}

main() {
  (( EUID == 0 )) || die "请以root身份运行"

  log "=== Redis监控系统部署开始 ==="
  install_prometheus
  install_redis_exporter
  install_grafana
  configure_grafana_dashboard
  log "=== Redis监控系统部署完成 ==="
}

main "$@"
#!/bin/bash
#
# Section 6.2 "Alerting configuration script".
#
# Installs Alertmanager, a local postfix relay for e-mail alerts, and a Flask
# webhook receiver that forwards alerts to DingTalk and WeCom. Must run as
# root.
#
# Environment:
#   ALERTMANAGER_VERSION       (default: 0.25.0)
#   SMTP_USER / SMTP_PASSWORD  relay credentials written to sasl_passwd
#                              (defaults are the placeholder values)

ALERTMANAGER_VERSION="${ALERTMANAGER_VERSION:-0.25.0}"
SMTP_USER="${SMTP_USER:-username@gmail.com}"
SMTP_PASSWORD="${SMTP_PASSWORD:-password}"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Log an error and abort.
die() {
  log "错误: $*"
  exit 1
}

# Install Alertmanager under /opt/alertmanager and start it.
install_alertmanager() {
  local release workdir

  log "安装Alertmanager..."

  release="alertmanager-${ALERTMANAGER_VERSION}.linux-amd64"
  workdir=$(mktemp -d) || die "无法创建临时目录"

  if ! wget -q -P "$workdir" \
    "https://github.com/prometheus/alertmanager/releases/download/v${ALERTMANAGER_VERSION}/${release}.tar.gz" ||
    ! tar -xzf "$workdir/${release}.tar.gz" -C "$workdir"; then
    rm -rf -- "${workdir:?}"
    die "Alertmanager下载失败"
  fi

  # Copy rather than `mv`, so re-running does not nest the release inside an
  # existing /opt/alertmanager.
  mkdir -p /opt/alertmanager
  cp -a "$workdir/$release/." /opt/alertmanager/
  rm -rf -- "${workdir:?}"

  # The e-mail receiver uses `headers.Subject` and `text`; `subject` and
  # `body` are not email_config fields. The smarthost points at the
  # loopback-only postfix configured by configure_email_alerts, which listens
  # on port 25 without TLS.
  cat > /opt/alertmanager/alertmanager.yml << 'EOF'
global:
  smtp_smarthost: 'localhost:25'
  smtp_from: 'alerts@example.com'
  smtp_require_tls: false

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'
        send_resolved: true

  - name: 'email'
    email_configs:
      - to: 'admin@example.com'
        headers:
          Subject: 'Redis Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
EOF

  cat > /etc/systemd/system/alertmanager.service << 'EOF'
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
User=alertmanager
ExecStart=/opt/alertmanager/alertmanager --config.file=/opt/alertmanager/alertmanager.yml --storage.path=/opt/alertmanager/data
Restart=always

[Install]
WantedBy=multi-user.target
EOF

  id -u alertmanager >/dev/null 2>&1 ||
    useradd --system --no-create-home --shell /bin/false alertmanager ||
    die "无法创建用户 alertmanager"
  chown -R alertmanager:alertmanager /opt/alertmanager

  systemctl daemon-reload
  systemctl start alertmanager
  systemctl enable alertmanager

  log "Alertmanager安装完成"
}

# Configure postfix as a loopback-only relay through smtp.gmail.com.
configure_email_alerts() {
  log "配置邮件告警..."

  # Non-interactive mode keeps the postfix package from prompting.
  DEBIAN_FRONTEND=noninteractive apt-get install -y postfix mailutils ||
    die "postfix安装失败"

  # Quoted delimiter: $mydomain / $myhostname are postfix variables.
  cat > /etc/postfix/main.cf << 'EOF'
myhostname = redis-monitor.example.com
mydomain = example.com
myorigin = $mydomain
inet_interfaces = loopback-only
mydestination = $myhostname, localhost.$mydomain, localhost, $mydomain
relayhost = [smtp.gmail.com]:587
smtp_tls_security_level = encrypt
smtp_sasl_auth_enable = yes
smtp_sasl_password_maps = hash:/etc/postfix/sasl_passwd
smtp_sasl_security_options = noanonymous
EOF

  if [[ "$SMTP_PASSWORD" == "password" ]]; then
    log "警告: 使用占位SMTP凭据,请设置 SMTP_USER / SMTP_PASSWORD"
  fi

  # The subshell umask creates the file private from the start, so the
  # credentials are never world-readable.
  (
    umask 077
    printf '[smtp.gmail.com]:587 %s:%s\n' "$SMTP_USER" "$SMTP_PASSWORD" \
      > /etc/postfix/sasl_passwd
  )
  chmod 600 /etc/postfix/sasl_passwd
  postmap /etc/postfix/sasl_passwd

  systemctl restart postfix

  log "邮件告警配置完成"
}

# Install the Flask webhook receiver and run it as a systemd service.
configure_webhook_alerts() {
  log "配置Webhook告警..."

  cat > /opt/webhook-receiver.py << 'EOF'
import requests
from flask import Flask, request

app = Flask(__name__)

DINGTALK_URL = "https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN"
WECHAT_URL = "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=YOUR_KEY"


def build_message(alert_name, status, summary):
    return {
        "msgtype": "text",
        "text": {
            "content": f"Redis告警\n告警名称: {alert_name}\n状态: {status}\n描述: {summary}"
        },
    }


def post(url, message):
    # A failing or slow channel must not block the other one or make the
    # handler return an error to Alertmanager.
    try:
        requests.post(url, json=message, timeout=5)
    except requests.RequestException as exc:
        app.logger.warning("notification to %s failed: %s", url, exc)


def send_to_dingtalk(alert_name, status, summary):
    post(DINGTALK_URL, build_message(alert_name, status, summary))


def send_to_wechat(alert_name, status, summary):
    post(WECHAT_URL, build_message(alert_name, status, summary))


@app.route('/', methods=['POST'])
def webhook():
    # silent=True yields None instead of raising on a missing or invalid body.
    data = request.get_json(silent=True) or {}

    for alert in data.get('alerts', []):
        alert_name = alert.get('labels', {}).get('alertname')
        alert_status = alert.get('status')
        alert_summary = alert.get('annotations', {}).get('summary', '')

        send_to_dingtalk(alert_name, alert_status, alert_summary)
        send_to_wechat(alert_name, alert_status, alert_summary)

    return 'OK'


if __name__ == '__main__':
    # Alertmanager calls http://127.0.0.1:5001/, so loopback is sufficient.
    app.run(host='127.0.0.1', port=5001)
EOF

  chmod +x /opt/webhook-receiver.py

  pip3 install flask requests || die "Python依赖安装失败"

  # The receiver only needs outbound HTTP, so it runs unprivileged.
  cat > /etc/systemd/system/webhook-receiver.service << 'EOF'
[Unit]
Description=Webhook Receiver
After=network.target

[Service]
Type=simple
User=nobody
ExecStart=/usr/bin/python3 /opt/webhook-receiver.py
Restart=always

[Install]
WantedBy=multi-user.target
EOF

  systemctl daemon-reload
  systemctl start webhook-receiver
  systemctl enable webhook-receiver

  log "Webhook告警配置完成"
}

main() {
  (( EUID == 0 )) || die "请以root身份运行"

  log "=== Redis告警配置开始 ==="
  install_alertmanager
  configure_email_alerts
  configure_webhook_alerts
  log "=== Redis告警配置完成 ==="
}

main "$@"
#!/bin/bash
#
# Section 7 "Troubleshooting" / 7.1 "Fault diagnosis script".
#
# Usage: $0 {connection|memory|performance|cluster|fix|all}

CLUSTER_PORTS=(7000 7001 7002 7003 7004 7005)
CLUSTER_DIR="/opt/redis-cluster"

# Print a timestamped message on stdout.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}

# Extract one "key:value" field from INFO output.
# Arguments: $1 - exact field name, $2 - INFO text
info_field() {
  awk -F: -v key="$1" '$1 == key { sub(/\r$/, "", $2); print $2; exit }' <<<"$2"
}

# Succeed when a TCP listener exists on the port.
# Uses ss when available and falls back to netstat otherwise.
port_is_listening() {
  local port=$1

  if command -v ss >/dev/null 2>&1; then
    ss -ltn | awk -v suffix=":$port" \
      'NR > 1 && substr($4, length($4) - length(suffix) + 1) == suffix {found = 1} END {exit !found}'
  else
    netstat -tln 2>/dev/null | awk -v suffix=":$port" \
      'substr($4, length($4) - length(suffix) + 1) == suffix {found = 1} END {exit !found}'
  fi
}

# Check listener, PING, process and recent log of every node.
diagnose_connection() {
  local port redis_pid redis_log

  log "诊断Redis连接问题..."

  for port in "${CLUSTER_PORTS[@]}"; do
    log "检查节点 $port 连接状态..."

    if port_is_listening "$port"; then
      log "节点 $port 端口监听正常"
    else
      log "错误: 节点 $port 端口未监听"
    fi

    if redis-cli -p "$port" ping >/dev/null 2>&1; then
      log "节点 $port Redis服务正常"
      continue
    fi

    log "错误: 节点 $port Redis服务异常"

    redis_pid=$(pgrep -f "redis-server.*:$port" | tr '\n' ' ')
    if [[ -n "$redis_pid" ]]; then
      log "Redis进程存在: $redis_pid"
    else
      log "Redis进程不存在"
    fi

    redis_log="$CLUSTER_DIR/$port/redis.log"
    if [[ -f "$redis_log" ]]; then
      log "最近的Redis日志:"
      tail -20 -- "$redis_log"
    fi
  done
}

# Report memory usage and fragmentation, then list big keys.
diagnose_memory() {
  local port memory_info fragmentation

  log "诊断Redis内存问题..."

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue

    log "节点 $port 内存诊断:"

    memory_info=$(redis-cli -p "$port" info memory)
    log "已使用内存: $(info_field used_memory_human "$memory_info")"
    log "最大内存: $(info_field maxmemory "$memory_info")"

    fragmentation=$(info_field mem_fragmentation_ratio "$memory_info")
    log "内存碎片率: $fragmentation"

    # awk handles the float comparison, so bc is not needed.
    if awk -v f="$fragmentation" 'BEGIN { exit !(f + 0 > 1.5) }'; then
      log "警告: 内存碎片率过高"
    fi

    log "分析大键..."
    redis-cli -p "$port" --bigkeys | head -10
  done
}

# Report clients, command count, slow log and keyspace hit rate.
diagnose_performance() {
  local port stats_info clients_info slowlog_len hits misses hit_rate

  log "诊断Redis性能问题..."

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue

    log "节点 $port 性能诊断:"

    stats_info=$(redis-cli -p "$port" info stats)
    # connected_clients is in the "clients" section, not in "stats".
    clients_info=$(redis-cli -p "$port" info clients)

    log "连接客户端数: $(info_field connected_clients "$clients_info")"
    log "总命令数: $(info_field total_commands_processed "$stats_info")"

    slowlog_len=$(redis-cli -p "$port" slowlog len)
    log "慢查询数量: $slowlog_len"

    if [[ "$slowlog_len" =~ ^[0-9]+$ ]] && (( slowlog_len > 0 )); then
      log "最近的慢查询:"
      redis-cli -p "$port" slowlog get 5
    fi

    hits=$(info_field keyspace_hits "$stats_info")
    misses=$(info_field keyspace_misses "$stats_info")

    # The rate is defined as soon as any lookup happened, including the
    # all-hits and all-misses cases.
    if [[ "$hits" =~ ^[0-9]+$ && "$misses" =~ ^[0-9]+$ ]] && (( hits + misses > 0 )); then
      hit_rate=$(awk -v h="$hits" -v m="$misses" 'BEGIN { printf "%.2f", h * 100 / (h + m) }')
      log "键空间命中率: ${hit_rate}%"

      if (( hits * 100 < (hits + misses) * 90 )); then
        log "警告: 键空间命中率过低"
      fi
    fi
  done
}

# Report cluster state, failed nodes, and master/replica counts.
diagnose_cluster() {
  local entry_port=${CLUSTER_PORTS[0]} cluster_state cluster_nodes failed_nodes

  log "诊断Redis集群问题..."

  cluster_state=$(info_field cluster_state "$(redis-cli -p "$entry_port" cluster info)")
  log "集群状态: $cluster_state"

  if [[ "$cluster_state" != "ok" ]]; then
    log "错误: 集群状态异常"
  fi

  cluster_nodes=$(redis-cli -p "$entry_port" cluster nodes)
  log "集群节点信息:"
  printf '%s\n' "$cluster_nodes"

  # Field 3 holds the flags (master, slave, fail, fail?, ...).
  failed_nodes=$(awk '$3 ~ /fail/' <<<"$cluster_nodes")
  if [[ -n "$failed_nodes" ]]; then
    log "发现故障节点:"
    printf '%s\n' "$failed_nodes"
  fi

  # Count inside awk: piping an empty match through `wc -l` reports 1.
  log "主节点数量: $(awk '$3 ~ /master/ {n++} END {print n + 0}' <<<"$cluster_nodes")"
  log "从节点数量: $(awk '$3 ~ /slave/ {n++} END {print n + 0}' <<<"$cluster_nodes")"
}

# Restart unreachable nodes, then purge memory on the reachable ones.
auto_fix() {
  local port key

  log "开始自动修复..."

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 && continue

    log "尝试重启节点 $port ..."

    # Best effort: the node is usually already down, so errors are expected.
    redis-cli -p "$port" shutdown 2>/dev/null || true

    redis-server "$CLUSTER_DIR/$port/redis.conf"
    sleep 3

    if redis-cli -p "$port" ping >/dev/null 2>&1; then
      log "节点 $port 修复成功"
    else
      log "节点 $port 修复失败"
    fi
  done

  for port in "${CLUSTER_PORTS[@]}"; do
    redis-cli -p "$port" ping >/dev/null 2>&1 || continue

    redis-cli -p "$port" memory purge

    # TTL -2 means the key is already gone, so no DEL is issued. Each
    # scanned key is only queried for its TTL.
    while IFS= read -r key; do
      redis-cli -p "$port" ttl "$key" >/dev/null
    done < <(redis-cli -p "$port" --scan --pattern '*')

    log "节点 $port 内存清理完成"
  done
}

main() {
  case "${1:-}" in
    connection) diagnose_connection ;;
    memory) diagnose_memory ;;
    performance) diagnose_performance ;;
    cluster) diagnose_cluster ;;
    fix) auto_fix ;;
    all)
      diagnose_connection
      diagnose_memory
      diagnose_performance
      diagnose_cluster
      ;;
    *)
      echo "用法: $0 {connection|memory|performance|cluster|fix|all}"
      echo "  connection - 诊断连接问题"
      echo "  memory - 诊断内存问题"
      echo "  performance - 诊断性能问题"
      echo "  cluster - 诊断集群问题"
      echo "  fix - 自动修复"
      echo "  all - 全面诊断"
      ;;
  esac
}

main "$@"
8. 总结

8.1 运维最佳实践
集群部署 : 使用Redis Cluster实现高可用和水平扩展
性能优化 : 合理配置内存策略和持久化机制
监控告警 : 建立完善的监控体系和告警机制
数据备份 : 定期备份数据,确保数据安全
故障处理 : 建立故障诊断和自动修复机制
8.2 关键指标监控
内存使用率 : 监控内存使用情况,避免内存溢出
连接数 : 监控客户端连接数,防止连接耗尽
命中率 : 监控缓存命中率,优化缓存策略
响应时间 : 监控命令响应时间,确保性能
集群状态 : 监控集群健康状态,及时发现问题
8.3 运维工具推荐
监控工具 : Prometheus + Grafana + Redis Exporter
告警工具 : Alertmanager + Webhook
管理工具 : Redis Commander, RedisInsight
备份工具 : 自定义备份脚本
诊断工具 : redis-cli(含 --cluster 子命令;自 Redis 5 起已取代 redis-trib.rb)
通过本文的Redis运维实战指南,您可以建立完善的Redis分布式缓存服务运维体系,确保系统的高可用性和高性能。记住,运维是一个持续改进的过程,需要根据业务需求和技术发展不断优化和完善。