第131集文档数据库服务副本集运维实战 | 字数总计: 6.4k | 阅读时长: 31分钟 | 阅读量:
1. MongoDB副本集概述 MongoDB副本集是MongoDB提供的高可用性解决方案,通过多个MongoDB实例的协同工作,实现数据的自动复制和故障转移。本文将详细介绍MongoDB副本集的运维实战经验,包括副本集部署、数据同步、故障转移、监控告警的完整解决方案。
1.1 核心功能
数据复制 : 实现数据从主节点到从节点的自动复制
故障转移 : 自动检测主节点故障并切换到从节点
读写分离 : 写操作始终由主节点处理;读操作默认也发往主节点,可通过读偏好(readPreference,例如 secondaryPreferred)将读请求分流到从节点
数据一致性 : 确保副本集内数据的一致性和完整性
高可用保障 : 提供7x24小时不间断服务
1.2 技术架构

客户端应用 → MongoDB主节点 → MongoDB从节点
    ↓             ↓              ↓
写操作处理 →   数据同步    →  读操作处理
    ↓             ↓              ↓
副本集监控 →   故障检测    →   自动切换
#!/bin/bash
# Section 2 "环境准备" / 2.1 "系统环境检查"
#
# Pre-flight check for a host that will join a MongoDB replica set. It reports
# OS and resource figures, pings the peer nodes, checks whether the MongoDB
# ports are already bound, and installs MongoDB when `mongod` is missing.
# Usage: run without arguments. Installation uses sudo and apt.

# Print a timestamped message to stdout.
# Arguments: $1 - message text
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Report OS, memory, CPU and disk figures, then continue with the network
# and port checks. Exits the script with status 1 on a non-Linux OS.
check_system_environment() {
  log "开始检查MongoDB副本集系统环境..."

  # The glob must stay outside the quotes so that "linux-gnu*" pattern-matches.
  if [[ "$OSTYPE" == "linux-gnu"* ]]; then
    OS_VERSION=$(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)
    log "操作系统: $OS_VERSION"
  else
    log "错误: 不支持的操作系统 $OSTYPE"
    exit 1
  fi

  TOTAL_MEM=$(free -h | awk '/^Mem:/ {print $2}')
  AVAILABLE_MEM=$(free -h | awk '/^Mem:/ {print $7}')
  CPU_CORES=$(nproc)
  DISK_SPACE=$(df -h / | awk 'END {print $4}')

  log "系统资源检查:"
  log "  总内存: $TOTAL_MEM"
  log "  可用内存: $AVAILABLE_MEM"
  log "  CPU核心数: $CPU_CORES"
  log "  可用磁盘空间: $DISK_SPACE"

  check_network_connectivity
}

# Ping every replica-set peer (3 probes each) and log the result, then
# continue with the port check. Unreachable peers only produce a warning.
check_network_connectivity() {
  log "检查网络连通性..."

  REPLICA_NODES=("192.168.1.10" "192.168.1.11" "192.168.1.12")

  local node
  for node in "${REPLICA_NODES[@]}"; do
    if ping -c 3 "$node" > /dev/null 2>&1; then
      log "节点网络连通正常: $node"
    else
      log "警告: 节点网络连通异常: $node"
    fi
  done

  check_port_availability
}

# Log whether each MongoDB port already has a local TCP listener.
check_port_availability() {
  log "检查MongoDB端口可用性..."

  MONGODB_PORTS=(27017 27018 27019)

  # Take one snapshot of the listening sockets; prefer ss, fall back to netstat.
  local listeners port
  if command -v ss > /dev/null 2>&1; then
    listeners=$(ss -tln 2> /dev/null)
  else
    listeners=$(netstat -tln 2> /dev/null)
  fi

  for port in "${MONGODB_PORTS[@]}"; do
    # Require whitespace after the port so 27017 does not match 270170.
    if grep -qE ":${port}[[:space:]]" <<< "$listeners"; then
      log "警告: 端口 $port 已被占用"
    else
      log "端口 $port 可用"
    fi
  done
}

# Report the installed mongod version and service state, or install MongoDB
# when the mongod binary is not on PATH.
check_mongodb_installation() {
  log "检查MongoDB安装状态..."

  if command -v mongod > /dev/null 2>&1; then
    MONGODB_VERSION=$(mongod --version | head -1)
    log "MongoDB已安装: $MONGODB_VERSION"

    if systemctl is-active --quiet mongod; then
      log "MongoDB服务运行正常"
    else
      log "MongoDB服务未运行"
    fi
  else
    log "MongoDB未安装,开始安装..."
    install_mongodb
  fi
}

# Install MongoDB 6.0 from the official apt repository and start the service.
# NOTE(review): the repository line is pinned to Ubuntu "focal"; confirm it
# matches the target release before running elsewhere.
# Returns: 0 on success, 1 when apt is unavailable or any step fails.
install_mongodb() {
  log "开始安装MongoDB..."

  if ! command -v apt-get > /dev/null 2>&1; then
    log "错误: 未找到 apt-get,无法自动安装MongoDB"
    return 1
  fi

  # Chain the steps so a failure stops the install instead of reporting success.
  if ! {
    wget -qO - https://www.mongodb.org/static/pgp/server-6.0.asc | sudo apt-key add - &&
      echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/6.0 multiverse" |
      sudo tee /etc/apt/sources.list.d/mongodb-org-6.0.list &&
      sudo apt-get update &&
      sudo apt-get install -y mongodb-org &&
      sudo systemctl start mongod &&
      sudo systemctl enable mongod
  }; then
    log "错误: MongoDB安装失败"
    return 1
  fi

  log "MongoDB安装完成"
}

# Entry point: run the environment checks, then the installation check.
main() {
  log "=== MongoDB副本集环境检查开始 ==="
  check_system_environment
  check_mongodb_installation
  log "=== MongoDB副本集环境检查完成 ==="
}

main "$@"
#!/bin/bash
# Section 2.2 "MongoDB副本集配置"
#
# Bootstraps the local node of a keyfile-authenticated replica set: backs up
# and rewrites /etc/mongod.conf, creates the keyfile, restarts mongod,
# initiates the replica set and then creates the first admin user.
# Must run as root. The other members have to be configured with the SAME
# keyfile and be running before rs.initiate() can succeed.
#
# Environment overrides:
#   MONGODB_ADMIN_PASSWORD  password for the admin user (default: admin123;
#                           change it for anything but a lab setup)
#   MONGO_SHELL             shell binary (default: mongosh, else mongo)

REPLICA_SET_NAME="rs0"
PRIMARY_NODE="192.168.1.10:27017"
SECONDARY_NODES=("192.168.1.11:27017" "192.168.1.12:27017")
MONGODB_PORT=27017
MONGODB_DATA_DIR="/var/lib/mongodb"
MONGODB_LOG_DIR="/var/log/mongodb"
MONGODB_ADMIN_PASSWORD="${MONGODB_ADMIN_PASSWORD:-admin123}"
# MongoDB 6.0 packages ship mongosh instead of the legacy mongo shell.
MONGO_SHELL="${MONGO_SHELL:-$(command -v mongosh || command -v mongo || echo mongo)}"

# Print a timestamped message to stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Copy the current /etc/mongod.conf to a timestamped backup, if it exists.
backup_config_files() {
  log "备份MongoDB配置文件..."

  if [ -f "/etc/mongod.conf" ]; then
    sudo cp /etc/mongod.conf "/etc/mongod.conf.backup.$(date +%Y%m%d_%H%M%S)"
    log "MongoDB配置文件已备份"
  fi
}

# Write /etc/mongod.conf. Primary and secondary nodes share the same settings
# and differ only in the header comment.
# Arguments: $1 - role label placed in the header comment
write_mongod_conf() {
  local role_label=$1

  # enableLocalhostAuthBypass is deliberately NOT set to false here: the first
  # rs.initiate()/createUser() calls below rely on the localhost exception.
  # Disable it only after the admin user exists.
  cat > /etc/mongod.conf << EOF
# MongoDB${role_label}配置
storage:
  dbPath: $MONGODB_DATA_DIR
  journal:
    enabled: true
  wiredTiger:
    engineConfig:
      cacheSizeGB: 2
      journalCompressor: snappy
      directoryForIndexes: false
    collectionConfig:
      blockCompressor: snappy
    indexConfig:
      prefixCompression: true

systemLog:
  destination: file
  logAppend: true
  path: $MONGODB_LOG_DIR/mongod.log
  logRotate: reopen

net:
  port: $MONGODB_PORT
  bindIp: 0.0.0.0
  maxIncomingConnections: 1000

processManagement:
  timeZoneInfo: /usr/share/zoneinfo

replication:
  replSetName: $REPLICA_SET_NAME

security:
  authorization: enabled
  keyFile: /etc/mongodb-keyfile

operationProfiling:
  slowOpThresholdMs: 100
  mode: slowOp
EOF
}

# Write the configuration file for the primary node.
configure_primary_node() {
  log "配置MongoDB主节点..."
  write_mongod_conf "主节点" || return 1
  log "主节点配置完成"
}

# Write the configuration file for a secondary node.
# Arguments: $1 - node address, used for logging only
configure_secondary_node() {
  local node_ip=$1

  log "配置MongoDB从节点: $node_ip..."
  write_mongod_conf "从节点" || return 1
  log "从节点 $node_ip 配置完成"
}

# Create the inter-node authentication keyfile unless one already exists.
# Every member must use an identical keyfile, so an existing file is kept;
# regenerating it on a re-run would lock this node out of the set.
create_keyfile() {
  log "创建MongoDB认证密钥文件..."

  if [ -s /etc/mongodb-keyfile ]; then
    log "认证密钥文件已存在,保持不变"
  else
    # Restrictive umask so the key is never briefly world-readable.
    (umask 077 && openssl rand -base64 756 > /etc/mongodb-keyfile) || return 1
  fi

  chmod 600 /etc/mongodb-keyfile
  chown mongodb:mongodb /etc/mongodb-keyfile

  log "认证密钥文件创建完成"
}

# Poll the local node until it reports itself primary.
# Returns: 0 once primary, 1 after about 60 seconds without a primary.
wait_for_primary() {
  local waited=0 is_primary

  while ((waited < 60)); do
    is_primary=$("$MONGO_SHELL" --quiet --eval "print(db.isMaster().ismaster)" 2> /dev/null)
    if [ "$is_primary" = "true" ]; then
      return 0
    fi
    sleep 2
    waited=$((waited + 2))
  done

  return 1
}

# Create the first admin user through the localhost exception.
# Must run after the replica set has a primary, because user creation is a
# write. `use admin` is a shell helper that is invalid inside --eval, so the
# admin database is selected with getSiblingDB().
# NOTE: the password is interpolated into JavaScript; it must not contain a
# single quote.
create_admin_user() {
  log "创建MongoDB管理员用户..."

  if ! wait_for_primary; then
    log "错误: 等待主节点选举超时,无法创建管理员用户"
    return 1
  fi

  "$MONGO_SHELL" --eval "
    db.getSiblingDB('admin').createUser({
      user: 'admin',
      pwd: '$MONGODB_ADMIN_PASSWORD',
      roles: [
        { role: 'userAdminAnyDatabase', db: 'admin' },
        { role: 'readWriteAnyDatabase', db: 'admin' },
        { role: 'dbAdminAnyDatabase', db: 'admin' },
        { role: 'clusterAdmin', db: 'admin' }
      ]
    });
  " || {
    log "错误: 管理员用户创建失败"
    return 1
  }

  log "管理员用户创建完成"
}

# Initiate the replica set with the primary at priority 2 and both
# secondaries at priority 1.
initialize_replica_set() {
  log "初始化MongoDB副本集..."

  # Give mongod time to finish starting before the first connection.
  sleep 15

  "$MONGO_SHELL" --eval "
    rs.initiate({
      _id: '$REPLICA_SET_NAME',
      members: [
        { _id: 0, host: '$PRIMARY_NODE', priority: 2 },
        { _id: 1, host: '${SECONDARY_NODES[0]}', priority: 1 },
        { _id: 2, host: '${SECONDARY_NODES[1]}', priority: 1 }
      ]
    });
  " || {
    log "错误: 副本集初始化失败"
    return 1
  }

  log "副本集初始化完成"
}

# Restart and enable mongod; exit the script if the service does not come up.
start_services() {
  log "启动MongoDB服务..."

  systemctl restart mongod
  systemctl enable mongod

  sleep 5

  if systemctl is-active --quiet mongod; then
    log "MongoDB服务启动成功"
  else
    log "错误: MongoDB服务启动失败"
    exit 1
  fi
}

# Entry point. The replica set is initiated BEFORE the admin user is created:
# a node with replSetName set accepts no writes until a primary exists.
main() {
  log "=== MongoDB副本集配置开始 ==="

  backup_config_files
  configure_primary_node || exit 1
  create_keyfile || exit 1
  start_services
  initialize_replica_set || exit 1
  create_admin_user || exit 1

  log "=== MongoDB副本集配置完成 ==="
}

main "$@"
#!/bin/bash
# Section 3 "副本集管理" / 3.1 "副本集管理脚本"
#
# Day-to-day replica-set administration.
# Usage: script {status|add <node>|remove <node>|failover|consistency|reconfig}
#
# Environment overrides: MONGODB_USER, MONGODB_PASSWORD, MONGO_SHELL.
# NOTE: the password is passed on the command line and is visible in `ps`.

REPLICA_SET_NAME="rs0"
PRIMARY_NODE="192.168.1.10:27017"
SECONDARY_NODES=("192.168.1.11:27017" "192.168.1.12:27017")
MONGODB_USER="${MONGODB_USER:-admin}"
MONGODB_PASSWORD="${MONGODB_PASSWORD:-admin123}"
# MongoDB 6.0 packages ship mongosh instead of the legacy mongo shell.
MONGO_SHELL="${MONGO_SHELL:-$(command -v mongosh || command -v mongo || echo mongo)}"

# Print a timestamped message to stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Run JavaScript in an authenticated shell session.
# Arguments: $1 - JavaScript source; remaining args - extra shell options
#            (for example --quiet or --host <node>)
mongo_eval() {
  local js=$1
  shift
  "$MONGO_SHELL" -u "$MONGODB_USER" -p "$MONGODB_PASSWORD" \
    --authenticationDatabase admin "$@" --eval "$js"
}

# Print the name of the current primary, or an empty line when none exists.
current_primary() {
  mongo_eval "var p = rs.status().members.find(function (m) { return m.state == 1; }); print(p ? p.name : '');" --quiet
}

# Print the stateStr of a member, or UNKNOWN when it is not in the set.
# Arguments: $1 - member name (host:port)
member_state() {
  mongo_eval "var m = rs.status().members.find(function (x) { return x.name == '$1'; }); print(m ? m.stateStr : 'UNKNOWN');" --quiet
}

# Print the sorted database names visible on a node, one per line.
# Arguments: optional extra shell options such as --host <node>
list_databases() {
  # A read preference is required to run listDatabases against a secondary.
  mongo_eval "db.getMongo().setReadPref('secondaryPreferred'); db.adminCommand('listDatabases').databases.map(function (d) { return d.name; }).sort().forEach(function (n) { print(n); });" --quiet "$@"
}

# Dump rs.status() and rs.conf(), then report primary and secondary health.
check_replica_set_status() {
  log "检查MongoDB副本集状态..."

  mongo_eval "printjson(rs.status());"
  mongo_eval "printjson(rs.conf());"

  # Ask the set whether it has a healthy primary, so the result does not
  # depend on which member this script happens to connect to.
  PRIMARY_STATUS=$(mongo_eval "print(rs.status().members.some(function (m) { return m.state == 1 && m.health == 1; }));" --quiet)

  if [ "$PRIMARY_STATUS" = "true" ]; then
    log "主节点运行正常"
  else
    log "警告: 主节点状态异常"
  fi

  local node
  for node in "${SECONDARY_NODES[@]}"; do
    SECONDARY_STATUS=$(member_state "$node")

    if [ "$SECONDARY_STATUS" = "SECONDARY" ]; then
      log "从节点 $node 运行正常"
    else
      log "警告: 从节点 $node 状态异常: $SECONDARY_STATUS"
    fi
  done
}

# Add a member and wait for it to reach SECONDARY.
# Arguments: $1 - member address (host:port)
add_secondary_node() {
  local new_node=$1

  log "添加从节点: $new_node..."

  mongo_eval "rs.add('$new_node');"
  wait_for_sync_completion "$new_node"

  log "从节点 $new_node 添加完成"
}

# Poll every 10 s, for up to 300 s, until a member reports SECONDARY.
# Arguments: $1 - member address (host:port)
# Returns: 0 when the member is SECONDARY, 1 on timeout.
wait_for_sync_completion() {
  local node=$1
  local max_wait=300
  local wait_time=0

  log "等待节点 $node 同步完成..."

  while ((wait_time < max_wait)); do
    NODE_STATE=$(member_state "$node")

    if [ "$NODE_STATE" = "SECONDARY" ]; then
      log "节点 $node 同步完成"
      return 0
    fi

    log "同步进行中,等待中... ($wait_time/$max_wait)"
    sleep 10
    wait_time=$((wait_time + 10))
  done

  log "警告: 节点 $node 同步超时"
  return 1
}

# Remove a member from the replica set.
# Arguments: $1 - member address (host:port)
remove_secondary_node() {
  local node=$1

  log "移除从节点: $node..."
  mongo_eval "rs.remove('$node');"
  log "从节点 $node 移除完成"
}

# Step the current primary down and report the newly elected primary.
# Returns: 1 when no primary can be determined beforehand.
manual_failover() {
  log "执行手动故障转移..."

  CURRENT_PRIMARY=$(current_primary)
  if [ -z "$CURRENT_PRIMARY" ]; then
    log "错误: 未找到主节点,无法执行故障转移"
    return 1
  fi
  log "当前主节点: $CURRENT_PRIMARY"

  # stepDown must be sent to the primary itself, not to the default host.
  # Its exit status is ignored because older shells drop the connection.
  mongo_eval "rs.stepDown();" --host "$CURRENT_PRIMARY"

  sleep 10

  NEW_PRIMARY=$(current_primary)
  if [ -z "$NEW_PRIMARY" ]; then
    log "警告: 尚未选举出新主节点"
  else
    log "故障转移完成,新主节点: $NEW_PRIMARY"
  fi
}

# Compare the database name list of the primary with each secondary.
# This is a coarse check: it compares names only, not document contents.
check_data_consistency() {
  log "检查副本集数据一致性..."

  local primary node
  primary=$(current_primary)
  PRIMARY_DBS=$(list_databases --host "${primary:-$PRIMARY_NODE}")
  log "主节点数据库: $PRIMARY_DBS"

  for node in "${SECONDARY_NODES[@]}"; do
    SECONDARY_DBS=$(list_databases --host "$node")
    log "从节点 $node 数据库: $SECONDARY_DBS"

    if [ "$PRIMARY_DBS" = "$SECONDARY_DBS" ]; then
      log "节点 $node 数据一致性检查通过"
    else
      log "警告: 节点 $node 数据不一致"
    fi
  done
}

# Reset priorities: member 0 gets priority 2, every other member priority 1.
reconfigure_replica_set() {
  log "重新配置副本集..."

  CURRENT_CONFIG=$(mongo_eval "printjson(rs.conf());" --quiet)
  log "当前副本集配置: $CURRENT_CONFIG"

  # Iterate over the members instead of indexing [0..2], so sets with a
  # different member count do not raise a TypeError.
  mongo_eval "
    var cfg = rs.conf();
    cfg.members.forEach(function (m, i) { m.priority = (i === 0 ? 2 : 1); });
    rs.reconfig(cfg);
  "

  log "副本集重新配置完成"
}

# Print the command-line usage text.
usage() {
  echo "用法: $0 {status|add|remove|failover|consistency|reconfig}"
  echo "  status      - 检查副本集状态"
  echo "  add <node>  - 添加从节点"
  echo "  remove <node> - 移除从节点"
  echo "  failover    - 手动故障转移"
  echo "  consistency - 检查数据一致性"
  echo "  reconfig    - 重新配置副本集"
}

# Entry point: dispatch on the sub-command.
# Returns: 1 when add/remove is called without a node argument.
main() {
  case "$1" in
    "status")
      check_replica_set_status
      ;;
    "add" | "remove")
      if [ -z "$2" ]; then
        usage
        return 1
      fi
      if [ "$1" = "add" ]; then
        add_secondary_node "$2"
      else
        remove_secondary_node "$2"
      fi
      ;;
    "failover")
      manual_failover
      ;;
    "consistency")
      check_data_consistency
      ;;
    "reconfig")
      reconfigure_replica_set
      ;;
    *)
      usage
      ;;
  esac
}

main "$@"
#!/bin/bash
# Section 3.2 "数据同步监控脚本"
#
# Replication-sync monitoring.
# Usage: script {monitor|report|alerts}
#   monitor - append sync metrics to $LOG_FILE every $MONITOR_INTERVAL seconds
#   report  - write a one-off report into $REPORT_DIR
#   alerts  - install and start a background alert loop
#
# Environment overrides: MONGODB_USER, MONGODB_PASSWORD, MONGO_SHELL, REPORT_DIR.

REPLICA_SET_NAME="rs0"
PRIMARY_NODE="192.168.1.10:27017"
SECONDARY_NODES=("192.168.1.11:27017" "192.168.1.12:27017")
MONGODB_USER="${MONGODB_USER:-admin}"
MONGODB_PASSWORD="${MONGODB_PASSWORD:-admin123}"
MONITOR_INTERVAL=30
LOG_FILE="/var/log/mongodb-sync-monitor.log"
REPORT_DIR="${REPORT_DIR:-/var/log}"
# MongoDB 6.0 packages ship mongosh instead of the legacy mongo shell.
MONGO_SHELL="${MONGO_SHELL:-$(command -v mongosh || command -v mongo || echo mongo)}"

# Print a timestamped message to stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Run JavaScript in an authenticated shell session.
# Arguments: $1 - JavaScript source; remaining args - extra shell options
mongo_eval() {
  local js=$1
  shift
  "$MONGO_SHELL" -u "$MONGODB_USER" -p "$MONGODB_PASSWORD" \
    --authenticationDatabase admin "$@" --eval "$js"
}

# Append one metric sample to $LOG_FILE.
# Arguments: $1 - metric name, $2 - value (may be empty), $3 - node label
log_sync_data() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local metric=$1
  local value=$2
  local node=$3

  echo "[$timestamp] Node:$node Metric:$metric Value:$value" >> "$LOG_FILE"
}

# Print "<stateStr>|<optimeDate as ISO-8601>|<lag in seconds>" for a member.
# rs.status() has no replicationLag field, so lag is derived as the primary's
# optimeDate minus the member's optimeDate. Unavailable values print "unknown".
# Arguments: $1 - member name (host:port)
member_sync_info() {
  mongo_eval "
    var s = rs.status();
    var p = s.members.find(function (x) { return x.state == 1; });
    var m = s.members.find(function (x) { return x.name == '$1'; });
    var lag = (p && m && p.optimeDate && m.optimeDate) ? Math.round((p.optimeDate - m.optimeDate) / 1000) : 'unknown';
    print((m ? m.stateStr : 'UNKNOWN') + '|' + (m && m.optimeDate ? m.optimeDate.toISOString() : 'unknown') + '|' + lag);
  " --quiet
}

# Take one sample of every metric and append it to $LOG_FILE.
# Metrics: primary_status, and per secondary secondary_status, sync_lag
# (last applied optime timestamp) and repl_lag (seconds behind the primary),
# then replica_set_status.
collect_sync_metrics() {
  local node state optime lag

  PRIMARY_STATUS=$(mongo_eval "var p = rs.status().members.find(function (m) { return m.state == 1; }); print(p ? p.stateStr : 'NONE');" --quiet)
  log_sync_data "primary_status" "$PRIMARY_STATUS" "primary"

  for node in "${SECONDARY_NODES[@]}"; do
    # One shell round-trip per node instead of three.
    IFS='|' read -r state optime lag <<< "$(member_sync_info "$node")"

    # Values are quoted so empty or space-containing output cannot shift
    # the positional arguments of log_sync_data.
    log_sync_data "secondary_status" "$state" "$node"
    log_sync_data "sync_lag" "$optime" "$node"
    log_sync_data "repl_lag" "$lag" "$node"
  done

  REPLICA_SET_STATUS=$(mongo_eval "print(rs.status().ok);" --quiet)
  log_sync_data "replica_set_status" "$REPLICA_SET_STATUS" "replica_set"
}

# Sample metrics forever, sleeping $MONITOR_INTERVAL seconds between rounds.
monitor_replica_set_sync() {
  log "开始监控MongoDB副本集同步状态..."

  while true; do
    collect_sync_metrics
    sleep "$MONITOR_INTERVAL"
  done
}

# Write a dated report with rs.status(), rs.conf(), per-node replication lag
# and the database list to $REPORT_DIR.
generate_sync_report() {
  local report_file
  report_file="$REPORT_DIR/mongodb-sync-report-$(date +%Y%m%d).txt"
  local node state optime lag

  log "生成MongoDB同步报告: $report_file"

  cat > "$report_file" << EOF
MongoDB副本集同步报告
生成时间: $(date)
========================================

EOF

  {
    echo "副本集状态:"
    echo "----------------------------------------"
    mongo_eval "printjson(rs.status())"
    echo ""

    echo "副本集配置:"
    echo "----------------------------------------"
    mongo_eval "printjson(rs.conf())"
    echo ""

    echo "同步统计:"
    echo "----------------------------------------"
    for node in "${SECONDARY_NODES[@]}"; do
      IFS='|' read -r state optime lag <<< "$(member_sync_info "$node")"
      # Report the real lag in seconds, plus the optime it was derived from.
      echo "节点 $node 同步延迟: ${lag} 秒 (optime: $optime)"
    done

    PRIMARY_DBS=$(mongo_eval "db.adminCommand('listDatabases').databases.forEach(function (d) { print(d.name); });" --quiet)
    echo "主节点数据库: $PRIMARY_DBS"
  } >> "$report_file"

  log "同步报告生成完成: $report_file"
}

# Install /opt/mongodb-sync-alert.sh and start it in the background.
# The generated script checks every 300 s and prints an alert line when the
# primary is missing, a secondary is not SECONDARY, a secondary lags more
# than MAX_LAG_SECONDS (default 60), or rs.status().ok is not 1.
setup_sync_alerts() {
  log "设置MongoDB同步告警..."

  # Quoted delimiter: the content is written literally, nothing expands here.
  # The shebang is required because the generated script uses bash arrays.
  cat > /opt/mongodb-sync-alert.sh << 'EOF'
#!/bin/bash
REPLICA_SET_NAME="rs0"
PRIMARY_NODE="192.168.1.10:27017"
SECONDARY_NODES=("192.168.1.11:27017" "192.168.1.12:27017")
MONGODB_USER="${MONGODB_USER:-admin}"
MONGODB_PASSWORD="${MONGODB_PASSWORD:-admin123}"
MAX_LAG_SECONDS="${MAX_LAG_SECONDS:-60}"
MONGO_SHELL="${MONGO_SHELL:-$(command -v mongosh || command -v mongo || echo mongo)}"

mongo_query() {
  "$MONGO_SHELL" -u "$MONGODB_USER" -p "$MONGODB_PASSWORD" \
    --authenticationDatabase admin --quiet --eval "$1"
}

check_sync_alerts() {
  local node state lag

  PRIMARY_STATUS=$(mongo_query "var p = rs.status().members.find(function (m) { return m.state == 1; }); print(p ? p.stateStr : 'NONE');")
  if [ "$PRIMARY_STATUS" != "PRIMARY" ]; then
    echo "告警: 主节点状态异常 ($PRIMARY_STATUS)"
  fi

  for node in "${SECONDARY_NODES[@]}"; do
    IFS='|' read -r state lag <<< "$(mongo_query "
      var s = rs.status();
      var p = s.members.find(function (x) { return x.state == 1; });
      var m = s.members.find(function (x) { return x.name == '$node'; });
      var lag = (p && m && p.optimeDate && m.optimeDate) ? Math.round((p.optimeDate - m.optimeDate) / 1000) : 'unknown';
      print((m ? m.stateStr : 'UNKNOWN') + '|' + lag);
    ")"

    if [ "$state" != "SECONDARY" ]; then
      echo "告警: 从节点 $node 状态异常 ($state)"
    fi

    # Alert only on a numeric lag above the threshold.
    if [[ "$lag" =~ ^-?[0-9]+$ ]] && ((lag > MAX_LAG_SECONDS)); then
      echo "告警: 从节点 $node 同步延迟过高 (${lag}秒)"
    fi
  done

  REPLICA_SET_STATUS=$(mongo_query "print(rs.status().ok);")
  if [ "$REPLICA_SET_STATUS" != "1" ]; then
    echo "告警: 副本集状态异常 ($REPLICA_SET_STATUS)"
  fi
}

while true; do
  check_sync_alerts
  sleep 300
done
EOF

  chmod +x /opt/mongodb-sync-alert.sh

  nohup /opt/mongodb-sync-alert.sh > /var/log/mongodb-sync-alert.log 2>&1 &

  log "同步告警设置完成"
}

# Entry point: dispatch on the sub-command; anything else prints usage.
main() {
  case "$1" in
    "monitor")
      monitor_replica_set_sync
      ;;
    "report")
      generate_sync_report
      ;;
    "alerts")
      setup_sync_alerts
      ;;
    *)
      echo "用法: $0 {monitor|report|alerts}"
      echo "  monitor - 开始同步监控"
      echo "  report  - 生成同步报告"
      echo "  alerts  - 设置同步告警"
      ;;
  esac
}

main "$@"
#!/bin/bash
# Section 4 "故障转移管理" / 4.1 "故障转移脚本"
#
# Failover management for the replica set.
# Usage: script {status|failover|test|restore}
#
# Environment overrides: MONGODB_USER, MONGODB_PASSWORD, MONGO_SHELL.

REPLICA_SET_NAME="rs0"
PRIMARY_NODE="192.168.1.10:27017"
SECONDARY_NODES=("192.168.1.11:27017" "192.168.1.12:27017")
MONGODB_USER="${MONGODB_USER:-admin}"
MONGODB_PASSWORD="${MONGODB_PASSWORD:-admin123}"
# MongoDB 6.0 packages ship mongosh instead of the legacy mongo shell.
MONGO_SHELL="${MONGO_SHELL:-$(command -v mongosh || command -v mongo || echo mongo)}"

# Print a timestamped message to stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Run JavaScript in an authenticated shell session.
# Arguments: $1 - JavaScript source; remaining args - extra shell options
mongo_eval() {
  local js=$1
  shift
  "$MONGO_SHELL" -u "$MONGODB_USER" -p "$MONGODB_PASSWORD" \
    --authenticationDatabase admin "$@" --eval "$js"
}

# Print the name of the current primary, or an empty line when none exists
# (for example while an election is in progress).
current_primary() {
  mongo_eval "var p = rs.status().members.find(function (m) { return m.state == 1; }); print(p ? p.name : '');" --quiet
}

# Print the stateStr of a member, or UNKNOWN when it is not in the set.
# Arguments: $1 - member name (host:port)
member_state() {
  mongo_eval "var m = rs.status().members.find(function (x) { return x.name == '$1'; }); print(m ? m.stateStr : 'UNKNOWN');" --quiet
}

# Print the sorted database names visible on a node, one per line.
# Arguments: $1 - node address (host:port)
list_databases() {
  # A read preference is required to run listDatabases against a secondary.
  mongo_eval "db.getMongo().setReadPref('secondaryPreferred'); db.adminCommand('listDatabases').databases.map(function (d) { return d.name; }).sort().forEach(function (n) { print(n); });" --quiet --host "$1"
}

# Log the current primary, the set's ok flag, and one line per member with
# its state and configured priority.
check_failover_status() {
  log "检查MongoDB故障转移状态..."

  CURRENT_PRIMARY=$(current_primary)
  log "当前主节点: $CURRENT_PRIMARY"

  REPLICA_SET_STATUS=$(mongo_eval "print(rs.status().ok);" --quiet)
  log "副本集状态: $REPLICA_SET_STATUS"

  # rs.status() members carry no priority; it is read from rs.conf() by host.
  mongo_eval "
    var prio = {};
    rs.conf().members.forEach(function (c) { prio[c.host] = c.priority; });
    rs.status().members.forEach(function (member) {
      print('节点: ' + member.name + ', 状态: ' + member.stateStr + ', 优先级: ' + prio[member.name]);
    });
  "
}

# Step the current primary down and wait for another member to take over.
# Returns: 1 when no primary exists beforehand or the wait times out.
manual_failover() {
  log "执行手动故障转移..."

  CURRENT_PRIMARY=$(current_primary)
  if [ -z "$CURRENT_PRIMARY" ]; then
    log "错误: 未找到主节点,无法执行故障转移"
    return 1
  fi
  log "当前主节点: $CURRENT_PRIMARY"

  # stepDown must be sent to the primary itself. Its exit status is ignored
  # because older shells drop the connection during step-down.
  mongo_eval "rs.stepDown();" --host "$CURRENT_PRIMARY"

  wait_for_failover_completion "$CURRENT_PRIMARY" || return 1

  log "手动故障转移完成"
}

# Poll every 10 s, for up to 300 s, until a new primary has been elected.
# An empty result (no primary yet) counts as "still in progress".
# Arguments: $1 - old primary; $2 - optional node that must become primary
# Globals:   NEW_PRIMARY (written)
# Returns:   0 on success, 1 on timeout.
wait_for_failover_completion() {
  local old_primary=$1
  local expected=${2:-}
  local max_wait=300
  local wait_time=0

  log "等待故障转移完成..."

  while ((wait_time < max_wait)); do
    NEW_PRIMARY=$(current_primary)

    if [[ -n "$NEW_PRIMARY" && "$NEW_PRIMARY" != "$old_primary" ]] &&
      [[ -z "$expected" || "$NEW_PRIMARY" == "$expected" ]]; then
      log "故障转移成功: $old_primary -> $NEW_PRIMARY"
      return 0
    fi

    log "故障转移进行中,等待中... ($wait_time/$max_wait)"
    sleep 10
    wait_time=$((wait_time + 10))
  done

  log "警告: 故障转移超时"
  return 1
}

# Run a manual failover, log how long it took, then verify the result.
test_failover() {
  log "测试MongoDB故障转移..."

  local test_start test_end test_duration
  test_start=$(date +%s)

  CURRENT_PRIMARY=$(current_primary)
  log "测试前主节点: $CURRENT_PRIMARY"

  manual_failover

  test_end=$(date +%s)
  test_duration=$((test_end - test_start))

  log "故障转移测试完成,耗时: ${test_duration}秒"

  verify_failover_result
}

# Check that the new primary answers a ping and that every other member,
# including the old primary, is SECONDARY; then run the consistency check.
verify_failover_result() {
  log "验证故障转移结果..."

  NEW_PRIMARY=$(current_primary)
  log "新主节点: $NEW_PRIMARY"

  if [ -n "$NEW_PRIMARY" ] &&
    mongo_eval "db.runCommand({ ping: 1 })" --host "$NEW_PRIMARY" > /dev/null 2>&1; then
    log "新主节点 $NEW_PRIMARY 连接正常"
  else
    log "错误: 新主节点 $NEW_PRIMARY 连接失败"
  fi

  # After a failover the former primary is a secondary too, so check it.
  local all_nodes=("$PRIMARY_NODE" "${SECONDARY_NODES[@]}")
  local node
  for node in "${all_nodes[@]}"; do
    if [ "$node" != "$NEW_PRIMARY" ]; then
      SECONDARY_STATUS=$(member_state "$node")
      if [ "$SECONDARY_STATUS" = "SECONDARY" ]; then
        log "从节点 $node 状态正常"
      else
        log "警告: 从节点 $node 状态异常: $SECONDARY_STATUS"
      fi
    fi
  done

  check_data_consistency_after_failover
}

# Compare the database name list of the new primary with every other member.
# This is a coarse check: it compares names only, not document contents.
check_data_consistency_after_failover() {
  log "检查故障转移后数据一致性..."

  NEW_PRIMARY=$(current_primary)
  if [ -z "$NEW_PRIMARY" ]; then
    log "警告: 未找到主节点,跳过数据一致性检查"
    return 1
  fi

  PRIMARY_DBS=$(list_databases "$NEW_PRIMARY")
  log "新主节点数据库: $PRIMARY_DBS"

  local all_nodes=("$PRIMARY_NODE" "${SECONDARY_NODES[@]}")
  local node
  for node in "${all_nodes[@]}"; do
    if [ "$node" != "$NEW_PRIMARY" ]; then
      SECONDARY_DBS=$(list_databases "$node")
      log "从节点 $node 数据库: $SECONDARY_DBS"

      if [ "$PRIMARY_DBS" = "$SECONDARY_DBS" ]; then
        log "从节点 $node 数据一致性检查通过"
      else
        log "警告: 从节点 $node 数据不一致"
      fi
    fi
  done
}

# Hand the primary role back to $PRIMARY_NODE by stepping the current primary
# down, then wait until $PRIMARY_NODE itself reports as primary.
restore_original_primary() {
  log "恢复原主节点..."

  CURRENT_PRIMARY=$(current_primary)
  ORIGINAL_PRIMARY=$PRIMARY_NODE

  log "当前主节点: $CURRENT_PRIMARY"
  log "原主节点: $ORIGINAL_PRIMARY"

  if [ -z "$CURRENT_PRIMARY" ]; then
    log "错误: 未找到主节点,无法恢复"
    return 1
  fi

  if [ "$CURRENT_PRIMARY" != "$ORIGINAL_PRIMARY" ]; then
    mongo_eval "rs.stepDown();" --host "$CURRENT_PRIMARY"

    log "故障转移回原主节点命令已发送"

    # A different member may win the first election, so wait specifically
    # for the original primary rather than for any change.
    wait_for_failover_completion "$CURRENT_PRIMARY" "$ORIGINAL_PRIMARY"
  else
    log "当前主节点就是原主节点,无需恢复"
  fi
}

# Entry point: dispatch on the sub-command; anything else prints usage.
main() {
  case "$1" in
    "status")
      check_failover_status
      ;;
    "failover")
      manual_failover
      ;;
    "test")
      test_failover
      ;;
    "restore")
      restore_original_primary
      ;;
    *)
      echo "用法: $0 {status|failover|test|restore}"
      echo "  status   - 检查故障转移状态"
      echo "  failover - 手动故障转移"
      echo "  test     - 测试故障转移"
      echo "  restore  - 恢复原主节点"
      ;;
  esac
}

main "$@"
#!/bin/bash
# Section 5 "监控告警" / 5.1 "副本集监控脚本"
#
# Replica-set monitoring.
# Usage: script {monitor|report|alerts}
#   monitor - append metrics to $LOG_FILE every $MONITOR_INTERVAL seconds
#   report  - write a one-off report into $REPORT_DIR
#   alerts  - install and start a background alert loop
#
# Environment overrides: MONGODB_USER, MONGODB_PASSWORD, MONGO_SHELL, REPORT_DIR.

REPLICA_SET_NAME="rs0"
PRIMARY_NODE="192.168.1.10:27017"
SECONDARY_NODES=("192.168.1.11:27017" "192.168.1.12:27017")
MONGODB_USER="${MONGODB_USER:-admin}"
MONGODB_PASSWORD="${MONGODB_PASSWORD:-admin123}"
MONITOR_INTERVAL=30
LOG_FILE="/var/log/mongodb-replica-monitor.log"
REPORT_DIR="${REPORT_DIR:-/var/log}"
# MongoDB 6.0 packages ship mongosh instead of the legacy mongo shell.
MONGO_SHELL="${MONGO_SHELL:-$(command -v mongosh || command -v mongo || echo mongo)}"

# JavaScript that prints the sum of all opcounters. serverStatus() has no
# opcounters.total field, so the individual counters are added up.
JS_TOTAL_OPS="var o = db.serverStatus().opcounters; var t = 0; ['insert', 'query', 'update', 'delete', 'getmore', 'command'].forEach(function (k) { t += Number(o[k]) || 0; }); print(t);"

# Print a timestamped message to stdout.
log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Run JavaScript in an authenticated shell session.
# Arguments: $1 - JavaScript source; remaining args - extra shell options
mongo_eval() {
  local js=$1
  shift
  "$MONGO_SHELL" -u "$MONGODB_USER" -p "$MONGODB_PASSWORD" \
    --authenticationDatabase admin "$@" --eval "$js"
}

# Append one metric sample to $LOG_FILE.
# Arguments: $1 - metric name, $2 - value (may be empty), $3 - node label
log_monitor_data() {
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local metric=$1
  local value=$2
  local node=$3

  echo "[$timestamp] Node:$node Metric:$metric Value:$value" >> "$LOG_FILE"
}

# Print "<stateStr>|<optimeDate as ISO-8601>|<lag in seconds>" for a member.
# rs.status() has no replicationLag field, so lag is derived as the primary's
# optimeDate minus the member's optimeDate. Unavailable values print "unknown".
# Arguments: $1 - member name (host:port)
member_sync_info() {
  mongo_eval "
    var s = rs.status();
    var p = s.members.find(function (x) { return x.state == 1; });
    var m = s.members.find(function (x) { return x.name == '$1'; });
    var lag = (p && m && p.optimeDate && m.optimeDate) ? Math.round((p.optimeDate - m.optimeDate) / 1000) : 'unknown';
    print((m ? m.stateStr : 'UNKNOWN') + '|' + (m && m.optimeDate ? m.optimeDate.toISOString() : 'unknown') + '|' + lag);
  " --quiet
}

# Take one sample of every metric and append it to $LOG_FILE.
# Metrics: primary_status, per secondary secondary_status / sync_lag (last
# applied optime timestamp) / repl_lag (seconds behind the primary), then
# replica_set_status, connections and operations (cumulative opcounters).
collect_replica_metrics() {
  local node state optime lag

  PRIMARY_STATUS=$(mongo_eval "var p = rs.status().members.find(function (m) { return m.state == 1; }); print(p ? p.stateStr : 'NONE');" --quiet)
  log_monitor_data "primary_status" "$PRIMARY_STATUS" "primary"

  for node in "${SECONDARY_NODES[@]}"; do
    # One shell round-trip per node instead of three.
    IFS='|' read -r state optime lag <<< "$(member_sync_info "$node")"

    # Values are quoted so empty or space-containing output cannot shift
    # the positional arguments of log_monitor_data.
    log_monitor_data "secondary_status" "$state" "$node"
    log_monitor_data "sync_lag" "$optime" "$node"
    log_monitor_data "repl_lag" "$lag" "$node"
  done

  REPLICA_SET_STATUS=$(mongo_eval "print(rs.status().ok);" --quiet)
  log_monitor_data "replica_set_status" "$REPLICA_SET_STATUS" "replica_set"

  CONNECTIONS=$(mongo_eval "print(db.serverStatus().connections.current);" --quiet)
  log_monitor_data "connections" "$CONNECTIONS" "primary"

  OPERATIONS=$(mongo_eval "$JS_TOTAL_OPS" --quiet)
  log_monitor_data "operations" "$OPERATIONS" "primary"
}

# Sample metrics forever, sleeping $MONITOR_INTERVAL seconds between rounds.
monitor_replica_set_status() {
  log "开始监控MongoDB副本集状态..."

  while true; do
    collect_replica_metrics
    sleep "$MONITOR_INTERVAL"
  done
}

# Write a dated report with rs.status(), rs.conf(), serverStatus(), per-node
# replication lag, connection count and total operation count to $REPORT_DIR.
generate_replica_set_report() {
  local report_file
  report_file="$REPORT_DIR/mongodb-replica-set-report-$(date +%Y%m%d).txt"
  local node state optime lag

  log "生成MongoDB副本集报告: $report_file"

  cat > "$report_file" << EOF
MongoDB副本集监控报告
生成时间: $(date)
========================================

EOF

  {
    echo "副本集状态:"
    echo "----------------------------------------"
    mongo_eval "printjson(rs.status())"
    echo ""

    echo "副本集配置:"
    echo "----------------------------------------"
    mongo_eval "printjson(rs.conf())"
    echo ""

    echo "服务器状态:"
    echo "----------------------------------------"
    mongo_eval "printjson(db.serverStatus())"
    echo ""

    echo "副本集统计:"
    echo "----------------------------------------"
    for node in "${SECONDARY_NODES[@]}"; do
      IFS='|' read -r state optime lag <<< "$(member_sync_info "$node")"
      # Report the real lag in seconds, plus the optime it was derived from.
      echo "节点 $node 同步延迟: ${lag} 秒 (optime: $optime)"
    done

    CONNECTIONS=$(mongo_eval "print(db.serverStatus().connections.current);" --quiet)
    echo "当前连接数: $CONNECTIONS"

    OPERATIONS=$(mongo_eval "$JS_TOTAL_OPS" --quiet)
    echo "总操作数: $OPERATIONS"
  } >> "$report_file"

  log "副本集报告生成完成: $report_file"
}

# Install /opt/mongodb-replica-set-alert.sh and start it in the background.
# The generated script checks every 300 s and prints an alert line when the
# primary is missing, a secondary is not SECONDARY or lags more than
# MAX_LAG_SECONDS (default 60), rs.status().ok is not 1, connections exceed
# 1000, or more than 1000000 operations ran since the previous check.
setup_replica_set_alerts() {
  log "设置MongoDB副本集告警..."

  # Quoted delimiter: the content is written literally, nothing expands here.
  # The shebang is required because the generated script uses bash arrays.
  cat > /opt/mongodb-replica-set-alert.sh << 'EOF'
#!/bin/bash
REPLICA_SET_NAME="rs0"
PRIMARY_NODE="192.168.1.10:27017"
SECONDARY_NODES=("192.168.1.11:27017" "192.168.1.12:27017")
MONGODB_USER="${MONGODB_USER:-admin}"
MONGODB_PASSWORD="${MONGODB_PASSWORD:-admin123}"
MAX_LAG_SECONDS="${MAX_LAG_SECONDS:-60}"
MONGO_SHELL="${MONGO_SHELL:-$(command -v mongosh || command -v mongo || echo mongo)}"
PREVIOUS_OPERATIONS=""

mongo_query() {
  "$MONGO_SHELL" -u "$MONGODB_USER" -p "$MONGODB_PASSWORD" \
    --authenticationDatabase admin --quiet --eval "$1"
}

check_replica_set_alerts() {
  local node state lag delta

  PRIMARY_STATUS=$(mongo_query "var p = rs.status().members.find(function (m) { return m.state == 1; }); print(p ? p.stateStr : 'NONE');")
  if [ "$PRIMARY_STATUS" != "PRIMARY" ]; then
    echo "告警: 主节点状态异常 ($PRIMARY_STATUS)"
  fi

  for node in "${SECONDARY_NODES[@]}"; do
    IFS='|' read -r state lag <<< "$(mongo_query "
      var s = rs.status();
      var p = s.members.find(function (x) { return x.state == 1; });
      var m = s.members.find(function (x) { return x.name == '$node'; });
      var lag = (p && m && p.optimeDate && m.optimeDate) ? Math.round((p.optimeDate - m.optimeDate) / 1000) : 'unknown';
      print((m ? m.stateStr : 'UNKNOWN') + '|' + lag);
    ")"

    if [ "$state" != "SECONDARY" ]; then
      echo "告警: 从节点 $node 状态异常 ($state)"
    fi

    # Alert only on a numeric lag above the threshold.
    if [[ "$lag" =~ ^-?[0-9]+$ ]] && ((lag > MAX_LAG_SECONDS)); then
      echo "告警: 从节点 $node 同步延迟过高 (${lag}秒)"
    fi
  done

  REPLICA_SET_STATUS=$(mongo_query "print(rs.status().ok);")
  if [ "$REPLICA_SET_STATUS" != "1" ]; then
    echo "告警: 副本集状态异常 ($REPLICA_SET_STATUS)"
  fi

  # Numeric guards avoid "integer expression expected" on empty output.
  CONNECTIONS=$(mongo_query "print(db.serverStatus().connections.current);")
  if [[ "$CONNECTIONS" =~ ^[0-9]+$ ]] && ((CONNECTIONS > 1000)); then
    echo "告警: 连接数过高 ($CONNECTIONS)"
  fi

  # opcounters are cumulative since server start, so compare the increase
  # since the previous check rather than the absolute value.
  OPERATIONS=$(mongo_query "var o = db.serverStatus().opcounters; var t = 0; ['insert', 'query', 'update', 'delete', 'getmore', 'command'].forEach(function (k) { t += Number(o[k]) || 0; }); print(t);")
  if [[ "$OPERATIONS" =~ ^[0-9]+$ ]]; then
    if [[ "$PREVIOUS_OPERATIONS" =~ ^[0-9]+$ ]]; then
      delta=$((OPERATIONS - PREVIOUS_OPERATIONS))
      if ((delta > 1000000)); then
        echo "告警: 操作数过高 ($delta)"
      fi
    fi
    PREVIOUS_OPERATIONS=$OPERATIONS
  fi
}

while true; do
  check_replica_set_alerts
  sleep 300
done
EOF

  chmod +x /opt/mongodb-replica-set-alert.sh

  nohup /opt/mongodb-replica-set-alert.sh > /var/log/mongodb-replica-set-alert.log 2>&1 &

  log "副本集告警设置完成"
}

# Entry point: dispatch on the sub-command; anything else prints usage.
main() {
  case "$1" in
    "monitor")
      monitor_replica_set_status
      ;;
    "report")
      generate_replica_set_report
      ;;
    "alerts")
      setup_replica_set_alerts
      ;;
    *)
      echo "用法: $0 {monitor|report|alerts}"
      echo "  monitor - 开始副本集监控"
      echo "  report  - 生成副本集报告"
      echo "  alerts  - 设置副本集告警"
      ;;
  esac
}

main "$@"
6. 总结 6.1 副本集最佳实践
副本集配置 : 合理配置副本集参数,确保数据同步的可靠性
故障转移 : 建立完善的故障转移机制,确保服务的高可用性
数据一致性 : 定期检查副本集内数据的一致性
监控告警 : 建立全面的监控体系,及时发现和处理问题
性能优化 : 优化MongoDB配置,提高系统性能
6.2 关键指标监控
副本集状态 : 监控主从节点状态
同步延迟 : 监控数据同步延迟
连接数 : 监控客户端连接数
操作数 : 监控数据库操作数
故障转移 : 监控故障转移事件
6.3 运维工具推荐
监控工具 : MongoDB Compass, MongoDB Atlas
告警工具 : 自定义告警脚本
管理工具 : mongo shell, MongoDB Ops Manager
备份工具 : mongodump, mongorestore
诊断工具 : mongostat, mongotop
通过本文的MongoDB副本集运维实战指南,您可以建立完善的MongoDB副本集运维体系,确保系统的高可用性和数据一致性。记住,副本集的运维需要持续关注主从同步状态和故障转移机制,确保在任何情况下都能提供稳定的服务。