前言

在生产环境中,服务的重启、升级、扩容等操作是不可避免的。如何确保服务在停机过程中不丢失正在处理的请求,不产生数据不一致,是系统稳定性的重要保障。SpringBoot优雅停机通过信号处理、资源清理、连接管理等机制,确保服务平滑下线。本文从架构设计到代码实现,系统梳理企业级优雅停机的完整解决方案。

一、优雅停机架构设计

1.1 整体架构图

1.2 优雅停机流程

二、核心实现架构

2.1 优雅停机管理器

代码示例(GracefulShutdownManager):
@Component
@Slf4j
public class GracefulShutdownManager {

@Autowired
private ApplicationContext applicationContext;

@Autowired
private ShutdownHookRegistry shutdownHookRegistry;

@Autowired
private HealthCheckService healthCheckService;

@Autowired
private ServiceRegistry serviceRegistry;

private volatile boolean shutdownInProgress = false;
private volatile boolean shutdownCompleted = false;
private final CountDownLatch shutdownLatch = new CountDownLatch(1);

private final List<ShutdownHook> shutdownHooks = new ArrayList<>();
private final AtomicInteger activeRequests = new AtomicInteger(0);
private final AtomicInteger activeTasks = new AtomicInteger(0);

/**
* 初始化优雅停机
*/
@PostConstruct
public void initializeGracefulShutdown() {
try {
// 1. 注册信号处理器
registerSignalHandlers();

// 2. 注册停机钩子
registerShutdownHooks();

// 3. 启动健康检查
startHealthCheck();

// 4. 注册服务
registerService();

log.info("优雅停机管理器初始化完成");

} catch (Exception e) {
log.error("优雅停机管理器初始化失败", e);
throw new RuntimeException("优雅停机管理器初始化失败", e);
}
}

/**
* 注册信号处理器
*/
private void registerSignalHandlers() {
// 1. 注册SIGTERM信号处理器
SignalHandler sigtermHandler = signal -> {
log.info("收到SIGTERM信号,开始优雅停机");
initiateGracefulShutdown("SIGTERM");
};
Signal.handle(new Signal("TERM"), sigtermHandler);

// 2. 注册SIGINT信号处理器
SignalHandler sigintHandler = signal -> {
log.info("收到SIGINT信号,开始优雅停机");
initiateGracefulShutdown("SIGINT");
};
Signal.handle(new Signal("INT"), sigintHandler);

// 3. 注册SIGHUP信号处理器
SignalHandler sighupHandler = signal -> {
log.info("收到SIGHUP信号,重新加载配置");
reloadConfiguration();
};
Signal.handle(new Signal("HUP"), sighupHandler);

log.info("信号处理器注册完成");
}

/**
* 注册停机钩子
*/
private void registerShutdownHooks() {
// 1. 数据库连接池停机钩子
shutdownHooks.add(new DatabaseConnectionPoolShutdownHook());

// 2. 缓存连接停机钩子
shutdownHooks.add(new CacheConnectionShutdownHook());

// 3. 消息队列停机钩子
shutdownHooks.add(new MessageQueueShutdownHook());

// 4. 定时任务停机钩子
shutdownHooks.add(new ScheduledTaskShutdownHook());

// 5. 文件资源停机钩子
shutdownHooks.add(new FileResourceShutdownHook());

// 6. 网络连接停机钩子
shutdownHooks.add(new NetworkConnectionShutdownHook());

log.info("停机钩子注册完成: 数量={}", shutdownHooks.size());
}

/**
* 启动健康检查
*/
private void startHealthCheck() {
// 启动健康检查服务
healthCheckService.start();
log.info("健康检查服务启动完成");
}

/**
* 注册服务
*/
private void registerService() {
// 注册到服务发现中心
serviceRegistry.register();
log.info("服务注册完成");
}

/**
* 发起优雅停机
*/
public void initiateGracefulShutdown(String reason) {
if (shutdownInProgress) {
log.warn("停机已在进行中,忽略重复的停机请求: reason={}", reason);
return;
}

shutdownInProgress = true;
log.info("开始优雅停机: reason={}", reason);

try {
// 1. 发送停机开始事件
publishShutdownEvent(ShutdownEvent.STARTED, reason);

// 2. 从负载均衡器移除
removeFromLoadBalancer();

// 3. 停止接收新请求
stopAcceptingNewRequests();

// 4. 等待现有请求完成
waitForActiveRequestsToComplete();

// 5. 等待活跃任务完成
waitForActiveTasksToComplete();

// 6. 执行停机钩子
executeShutdownHooks();

// 7. 从服务发现中心注销
unregisterService();

// 8. 发送停机完成事件
publishShutdownEvent(ShutdownEvent.COMPLETED, reason);

shutdownCompleted = true;
shutdownLatch.countDown();

log.info("优雅停机完成: reason={}", reason);

} catch (Exception e) {
log.error("优雅停机过程中发生异常: reason={}", reason, e);
publishShutdownEvent(ShutdownEvent.FAILED, reason + ": " + e.getMessage());
}
}

/**
* 从负载均衡器移除
*/
private void removeFromLoadBalancer() {
try {
// 1. 更新健康检查状态
healthCheckService.markUnhealthy();

// 2. 从负载均衡器移除
loadBalancerService.removeInstance();

// 3. 等待负载均衡器更新
Thread.sleep(2000);

log.info("已从负载均衡器移除");

} catch (Exception e) {
log.error("从负载均衡器移除失败", e);
}
}

/**
* 停止接收新请求
*/
private void stopAcceptingNewRequests() {
try {
// 1. 停止HTTP服务器
stopHttpServer();

// 2. 停止消息监听器
stopMessageListeners();

// 3. 停止定时任务
stopScheduledTasks();

log.info("已停止接收新请求");

} catch (Exception e) {
log.error("停止接收新请求失败", e);
}
}

/**
* 等待现有请求完成
*/
private void waitForActiveRequestsToComplete() {
int maxWaitTime = 30; // 最大等待30秒
int waitTime = 0;

while (activeRequests.get() > 0 && waitTime < maxWaitTime) {
try {
Thread.sleep(1000);
waitTime++;

if (waitTime % 5 == 0) {
log.info("等待现有请求完成: 活跃请求数={}, 等待时间={}s",
activeRequests.get(), waitTime);
}

} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}

if (activeRequests.get() > 0) {
log.warn("等待现有请求完成超时: 活跃请求数={}", activeRequests.get());
} else {
log.info("现有请求已完成");
}
}

/**
* 等待活跃任务完成
*/
private void waitForActiveTasksToComplete() {
int maxWaitTime = 60; // 最大等待60秒
int waitTime = 0;

while (activeTasks.get() > 0 && waitTime < maxWaitTime) {
try {
Thread.sleep(1000);
waitTime++;

if (waitTime % 10 == 0) {
log.info("等待活跃任务完成: 活跃任务数={}, 等待时间={}s",
activeTasks.get(), waitTime);
}

} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}

if (activeTasks.get() > 0) {
log.warn("等待活跃任务完成超时: 活跃任务数={}", activeTasks.get());
} else {
log.info("活跃任务已完成");
}
}

/**
* 执行停机钩子
*/
private void executeShutdownHooks() {
log.info("开始执行停机钩子: 数量={}", shutdownHooks.size());

for (ShutdownHook hook : shutdownHooks) {
try {
log.info("执行停机钩子: {}", hook.getClass().getSimpleName());
hook.shutdown();
log.info("停机钩子执行完成: {}", hook.getClass().getSimpleName());

} catch (Exception e) {
log.error("停机钩子执行失败: {}", hook.getClass().getSimpleName(), e);
}
}

log.info("所有停机钩子执行完成");
}

/**
* 从服务发现中心注销
*/
private void unregisterService() {
try {
serviceRegistry.unregister();
log.info("服务注销完成");

} catch (Exception e) {
log.error("服务注销失败", e);
}
}

/**
* 发布停机事件
*/
private void publishShutdownEvent(ShutdownEvent event, String reason) {
try {
ShutdownEventData eventData = new ShutdownEventData();
eventData.setEvent(event);
eventData.setReason(reason);
eventData.setTimestamp(LocalDateTime.now());
eventData.setActiveRequests(activeRequests.get());
eventData.setActiveTasks(activeTasks.get());

applicationContext.publishEvent(eventData);

} catch (Exception e) {
log.error("发布停机事件失败", e);
}
}

/**
* 重新加载配置
*/
private void reloadConfiguration() {
try {
log.info("开始重新加载配置");

// 重新加载配置逻辑
configurationService.reload();

log.info("配置重新加载完成");

} catch (Exception e) {
log.error("配置重新加载失败", e);
}
}

/**
* 停止HTTP服务器
*/
private void stopHttpServer() {
try {
// 停止Tomcat服务器
TomcatWebServer tomcatWebServer = (TomcatWebServer) applicationContext.getBean(WebServer.class);
tomcatWebServer.stop();

log.info("HTTP服务器已停止");

} catch (Exception e) {
log.error("停止HTTP服务器失败", e);
}
}

/**
* 停止消息监听器
*/
private void stopMessageListeners() {
try {
// 停止所有消息监听器
messageListenerService.stopAllListeners();

log.info("消息监听器已停止");

} catch (Exception e) {
log.error("停止消息监听器失败", e);
}
}

/**
* 停止定时任务
*/
private void stopScheduledTasks() {
try {
// 停止所有定时任务
scheduledTaskService.stopAllTasks();

log.info("定时任务已停止");

} catch (Exception e) {
log.error("停止定时任务失败", e);
}
}

/**
* 增加活跃请求计数
*/
public void incrementActiveRequests() {
activeRequests.incrementAndGet();
}

/**
* 减少活跃请求计数
*/
public void decrementActiveRequests() {
activeRequests.decrementAndGet();
}

/**
* 增加活跃任务计数
*/
public void incrementActiveTasks() {
activeTasks.incrementAndGet();
}

/**
* 减少活跃任务计数
*/
public void decrementActiveTasks() {
activeTasks.decrementAndGet();
}

/**
* 检查是否正在停机
*/
public boolean isShutdownInProgress() {
return shutdownInProgress;
}

/**
* 检查停机是否完成
*/
public boolean isShutdownCompleted() {
return shutdownCompleted;
}

/**
* 等待停机完成
*/
public void waitForShutdown() throws InterruptedException {
shutdownLatch.await();
}
}

2.2 停机钩子接口与实现

代码示例(ShutdownHook 接口及各资源停机钩子实现):
/**
 * Contract for a unit of cleanup work that runs while the application shuts down.
 */
public interface ShutdownHook {

    /**
     * Performs the cleanup.
     *
     * @throws Exception if the cleanup fails
     */
    void shutdown() throws Exception;

    /**
     * Returns the priority of this hook; a smaller number means a higher priority.
     *
     * @return {@code 100} unless overridden
     */
    default int getPriority() {
        return 100;
    }

    /**
     * Returns the display name of this hook.
     *
     * @return the simple class name of the implementation unless overridden
     */
    default String getName() {
        return getClass().getSimpleName();
    }
}

/**
 * Shutdown hook that waits for borrowed JDBC connections to be returned and then closes the
 * HikariCP pool. Data sources of any other type are left untouched.
 */
@Component
@Slf4j
public class DatabaseConnectionPoolShutdownHook implements ShutdownHook {

    /** Maximum time, in seconds, to wait for active connections to be returned. */
    private static final int MAX_WAIT_SECONDS = 30;

    @Autowired
    private DataSource dataSource;

    /**
     * Drains and closes the pool.
     *
     * @throws Exception if closing the pool fails
     */
    @Override
    public void shutdown() throws Exception {
        log.info("开始关闭数据库连接池");

        if (!(dataSource instanceof HikariDataSource)) {
            // Previously this case was skipped without any trace in the logs.
            log.warn("数据源不是HikariDataSource,跳过连接池关闭");
            return;
        }

        try {
            HikariDataSource hikariDataSource = (HikariDataSource) dataSource;
            waitForActiveConnections(hikariDataSource);
            hikariDataSource.close();

            log.info("数据库连接池关闭完成");

        } catch (Exception e) {
            log.error("关闭数据库连接池失败", e);
            throw e;
        }
    }

    @Override
    public int getPriority() {
        return 10; // high priority
    }

    /**
     * Polls the pool once per second until no connection is active, the time budget is used
     * up or the thread is interrupted.
     */
    private void waitForActiveConnections(HikariDataSource dataSource) {
        HikariPoolMXBean pool = dataSource.getHikariPoolMXBean();
        if (pool == null) {
            // The MXBean is absent while the pool has not been started; nothing to wait for.
            return;
        }

        int waitedSeconds = 0;
        while (pool.getActiveConnections() > 0 && waitedSeconds < MAX_WAIT_SECONDS) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
            waitedSeconds++;

            if (waitedSeconds % 5 == 0) {
                log.info("等待数据库连接完成: 活跃连接数={}, 等待时间={}s",
                        pool.getActiveConnections(), waitedSeconds);
            }
        }

        int remaining = pool.getActiveConnections();
        if (remaining > 0) {
            log.warn("等待数据库连接完成超时: 活跃连接数={}", remaining);
        }
    }
}

/**
* 缓存连接停机钩子
*/
@Component
@Slf4j
public class CacheConnectionShutdownHook implements ShutdownHook {

@Autowired
private RedisTemplate<String, Object> redisTemplate;

@Autowired
private RedissonClient redissonClient;

@Override
public void shutdown() throws Exception {
log.info("开始关闭缓存连接");

try {
// 1. 关闭Redis连接
if (redisTemplate != null) {
RedisConnectionFactory connectionFactory = redisTemplate.getConnectionFactory();
if (connectionFactory != null) {
connectionFactory.getConnection().close();
}
}

// 2. 关闭Redisson客户端
if (redissonClient != null) {
redissonClient.shutdown();
}

log.info("缓存连接关闭完成");

} catch (Exception e) {
log.error("关闭缓存连接失败", e);
throw e;
}
}

@Override
public int getPriority() {
return 20;
}
}

/**
* 消息队列停机钩子
*/
@Component
@Slf4j
public class MessageQueueShutdownHook implements ShutdownHook {

@Autowired
private RabbitTemplate rabbitTemplate;

@Autowired
private KafkaTemplate<String, Object> kafkaTemplate;

@Override
public void shutdown() throws Exception {
log.info("开始关闭消息队列连接");

try {
// 1. 关闭RabbitMQ连接
if (rabbitTemplate != null) {
ConnectionFactory connectionFactory = rabbitTemplate.getConnectionFactory();
if (connectionFactory != null) {
connectionFactory.createConnection().close();
}
}

// 2. 关闭Kafka连接
if (kafkaTemplate != null) {
kafkaTemplate.flush();
}

log.info("消息队列连接关闭完成");

} catch (Exception e) {
log.error("关闭消息队列连接失败", e);
throw e;
}
}

@Override
public int getPriority() {
return 30;
}
}

/**
 * Shutdown hook that stops the Spring {@code TaskScheduler} and the plain
 * {@link ScheduledExecutorService}, giving running jobs a bounded time to finish.
 */
@Component
@Slf4j
public class ScheduledTaskShutdownHook implements ShutdownHook {

    /** Maximum time, in seconds, to wait for each executor to terminate. */
    private static final long AWAIT_SECONDS = 30;

    // required = false keeps the null / instanceof checks below meaningful.
    @Autowired(required = false)
    private TaskScheduler taskScheduler;

    @Autowired(required = false)
    private ScheduledExecutorService scheduledExecutorService;

    /**
     * Stops both schedulers.
     *
     * @throws Exception if stopping a scheduler fails
     */
    @Override
    public void shutdown() throws Exception {
        log.info("开始关闭定时任务");

        try {
            if (taskScheduler instanceof ThreadPoolTaskScheduler) {
                ThreadPoolTaskScheduler threadPoolTaskScheduler = (ThreadPoolTaskScheduler) taskScheduler;
                threadPoolTaskScheduler.shutdown();
                awaitOrForce(threadPoolTaskScheduler.getScheduledThreadPoolExecutor());
            }

            if (scheduledExecutorService != null) {
                scheduledExecutorService.shutdown();
                awaitOrForce(scheduledExecutorService);
            }

            log.info("定时任务关闭完成");

        } catch (Exception e) {
            log.error("关闭定时任务失败", e);
            throw e;
        }
    }

    /**
     * Waits for an executor that was already asked to shut down and forces termination on
     * timeout. On interruption the executor is forced down as well and the interrupt flag is
     * restored so that the caller can still observe it.
     */
    private void awaitOrForce(ExecutorService executor) {
        try {
            if (!executor.awaitTermination(AWAIT_SECONDS, TimeUnit.SECONDS)) {
                executor.shutdownNow();
            }
        } catch (InterruptedException e) {
            executor.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    @Override
    public int getPriority() {
        return 40;
    }
}

/**
 * Shutdown hook that closes the {@code FileResourceManager}.
 */
@Component
@Slf4j
public class FileResourceShutdownHook implements ShutdownHook {

    @Autowired
    private FileResourceManager fileResourceManager;

    /**
     * Closes the file resource manager when one is present.
     *
     * @throws Exception if closing the manager fails; the failure is logged before rethrowing
     */
    @Override
    public void shutdown() throws Exception {
        log.info("开始关闭文件资源");

        try {
            closeManagerIfPresent();
            log.info("文件资源关闭完成");
        } catch (Exception e) {
            log.error("关闭文件资源失败", e);
            throw e;
        }
    }

    /** Closes the manager; does nothing when no manager is set. */
    private void closeManagerIfPresent() throws Exception {
        if (fileResourceManager == null) {
            return;
        }
        fileResourceManager.close();
    }

    @Override
    public int getPriority() {
        return 50;
    }
}

/**
 * Shutdown hook that releases the HTTP client resources held by the {@code RestTemplate}.
 */
@Component
@Slf4j
public class NetworkConnectionShutdownHook implements ShutdownHook {

    @Autowired
    private RestTemplate restTemplate;

    @Autowired
    private WebClient webClient;

    /**
     * Destroys the request factory of the {@code RestTemplate} when it is an
     * {@code HttpComponentsClientHttpRequestFactory}. Nothing is done for the
     * {@code WebClient}; no close logic is implemented for it here.
     *
     * @throws Exception if destroying the request factory fails; logged before rethrowing
     */
    @Override
    public void shutdown() throws Exception {
        log.info("开始关闭网络连接");

        try {
            destroyRestTemplateFactory();
            log.info("网络连接关闭完成");
        } catch (Exception e) {
            log.error("关闭网络连接失败", e);
            throw e;
        }
    }

    /** Destroys the pooled request factory behind the RestTemplate, if there is one. */
    private void destroyRestTemplateFactory() throws Exception {
        if (restTemplate == null) {
            return;
        }
        ClientHttpRequestFactory factory = restTemplate.getRequestFactory();
        if (factory instanceof HttpComponentsClientHttpRequestFactory) {
            ((HttpComponentsClientHttpRequestFactory) factory).destroy();
        }
    }

    @Override
    public int getPriority() {
        return 60;
    }
}

2.3 健康检查服务

代码示例(HealthCheckService):
@Service
@Slf4j
public class HealthCheckService {

@Autowired
private GracefulShutdownManager shutdownManager;

@Autowired
private ServiceRegistry serviceRegistry;

private volatile boolean healthy = true;
private volatile boolean healthCheckEnabled = true;

/**
* 启动健康检查
*/
public void start() {
log.info("健康检查服务启动");

// 启动健康检查定时任务
ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
scheduler.scheduleAtFixedRate(this::performHealthCheck, 0, 30, TimeUnit.SECONDS);
}

/**
* 执行健康检查
*/
private void performHealthCheck() {
try {
if (!healthCheckEnabled) {
return;
}

// 1. 检查应用状态
boolean appHealthy = checkApplicationHealth();

// 2. 检查依赖服务
boolean dependenciesHealthy = checkDependenciesHealth();

// 3. 检查资源使用情况
boolean resourcesHealthy = checkResourcesHealth();

// 4. 综合健康状态
boolean overallHealthy = appHealthy && dependenciesHealthy && resourcesHealthy;

// 5. 更新健康状态
if (overallHealthy != healthy) {
healthy = overallHealthy;
updateHealthStatus(healthy);

if (!healthy) {
log.warn("健康检查失败,准备停机");
shutdownManager.initiateGracefulShutdown("健康检查失败");
}
}

} catch (Exception e) {
log.error("健康检查异常", e);
}
}

/**
* 检查应用状态
*/
private boolean checkApplicationHealth() {
try {
// 1. 检查是否正在停机
if (shutdownManager.isShutdownInProgress()) {
return false;
}

// 2. 检查活跃请求数
if (shutdownManager.getActiveRequests() > 1000) {
log.warn("活跃请求数过多: {}", shutdownManager.getActiveRequests());
return false;
}

// 3. 检查活跃任务数
if (shutdownManager.getActiveTasks() > 100) {
log.warn("活跃任务数过多: {}", shutdownManager.getActiveTasks());
return false;
}

return true;

} catch (Exception e) {
log.error("检查应用状态失败", e);
return false;
}
}

/**
* 检查依赖服务
*/
private boolean checkDependenciesHealth() {
try {
// 1. 检查数据库连接
boolean dbHealthy = checkDatabaseHealth();

// 2. 检查缓存连接
boolean cacheHealthy = checkCacheHealth();

// 3. 检查消息队列
boolean mqHealthy = checkMessageQueueHealth();

return dbHealthy && cacheHealthy && mqHealthy;

} catch (Exception e) {
log.error("检查依赖服务失败", e);
return false;
}
}

/**
* 检查数据库健康状态
*/
private boolean checkDatabaseHealth() {
try {
// 实现数据库健康检查逻辑
return true;
} catch (Exception e) {
log.error("数据库健康检查失败", e);
return false;
}
}

/**
* 检查缓存健康状态
*/
private boolean checkCacheHealth() {
try {
// 实现缓存健康检查逻辑
return true;
} catch (Exception e) {
log.error("缓存健康检查失败", e);
return false;
}
}

/**
* 检查消息队列健康状态
*/
private boolean checkMessageQueueHealth() {
try {
// 实现消息队列健康检查逻辑
return true;
} catch (Exception e) {
log.error("消息队列健康检查失败", e);
return false;
}
}

/**
* 检查资源使用情况
*/
private boolean checkResourcesHealth() {
try {
// 1. 检查内存使用率
MemoryMXBean memoryBean = ManagementFactory.getMemoryMXBean();
MemoryUsage heapUsage = memoryBean.getHeapMemoryUsage();
double memoryUsageRate = (double) heapUsage.getUsed() / heapUsage.getMax();

if (memoryUsageRate > 0.9) {
log.warn("内存使用率过高: {}", memoryUsageRate);
return false;
}

// 2. 检查CPU使用率
OperatingSystemMXBean osBean = ManagementFactory.getOperatingSystemMXBean();
double cpuUsage = osBean.getSystemLoadAverage();

if (cpuUsage > 0.8) {
log.warn("CPU使用率过高: {}", cpuUsage);
return false;
}

// 3. 检查磁盘使用率
File root = new File("/");
long totalSpace = root.getTotalSpace();
long freeSpace = root.getFreeSpace();
double diskUsageRate = (double) (totalSpace - freeSpace) / totalSpace;

if (diskUsageRate > 0.9) {
log.warn("磁盘使用率过高: {}", diskUsageRate);
return false;
}

return true;

} catch (Exception e) {
log.error("检查资源使用情况失败", e);
return false;
}
}

/**
* 更新健康状态
*/
private void updateHealthStatus(boolean healthy) {
try {
// 更新服务注册中心的健康状态
serviceRegistry.updateHealthStatus(healthy);

log.info("健康状态已更新: {}", healthy ? "健康" : "不健康");

} catch (Exception e) {
log.error("更新健康状态失败", e);
}
}

/**
* 标记为不健康
*/
public void markUnhealthy() {
healthy = false;
updateHealthStatus(false);
log.info("应用已标记为不健康");
}

/**
* 标记为健康
*/
public void markHealthy() {
healthy = true;
updateHealthStatus(true);
log.info("应用已标记为健康");
}

/**
* 禁用健康检查
*/
public void disableHealthCheck() {
healthCheckEnabled = false;
log.info("健康检查已禁用");
}

/**
* 启用健康检查
*/
public void enableHealthCheck() {
healthCheckEnabled = true;
log.info("健康检查已启用");
}

/**
* 检查是否健康
*/
public boolean isHealthy() {
return healthy;
}

/**
* 检查健康检查是否启用
*/
public boolean isHealthCheckEnabled() {
return healthCheckEnabled;
}
}

三、请求拦截与处理

3.1 请求拦截器

代码示例(GracefulShutdownInterceptor 与 WebConfig):
/**
 * MVC interceptor that rejects new requests with HTTP 503 while the service is shutting
 * down and keeps the manager's active-request counter up to date.
 */
@Component
@Slf4j
public class GracefulShutdownInterceptor implements HandlerInterceptor {

    /** Request attribute marking a request that has already been counted. */
    private static final String COUNTED_ATTRIBUTE =
            GracefulShutdownInterceptor.class.getName() + ".counted";

    @Autowired
    private GracefulShutdownManager shutdownManager;

    /**
     * Counts the request, or rejects it when a shutdown is in progress.
     *
     * @return {@code false} when the request was rejected with 503, {@code true} otherwise
     */
    @Override
    public boolean preHandle(HttpServletRequest request, HttpServletResponse response, Object handler) throws Exception {
        // An async request is dispatched a second time and passes through preHandle again.
        // It is already counted and in flight, so it must be neither recounted nor rejected.
        if (request.getAttribute(COUNTED_ATTRIBUTE) != null) {
            return true;
        }

        // Count first, then check: a request that slips in just as shutdown starts is then
        // either visible to the drain loop or rejected, never silently uncounted.
        shutdownManager.incrementActiveRequests();

        if (shutdownManager.isShutdownInProgress()) {
            shutdownManager.decrementActiveRequests();
            log.warn("服务正在停机,拒绝新请求: {}", request.getRequestURI());

            response.setStatus(HttpStatus.SERVICE_UNAVAILABLE.value());
            response.setContentType("application/json;charset=UTF-8");

            String errorMessage = "{\"error\":\"服务正在停机,请稍后再试\"}";
            response.getWriter().write(errorMessage);

            return false;
        }

        request.setAttribute(COUNTED_ATTRIBUTE, Boolean.TRUE);
        request.setAttribute("requestStartTime", System.currentTimeMillis());

        return true;
    }

    /**
     * Releases the request from the active counter and logs its duration at debug level.
     */
    @Override
    public void afterCompletion(HttpServletRequest request, HttpServletResponse response, Object handler, Exception ex) throws Exception {
        try {
            // Decrement exactly once per counted request.
            if (request.getAttribute(COUNTED_ATTRIBUTE) == null) {
                return;
            }
            request.removeAttribute(COUNTED_ATTRIBUTE);
            shutdownManager.decrementActiveRequests();

            Long startTime = (Long) request.getAttribute("requestStartTime");
            if (startTime != null) {
                long duration = System.currentTimeMillis() - startTime;
                log.debug("请求处理完成: uri={}, duration={}ms", request.getRequestURI(), duration);
            }

        } catch (Exception e) {
            log.error("请求拦截器后处理异常", e);
        }
    }
}

/**
 * Web MVC configuration that installs the graceful-shutdown interceptor.
 */
@Configuration
public class WebConfig implements WebMvcConfigurer {

    @Autowired
    private GracefulShutdownInterceptor gracefulShutdownInterceptor;

    /**
     * Applies the interceptor to every path except the health and actuator endpoints.
     */
    @Override
    public void addInterceptors(InterceptorRegistry registry) {
        String[] interceptedPaths = {"/**"};
        String[] excludedPaths = {"/health", "/actuator/**"};

        registry.addInterceptor(gracefulShutdownInterceptor)
                .addPathPatterns(interceptedPaths)
                .excludePathPatterns(excludedPaths);
    }
}

3.2 任务执行器包装

代码示例(GracefulShutdownTaskExecutor 与 AsyncConfig):
/**
 * {@code TaskExecutor} decorator that refuses new tasks while the service is shutting down
 * and reports accepted tasks to the {@code GracefulShutdownManager}.
 */
@Component
@Slf4j
public class GracefulShutdownTaskExecutor implements TaskExecutor {

    // Stays null when the instance is created with "new" and never autowired.
    @Autowired
    private GracefulShutdownManager shutdownManager;

    private final TaskExecutor delegate;

    /**
     * @param delegate executor that actually runs the tasks
     */
    public GracefulShutdownTaskExecutor(TaskExecutor delegate) {
        this.delegate = delegate;
    }

    /**
     * Submits the task to the delegate unless a shutdown is in progress, in which case the
     * task is dropped with a warning.
     *
     * <p>The task is counted as active from submission until it has finished, so tasks still
     * waiting in the delegate's queue are included in the shutdown drain.
     */
    @Override
    public void execute(Runnable task) {
        GracefulShutdownManager manager = shutdownManager;
        if (manager == null) {
            // Not injected: run the task untracked rather than failing with an NPE.
            delegate.execute(task);
            return;
        }

        if (manager.isShutdownInProgress()) {
            log.warn("服务正在停机,拒绝新任务");
            return;
        }

        manager.incrementActiveTasks();

        Runnable wrappedTask = () -> {
            try {
                task.run();
            } catch (Exception e) {
                log.error("任务执行异常", e);
            } finally {
                manager.decrementActiveTasks();
            }
        };

        try {
            delegate.execute(wrappedTask);
        } catch (RuntimeException e) {
            // The delegate rejected the task, so the wrapper will never run and decrement.
            manager.decrementActiveTasks();
            throw e;
        }
    }
}

/**
 * Configuration of the executor behind {@code @Async} methods: a thread pool wrapped in a
 * {@code GracefulShutdownTaskExecutor}.
 */
@Configuration
@EnableAsync
@Slf4j // the exception handler below uses "log", which was undeclared without this annotation
public class AsyncConfig implements AsyncConfigurer {

    @Autowired
    private GracefulShutdownManager shutdownManager;

    @Autowired
    private ApplicationContext applicationContext;

    /**
     * Builds the async executor: 10 core threads, 20 max threads, a queue of 100 and
     * caller-runs as rejection policy.
     */
    @Override
    public Executor getAsyncExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(10);
        executor.setMaxPoolSize(20);
        executor.setQueueCapacity(100);
        executor.setThreadNamePrefix("async-");
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        executor.initialize();

        GracefulShutdownTaskExecutor wrapper = new GracefulShutdownTaskExecutor(executor);
        // The wrapper is created with "new", so Spring would never fill its @Autowired
        // fields; ask the bean factory to do so explicitly.
        applicationContext.getAutowireCapableBeanFactory().autowireBean(wrapper);
        return wrapper;
    }

    /** Logs exceptions escaping from {@code void} async methods. */
    @Override
    public AsyncUncaughtExceptionHandler getAsyncUncaughtExceptionHandler() {
        return (ex, method, params) -> {
            log.error("异步任务执行异常: method={}, params={}", method.getName(), params, ex);
        };
    }
}

四、监控与告警

4.1 停机监控服务

代码示例(ShutdownMonitoringService):
/**
 * Exposes shutdown state and shutdown events as Micrometer metrics and builds an on-demand
 * status report.
 */
@Service
@Slf4j
public class ShutdownMonitoringService {

    @Autowired
    private MeterRegistry meterRegistry;

    @Autowired
    private GracefulShutdownManager shutdownManager;

    // Gauges hold their state object weakly. Binding them to a short-lived event object would
    // make them report NaN once the event is collected, so the snapshot lives in these fields.
    private final AtomicInteger lastEventActiveRequests = new AtomicInteger();
    private final AtomicInteger lastEventActiveTasks = new AtomicInteger();

    /**
     * Records metrics for one shutdown event: an event counter, the request/task counts
     * carried by the event and the time elapsed since the event timestamp. Failures are
     * logged only.
     *
     * @param eventData the shutdown event to record
     */
    public void recordShutdownMetrics(ShutdownEventData eventData) {
        try {
            String eventName = eventData.getEvent().name();
            // Tag values must not be null.
            // NOTE(review): failure reasons embed exception messages, which makes this tag
            // unbounded in cardinality - consider dropping or normalising it.
            String reason = eventData.getReason() != null ? eventData.getReason() : "unknown";

            Counter.builder("shutdown.event.count")
                    .tag("event", eventName)
                    .tag("reason", reason)
                    .register(meterRegistry)
                    .increment();

            lastEventActiveRequests.set(eventData.getActiveRequests());
            lastEventActiveTasks.set(eventData.getActiveTasks());

            Gauge.builder("shutdown.active.requests", lastEventActiveRequests, AtomicInteger::get)
                    .register(meterRegistry);

            Gauge.builder("shutdown.active.tasks", lastEventActiveTasks, AtomicInteger::get)
                    .register(meterRegistry);

            long eventMillis = eventData.getTimestamp()
                    .atZone(ZoneId.systemDefault())
                    .toInstant()
                    .toEpochMilli();
            Timer.builder("shutdown.duration")
                    .tag("event", eventName)
                    .register(meterRegistry)
                    .record(System.currentTimeMillis() - eventMillis, TimeUnit.MILLISECONDS);

        } catch (Exception e) {
            log.error("记录停机指标失败", e);
        }
    }

    /**
     * Registers the live shutdown gauges. Runs every 10 seconds; registering a gauge whose
     * name is already registered returns the existing meter, so repeated runs are harmless.
     */
    @Scheduled(fixedDelay = 10000)
    public void monitorShutdownStatus() {
        try {
            Gauge.builder("shutdown.in.progress", shutdownManager,
                            manager -> manager.isShutdownInProgress() ? 1 : 0)
                    .register(meterRegistry);

            Gauge.builder("shutdown.completed", shutdownManager,
                            manager -> manager.isShutdownCompleted() ? 1 : 0)
                    .register(meterRegistry);

            Gauge.builder("shutdown.current.active.requests", shutdownManager,
                            GracefulShutdownManager::getActiveRequests)
                    .register(meterRegistry);

            Gauge.builder("shutdown.current.active.tasks", shutdownManager,
                            GracefulShutdownManager::getActiveTasks)
                    .register(meterRegistry);

        } catch (Exception e) {
            log.error("监控停机状态失败", e);
        }
    }

    /**
     * Builds a snapshot of the current shutdown state.
     *
     * @return a report stamped with the current time
     */
    public ShutdownReport generateShutdownReport() {
        ShutdownReport report = new ShutdownReport();
        report.setTimestamp(LocalDateTime.now());
        report.setShutdownInProgress(shutdownManager.isShutdownInProgress());
        report.setShutdownCompleted(shutdownManager.isShutdownCompleted());
        report.setActiveRequests(shutdownManager.getActiveRequests());
        report.setActiveTasks(shutdownManager.getActiveTasks());

        return report;
    }
}

4.2 告警服务

代码示例(ShutdownAlertService):
/**
 * Sends shutdown-related alerts through the {@code NotificationService}.
 */
@Service
@Slf4j
public class ShutdownAlertService {

    @Autowired
    private NotificationService notificationService;

    /**
     * Sends an alert describing the given shutdown event. Failures are logged, not thrown.
     *
     * @param eventData the event to report
     */
    public void sendShutdownAlert(ShutdownEventData eventData) {
        try {
            notificationService.sendAlert(toAlert(eventData));

            log.info("停机告警已发送: event={}, reason={}", eventData.getEvent(), eventData.getReason());
        } catch (Exception e) {
            log.error("发送停机告警失败", e);
        }
    }

    /**
     * Sends an alert reporting that the shutdown timed out. Failures are logged, not thrown.
     *
     * @param reason         the shutdown reason
     * @param timeoutSeconds the timeout that was exceeded, in seconds
     */
    public void sendShutdownTimeoutAlert(String reason, int timeoutSeconds) {
        try {
            ShutdownTimeoutAlert timeoutAlert = new ShutdownTimeoutAlert();
            timeoutAlert.setReason(reason);
            timeoutAlert.setTimeoutSeconds(timeoutSeconds);
            timeoutAlert.setTimestamp(LocalDateTime.now());

            notificationService.sendAlert(timeoutAlert);

            log.warn("停机超时告警已发送: reason={}, timeout={}s", reason, timeoutSeconds);
        } catch (Exception e) {
            log.error("发送停机超时告警失败", e);
        }
    }

    /** Copies the event fields into a new alert object. */
    private ShutdownAlert toAlert(ShutdownEventData eventData) {
        ShutdownAlert shutdownAlert = new ShutdownAlert();
        shutdownAlert.setEvent(eventData.getEvent());
        shutdownAlert.setReason(eventData.getReason());
        shutdownAlert.setTimestamp(eventData.getTimestamp());
        shutdownAlert.setActiveRequests(eventData.getActiveRequests());
        shutdownAlert.setActiveTasks(eventData.getActiveTasks());
        return shutdownAlert;
    }
}

五、配置与部署

5.1 应用配置

配置示例(application.yml):
# application.yml
# Nesting restored: without indentation every key would sit at the top level and the
# file would not parse as intended.
server:
  port: 8080
  shutdown: graceful
  tomcat:
    threads:
      max: 200
      min-spare: 10
    connection-timeout: 20000
    keep-alive-timeout: 15000
    max-connections: 8192
    accept-count: 100

spring:
  lifecycle:
    timeout-per-shutdown-phase: 30s

  datasource:
    hikari:
      maximum-pool-size: 20
      minimum-idle: 5
      connection-timeout: 30000
      idle-timeout: 600000
      max-lifetime: 1800000
      leak-detection-threshold: 60000

management:
  endpoints:
    web:
      exposure:
        # NOTE(review): exposing "shutdown" over HTTP lets any caller that can reach the
        # actuator stop the application - protect it or remove it from this list.
        include: health,info,metrics,shutdown
  endpoint:
    health:
      show-details: always
    shutdown:
      enabled: true
  health:
    defaults:
      enabled: true

# Graceful shutdown settings
# NOTE(review): the Java listings in this article hard-code these values; binding these
# properties to the code is not shown.
graceful:
  shutdown:
    enabled: true
    timeout: 30s
    health-check:
      enabled: true
      interval: 30s
      timeout: 10s
    load-balancer:
      removal-delay: 2s
    active-requests:
      max-wait-time: 30s
    active-tasks:
      max-wait-time: 60s
    hooks:
      database:
        priority: 10
        timeout: 30s
      cache:
        priority: 20
        timeout: 10s
      message-queue:
        priority: 30
        timeout: 15s
      scheduled-tasks:
        priority: 40
        timeout: 30s
      file-resources:
        priority: 50
        timeout: 10s
      network-connections:
        priority: 60
        timeout: 10s

5.2 Docker配置

配置示例(Dockerfile):
# Dockerfile
FROM openjdk:11-jre-slim

# Set the time zone
ENV TZ=Asia/Shanghai
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

# curl is needed by the HEALTHCHECK below and is not part of the slim base image
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Create the application directory
WORKDIR /app

# Copy the application jar
COPY target/app.jar app.jar

# JVM options
ENV JAVA_OPTS="-Xms512m -Xmx1024m -XX:+UseG1GC -XX:+UseStringDeduplication"

# Expose the HTTP port
EXPOSE 8080

# Container health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8080/health || exit 1

# Start the application.
# "exec" replaces the shell with the JVM so the JVM runs as PID 1 and receives SIGTERM
# itself; without it the signal stops at the shell and graceful shutdown never starts.
ENTRYPOINT ["sh", "-c", "exec java $JAVA_OPTS -jar app.jar"]

5.3 Kubernetes配置

配置示例(Kubernetes:Deployment、Service、Ingress):
# deployment.yaml
# Nesting restored: a Kubernetes manifest is not valid without its indentation.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: app-deployment
spec:
  replicas: 3
  selector:
    matchLabels:
      app: app
  template:
    metadata:
      labels:
        app: app
    spec:
      containers:
        - name: app
          image: app:latest
          ports:
            - containerPort: 8080
          env:
            - name: SPRING_PROFILES_ACTIVE
              value: "prod"
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "1Gi"
              cpu: "1000m"
          livenessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 60
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /health
              port: 8080
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          lifecycle:
            preStop:
              exec:
                # Delay SIGTERM so endpoint removal can propagate before the app stops
                command: ["/bin/sh", "-c", "sleep 15"]
      # Must cover the preStop delay plus the application's own drain time. The Java code
      # in this article waits up to 2s (load balancer) + 30s (requests) + 60s (tasks)
      # before running its hooks, so 60s would end in SIGKILL in the middle of the drain.
      terminationGracePeriodSeconds: 120

---
# Service routing cluster traffic on port 80 to container port 8080 (nesting restored)
apiVersion: v1
kind: Service
metadata:
  name: app-service
spec:
  selector:
    app: app
  ports:
    - port: 80
      targetPort: 8080
  type: ClusterIP

---
# Ingress exposing app-service under app.example.com (nesting restored)
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: app-ingress
spec:
  rules:
    - host: app.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: app-service
                port:
                  number: 80

六、最佳实践总结

6.1 优雅停机最佳实践

  1. 信号处理:正确处理SIGTERM和SIGINT信号
  2. 资源清理:按优先级顺序清理各种资源
  3. 请求处理:等待现有请求完成,拒绝新请求
  4. 健康检查:实时监控应用健康状态
  5. 监控告警:完善的停机监控和告警机制

6.2 部署最佳实践

  1. 容器化:使用Docker容器化部署
  2. 编排:使用Kubernetes进行服务编排
  3. 健康检查:配置合适的健康检查策略
  4. 资源限制:设置合理的资源限制
  5. 滚动更新:使用滚动更新策略

6.3 运维最佳实践

  1. 监控:完善的监控指标和告警
  2. 日志:详细的停机日志记录
  3. 测试:定期进行停机测试
  4. 文档:维护详细的运维文档
  5. 演练:定期进行停机演练

七、总结

SpringBoot优雅停机是生产环境服务治理的重要组成部分,通过合理的架构设计和实现,可以确保服务在停机过程中不丢失请求、不产生数据不一致,保证系统的稳定性和可靠性。

关键要点:

  1. 信号处理:正确处理系统信号,启动优雅停机流程
  2. 资源清理:按优先级顺序清理各种资源
  3. 请求管理:等待现有请求完成,拒绝新请求
  4. 健康检查:实时监控应用健康状态
  5. 监控告警:完善的停机监控和告警机制

通过本文的实践指导,读者可以构建一个完善的优雅停机系统,为生产环境的服务治理提供强有力的技术支撑。