前言

线程死锁是多线程编程中最严重的问题之一,它不仅会导致系统性能下降,还可能造成系统完全无法响应。在生产环境中,死锁问题往往难以复现和定位,需要完善的检测、预防和监控机制。本文从死锁原理到检测预防,从性能监控到调优策略,系统梳理企业级死锁处理的完整解决方案。

一、死锁问题架构设计

1.1 死锁检测与预防架构

1.2 死锁检测流程

二、死锁检测器实现

2.1 死锁检测器核心

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
/**
 * Scheduled deadlock watchdog built on {@link ThreadMXBean#findDeadlockedThreads()}.
 *
 * <p>Every five seconds the JVM is asked for deadlocked threads. Each newly observed
 * deadlock (a set of thread ids not reported by the previous run) is analysed, recorded,
 * alerted on and handed to {@link DeadlockAnalyzer} to decide how to react. A deadlock
 * that is still present on later runs is not reported again.
 */
@Component
@Slf4j
public class DeadlockDetector {

    /** Marker meaning "no deadlock is currently being reported". */
    private static final long[] NO_THREADS = new long[0];

    @Autowired
    private ThreadMXBean threadMXBean;

    @Autowired
    private DeadlockMonitor deadlockMonitor;

    @Autowired
    private DeadlockAnalyzer deadlockAnalyzer;

    // Used by recordDeadlockMetrics; the field was referenced but never declared before.
    @Autowired
    private MeterRegistry meterRegistry;

    // NOTE(review): sendDeadlockAlert() calls an `alertService` collaborator that is not
    // declared anywhere in this class; its type is not visible here, so it still has to be
    // injected by whoever owns that type.

    // A single daemon thread: the watchdog must never keep the JVM alive on its own.
    private final ScheduledExecutorService scheduler =
            Executors.newSingleThreadScheduledExecutor(runnable -> {
                Thread worker = new Thread(runnable, "deadlock-detector");
                worker.setDaemon(true);
                return worker;
            });

    private volatile boolean detectionEnabled = true;

    /** Sorted ids of the deadlock reported last, used to suppress duplicate reports. */
    private volatile long[] lastReportedThreadIds = NO_THREADS;

    /** Backing value of the "deadlock.thread.count" gauge. */
    private final java.util.concurrent.atomic.AtomicInteger lastDeadlockedThreadCount =
            new java.util.concurrent.atomic.AtomicInteger();

    /**
     * Starts the periodic detection task (first run immediately, then every 5 seconds).
     */
    @PostConstruct
    public void startDeadlockDetection() {
        scheduler.scheduleAtFixedRate(this::detectDeadlocks, 0, 5, TimeUnit.SECONDS);
        log.info("死锁检测器启动完成");
    }

    /**
     * Runs one detection pass.
     *
     * <p>Does nothing when detection is disabled, when the JVM reports no deadlock, or when
     * the reported thread set is identical to the one handled by the previous pass (a
     * deadlock never resolves by itself, so re-reporting it would only repeat alerts).
     * All exceptions are logged and swallowed so the scheduled task keeps running.
     */
    public void detectDeadlocks() {
        if (!detectionEnabled) {
            return;
        }

        try {
            long[] deadlockedThreads = threadMXBean.findDeadlockedThreads();
            if (deadlockedThreads == null || deadlockedThreads.length == 0) {
                lastReportedThreadIds = NO_THREADS;
                return;
            }

            long[] sortedIds = deadlockedThreads.clone();
            Arrays.sort(sortedIds);
            if (Arrays.equals(sortedIds, lastReportedThreadIds)) {
                return; // same deadlock as last time, already handled
            }
            lastReportedThreadIds = sortedIds;

            log.error("检测到死锁: 死锁线程数量={}", deadlockedThreads.length);

            DeadlockInfo deadlockInfo = analyzeDeadlock(deadlockedThreads);
            recordDeadlockInfo(deadlockInfo);
            sendDeadlockAlert(deadlockInfo);
            handleDeadlock(deadlockInfo);

        } catch (Exception e) {
            log.error("死锁检测异常", e);
        }
    }

    /**
     * Collects thread, lock and graph information for the given deadlocked threads.
     *
     * <p>Thread infos are requested in one call with full stack traces and, where the JVM
     * supports it, with locked monitors and ownable synchronizers; the single-id
     * {@code getThreadInfo(long)} overload returns neither. Threads that terminated in the
     * meantime come back as {@code null} and are skipped.
     *
     * @param deadlockedThreads ids returned by {@code findDeadlockedThreads()}
     * @return the populated deadlock description
     */
    private DeadlockInfo analyzeDeadlock(long[] deadlockedThreads) {
        DeadlockInfo deadlockInfo = new DeadlockInfo();
        deadlockInfo.setDetectionTime(LocalDateTime.now());
        deadlockInfo.setDeadlockedThreadCount(deadlockedThreads.length);

        List<ThreadInfo> threadInfos = new ArrayList<>();
        List<LockInfo> lockInfos = new ArrayList<>();

        ThreadInfo[] snapshot = threadMXBean.getThreadInfo(
                deadlockedThreads,
                threadMXBean.isObjectMonitorUsageSupported(),
                threadMXBean.isSynchronizerUsageSupported());

        for (ThreadInfo threadInfo : snapshot) {
            if (threadInfo == null) {
                continue;
            }
            threadInfos.add(threadInfo);

            for (LockInfo held : heldLocks(threadInfo)) {
                addDistinct(lockInfos, held);
            }

            LockInfo blockedLock = threadInfo.getLockInfo();
            if (blockedLock != null) {
                addDistinct(lockInfos, blockedLock);
            }
        }

        deadlockInfo.setThreadInfos(threadInfos);
        deadlockInfo.setLockInfos(lockInfos);
        deadlockInfo.setDeadlockGraph(buildDeadlockGraph(threadInfos, lockInfos));

        return deadlockInfo;
    }

    /**
     * Returns every lock the thread currently owns: object monitors ({@code synchronized})
     * followed by ownable synchronizers (e.g. {@code ReentrantLock}).
     */
    private static List<LockInfo> heldLocks(ThreadInfo threadInfo) {
        List<LockInfo> held = new ArrayList<>();
        LockInfo[] monitors = threadInfo.getLockedMonitors();
        if (monitors != null) {
            held.addAll(Arrays.asList(monitors));
        }
        LockInfo[] synchronizers = threadInfo.getLockedSynchronizers();
        if (synchronizers != null) {
            held.addAll(Arrays.asList(synchronizers));
        }
        return held;
    }

    /**
     * Adds {@code candidate} unless a lock with the same class name and identity hash code
     * is already present. The same lock typically shows up twice: once as held by one
     * thread and once as awaited by another.
     */
    private static void addDistinct(List<LockInfo> locks, LockInfo candidate) {
        for (LockInfo known : locks) {
            if (known.getIdentityHashCode() == candidate.getIdentityHashCode()
                    && known.getClassName().equals(candidate.getClassName())) {
                return;
            }
        }
        locks.add(candidate);
    }

    /**
     * Builds the thread/lock graph: one node per thread, one node per distinct lock, a
     * HOLDS edge thread→lock for each owned lock and a WAITS_FOR edge thread→lock for the
     * lock the thread is blocked on.
     *
     * <p>NOTE(review): thread ids and lock identity hash codes share one id space here, so
     * a collision between the two is theoretically possible.
     */
    private DeadlockGraph buildDeadlockGraph(List<ThreadInfo> threadInfos, List<LockInfo> lockInfos) {
        DeadlockGraph graph = new DeadlockGraph();

        for (ThreadInfo threadInfo : threadInfos) {
            DeadlockNode node = new DeadlockNode();
            node.setType(NodeType.THREAD);
            node.setId(threadInfo.getThreadId());
            node.setName(threadInfo.getThreadName());
            node.setState(threadInfo.getThreadState());
            graph.addNode(node);
        }

        for (LockInfo lockInfo : lockInfos) {
            DeadlockNode node = new DeadlockNode();
            node.setType(NodeType.LOCK);
            node.setId(lockInfo.getIdentityHashCode());
            node.setName(lockInfo.getClassName());
            graph.addNode(node);
        }

        for (ThreadInfo threadInfo : threadInfos) {
            for (LockInfo lockInfo : heldLocks(threadInfo)) {
                DeadlockEdge edge = new DeadlockEdge();
                edge.setFromNodeId(threadInfo.getThreadId());
                edge.setToNodeId(lockInfo.getIdentityHashCode());
                edge.setType(EdgeType.HOLDS);
                graph.addEdge(edge);
            }

            LockInfo waitingLock = threadInfo.getLockInfo();
            if (waitingLock != null) {
                DeadlockEdge edge = new DeadlockEdge();
                edge.setFromNodeId(threadInfo.getThreadId());
                edge.setToNodeId(waitingLock.getIdentityHashCode());
                edge.setType(EdgeType.WAITS_FOR);
                graph.addEdge(edge);
            }
        }

        return graph;
    }

    /**
     * Persists the deadlock via the monitor, writes the textual report to the log and
     * updates metrics. Failures are logged and do not propagate.
     */
    private void recordDeadlockInfo(DeadlockInfo deadlockInfo) {
        try {
            deadlockMonitor.recordDeadlock(deadlockInfo);
            logDeadlockInfo(deadlockInfo);
            recordDeadlockMetrics(deadlockInfo);
        } catch (Exception e) {
            log.error("记录死锁信息失败", e);
        }
    }

    /**
     * Writes a multi-line report (thread, state, blocked time, awaited lock, stack trace)
     * to the error log.
     */
    private void logDeadlockInfo(DeadlockInfo deadlockInfo) {
        StringBuilder logMessage = new StringBuilder();
        logMessage.append("死锁检测报告:\n");
        logMessage.append("检测时间: ").append(deadlockInfo.getDetectionTime()).append("\n");
        logMessage.append("死锁线程数量: ").append(deadlockInfo.getDeadlockedThreadCount()).append("\n");

        for (ThreadInfo threadInfo : deadlockInfo.getThreadInfos()) {
            logMessage.append("线程: ").append(threadInfo.getThreadName())
                    .append(" (ID: ").append(threadInfo.getThreadId()).append(")\n");
            logMessage.append("状态: ").append(threadInfo.getThreadState()).append("\n");
            // getBlockedTime() is -1 unless thread contention monitoring is enabled.
            logMessage.append("阻塞时间: ").append(threadInfo.getBlockedTime()).append("ms\n");

            LockInfo awaited = threadInfo.getLockInfo();
            if (awaited != null) {
                logMessage.append("等待锁: ").append(awaited.getClassName())
                        .append(" (ID: ").append(awaited.getIdentityHashCode()).append(")\n");
            }

            logMessage.append("堆栈跟踪:\n");
            for (StackTraceElement element : threadInfo.getStackTrace()) {
                logMessage.append(" ").append(element.toString()).append("\n");
            }
            logMessage.append("\n");
        }

        log.error(logMessage.toString());
    }

    /**
     * Updates the deadlock counter, the deadlocked-thread gauge and the detection timer.
     *
     * <p>The gauge observes a field that is overwritten on every report; a gauge bound to a
     * per-call object would keep showing the first value it was registered with.
     */
    private void recordDeadlockMetrics(DeadlockInfo deadlockInfo) {
        try {
            Counter.builder("deadlock.count")
                    .register(meterRegistry)
                    .increment();

            lastDeadlockedThreadCount.set(deadlockInfo.getDeadlockedThreadCount());
            Gauge.builder("deadlock.thread.count", lastDeadlockedThreadCount,
                            java.util.concurrent.atomic.AtomicInteger::get)
                    .register(meterRegistry);

            long detectedAtMillis = deadlockInfo.getDetectionTime()
                    .atZone(ZoneId.systemDefault()).toInstant().toEpochMilli();
            Timer.builder("deadlock.detection.time")
                    .register(meterRegistry)
                    .record(System.currentTimeMillis() - detectedAtMillis, TimeUnit.MILLISECONDS);

        } catch (Exception e) {
            log.error("记录死锁指标失败", e);
        }
    }

    /**
     * Builds an alert from the deadlock description and sends it. Failures are logged.
     */
    private void sendDeadlockAlert(DeadlockInfo deadlockInfo) {
        try {
            DeadlockAlert alert = new DeadlockAlert();
            alert.setDetectionTime(deadlockInfo.getDetectionTime());
            alert.setDeadlockedThreadCount(deadlockInfo.getDeadlockedThreadCount());
            alert.setThreadNames(deadlockInfo.getThreadInfos().stream()
                    .map(ThreadInfo::getThreadName)
                    .collect(Collectors.toList()));
            alert.setLockNames(deadlockInfo.getLockInfos().stream()
                    .map(LockInfo::getClassName)
                    .collect(Collectors.toList()));

            alertService.sendDeadlockAlert(alert);

        } catch (Exception e) {
            log.error("发送死锁告警失败", e);
        }
    }

    /**
     * Analyses the deadlock and reacts by severity: CRITICAL interrupts the involved
     * threads, WARNING only logs. A report is generated in every case.
     */
    private void handleDeadlock(DeadlockInfo deadlockInfo) {
        try {
            DeadlockAnalysis analysis = deadlockAnalyzer.analyze(deadlockInfo);

            if (analysis.getSeverity() == DeadlockSeverity.CRITICAL) {
                interruptDeadlockedThreads(deadlockInfo);
            } else if (analysis.getSeverity() == DeadlockSeverity.WARNING) {
                log.warn("检测到警告级别死锁: {}", analysis.getReason());
            }

            generateDeadlockReport(deadlockInfo, analysis);

        } catch (Exception e) {
            log.error("处理死锁失败", e);
        }
    }

    /**
     * Interrupts every live thread that is part of the deadlock.
     *
     * <p>An interrupt only releases threads blocked in an interruptible acquisition such as
     * {@code Lock.lockInterruptibly()}; threads blocked on {@code synchronized} or
     * {@code Lock.lock()} keep waiting.
     */
    private void interruptDeadlockedThreads(DeadlockInfo deadlockInfo) {
        for (ThreadInfo threadInfo : deadlockInfo.getThreadInfos()) {
            try {
                Thread thread = findThreadById(threadInfo.getThreadId());
                if (thread != null) {
                    thread.interrupt();
                    log.warn("已中断死锁线程: {}", threadInfo.getThreadName());
                }
            } catch (Exception e) {
                log.error("中断死锁线程失败: {}", threadInfo.getThreadName(), e);
            }
        }
    }

    /**
     * Looks up a live thread by id.
     *
     * <p>Uses {@link Thread#getAllStackTraces()} rather than {@code ThreadGroup.enumerate}:
     * the latter needs a pre-sized array based on an estimate and silently drops threads
     * that do not fit.
     *
     * @return the thread, or {@code null} if no live thread has that id
     */
    private Thread findThreadById(long threadId) {
        for (Thread thread : Thread.getAllStackTraces().keySet()) {
            if (thread.getId() == threadId) {
                return thread;
            }
        }
        return null;
    }

    /**
     * Assembles and stores a report combining the raw deadlock data, the analysis and
     * recommendations. Failures are logged.
     */
    private void generateDeadlockReport(DeadlockInfo deadlockInfo, DeadlockAnalysis analysis) {
        try {
            DeadlockReport report = new DeadlockReport();
            report.setDeadlockInfo(deadlockInfo);
            report.setAnalysis(analysis);
            report.setReportTime(LocalDateTime.now());
            report.setRecommendations(generateRecommendations(analysis));

            deadlockMonitor.saveReport(report);

        } catch (Exception e) {
            log.error("生成死锁报告失败", e);
        }
    }

    /**
     * Maps the analysed cause to a remediation hint; unknown causes yield an empty list.
     */
    private List<String> generateRecommendations(DeadlockAnalysis analysis) {
        List<String> recommendations = new ArrayList<>();

        if (analysis.getCause() == DeadlockCause.LOCK_ORDER) {
            recommendations.add("建议统一锁的获取顺序,避免循环等待");
        } else if (analysis.getCause() == DeadlockCause.LOCK_GRANULARITY) {
            recommendations.add("建议优化锁的粒度,减少锁的持有时间");
        } else if (analysis.getCause() == DeadlockCause.RESOURCE_COMPETITION) {
            recommendations.add("建议优化资源竞争,使用无锁数据结构");
        }

        return recommendations;
    }

    /**
     * Disables detection and shuts the scheduler down, waiting up to 5 seconds for a
     * running pass before forcing termination.
     */
    @PreDestroy
    public void stopDeadlockDetection() {
        detectionEnabled = false;
        scheduler.shutdown();
        try {
            if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) {
                scheduler.shutdownNow();
            }
        } catch (InterruptedException e) {
            scheduler.shutdownNow();
            Thread.currentThread().interrupt();
        }
        log.info("死锁检测器已停止");
    }
}

2.2 死锁分析器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/**
 * Classifies a detected deadlock: probable cause, severity, matched pattern and
 * historical trend, plus a human-readable summary.
 */
@Component
@Slf4j
public class DeadlockAnalyzer {

    @Autowired
    private DeadlockPatternMatcher patternMatcher;

    @Autowired
    private DeadlockHistoryAnalyzer historyAnalyzer;

    /**
     * Analyses the given deadlock.
     *
     * @param deadlockInfo deadlock description produced by the detector
     * @return the analysis; if any step throws, the cause is UNKNOWN, the severity is
     *         WARNING and the reason carries the exception message
     */
    public DeadlockAnalysis analyze(DeadlockInfo deadlockInfo) {
        DeadlockAnalysis analysis = new DeadlockAnalysis();
        analysis.setDeadlockInfo(deadlockInfo);
        analysis.setAnalysisTime(LocalDateTime.now());

        try {
            analysis.setCause(analyzeDeadlockCause(deadlockInfo));
            analysis.setSeverity(analyzeDeadlockSeverity(deadlockInfo));
            analysis.setPattern(patternMatcher.matchPattern(deadlockInfo));
            analysis.setTrend(historyAnalyzer.analyzeTrend(deadlockInfo));
            analysis.setReason(generateAnalysisReason(analysis));
            return analysis;

        } catch (Exception e) {
            log.error("死锁分析失败", e);
            analysis.setCause(DeadlockCause.UNKNOWN);
            analysis.setSeverity(DeadlockSeverity.WARNING);
            analysis.setReason("分析失败: " + e.getMessage());
            return analysis;
        }
    }

    /**
     * Returns the first cause whose heuristic matches, checked in the order lock order,
     * lock granularity, resource competition, nested locks; UNKNOWN if none matches.
     */
    private DeadlockCause analyzeDeadlockCause(DeadlockInfo deadlockInfo) {
        if (isLockOrderDeadlock(deadlockInfo)) {
            return DeadlockCause.LOCK_ORDER;
        }
        if (isLockGranularityDeadlock(deadlockInfo)) {
            return DeadlockCause.LOCK_GRANULARITY;
        }
        if (isResourceCompetitionDeadlock(deadlockInfo)) {
            return DeadlockCause.RESOURCE_COMPETITION;
        }
        if (isNestedLockDeadlock(deadlockInfo)) {
            return DeadlockCause.NESTED_LOCK;
        }
        return DeadlockCause.UNKNOWN;
    }

    /**
     * Checks for a circular wait between the deadlocked threads.
     *
     * <p>The wait-for relation is taken straight from the thread infos: each blocked thread
     * points at the owner of the lock it is waiting for ({@code getLockOwnerId()}, -1 when
     * there is none). The thread/lock graph is deliberately not used for this: both its
     * HOLDS and WAITS_FOR edges run from thread to lock, so lock nodes have no outgoing
     * edges and a traversal of it can never close a cycle.
     */
    private boolean isLockOrderDeadlock(DeadlockInfo deadlockInfo) {
        Map<Long, Long> waitsFor = new HashMap<>();
        for (ThreadInfo threadInfo : deadlockInfo.getThreadInfos()) {
            if (threadInfo == null) {
                continue;
            }
            long ownerId = threadInfo.getLockOwnerId();
            if (ownerId != -1) {
                waitsFor.put(threadInfo.getThreadId(), ownerId);
            }
        }
        return hasCycle(waitsFor);
    }

    /**
     * Returns true if any thread holds more than three locks (monitors plus ownable
     * synchronizers), taken as a sign of overly coarse locking.
     */
    private boolean isLockGranularityDeadlock(DeadlockInfo deadlockInfo) {
        for (ThreadInfo threadInfo : deadlockInfo.getThreadInfos()) {
            if (threadInfo != null && heldLockCount(threadInfo) > 3) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if locks of the same class are held more than once across the
     * deadlocked threads.
     */
    private boolean isResourceCompetitionDeadlock(DeadlockInfo deadlockInfo) {
        Map<String, Integer> lockCount = new HashMap<>();

        for (ThreadInfo threadInfo : deadlockInfo.getThreadInfos()) {
            if (threadInfo == null) {
                continue;
            }
            countByClassName(threadInfo.getLockedMonitors(), lockCount);
            countByClassName(threadInfo.getLockedSynchronizers(), lockCount);
        }

        return lockCount.values().stream().anyMatch(count -> count > 1);
    }

    /**
     * Returns true if any thread's stack trace contains more than two frames whose method
     * name contains "lock" or "synchronized".
     */
    private boolean isNestedLockDeadlock(DeadlockInfo deadlockInfo) {
        for (ThreadInfo threadInfo : deadlockInfo.getThreadInfos()) {
            if (threadInfo == null) {
                continue;
            }
            StackTraceElement[] stackTrace = threadInfo.getStackTrace();
            if (stackTrace == null) {
                continue;
            }
            int lockFrames = 0;
            for (StackTraceElement element : stackTrace) {
                String methodName = element.getMethodName();
                if (methodName.contains("lock") || methodName.contains("synchronized")) {
                    lockFrames++;
                }
            }
            if (lockFrames > 2) {
                return true;
            }
        }
        return false;
    }

    /**
     * Grades severity from the number of deadlocked threads and involved locks; either
     * number reaching a threshold is enough for that grade.
     */
    private DeadlockSeverity analyzeDeadlockSeverity(DeadlockInfo deadlockInfo) {
        int threadCount = deadlockInfo.getDeadlockedThreadCount();
        int lockCount = deadlockInfo.getLockInfos().size();

        if (threadCount >= 5 || lockCount >= 10) {
            return DeadlockSeverity.CRITICAL;
        } else if (threadCount >= 3 || lockCount >= 5) {
            return DeadlockSeverity.HIGH;
        } else if (threadCount >= 2 || lockCount >= 3) {
            return DeadlockSeverity.MEDIUM;
        } else {
            return DeadlockSeverity.LOW;
        }
    }

    /**
     * Detects a cycle in a wait-for relation where every thread waits for at most one
     * other thread, by following each chain until it ends, reaches an already cleared
     * thread, or revisits a thread of the current chain.
     */
    private boolean hasCycle(Map<Long, Long> waitsFor) {
        Set<Long> cleared = new HashSet<>();

        for (Long start : waitsFor.keySet()) {
            Set<Long> chain = new HashSet<>();
            Long current = start;
            while (current != null && !cleared.contains(current)) {
                if (!chain.add(current)) {
                    return true; // came back to a thread of the current chain
                }
                current = waitsFor.get(current);
            }
            cleared.addAll(chain);
        }

        return false;
    }

    /** Number of monitors plus ownable synchronizers the thread currently owns. */
    private static int heldLockCount(ThreadInfo threadInfo) {
        int count = 0;
        LockInfo[] monitors = threadInfo.getLockedMonitors();
        if (monitors != null) {
            count += monitors.length;
        }
        LockInfo[] synchronizers = threadInfo.getLockedSynchronizers();
        if (synchronizers != null) {
            count += synchronizers.length;
        }
        return count;
    }

    /** Adds one to the per-class-name tally for every lock in {@code locks}. */
    private static void countByClassName(LockInfo[] locks, Map<String, Integer> tally) {
        if (locks == null) {
            return;
        }
        for (LockInfo lock : locks) {
            tally.merge(lock.getClassName(), 1, Integer::sum);
        }
    }

    /**
     * Builds the one-line summary from cause, severity and, when present, pattern and trend.
     */
    private String generateAnalysisReason(DeadlockAnalysis analysis) {
        StringBuilder reason = new StringBuilder();

        reason.append("死锁分析结果: ");
        reason.append("原因=").append(analysis.getCause().getDescription());
        reason.append(", 严重程度=").append(analysis.getSeverity().getDescription());

        if (analysis.getPattern() != null) {
            reason.append(", 模式=").append(analysis.getPattern().getName());
        }

        if (analysis.getTrend() != null) {
            reason.append(", 趋势=").append(analysis.getTrend().getDescription());
        }

        return reason.toString();
    }
}

2.3 锁管理器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
/**
 * In-process registry of named, exclusive locks keyed by caller-supplied thread ids.
 *
 * <p>All bookkeeping is guarded by one read/write lock. A thread that has to wait for a
 * named lock parks on a condition of the write lock, which releases the write lock while
 * waiting so that {@link #releaseLock} can make progress. Waiting threads are tracked so
 * {@link #hasDeadlock()} can build a real wait-for graph.
 *
 * <p>Re-acquiring a lock already held by the same thread id succeeds without counting;
 * a single {@link #releaseLock} releases it.
 */
@Component
@Slf4j
public class LockManager {

    private final Map<String, LockInfo> locks = new ConcurrentHashMap<>();
    private final Map<Long, List<String>> threadLocks = new ConcurrentHashMap<>();
    private final Map<String, List<Long>> lockThreads = new ConcurrentHashMap<>();

    /** Thread id → name of the lock that thread is currently waiting for. */
    private final Map<Long, String> pendingAcquisitions = new ConcurrentHashMap<>();

    private final ReadWriteLock lockMapLock = new ReentrantReadWriteLock();
    private final Lock readLock = lockMapLock.readLock();
    private final Lock writeLock = lockMapLock.writeLock();

    /** Signalled whenever a named lock is released. */
    private final java.util.concurrent.locks.Condition lockReleased = writeLock.newCondition();

    /**
     * Acquires the named lock for the given thread id, waiting up to {@code timeoutMs}.
     *
     * @param lockName  name of the lock
     * @param threadId  id recorded as the holder
     * @param timeoutMs maximum time to wait in milliseconds; non-positive means do not wait
     * @return true if the lock is (now) held by {@code threadId}; false on timeout or if the
     *         waiting thread was interrupted (the interrupt flag is restored)
     */
    public boolean acquireLock(String lockName, long threadId, long timeoutMs) {
        // Lock before the try block so a failed lock() is never followed by unlock().
        writeLock.lock();
        try {
            if (isLockHeldByThread(lockName, threadId)) {
                return true;
            }

            if (isLockAvailable(lockName)) {
                acquireLockInternal(lockName, threadId);
                return true;
            }

            return waitForLock(lockName, threadId, timeoutMs);

        } finally {
            writeLock.unlock();
        }
    }

    /**
     * Releases the named lock if it is held by the given thread id and wakes up waiters;
     * otherwise logs a warning and does nothing.
     */
    public void releaseLock(String lockName, long threadId) {
        writeLock.lock();
        try {
            if (!isLockHeldByThread(lockName, threadId)) {
                log.warn("线程 {} 尝试释放未持有的锁 {}", threadId, lockName);
                return;
            }

            releaseLockInternal(lockName, threadId);
            lockReleased.signalAll();

        } finally {
            writeLock.unlock();
        }
    }

    /** Returns true if the thread id is recorded as holding the named lock. */
    private boolean isLockHeldByThread(String lockName, long threadId) {
        List<String> threadLockList = threadLocks.get(threadId);
        return threadLockList != null && threadLockList.contains(lockName);
    }

    /** Returns true if the named lock is unknown or currently has no holder. */
    private boolean isLockAvailable(String lockName) {
        LockInfo lockInfo = locks.get(lockName);
        return lockInfo == null || lockInfo.getHolderThreadId() == null;
    }

    /**
     * Records {@code threadId} as the holder of {@code lockName} in all three maps.
     * Must be called with the write lock held.
     */
    private void acquireLockInternal(String lockName, long threadId) {
        LockInfo lockInfo = locks.computeIfAbsent(lockName, k -> new LockInfo());
        lockInfo.setLockName(lockName);
        lockInfo.setHolderThreadId(threadId);
        lockInfo.setAcquireTime(System.currentTimeMillis());

        threadLocks.computeIfAbsent(threadId, k -> new ArrayList<>()).add(lockName);
        lockThreads.computeIfAbsent(lockName, k -> new ArrayList<>()).add(threadId);

        log.debug("线程 {} 获取锁 {}", threadId, lockName);
    }

    /**
     * Clears the holder of {@code lockName} and removes the bookkeeping entries.
     * Must be called with the write lock held.
     */
    private void releaseLockInternal(String lockName, long threadId) {
        LockInfo lockInfo = locks.get(lockName);
        if (lockInfo != null) {
            lockInfo.setHolderThreadId(null);
            lockInfo.setReleaseTime(System.currentTimeMillis());
        }

        List<String> threadLockList = threadLocks.get(threadId);
        if (threadLockList != null) {
            threadLockList.remove(lockName);
            if (threadLockList.isEmpty()) {
                threadLocks.remove(threadId);
            }
        }

        List<Long> lockThreadList = lockThreads.get(lockName);
        if (lockThreadList != null) {
            // Boxed explicitly so remove(Object) is chosen, never remove(int index).
            lockThreadList.remove(Long.valueOf(threadId));
            if (lockThreadList.isEmpty()) {
                lockThreads.remove(lockName);
            }
        }

        log.debug("线程 {} 释放锁 {}", threadId, lockName);
    }

    /**
     * Waits until the named lock becomes available or the timeout elapses, then takes it.
     *
     * <p>Must be called with the write lock held. The wait uses the write lock's condition,
     * so the write lock is released while parked; sleeping with it held would block every
     * {@link #releaseLock} call and make the wait impossible to satisfy.
     */
    private boolean waitForLock(String lockName, long threadId, long timeoutMs) {
        long remainingNanos = java.util.concurrent.TimeUnit.MILLISECONDS.toNanos(timeoutMs);

        pendingAcquisitions.put(threadId, lockName);
        try {
            while (!isLockAvailable(lockName)) {
                if (remainingNanos <= 0) {
                    return false; // timed out
                }
                remainingNanos = lockReleased.awaitNanos(remainingNanos);
            }
            acquireLockInternal(lockName, threadId);
            return true;

        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        } finally {
            pendingAcquisitions.remove(threadId);
        }
    }

    /**
     * Returns the bookkeeping record of the named lock, or null if it was never acquired.
     */
    public LockInfo getLockInfo(String lockName) {
        readLock.lock();
        try {
            return locks.get(lockName);
        } finally {
            readLock.unlock();
        }
    }

    /**
     * Returns a copy of the lock names held by the given thread id (empty if none).
     */
    public List<String> getThreadLocks(long threadId) {
        readLock.lock();
        try {
            List<String> threadLockList = threadLocks.get(threadId);
            return threadLockList != null ? new ArrayList<>(threadLockList) : new ArrayList<>();
        } finally {
            readLock.unlock();
        }
    }

    /**
     * Returns a shallow copy of all lock records keyed by lock name.
     */
    public Map<String, LockInfo> getAllLocks() {
        readLock.lock();
        try {
            return new HashMap<>(locks);
        } finally {
            readLock.unlock();
        }
    }

    /**
     * Checks whether the threads currently waiting for locks form a circular wait.
     *
     * <p>An edge waiter→holder is added for every waiting thread whose requested lock has a
     * different holder. The holders list alone cannot be used for this: it only ever
     * contains the current holder, so a graph built from it is always empty.
     */
    public boolean hasDeadlock() {
        readLock.lock();
        try {
            Map<Long, Set<Long>> waitGraph = new HashMap<>();

            for (Map.Entry<Long, String> pending : pendingAcquisitions.entrySet()) {
                Long waiterId = pending.getKey();
                LockInfo lockInfo = locks.get(pending.getValue());
                if (lockInfo == null) {
                    continue;
                }
                Long holderId = lockInfo.getHolderThreadId();
                if (holderId != null && !holderId.equals(waiterId)) {
                    waitGraph.computeIfAbsent(waiterId, k -> new HashSet<>()).add(holderId);
                }
            }

            return hasCycleInWaitGraph(waitGraph);

        } finally {
            readLock.unlock();
        }
    }

    /** Runs a depth-first search from every unvisited waiter looking for a back edge. */
    private boolean hasCycleInWaitGraph(Map<Long, Set<Long>> waitGraph) {
        Set<Long> visited = new HashSet<>();
        Set<Long> recursionStack = new HashSet<>();

        for (Long threadId : waitGraph.keySet()) {
            if (!visited.contains(threadId)
                    && hasCycleDFS(threadId, waitGraph, visited, recursionStack)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Depth-first step: returns true if a thread already on the current recursion stack is
     * reachable from {@code threadId}.
     */
    private boolean hasCycleDFS(Long threadId, Map<Long, Set<Long>> waitGraph,
                                Set<Long> visited, Set<Long> recursionStack) {
        visited.add(threadId);
        recursionStack.add(threadId);

        Set<Long> waitingFor = waitGraph.get(threadId);
        if (waitingFor != null) {
            for (Long nextThreadId : waitingFor) {
                if (!visited.contains(nextThreadId)) {
                    if (hasCycleDFS(nextThreadId, waitGraph, visited, recursionStack)) {
                        return true;
                    }
                } else if (recursionStack.contains(nextThreadId)) {
                    return true; // back edge: circular wait
                }
            }
        }

        recursionStack.remove(threadId);
        return false;
    }
}

三、性能监控与调优

3.1 性能监控服务

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/**
 * Publishes thread, lock, memory and GC figures as gauges on a schedule and can assemble
 * the same figures into a {@link PerformanceReport} on demand.
 *
 * <p>Each gauge is registered once and observes a value holder owned by this service;
 * scheduled runs only overwrite the holder. Registering a gauge on a per-run snapshot
 * value would keep reporting the first value, because a second registration under the same
 * name and tags does not replace the first.
 */
@Service
@Slf4j
public class PerformanceMonitoringService {

    @Autowired
    private MeterRegistry meterRegistry;

    @Autowired
    private ThreadMXBean threadMXBean;

    @Autowired
    private DeadlockDetector deadlockDetector;

    /** Gauge name + tags → holder of the latest value (stored as raw double bits). */
    private final Map<String, java.util.concurrent.atomic.AtomicLong> gaugeValues =
            new java.util.concurrent.ConcurrentHashMap<>();

    /**
     * Publishes thread counts and the number of threads per {@link Thread.State} every
     * 10 seconds. Every state is published on every run, so a state that no thread is in
     * any more drops to zero instead of keeping its last value.
     */
    @Scheduled(fixedDelay = 10000)
    public void monitorThreadPerformance() {
        try {
            publishGauge("thread.count", threadMXBean.getThreadCount());
            publishGauge("thread.peak.count", threadMXBean.getPeakThreadCount());
            publishGauge("thread.daemon.count", threadMXBean.getDaemonThreadCount());

            Map<Thread.State, Long> stateCount = countThreadStates();
            for (Thread.State state : Thread.State.values()) {
                publishGauge("thread.state.count", stateCount.getOrDefault(state, 0L),
                        "state", state.name());
            }

            publishGauge("thread.blocked.count",
                    stateCount.getOrDefault(Thread.State.BLOCKED, 0L));
            publishGauge("thread.waiting.count",
                    stateCount.getOrDefault(Thread.State.WAITING, 0L));

        } catch (Exception e) {
            log.error("监控线程性能失败", e);
        }
    }

    /**
     * Publishes the number of deadlocked threads, the number of threads waiting on a lock
     * and their summed blocked time every 15 seconds.
     */
    @Scheduled(fixedDelay = 15000)
    public void monitorLockContention() {
        try {
            long[] deadlockedThreads = threadMXBean.findDeadlockedThreads();
            // Publish 0 when there is no deadlock so the gauge does not stay stale.
            publishGauge("lock.deadlock.count",
                    deadlockedThreads != null ? deadlockedThreads.length : 0);

            ThreadInfo[] threadInfos = threadMXBean.getThreadInfo(threadMXBean.getAllThreadIds());
            if (threadInfos != null) {
                long lockWaitCount = Arrays.stream(threadInfos)
                        .filter(Objects::nonNull)
                        .filter(info -> info.getLockInfo() != null)
                        .count();
                publishGauge("lock.wait.count", lockWaitCount);

                // getBlockedTime() is -1 unless contention monitoring is enabled; such
                // values are excluded by the > 0 filter.
                long totalWaitTime = Arrays.stream(threadInfos)
                        .filter(Objects::nonNull)
                        .mapToLong(ThreadInfo::getBlockedTime)
                        .filter(blockedMillis -> blockedMillis > 0)
                        .sum();
                publishGauge("lock.wait.time.total", totalWaitTime);
            }

        } catch (Exception e) {
            log.error("监控锁竞争失败", e);
        }
    }

    /**
     * Publishes heap and non-heap usage every 20 seconds. The heap usage ratio is only
     * published when the maximum is defined ({@code getMax()} returns -1 when it is not).
     */
    @Scheduled(fixedDelay = 20000)
    public void monitorMemoryUsage() {
        try {
            MemoryMXBean memoryBean = ManagementFactory.getMemoryMXBean();

            MemoryUsage heapUsage = memoryBean.getHeapMemoryUsage();
            publishGauge("memory.heap.used", heapUsage.getUsed());
            publishGauge("memory.heap.max", heapUsage.getMax());
            if (heapUsage.getMax() > 0) {
                publishGauge("memory.heap.usage.ratio",
                        (double) heapUsage.getUsed() / heapUsage.getMax());
            }

            MemoryUsage nonHeapUsage = memoryBean.getNonHeapMemoryUsage();
            publishGauge("memory.nonheap.used", nonHeapUsage.getUsed());
            publishGauge("memory.nonheap.max", nonHeapUsage.getMax());

        } catch (Exception e) {
            log.error("监控内存使用失败", e);
        }
    }

    /**
     * Publishes cumulative collection count and time per garbage collector every 30 seconds.
     */
    @Scheduled(fixedDelay = 30000)
    public void monitorGCPerformance() {
        try {
            for (GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans()) {
                String gcName = gcBean.getName();
                publishGauge("gc.collection.count", gcBean.getCollectionCount(), "gc", gcName);
                publishGauge("gc.collection.time", gcBean.getCollectionTime(), "gc", gcName);
            }
        } catch (Exception e) {
            log.error("监控GC性能失败", e);
        }
    }

    /**
     * Builds a point-in-time report with thread, memory, GC and deadlock sections.
     *
     * @return the report; if collection fails part-way the error is logged and the sections
     *         not yet filled stay unset
     */
    public PerformanceReport generatePerformanceReport() {
        PerformanceReport report = new PerformanceReport();
        report.setTimestamp(LocalDateTime.now());

        try {
            ThreadReport threadReport = new ThreadReport();
            threadReport.setThreadCount(threadMXBean.getThreadCount());
            threadReport.setPeakThreadCount(threadMXBean.getPeakThreadCount());
            threadReport.setDaemonThreadCount(threadMXBean.getDaemonThreadCount());
            // PerformanceTuningService reads getBlockedThreadCount(); it was never filled.
            // NOTE(review): assumes the setter takes an int-compatible value like the
            // sibling counters - confirm against ThreadReport.
            threadReport.setBlockedThreadCount(
                    countThreadStates().getOrDefault(Thread.State.BLOCKED, 0L).intValue());
            report.setThreadReport(threadReport);

            MemoryMXBean memoryBean = ManagementFactory.getMemoryMXBean();
            MemoryUsage heapUsage = memoryBean.getHeapMemoryUsage();
            MemoryUsage nonHeapUsage = memoryBean.getNonHeapMemoryUsage();
            MemoryReport memoryReport = new MemoryReport();
            memoryReport.setHeapUsed(heapUsage.getUsed());
            memoryReport.setHeapMax(heapUsage.getMax());
            memoryReport.setNonHeapUsed(nonHeapUsage.getUsed());
            memoryReport.setNonHeapMax(nonHeapUsage.getMax());
            report.setMemoryReport(memoryReport);

            List<GCReport> gcReports = new ArrayList<>();
            for (GarbageCollectorMXBean gcBean : ManagementFactory.getGarbageCollectorMXBeans()) {
                GCReport gcReport = new GCReport();
                gcReport.setGcName(gcBean.getName());
                gcReport.setCollectionCount(gcBean.getCollectionCount());
                gcReport.setCollectionTime(gcBean.getCollectionTime());
                gcReports.add(gcReport);
            }
            report.setGcReports(gcReports);

            long[] deadlockedThreads = threadMXBean.findDeadlockedThreads();
            DeadlockReport deadlockReport = new DeadlockReport();
            deadlockReport.setDeadlockedThreadCount(
                    deadlockedThreads != null ? deadlockedThreads.length : 0);
            report.setDeadlockReport(deadlockReport);

        } catch (Exception e) {
            log.error("生成性能报告失败", e);
        }

        return report;
    }

    /** Counts live threads per state; threads that ended during the snapshot are skipped. */
    private Map<Thread.State, Long> countThreadStates() {
        ThreadInfo[] threadInfos = threadMXBean.getThreadInfo(threadMXBean.getAllThreadIds());
        if (threadInfos == null) {
            threadInfos = new ThreadInfo[0];
        }
        return Arrays.stream(threadInfos)
                .filter(Objects::nonNull)
                .collect(Collectors.groupingBy(ThreadInfo::getThreadState, Collectors.counting()));
    }

    /**
     * Stores {@code value} as the current reading of the gauge identified by name and tags,
     * registering the gauge on first use.
     *
     * @param name  gauge name
     * @param value latest reading
     * @param tags  alternating tag keys and values, may be empty
     */
    private void publishGauge(String name, double value, String... tags) {
        String key = name + Arrays.toString(tags);
        gaugeValues.computeIfAbsent(key, k -> {
            java.util.concurrent.atomic.AtomicLong holder =
                    new java.util.concurrent.atomic.AtomicLong();
            Gauge.builder(name, holder, h -> Double.longBitsToDouble(h.get()))
                    .tags(tags)
                    .register(meterRegistry);
            return holder;
        }).set(Double.doubleToLongBits(value));
    }
}

3.2 性能调优策略

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
@Service
@Slf4j
public class PerformanceTuningService {

@Autowired
private ThreadPoolExecutor threadPoolExecutor;

@Autowired
private LockManager lockManager;

@Autowired
private PerformanceMonitoringService monitoringService;

/**
 * Re-tunes the thread pool once a minute from a fresh performance report.
 *
 * <p>The four adjustments run in a fixed order (thread count, memory usage, GC frequency,
 * deadlock situation), each building on the pool sizes left by the previous one. Any
 * exception aborts the remaining adjustments of this run and is logged.
 */
@Scheduled(fixedDelay = 60000)
public void adjustThreadPoolParameters() {
    try {
        final PerformanceReport snapshot = monitoringService.generatePerformanceReport();

        adjustByThreadCount(snapshot);
        adjustByMemoryUsage(snapshot);
        adjustByGCFrequency(snapshot);
        adjustByDeadlockSituation(snapshot);
    } catch (Exception failure) {
        log.error("调整线程池参数失败", failure);
    }
}

/**
 * Adjusts pool sizes from the JVM thread count and the blocked-thread count.
 *
 * <p>More than 100 threads shrinks the core size by 2 (not below 5); fewer than 20 grows it
 * by 2 (not above 20); more than 10 blocked threads grows the maximum size by 5 (not above
 * 50). Each step is clamped so that it never moves in the opposite direction of its intent
 * and never violates {@code corePoolSize <= maximumPoolSize}, which
 * {@code ThreadPoolExecutor} rejects with {@code IllegalArgumentException}.
 *
 * @param report current performance report; a missing thread section is ignored
 */
private void adjustByThreadCount(PerformanceReport report) {
    ThreadReport threadReport = report.getThreadReport();
    if (threadReport == null) {
        return;
    }

    int corePoolSize = threadPoolExecutor.getCorePoolSize();
    int maximumPoolSize = threadPoolExecutor.getMaximumPoolSize();

    if (threadReport.getThreadCount() > 100) {
        // Floor of 5 must not raise a core size that is already below it.
        int newCorePoolSize = Math.min(corePoolSize, Math.max(corePoolSize - 2, 5));
        threadPoolExecutor.setCorePoolSize(newCorePoolSize);
        log.info("线程数量过多,调整核心线程数: {}", newCorePoolSize);
    } else if (threadReport.getThreadCount() < 20) {
        // Cap of 20 must not lower a larger core size; never exceed the maximum size.
        int grown = Math.min(Math.min(corePoolSize + 2, 20), maximumPoolSize);
        int newCorePoolSize = Math.max(corePoolSize, grown);
        threadPoolExecutor.setCorePoolSize(newCorePoolSize);
        log.info("线程数量过少,调整核心线程数: {}", newCorePoolSize);
    }

    if (threadReport.getBlockedThreadCount() > 10) {
        // Cap of 50 must not lower a larger maximum (which could drop below the core size).
        int newMaxPoolSize = Math.max(maximumPoolSize, Math.min(maximumPoolSize + 5, 50));
        threadPoolExecutor.setMaximumPoolSize(newMaxPoolSize);
        log.info("阻塞线程过多,调整最大线程数: {}", newMaxPoolSize);
    }
}

/**
* 根据内存使用率调整
*/
private void adjustByMemoryUsage(PerformanceReport report) {
MemoryReport memoryReport = report.getMemoryReport();

// 1. 检查堆内存使用率
double heapUsageRatio = (double) memoryReport.getHeapUsed() / memoryReport.getHeapMax();

if (heapUsageRatio > 0.8) {
// 内存使用率过高,减少线程数
int newCorePoolSize = Math.max(threadPoolExecutor.getCorePoolSize() - 3, 5);
threadPoolExecutor.setCorePoolSize(newCorePoolSize);
log.warn("内存使用率过高,减少核心线程数: {}", newCorePoolSize);
} else if (heapUsageRatio < 0.3) {
// 内存使用率较低,可以增加线程数
int newCorePoolSize = Math.min(threadPoolExecutor.getCorePoolSize() + 2, 20);
threadPoolExecutor.setCorePoolSize(newCorePoolSize);
log.info("内存使用率较低,增加核心线程数: {}", newCorePoolSize);
}
}

/**
* 根据GC频率调整
*/
private void adjustByGCFrequency(PerformanceReport report) {
List<GCReport> gcReports = report.getGcReports();

for (GCReport gcReport : gcReports) {
// 检查GC频率
if (gcReport.getCollectionCount() > 1000) {
// GC频率过高,减少线程数
int newCorePoolSize = Math.max(threadPoolExecutor.getCorePoolSize() - 2, 5);
threadPoolExecutor.setCorePoolSize(newCorePoolSize);
log.warn("GC频率过高,减少核心线程数: {}", newCorePoolSize);
break;
}
}
}

/**
* 根据死锁情况调整
*/
private void adjustByDeadlockSituation(PerformanceReport report) {
DeadlockReport deadlockReport = report.getDeadlockReport();

if (deadlockReport.getDeadlockedThreadCount() > 0) {
// 存在死锁,减少线程数
int newCorePoolSize = Math.max(threadPoolExecutor.getCorePoolSize() - 5, 5);
threadPoolExecutor.setCorePoolSize(newCorePoolSize);
log.error("存在死锁,减少核心线程数: {}", newCorePoolSize);
}
}

/**
* 优化锁策略
*/
public void optimizeLockStrategy() {
try {
// 1. 分析锁使用情况
Map<String, LockInfo> allLocks = lockManager.getAllLocks();

for (Map.Entry<String, LockInfo> entry : allLocks.entrySet()) {
String lockName = entry.getKey();
LockInfo lockInfo = entry.getValue();

// 2. 检查锁持有时间
if (lockInfo.getAcquireTime() != null && lockInfo.getReleaseTime() != null) {
long holdTime = lockInfo.getReleaseTime() - lockInfo.getAcquireTime();

if (holdTime > 1000) { // 持有时间超过1秒
log.warn("锁 {} 持有时间过长: {}ms", lockName, holdTime);

// 3. 建议优化策略
suggestLockOptimization(lockName, holdTime);
}
}
}

} catch (Exception e) {
log.error("优化锁策略失败", e);
}
}

/**
* 建议锁优化策略
*/
private void suggestLockOptimization(String lockName, long holdTime) {
List<String> suggestions = new ArrayList<>();

if (holdTime > 5000) {
suggestions.add("考虑使用读写锁替代互斥锁");
suggestions.add("考虑减少锁的粒度");
suggestions.add("考虑使用无锁数据结构");
} else if (holdTime > 1000) {
suggestions.add("考虑优化锁内的业务逻辑");
suggestions.add("考虑使用锁超时机制");
}

log.info("锁 {} 优化建议: {}", lockName, suggestions);
}

/**
* 生成调优建议
*/
public TuningRecommendation generateTuningRecommendation() {
TuningRecommendation recommendation = new TuningRecommendation();
recommendation.setTimestamp(LocalDateTime.now());

try {
PerformanceReport report = monitoringService.generatePerformanceReport();

// 1. 线程池调优建议
List<String> threadPoolSuggestions = generateThreadPoolSuggestions(report);
recommendation.setThreadPoolSuggestions(threadPoolSuggestions);

// 2. 锁调优建议
List<String> lockSuggestions = generateLockSuggestions(report);
recommendation.setLockSuggestions(lockSuggestions);

// 3. 内存调优建议
List<String> memorySuggestions = generateMemorySuggestions(report);
recommendation.setMemorySuggestions(memorySuggestions);

// 4. GC调优建议
List<String> gcSuggestions = generateGCSuggestions(report);
recommendation.setGcSuggestions(gcSuggestions);

} catch (Exception e) {
log.error("生成调优建议失败", e);
}

return recommendation;
}

/**
* 生成线程池调优建议
*/
private List<String> generateThreadPoolSuggestions(PerformanceReport report) {
List<String> suggestions = new ArrayList<>();

ThreadReport threadReport = report.getThreadReport();

if (threadReport.getThreadCount() > 100) {
suggestions.add("线程数量过多,建议减少核心线程数");
}

if (threadReport.getBlockedThreadCount() > 10) {
suggestions.add("阻塞线程过多,建议增加最大线程数");
}

if (threadReport.getWaitingThreadCount() > 20) {
suggestions.add("等待线程过多,建议优化任务队列大小");
}

return suggestions;
}

/**
* 生成锁调优建议
*/
private List<String> generateLockSuggestions(PerformanceReport report) {
List<String> suggestions = new ArrayList<>();

DeadlockReport deadlockReport = report.getDeadlockReport();

if (deadlockReport.getDeadlockedThreadCount() > 0) {
suggestions.add("存在死锁,建议检查锁的获取顺序");
suggestions.add("建议使用锁超时机制");
suggestions.add("建议减少锁的粒度");
}

return suggestions;
}

/**
* 生成内存调优建议
*/
private List<String> generateMemorySuggestions(PerformanceReport report) {
List<String> suggestions = new ArrayList<>();

MemoryReport memoryReport = report.getMemoryReport();
double heapUsageRatio = (double) memoryReport.getHeapUsed() / memoryReport.getHeapMax();

if (heapUsageRatio > 0.8) {
suggestions.add("堆内存使用率过高,建议增加堆内存大小");
suggestions.add("建议优化对象创建和销毁");
}

return suggestions;
}

/**
* 生成GC调优建议
*/
private List<String> generateGCSuggestions(PerformanceReport report) {
List<String> suggestions = new ArrayList<>();

List<GCReport> gcReports = report.getGcReports();

for (GCReport gcReport : gcReports) {
if (gcReport.getCollectionCount() > 1000) {
suggestions.add("GC频率过高,建议调整GC参数");
suggestions.add("建议使用G1GC替代CMS");
}

if (gcReport.getCollectionTime() > 1000) {
suggestions.add("GC时间过长,建议优化GC参数");
}
}

return suggestions;
}
}

四、最佳实践总结

4.1 死锁预防最佳实践

  1. 锁顺序:统一锁的获取顺序,避免循环等待
  2. 锁超时:使用锁超时机制,避免无限等待
  3. 锁粒度:合理控制锁的粒度,减少锁的持有时间
  4. 无锁设计:使用无锁数据结构,减少锁竞争

4.2 性能调优最佳实践

  1. 线程池调优:根据业务特点调整线程池参数
  2. 锁优化:使用读写锁、分段锁等优化锁性能
  3. 内存管理:合理设置堆内存大小,优化对象生命周期
  4. GC调优:选择合适的GC算法和参数

4.3 监控告警最佳实践

  1. 实时监控:实时监控线程状态、锁竞争、内存使用
  2. 异常告警:及时发现和告警死锁、性能异常
  3. 趋势分析:分析性能趋势,预测潜在问题
  4. 自动调优:根据监控数据自动调整系统参数

五、总结

线程死锁是并发编程中的严重问题,需要从检测、预防、监控、调优等多个维度进行综合治理。通过完善的死锁检测机制、合理的预防策略、实时的性能监控和动态的调优策略,可以构建一个稳定、高效的并发系统。

关键要点:

  1. 死锁检测:定期检测死锁(如每5秒一次),发现后及时处理
  2. 死锁预防:统一锁顺序,使用锁超时
  3. 性能监控:监控线程、锁、内存、GC
  4. 动态调优:根据监控数据动态调整参数
  5. 最佳实践:遵循并发编程最佳实践

通过本文的实践指导,读者可以构建一个完善的死锁检测和性能调优系统,为高并发应用提供强有力的技术支撑。