背景
线上几亿的数据在回刷的时候容器服务会出现OOM而重启,导致任务中断
内存泄露分析
jmap -histo pid
找出了有几十亿的java.lang.StackTraceElement对象,找不到被谁引用了
jmap -dump:format=b,file=heapdump.hprof pid
dump内存
下载到本机mac上,用mat(MemoryAnalyzer)分析,得到内存泄露报告,看到内存全部被com.dianping.cat.message.internal.DefaultMessageManager$Context引用,找到了罪魁祸首
原因分析
com.dianping.cat.log4j.Log4j2Appender
在打印错误Exception的时候会调用Cat.logError方法
public class Log4j2Appender extends AbstractAppender {
....
public void append(LogEvent event) {
try {
Level level = event.getLevel();
if (level.isMoreSpecificThan(Level.WARN)) {
this.logError(event);
}
} catch (Exception var3) {
if (!this.ignoreExceptions()) {
throw new AppenderLoggingException(var3);
}
}
}
....
private void logError(LogEvent event) {
Throwable exception = event.getThrown();
if (exception != null) {
Message message = event.getMessage();
if (message != null) {
Cat.logError(message.getFormattedMessage(), exception);
} else {
Cat.logError(exception);
}
}
<span class="token punctuation">}</span>
}
Cat
类的logError函数最终调用到了DefaultMessageManager.shouldLog方法
public class Cat {
...
public static void logError(String message, Throwable cause) {
try {
getProducer().logError(message, cause);
} catch (Exception var3) {
errorHandler(var3);
}
<span class="token punctuation">}</span>
<span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span>
<span class="token keyword">public</span> <span class="token keyword">static</span> <span class="token keyword">void</span> <span class="token function">logError</span><span class="token punctuation">(</span><span class="token class-name">Throwable</span> cause<span class="token punctuation">)</span> <span class="token punctuation">{<!-- --></span>
<span class="token keyword">try</span> <span class="token punctuation">{<!-- --></span>
<span class="token function">getProducer</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span><span class="token function">logError</span><span class="token punctuation">(</span>cause<span class="token punctuation">)</span><span class="token punctuation">;</span>
<span class="token punctuation">}</span> <span class="token keyword">catch</span> <span class="token punctuation">(</span><span class="token class-name">Exception</span> var2<span class="token punctuation">)</span> <span class="token punctuation">{<!-- --></span>
<span class="token function">errorHandler</span><span class="token punctuation">(</span>var2<span class="token punctuation">)</span><span class="token punctuation">;</span>
<span class="token punctuation">}</span>
<span class="token punctuation">}</span>
}
public class DefaultMessageProducer implements MessageProducer {
public void logError(String message, Throwable cause) {
if (Cat.getManager().isCatEnabled()) {
if (this.shouldLog(cause)) {
....
}
} else {
cause.printStackTrace();
}
}
private boolean shouldLog(Throwable e) {
return this.m_manager instanceof DefaultMessageManager ? ((DefaultMessageManager)this.m_manager).shouldLog(e) : true;
}
DefaultMessageManager
类的m_context在shouldLog的时候把异常堆栈保存下来了,如果Cat事务不关闭,随着异常越来越多就导致了内存溢出
public class DefaultMessageManager {
private ThreadLocal<DefaultMessageManager.Context> m_context = new ThreadLocal();
boolean shouldLog(Throwable e) {
DefaultMessageManager.Context ctx = (DefaultMessageManager.Context)this.m_context.get();
return ctx != null ? ctx.shouldLog(e) : true;
}
<span class="token keyword">class</span> <span class="token class-name">Context</span> <span class="token punctuation">{<!-- --></span>
<span class="token comment">// 内存不足就是由于错误堆栈信息没有限制导致的</span>
<span class="token keyword">private</span> <span class="token class-name">Set</span><span class="token generics"><span class="token punctuation"><</span><span class="token class-name">Throwable</span><span class="token punctuation">></span></span> m_knownExceptions<span class="token punctuation">;</span>
<span class="token keyword">public</span> <span class="token class-name">Context</span><span class="token punctuation">(</span><span class="token class-name">String</span> domain<span class="token punctuation">,</span> <span class="token class-name">String</span> hostName<span class="token punctuation">,</span> <span class="token class-name">String</span> ipAddress<span class="token punctuation">)</span> <span class="token punctuation">{<!-- --></span>
<span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span>
<span class="token keyword">this</span><span class="token punctuation">.</span>m_knownExceptions <span class="token operator">=</span> <span class="token keyword">new</span> <span class="token class-name">HashSet</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span>
<span class="token punctuation">}</span>
<span class="token keyword">public</span> <span class="token keyword">boolean</span> <span class="token function">shouldLog</span><span class="token punctuation">(</span><span class="token class-name">Throwable</span> e<span class="token punctuation">)</span> <span class="token punctuation">{<!-- --></span>
<span class="token keyword">if</span> <span class="token punctuation">(</span><span class="token keyword">this</span><span class="token punctuation">.</span>m_knownExceptions <span class="token operator">==</span> <span class="token keyword">null</span><span class="token punctuation">)</span> <span class="token punctuation">{<!-- --></span>
<span class="token keyword">this</span><span class="token punctuation">.</span>m_knownExceptions <span class="token operator">=</span> <span class="token keyword">new</span> <span class="token class-name">HashSet</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span>
<span class="token punctuation">}</span>
<span class="token keyword">if</span> <span class="token punctuation">(</span><span class="token keyword">this</span><span class="token punctuation">.</span>m_knownExceptions<span class="token punctuation">.</span><span class="token function">contains</span><span class="token punctuation">(</span>e<span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token punctuation">{<!-- --></span>
<span class="token keyword">return</span> <span class="token boolean">false</span><span class="token punctuation">;</span>
<span class="token punctuation">}</span> <span class="token keyword">else</span> <span class="token punctuation">{<!-- --></span>
<span class="token comment">// 这里没有限制大小,只要有异常就往Set里面添加,这里应该做一个优化</span>
<span class="token keyword">this</span><span class="token punctuation">.</span>m_knownExceptions<span class="token punctuation">.</span><span class="token function">add</span><span class="token punctuation">(</span>e<span class="token punctuation">)</span><span class="token punctuation">;</span>
<span class="token keyword">return</span> <span class="token boolean">true</span><span class="token punctuation">;</span>
<span class="token punctuation">}</span>
<span class="token punctuation">}</span>
}
}
解决方案
手动调用Cat.getManager().reset();
方法清空保存的异常堆栈信息
public void reset() {
DefaultMessageManager.Context ctx = (DefaultMessageManager.Context)this.m_context.get();
if (ctx != null) {
if (ctx.m_totalDurationInMicros == 0L) {
ctx.m_stack.clear();
ctx.m_knownExceptions.clear();
this.m_context.remove();
} else {
// 这里会释放错误日志堆栈信息
ctx.m_knownExceptions.clear();
}
}
}