最近在跟ANR优化相关的需求,这儿跟踪代码记录下ANR的原理。
什么是ANR
ANR的全称是Application Not Responding,是系统对与之交互的组件(Activity, Service, BroadcastReceiver, ContentProvider)和用户交互(touch event)进行超时检测,来判断应用(主线程)是否出现了卡顿或者响应过慢的问题,也就是其他系统中都有的watchdog机制。
ANR常见场景
ANR常见的场景主要有以下四种:
类型 | Timeout(s) | 是否弹窗 | 说明 |
---|---|---|---|
Input | 5 | 是 | 部分厂商可能修改这个阈值,部分厂商可能不弹窗而直接kill掉应用(比如vivo,oppo) |
service | 20, 200 | 无感知场景不会提示 | 创建service时会设置这个Timeout(前台20s,后台200s) |
broadcast | 10, 60 | 无感知场景不会提示 | 在发送广播时设置Timeout |
contentprovider | 10 | 无感知场景不会提示 | 在publish的时候检测 |
ANR产生的原理
下面我们跟踪代码来看看ANR是怎样产生的,首先看service。我们的app调用以下代码来启动一个service:
context.startService(intent) ->
ContextImpl.startServiceCommon(...) ->
IActivityManager.startService(...) ->
// 通过binder调到了SystemServer进程的ActivityManagerService中
ActivityManagerService.startService(...) ->
ActiveServices.startServiceLocked(...) ->
ActiveServices.startServiceInnerLocked(...) ->
ActiveServices.startServiceInnerLocked(maps, ...) ->
ActiveServices.bringUpServiceLocked(...) ->
ActiveServices.realStartServiceLocked(...) ->
下面是realStartServiceLocked的代码:
// ActiveServices.java
// How long we wait for a service to finish executing: 20 * 1000, scaled by
// Build.HW_TIMEOUT_MULTIPLIER. Used as the message delay when the process's
// services execute in the foreground (see scheduleServiceTimeoutLocked).
static final int SERVICE_TIMEOUT = 20 * 1000 * Build.HW_TIMEOUT_MULTIPLIER;
// How long we wait for a service to finish executing when the foreground
// timeout does not apply: ten times SERVICE_TIMEOUT.
static final int SERVICE_BACKGROUND_TIMEOUT = SERVICE_TIMEOUT * 10;
...
// Excerpt: arms the service timeout, then asks the app process to create the service.
private void realStartServiceLocked(...) throws RemoteException {
...
// Arm the timeout ("plant the bomb") before calling into the app.
bumpServiceExecutingLocked(r, execInFg, "create", null /* oomAdjReason */);
...
// Call back into the app process.
thread.scheduleCreateService(r, r.serviceInfo,
mAm.compatibilityInfoForPackage(r.serviceInfo.applicationInfo),
app.mState.getReportedProcState());
...
}
// bumpServiceExecutingLocked calls scheduleServiceTimeoutLocked to arm the timeout:
// a delayed SERVICE_TIMEOUT_MSG carrying the process record is posted to mAm.mHandler.
void scheduleServiceTimeoutLocked(ProcessRecord proc) {
    final boolean nothingExecuting = proc.mServices.numberOfExecutingServices() == 0;
    if (nothingExecuting || proc.getThread() == null) {
        // No executing service or no attached app thread: nothing to time out.
        return;
    }
    final Message timeoutMsg =
            mAm.mHandler.obtainMessage(ActivityManagerService.SERVICE_TIMEOUT_MSG);
    timeoutMsg.obj = proc;
    // Foreground execution uses the short timeout, everything else the long one.
    final int delayMs = proc.mServices.shouldExecServicesFg()
            ? SERVICE_TIMEOUT
            : SERVICE_BACKGROUND_TIMEOUT;
    mAm.mHandler.sendMessageDelayed(timeoutMsg, delayMs);
}
可以看到埋炸弹就是发送了一条触发炸弹的延时消息(SERVICE_TIMEOUT_MSG),如果这条消息没有在延时时间内被remove掉,就会执行SERVICE_TIMEOUT_MSG对应的action,我们稍后再看这个action是什么。这里的delay时间也就是timeout时间的定义:前台service是20s,后台service是200s(都会再乘以Build.HW_TIMEOUT_MULTIPLIER)。
// 经过传入的IApplicationThread 回调到App进程的并发送一个[CREATE_SERVICE]的音讯给主线程
thread.scheduleCreateService(...) ->
ActivityThread.handleCreateService(...) ->
看下handleCreateService的代码
// Excerpt: instantiates the service, attaches it, calls onCreate, and then reports
// completion to ActivityManagerService.
private void handleCreateService(CreateServiceData data) {
...
// Instantiate the service object through the package's app factory.
service = packageInfo.getAppFactory().instantiateService(cl, data.info.name, data.intent);
...
service.attach(...)
// The onCreate callback is invoked here.
service.onCreate(...)
...
// Tell ActivityManagerService that onCreate has finished ("defuse the bomb").
try {
ActivityManager.getService().serviceDoneExecuting(
data.token, SERVICE_DONE_EXECUTING_ANON, 0, 0);
} catch (RemoteException e) {
throw e.rethrowFromSystemServer();
}
...
}
接着回到ActivityManagerService中拆炸弹
ActivityManagerService.serviceDoneExecuting(...) ->
ActiveServices.serviceDoneExecutingLocked(...)
看ActiveServices.serviceDoneExecutingLocked代码:
// Excerpt: cancels the pending service-timeout message for the service's process.
private void serviceDoneExecutingLocked(...) {
...
// Remove the SERVICE_TIMEOUT_MSG whose obj is r.app.
mAm.mHandler.removeMessages(ActivityManagerService.SERVICE_TIMEOUT_MSG, r.app);
...
}
现在已经清楚了埋炸弹和拆炸弹的过程,我们接着看看如果炸弹没有及时拆除时是怎样引爆的。先看SERVICE_TIMEOUT_MSG这个消息是怎样被处理的:
// ActivityManagerService.MainHandler
// Excerpt of the handler's dispatch: SERVICE_TIMEOUT_MSG is forwarded to
// mServices.serviceTimeout with the ProcessRecord carried in msg.obj.
public void handleMessage(Message msg){
switch(msg.what) {
...
case SERVICE_TIMEOUT_MSG : {
mServices.serviceTimeout((ProcessRecord) msg . obj);
} break;
...
}
}
这里调到了ActiveServices的serviceTimeout方法
// Handles a service timeout for proc: looks for an executing service whose start time is
// older than the applicable timeout. If one is found (and proc is still in the LRU list),
// an ANR is reported; otherwise the timeout message is re-armed.
void serviceTimeout(ProcessRecord proc) {
String anrMessage = null;
synchronized(mAm) {
if (proc.isDebugging()) {
// The app's being debugged, ignore timeout.
return;
}
final ProcessServiceRecord psr = proc.mServices;
if (psr.numberOfExecutingServices() == 0 || proc.getThread() == null) {
return;
}
// Find the service that timed out: one that started executing before maxTime.
final long now = SystemClock.uptimeMillis();
final long maxTime = now -
(psr.shouldExecServicesFg() ? SERVICE_TIMEOUT : SERVICE_BACKGROUND_TIMEOUT);
ServiceRecord timeout = null;
long nextTime = 0;
for (int i = psr.numberOfExecutingServices() - 1; i >= 0; i--) {
ServiceRecord sr = psr.getExecutingServiceAt(i);
if (sr.executingStart < maxTime) {
timeout = sr;
break;
}
// Track the latest start time among the services examined so far.
if (sr.executingStart > nextTime) {
nextTime = sr.executingStart;
}
}
// Found one: record its dump in mLastAnrDump and build the ANR message.
if (timeout != null && mAm.mProcessList.isInLruListLOSP(proc)) {
Slog.w(TAG, "Timeout executing service: " + timeout);
StringWriter sw = new StringWriter();
PrintWriter pw = new FastPrintWriter(sw, false, 1024);
pw.println(timeout);
timeout.dump(pw, " ");
pw.close();
mLastAnrDump = sw.toString();
mAm.mHandler.removeCallbacks(mLastAnrDumpClearer);
mAm.mHandler.postDelayed(mLastAnrDumpClearer, LAST_ANR_LIFETIME_DURATION_MSECS);
anrMessage = "executing service " + timeout.shortInstanceName;
} else {
// Nothing to report: post SERVICE_TIMEOUT_MSG again, scheduled relative to nextTime.
Message msg = mAm.mHandler.obtainMessage(
ActivityManagerService.SERVICE_TIMEOUT_MSG);
msg.obj = proc;
mAm.mHandler.sendMessageAtTime(msg, psr.shouldExecServicesFg()
? (nextTime+SERVICE_TIMEOUT) : (nextTime + SERVICE_BACKGROUND_TIMEOUT));
}
}
// ANR handling starts here, after the synchronized block has been left.
if (anrMessage != null) {
mAm.mAnrHelper.appNotResponding(proc, anrMessage);
}
}
接着我们看AnrHelper的appNotResponding方法是怎样处理ANR的
// Excerpt: queues an AnrRecord for the ANR and makes sure a consumer thread is running.
void appNotResponding(...) {
...
// Create an AnrRecord for this ANR and add it to the list; per the article,
// several ANRs may be pending at the same time.
mAnrRecords.add(new AnrRecord(anrProcess, activityShortComponentName, aInfo,
parentShortComponentName, parentProcess, aboveSystem, annotation));
...
// Start processing the queued ANRs.
startAnrConsumerIfNeeded();
}
// Starts the ANR consumer thread unless mRunning was already true.
private void startAnrConsumerIfNeeded() {
    final boolean claimed = mRunning.compareAndSet(false, true);
    if (!claimed) {
        return;
    }
    // mRunning was flipped from false to true by this call: launch the worker.
    final AnrConsumerThread consumer = new AnrConsumerThread();
    consumer.start();
}
/**
 * The thread to execute {@link ProcessErrorStateRecord#appNotResponding}. It will terminate if
 * all records are handled.
 */
private class AnrConsumerThread extends Thread {
AnrConsumerThread() {
super("AnrConsumer");
}
// Removes and returns the first queued record (remembering its pid in mProcessingPid),
// or returns null when the queue is empty. Synchronized on mAnrRecords.
private AnrRecord next() {
synchronized (mAnrRecords) {
if (mAnrRecords.isEmpty()) {
return null;
}
final AnrRecord record = mAnrRecords.remove(0);
mProcessingPid = record.mPid;
return record;
}
}
@Override
public void run() {
AnrRecord r;
while ((r = next()) != null) {
...
// Handle each queued ANR in turn.
r.appNotResponding(onlyDumpSelf);
...
}
...
}
}
// Excerpt: one queued ANR. appNotResponding forwards the stored fields to
// mApp.mErrorState.appNotResponding.
private static class AnrRecord {
...
void appNotResponding(boolean onlyDumpSelf) {
mApp.mErrorState.appNotResponding(mActivityShortComponentName, mAppInfo,
mParentShortComponentName, mParentProcess, mAboveSystem, mAnnotation,
onlyDumpSelf);
}
}
接着调到了ProcessErrorStateRecord类中的appNotResponding方法
frameworks/base/services/core/java/com/android/server/am/ProcessErrorStateRecord.java
// Excerpt (heavily elided): marks the process as not responding, collects the pids whose
// stacks should be dumped, requests the dump, then either kills the process (silent ANR)
// or posts the "App Not Responding" dialog message.
void appNotResponding(String activityShortComponentName, ApplicationInfo aInfo,
String parentShortComponentName, WindowProcessController parentProcess,
boolean aboveSystem, String annotation, boolean onlyDumpSelf) {
// Pids whose stacks should be dumped.
ArrayList<Integer> firstPids = new ArrayList<>(5);
SparseArray<Boolean> lastPids = new SparseArray<>(20);
...
final boolean isSilentAnr;
final int pid = mApp.getPid();
final UUID errorId;
...
// Set the not-responding state.
setNotResponding(true);
...
// The pid of the process that hit the ANR goes first in the priority dump list.
firstPids.add(pid);
// For a silent (background) ANR, or when onlyDumpSelf is set, no other pids are added.
isSilentAnr = isSilentAnr();
if (!isSilentAnr && !onlyDumpSelf) {
int parentPid = pid;
if (parentProcess != null && parentProcess.getPid() > 0) {
parentPid = parentProcess.getPid();
}
// Add the parent process.
if (parentPid != pid) firstPids.add(parentPid);
// Add MY_PID (per the article, the system server process).
if (MY_PID != pid && MY_PID != parentPid) firstPids.add(MY_PID);
final int ppid = parentPid;
// Add other interesting processes: persistent ones and those treated like an activity
// go to firstPids, the rest to lastPids.
mService.mProcessList.forEachLruProcessesLOSP(false, r -> {
if (r != null && r.getThread() != null) {
int myPid = r.getPid();
if (myPid > 0 && myPid != pid && myPid != ppid && myPid != MY_PID) {
if (r.isPersistent()) {
firstPids.add(myPid);
} else if (r.mServices.isTreatedLikeActivity()) {
firstPids.add(myPid);
} else {
lastPids.put(myPid, Boolean.TRUE);
}
}
}
});
...
// Native processes to dump: only this process (if it is listed in
// NATIVE_STACKS_OF_INTEREST) for a silent/self-only dump, otherwise the whole list.
String[] nativeProcs = null;
if (isSilentAnr || onlyDumpSelf) {
for (int i = 0; i < NATIVE_STACKS_OF_INTEREST.length; i++) {
if (NATIVE_STACKS_OF_INTEREST[i].equals(mApp.processName)) {
nativeProcs = new String[] { mApp.processName };
break;
}
}
} else {
nativeProcs = NATIVE_STACKS_OF_INTEREST;
}
int[] pids = nativeProcs == null ? null : Process.getPidsForCommands(nativeProcs);
ArrayList<Integer> nativePids = null;
if (pids != null) {
nativePids = new ArrayList<>(pids.length);
for (int i : pids) {
nativePids.add(i);
}
}
...
// Dump the stacks.
File tracesFile = ActivityManagerService.dumpStackTraces(firstPids,
isSilentAnr ? null : processCpuTracker, isSilentAnr ? null : lastPids,
nativePids, tracesFileException, offsets, annotation, criticalEventLog);
...
if (tracesFile == null) {
// No trace file: just send SIGNAL_QUIT to the process that hit the ANR.
Process.sendSignal(pid, Process.SIGNAL_QUIT);
}
...
// A silent (background) ANR kills the process directly, unless it is being debugged.
if (isSilentAnr() && !mApp.isDebugging()) {
mApp.killLocked("bg anr", ApplicationExitInfo.REASON_ANR, true);
return;
}
synchronized (mProcLock) {
// Record the not-responding state on the process.
makeAppNotRespondingLSP(xx);
...
}
// Show the ANR dialog.
if (mService.mUiHandler != null) {
// Bring up the infamous App Not Responding dialog
Message msg = Message.obtain();
msg.what = ActivityManagerService.SHOW_NOT_RESPONDING_UI_MSG;
msg.obj = new AppNotRespondingDialog.Data(mApp, aInfo, aboveSystem);
mService.mUiHandler.sendMessageDelayed(msg, anrDialogDelayMs);
}
}
这个函数主要做了以下工作:
- 设置notResponding的状态;
- 收集需要dump的进程,分为3组:firstPids包含当前发生ANR的进程、它的父进程、system_server进程,以及persistent进程和被当作Activity对待的进程;lastPids包含LRU进程列表中的其他进程;nativePids是一些预定义的native进程(NATIVE_STACKS_OF_INTEREST)。但是如果是后台ANR(或者指定了onlyDumpSelf),则只会收集当前进程;
- 收集完后开始请求这些进程dump堆栈;
- 如果是后台ANR则直接杀死进程,否则发送一个延时消息,弹出ANR弹窗;
接着我们看看怎样请求这些进程dump堆栈,dumpStackTraces方法最终调到了这个重载方法:
// ActivityManagerService.java
public static Pair<Long, Long> dumpStackTraces(String tracesFile, ArrayList<Integer> firstPids,ArrayList<Integer> nativePids, ArrayList<Integer> extraPids) {
// 设dump一切进程仓库的总时刻为20s
long remainingTime = 20 * 1000 * Build.HW_TIMEOUT_MULTIPLIER;
// 记录产生ANR进程仓库在trace文件中的开端和结束的位置
long firstPidStart = -1;
long firstPidEnd = -1;
// 先dump firstPids中的进程,也便是最重要的一些进程
if (firstPids != null) {
int num = firstPids.size();
for (int i = 0; i < num; i++) {
final int pid = firstPids.get(i);
// We don't copy ANR traces from the system_server intentionally.
final boolean firstPid = i == 0 && MY_PID != pid;
File tf = null;
if (firstPid) {
tf = new File(tracesFile);
firstPidStart = tf.exists() ? tf.length() : 0;
}
// java进程dump
final long timeTaken = dumpJavaTracesTombstoned(pid, tracesFile,
remainingTime);
remainingTime -= timeTaken;
// 假如超时了则停止dump
if (remainingTime <= 0) {
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
if (firstPid) {
firstPidEnd = tf.length();
}
}
// native 进程dump
if (nativePids != null) {
for (int pid : nativePids) {
final long nativeDumpTimeoutMs = Math.min(NATIVE_DUMP_TIMEOUT_MS, remainingTime);
final long start = SystemClock.elapsedRealtime();
Debug.dumpNativeBacktraceToFileTimeout(
pid, tracesFile, (int) (nativeDumpTimeoutMs / 1000));
final long timeTaken = SystemClock.elapsedRealtime() - start;
remainingTime -= timeTaken;
if (remainingTime <= 0) {
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
}
}
// 最终dump 那些cpu运用高的进程
if (extraPids != null) {
for (int pid : extraPids) {
Slog.i(TAG, "Collecting stacks for extra pid " + pid);
final long timeTaken = dumpJavaTracesTombstoned(pid, tracesFile, remainingTime);
remainingTime -= timeTaken;
if (remainingTime <= 0) {
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
}
}
Slog.i(TAG, "Done dumping");
return firstPidStart >= 0 ? new Pair<>(firstPidStart, firstPidEnd) : null;
}
ActivityManagerService.dumpJavaTracesTombstoned
// Excerpt: requests a Java backtrace of pid into fileName via
// Debug.dumpJavaBacktraceToFileTimeout; the millisecond timeout is converted to whole
// seconds by integer division.
private static long dumpJavaTracesTombstoned(int pid, String fileName, long timeoutMs) {
...
boolean javaSuccess = Debug.dumpJavaBacktraceToFileTimeout(pid, fileName,
(int) (timeoutMs / 1000));
...
}
这里dump Java进程和Native进程的堆栈分别调用了Debug类的dumpJavaBacktraceToFileTimeout和dumpNativeBacktraceToFileTimeout,这两个是jni方法,实现在frameworks/base/core/jni/android_os_Debug.cpp文件中。
可以看到它们都调用了dumpTraces方法,只是参数不同;
frameworks/base/core/jni/android_os_Debug.cpp
dumpTraces(...) ->
system/core/debuggerd/client/debuggerd_client.cpp
dump_backtrace_to_file_timeout(...) ->
debuggerd_trigger_dump(...)
debuggerd_trigger_dump比较长
// Excerpt: connects to tombstoned's intercept socket, hands it the write end of a pipe,
// signals the target process, and forwards whatever arrives on the pipe's read end to
// output_fd until EOF or until the timeout expires. Returns true on success.
// NOTE(review): `pid` is used below but only `tid` is a parameter in this excerpt; it is
// presumably defined in the elided code -- confirm against upstream.
bool debuggerd_trigger_dump(pid_t tid, DebuggerdDumpType dump_type, unsigned int timeout_ms,unique_fd output_fd) {
...
// Overall deadline. update_timeout re-applies the remaining time as the socket's receive
// and send timeouts (and fails once the deadline has passed).
const auto end = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms);
auto update_timeout = [timeout_ms, &output_fd](int sockfd, auto end) {
if (timeout_ms <= 0) return true;
auto remaining = end - std::chrono::steady_clock::now();
if (remaining < decltype(remaining)::zero()) {
log_error(output_fd, 0, "timeout expired");
return false;
}
struct timeval timeout;
populate_timeval(&timeout, remaining);
if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout)) != 0) {
log_error(output_fd, errno, "failed to set receive timeout");
return false;
}
if (setsockopt(sockfd, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout)) != 0) {
log_error(output_fd, errno, "failed to set send timeout");
return false;
}
return true;
};
// Create the socket used for the intercept request.
unique_fd sockfd(socket(AF_LOCAL, SOCK_SEQPACKET, 0));
if (sockfd == -1) {
log_error(output_fd, errno, "failed to create socket");
return false;
}
// Set the socket's receive and send timeouts.
if (!update_timeout(sockfd, end)) return false;
// Connect the socket to tombstoned's intercept socket.
if (socket_local_client_connect(sockfd.get(), kTombstonedInterceptSocketName,
ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET) == -1) {
log_error(output_fd, errno, "failed to connect to tombstoned");
return false;
}
// Prepare the request payload.
InterceptRequest req = {
.dump_type = dump_type,
.pid = pid,
};
// Prepare the pipe that will carry the dump data.
// Create an intermediate pipe to pass to the other end.
unique_fd pipe_read, pipe_write;
if (!Pipe(&pipe_read, &pipe_write)) {
log_error(output_fd, errno, "failed to create pipe");
return false;
}
// Pipe buffer size: defaults to 1 MiB, overridden by /proc/sys/fs/pipe-max-size if readable.
std::string pipe_size_str;
int pipe_buffer_size = 1024 * 1024;
if (android::base::ReadFileToString("/proc/sys/fs/pipe-max-size", &pipe_size_str)) {
pipe_size_str = android::base::Trim(pipe_size_str);
if (!android::base::ParseInt(pipe_size_str.c_str(), &pipe_buffer_size, 0)) {
LOG(FATAL) << "failed to parse pipe max size '" << pipe_size_str << "'";
}
}
// Try to resize the pipe buffer; a failure is logged but not fatal.
if (fcntl(pipe_read.get(), F_SETPIPE_SZ, pipe_buffer_size) != pipe_buffer_size) {
log_error(output_fd, errno, "failed to set pipe buffer size");
}
// Refresh the socket timeouts before sending.
if (!update_timeout(sockfd, end)) return false;
// Send the request together with the pipe_write fd over the socket, then drop our copy.
ssize_t rc = SendFileDescriptors(sockfd, &req, sizeof(req), pipe_write.get());
pipe_write.reset();
if (rc != sizeof(req)) {
log_error(output_fd, errno, "failed to send output fd to tombstoned");
return false;
}
...
// Check to make sure we've successfully registered.
InterceptResponse response;
if (!update_timeout(sockfd, end)) return false;
if (!get_response("initial", sockfd, &response)) return false;
if (response.status != InterceptStatus::kRegistered) {
log_error(output_fd, 0, "unexpected registration response: %d",
static_cast<int>(response.status));
return false;
}
// Send the signal: SIGQUIT for a Java backtrace, BIONIC_SIGNAL_DEBUGGER otherwise.
const int signal = (dump_type == kDebuggerdJavaBacktrace) ? SIGQUIT : BIONIC_SIGNAL_DEBUGGER;
sigval val = {.sival_int = (dump_type == kDebuggerdNativeBacktrace) ? 1 : 0};
if (sigqueue(pid, signal, val) != 0) {
log_error(output_fd, errno, "failed to send signal to pid %d", pid);
return false;
}
if (!update_timeout(sockfd, end)) return false;
// Read the status response from the socket.
if (!get_response("status", sockfd, &response)) return false;
if (response.status != InterceptStatus::kStarted) {
response.error_message[sizeof(response.error_message) - 1] = '\0';
log_error(output_fd, 0, "tombstoned reported failure: %s", response.error_message);
return false;
}
// Forward output from the pipe to the output fd.
while (true) {
auto remaining = end - std::chrono::steady_clock::now();
auto remaining_ms = std::chrono::duration_cast<std::chrono::milliseconds>(remaining).count();
if (timeout_ms <= 0) {
remaining_ms = -1;
} else if (remaining_ms < 0) {
log_error(output_fd, 0, "timeout expired");
return false;
}
// Wait for readable data on pipe_read.
struct pollfd pfd = {
.fd = pipe_read.get(), .events = POLLIN, .revents = 0,
};
// Block here until an event arrives or remaining_ms elapses.
rc = poll(&pfd, 1, remaining_ms);
if (rc == -1) {
if (errno == EINTR) {
continue;
} else {
log_error(output_fd, errno, "error while polling");
return false;
}
} else if (rc == 0) {
log_error(output_fd, 0, "timeout expired");
return false;
}
// Read from the pipe's read end into buf.
char buf[1024];
rc = TEMP_FAILURE_RETRY(read(pipe_read.get(), buf, sizeof(buf)));
if (rc == 0) {
// Done.
break;
} else if (rc == -1) {
log_error(output_fd, errno, "error while reading");
return false;
}
// Write the data to output_fd (per the article, the /data/anr/trace-xx.txt file).
if (!android::base::WriteFully(output_fd.get(), buf, rc)) {
log_error(output_fd, errno, "error while writing");
return false;
}
}
LOG(INFO) << TAG "done dumping process " << pid;
return true;
}
这个方法比较长,主要做了以下工作:
- 创建用于通信的socket,并连接到tombstoned的intercept socket(kTombstonedInterceptSocketName);
- 创建Pipe用于两个进程间的数据传输;
- 把pipe_write文件描述符通过socket发送出去,这样目标进程dump好堆栈后,数据就可以通过这个pipe_write传回System Server进程;
- 通过sigqueue方法向被dump的进程发送信号(Java堆栈是SIGQUIT,否则是BIONIC_SIGNAL_DEBUGGER);
- 开启一个循环,通过poll监听pipe_read上的可读事件,对端写入数据后,System Server就可以通过pipe_read端读到数据了;
- 读到数据后写入之前创建的trace文件里;
接着我们看看被dump的进程是怎样处理SIGQUIT信号,又是怎样dump堆栈的。应用进程都是从zygote进程fork而来,zygote进程在创建虚拟机时会屏蔽主线程的SIGQUIT信号,子进程会继承父进程的信号屏蔽字,应用进程的主线程以及主线程创建的子线程也都继承了这个信号屏蔽字,因此都不会接收到SIGQUIT信号;SIGQUIT信号是由SignalCatcher线程统一处理的。接着我们看看代码。通过读取init.xxx.rc文件,zygote进程的启动从app_main的main方法开始执行;
// frameworks/base/cmds/app_process/app_main.cpp
// Excerpt of app_process's main(): builds the AppRuntime and starts ZygoteInit when
// `zygote` is set, or RuntimeInit when a class name was given.
int main(int argc, char* const argv[])
{
...
AppRuntime runtime(argv[0], computeArgBlockSize(argc, argv));
...
if (zygote) {
runtime.start("com.android.internal.os.ZygoteInit", args, zygote);
} else if (!className.isEmpty()) {
runtime.start("com.android.internal.os.RuntimeInit", args, zygote);
} else {
...
}
}
这里runtime是AppRuntime的实例,AppRuntime继承了AndroidRuntime,所以可以看AndroidRuntime的start方法;
// frameworks/base/core/jni/AndroidRuntime.cpp
int AndroidRuntime::startVm(JavaVM** pJavaVM, JNIEnv** pEnv, bool zygote, bool primary_zygote)
{
...
if (startVm(&mJavaVM, &env, zygote, primary_zygote) != 0) {
return;
}
...
// 调用Main办法
char* slashClassName = toSlashClassName(className != NULL ? className : "");
jclass startClass = env->FindClass(slashClassName);
if (startClass == NULL) {
ALOGE("JavaVM unable to locate class '%s'\n", slashClassName);
} else {
jmethodID startMeth = env->GetStaticMethodID(startClass, "main",
"([Ljava/lang/String;)V");
if (startMeth == NULL) {
ALOGE("JavaVM unable to find main() in '%s'\n", className);
} else {
env->CallStaticVoidMethod(startClass, startMeth, strArray);
}
}
...
}
startVm调用了JNI_CreateJavaVM方法:
// frameworks/base/core/jni/AndroidRuntime.cpp
// Excerpt: creates the Java VM via JNI_CreateJavaVM; logs and returns -1 if that fails.
int AndroidRuntime::startVm(JavaVM** pJavaVM, JNIEnv** pEnv, bool zygote, bool primary_zygote)
{
...
if (JNI_CreateJavaVM(pJavaVM, pEnv, &initArgs) < 0) {
ALOGE("JNI_CreateJavaVM failed\n");
return -1;
}
...
}
调用了Runtime::Create
// art/runtime/jni/java_vm_ext.cc
// Excerpt: creates the ART runtime through Runtime::Create and returns JNI_ERR when
// creation fails.
extern "C" jint JNI_CreateJavaVM(JavaVM** p_vm, JNIEnv** p_env, void* vm_args) {
...
if (!Runtime::Create(options, ignore_unrecognized)) {
return JNI_ERR;
}
...
}
调用了instance_->Init
// art/runtime/runtime.cc
// Creates and initializes the runtime instance. Returns false when an instance already
// exists or when Init fails; in the latter case instance_ is reset to nullptr.
bool Runtime::Create(RuntimeArgumentMap&& runtime_options) {
  if (Runtime::instance_ == nullptr) {
    instance_ = new Runtime;
    if (instance_->Init(std::move(runtime_options))) {
      return true;
    }
    // Initialization failed: forget the half-built instance.
    instance_ = nullptr;
  }
  return false;
}
接着看看init办法
// art/runtime/runtime.cc
// Excerpt: blocks the runtime-managed signals, installs the platform signal handlers,
// and creates the JavaVMExt; returns false if the JavaVMExt cannot be created.
bool Runtime::Init(RuntimeArgumentMap&& runtime_options_in) {
...
BlockSignals(); // Blocks SIGQUIT (along with SIGPIPE and SIGUSR1) here; see BlockSignals.
InitPlatformSignalHandlers();
...
std::string error_msg;
java_vm_ = JavaVMExt::Create(this, runtime_options, &error_msg);
if (java_vm_.get() == nullptr) {
LOG(ERROR) << "Could not initialize JavaVMExt: " << error_msg;
return false;
}
...
}
看看BlockSignals是怎样做的
// art/runtime/runtime.cc
void Runtime::BlockSignals() {
SignalSet signals;
signals.Add(SIGPIPE);
// SIGQUIT is used to dump the runtime's state (including stack traces).
signals.Add(SIGQUIT);
// SIGUSR1 is used to initiate a GC.
signals.Add(SIGUSR1);
signals.Block();
}
// art/runtime/signal_set.h
// Adds set_ to the calling thread's blocked-signal mask; aborts via PLOG(FATAL) on failure.
void Block() {
  const int rc = pthread_sigmask64(SIG_BLOCK, &set_, nullptr);
  if (rc != 0) {
    PLOG(FATAL) << "pthread_sigmask failed";
  }
}
可以看到这里先定义了一个SignalSet对象,然后加入了3个信号,其中就有SIGQUIT信号;最后调用了SignalSet的Block方法,这个方法就是通过pthread_sigmask64给当前线程设置屏蔽的信号。pthread_sigmask屏蔽信号的用法可以看这里。既然SIGQUIT信号被屏蔽了,那我们的进程又是在哪里处理这个信号的呢?之前我们说SIGQUIT信号会统一交给SignalCatcher线程处理,所以接着看看它是怎样处理的。在应用进程创建完成后,会去启动SignalCatcher线程,应用进程的创建过程可以看这里。
// art/runtime/runtime.cc
// Creates the SignalCatcher, except when this runtime is the zygote.
void Runtime::StartSignalCatcher() {
  if (is_zygote_) {
    return;
  }
  // 1. Create the SignalCatcher object.
  signal_catcher_ = new SignalCatcher();
}
//art/runtime/signal_catcher.cc
// Spawns the signal catcher pthread (running Run) and then waits on cond_ until that
// thread has published its Thread* in thread_.
SignalCatcher::SignalCatcher()
: lock_("SignalCatcher lock"),
cond_("SignalCatcher::cond_", lock_),
thread_(nullptr) {
SetHaltFlag(false);
// Create a raw pthread; its start routine will attach to the runtime.
CHECK_PTHREAD_CALL(pthread_create, (&pthread_, nullptr, &Run, this), "signal catcher thread"); // 2. Create a thread that executes Run.
Thread* self = Thread::Current();
MutexLock mu(self, lock_);
// Run sets thread_ and broadcasts cond_ once it has attached.
while (thread_ == nullptr) {
cond_.Wait(self);
}
}
void* SignalCatcher::Run(void* arg) {
SignalCatcher* signal_catcher = reinterpret_cast<SignalCatcher*>(arg)
CHECK(signal_catcher != nullptr);
Runtime* runtime = Runtime::Current();
// 3. attach到运行时
CHECK(runtime->AttachCurrentThread("Signal Catcher", true, runtime->GetSystemThreadGroup(),
!runtime->IsAotCompiler()));
Thread* self = Thread::Current();
DCHECK_NE(self->GetState(), ThreadState::kRunnable);
{
MutexLock mu(self, signal_catcher->lock_);
signal_catcher->thread_ = self;
signal_catcher->cond_.Broadcast(self)
}
// 4. 这儿设置要监听的信信号
// Set up mask with signals we want to handle.
SignalSet signals;
signals.Add(SIGQUIT); // ANR信号
signals.Add(SIGUSR1); // 这个是和GC相关的
while (true) {
// 5. 这儿会堵塞等待上面的两个信号到来
int signal_number = signal_catcher->WaitForSignal(self, signals)
if (signal_catcher->ShouldHalt()) {
runtime->DetachCurrentThread();
return nullptr
}
// 6. 信号来了,则会走到这儿,履行信号的处理流程
switch (signal_number) {
case SIGQUIT
signal_catcher->HandleSigQuit()
break;
case SIGUSR1:
signal_catcher->HandleSigUsr1();
break;
default:
LOG(ERROR) << "Unexpected signal %d" << signal_number;
break
}
}
}
可以看到这里就是新创建一个线程,并attach到Java运行时,然后设置要监听的两个信号,接着开启循环,通过WaitForSignal等待信号的到来,信号到来则调用相应的信号处理函数;接着我们看看SIGQUIT信号是怎样处理的。
// Excerpt: builds the textual dump for SIGQUIT in a string stream (header with pid and
// date, command line, build fingerprint, ABI, build type, then the runtime's own dump)
// and passes the result to Output.
void SignalCatcher::HandleSigQuit() {
Runtime* runtime = Runtime::Current();
std::ostringstream os;
os << "\n"
<< "----- pid " << getpid() << " at " << GetIsoDate() << " -----\n";
DumpCmdLine(os);
// Note: The strings "Build fingerprint:" and "ABI:" are chosen to match the format used by
// debuggerd. This allows, for example, the stack tool to work.
std::string fingerprint = runtime->GetFingerprint();
os << "Build fingerprint: '" << (fingerprint.empty() ? "unknown" : fingerprint) << "'\n";
os << "ABI: '" << GetInstructionSetString(runtime->GetInstructionSet()) << "'\n";
os << "Build type: " << (kIsDebugBuild ? "debug" : "optimized") << "\n";
runtime->DumpForSigQuit(os); // 1. The key call.
...
os << "----- end " << getpid() << " -----\n";
Output(os.str());
}
可以看到这里输出的就是我们在ANR的trace文件里看到的内容。接着看这里的关键代码DumpForSigQuit:
void Runtime::DumpForSigQuit(std::ostream& os) {
// Print backtraces first since they are important do diagnose ANRs,
// and ANRs can often be trimmed to limit upload size.
thread_list_->DumpForSigQuit(os); // 线程仓库
GetClassLinker()->DumpForSigQuit(os); // 加载的类
GetInternTable()->DumpForSigQuit(os); // 常量池
GetJavaVM()->DumpForSigQuit(os);
GetHeap()->DumpForSigQuit(os); // Java heap的运用情况
oat_file_manager_->DumpForSigQuit(os);
if (GetJit() != nullptr) {
GetJit()->DumpForSigQuit(os);
} else {
os << "Running non JIT\n";
}
DumpDeoptimizations(os);
TrackedAllocators::Dump(os)
GetMetrics()->DumpForSigQuit(os);
os << "\n";
BaseMutex::DumpAll(os);
// Inform anyone else who is interested in SigQuit.
{
ScopedObjectAccess soa(Thread::Current());
callbacks_->SigQuit();
}
}
这里dump了很多信息,我们重点看关键的线程堆栈信息;
// Excerpt: forwards to Dump(os, dump_native_stack).
// NOTE(review): the inner `{` opened below is not closed in this excerpt; its closing brace
// is presumably part of the elided code -- confirm against upstream.
void ThreadList::DumpForSigQuit(std::ostream& os) {
{
...
Dump(os, dump_native_stack);
...
}
// art/runtime/thread_list.cc
// Excerpt: the only visible step is writing the checkpoint's collected output to os.
void ThreadList::Dump(std::ostream& os, bool dump_native_stack) {
...
// 1. Emit the collected dumps. NOTE(review): the article describes this step as waiting for
// all threads to reach a safepoint; that wait is not visible in this excerpt -- confirm.
checkpoint.Dump(self, os);
...
}
// art/runtime/thread_list.cc
// Writes every collected per-thread dump stored in os_ to os, holding lock_ meanwhile.
void Dump(Thread* self, std::ostream& os) {
  MutexLock mu(self, lock_);
  for (const auto& entry : os_) {
    // Emit one dump followed by a line break (std::endl also flushes).
    const std::string text = entry.second.str();
    os << text << std::endl;
  }
}
// art/runtime/thread_list.cc
// Excerpt: dumps one thread into a local stream and stores that stream in os_ under
// sort_key.
void Run(Thread* thread) override {
...
// Dump this thread's stack. NOTE(review): the article says this runs once the thread is
// suspended or has reached the checkpoint -- not visible here, confirm.
dump_order = thread->Dump(local_os, unwinder_, dump_native_stack_);
...
os_.emplace(sort_key, std::move(local_os));
...
}
//art/runtime/thread.cc
// Dumps this thread to os: the state header first, then the stacks. Returns the dump order
// reported by DumpStack.
Thread::DumpOrder Thread::Dump(std::ostream& os,
                               unwindstack::AndroidLocalUnwinder& unwinder,
                               bool dump_native_stack,
                               bool force_dump_stack) const {
  DumpState(os);
  // The stack dump starts here.
  const DumpOrder order = DumpStack(os, unwinder, dump_native_stack, force_dump_stack);
  return order;
}
// Excerpt: dumps the native stack and then the Java stack of this thread, returning the
// dump order produced by DumpJavaStack.
// Fix: restored the missing `;` after the DumpJavaStack call.
Thread::DumpOrder Thread::DumpStack(std::ostream& os,
                                    unwindstack::AndroidLocalUnwinder& unwinder,
                                    bool dump_native_stack,
                                    bool force_dump_stack) const {
  ...
  // Dump the native stack.
  DumpNativeStack(os, unwinder, GetTid(), " native: ", method);
  ...
  // Dump the Java stack.
  dump_order = DumpJavaStack(os,
                             /*check_suspended=*/ !force_dump_stack,
                             /*dump_locks=*/ !force_dump_stack);
  return dump_order;
}
// art/runtime/native_stack_dump.cc
void DumpNativeStack(std::ostream& os,
unwindstack::AndroidLocalUnwinder& unwinder,
pid_t tid,
const char* prefix,
ArtMethod* current_method,
void* ucontext_ptr,
bool skip_frames) {
// Historical note: This was disabled when running under Valgrind (b/18119146).
unwindstack::AndroidUnwinderData data(!skip_frames /*show_all_frames*/);
bool unwind_ret;
// 1.先回溯仓库
if (ucontext_ptr != nullptr) {
unwind_ret = unwinder.Unwind(ucontext_ptr, data);
} else {
unwind_ret = unwinder.Unwind(tid, data);
}
if (!unwind_ret) {
os << prefix << "(Unwind failed for thread " << tid << ": "
<< data.GetErrorString() << ")" << std::endl;
return;
}
// Check whether we have and should use addr2line.
bool use_addr2line;
if (kUseAddr2line) {
// Try to run it to see whether we have it. Push an argument so that it doesn't assume a.out
// and print to stderr.
use_addr2line = (gAborting > 0) && RunCommand(FindAddr2line() + " -h");
} else {
use_addr2line = false;
}
std::unique_ptr<Addr2linePipe> addr2line_state;
// 2. demangle
data.DemangleFunctionNames();
bool holds_mutator_lock = Locks::mutator_lock_->IsSharedHeld(Thread::Current());
for (const unwindstack::FrameData& frame : data.frames) {
// We produce output like this:
// ] #00 pc 000075bb8 /system/lib/libc.so (unwind_backtrace_thread+536)
// In order for parsing tools to continue to function, the stack dump
// format must at least adhere to this format:
// #XX pc <RELATIVE_ADDR> <FULL_PATH_TO_SHARED_LIBRARY> ...
// The parsers require a single space before and after pc, and two spaces
// after the <RELATIVE_ADDR>. There can be any prefix data before the
// #XX. <RELATIVE_ADDR> has to be a hex number but with no 0x prefix.
os << prefix << StringPrintf("#%02zu pc ", frame.num);
bool try_addr2line = false;
// 3. 结构每一帧的信息
if (frame.map_info == nullptr)
os << StringPrintf("%08" PRIx64 " ???", frame.pc);
} else {
os << StringPrintf("%08" PRIx64 " ", frame.rel_pc)
const std::shared_ptr<unwindstack::MapInfo>& map_info = frame.map_info;
// so文件名
if (map_info->name().empty()) {
os << StringPrintf("<anonymous:%" PRIx64 ">", map_info->start())
} else {
os << map_info->name().c_str();
}
if (map_info->elf_start_offset() != 0) {
os << StringPrintf(" (offset %" PRIx64 ")", map_info->elf_start_offset());
}
os << " (";
// 办法名 + 办法偏移
if (!frame.function_name.empty()) {
// Remove parameters from the printed function name to improve signal/noise in the logs.
// Also, ANRs are often trimmed, so printing less means we get more useful data out.
// We can still symbolize the function based on the PC and build-id (including inlining).
os << StripParameters(frame.function_name.c_str());
if (frame.function_offset != 0) {
os << "+" << frame.function_offset;
}
// Functions found using the gdb jit interface will be in an empty
// map that cannot be found using addr2line
if (!map_info->name().empty()) {
try_addr2line = true;
}
} else if (current_method != nullptr && holds_mutator_lock) {
const OatQuickMethodHeader* header = current_method->GetOatQuickMethodHeader(frame.pc)
if (header != nullptr) {
const void* start_of_code = header->GetCode();
os << current_method->JniLongName() << "+"
<< (frame.pc - reinterpret_cast<uint64_t>(start_of_code))
} else {
os << "???";
}
} else {
os << "???";
}
os << ")";
// buildId
std::string build_id = map_info->GetPrintableBuildID();
if (!build_id.empty()) {
os << " (BuildId: " << build_id << ")";
}
}
os << std::endl;
// 运用addr2line复原
if (try_addr2line && use_addr2line) {
// Guaranteed that map_info is not nullptr and name is non-empty.
Addr2line(frame.map_info->name(), frame.rel_pc, os, prefix, &addr2line_state);
}
}
if (addr2line_state != nullptr) {
Drain(0, prefix, &addr2line_state, os);
}
}
// art/runtime/thread.cc
// Excerpt: walks this thread's Java stack with a StackDumpVisitor that writes to os.
// Fix: restored the missing `;` after the ScopedExceptionStorage declaration.
Thread::DumpOrder Thread::DumpJavaStack(std::ostream& os,
                                        bool check_suspended,
                                        bool dump_locks) const {
  ...
  ScopedExceptionStorage ses(Thread::Current());
  std::unique_ptr<Context> context(Context::Create());
  // Build a StackDumpVisitor.
  StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(),
                          !tls32_.throwing_OutOfMemoryError, check_suspended, dump_locks);
  // Walk the stack.
  dumper.WalkStack();
  ...
}
这里遍历线程列表,等线程跑到安全点上,由线程自己执行dump堆栈的操作,先后dump了native堆栈和java堆栈。最后看一下这些信息是怎样传回给SystemServer进程的。在HandleSigQuit方法结尾,我们看到有一个Output方法
// Excerpt: switches the thread state to kWaitingForSignalCatcherOutput for the duration of
// the call and writes the dump text through PaletteWriteCrashThreadStacks.
void SignalCatcher::Output(const std::string& s) {
ScopedThreadStateChange tsc(Thread::Current(), ThreadState::kWaitingForSignalCatcherOutput);
palette_status_t status = PaletteWriteCrashThreadStacks(s.data(), s.size());
...
}
这儿调用了PaletteWriteCrashThreadStacks写数据;
// Writes the given stack dump text to the output fd obtained from tombstoned.
// Returns PALETTE_STATUS_OK on success. If the connection fails, the text is logged
// instead and PALETTE_STATUS_FAILED_CHECK_LOG is returned; write, sync, close or
// completion-notification failures also yield PALETTE_STATUS_FAILED_CHECK_LOG.
// Fixes: restored the missing `;` on the final return, and corrected the comment that said
// the socket connects to SystemServer (the code calls tombstoned_connect).
palette_status_t PaletteWriteCrashThreadStacks(/*in*/ const char* stacks, size_t stacks_len) {
  android::base::unique_fd tombstone_fd;
  android::base::unique_fd output_fd;

  // Connect to tombstoned to obtain the fd the dump should be written to.
  if (!tombstoned_connect(getpid(), &tombstone_fd, &output_fd, kDebuggerdJavaBacktrace)) {
    // Failure here could be due to file descriptor resource exhaustion
    // so write the stack trace message to the log in case it helps
    // debug that.
    LOG(INFO) << std::string_view(stacks, stacks_len);
    // tombstoned_connect() logs failure reason.
    return PALETTE_STATUS_FAILED_CHECK_LOG;
  }

  palette_status_t status = PALETTE_STATUS_OK;
  // Write the data; on failure truncate the output so no partial dump is left behind.
  if (!android::base::WriteFully(output_fd, stacks, stacks_len)) {
    PLOG(ERROR) << "Failed to write tombstoned output";
    TEMP_FAILURE_RETRY(ftruncate(output_fd, 0));
    status = PALETTE_STATUS_FAILED_CHECK_LOG;
  }

  // EINVAL from fdatasync is tolerated; any other sync failure truncates the output.
  if (TEMP_FAILURE_RETRY(fdatasync(output_fd)) == -1 && errno != EINVAL) {
    // Only report the first failure encountered.
    if (status == PALETTE_STATUS_OK) {
      PLOG(ERROR) << "Failed to fsync tombstoned output";
      status = PALETTE_STATUS_FAILED_CHECK_LOG;
    }
    TEMP_FAILURE_RETRY(ftruncate(output_fd, 0));
    TEMP_FAILURE_RETRY(fdatasync(output_fd));
  }

  // Close the output fd (released first so unique_fd does not close it again).
  if (close(output_fd.release()) == -1 && errno != EINTR) {
    if (status == PALETTE_STATUS_OK) {
      PLOG(ERROR) << "Failed to close tombstoned output";
      status = PALETTE_STATUS_FAILED_CHECK_LOG;
    }
  }

  // Tell tombstoned that the dump is complete.
  if (!tombstoned_notify_completion(tombstone_fd)) {
    // tombstoned_notify_completion() logs failure.
    status = PALETTE_STATUS_FAILED_CHECK_LOG;
  }
  return status;
}
发送数据是通过tombstoned_connect拿到的output_fd写出去的。至此,整个过程就完成了。我们来总结一下:
- ActivityManagerService在启动service时会埋炸弹;
- 如果App进程能及时告知ActivityManagerService启动完成则拆除炸弹,否则指定时间到达时则引爆炸弹;
- 引爆炸弹的过程首先是收集需要dump堆栈的进程id,然后通过socket连接tombstoned并注册接收数据的管道,再给这些进程发送SIGQUIT信号;
- 收到信号的进程会dump所有线程的堆栈,然后将结果发送回SystemServer进程;
- 除了堆栈信息外,还会收集进程的cpu使用率信息。
这篇只记录了Service的ANR流程,后面几篇我们将会跟踪其他3种类型ANR的执行流程。