tombstone的抓取与debuggerd有关。debuggerd是一个守护进程,用来检测程序的崩溃,将程序崩溃前进程的状态记录下来,保存在/data/tombstones文件夹下,最多保留10个;本质上是对程序崩溃时某些信号的拦截
相关流程
客户端流程
首先,Android程序的入口有一个linker的操作,大致流程如下:
1 2 3 4 5 6 7 8 |
bionic/linker/arch/arm64/begin.S 31ENTRY(_start) 32 mov x0, sp 33 bl __linker_init 34 35 /* linker init returns the _entry address in the main image */ 36 br x0 37END(_start) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
bionic/linker/linker.cpp 4442/* 4443 * This is the entry point for the linker, called from begin.S. This 4444 * method is responsible for fixing the linker's own relocations, and 4445 * then calling __linker_init_post_relocation(). 4446 * 4447 * Because this method is called before the linker has fixed it's own 4448 * relocations, any attempt to reference an extern variable, extern 4449 * function, or other GOT reference will generate a segfault. 4450 */ 4451extern "C" ElfW(Addr) __linker_init(void* raw_args) { ... 4522 // We have successfully fixed our own relocations. It's safe to run 4523 // the main part of the linker now. 4524 args.abort_message_ptr = &g_abort_message; 4525 ElfW(Addr) start_address = __linker_init_post_relocation(args, linker_addr); 4526 4527 INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address)); 4528 4529 // Return the address that the calling assembly stub should jump to. 4530 return start_address; 4531} 4195/* 4196 * This code is called after the linker has linked itself and 4197 * fixed it's own GOT. It is safe to make references to externs 4198 * and other non-local data at this point. 4199 */ 4200static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) { 4201#if TIMING 4202 struct timeval t0, t1; 4203 gettimeofday(&t0, 0); 4204#endif 4205 4206 // Sanitize the environment. 4207 __libc_init_AT_SECURE(args); 4208 4209 // Initialize system properties 4210 __system_properties_init(); // may use 'environ' 4211 4212 debuggerd_init(); 4213 4214 // Get a few environment variables. 4215 const char* LD_DEBUG = getenv("LD_DEBUG"); 4216 if (LD_DEBUG != nullptr) { 4217 g_ld_debug_verbosity = atoi(LD_DEBUG); 4218 } ... 4412} |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
bionic/linker/debugger.cpp 302__LIBC_HIDDEN__ void debuggerd_init() { 303 struct sigaction action; 304 memset(&action, 0, sizeof(action)); 305 sigemptyset(&action.sa_mask); 306 action.sa_sigaction = debuggerd_signal_handler; 307 action.sa_flags = SA_RESTART | SA_SIGINFO; 308 309 // Use the alternate signal stack if available so we can catch stack overflows. 310 action.sa_flags |= SA_ONSTACK; 311 312 sigaction(SIGABRT, &action, nullptr); 313 sigaction(SIGBUS, &action, nullptr); 314 sigaction(SIGFPE, &action, nullptr); 315 sigaction(SIGILL, &action, nullptr); 316 sigaction(SIGSEGV, &action, nullptr); 317#if defined(SIGSTKFLT) 318 sigaction(SIGSTKFLT, &action, nullptr); 319#endif 320 sigaction(SIGTRAP, &action, nullptr); 321} |
为上面这几个信号注册信号处理函数,也就是说只有这几个信号会生成tombstone
SIGILL(非法指令异常)
SIGABRT(abort退出异常)
SIGBUS(硬件访问异常)
SIGFPE(浮点运算异常)
SIGSEGV(内存访问异常)
SIGSTKFLT(协处理器栈异常)
SIGTRAP(断点/跟踪陷阱异常,通常由调试断点或陷阱指令触发,平时不常见)
信号处理函数为:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
258/* 259 * Catches fatal signals so we can ask debuggerd to ptrace us before 260 * we crash. 261 */ 262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) { 263 // It's possible somebody cleared the SA_SIGINFO flag, which would mean 264 // our "info" arg holds an undefined value. 265 if (!have_siginfo(signal_number)) { 266 info = nullptr; 267 } 268 269 log_signal_summary(signal_number, info); 270 271 send_debuggerd_packet(info); //发送请求 第一次接受到信号是向debuggerd服务端发送请求,等待回应表示链接上了 272 273 // We need to return from the signal handler so that debuggerd can dump the 274 // thread that crashed, but returning here does not guarantee that the signal 275 // will be thrown again, even for SIGSEGV and friends, since the signal could 276 // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to 277 // preserve the SA_SIGINFO contents. 278 signal(signal_number, SIG_DFL); //将信号处理函数置空 279 280 struct siginfo si; 281 if (!info) { 282 memset(&si, 0, sizeof(si)); 283 si.si_code = SI_USER; 284 si.si_pid = getpid(); 285 si.si_uid = getuid(); 286 info = &si; 287 } else if (info->si_code >= 0 || info->si_code == SI_TKILL) { 288 // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels 289 // that contain commit 66dd34a (3.9+). The manpage claims to only allow 290 // negative si_code values that are not SI_TKILL, but 66dd34a changed the 291 // check to allow all si_code values in calls coming from inside the house. 292 } 293 294 int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info); //给自己的相关线程再发送一次信号 295 if (rc != 0) { 296 __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s", 297 strerror(errno)); 298 _exit(0); 299 } 300} |
客户端向debuggerd发送信息,并等待回应,通过socket的write & read完成
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
208static void send_debuggerd_packet(siginfo_t* info) { 209 // Mutex to prevent multiple crashing threads from trying to talk 210 // to debuggerd at the same time. 211 static pthread_mutex_t crash_mutex = PTHREAD_MUTEX_INITIALIZER; 212 int ret = pthread_mutex_trylock(&crash_mutex); 213 if (ret != 0) { 214 if (ret == EBUSY) { 215 __libc_format_log(ANDROID_LOG_INFO, "libc", 216 "Another thread contacted debuggerd first; not contacting debuggerd."); 217 // This will never complete since the lock is never released. 218 pthread_mutex_lock(&crash_mutex); 219 } else { 220 __libc_format_log(ANDROID_LOG_INFO, "libc", 221 "pthread_mutex_trylock failed: %s", strerror(ret)); 222 } 223 return; 224 } 225 226 int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC); 227 if (s == -1) { 228 __libc_format_log(ANDROID_LOG_FATAL, "libc", "Unable to open connection to debuggerd: %s", 229 strerror(errno)); 230 return; 231 } 232 233 // debuggerd knows our pid from the credentials on the 234 // local socket but we need to tell it the tid of the crashing thread. 235 // debuggerd will be paranoid and verify that we sent a tid 236 // that's actually in our process. 237 debugger_msg_t msg; 238 msg.action = DEBUGGER_ACTION_CRASH; 239 msg.tid = gettid(); 240 msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message); 241 msg.original_si_code = (info != nullptr) ? info->si_code : 0; 242 ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg))); 243 if (ret == sizeof(msg)) { 244 char debuggerd_ack; 245 ret = TEMP_FAILURE_RETRY(read(s, &debuggerd_ack, 1)); 246 int saved_errno = errno; 247 notify_gdb_of_libraries(); 248 errno = saved_errno; 249 } else { 250 // read or write failed -- broken connection? 251 __libc_format_log(ANDROID_LOG_FATAL, "libc", "Failed while talking to debuggerd: %s", 252 strerror(errno)); 253 } 254 255 close(s); 256} |
debuggerd服务端启动,dump流程
debuggerd守护进程如何启动?debuggerd也可以通过 debuggerd -b <tid> 的方式运行,此时它不是作为守护进程,而是显式dump指定线程的backtrace,我们暂且不去说它,只说不带参数的正常启动模式(作为服务端)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
941int main(int argc, char** argv) { 942 union selinux_callback cb; 943 if (argc == 1) { 944 cb.func_audit = audit_callback; 945 selinux_set_callback(SELINUX_CB_AUDIT, cb); 946 cb.func_log = selinux_log_callback; 947 selinux_set_callback(SELINUX_CB_LOG, cb); 948 return do_server(); 949 } 950 951 bool dump_backtrace = false; 952 bool have_tid = false; 953 pid_t tid = 0; 954 for (int i = 1; i < argc; i++) { 955 if (!strcmp(argv[i], "-b")) { 956 dump_backtrace = true; 957 } else if (!have_tid) { 958 tid = atoi(argv[i]); 959 have_tid = true; 960 } else { 961 usage(); 962 return 1; 963 } 964 } 965 if (!have_tid) { 966 usage(); 967 return 1; 968 } 969 return do_explicit_dump(tid, dump_backtrace); 970} |
启动一个debuggerd服务端
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
849static int do_server() { 850 // debuggerd crashes can't be reported to debuggerd. 851 // Reset all of the crash handlers. 852 signal(SIGABRT, SIG_DFL); 853 signal(SIGBUS, SIG_DFL); 854 signal(SIGFPE, SIG_DFL); 855 signal(SIGILL, SIG_DFL); 856 signal(SIGSEGV, SIG_DFL); 857#ifdef SIGSTKFLT 858 signal(SIGSTKFLT, SIG_DFL); 859#endif 860 signal(SIGTRAP, SIG_DFL); 861 862 // Ignore failed writes to closed sockets 863 signal(SIGPIPE, SIG_IGN); //将debuggerd本身的crash忽略 864 865 // Block SIGCHLD so we can sigtimedwait for it. 866 sigset_t sigchld; 867 sigemptyset(&sigchld); 868 sigaddset(&sigchld, SIGCHLD); 869 sigprocmask(SIG_SETMASK, &sigchld, nullptr); 870 871 int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT, 872 SOCK_STREAM | SOCK_CLOEXEC); //创建一个服务端,等待客户端连接 873 if (s == -1) return 1; 874 875 typedef void (*NativeDebugInit)(void); 876 static NativeDebugInit s_func_ptr = NULL; 877 if(!s_func_ptr) { 878 void* handle = dlopen("libmiuindbg.so",RTLD_NOW); 879 if(handle) { 880 s_func_ptr = (NativeDebugInit)dlsym(handle,"hook_context_do_hook"); 881 } 882 } 883 884 if(s_func_ptr) { 885 s_func_ptr(); 886 } 887 888 // Fork a process that stays root, and listens on a pipe to pause and resume the target. 889 if (!start_signal_sender()) { 890 ALOGE("debuggerd: failed to fork signal sender"); 891 return 1; 892 } 893 894 ALOGI("debuggerd: starting\n"); 895 896 for (;;) { 897 sockaddr_storage ss; 898 sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss); 899 socklen_t alen = sizeof(ss); 900 901 ALOGV("waiting for connection\n"); 902 int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC); 903 if (fd == -1) { 904 ALOGE("accept failed: %s\n", strerror(errno)); 905 continue; 906 } 907 908 handle_request(fd); //处理客户端的请求 909 } 910 return 0; 911} |
处理客户端发来的请求
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
808static void handle_request(int fd) { 809 ALOGV("handle_request(%d)\n", fd); 810 811 ScopedFd closer(fd); 812 debugger_request_t request; 813 memset(&request, 0, sizeof(request)); 814 int status = read_request(fd, &request); //读取客户端的请求 815 if (status != 0) { 816 return; 817 } 818 819 ALOGW("debuggerd: handling request: pid=%d uid=%d gid=%d tid=%d\n", request.pid, request.uid, 820 request.gid, request.tid); 821 822#if defined(__LP64__) 823 // On 64 bit systems, requests to dump 32 bit and 64 bit tids come 824 // to the 64 bit debuggerd. If the process is a 32 bit executable, 825 // redirect the request to the 32 bit debuggerd. 826 if (is32bit(request.tid)) { 827 // Only dump backtrace and dump tombstone requests can be redirected. 828 if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE || 829 request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) { 830 redirect_to_32(fd, &request); 831 } else { 832 ALOGE("debuggerd: Not allowed to redirect action %d to 32 bit debuggerd\n", request.action); 833 } 834 return; 835 } 836#endif 837 838 // Fork a child to handle the rest of the request. 839 pid_t fork_pid = fork(); 840 if (fork_pid == -1) { 841 ALOGE("debuggerd: failed to fork: %s\n", strerror(errno)); 842 } else if (fork_pid == 0) { 843 worker_process(fd, request); //处理request 844 } else { 845 monitor_worker_process(fork_pid, request); 846 } 847} |
read客户端发来的信息
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
197static int read_request(int fd, debugger_request_t* out_request) { 198 ucred cr; 199 socklen_t len = sizeof(cr); 200 int status = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len); 201 if (status != 0) { 202 ALOGE("cannot get credentials"); 203 return -1; 204 } 205 206 ALOGV("reading tid"); 207 fcntl(fd, F_SETFL, O_NONBLOCK); 208 209 pollfd pollfds[1]; 210 pollfds[0].fd = fd; 211 pollfds[0].events = POLLIN; 212 pollfds[0].revents = 0; 213 status = TEMP_FAILURE_RETRY(poll(pollfds, 1, 3000)); //轮询fd句柄 215 ALOGE("timed out reading tid (from pid=%d uid=%d)\n", cr.pid, cr.uid); 216 return -1; 217 } 218 219 debugger_msg_t msg; 220 memset(&msg, 0, sizeof(msg)); 221 status = TEMP_FAILURE_RETRY(read(fd, &msg, sizeof(msg))); //读取客户端信息 222 if (status < 0) { 223 ALOGE("read failure? %s (pid=%d uid=%d)\n", strerror(errno), cr.pid, cr.uid); 224 return -1; 225 } 226 if (status != sizeof(debugger_msg_t)) { 227 ALOGE("invalid crash request of size %d (from pid=%d uid=%d)\n", status, cr.pid, cr.uid); 228 return -1; 229 } 230 231 out_request->action = static_cast<debugger_action_t>(msg.action); 232 out_request->tid = msg.tid; 233 out_request->pid = cr.pid; 234 out_request->uid = cr.uid; 235 out_request->gid = cr.gid; 236 out_request->abort_msg_address = msg.abort_msg_address; 237 out_request->original_si_code = msg.original_si_code; 238 239 if (msg.action == DEBUGGER_ACTION_CRASH) { 240 // Ensure that the tid reported by the crashing process is valid. 241 // This check needs to happen again after ptracing the requested thread to prevent a race. 242 if (!pid_contains_tid(out_request->pid, out_request->tid)) { 243 ALOGE("tid %d does not exist in pid %d. ignoring debug request\n", out_request->tid, 244 out_request->pid); 245 return -1; 246 } 247 } else if (cr.uid == 0 || (cr.uid == AID_SYSTEM && msg.action == DEBUGGER_ACTION_DUMP_BACKTRACE)) { 248 // Only root or system can ask us to attach to any process and dump it explicitly. 
249 // However, system is only allowed to collect backtraces but cannot dump tombstones. 250 status = get_process_info(out_request->tid, &out_request->pid, 251 &out_request->uid, &out_request->gid); 252 if (status < 0) { 253 ALOGE("tid %d does not exist. ignoring explicit dump request\n", out_request->tid); 254 return -1; 255 } 256 257 if (!selinux_action_allowed(fd, out_request)) 258 return -1; 259 } else { 260 // No one else is allowed to dump arbitrary processes. 261 return -1; 262 } 263 return 0; 264} |
整体的dump流程
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
566static void worker_process(int fd, debugger_request_t& request) { 567 // Open the tombstone file if we need it. 568 std::string tombstone_path; 569 int tombstone_fd = -1; 570 switch (request.action) { 571 case DEBUGGER_ACTION_DUMP_TOMBSTONE: 572 case DEBUGGER_ACTION_CRASH: 573 tombstone_fd = open_tombstone(&tombstone_path); 574 if (tombstone_fd == -1) { 575 ALOGE("debuggerd: failed to open tombstone file: %s\n", strerror(errno)); 576 exit(1); 577 } 578 break; 579 580 case DEBUGGER_ACTION_DUMP_BACKTRACE: 581 break; 582 583 default: 584 ALOGE("debuggerd: unexpected request action: %d", request.action); 585 exit(1); 586 } 587 588 // At this point, the thread that made the request is blocked in 589 // a read() call. If the thread has crashed, then this gives us 590 // time to PTRACE_ATTACH to it before it has a chance to really fault. 591 // 592 // The PTRACE_ATTACH sends a SIGSTOP to the target process, but it 593 // won't necessarily have stopped by the time ptrace() returns. (We 594 // currently assume it does.) We write to the file descriptor to 595 // ensure that it can run as soon as we call PTRACE_CONT below. 596 // See details in bionic/libc/linker/debugger.c, in function 597 // debugger_signal_handler(). 598 599 // Attach to the target process. //通过ptrace监控子进程(要crash的应用进程),此时debuggerd变为其父进程,向应用进程发送sigstop;以后应用进程接受到的signal会先发到父进程 600 if (!ptrace_attach_thread(request.pid, request.tid)) { 601 ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno)); 602 exit(1); 603 } 604 605 // DEBUGGER_ACTION_CRASH requests can come from arbitrary processes and the tid field in the 606 // request is sent from the other side. If an attacker can cause a process to be spawned with the 607 // pid of their process, they could trick debuggerd into dumping that process by exiting after 608 // sending the request. Validate the trusted request.uid/gid to defend against this. 
609 if (request.action == DEBUGGER_ACTION_CRASH) { 610 pid_t pid; 611 uid_t uid; 612 gid_t gid; 613 if (get_process_info(request.tid, &pid, &uid, &gid) != 0) { 614 ALOGE("debuggerd: failed to get process info for tid '%d'", request.tid); 615 exit(1); 616 } 617 618 if (pid != request.pid || uid != request.uid || gid != request.gid) { 619 ALOGE( 620 "debuggerd: attached task %d does not match request: " 621 "expected pid=%d,uid=%d,gid=%d, actual pid=%d,uid=%d,gid=%d", 622 request.tid, request.pid, request.uid, request.gid, pid, uid, gid); 623 exit(1); 624 } 625 } 626 627 // Don't attach to the sibling threads if we want to attach gdb. 628 // Supposedly, it makes the process less reliable. 629 bool attach_gdb = should_attach_gdb(request); 630 if (attach_gdb) { 631 // Open all of the input devices we need to listen for VOLUMEDOWN before dropping privileges. 632 if (init_getevent() != 0) { 633 ALOGE("debuggerd: failed to initialize input device, not waiting for gdb"); 634 attach_gdb = false; 635 } 636 637 } 638 639 std::set<pid_t> siblings; 640 if (!attach_gdb) { 641 ptrace_siblings(request.pid, request.tid, siblings); 642 } 643 644 // Generate the backtrace map before dropping privileges. 645 std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid)); 646 647 int amfd = -1; 648 std::unique_ptr<std::string> amfd_data; 649 if (request.action == DEBUGGER_ACTION_CRASH) { 650 // Connect to the activity manager before dropping privileges. 651 amfd = activity_manager_connect(); 652 amfd_data.reset(new std::string); 653 } 654 655 // Collect the list of open files. 656 OpenFilesList open_files; 657 populate_open_files_list(request.pid, &open_files); 658 659 bool succeeded = false; 660 661 // Now that we've done everything that requires privileges, we can drop them. 
662 if (!drop_privileges()) { 663 ALOGE("debuggerd: failed to drop privileges, exiting"); 664 _exit(1); 665 } 666 667 int crash_signal = SIGKILL; 668 succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings, 669 &crash_signal, &open_files, amfd_data.get()); 670 if (succeeded) { 671 if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) { 672 if (!tombstone_path.empty()) { 673 android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length()); //将dump结果写到相关路径下 674 } 675 } 676 } 677 678 if (attach_gdb || request.action == DEBUGGER_ACTION_CRASH) { 679 // Before detach we must send SIGSTOP to the target. 680 // Tell the signal process to send SIGSTOP to the target. 681 if (!send_signal(request.pid, 0, SIGSTOP)) { 682 ALOGE("debuggerd: failed to stop process for gdb attach: %s", strerror(errno)); 683 attach_gdb = false; 684 } 685 } 686 687 if (!attach_gdb) { 688 // Tell the Activity Manager about the crashing process. If we are 689 // waiting for gdb to attach, do not send this or Activity Manager 690 // might kill the process before anyone can attach. 691 activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get()); 692 } 693 694 if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) { //detach客户端 695 ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno)); 696 } 697 698 for (pid_t sibling : siblings) { 699 ptrace(PTRACE_DETACH, sibling, 0, 0); 700 } 701 702 // Send the signal back to the process if it crashed and we're not waiting for gdb. 703 if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) { 704 if (!send_signal(request.pid, request.tid, crash_signal)) { 705 ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno)); 706 } 707 } 708 709 // Wait for gdb, if requested. 710 if (attach_gdb) { 711 wait_for_user_action(request); 712 713 // Now tell the activity manager about this process. 
714 activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get()); 715 716 // Tell the signal process to send SIGCONT to the target. 717 if (!send_signal(request.pid, 0, SIGCONT)) { 718 ALOGE("debuggerd: failed to resume process %d: %s", request.pid, strerror(errno)); 719 } 720 721 uninit_getevent(); 722 } 723 724 close(amfd); 725 726 exit(!succeeded); 727} |
perform_dump:进行dump的过程
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
484static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd, 485 BacktraceMap* backtrace_map, const std::set<pid_t>& siblings, 486 int* crash_signal, OpenFilesList* open_files, std::string* amfd_data) { 487 if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) { //向应用进程(客户端返回一个值),表示连上了,可以开始dump了 488 ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno)); 489 return false; 490 } 491 492 int total_sleep_time_usec = 0; 493 while (true) { 494 int signal = wait_for_signal(request.tid, &total_sleep_time_usec); //因为此时已经被ptrace_attach了,所以第二次客户端发给自己的信号会在这里被接收 495 switch (signal) { 496 case -1: 497 ALOGE("debuggerd: timed out waiting for signal"); 498 return false; 499 500 case SIGSTOP: //这里是attach时向客户端发送的sigstop信号 501 if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) { 502 ALOGV("debuggerd: stopped -- dumping to tombstone"); 503 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal, 504 request.original_si_code, request.abort_msg_address, open_files, amfd_data); 505 } else if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE) { 506 ALOGV("debuggerd: stopped -- dumping to fd"); 507 dump_backtrace(fd, backtrace_map, request.pid, request.tid, siblings, nullptr); 508 } else { 509 ALOGV("debuggerd: stopped -- continuing"); //此时通过debuggerd用PTRACE_CONT命令让应用继续执行, // 这样应用的read系统调用就可以返回到用户态,继续执行debuggerd_signal_handler() // 此时,debuggerd进入下一次循环,block在wait_for_signal,继续等待应用的下一个信号 510 if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) { 511 ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno)); 512 return false; 513 } 514 continue; // loop again //注意,这里是继续循环,等待客户端的第二次信号 515 } 516 break; 517 518 case SIGABRT: 519 case SIGBUS: 520 case SIGFPE: 521 case SIGILL: 522 case SIGSEGV: 523#ifdef SIGSTKFLT 524 case SIGSTKFLT: 525#endif 526 case SIGSYS: 527 case SIGTRAP: 528 ALOGV("stopped -- fatal signal\n"); 529 *crash_signal = signal; 530 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, 
siblings, signal, 531 request.original_si_code, request.abort_msg_address, open_files, amfd_data); //客户端发的第二次信号被debuggerd接受,开始dump 532 break; //dump完之后跳出循环,执行下面的操作 533 534 default: 535 ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal); 536 break; 537 } 538 break; 539 } 540 541 return true; 542} |
本质上有两次通信;
第一次通信是进程的signal handler通过socket与已启动的debuggerd服务端进行通信:客户端向debuggerd写request,服务端读取request后先通过ptrace attach到客户端(成为其tracer,相当于临时的父进程),attach时会向客户端发送一个SIGSTOP信号;随后服务端向socket写回一个字节表示收到。debuggerd在wait_for_signal中收到这个SIGSTOP后,用PTRACE_CONT命令让应用继续执行,这样应用的read系统调用就可以返回到用户态,继续执行debuggerd_signal_handler;debuggerd则进入下一次循环,block在wait_for_signal,继续等待应用的下一个信号
客户端收到答复之后,将注册的信号处理函数恢复为默认处理SIG_DFL(这样再接收到信号就可以正常地走kernel流程了),然后通过rt_tgsigqueueinfo再次给自己发送同一个信号
这里就是第二次通信,信号被父进程debuggerd拦截,开始dump操作,dump操作完后进行detach操作,不再作为客户端的父进程
detach之后,debuggerd会把崩溃信号重新发给客户端,此时客户端会进入到内核默认的信号处理逻辑(get_signal)中
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
2173int get_signal(struct ksignal *ksig) 2174{ 2175 struct sighand_struct *sighand = current->sighand; 2176 struct signal_struct *signal = current->signal; 2177 int signr; 2178 2179 if (unlikely(current->task_works)) 2180 task_work_run(); 2181 2182 if (unlikely(uprobe_deny_signal())) 2183 return 0; 2184 2185 /* 2186 * Do this once, we can't return to user-mode if freezing() == T. 2187 * do_signal_stop() and ptrace_stop() do freezable_schedule() and 2188 * thus do not need another check after return. 2189 */ 2190 try_to_freeze(); 2191 2192relock: 2193 spin_lock_irq(&sighand->siglock); 2194 /* 2195 * Every stopped thread goes here after wakeup. Check to see if 2196 * we should notify the parent, prepare_signal(SIGCONT) encodes 2197 * the CLD_ si_code into SIGNAL_CLD_MASK bits. 2198 */ 2199 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { 2200 int why; 2201 2202 if (signal->flags & SIGNAL_CLD_CONTINUED) 2203 why = CLD_CONTINUED; 2204 else 2205 why = CLD_STOPPED; 2206 2207 signal->flags &= ~SIGNAL_CLD_MASK; 2208 2209 spin_unlock_irq(&sighand->siglock); 2210 2211 /* 2212 * Notify the parent that we're continuing. This event is 2213 * always per-process and doesn't make whole lot of sense 2214 * for ptracers, who shouldn't consume the state via 2215 * wait(2) either, but, for backward compatibility, notify 2216 * the ptracer of the group leader too unless it's gonna be 2217 * a duplicate. 
2218 */ 2219 read_lock(&tasklist_lock); 2220 do_notify_parent_cldstop(current, false, why); 2221 2222 if (ptrace_reparented(current->group_leader)) 2223 do_notify_parent_cldstop(current->group_leader, 2224 true, why); 2225 read_unlock(&tasklist_lock); 2226 2227 goto relock; 2228 } 2229 2230 for (;;) { 2231 struct k_sigaction *ka; 2232 2233 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && 2234 do_signal_stop(0)) 2235 goto relock; 2236 2237 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { 2238 do_jobctl_trap(); 2239 spin_unlock_irq(&sighand->siglock); 2240 goto relock; 2241 } 2242 2243 signr = dequeue_signal(current, ¤t->blocked, &ksig->info); 2244 2245 if (!signr) 2246 break; /* will return 0 */ 2247 2248 if (unlikely(current->ptrace) && signr != SIGKILL) { 2249 signr = ptrace_signal(signr, &ksig->info); 2250 if (!signr) 2251 continue; 2252 } 2253 2254 ka = &sighand->action[signr-1]; 2255 2256 /* Trace actually delivered signals. */ 2257 trace_signal_deliver(signr, &ksig->info, ka); 2258 2259 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 2260 continue; 2261 if (ka->sa.sa_handler != SIG_DFL) { 2262 /* Run the handler. */ 2263 ksig->ka = *ka; 2264 2265 if (ka->sa.sa_flags & SA_ONESHOT) 2266 ka->sa.sa_handler = SIG_DFL; 2267 2268 break; /* will return non-zero "signr" value */ 2269 } 2270 2271 /* 2272 * Now we are doing the default action for this signal. 2273 */ 2274 if (sig_kernel_ignore(signr)) /* Default is nothing. */ 2275 continue; 2276 2277 /* 2278 * Global init gets no signals it doesn't want. 2279 * Container-init gets no signals it doesn't want from same 2280 * container. 2281 * 2282 * Note that if global/container-init sees a sig_kernel_only() 2283 * signal here, the signal must have been generated internally 2284 * or must have come from an ancestor namespace. In either 2285 * case, the signal cannot be dropped. 
2286 */ 2287 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) && 2288 !sig_kernel_only(signr)) 2289 continue; 2290 2291 if (sig_kernel_stop(signr)) { 2292 /* 2293 * The default action is to stop all threads in 2294 * the thread group. The job control signals 2295 * do nothing in an orphaned pgrp, but SIGSTOP 2296 * always works. Note that siglock needs to be 2297 * dropped during the call to is_orphaned_pgrp() 2298 * because of lock ordering with tasklist_lock. 2299 * This allows an intervening SIGCONT to be posted. 2300 * We need to check for that and bail out if necessary. 2301 */ 2302 if (signr != SIGSTOP) { 2303 spin_unlock_irq(&sighand->siglock); 2304 2305 /* signals can be posted during this window */ 2306 2307 if (is_current_pgrp_orphaned()) 2308 goto relock; 2309 2310 spin_lock_irq(&sighand->siglock); 2311 } 2312 2313 if (likely(do_signal_stop(ksig->info.si_signo))) { 2314 /* It released the siglock. */ 2315 goto relock; 2316 } 2317 2318 /* 2319 * We didn't actually stop, due to a race 2320 * with SIGCONT or something like that. 2321 */ 2322 continue; 2323 } 2324 2325 spin_unlock_irq(&sighand->siglock); 2326 2327 /* 2328 * Anything else is fatal, maybe with a core dump. 2329 */ 2330 current->flags |= PF_SIGNALED; 2331 2332 if (sig_kernel_coredump(signr)) { 2333 if (print_fatal_signals) 2334 print_fatal_signal(ksig->info.si_signo); 2335 proc_coredump_connector(current); 2336 /* 2337 * If it was able to dump core, this kills all 2338 * other threads in the group and synchronizes with 2339 * their demise. If we lost the race with another 2340 * thread getting here, it set group_exit_code 2341 * first and our do_group_exit call below will use 2342 * that value and ignore the one we pass it. 2343 */ 2344 do_coredump(&ksig->info); 2345 } 2346 2347 /* 2348 * Death signals, no core dump. 2349 */ 2350 do_group_exit(ksig->info.si_signo); 2351 /* NOTREACHED */ 2352 } 2353 spin_unlock_irq(&sighand->siglock); 2354 2355 ksig->sig = signr; 2356 return ksig->sig > 0; 2357} |
1 2 |
412#define sig_kernel_coredump(sig) \ 413 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK)) |
1 2 3 4 5 6 |
399 rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \ 400 rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \ 401 rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \ 402 rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \ 403 rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \ 404 SIGEMT_MASK |
可见coredump响应的信号比tombstone多;除SIGSTKFLT(默认动作为terminate,不在coredump掩码中)之外,tombstone响应的信号都是coredump信号的子集。能响应coredump的信号如下,参考default action列表:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
* +--------------------+------------------+ * | POSIX signal | default action | * +--------------------+------------------+ * | SIGHUP | terminate | * | SIGINT | terminate | * | SIGQUIT | coredump | * | SIGILL | coredump | * | SIGTRAP | coredump | * | SIGABRT/SIGIOT | coredump | * | SIGBUS | coredump | * | SIGFPE | coredump | * | SIGKILL | terminate(+) | * | SIGUSR1 | terminate | * | SIGSEGV | coredump | * | SIGUSR2 | terminate | * | SIGPIPE | terminate | * | SIGALRM | terminate | * | SIGTERM | terminate | * | SIGCHLD | ignore | * | SIGCONT | ignore(*) | * | SIGSTOP | stop(*)(+) | * | SIGTSTP | stop(*) | * | SIGTTIN | stop(*) | * | SIGTTOU | stop(*) | * | SIGURG | ignore | * | SIGXCPU | coredump | * | SIGXFSZ | coredump | * | SIGVTALRM | terminate | * | SIGPROF | terminate | * | SIGPOLL/SIGIO | terminate | * | SIGSYS/SIGUNUSED | coredump | * | SIGSTKFLT | terminate | * | SIGWINCH | ignore | * | SIGPWR | terminate | * | SIGRTMIN-SIGRTMAX | terminate | * +--------------------+------------------+ * | non-POSIX signal | default action | * +--------------------+------------------+ * | SIGEMT | coredump | * +--------------------+------------------+ |
那么如何为tombstone添加一个信号呢?