commit 839338f6ab1254d6d0709f19db60b164269288d5 from: Omar Polo date: Thu Jun 22 21:25:08 2023 UTC gotd: wait asynchronously for children termination Instead of the current kill() + waitpid(WNOHANG), manage the subprocesses in a separate queue and handle SIGCHLD. A timer is installed to ensure that misbehaving subprocesses are still killed. Fixes the current "child PID 0 terminated" logs due to races with waitpid(). Issue initially reported by Josiah Frentsos. ok + tweaks stsp@ commit - 98ec424553c524789a56753ff641281548b6ac61 commit + 839338f6ab1254d6d0709f19db60b164269288d5 blob - 39b1d2a54632934de2bfaf127e20f0b0f725f4c8 blob + 68a99a122229289490584888ded7f2beb2d99042 --- gotd/gotd.c +++ gotd/gotd.c @@ -80,7 +80,11 @@ struct gotd_child_proc { char repo_path[PATH_MAX]; int pipe[2]; struct gotd_imsgev iev; + struct event tmo; + + TAILQ_ENTRY(gotd_child_proc) entry; }; +TAILQ_HEAD(gotd_procs, gotd_child_proc) procs; struct gotd_client { STAILQ_ENTRY(gotd_client) entry; @@ -113,6 +117,7 @@ static const struct got_error *start_repo_child(struct static const struct got_error *start_auth_child(struct gotd_client *, int, struct gotd_repo *, char *, const char *, int, int); static void kill_proc(struct gotd_child_proc *, int); +static void disconnect(struct gotd_client *); __dead static void usage(void) @@ -276,77 +281,62 @@ ensure_client_is_not_reading(struct gotd_client *clien } static void -wait_for_child(pid_t child_pid) +proc_done(struct gotd_child_proc *proc) { - pid_t pid; - int status; + struct gotd_client *client; - log_debug("waiting for child PID %ld to terminate", - (long)child_pid); + TAILQ_REMOVE(&procs, proc, entry); - do { - pid = waitpid(child_pid, &status, WNOHANG); - if (pid == -1) { - if (errno != EINTR && errno != ECHILD) - fatal("wait"); - } else if (WIFSIGNALED(status)) { - log_warnx("child PID %ld terminated; signal %d", - (long)pid, WTERMSIG(status)); - } - } while (pid != -1 || (pid == -1 && errno == EINTR)); -} + client = find_client_by_proc_fd(proc->iev.ibuf.fd); + if (client != NULL) { + if (proc == client->repo) + client->repo = NULL; + if (proc == client->auth) + client->auth = NULL; + if (proc == client->session) + client->session = NULL; + disconnect(client); + } -static void -proc_done(struct gotd_child_proc *proc) -{ - event_del(&proc->iev.ev); - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); + evtimer_del(&proc->tmo); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + } + free(proc); } static void kill_repo_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->repo == NULL) return; - proc = client->repo; + kill_proc(client->repo, 0); client->repo = NULL; - - proc_done(proc); } static void kill_auth_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->auth == NULL) return; - proc = client->auth; + kill_proc(client->auth, 0); client->auth = NULL; - - proc_done(proc); } static void kill_session_proc(struct gotd_client *client) { - struct gotd_child_proc *proc; - if (client->session == NULL) return; - proc = client->session; + kill_proc(client->session, 0); client->session = NULL; - - proc_done(proc); } static void @@ -745,6 +735,20 @@ static const char *gotd_proc_names[PROC_MAX] = { static void kill_proc(struct gotd_child_proc *proc, int fatal) { + struct timeval tv = { 5, 0 }; + + log_debug("kill -%d %d", fatal ? SIGKILL : SIGTERM, proc->pid); + + if (proc->iev.ibuf.fd != -1) { + event_del(&proc->iev.ev); + msgbuf_clear(&proc->iev.ibuf.w); + close(proc->iev.ibuf.fd); + proc->iev.ibuf.fd = -1; + } + + if (!evtimer_pending(&proc->tmo, NULL) && !fatal) + evtimer_add(&proc->tmo, &tv); + if (fatal) { log_warnx("sending SIGKILL to PID %d", proc->pid); kill(proc->pid, SIGKILL); @@ -753,9 +757,18 @@ kill_proc(struct gotd_child_proc *proc, int fatal) } static void +kill_proc_timeout(int fd, short ev, void *d) +{ + struct gotd_child_proc *proc = d; + + log_warnx("timeout waiting for PID %d to terminate;" + " retrying with force", proc->pid); + kill_proc(proc, 1); +} + +static void gotd_shutdown(void) { - struct gotd_child_proc *proc; uint64_t slot; log_debug("shutting down"); @@ -766,20 +779,31 @@ gotd_shutdown(void) disconnect(c); } - proc = gotd.listen_proc; - msgbuf_clear(&proc->iev.ibuf.w); - close(proc->iev.ibuf.fd); - kill_proc(proc, 0); - wait_for_child(proc->pid); - free(proc); + kill_proc(gotd.listen_proc, 0); log_info("terminating"); exit(0); } +static struct gotd_child_proc * +find_proc_by_pid(pid_t pid) +{ + struct gotd_child_proc *proc = NULL; + + TAILQ_FOREACH(proc, &procs, entry) + if (proc->pid == pid) + break; + + return proc; +} + void gotd_sighdlr(int sig, short event, void *arg) { + struct gotd_child_proc *proc; + pid_t pid; + int status; + /* * Normal signal handler rules don't apply because libevent * decouples for us. @@ -796,6 +820,35 @@ gotd_sighdlr(int sig, short event, void *arg) case SIGINT: gotd_shutdown(); break; + case SIGCHLD: + for (;;) { + pid = waitpid(WAIT_ANY, &status, WNOHANG); + if (pid == -1) { + if (errno == EINTR) + continue; + if (errno == ECHILD) + break; + fatal("waitpid"); + } + if (pid == 0) + break; + + log_debug("reaped pid %d", pid); + proc = find_proc_by_pid(pid); + if (proc == NULL) { + log_info("caught exit of unknown child %d", + pid); + continue; + } + + if (WIFSIGNALED(status)) { + log_warnx("child PID %d terminated with" + " signal %d", pid, WTERMSIG(status)); + } + + proc_done(proc); + } + break; default: fatalx("unexpected signal"); } @@ -1508,6 +1561,10 @@ start_listener(char *argv0, const char *confpath, int if (proc == NULL) fatal("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + + /* proc->tmo is initialized in main() after event_init() */ + proc->type = PROC_LISTEN; if (socketpair(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, @@ -1534,6 +1591,9 @@ start_session_child(struct gotd_client *client, struct if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + if (client_is_reading(client)) proc->type = PROC_SESSION_READ; else @@ -1580,6 +1640,9 @@ start_repo_child(struct gotd_client *client, enum gotd if (proc == NULL) return got_error_from_errno("calloc"); + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); + proc->type = proc_type; if (strlcpy(proc->repo_name, repo->name, sizeof(proc->repo_name)) >= sizeof(proc->repo_name)) @@ -1631,6 +1694,9 @@ start_auth_child(struct gotd_client *client, int requi close(fd); return err; } + + TAILQ_INSERT_HEAD(&procs, proc, entry); + evtimer_set(&proc->tmo, kill_proc_timeout, proc); proc->type = PROC_AUTH; if (strlcpy(proc->repo_name, repo->name, @@ -1732,10 +1798,12 @@ main(int argc, char **argv) struct passwd *pw = NULL; char *repo_path = NULL; enum gotd_procid proc_id = PROC_GOTD; - struct event evsigint, evsigterm, evsighup, evsigusr1; + struct event evsigint, evsigterm, evsighup, evsigusr1, evsigchld; int *pack_fds = NULL, *temp_fds = NULL; struct gotd_repo *repo = NULL; + TAILQ_INIT(&procs); + log_init(1, LOG_DAEMON); /* Log to stderr until daemonized. */ while ((ch = getopt(argc, argv, "Adf:LnP:RsSvW")) != -1) { @@ -1955,18 +2023,23 @@ main(int argc, char **argv) if (proc_id != PROC_GOTD) fatal("invalid process id %d", proc_id); + evtimer_set(&gotd.listen_proc->tmo, kill_proc_timeout, + gotd.listen_proc); + apply_unveil_selfexec(); signal_set(&evsigint, SIGINT, gotd_sighdlr, NULL); signal_set(&evsigterm, SIGTERM, gotd_sighdlr, NULL); signal_set(&evsighup, SIGHUP, gotd_sighdlr, NULL); signal_set(&evsigusr1, SIGUSR1, gotd_sighdlr, NULL); + signal_set(&evsigchld, SIGCHLD, gotd_sighdlr, NULL); signal(SIGPIPE, SIG_IGN); signal_add(&evsigint, NULL); signal_add(&evsigterm, NULL); signal_add(&evsighup, NULL); signal_add(&evsigusr1, NULL); + signal_add(&evsigchld, NULL); gotd_imsg_event_add(&gotd.listen_proc->iev);