diff --git a/src/journal/journald-server.c b/src/journal/journald-server.c index ecf7b7a47669f..299b0a848fb12 100644 --- a/src/journal/journald-server.c +++ b/src/journal/journald-server.c @@ -78,6 +78,8 @@ #define RECHECK_SPACE_USEC (30*USEC_PER_SEC) +#define NOTIFY_SNDBUF_SIZE (8*1024*1024) + static int determine_space_for( Server *s, JournalMetrics *metrics, @@ -1457,6 +1459,126 @@ static int server_open_hostname(Server *s) { return 0; } +static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) { + Server *s = userdata; + int r; + + assert(s); + assert(s->notify_event_source == es); + assert(s->notify_fd == fd); + + if (revents != EPOLLOUT) { + log_error("Invalid events on notify file descriptor."); + return -EINVAL; + } + + /* The $NOTIFY_SOCKET is writable again, now send exactly one + * message on it. Either it's the initial READY=1 event or an + * stdout stream event. If there's nothing to write anymore, + * turn our event source off. The next time there's something + * to send it will be turned on again. */ + + if (!s->sent_notify_ready) { + static const char p[] = + "READY=1\n" + "STATUS=Processing requests..."; + ssize_t l; + + l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT); + if (l < 0) { + if (errno == EAGAIN) + return 0; + + return log_error_errno(errno, "Failed to send READY=1 notification message: %m"); + } + + s->sent_notify_ready = true; + log_debug("Sent READY=1 notification."); + + } else if (s->stdout_streams_notify_queue) + /* Dispatch one stream notification event */ + stdout_stream_send_notify(s->stdout_streams_notify_queue); + + /* Leave us enabled if there's still more to to do. */ + if (s->stdout_streams_notify_queue) + return 0; + + /* There was nothing to do anymore, let's turn ourselves off. */ + r = sd_event_source_set_enabled(es, SD_EVENT_OFF); + if (r < 0) + return log_error_errno(r, "Failed to turn off notify event source: %m"); + + return 0; +} + +static int server_connect_notify(Server *s) { + union sockaddr_union sa = { + .un.sun_family = AF_UNIX, + }; + const char *e; + int r; + + assert(s); + assert(s->notify_fd < 0); + assert(!s->notify_event_source); + + /* + So here's the problem: we'd like to send notification + messages to PID 1, but we cannot do that via sd_notify(), + since that's synchronous, and we might end up blocking on + it. Specifically: given that PID 1 might block on + dbus-daemon during IPC, and dbus-daemon is logging to us, + and might hence block on us, we might end up in a deadlock + if we block on sending PID 1 notification messages -- by + generating a full blocking circle. To avoid this, let's + create a non-blocking socket, and connect it to the + notification socket, and then wait for POLLOUT before we + send anything. This should efficiently avoid any deadlocks, + as we'll never block on PID 1, hence PID 1 can safely block + on dbus-daemon which can safely block on us again. + + Don't think that this issue is real? It is, see: + https://github.com/systemd/systemd/issues/1505 + */ + + e = getenv("NOTIFY_SOCKET"); + if (!e) + return 0; + + if ((e[0] != '@' && e[0] != '/') || e[1] == 0) { + log_error("NOTIFY_SOCKET set to an invalid value: %s", e); + return -EINVAL; + } + + if (strlen(e) > sizeof(sa.un.sun_path)) { + log_error("NOTIFY_SOCKET path too long: %s", e); + return -EINVAL; + } + + s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (s->notify_fd < 0) + return log_error_errno(errno, "Failed to create notify socket: %m"); + + (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE); + + strncpy(sa.un.sun_path, e, sizeof(sa.un.sun_path)); + if (sa.un.sun_path[0] == '@') + sa.un.sun_path[0] = 0; + + r = connect(s->notify_fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(e)); + if (r < 0) + return log_error_errno(errno, "Failed to connect to notify socket: %m"); + + r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s); + if (r < 0) + return log_error_errno(r, "Failed to watch notification socket: %m"); + + /* This should fire pretty soon, which we'll use to send the + * READY=1 event. */ + + return 0; +} + int server_init(Server *s) { _cleanup_fdset_free_ FDSet *fds = NULL; int n, r, fd; @@ -1465,7 +1587,7 @@ int server_init(Server *s) { assert(s); zero(*s); - s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = -1; + s->syslog_fd = s->native_fd = s->stdout_fd = s->dev_kmsg_fd = s->audit_fd = s->hostname_fd = s->notify_fd = -1; s->compress = true; s->seal = true; @@ -1511,8 +1633,6 @@ int server_init(Server *s) { if (r < 0) return log_error_errno(r, "Failed to create event loop: %m"); - sd_event_set_watchdog(s->event, true); - n = sd_listen_fds(true); if (n < 0) return log_error_errno(n, "Failed to read listening file descriptors from environment: %m"); @@ -1637,6 +1757,8 @@ int server_init(Server *s) { server_cache_boot_id(s); server_cache_machine_id(s); + (void) server_connect_notify(s); + return system_journal_open(s, false); } @@ -1685,6 +1807,7 @@ void server_done(Server *s) { sd_event_source_unref(s->sigterm_event_source); sd_event_source_unref(s->sigint_event_source); sd_event_source_unref(s->hostname_event_source); + sd_event_source_unref(s->notify_event_source); sd_event_unref(s->event); safe_close(s->syslog_fd); @@ -1693,6 +1816,7 @@ void server_done(Server *s) { safe_close(s->dev_kmsg_fd); safe_close(s->audit_fd); safe_close(s->hostname_fd); + safe_close(s->notify_fd); if (s->rate_limit) journal_rate_limit_free(s->rate_limit); diff --git a/src/journal/journald-server.h b/src/journal/journald-server.h index a2631c6017573..170602ea16614 100644 --- a/src/journal/journald-server.h +++ b/src/journal/journald-server.h @@ -26,9 +26,12 @@ #include "sd-event.h" +typedef struct Server Server; + #include "hashmap.h" #include "journal-file.h" #include "journald-rate-limit.h" +#include "journald-stream.h" #include "list.h" typedef enum Storage { @@ -48,15 +51,14 @@ typedef enum SplitMode { _SPLIT_INVALID = -1 } SplitMode; -typedef struct StdoutStream StdoutStream; - -typedef struct Server { +struct Server { int syslog_fd; int native_fd; int stdout_fd; int dev_kmsg_fd; int audit_fd; int hostname_fd; + int notify_fd; sd_event *event; @@ -71,6 +73,7 @@ typedef struct Server { sd_event_source *sigterm_event_source; sd_event_source *sigint_event_source; sd_event_source *hostname_event_source; + sd_event_source *notify_event_source; JournalFile *runtime_journal; JournalFile *system_journal; @@ -111,6 +114,7 @@ typedef struct Server { usec_t oldest_file_usec; LIST_HEAD(StdoutStream, stdout_streams); + LIST_HEAD(StdoutStream, stdout_streams_notify_queue); unsigned n_stdout_streams; char *tty_path; @@ -132,6 +136,7 @@ typedef struct Server { struct udev *udev; + bool sent_notify_ready; bool sync_scheduled; char machine_id_field[sizeof("_MACHINE_ID=") + 32]; @@ -140,7 +145,7 @@ typedef struct Server { /* Cached cgroup root, so that we don't have to query that all the time */ char *cgroup_root; -} Server; +}; #define SERVER_MACHINE_ID(s) ((s)->machine_id_field + strlen("_MACHINE_ID=")) diff --git a/src/journal/journald-stream.c b/src/journal/journald-stream.c index 5300c61c02225..fb6afee1713cb 100644 --- a/src/journal/journald-stream.c +++ b/src/journal/journald-stream.c @@ -79,6 +79,7 @@ struct StdoutStream { bool forward_to_console:1; bool fdstore:1; + bool in_notify_queue:1; char buffer[LINE_MAX+1]; size_t length; @@ -88,6 +89,7 @@ struct StdoutStream { char *state_file; LIST_FIELDS(StdoutStream, stdout_stream); + LIST_FIELDS(StdoutStream, stdout_stream_notify_queue); }; void stdout_stream_free(StdoutStream *s) { @@ -98,6 +100,9 @@ void stdout_stream_free(StdoutStream *s) { assert(s->server->n_stdout_streams > 0); s->server->n_stdout_streams --; LIST_REMOVE(stdout_stream, s->server->stdout_streams, s); + + if (s->in_notify_queue) + LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); } if (s->event_source) { @@ -121,7 +126,7 @@ static void stdout_stream_destroy(StdoutStream *s) { return; if (s->state_file) - unlink(s->state_file); + (void) unlink(s->state_file); stdout_stream_free(s); } @@ -200,11 +205,15 @@ static int stdout_stream_save(StdoutStream *s) { goto fail; } - /* Store the connection fd in PID 1, so that we get it passed - * in again on next start */ - if (!s->fdstore) { - sd_pid_notify_with_fds(0, false, "FDSTORE=1", &s->fd, 1); - s->fdstore = true; + if (!s->fdstore && !s->in_notify_queue) { + LIST_PREPEND(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + s->in_notify_queue = true; + + if (s->server->notify_event_source) { + r = sd_event_source_set_enabled(s->server->notify_event_source, SD_EVENT_ON); + if (r < 0) + log_warning_errno(r, "Failed to enable notify event source: %m"); + } } return 0; @@ -729,3 +738,50 @@ int server_open_stdout_socket(Server *s) { return 0; } + +void stdout_stream_send_notify(StdoutStream *s) { + struct iovec iovec = { + .iov_base = (char*) "FDSTORE=1", + .iov_len = strlen("FDSTORE=1"), + }; + struct msghdr msghdr = { + .msg_iov = &iovec, + .msg_iovlen = 1, + }; + struct cmsghdr *cmsg; + ssize_t l; + + assert(s); + assert(!s->fdstore); + assert(s->in_notify_queue); + assert(s->server); + assert(s->server->notify_fd >= 0); + + /* Store the connection fd in PID 1, so that we get it passed + * in again on next start */ + + msghdr.msg_controllen = CMSG_SPACE(sizeof(int)); + msghdr.msg_control = alloca0(msghdr.msg_controllen); + + cmsg = CMSG_FIRSTHDR(&msghdr); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + + memcpy(CMSG_DATA(cmsg), &s->fd, sizeof(int)); + + l = sendmsg(s->server->notify_fd, &msghdr, MSG_DONTWAIT|MSG_NOSIGNAL); + if (l < 0) { + if (errno == EAGAIN) + return; + + log_error_errno(errno, "Failed to send stream file descriptor to service manager: %m"); + } else { + log_debug("Successfully sent stream file descriptor to service manager."); + s->fdstore = 1; + } + + LIST_REMOVE(stdout_stream_notify_queue, s->server->stdout_streams_notify_queue, s); + s->in_notify_queue = false; + +} diff --git a/src/journal/journald-stream.h b/src/journal/journald-stream.h index 257dce45df350..e3497f0dedb02 100644 --- a/src/journal/journald-stream.h +++ b/src/journal/journald-stream.h @@ -21,9 +21,13 @@ along with systemd; If not, see . ***/ +typedef struct StdoutStream StdoutStream; + #include "fdset.h" #include "journald-server.h" int server_open_stdout_socket(Server *s); int server_restore_streams(Server *s, FDSet *fds); + void stdout_stream_free(StdoutStream *s); +void stdout_stream_send_notify(StdoutStream *s); diff --git a/src/journal/journald.c b/src/journal/journald.c index 83236ceba9b46..b137e3c7beadb 100644 --- a/src/journal/journald.c +++ b/src/journal/journald.c @@ -61,10 +61,6 @@ int main(int argc, char *argv[]) { log_debug("systemd-journald running as pid "PID_FMT, getpid()); server_driver_message(&server, SD_MESSAGE_JOURNAL_START, "Journal started"); - sd_notify(false, - "READY=1\n" - "STATUS=Processing requests..."); - for (;;) { usec_t t = USEC_INFINITY, n; @@ -117,10 +113,6 @@ int main(int argc, char *argv[]) { server_driver_message(&server, SD_MESSAGE_JOURNAL_STOP, "Journal stopped"); finish: - sd_notify(false, - "STOPPING=1\n" - "STATUS=Shutting down..."); - server_done(&server); return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS; diff --git a/units/systemd-journald.service.in b/units/systemd-journald.service.in index 41bfde5be3c5d..2552102bfca02 100644 --- a/units/systemd-journald.service.in +++ b/units/systemd-journald.service.in @@ -22,7 +22,6 @@ RestartSec=0 NotifyAccess=all StandardOutput=null CapabilityBoundingSet=CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_SYS_PTRACE CAP_SYSLOG CAP_AUDIT_CONTROL CAP_AUDIT_READ CAP_CHOWN CAP_DAC_READ_SEARCH CAP_FOWNER CAP_SETUID CAP_SETGID CAP_MAC_OVERRIDE -WatchdogSec=3min FileDescriptorStoreMax=1024 # Increase the default a bit in order to allow many simultaneous