/*****************************************************************************

	flatline - minimalistic high-availability daemon
	Copyright (c) 2016 Wessel Dankers <wsl@fruit.je>

	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation; either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*****************************************************************************/

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdarg.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <syslog.h>
#include <getopt.h>
#include <sysexits.h>
#include <signal.h>
#include <poll.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/signalfd.h>
#include <netdb.h>

// All timestamps and durations in this file are unsigned 64-bit counts of
// nanoseconds.
typedef uint64_t nanosecond_t;
// Constant-suffix and printf-format helpers that match nanosecond_t.
#define NANOSECOND_C(x) UINT64_C(x)
#define PRIuNANOSECOND PRIu64
#define PRIxNANOSECOND PRIx64
#define PRIXNANOSECOND PRIX64

// Fallback version string for builds that do not define VERSION themselves.
#ifndef VERSION
#define VERSION "(git)"
#endif

// set by the --verbose option; enables additional log messages
static bool verbose = false;

// ident used for the daemon's own log messages; main() rewrites it to include the pid
static char default_syslog_ident[20] = "flatline";
// ident pointer most recently passed to openlog(); compared by address, not by content
static const char *current_syslog_ident;
// option flags passed to openlog(); main() may add LOG_PERROR before the first openlog()
static int current_syslog_options = LOG_CONS | LOG_NDELAY;

// (Re)open the syslog connection under `ident` and record it as the active
// ident. The pointer is stored as-is, so the string must stay valid for as
// long as it is the active ident.
static void set_syslog_ident(const char *ident) {
	current_syslog_ident = ident;
	openlog(current_syslog_ident, current_syslog_options, LOG_DAEMON);
}

// Log a message under a specific ident (va_list flavour). The syslog
// connection is only reopened when `ident` is not the pointer that is
// currently active.
static void custom_vsyslog(const char *ident, int priority, const char *format, va_list ap) {
	bool ident_already_active = (ident == current_syslog_ident);
	if(!ident_already_active)
		set_syslog_ident(ident);
	vsyslog(priority, format, ap);
}

static void custom_syslog(const char *ident, int priority, const char *format, ...) {
	va_list ap;
	va_start(ap, format);
	custom_vsyslog(ident, priority, format, ap);
	va_end(ap);
}

// printf-style logging under the daemon's own ident (default_syslog_ident).
static void default_syslog(int priority, const char *format, ...) {
	const char *ident = default_syslog_ident;
	va_list args;
	va_start(args, format);
	custom_vsyslog(ident, priority, format, args);
	va_end(args);
}

static void exit_with_error(int exit_code, const char *format, ...) {
	va_list ap;
	va_start(ap, format);
	custom_vsyslog(default_syslog_ident, LOG_EMERG, format, ap);
	va_end(ap);
	exit(exit_code);
}

// Same as exit_with_error() but terminates with _exit(), i.e. without running
// atexit handlers or flushing stdio. Used in the forked child before exec.
// Does not return.
static void _exit_with_error(int exit_code, const char *format, ...) {
	const char *ident = default_syslog_ident;
	va_list args;
	va_start(args, format);
	custom_vsyslog(ident, LOG_EMERG, format, args);
	va_end(args);
	_exit(exit_code);
}

static nanosecond_t nanosecond_get_clock(void) {
	struct timespec ts;
	if(clock_gettime(CLOCK_MONOTONIC, &ts) == -1)
		exit_with_error(EX_OSERR, "clock_gettime(CLOCK_MONOTONIC): %m\n");
	return (nanosecond_t)ts.tv_sec * NANOSECOND_C(1000000000) + (nanosecond_t)ts.tv_nsec;
}

// malloc() wrapper that never returns NULL to the caller: on allocation
// failure the error is logged and the process exits.
static void *xalloc(size_t len) {
	void *memory = malloc(len);
	if(memory)
		return memory;
	exit_with_error(EX_OSERR, "malloc(%zu): %m", len);
	return memory;
}

// Duplicate a string into memory obtained from xalloc().
// A NULL input yields NULL; otherwise the caller owns the returned copy.
static char *xstrdup(const char *src) {
	if(!src)
		return NULL;
	size_t size_with_nul = strlen(src) + 1;
	char *copy = xalloc(size_with_nul);
	memcpy(copy, src, size_with_nul);
	return copy;
}

extern char **environ;
// start of the original argv[0] string; argv0set() writes the process title here
static char *argv0;
// distance from argv0 to the end of the last original argument string (set by argv0init())
static int argv0len;
// number of bytes argv0set() may write at argv0, spanning the original
// argument and environment strings (set by argv0init())
static int argv0max;

// Move the argument and environment strings out of their original storage so
// that the original area can be reused by argv0set() for the process title.
//
// A single xalloc()ed buffer receives, in order: a new argv pointer array
// (exactly argc slots, no terminating NULL slot), a new NULL-terminated
// environ pointer array, and copies of all argument and environment strings.
// `environ` is pointed at the new array, the original environment strings are
// zeroed, and argv0/argv0len/argv0max are initialised.
//
// Returns the new argv array; the buffer is never freed.
// NOTE(review): the extent tracking only follows the highest end address seen,
// so it presumably relies on the original argv and environ strings being laid
// out contiguously after argv[0] — confirm for the target platform.
static char **argv0init(int argc, char **argv) {
	argv0 = argv[0];
	// highest address (the terminating NUL) of any original string seen so far
	char *argv0end = argv0;
	size_t len = 0;

	// total length of all argument strings, excluding their NUL terminators
	for(int i = 0; i < argc; i++)
		len += strlen(argv[i]);

	// same for the environment; envc ends up as the number of entries
	int envc;
	for(envc = 0; environ[envc]; envc++)
		len += strlen(environ[envc]);

	// both arrays, including trailing \0s for each of the strings
	// except for the last element of environ (which is NULL)
	len += (argc + envc + 1) * (sizeof *argv + sizeof **argv) - 1;

	char *buf = xalloc(len);

	// carve the pointer arrays off the front of the buffer
	char **new_argv = (char **)buf;
	buf += argc * sizeof *argv;

	char **new_environ = (char **)buf;
	buf += (envc + 1) * sizeof *environ;

	// copy each argument (with its NUL) and remember where the originals end
	for(int i = 0; i < argc; i++) {
		char *arg = argv[i];
		size_t len = strlen(arg);
		size_t len0 = len + 1;
		memcpy(buf, arg, len0);
		new_argv[i] = buf;
		buf += len0;
		char *end = arg + len;
		if(end > argv0end)
			argv0end = end;
	}

	// offset of the NUL that terminates the last original argument
	argv0len = argv0end - argv0;

	// copy each environment string, extend the usable extent, and wipe the
	// original so stale environment data does not show up in the title area
	for(int i = 0; i < envc; i++) {
		char *env = environ[i];
		size_t len = strlen(env);
		size_t len0 = len + 1;
		memcpy(buf, env, len0);
		new_environ[i] = buf;
		buf += len0;
		char *end = env + len;
		if(end > argv0end)
			argv0end = end;
		memset(env, '\0', len);
	}
	new_environ[envc] = NULL;
	environ = new_environ;

	// writable size at argv0, including the final NUL byte
	argv0max = argv0end - argv0 + 1;

	return new_argv;
}

// Set the process title by formatting a printf-style message directly into
// the original argv/environment area (argv0, at most argv0max bytes including
// the NUL; longer titles are truncated by vsnprintf). A formatting error
// leaves the area as vsnprintf left it and returns silently.
static void argv0set(const char *format, ...) {
	va_list ap;
	va_start(ap, format);
	int len = vsnprintf(argv0, argv0max, format, ap);
	va_end(ap);

	if(len < 0)
		return;

	/* linux-3.16.36/mm/util.c:465 int get_cmdline():
		...
		483      * If the nul at the end of args has been overwritten, then
		484      * assume application is using setproctitle(3).
		...
		486     if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
		...
	*/
	// The title fits within the original argument area: overwrite the byte
	// where the original arguments ended with a non-NUL character, which is
	// the condition the kernel code quoted above tests for.
	if(len <= argv0len) {
		argv0[argv0len] = ' ';
		// The title's own terminator was at that position; put a new one
		// right after it if there is room.
		if(len == argv0len && argv0len < argv0max - 1)
			argv0[argv0len + 1] = '\0';
	}
}

// options
// --listen-port; main() falls back to remote_port when unset
static const char *local_port;
// --bind-address; may stay NULL
static const char *local_addr;
// --port
static const char *remote_port = "39786";
// --remote-address (required)
static const char *remote_addr;
// --script (required); run as "<script> local|remote start|stop"
static const char *script_path;
// --timeout and --interval in nanoseconds; 0 means "not given" and main()
// derives the missing one from the other
static nanosecond_t timeout;
static nanosecond_t interval;

// state
// set when a valid packet arrives, cleared when recv_deadline passes
static bool peer_is_up = true;
// bit 3 of the last packet received
static bool peer_sees_us = false;
// bit 2 of the last packet received
static bool peer_shutdown = false;
// set by the signal handler on SIGINT/SIGTERM/SIGXFSZ/SIGXCPU
static bool self_shutdown = false;

// For each service: `active` is the state most recently requested from the
// script, `tentative` is set while that script run has not been reaped yet,
// and `valid` is cleared when the peer reports holding the same service while
// we hold (or are changing) it.
static bool local_service_valid = true;
static bool local_service_active = false;
static bool local_service_tentative = false;
static bool remote_service_valid = true;
static bool remote_service_active = false;
static bool remote_service_tentative = false;

// bit 0 and bit 1 of the last packet received
static bool peer_has_their_service = true;
static bool peer_has_our_service = true;
// if false, the peer should restart what it considers to be its local service:
static bool peer_has_valid_service = true;

// true from the moment a split brain is detected until recovery is logged
static bool split_brain_recovery = false;
// pid of the running service script, or -1 when none is running
static pid_t script_pid = -1;
// status byte last sent to all connectors; UINT8_MAX forces an initial broadcast
static uint8_t last_packet = UINT8_MAX;
static int epoll_fd = -1;
// signal mask at startup; restored in the script child before exec
static sigset_t original_signal_mask;
// startup mask plus the signals that are received through the signalfd
static sigset_t signal_mask;

// cached CLOCK_MONOTONIC reading, refreshed once per main loop iteration
static nanosecond_t now;
// absolute times: when the peer is considered timed out, when the next
// keepalive is due, and when a shutdown is assumed complete (0 = not set)
static nanosecond_t recv_deadline;
static nanosecond_t send_deadline;
static nanosecond_t shutdown_deadline;
// absolute time currently programmed into the timerfd; reset to 0 when it fires
static nanosecond_t current_timer;

// Callback type for epoll dispatch: every pointer registered with epoll points
// at a struct whose first member is an event_handler, which main() calls with
// that same pointer.
typedef void (*event_handler)(void *);

// State for one pipe that carries a script's stdout or stderr into syslog.
// Allocated with extra room after the struct for the ident string.
struct logpipe {
	event_handler handler; // must be the first item (epoll dispatch reads it)
	size_t fill;           // number of bytes of an unfinished line held in buf
	int fd;                // read end of the pipe
	int loglevel;          // syslog priority used for lines from this pipe
	char buf[4096];        // line assembly buffer
	// syslog ident for lines from this pipe. Declared as a C99 flexible array
	// member (rather than a one-element array) because the string stored here
	// is longer than one byte; the allocator adds the space it needs.
	char ident[];
};

// epoll handler for a script output pipe: reads whatever is available, logs
// every complete line under the pipe's ident and priority, and keeps an
// unfinished trailing line in the buffer for the next call. On end-of-file or
// a read error the remaining partial line is logged, the pipe is removed from
// epoll and closed, and the logpipe is freed.
static void handle_logpipe_event(struct logpipe *logpipe) {
	int fd = logpipe->fd;
	char *buf = logpipe->buf;
	size_t fill = logpipe->fill;
	// one byte is held back so a NUL terminator always fits behind the data
	ssize_t result_byte_count = read(fd, buf + fill, sizeof logpipe->buf - fill - 1);
	switch(result_byte_count) {
		case -1:
			// nothing to read right now; try again on the next event
			if(errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
				break;
			default_syslog(LOG_ERR, "read(pipe): %m");
			// fallthrough
		case 0:
			// flush the unfinished line, if any
			if(fill) {
				buf[fill] = '\0';
				custom_syslog(logpipe->ident, logpipe->loglevel, "%s", buf);
			}
			struct epoll_event dummy;
			if(epoll_ctl(epoll_fd, EPOLL_CTL_DEL, logpipe->fd, &dummy) == -1)
				exit_with_error(EX_OSERR, "epoll_ctl(EPOLL_CTL_DEL, %d): %m", logpipe->fd);
			close(fd);
			// the active syslog ident points into the struct that is about
			// to be freed, so switch back to the daemon's own ident first
			if(current_syslog_ident == logpipe->ident)
				set_syslog_ident(default_syslog_ident);
			free(logpipe);
			break;
		default:;
			char *line = buf;
			char *buf_end = buf + fill + result_byte_count;
			while(line < buf_end) {
				char *new_line = memchr(line, '\n', buf_end - line);
				if(!new_line) {
					if((size_t)(buf_end - line) > sizeof logpipe->buf / 2) {
						// overlong line: log what we have instead of waiting
						// for a newline that may not fit in the buffer
						*buf_end = '\0';
						custom_syslog(logpipe->ident, logpipe->loglevel, "%s", line);
						line = buf_end;
					} else if(buf != line) {
						// keep the unfinished line at the start of the buffer
						memmove(buf, line, buf_end - line);
					}
					break;
				}
				*new_line = '\0';
				// empty lines are not logged
				if(new_line > line)
					custom_syslog(logpipe->ident, logpipe->loglevel, "%s", line);
				line = new_line + 1;
			}
			logpipe->fill = buf_end - line;
	}
}

// Create a close-on-exec pipe whose read end is non-blocking and registered
// with epoll, so that anything written to the write end is logged to syslog
// at `loglevel`. The write end is stored in *fd_ptr; the returned logpipe
// owns the read end. Its ident initially holds script_path and has 16 bytes
// of additional room. Any failure terminates the process.
static struct logpipe *create_logpipe(int loglevel, int *fd_ptr) {
	int fds[2];
	if(pipe2(fds, O_CLOEXEC) == -1)
		exit_with_error(EX_OSERR, "pipe(): %m");
	const int read_end = fds[0];
	const int write_end = fds[1];

	int status_flags = fcntl(read_end, F_GETFL);
	if(status_flags == -1)
		exit_with_error(EX_OSERR, "fcntl(F_GETFL): %m");
	status_flags |= O_NONBLOCK;
	if(fcntl(read_end, F_SETFL, status_flags) == -1)
		exit_with_error(EX_OSERR, "fcntl(F_SETFL, O_NONBLOCK): %m");

	size_t ident_room = strlen(script_path) + 16;
	struct logpipe *pipe_state = xalloc(sizeof *pipe_state + ident_room);
	pipe_state->handler = (event_handler)handle_logpipe_event;
	pipe_state->fd = read_end;
	pipe_state->loglevel = loglevel;
	pipe_state->fill = 0;
	strcpy(pipe_state->ident, script_path);

	struct epoll_event registration = {EPOLLIN};
	registration.data.ptr = pipe_state;
	if(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, read_end, &registration) == -1)
		exit_with_error(EX_OSERR, "epoll_ctl(EPOLL_CTL_ADD, %d): %m", read_end);

	*fd_ptr = write_end;
	return pipe_state;
}

// Start the service script as "<script_path> <local_or_remote> <start_or_stop>"
// in a child process whose stdout and stderr are piped into syslog (LOG_INFO
// and LOG_ERR respectively). The child's pid is stored in script_pid; the
// function does not wait for the script to finish.
static void run_script(const char *local_or_remote, const char *start_or_stop) {
	if(verbose)
		default_syslog(LOG_NOTICE, "running script: %s %s %s", script_path, local_or_remote, start_or_stop);

	argv0set("flatline: running script: %s %s %s", script_path, local_or_remote, start_or_stop);

	int out_write_end = -1;
	struct logpipe *out_log = create_logpipe(LOG_INFO, &out_write_end);

	int err_write_end = -1;
	struct logpipe *err_log = create_logpipe(LOG_ERR, &err_write_end);

	script_pid = fork();
	if(script_pid == -1)
		exit_with_error(EX_OSERR, "fork(): %m");

	if(script_pid == 0) {
		// child: undo the daemon's signal blocking before running the script
		if(sigprocmask(SIG_SETMASK, &original_signal_mask, NULL) == -1)
			_exit_with_error(EX_OSERR, "sigprocmask(): %m");

		// the pipe ends are close-on-exec; only the dup2()ed copies survive
		if(dup2(out_write_end, STDOUT_FILENO) == -1)
			_exit_with_error(EX_OSERR, "dup2(): %m");
		if(dup2(err_write_end, STDERR_FILENO) == -1)
			_exit_with_error(EX_OSERR, "dup2(): %m");

		// allow logger to catch up
		sched_yield();

		execlp(script_path, script_path, local_or_remote, start_or_stop, (const char *)NULL);
		default_syslog(LOG_ERR, "exec(%s) failed: %m", script_path);
		_exit(EX_OSERR);
	}

	// parent: now that the pid is known, tag both log idents with it
	long child = (long)script_pid;
	sprintf(out_log->ident, "%s[%ld]", script_path, child);
	sprintf(err_log->ident, "%s[%ld]", script_path, child);

	// the write ends belong to the child only
	if(close(out_write_end) == -1 || close(err_write_end) == -1)
		exit_with_error(EX_OSERR, "close(): %m");
}

// Singly linked list node describing one socket together with an address.
// Used both for listening sockets (registered with epoll, handler set) and
// for send targets ("connectors", handler left NULL). The socket address is
// stored directly behind the struct in the same allocation.
struct socket_list {
	event_handler handler; // must be the first item
	struct socket_list *next;
	int fd;
	// copied from the addrinfo entry the node was created from
	int family;
	int socktype;
	int protocol;
	// errno of the most recent send failure (0 = none) and when it happened;
	// used to avoid logging the same error repeatedly
	int last_error;
	nanosecond_t last_error_moment;
	// numeric host and port strings for log messages (heap-allocated)
	char *hostname;
	char *portname;
	socklen_t addrlen;
	// sockaddr follows this struct
}; 

// Allocate a socket_list node for `fd` and the address in `ai`, and link it in
// front of `next`. The address is copied behind the struct and its numeric
// host/port representation is stored for logging. The handler is left NULL.
// Exits the process if the address cannot be converted to text.
static struct socket_list *new_socket_list(struct socket_list *next, int fd, struct addrinfo *ai) {
	char host_text[NI_MAXHOST];
	char port_text[NI_MAXSERV];
	const int name_flags = NI_DGRAM | NI_NUMERICHOST | NI_NUMERICSERV;

	int gai_error = getnameinfo(ai->ai_addr, ai->ai_addrlen,
		host_text, sizeof host_text,
		port_text, sizeof port_text,
		name_flags);
	if(gai_error != 0)
		exit_with_error(EX_OSERR, "getnameinfo(NI_NUMERICHOST | NI_NUMERICSERV): %s", gai_strerror(gai_error));

	struct socket_list *node = xalloc(sizeof *node + ai->ai_addrlen);

	node->handler = NULL;
	node->next = next;
	node->fd = fd;

	node->family = ai->ai_family;
	node->socktype = ai->ai_socktype;
	node->protocol = ai->ai_protocol;

	node->last_error = 0;
	node->last_error_moment = 0;

	node->hostname = xstrdup(host_text);
	node->portname = xstrdup(port_text);

	// the raw socket address lives directly behind the struct
	node->addrlen = ai->ai_addrlen;
	memcpy(node + 1, ai->ai_addr, node->addrlen);

	return node;
}

// Command line option table for getopt_long().
// Each name field packs three NUL-separated strings: the long option name
// (the only part getopt_long sees), the argument placeholder shown in the
// usage text (empty for flags), and the description. usage() walks past the
// embedded NULs to reach the second and third string.
static const struct option long_options[] = {
	{"help\0\0                    Print this message to stdout", no_argument, 0, 'h'},
	{"version\0\0                 Print the program version", no_argument, 0, 'V'},
	{"verbose\0\0                 Log additional information", no_argument, 0, 'v'},
	{"remote-address\0address\0   Address of the remote host", required_argument, 0, 'r'},
	{"port\0port\0                Port of the remote host", required_argument, 0, 'p'},
	{"listen-port\0port\0         Port for receiving packets (defaults to -p)", required_argument, 0, 'l'},
	{"bind-address\0address\0     Address to listen on", required_argument, 0, 'b'},
	{"script\0path\0              Script to start/stop services", required_argument, 0, 's'},
	{"interval\0timespec\0        Interval between keepalives sent", required_argument, 0, 'i'},
	{"timeout\0timespec\0         Timeout for receiving keepalives", required_argument, 0, 't'},
	// terminator
	{NULL}
};

// Print the usage summary and the per-option help to `fh`, generated from
// long_options. `progname` is shown as the command name.
static void usage(FILE *fh, const char *progname) {
	const struct option *opt;

	// synopsis: all argument-less flags bundled together ...
	fprintf(fh, "Usage: %s [-", progname);
	for(opt = long_options; opt->name; opt++) {
		if(opt->has_arg != no_argument)
			continue;
		if(opt->val)
			fputc(opt->val, fh);
	}
	fprintf(fh, "]");

	// ... followed by each option that takes an argument
	for(opt = long_options; opt->name; opt++) {
		// the argument placeholder is stored behind the name's NUL
		const char *placeholder = opt->name + strlen(opt->name) + 1;
		switch(opt->has_arg) {
			case required_argument:
				fprintf(fh, " [-%c <%s>]", opt->val, placeholder);
				break;
			case optional_argument:
				fprintf(fh, " [-%c [<%s>]]", opt->val, placeholder);
				break;
			default:
				break;
		}
	}
	fprintf(fh, "\n");

	// one help line per option
	for(opt = long_options; opt->name; opt++) {
		const char *placeholder = opt->name + strlen(opt->name) + 1;
		// the description is stored behind the placeholder's NUL
		const char *help_text = placeholder + strlen(placeholder) + 1;

		const char *before_arg = "=<";
		const char *after_arg = ">   ";
		if(opt->has_arg == no_argument) {
			before_arg = "";
			after_arg = "      ";
		} else if(opt->has_arg == optional_argument) {
			before_arg = "[=<";
			after_arg = ">] ";
		}

		fprintf(fh, "\t-%c, --%s%s%s%s%s\n",
			opt->val, opt->name, before_arg, placeholder, after_arg, help_text);
	}
}

// Build a getopt() short option string from a long option table.
// The result starts with ':' (so getopt reports a missing argument as ':')
// and contains, for every entry with a non-zero val, that character followed
// by one ':' per has_arg level (none, "x:", "x::").
// `buf` must be large enough for the result including the NUL terminator.
static void get_short_options(const struct option *lo, char *buf) {
	char *out = buf;

	*out++ = ':';
	for(const struct option *opt = lo; opt->name; opt++) {
		if(!opt->val)
			continue;
		*out++ = (char)opt->val;
		for(int colons = opt->has_arg; colons != 0; colons--)
			*out++ = ':';
	}
	*out = '\0';
}

// Parse a duration of the form <digits>[unit] into nanoseconds.
// Recognised units: none or "s" (seconds), "n"/"ns", "u"/"us", "ms",
// "m" (minutes) and "h" (hours). Anything else after the digits, including
// trailing characters behind a valid unit, terminates the process with
// EX_USAGE. A missing number counts as 0. Overflow is not detected.
static nanosecond_t parse_timespec(const char *timespec) {
	nanosecond_t total = 0;
	const char *unit = timespec;

	for(; *unit >= '0' && *unit <= '9'; unit++)
		total = total * NANOSECOND_C(10) + (nanosecond_t)(*unit - '0');

	if(!*unit || !strcmp(unit, "s"))
		return total * NANOSECOND_C(1000000000);
	if(!strcmp(unit, "n") || !strcmp(unit, "ns"))
		return total;
	if(!strcmp(unit, "u") || !strcmp(unit, "us"))
		return total * NANOSECOND_C(1000);
	if(!strcmp(unit, "ms"))
		return total * NANOSECOND_C(1000000);
	if(!strcmp(unit, "m"))
		return total * NANOSECOND_C(60000000000);
	if(!strcmp(unit, "h"))
		return total * NANOSECOND_C(3600000000000);

	// report the whole offending suffix; passing a single char to %s (as the
	// code used to) is undefined behaviour
	exit_with_error(EX_USAGE, "unknown time unit %s", unit);
	return 0; // not reached
}

// epoll handler for a listening socket: receives one datagram and, if it is
// exactly one byte long, updates the peer state from its bits:
//   bit 0: peer holds its own service
//   bit 1: peer holds our service
//   bit 2: peer is shutting down
//   bit 3: peer currently sees us
// Datagrams of any other length are logged together with their source.
static void handle_listener_event(struct socket_list *listener) {
	// unsigned so that the shifts below do not depend on the signedness of char
	uint8_t packet_buffer;
	struct sockaddr_storage sa;
	socklen_t sa_len = sizeof sa;
	// MSG_TRUNC makes recvfrom() report the real datagram length even though
	// only one byte is stored, so oversized packets can be told apart
	ssize_t recv_result = recvfrom(listener->fd,
			&packet_buffer, sizeof packet_buffer,
			MSG_DONTWAIT | MSG_TRUNC,
			(struct sockaddr *)&sa, &sa_len);
	switch(recv_result) {
		case -1:
			// readiness can be reported without a datagram being available
			// afterwards; that is not worth an error message
			if(errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK)
				break;
			default_syslog(LOG_ERR, "recvfrom() failed: %m");
			break;
		case 1:
			if(!peer_is_up)
				default_syslog(LOG_ERR, "peer is alive again");
			bool peer_had_their_service_before = peer_has_their_service;
			bool peer_was_in_shutdown_before = peer_shutdown;
			peer_is_up = true;
			peer_has_their_service = packet_buffer & 1;
			peer_has_our_service = (packet_buffer >> 1) & 1;
			peer_shutdown = (packet_buffer >> 2) & 1;
			peer_sees_us = (packet_buffer >> 3) & 1;
			recv_deadline = now + timeout;
			// log only on the transition into "shut down and service released"
			if(peer_shutdown && !peer_has_their_service &&
					(peer_had_their_service_before || !peer_was_in_shutdown_before))
				default_syslog(LOG_NOTICE, "peer shut down");
			break;
		default:;
			char hostname[NI_MAXHOST];
			char portname[NI_MAXSERV];
			int err = getnameinfo((struct sockaddr *)&sa, sa_len,
				hostname, sizeof hostname,
				portname, sizeof portname,
				NI_DGRAM | NI_NUMERICHOST | NI_NUMERICSERV);
			if(err)
				default_syslog(LOG_WARNING, "%s packet received from unknown source (%s)", recv_result ? "oversized" : "empty", gai_strerror(err));
			else
				default_syslog(LOG_WARNING, "%s packet received from host %s port %s", recv_result ? "oversized" : "empty", hostname, portname);
	}
}

// epoll registration record for the signalfd.
struct signal_data {
	event_handler handler; // first member: read by the epoll dispatch in main()
	int fd;                // the signalfd descriptor
};

// epoll handler for the signalfd: drains all pending signals.
//   SIGCHLD  reaps every finished child; when the service script has exited,
//            script_pid and both "tentative" flags are cleared
//   SIGHUP   logged and ignored
//   SIGINT   unblocked so that a second SIGINT gets its default action, then
//            treated like SIGTERM
//   SIGTERM, SIGXFSZ, SIGXCPU  request a daemon shutdown (self_shutdown)
static void handle_signal_data(struct signal_data *event) {
	int fd = event->fd;
	for(;;) {
		struct signalfd_siginfo signalfd_buffer;
		ssize_t result_byte_count = read(fd, &signalfd_buffer, sizeof signalfd_buffer);
		if(result_byte_count == -1) {
			// EAGAIN means the queue is empty, which ends the drain loop
			if(errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK)
				exit_with_error(EX_OSERR, "read(signalfd): %m");
			break;
		} else if(result_byte_count == sizeof signalfd_buffer) {
			switch(signalfd_buffer.ssi_signo) {
				case SIGCHLD:;
					pid_t pid;
					int script_status;
					while((pid = waitpid(-1, &script_status, WNOHANG)) > 0) {
						const char *process_log_name = "unknown subprocess";
						if(pid == script_pid) {
							if(!WIFSTOPPED(script_status) && !WIFCONTINUED(script_status)) {
								// the script is gone: its start/stop is final
								script_pid = -1;
								local_service_tentative = false;
								remote_service_tentative = false;
							}
							process_log_name = "service script";
							// allow the logger to catch up
							sched_yield();
						}
						if(WIFEXITED(script_status)) {
							script_status = WEXITSTATUS(script_status);
							if(script_status)
								default_syslog(LOG_WARNING, "%s with pid %ld exited with exit code %d", process_log_name, (long)pid, script_status);
						} else if(WIFSIGNALED(script_status)) {
							default_syslog(LOG_WARNING, "%s with pid %ld terminated by signal %d%s", process_log_name, (long)pid, WTERMSIG(script_status), WCOREDUMP(script_status) ? " (core dumped)" : "");
						}
					}
					break;
				case SIGHUP:
					default_syslog(LOG_WARNING, "SIGHUP received but ignored (reloading configuration is unsupported)");
					break;
				case SIGINT:
					// stop intercepting this signal: remove it from the
					// blocked set and from the signalfd
					sigdelset(&signal_mask, signalfd_buffer.ssi_signo);

					if(sigprocmask(SIG_SETMASK, &signal_mask, NULL) == -1)
						exit_with_error(EX_OSERR, "sigprocmask(): %m");

					if(signalfd(fd, &signal_mask, SFD_NONBLOCK | SFD_CLOEXEC) == -1)
						exit_with_error(EX_OSERR, "signalfd(): %m");

					// fallthrough
				case SIGXFSZ:
				case SIGXCPU:
				case SIGTERM:
					// ssi_signo is a uint32_t, hence PRIu32
					default_syslog(LOG_NOTICE, "signal %"PRIu32" received, shutting down", signalfd_buffer.ssi_signo);
					self_shutdown = true;
					break;
				default:
					default_syslog(LOG_WARNING, "unexpectedly caught signal %"PRIu32, signalfd_buffer.ssi_signo);
			}

		} else if(result_byte_count) {
			exit_with_error(EX_OSERR, "read(signalfd): unexpected number of bytes returned: %zd", result_byte_count);
		}
	}
}

// The single signalfd registration; fd is filled in by main().
static struct signal_data signal_data = { (event_handler)handle_signal_data, -1 };

// epoll registration record for the timerfd.
struct timer_data {
	event_handler handler; // first member: read by the epoll dispatch in main()
	int fd;                // the timerfd descriptor
};

// epoll handler for the timerfd: consumes the expiration counter and marks
// the timer as unarmed (current_timer = 0) so the main loop programs it again.
// The expiration count itself is not used.
static void handle_timer_data(struct timer_data *event) {
	uint64_t expirations;
	ssize_t got = read(event->fd, &expirations, sizeof expirations);

	if(got == -1) {
		bool transient = errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK;
		if(!transient)
			exit_with_error(EX_OSERR, "read(timerfd): %m");
	} else if(got != sizeof expirations) {
		exit_with_error(EX_OSERR, "read(timerfd): unexpected number of bytes returned: %zd", got);
	}

	current_timer = 0;
}

// The single timerfd registration; fd is filled in by main().
static struct timer_data timer_data = { (event_handler)handle_timer_data, -1 };

int main(int argc, char **argv) {
	argv = argv0init(argc, argv);

	// make sure stdin is /dev/null and stdin/stdout are open
	// do this before openlog() so that the syslog socket does not confuse things.
	int fd = open("/dev/null", O_RDWR | O_NOCTTY);
	if(fd == -1)
		exit_with_error(EX_OSERR, "open(/dev/null): %m");
	if(fd != STDIN_FILENO && dup2(fd, STDIN_FILENO) == -1)
		exit_with_error(EX_OSERR, "dup2(/dev/null, STDIN_FILENO): %m");
	if(fd != STDOUT_FILENO && fcntl(STDOUT_FILENO, F_GETFD) == -1 && dup2(fd, STDOUT_FILENO) == -1)
		exit_with_error(EX_OSERR, "dup2(/dev/null, STDOUT_FILENO): %m");
	if(fd != STDERR_FILENO && fcntl(STDERR_FILENO, F_GETFD) == -1 && dup2(fd, STDERR_FILENO) == -1)
		exit_with_error(EX_OSERR, "dup2(/dev/null, STDERR_FILENO): %m");
	if(fd != STDIN_FILENO && fd != STDOUT_FILENO && fd != STDERR_FILENO && close(fd) == -1)
		exit_with_error(EX_OSERR, "close(): %m");

	sprintf(default_syslog_ident, "flatline[%ld]", (long)getpid());
#ifdef LOG_PERROR
	if(isatty(STDERR_FILENO))
		current_syslog_options |= LOG_PERROR;
#endif
	set_syslog_ident(default_syslog_ident);

	char short_options[20];
	get_short_options(long_options, short_options);

	opterr = 0;
	int option_character, option_index;
	while((option_character = getopt_long(argc, argv, short_options, long_options, &option_index)) != EOF) {
		switch(option_character) {
			case 'h':
				puts("flatline - minimalistic high-availability system");
				usage(stdout, *argv);
				exit(EX_OK);
			case 'V':
				printf("flatline %s\ncopyright (c) 2016 Wessel Dankers <wsl@fruit.je>\n", VERSION);
				exit(EX_OK);
			case 'v':
				verbose = true;
				break;
			case 'r':
				remote_addr = optarg;
				break;
			case 'p':
				remote_port = optarg;
				break;
			case 'l':
				local_port = optarg;
				break;
			case 'b':
				local_addr = optarg;
				break;
			case 's':
				script_path = optarg;
				break;
			case 'i':
				interval = parse_timespec(optarg);
				break;
			case 't':
				timeout = parse_timespec(optarg);
				break;
			case ':':
				usage(stderr, *argv);
				exit_with_error(EX_USAGE, "option -%c requires an argument", optopt);
			default:
				usage(stderr, *argv);
				exit_with_error(EX_USAGE, "unknown option: -%c", option_character);
		}
	}

	if(optind != argc) {
		usage(stderr, *argv);
		exit_with_error(EX_USAGE, "no argument expected");
	}

	if(!script_path)
		exit_with_error(EX_USAGE, "--script is a required argument but it's missing");

	if(!remote_addr)
		exit_with_error(EX_USAGE, "--remote-address is a required argument but it's missing");

	if(!local_port)
		local_port = remote_port;

	if(interval) {
		if(!timeout)
			timeout = interval * NANOSECOND_C(7) / NANOSECOND_C(2);
	} else {
		if(!timeout)
			timeout = NANOSECOND_C(1000000000);
		interval = timeout * NANOSECOND_C(2) / NANOSECOND_C(7);
	}

	if(timeout < interval)
		exit_with_error(EX_USAGE, "--timeout should be at least as large as --interval");

	if(!timeout)
		exit_with_error(EX_USAGE, "--timeout should not be 0");

	if(!interval)
		exit_with_error(EX_USAGE, "--interval should not be 0");

	argv0set("flatline (starting up)");
	default_syslog(LOG_NOTICE, "starting up");

	epoll_fd = epoll_create1(EPOLL_CLOEXEC);
	struct epoll_event event = {EPOLLIN};

	if(sigprocmask(0, NULL, &original_signal_mask) == -1)
		exit_with_error(EX_OSERR, "sigprocmask(): %m");
	signal_mask = original_signal_mask;
	sigaddset(&signal_mask, SIGCHLD);
	sigaddset(&signal_mask, SIGTERM);
	sigaddset(&signal_mask, SIGXFSZ);
	sigaddset(&signal_mask, SIGXCPU);
	sigaddset(&signal_mask, SIGHUP);
	sigaddset(&signal_mask, SIGINT);
	if(sigprocmask(SIG_SETMASK, &signal_mask, NULL) == -1)
		exit_with_error(EX_OSERR, "sigprocmask(): %m");

	signal(SIGPIPE, SIG_IGN);

	signal_data.fd = signalfd(-1, &signal_mask, SFD_NONBLOCK | SFD_CLOEXEC);
	if(signal_data.fd == -1)
		exit_with_error(EX_OSERR, "signalfd(SIGCHLD): %m");
	event.data.ptr = &signal_data;
	if(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, signal_data.fd, &event) == -1)
		exit_with_error(EX_OSERR, "epoll_ctl(EPOLL_CTL_ADD, %d): %m", signal_data.fd);

	timer_data.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
	if(timer_data.fd == -1)
		exit_with_error(EX_OSERR, "timerfd_create(CLOCK_MONOTONIC): %m");
	event.events = EPOLLIN;
	event.data.ptr = &timer_data;
	if(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_data.fd, &event) == -1)
		exit_with_error(EX_OSERR, "epoll_ctl(EPOLL_CTL_ADD, %d): %m", timer_data.fd);

	struct addrinfo hints = {.ai_family = AF_UNSPEC, .ai_socktype = SOCK_DGRAM, .ai_flags = AI_PASSIVE};
	struct addrinfo *ais, *ai;
	struct socket_list *listeners = NULL, *connectors = NULL, *current_connector, *socket_iterator;
	size_t num_connectors = 0, current_connector_index = 0;

	int err = getaddrinfo(local_addr, local_port, &hints, &ais);
	if(err)
		exit_with_error(EX_OSERR, "getaddrinfo(port=%s): %s", local_port, gai_strerror(err));
	if(!ais)
		exit_with_error(EX_OSERR, "getaddrinfo(port=%s): no addresses available for listening", local_port);

	for(ai = ais; ai; ai = ai->ai_next) {
		int fd = socket(ai->ai_family, ai->ai_socktype | SOCK_NONBLOCK | SOCK_CLOEXEC, ai->ai_protocol);
		if(fd == -1) {
			if(!err)
				err = errno;
		} else {
			const int on = 1;
			setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof on);
			if(bind(fd, ai->ai_addr, ai->ai_addrlen) == -1) {
				if(!err)
					err = errno;
			} else {
				listeners = new_socket_list(listeners, fd, ai);
				listeners->handler = (event_handler)handle_listener_event;
				event.data.ptr = listeners;
				event.events = EPOLLIN;
				if(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1)
					exit_with_error(EX_OSERR, "epoll_ctl(EPOLL_CTL_ADD, %d): %m", fd);
				continue;
			}
			if(close(fd) == -1)
				default_syslog(LOG_ERR, "close(%d): %m", fd);
		}
	}

	freeaddrinfo(ais);

	if(!listeners) {
		errno = err;
		exit_with_error(EX_OSERR, "unable to listen on port %s: %m", local_port);
	}

	hints.ai_flags = 0;

	err = getaddrinfo(remote_addr, remote_port, &hints, &ais);
	if(err)
		exit_with_error(EX_OSERR, "getaddrinfo(addr=%s, port=%s): %s", remote_addr, remote_port, gai_strerror(err));
	if(!ais)
		exit_with_error(EX_OSERR, "getaddrinfo(addr=%s, port=%s): no addresses", remote_addr, remote_port);

	for(ai = ais; ai; ai = ai->ai_next) {
		int fd = -1;
		// we may be able to reuse a listener
		for(socket_iterator = listeners; socket_iterator; socket_iterator = socket_iterator->next) {
			if(socket_iterator->family == ai->ai_family && socket_iterator->socktype == ai->ai_socktype && socket_iterator->protocol == ai->ai_protocol) {
				fd = socket_iterator->fd;
				connectors = new_socket_list(connectors, fd, ai);
				num_connectors++;
				break;
			}
		}

		if(fd == -1) {
			int fd = socket(ai->ai_family, ai->ai_socktype | SOCK_NONBLOCK | SOCK_CLOEXEC, ai->ai_protocol);
			if(fd == -1) {
				if(!err)
					err = errno;
			} else {
				const int on = 1;
				setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof on);
				if(connect(fd, ai->ai_addr, ai->ai_addrlen) == -1) {
					if(!err)
						err = errno;
				} else {
					connectors = new_socket_list(connectors, fd, ai);
					num_connectors++;
					continue;
				}
				if(close(fd) == -1)
					default_syslog(LOG_ERR, "close(%d): %m", fd);
			}
		}
	}

	freeaddrinfo(ais);

	if(!connectors) {
		errno = err;
		exit_with_error(EX_OSERR, "unable to connect to %s %s: %m", remote_addr, remote_port);
	}

	current_connector = connectors;
	now = nanosecond_get_clock();
	if(!now)
		exit_with_error(EX_OSERR, "your clock is wonky (it returned 0)");
	recv_deadline = now + timeout;

	for(;;) {
		if(now >= recv_deadline) {
			if(peer_is_up && !peer_shutdown)
				default_syslog(LOG_ERR, "peer timed out");
			peer_is_up = false;
			peer_shutdown = false;
			peer_has_our_service = false;
			peer_has_their_service = false;
		}

		// split brain detection

		if(local_service_active || local_service_tentative) {
			if(peer_has_our_service)
				local_service_valid = false;
		} else {
			local_service_valid = true;
		}

		if(remote_service_active || remote_service_tentative) {
			if(peer_has_their_service) {
				remote_service_valid = false;
				peer_has_valid_service = false;
			}
		} else {
			remote_service_valid = true;
		}

		if(!peer_has_their_service)
			peer_has_valid_service = true;

		if(remote_service_valid && local_service_valid && peer_has_valid_service) {
			if(split_brain_recovery && local_service_active && !local_service_tentative) {
				default_syslog(LOG_CRIT, "split brain recovery finished; please check if all services are running properly!");
				split_brain_recovery = false;
			}
		} else {
			if(!split_brain_recovery) {
				default_syslog(LOG_CRIT, "split brain detected! will stop and/or start services as necessary");
				split_brain_recovery = true;
			}
		}

		if(script_pid == -1) {
			if(self_shutdown) {
				if(remote_service_active) {
					default_syslog(LOG_NOTICE, "stopping remote service (daemon shutdown)");
					run_script("remote", "stop");
					remote_service_active = false;
					remote_service_tentative = true;
				} else if(local_service_active) {
					default_syslog(LOG_INFO, "stopping local service (daemon shutdown)");
					run_script("local", "stop");
					local_service_active = false;
					local_service_tentative = true;
				} else if(peer_has_our_service && peer_has_their_service) {
					default_syslog(LOG_NOTICE, "shutdown complete; exiting");
					exit(EX_OK);
				} else if(shutdown_deadline) {
					if(now > shutdown_deadline) {
						default_syslog(LOG_NOTICE, "shutdown assumed complete; exiting");
						exit(EX_OK);
					}
				} else {
					// send another three pings for the peer to take over our service
					shutdown_deadline = now + interval * NANOSECOND_C(7) / NANOSECOND_C(2);
				}
			} else if(!remote_service_valid) {
				default_syslog(LOG_CRIT, "stopping remote service (to recover from split brain condition)");
				run_script("remote", "stop");
				remote_service_active = false;
				remote_service_tentative = true;
			} else if(!local_service_valid) {
				default_syslog(LOG_CRIT, "stopping local service (to recover from split brain condition)");
				run_script("local", "stop");
				local_service_active = false;
				local_service_tentative = true;
			} else {
				if(!local_service_active && (!peer_is_up || peer_sees_us) && !peer_has_our_service) {
					default_syslog(LOG_INFO, "starting local service");
					run_script("local", "start");
					local_service_active = true;
					local_service_tentative = true;
				} else if(!peer_shutdown && peer_is_up) {
					if(remote_service_active) {
						default_syslog(LOG_NOTICE, "stopping remote service (to be taken over by peer)");
						run_script("remote", "stop");
						remote_service_active = false;
						remote_service_tentative = true;
					}
				} else {
					if(!remote_service_active && (!peer_is_up || peer_shutdown) && !peer_has_their_service) {
						default_syslog(LOG_WARNING, "starting remote service (to cover for peer)");
						run_script("remote", "start");
						remote_service_active = true;
						remote_service_tentative = true;
					}
				}
			}
		}

		uint8_t packet
			= (local_service_active || local_service_tentative)
			| (remote_service_active || remote_service_tentative || !peer_has_valid_service) << 1
			| self_shutdown << 2
			| peer_is_up << 3;

		if(packet != last_packet || send_deadline <= now) {
			nanosecond_t clock_phase = now % interval;
			size_t connector_index = num_connectors * clock_phase / interval;

			if(current_connector_index > connector_index) {
				current_connector = connectors;
				current_connector_index = 0;
			}
			while(current_connector_index < connector_index) {
				current_connector = current_connector->next;
				current_connector_index++;
			}

			if(packet == last_packet) {
				if(sendto(current_connector->fd, &packet, sizeof packet, 0,
						(const struct sockaddr *)(current_connector + 1), current_connector->addrlen) == -1) {
					if(errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
						current_connector->last_error_moment = now;
						if(errno != current_connector->last_error) {
							current_connector->last_error = errno;
							if(verbose)
								default_syslog(LOG_ERR, "sendto(%s, %s): %m", current_connector->hostname, current_connector->portname);
						}
					}
				} else if(now > current_connector->last_error_moment + timeout) {
					current_connector->last_error = 0;
				}
			} else {
				for(socket_iterator = connectors; socket_iterator; socket_iterator = socket_iterator->next) {
					if(sendto(socket_iterator->fd, &packet, sizeof packet, 0,
							(const struct sockaddr *)(socket_iterator + 1), socket_iterator->addrlen) == -1) {
						if(errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) {
							socket_iterator->last_error_moment = now;
							if(errno != socket_iterator->last_error) {
								socket_iterator->last_error = errno;
								if(verbose)
									default_syslog(LOG_ERR, "sendto(%s, %s): %m", socket_iterator->hostname, socket_iterator->portname);
							}
						}
					} else if(now > socket_iterator->last_error_moment + timeout) {
						socket_iterator->last_error = 0;
					}
				}
				last_packet = packet;
			}

			send_deadline = now - clock_phase + interval * (connector_index + 1) / num_connectors;
		}

		nanosecond_t next_event = send_deadline;
		if(self_shutdown) {
			if(shutdown_deadline && shutdown_deadline > now && shutdown_deadline < next_event)
				next_event = shutdown_deadline;
		} else {
			if(recv_deadline > now && recv_deadline < send_deadline)
				next_event = recv_deadline;
		}
		if(next_event != current_timer) {
			current_timer = next_event;
			struct itimerspec its = {{0}};
			its.it_value.tv_sec = current_timer / NANOSECOND_C(1000000000);
			its.it_value.tv_nsec = current_timer % NANOSECOND_C(1000000000);
			if(timerfd_settime(timer_data.fd, TFD_TIMER_ABSTIME, &its, NULL) == -1)
				exit_with_error(EX_OSERR, "set_timerfdtime(CLOCK_MONOTONIC, TFD_TIMER_ABSTIME): %m\n");
		}

		argv0set("flatline (daemons: local=%s remote=%s) (services: local=%s remote=%s)",
			self_shutdown ? "shutting" : "up",
			peer_is_up ? peer_shutdown ? "shutting" : "up" : "down",
			local_service_active
				? local_service_tentative ? "starting" : "started"
				: local_service_tentative ? "stopping" : "stopped",
			remote_service_active
				? remote_service_tentative ? "starting" : "started"
				: remote_service_tentative ? "stopping" : "stopped"
		);

		switch(epoll_wait(epoll_fd, &event, 1, -1)) {
			case 0:
				break; // not really supposed to happen?
			case -1:
				if(errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK)
					exit_with_error(EX_OSERR, "epoll_wait(): %m");
				break;
			case 1:;
				void *data = event.data.ptr;
				event_handler handler = *(event_handler *)data;
				handler(data);
				break;
			default:
				exit_with_error(EX_OSERR, "epoll_wait() is behaving funny");
		}

		nanosecond_t then = nanosecond_get_clock();
		if(then < now)
			exit_with_error(EX_OSERR, "CLOCK_MONOTONIC isn't");
		now = then;
		if(now > next_event) {
			nanosecond_t overshoot = now - next_event;
			if(overshoot * NANOSECOND_C(7) > timeout * NANOSECOND_C(2)) {
				// looks like we skipped a beat (assuming 7/2=3.5 pings before a timeout)
				default_syslog(LOG_ERR, "clock jump detected - adjusting");
				recv_deadline += overshoot;
				if(shutdown_deadline)
					shutdown_deadline += overshoot;
			}
		}
	}

	return 0;
}
