[master] 28964cf Add back the ports waiter with the existing (struct waited *) passing through port_send

Nils Goroll nils.goroll at uplex.de
Thu May 28 18:11:14 CEST 2015


commit 28964cf1cb071b53ef87a27ca060b25fb74a9ad7
Author: Nils Goroll <nils.goroll at uplex.de>
Date:   Thu May 28 17:42:50 2015 +0200

    Add back the ports waiter with the existing (struct waited *) passing through port_send
    
    See comment XXX questions to discuss with phk
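
For readers unfamiliar with Solaris event ports, here is a standalone sketch
of the passing pattern the commit title refers to: a producer hands a pointer
to the waiter thread with port_send(), and the waiter receives it from
port_getn() as a PORT_SOURCE_USER event, which is in essence what vws_enter()
and vws_port_ev() do below.  This is not Varnish code; struct my_waited and
all values are made up for illustration.

#include <port.h>
#include <stdio.h>
#include <sys/types.h>

/* hypothetical stand-in for the (struct waited *) payload */
struct my_waited {
	int	fd;
};

int
main(void)
{
	struct my_waited wp = { .fd = 42 };
	struct my_waited *got;
	port_event_t ev;
	uint_t nget = 1;
	int dport;

	/* one event port shared by the producer and the waiter thread */
	dport = port_create();
	if (dport < 0) {
		perror("port_create");
		return (1);
	}

	/*
	 * Producer side: post a user event carrying the pointer; the second
	 * argument is user-defined and echoed back in portev_events (the
	 * commit passes 0 here as well).
	 */
	if (port_send(dport, 0, &wp) != 0) {
		perror("port_send");
		return (1);
	}

	/*
	 * Waiter side: retrieve the event; the pointer comes back in
	 * portev_user and could now be registered for POLLIN with
	 * port_associate(dport, PORT_SOURCE_FD, ...).
	 */
	if (port_getn(dport, &ev, 1, &nget, NULL) != 0) {
		perror("port_getn");
		return (1);
	}
	if (ev.portev_source == PORT_SOURCE_USER) {
		got = ev.portev_user;
		printf("received fd %d\n", got->fd);
	}
	return (0);
}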

diff --git a/bin/varnishd/waiter/cache_waiter_ports.c b/bin/varnishd/waiter/cache_waiter_ports.c
index d02f302..f44959e 100644
--- a/bin/varnishd/waiter/cache_waiter_ports.c
+++ b/bin/varnishd/waiter/cache_waiter_ports.c
@@ -3,7 +3,7 @@
  * Copyright (c) 2006 Varnish Software AS
  * Copyright (c) 2007 OmniTI Computer Consulting, Inc.
  * Copyright (c) 2007 Theo Schlossnagle
- * Copyright (c) 2010-2012 UPLEX, Nils Goroll
+ * Copyright (c) 2010-2015 UPLEX, Nils Goroll
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,36 @@
  *
  */
 
-#if 0
+
+/*
+ * XXX questions to discuss with phk - is it really better
+ *
+ * - to share the binheap (requiring the mtx), which epoll / kq do now
+ *
+ * - than to send events to be entered through the events interface and keep the
+ *   binheap private to the waiter thread (which we still do here) ?
+ *
+ * at best, we can save two port syscalls, but not always:
+ *
+ * - if the waited event has a timeout earlier than the first element on the
+ *   binheap, we need to kick the waiter thread anyway
+ *
+ * - if the waiter thread is busy, it will get the passed waited event together
+ *   with other events
+ *
+ * on the other hand, we need to sync on the mtx to protect the binheap.  Solaris
+ * uses userland adaptive mutexes: if the thread holding the lock is running,
+ * spinlock, otherwise syscall.
+ *
+ * and the critical section for the mtx is basically "whenever not blocking in
+ * port_getn", which does not sound too good with respect to scalability.
+ *
+ * At any rate, we could save even more syscalls by increasing nevents
+ * (port_getn returns when nevents exist or the timeout is reached), but this
+ * would increase our latency in reacting to POLLIN events.
+ *
+ */
+
 #include "config.h"
 
 #if defined(HAVE_PORT_CREATE)
@@ -47,24 +76,24 @@
 #include "waiter/mgt_waiter.h"
 #include "vtim.h"
 
+// XXX replace with process.max-port-events bound to a sensible maximum
 #define MAX_EVENTS 256
 
 struct vws {
 	unsigned		magic;
 #define VWS_MAGIC		0x0b771473
 	struct waiter		*waiter;
-
 	pthread_t		thread;
+	double			next;
 	int			dport;
+	unsigned		nwaited;
+	int			die;
 };
 
 static inline void
 vws_add(struct vws *vws, int fd, void *data)
 {
-	/*
-	 * POLLIN should be all we need here
-	 *
-	 */
+	// POLLIN should be all we need here
 	AZ(port_associate(vws->dport, PORT_SOURCE_FD, fd, POLLIN, data));
 }
 
@@ -75,38 +104,29 @@ vws_del(struct vws *vws, int fd)
 }
 
 static inline void
-vws_port_ev(struct vws *vws, port_event_t *ev, double now) {
-	struct waited *sp;
+vws_port_ev(struct vws *vws, struct waiter *w, port_event_t *ev, double now) {
+	struct waited *wp;
 	if(ev->portev_source == PORT_SOURCE_USER) {
-		CAST_OBJ_NOTNULL(sp, ev->portev_user, WAITED_MAGIC);
-		assert(sp->fd >= 0);
-		VTAILQ_INSERT_TAIL(&vws->waiter->waithead, sp, list);
-		vws_add(vws, sp->fd, sp);
+		CAST_OBJ_NOTNULL(wp, ev->portev_user, WAITED_MAGIC);
+		assert(wp->fd >= 0);
+		vws->nwaited++;
+		Wait_HeapInsert(vws->waiter, wp);
+		vws_add(vws, wp->fd, wp);
 	} else {
 		assert(ev->portev_source == PORT_SOURCE_FD);
-		CAST_OBJ_NOTNULL(sp, ev->portev_user, WAITED_MAGIC);
-		assert(sp->fd >= 0);
-		if(ev->portev_events & POLLERR) {
-			vws_del(vws, sp->fd);
-			Wait_Handle(vws->waiter, sp, WAITER_REMCLOSE, now);
-			return;
-		}
-
+		CAST_OBJ_NOTNULL(wp, ev->portev_user, WAITED_MAGIC);
+		assert(wp->fd >= 0);
+		vws->nwaited--;
 		/*
-		 * note: the original man page for port_associate(3C) states:
-		 *
-		 *    When an event for a PORT_SOURCE_FD object is retrieved,
-		 *    the object no longer has an association with the port.
-		 *
-		 * This can be read along the lines of sparing the
-		 * port_dissociate after port_getn(), but in fact,
-		 * port_dissociate should be used
+		 * port_getn does not implicitly disassociate
 		 *
 		 * Ref: http://opensolaris.org/jive/thread.jspa?\
 		 *          threadID=129476&tstart=0
 		 */
-		vws_del(vws, sp->fd);
-		Wait_Handle(vws->waiter, sp, WAITER_ACTION, now);
+		vws_del(vws, wp->fd);
+		Wait_Call(w, wp, ev->portev_events & POLLERR ?
+		    WAITER_REMCLOSE : WAITER_ACTION,
+		    now);
 	}
 	return;
 }
@@ -114,53 +134,46 @@ vws_port_ev(struct vws *vws, port_event_t *ev, double now) {
 static void *
 vws_thread(void *priv)
 {
-	struct waited *sp;
+	struct waited *wp;
+	struct waiter *w;
 	struct vws *vws;
+	double now, then;
+	struct timespec ts;
+	const double	max_t  = 100.0;
+	port_event_t ev[MAX_EVENTS];
+	u_int nevents;
+	int ei, ret;
 
 	CAST_OBJ_NOTNULL(vws, priv, VWS_MAGIC);
-	/*
-	 * timeouts:
-	 *
-	 * min_ts : Minimum timeout for port_getn
-	 * min_t  : ^ equivalent in floating point representation
-	 *
-	 * max_ts : Maximum timeout for port_getn
-	 * max_t  : ^ equivalent in floating point representation
-	 *
-	 * with (nevents == 1), we should always choose the correct port_getn
-	 * timeout to check session timeouts, so max is just a safety measure
-	 * (if this implementation is correct, it could be set to an "infinte"
-	 *  value)
-	 *
-	 * with (nevents > 1), min and max define the acceptable range for
-	 * - additional latency of keep-alive connections and
-	 * - additional tolerance for handling session timeouts
-	 *
-	 */
-	static struct timespec min_ts = {0L, 100L * 1000L * 1000L /*ns*/};
-	static double          min_t  = 0.1; /* 100    ms*/
-	static struct timespec max_ts = {1L, 0L};		/* 1 second */
-	static double	       max_t  = 1.0;			/* 1 second */
-
-	/* XXX: These should probably go in vws ? */
-	struct timespec ts;
-	struct timespec *timeout;
+	w = vws->waiter;
+	CHECK_OBJ_NOTNULL(w, WAITER_MAGIC);
+	THR_SetName("cache-ports");
 
-	timeout = &max_ts;
+	now = VTIM_real();
 
-	while (!vws->waiter->dismantle) {
-		port_event_t ev[MAX_EVENTS];
-		u_int nevents;
-		int ei, ret;
-		double now, idle;
+	while (!vws->die) {
+		while (1) {
+			then = Wait_HeapDue(w, &wp);
+			if (wp == NULL) {
+				vws->next = now + max_t;
+				break;
+			} else if (then > now) {
+				vws->next = then;
+				break;
+			}
+			CHECK_OBJ_NOTNULL(wp, WAITED_MAGIC);
+			vws_del(vws, wp->fd);
+			Wait_Call(w, wp, WAITER_TIMEOUT, now);
+		}
+		then = vws->next - now;
+		ts.tv_sec = (time_t)floor(then);
+		ts.tv_nsec = (long)(1e9 * (then - ts.tv_sec));
 
 		/*
-		 * XXX Do we want to scale this up dynamically to increase
-		 *     efficiency in high throughput situations? - would need to
-		 *     start with one to keep latency low at any rate
-		 *
-		 *     Note: when increasing nevents, we must lower min_ts
-		 *	     and max_ts
+		 * min number of events we accept.  could consider scaling up
+		 * for efficiency, but as we always get all waiting events up to
+		 * the maximum, we'd only optimize the idle case, sacrificing
+		 * some latency
 		 */
 		nevents = 1;
 
@@ -177,12 +190,12 @@ vws_thread(void *priv)
 		 *
 		 */
 
-		ret = port_getn(vws->dport, ev, MAX_EVENTS, &nevents, timeout);
+		ret = port_getn(vws->dport, ev, MAX_EVENTS, &nevents, &ts);
 		now = VTIM_real();
 
 		if (ret < 0 && errno == EBADF) {
-			/* Our stop signal */
-			AN(vws->waiter->dismantle);
+			/* close on dport is our stop signal */
+			AN(vws->die);
 			break;
 		}
 
@@ -190,61 +203,21 @@ vws_thread(void *priv)
 			assert((errno == EINTR) || (errno == ETIME));
 
 		for (ei = 0; ei < nevents; ei++)
-			vws_port_ev(vws, ev + ei, now);
-
-		/* check for timeouts */
-		idle = now - *vws->waiter->tmo;
-
-		/*
-		 * This loop assumes that the oldest sessions are always at the
-		 * beginning of the list (which is the case if we guarantee to
-		 * enqueue at the tail only
-		 *
-		 */
-
-		for (;;) {
-			sp = VTAILQ_FIRST(&vws->waiter->waithead);
-			if (sp == NULL)
-				break;
-			if (sp->idle > idle) {
-				break;
-			}
-			vws_del(vws, sp->fd);
-			Wait_Handle(vws->waiter, sp, WAITER_TIMEOUT, now);
-		}
-
-		/*
-		 * Calculate the timeout for the next get_portn
-		 */
-
-		if (sp) {
-			double tmo = (sp->idle + *vws->waiter->tmo) - now;
-
-			if (tmo < min_t) {
-				timeout = &min_ts;
-			} else if (tmo > max_t) {
-				timeout = &max_ts;
-			} else {
-				ts = VTIM_timespec(tmo);
-				timeout = &ts;
-			}
-		} else {
-			timeout = &max_ts;
-		}
+			vws_port_ev(vws, w, &ev[ei], now);
 	}
-	return(0);
+	return NULL;
 }
 
 /*--------------------------------------------------------------------*/
 
 static int
-vws_pass(void *priv, struct waited *sp)
+vws_enter(void *priv, struct waited *wp)
 {
 	int r;
 	struct vws *vws;
 
 	CAST_OBJ_NOTNULL(vws, priv, VWS_MAGIC);
-	r = port_send(vws->dport, 0, TRUST_ME(sp));
+	r = port_send(vws->dport, 0, TRUST_ME(wp));
 	if (r == -1 && errno == EAGAIN)
 		return (-1);
 	AZ(r);
@@ -277,6 +250,7 @@ vws_fini(struct waiter *w)
 	void *vp;
 
 	CAST_OBJ_NOTNULL(vws, w->priv, VWS_MAGIC);
+	vws->die = 1;
 	AZ(close(vws->dport));
 	AZ(pthread_join(vws->thread, &vp));
 }
@@ -287,9 +261,8 @@ const struct waiter_impl waiter_ports = {
 	.name =		"ports",
 	.init =		vws_init,
 	.fini =		vws_fini,
-	.pass =		vws_pass,
+	.enter =		vws_enter,
 	.size =		sizeof(struct vws),
 };
 
 #endif /* defined(HAVE_PORT_CREATE) */
-#endif
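
A side note on the timeout handling in the new vws_thread() loop above:
Wait_HeapDue() yields an absolute deadline (a double, compared against
VTIM_real()), which has to be turned into a relative struct timespec before
calling port_getn() with a minimum of one event.  A minimal sketch of just
that conversion and call, with hypothetical names (wait_until, MY_MAX_EVENTS)
and without the heap bookkeeping, might look like this:

#include <assert.h>
#include <errno.h>
#include <math.h>
#include <port.h>
#include <sys/types.h>
#include <time.h>

#define MY_MAX_EVENTS 256	/* stand-in for MAX_EVENTS */

/*
 * Block in port_getn() until the absolute deadline "next" (in seconds, as a
 * double) or until at least one event arrives; returns the number of events
 * placed into ev[].  Hypothetical helper, not taken from the Varnish tree.
 */
static unsigned
wait_until(int dport, double now, double next,
    port_event_t ev[MY_MAX_EVENTS])
{
	struct timespec ts;
	double delta;
	uint_t nevents = 1;	/* minimum number of events we accept */

	delta = next - now;
	if (delta < 0.0)
		delta = 0.0;	/* deadline already passed: just poll */
	ts.tv_sec = (time_t)floor(delta);
	ts.tv_nsec = (long)(1e9 * (delta - (double)ts.tv_sec));

	if (port_getn(dport, ev, MY_MAX_EVENTS, &nevents, &ts) < 0) {
		/* ETIME: deadline reached, EINTR: interrupted; both normal */
		assert(errno == ETIME || errno == EINTR);
	}
	return (nevents);
}

The loop in the diff relies on nevents still being updated when port_getn()
fails with ETIME, so events collected before the deadline are processed
anyway.
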
diff --git a/bin/varnishd/waiter/mgt_waiter.c b/bin/varnishd/waiter/mgt_waiter.c
index 18b3269..ee7cef7 100644
--- a/bin/varnishd/waiter/mgt_waiter.c
+++ b/bin/varnishd/waiter/mgt_waiter.c
@@ -36,18 +36,16 @@
 #include "mgt/mgt.h"
 #include "waiter/mgt_waiter.h"
 
-static const struct choice waiter_choice[] = {
+const struct choice waiter_choice[] = {
     #if defined(HAVE_KQUEUE)
 	{ "kqueue",	&waiter_kqueue },
     #endif
     #if defined(HAVE_EPOLL_CTL)
 	{ "epoll",	&waiter_epoll },
     #endif
-#if 0
     #if defined(HAVE_PORT_CREATE)
 	{ "ports",	&waiter_ports },
     #endif
-#endif
 	{ "poll",	&waiter_poll },
 	{ NULL,		NULL}
 };
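
Finally, on the stop mechanism: vws_fini() above sets vws->die and closes the
port, and the waiter thread treats EBADF from port_getn() as its signal to
exit, so the thread can then be joined.  A self-contained sketch of that
shutdown pattern (hypothetical names, no Varnish types) could look like this:

#include <errno.h>
#include <port.h>
#include <pthread.h>
#include <stddef.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

struct my_waiter {
	int		dport;
	int		die;
	pthread_t	thread;
};

/* waiter loop: block in port_getn() until events arrive or the port dies */
static void *
waiter_thread(void *priv)
{
	struct my_waiter *w = priv;
	struct timespec ts = { 1, 0 };	/* check the die flag at least once/s */
	port_event_t ev;
	uint_t nevents;

	while (!w->die) {
		nevents = 1;
		if (port_getn(w->dport, &ev, 1, &nevents, &ts) < 0) {
			if (errno == EBADF)
				break;	/* close() on dport: stop signal */
			continue;	/* ETIME / EINTR: just loop */
		}
		/* ... handle the retrieved event(s) here ... */
	}
	return (NULL);
}

/* shutdown: raise the flag, close the port to fail the wait, then join */
static void
waiter_fini(struct my_waiter *w)
{
	void *vp;

	w->die = 1;
	(void)close(w->dport);
	(void)pthread_join(w->thread, &vp);
}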


