Always reschedule requests from the waiting list.

Previously we used various heuristics (test TCP connection) to avoid the reschedule if the req was abandonned while on the waiting list. We also subjected the rescheduling to pool-queue limits like any new request. This was all safe because we knew we could clean up the request state cheaply, even if it was somewhat cumbersome. Now vmods can have per-task PRIV's and we have no idea what it will cost us (stack, time, etc) to clean them up, so we cannot burden J.Random Request who happens to rush the waiting list with the burden. Fix this by always rescheduling, not subject to pool-queue limits, and eliminate all the special-casing for exceeded limits, including the debug feature to force a rescheduling failure and two tests exercising it. As a side effect of this, requests on the waiting list gets a "business class upgrade" over newly arriving requests when there are no worker threads available. Given that these requests arrived earlier, and we already performed work on them, this seems only fair. Forced to pay proper attention by: slink

Always reschedule requests from the waiting list.
Previously we used various heuristics (test TCP connection) to avoid the reschedule if the req was abandonned while on the waiting list. We also subjected the rescheduling to pool-queue limits like any new request. This was all safe because we knew we could clean up the request state cheaply, even if it was somewhat cumbersome. Now vmods can have per-task PRIV's and we have no idea what it will cost us (stack, time, etc) to clean them up, so we cannot burden J.Random Request who happens to rush the waiting list with the burden. Fix this by always rescheduling, not subject to pool-queue limits, and eliminate all the special-casing for exceeded limits, including the debug feature to force a rescheduling failure and two tests exercising it. As a side effect of this, requests on the waiting list gets a "business class upgrade" over newly arriving requests when there are no worker threads available. Given that these requests arrived earlier, and we already performed work on them, this seems only fair. Forced to pay proper attention by: slink
59367a6c · Poul-Henning Kamp · c0a4ee41 · 59367a6c · 59367a6c · 59367a6c
Commit 59367a6c authored Aug 28, 2018 by Poul-Henning Kamp
11 changed files
--- a/bin/varnishd/cache/cache.h
+++ b/bin/varnishd/cache/cache.h
@@ -217,10 +217,17 @@ struct pool_task {
 * tasks are taken off the queues in this order
 *
 * prios up to TASK_QUEUE_RESERVE are run from the reserve
+ *
+ * TASK_QUEUE_{REQ|STR} are new req's (H1/H2), and subject to queue limit.
+ *
+ * TASK_QUEUE_RUSH is req's returning from waiting list, they are
+ * not subject to TASK_QUEUE_CLIENT because we cannot safely clean
+ * them up if scheduling them fails.
 */
 enum task_prio {
 	TASK_QUEUE_BO,
 #define TASK_QUEUE_RESERVE	TASK_QUEUE_BO
+	TASK_QUEUE_RUSH,
 	TASK_QUEUE_REQ,
 	TASK_QUEUE_STR,
 	TASK_QUEUE_VCA,

--- a/bin/varnishd/cache/cache_hash.c
+++ b/bin/varnishd/cache/cache_hash.c
@@ -76,6 +76,7 @@ static struct objhead *private_oh;
 static void hsh_rush1(const struct worker *, struct objhead *,
    struct rush *, int);
 static void hsh_rush2(struct worker *, struct rush *);
+static int HSH_DerefObjHead(struct worker *wrk, struct objhead **poh);

 /*---------------------------------------------------------------------*/

@@ -591,8 +592,18 @@ hsh_rush2(struct worker *wrk, struct rush *r)
 		CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
 		VTAILQ_REMOVE(&r->reqs, req, w_list);
 		DSL(DBG_WAITINGLIST, req->vsl->wid, "off waiting list");
-		AN(req->transport->reembark);
-		req->transport->reembark(wrk, req);
+		if (req->transport->reembark != NULL) {
+			// For ESI includes
+			req->transport->reembark(wrk, req);
+		} else {
+			/*
+			 * We ignore the queue limits which apply to new
+			 * requests because if we fail to reschedule there
+			 * may be vmod_privs to cleanup and we need a proper
+			 * workerthread for that.
+			 */
+			AZ(Pool_Task(req->sp->pool, &req->task, TASK_QUEUE_RUSH));
+		}
 	}
 }

@@ -939,7 +950,7 @@ HSH_DerefObjCore(struct worker *wrk, struct objcore **ocp, int rushmax)
 	return (0);
 }

-int
+static int
 HSH_DerefObjHead(struct worker *wrk, struct objhead **poh)
 {
 	struct objhead *oh;

--- a/bin/varnishd/cache/cache_objhead.h
+++ b/bin/varnishd/cache/cache_objhead.h
@@ -63,7 +63,6 @@ int HSH_Snipe(const struct worker *, struct objcore *);
 struct boc *HSH_RefBoc(const struct objcore *);
 void HSH_DerefBoc(struct worker *wrk, struct objcore *);
 void HSH_DeleteObjHead(const struct worker *, struct objhead *);
-int HSH_DerefObjHead(struct worker *, struct objhead **);

 int HSH_DerefObjCore(struct worker *, struct objcore **, int rushmax);
 #define HSH_RUSH_POLICY -1

--- a/bin/varnishd/cache/cache_req_fsm.c
+++ b/bin/varnishd/cache/cache_req_fsm.c
@@ -50,34 +50,6 @@
 #include "vsha256.h"
 #include "vtim.h"

-/*--------------------------------------------------------------------
- * Reschedule a request from the waiting list
- */
-
-int
-CNT_Reembark(struct worker *wrk, struct req *req)
-{
-
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-
-	if (!DO_DEBUG(DBG_FAILRESCHED) &&
-	    !SES_Reschedule_Req(req, TASK_QUEUE_REQ))
-		return (0);
-
-	/* Couldn't schedule, ditch */
-	wrk->stats->busy_wakeup--;
-	wrk->stats->busy_killed++;
-	VSLb(req->vsl, SLT_Error, "Fail to reschedule req from waiting list");
-
-	AN(req->ws->r);
-	WS_Release(req->ws, 0);
-	AN(req->hash_objhead);
-	(void)HSH_DerefObjHead(wrk, &req->hash_objhead);
-	AZ(req->hash_objhead);
-	return(-1);
-}
-
 /*--------------------------------------------------------------------
 * Handle "Expect:" and "Connection:" on incoming request
 */

--- a/bin/varnishd/cache/cache_session.c
+++ b/bin/varnishd/cache/cache_session.c
@@ -376,30 +376,6 @@ SES_New(struct pool *pp)
 	return (sp);
 }

-/*--------------------------------------------------------------------
- * Reschedule a request on a work-thread from its sessions pool
- *
- * This is used to reschedule requests waiting on busy objects
- */
-
-int
-SES_Reschedule_Req(struct req *req, enum task_prio prio)
-{
-	struct sess *sp;
-	struct pool *pp;
-
-	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-	sp = req->sp;
-	CHECK_OBJ_NOTNULL(sp, SESS_MAGIC);
-	pp = sp->pool;
-	CHECK_OBJ_NOTNULL(pp, POOL_MAGIC);
-	AN(TASK_QUEUE_CLIENT(prio));
-
-	AN(req->task.func);
-
-	return (Pool_Task(pp, &req->task, prio));
-}
-
 /*--------------------------------------------------------------------
 * Handle a session (from waiter)
 */

--- a/bin/varnishd/cache/cache_varnishd.h
+++ b/bin/varnishd/cache/cache_varnishd.h
@@ -335,7 +335,6 @@ enum req_fsm_nxt {
 };

 enum req_fsm_nxt CNT_Request(struct worker *, struct req *);
-int CNT_Reembark(struct worker *, struct req *);

 /* cache_session.c */
 void SES_NewPool(struct pool *, unsigned pool_no);
@@ -343,7 +342,6 @@ void SES_DestroyPool(struct pool *);
 void SES_Wait(struct sess *, const struct transport *);
 void SES_Ref(struct sess *sp);
 void SES_Rel(struct sess *sp);
-int SES_Reschedule_Req(struct req *, enum task_prio);

 const char * HTC_Status(enum htc_status_e);
 void HTC_RxInit(struct http_conn *htc, struct ws *ws);

--- a/bin/varnishd/http1/cache_http1_fsm.c
+++ b/bin/varnishd/http1/cache_http1_fsm.c
@@ -201,25 +201,6 @@ http1_req_cleanup(struct sess *sp, struct worker *wrk, struct req *req)
 	return (0);
 }

-/*----------------------------------------------------------------------
- * Clean up a req from waiting list which cannot complete
- */
-
-static void v_matchproto_(vtr_reembark_f)
-http1_reembark(struct worker *wrk, struct req *req)
-{
-
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-	assert(req->transport == &HTTP1_transport);
-
-	if (!CNT_Reembark(wrk, req))
-		return;
-
-	SES_Close(req->sp, SC_OVERLOAD);
-	AN(http1_req_cleanup(req->sp, wrk, req));
-}
-
 static int v_matchproto_(vtr_minimal_response_f)
 http1_minimal_response(struct req *req, uint16_t status)
 {
@@ -263,7 +244,6 @@ struct transport HTTP1_transport = {
 	.deliver =		V1D_Deliver,
 	.minimal_response =	http1_minimal_response,
 	.new_session =		http1_new_session,
-	.reembark =		http1_reembark,
 	.req_body =		http1_req_body,
 	.req_fail =		http1_req_fail,
 	.req_panic =		http1_req_panic,

--- a/bin/varnishd/http2/cache_http2_session.c
+++ b/bin/varnishd/http2/cache_http2_session.c
@@ -434,32 +434,12 @@ h2_new_session(struct worker *wrk, void *arg)
 	h2_del_sess(wrk, h2, SC_RX_JUNK);
 }

-static void v_matchproto_(vtr_reembark_f)
-h2_reembark(struct worker *wrk, struct req *req)
-{
-	struct h2_req *r2;
-
-	CHECK_OBJ_NOTNULL(wrk, WORKER_MAGIC);
-	CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
-	assert(req->transport == &H2_transport);
-
-	if (!CNT_Reembark(wrk, req))
-		return;
-
-	CAST_OBJ_NOTNULL(r2, req->transport_priv, H2_REQ_MAGIC);
-	assert(r2->state == H2_S_CLOS_REM);
-	AN(r2->scheduled);
-	r2->scheduled = 0;
-	r2->h2sess->do_sweep = 1;
-}
-
 struct transport H2_transport = {
 	.name =			"H2",
 	.magic =		TRANSPORT_MAGIC,
 	.deliver =		h2_deliver,
 	.minimal_response =	h2_minimal_response,
 	.new_session =		h2_new_session,
-	.reembark =		h2_reembark,
 	.req_body =		h2_req_body,
 	.req_fail =		h2_req_fail,
 	.sess_panic =		h2_sess_panic,

--- a/bin/varnishtest/tests/c00013.vtc
+++ b/bin/varnishtest/tests/c00013.vtc
@@ -50,60 +50,3 @@ varnish v1 -vsl_catchup
 varnish v1 -expect busy_sleep >= 1
 varnish v1 -expect busy_wakeup >= 1
 varnish v1 -stop
-
-##################################################
-# Now try again where getting a thread fails
-
-barrier b3 cond 2
-barrier b4 cond 2
-
-server s3 {
-	rxreq
-	expect req.url == "/foo"
-	send "HTTP/1.0 200 OK\r\nConnection: close\r\n\r\n"
-	delay .2
-	barrier b3 sync
-	delay .2
-	send "line1\n"
-	delay .2
-	barrier b4 sync
-	send "line2\n"
-} -start
-
-varnish v3 -vcl+backend {
-	sub vcl_backend_fetch {
-		set bereq.backend = s3;
-	}
-	sub vcl_backend_response {
-		set beresp.do_stream = false;
-	}
-} -start
-
-varnish v3 -cliok "param.set debug +failresched"
-
-varnish v3 -cliok "param.set debug +syncvsl"
-
-client c3 -connect ${v3_sock} {
-	txreq -url "/foo" -hdr "client: c3"
-	rxresp
-	expect resp.status == 200
-	expect resp.bodylen == 12
-	expect resp.http.x-varnish == "1001"
-} -start
-
-barrier b3 sync
-
-client c4 -connect ${v3_sock} {
-	txreq -url "/foo" -hdr "client: c4"
-	delay .2
-	barrier b4 sync
-	expect_close
-} -run
-
-client c3 -wait
-
-varnish v1 -vsl_catchup
-varnish v3 -expect busy_sleep >= 1
-varnish v3 -expect busy_wakeup == 0
-varnish v3 -expect busy_killed == 1
-varnish v3 -expect sc_overload == 1
--- a/bin/varnishtest/tests/r02563.vtc
+++ b/bin/varnishtest/tests/r02563.vtc
-varnishtest "#2563: Panic after reembark failure"
-
-barrier b1 cond 2
-barrier b2 cond 2
-
-server s1 {
-	rxreq
-	expect req.url == "/foo"
-	expect req.http.client == "c1"
-	send "HTTP/1.0 200 OK\r\nConnection: close\r\n\r\n"
-	delay .2
-	barrier b1 sync
-	delay .2
-	send "line1\n"
-	delay .2
-	barrier b2 sync
-	send "line2\n"
-} -start
-
-varnish v1 -vcl+backend {
-	sub vcl_backend_response {
-		set beresp.do_stream = false;
-	}
-} -start
-
-varnish v1 -cliok "param.set feature +http2"
-varnish v1 -cliok "param.set debug +failresched"
-varnish v1 -cliok "param.set debug +waitinglist"
-varnish v1 -cliok "param.set debug +syncvsl"
-
-client c1 {
-	txreq -url "/foo" -hdr "client: c1"
-	rxresp
-	expect resp.status == 200
-	expect resp.bodylen == 12
-	expect resp.http.x-varnish == "1001"
-} -start
-
-barrier b1 sync
-
-client c2 {
-	stream 1 {
-		txreq -url "/foo"
-		delay .2
-		barrier b2 sync
-		rxrst
-		expect rst.err == REFUSED_STREAM
-	} -start
-
-	stream 3 {
-		delay 1
-		txreq -url "/foo"
-		rxresp
-	} -run
-
-	stream 1 -wait
-} -run
-
-client c1 -wait
-
-varnish v1 -vsl_catchup
-varnish v1 -expect busy_sleep >= 1
-varnish v1 -expect busy_wakeup == 0
-varnish v1 -expect busy_killed == 1
--- a/include/tbl/debug_bits.h
+++ b/include/tbl/debug_bits.h
@@ -50,7 +50,6 @@ DEBUG_BIT(H2_NOCHECK,		h2_nocheck,	"Disable various H2 checks")
 DEBUG_BIT(VMOD_SO_KEEP,		vmod_so_keep,	"Keep copied VMOD libraries")
 DEBUG_BIT(PROCESSORS,		processors,	"Fetch/Deliver processors")
 DEBUG_BIT(PROTOCOL,		protocol,	"Protocol debugging")
-DEBUG_BIT(FAILRESCHED,		failresched,	"Fail from waiting list")
 #undef DEBUG_BIT

 /*lint -restore */