Commit d9c36c7e authored by Nils Goroll

POC/MVP: introduce T_SUBREQ: ref the subrequest

much else needs to be changed, but this commit still works with the
previous concept, so it might be helpful...

Before this commit, our concept basically was:

- start esi include requests on separate threads as quickly as
  possible

- copy or reference bytes received via a VDP bytes callback

- have the top request thread push these bytes

- run additional VDPs on the subrequest threads

This concept has some fundamental drawbacks:

- varnish-cache core uses the gzgz and pretendgzip vdps to strip
  intermediate gzip headers and calculate the CRC

  Because the CRC needs to be calculated in the order of delivery, we
  cannot calculate it in the subrequest threads. We would thus need to
  reinvent all of the CRC calculation, with many special cases to
  consider.

- even if we did this, our support for additional VDPs at esi_level >
  0 would be either limited or really complicated: For one, we
  currently always need the pesi vdp first (which differs from
  standard varnish) and we probably would need many more cases where
  we copy data

In general, our current concept complicates things and requires work
to be done multiple times.

This commit shows the basic idea to avoid all this complication. It is
far from clean, but already survives a varnishtest -j40 -n1000
src/tests/*vtc

It does not yet change the vdp context, but it will allow us to get
much closer to the original varnish behavior:

We return from the subreq thread without invoking any delivery; we
just save references to the request and the (busy) object to continue
delivery later (in the top request thread).
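
As a condensed sketch of the subreq side (taken from the
vped_deliver() hunk further down; locking and the OC_F_FINAL handling
are left out here), we now just park the references on the node and
return:

    /* subreq thread, non-ESI object: no delivery, just park refs */
    node->type = T_SUBREQ;
    node->state = ST_DATA;

    /* cnt_transmit() releases its boc/oc refs when we return, so
     * take our own before handing the request to the topreq thread */
    node->subreq.req = req;
    node->subreq.boc = HSH_RefBoc(req->objcore);
    HSH_Ref(req->objcore);
    node->subreq.oc = req->objcore;

    AZ(pthread_cond_init(&node->subreq.cond, NULL));
    pesi->keep_req = 1;     /* tell ved_task() not to req_fini() yet */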

The only ugliness this requires is that we need to keep varnish-cache
core code from removing a private (pass/hfm/hfp) object from under our
feet.
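
Concretely (again condensed from the diff): the subreq thread
temporarily clears the "final" flags so the core code does not retire
the object, and the topreq thread restores them just before the
deferred cleanup:

    #define OC_F_FINAL (OC_F_PRIVATE | OC_F_HFM | OC_F_HFP)

    /* subreq thread (vped_deliver): park the flags */
    node->subreq.oc_flags_saved = req->objcore->flags;
    if ((req->objcore->flags & OC_F_FINAL) != 0)
            req->objcore->flags &= ~OC_F_FINAL;

    /* topreq thread (bytes_push_worklist): restore before cleanup */
    if ((node->subreq.oc_flags_saved & OC_F_FINAL) != 0)
            objcore->flags = node->subreq.oc_flags_saved;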

Then the top request can deliver non-esi objects with the already
built vdp chain without any additional copying whatsoever; delivery of
the subrequest is simply continued in a different thread.
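
On the topreq side this boils down to (simplified from the
bytes_push_worklist() hunk; error paths and the pesi_destroy() /
req_fini() tail are omitted):

    /* wait until the subreq thread has parked its refs */
    if (node->subreq.done == 0) {
            Lck_Lock(&tree->tree_lock);
            if (node->subreq.done == 0)
                    AZ(Lck_CondWait(&node->subreq.cond,
                        &tree->tree_lock, 0));
            Lck_Unlock(&tree->tree_lock);
    }

    subreq->wrk = req->wrk;     /* deliver on the topreq worker */
    VDP_DeliverObj(subreq);

    /* deferred bottom half of cnt_transmit() */
    HSH_Cancel(req->wrk, subreq->objcore, boc);
    if (boc != NULL)
            HSH_DerefBoc(req->wrk, subreq->objcore);
    (void)HSH_DerefObjCore(req->wrk, &subreq->objcore, 0);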

This will allow us to switch back to the varnish-cache esi concepts:
ESI subrequests push their gzgz/pretendgzip VDPs and are otherwise
compatible with other VDPs, and the esi VDP no longer needs to be
present on every subrequest.
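
The new push_vdps_NOesi() below already points in that direction: a
non-ESI subrequest only gets the gzip-related VDPs pushed and the pesi
buffer VDP is left out entirely. Roughly (gzipped stands for the
ObjCheckFlag(..., OF_GZIPED) result, and the workspace setup for foo
is omitted):

    if (tree->isgzip && gzipped && !(req->res_mode & RES_ESI))
            /* gzip'ed include which is not ESI processed:
             * strip header/trailer, keep the CRC */
            XXXAZ(VDP_Push(req, &bad_gzgz, foo));
    else if (tree->isgzip && !gzipped)
            /* non-gzip'ed include in a gzip'ed parent */
            XXXAZ(VDP_Push(req, &bad_pretend_gz, pecx));
    else
            /* anything else goes straight through, nothing pushed */
            ;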

Via our transport, I think we will at least be able to ensure pesi is
used on subrequests if level 0 has esi, but we might even get to
pesi/esi interop to the extent that starting with esi and continuing
with pesi at some deeper level could work.

For pesi objects we will need to continue to ref/buffer VDP_bytes,
because we simply need to do the ESI parse in parallel, and at least
for private objects there is no second chance: the object will be gone
once we have seen the VDP_bytes once.

Copying could still be optimized to use fewer storage objects.
parent ebfcebb5
......@@ -29,9 +29,9 @@ digraph tree {
/* legend */
T_NEXUS [shape=diamond]
T_DATA [shape=triangle]
T_REQ [shape=box]
T_SUBREQ [shape=box]
T_NEXUS -> T_DATA
T_NEXUS -> T_REQ
T_NEXUS -> T_SUBREQ
}
}
\ No newline at end of file
......@@ -91,10 +91,13 @@ static const uint8_t gzip_hdr[] = {
static const void * const gzip_fini = &gzip_fini;
#define GZIP_TAILBUF_SZ 13
#define OC_F_FINAL (OC_F_PRIVATE | OC_F_HFM | OC_F_HFP)
enum n_type {
T_INVALID = 0,
T_NEXUS,
T_DATA
T_DATA,
T_SUBREQ
};
/*
......@@ -147,6 +150,17 @@ struct node_data {
enum vdp_action act;
};
struct node_subreq {
struct req *topreq; // XXX HACK
struct req *req;
struct boc *boc;
struct objcore *oc;
uint8_t oc_flags_saved;
// subreq to topreq delivery
int done;
pthread_cond_t cond;
};
struct node {
unsigned magic;
#define NODE_MAGIC 0xe31edef3
......@@ -155,10 +169,14 @@ struct node {
VSTAILQ_ENTRY(node) sibling;
VSTAILQ_ENTRY(node) unpend;
struct node *parent;
// XXX (re)move me: the req from vdp_bytes()
struct req *req;
union {
struct node_nexus nexus; // T_NEXUS
struct node_data data; // T_DATA
struct node_subreq subreq; // T_SUBREQ
};
};
......@@ -223,12 +241,13 @@ struct pesi {
unsigned magic;
#define PESI_MAGIC 0xa6ba54a0
unsigned bypass;
unsigned keep_req;
struct pesi_tree *pesi_tree;
struct pecx pecx[1];
VTAILQ_ENTRY(pesi) list;
// node lives in pecx->move there rather?
// XXX REMOVE LEFTOVER
struct node *fosterparent;
};
......@@ -324,11 +343,11 @@ pesi_finish(struct pesi_tree *pesi_tree)
{
Lck_Lock(&pesi_tree->task_lock);
assert(pesi_tree->task_running > 0);
assert(pesi_tree->task_finishing > 0);
// XXX WIP assert(pesi_tree->task_finishing > 0);
pesi_tree->task_finishing--;
pesi_tree->task_running--;
if (pesi_tree->task_running == 0) {
AZ(pesi_tree->task_finishing);
// XXX WIP AZ(pesi_tree->task_finishing);
AZ(pthread_cond_signal(&pesi_tree->task_cond));
}
Lck_Unlock(&pesi_tree->task_lock);
......@@ -363,15 +382,22 @@ node_free(struct node *node)
AN(mempool);
MPL_AssertSane(node);
if (node->type == T_DATA) {
if (node->type == T_NEXUS)
VSTAILQ_FOREACH_SAFE(child, &node->nexus.children, sibling, tmp)
node_free(child);
switch (node->type) {
case T_SUBREQ:
AN(node->subreq.done);
AZ(pthread_cond_destroy(&node->subreq.cond));
/* FALLTHROUGH */
case T_NEXUS:
case T_DATA:
MPL_Free(mempool, node);
return;
break;
default:
INCOMPL();
}
VSTAILQ_FOREACH_SAFE(child, &node->nexus.children, sibling, tmp)
node_free(child);
MPL_Free(mempool, node);
}
static void
......@@ -389,13 +415,18 @@ node_insert(struct bytes_tree *tree, struct node *parent,
else
assert(parent->state == ST_PRIVATE);
if (node->type == T_NEXUS)
switch (node->type) {
case T_NEXUS:
assert(node->state == ST_PRIVATE ||
node->state == ST_OPEN);
else if (node->type == T_DATA)
break;
case T_DATA:
case T_SUBREQ:
assert(node->state == ST_DATA);
else
break;
default:
INCOMPL();
}
AZ(node->parent);
......@@ -525,6 +556,8 @@ set_unpending(struct bytes_tree *tree, struct node *node)
Lck_AssertHeld(&tree->tree_lock);
assert(node->state == ST_DATA);
assert(node->type == T_DATA ||
node->type == T_SUBREQ);
node->state = ST_UNPENDING;
......@@ -737,9 +770,20 @@ ved_task(struct worker *wrk, void *priv)
VCL_Rel(&req->vcl);
VSLdbg(req, "Done PESI");
req_fini(req, wrk);
if (pesi->keep_req == 0) {
VSLdbg(req, "Done PESI keep_req == 0");
req_fini(req, wrk);
}
else {
VSLdbg(req, "Done PESI keep_req == 1");
assert (node->type == T_SUBREQ);
// XXX MAKE EFFICIENT
// WOULD BE BETTER TO ONLY INSERT NODE WHEN READY
Lck_Lock(&pesi_tree->tree->tree_lock);
node->subreq.done = 1;
AZ(pthread_cond_signal(&node->subreq.cond));
Lck_Unlock(&pesi_tree->tree->tree_lock);
}
wrk->task.func = NULL;
......@@ -1058,6 +1102,13 @@ bytes_unpend_worklist(struct req *req, struct bytes_tree *tree,
assert(node->state == ST_DATA);
if (node->type == T_SUBREQ) {
VSTAILQ_INSERT_TAIL(work, node, unpend);
set_unpending(tree, node);
check = CHK_ANY;
continue;
}
p = node->data.ptr;
if (p == NULL && node->data.len > 0) {
CHECK_OBJ_NOTNULL(node->data.st, STORAGE_MAGIC);
......@@ -1120,6 +1171,73 @@ bytes_push_worklist(struct req *req, struct bytes_tree *tree,
// assert_node needs lock
assert(node->state == ST_UNPENDING);
/* XXX CLEANUP */
if (node->type == T_SUBREQ) {
struct pesi *pesi;
if (node->subreq.done == 0) {
Lck_Lock(&tree->tree_lock);
if (node->subreq.done == 0)
AZ(Lck_CondWait(
&node->subreq.cond,
&tree->tree_lock, 0));
Lck_Unlock(&tree->tree_lock);
}
AN(node->subreq.done);
/* transfer all to local variables */
struct req *subreq = node->subreq.req;
struct boc *boc = node->subreq.boc;
struct objcore *objcore = node->subreq.oc;
node->subreq.req = NULL;
node->subreq.boc = NULL;
node->subreq.oc = NULL;
if ((node->subreq.oc_flags_saved & OC_F_FINAL) != 0)
objcore->flags = node->subreq.oc_flags_saved;
else
assert(objcore->flags == node->subreq.oc_flags_saved);
CHECK_OBJ_NOTNULL(subreq, REQ_MAGIC);
CHECK_OBJ_ORNULL(boc, BOC_MAGIC);
AZ(subreq->objcore);
subreq->objcore = objcore;
objcore = NULL;
CHECK_OBJ_NOTNULL(subreq->objcore, OBJCORE_MAGIC);
// XXX NEED Bytes to topreq
// VDP_close(node->subreq.req);
/* wrk from topreq ! */
subreq->wrk = req->wrk;
// XXX HACK
node->subreq.topreq = req;
VSLdbg(subreq, "DeliverObj from top");
VDP_DeliverObj(subreq);
/* bottom of cnt_transmit() */
HSH_Cancel(req->wrk, subreq->objcore, boc);
if (boc != NULL)
HSH_DerefBoc(req->wrk, subreq->objcore);
(void)HSH_DerefObjCore(req->wrk, &subreq->objcore, 0);
// XXX TOTALLY THE WRONG ORDER pesi_finish was already
// called by the client thread from ved_task()
CAST_OBJ_NOTNULL(pesi, subreq->transport_priv,
PESI_MAGIC);
pesi_destroy(&pesi);
req_fini(subreq, req->wrk);
node = next;
continue;
}
assert(node->type == T_DATA);
p = node->data.ptr;
if (p == NULL && node->data.len > 0) {
CHECK_OBJ_NOTNULL(node->data.st, STORAGE_MAGIC);
......@@ -1332,7 +1450,6 @@ pesi_buf_bytes(struct req *req, enum vdp_action act, void **priv,
* non-api knowledge from VDP_DeliverObj()
* final = req->objcore->flags & (OC_F_PRIVATE | OC_F_HFM | OC_F_HFP);
*/
#define OC_F_FINAL (OC_F_PRIVATE | OC_F_HFM | OC_F_HFP)
/*
* if data is coming from a varnish-cache simple storage, we know that
......@@ -1351,7 +1468,6 @@ pesi_buf_bytes(struct req *req, enum vdp_action act, void **priv,
parent->nexus.oc = req->objcore;
HSH_Ref(parent->nexus.oc);
}
#undef OC_F_FINAL
refok = (ptr != tailbuf) &&
......@@ -1427,7 +1543,7 @@ const struct vdp VDP_pesi_buf = {
/* VDP pesi */
static int
push_vdps(struct req *req)
push_vdps_esi(struct req *req)
{
int i;
struct pecx *pecx;
......@@ -1454,17 +1570,17 @@ push_vdps(struct req *req)
VTAILQ_FOREACH(vdpe, &req->vdc->vdp, list) {
CHECK_OBJ_NOTNULL(vdpe, VDP_ENTRY_MAGIC);
VSLdbgv(req,
"push_vdps: VDP list before pushing parent VDPs: %s",
"push_vdps_esi: VDP list before pushing parent VDPs: %s",
vdpe->vdp->name);
}
i = ObjCheckFlag(req->wrk, req->objcore, OF_GZIPED);
VSLdbgv(req, "push_vdps: OF_GZIPED=%d tree->isgzip=%d RES_ESI=%d",
VSLdbgv(req, "push_vdps_esi: OF_GZIPED=%d tree->isgzip=%d RES_ESI=%d",
i, tree->isgzip, (req->res_mode & RES_ESI) == 0);
if (tree->isgzip && i && !(req->res_mode & RES_ESI)) {
/* A gzip'ed include which is not ESI processed */
VSLdbg(req, "push_vdps: pushing bad_gzgz");
VSLdbg(req, "push_vdps_esi: pushing bad_gzgz");
if ((foo = WS_Alloc(req->ws, sizeof(*foo))) == NULL) {
VSLb(req->vsl, SLT_Error,
"Insufficient workspace for ESI gzip data");
......@@ -1476,13 +1592,13 @@ push_vdps(struct req *req)
VDP_Push(req, &VDP_pesi_buf, tree);
}
else if (tree->isgzip && !i) {
VSLdbg(req, "push_vdps: pushing bad_pretend_gz");
VSLdbg(req, "push_vdps_esi: pushing bad_pretend_gz");
/* Non-Gzip'ed include in gzip'ed parent */
XXXAZ(VDP_Push(req, &bad_pretend_gz, pecx));
VDP_Push(req, &VDP_pesi_buf, tree);
}
else {
VSLdbg(req, "push_vdps: pushing pesi");
VSLdbg(req, "push_vdps_esi: pushing pesi_buf");
/* Anything else goes straight through */
VDP_Push(req, &VDP_pesi_buf, tree);
}
......@@ -1490,7 +1606,74 @@ push_vdps(struct req *req)
CHECK_OBJ_NOTNULL(req->vdc, VDP_CTX_MAGIC);
VTAILQ_FOREACH(vdpe, &req->vdc->vdp, list) {
CHECK_OBJ_NOTNULL(vdpe, VDP_ENTRY_MAGIC);
VSLdbgv(req, "push_vdps: VDP list: %s", vdpe->vdp->name);
VSLdbgv(req, "push_vdps_esi: VDP list: %s", vdpe->vdp->name);
}
return (0);
}
/* XXX CLEANUP */
static int
push_vdps_NOesi(struct req *req)
{
int i;
struct pecx *pecx;
struct pesi_tree *pesi_tree;
struct pesi *pesi;
struct bad_foo *foo;
struct bytes_tree *tree;
CHECK_OBJ_NOTNULL(req, REQ_MAGIC);
CHECK_OBJ_NOTNULL(req->objcore, OBJCORE_MAGIC);
CHECK_OBJ_NOTNULL(req->wrk, WORKER_MAGIC);
CAST_OBJ_NOTNULL(pesi, req->transport_priv, PESI_MAGIC);
pecx = pesi->pecx;
CHECK_OBJ_NOTNULL(pecx, PECX_MAGIC);
pesi_tree = pesi->pesi_tree;
CHECK_OBJ_NOTNULL(pesi_tree, PESI_TREE_MAGIC);
CHECK_OBJ_NOTNULL(pesi_tree->tree, BYTES_TREE_MAGIC);
tree = pesi_tree->tree;
/* XXX 5-28 */
struct vdp_entry *vdpe;
CHECK_OBJ_NOTNULL(req->vdc, VDP_CTX_MAGIC);
VTAILQ_FOREACH(vdpe, &req->vdc->vdp, list) {
CHECK_OBJ_NOTNULL(vdpe, VDP_ENTRY_MAGIC);
VSLdbgv(req,
"push_vdps_NOesi: VDP list before pushing parent VDPs: %s",
vdpe->vdp->name);
}
i = ObjCheckFlag(req->wrk, req->objcore, OF_GZIPED);
VSLdbgv(req, "push_vdps_NOesi: OF_GZIPED=%d tree->isgzip=%d RES_ESI=%d",
i, tree->isgzip, (req->res_mode & RES_ESI) == 0);
if (tree->isgzip && i && !(req->res_mode & RES_ESI)) {
/* A gzip'ed include which is not ESI processed */
VSLdbg(req, "push_vdps_NOesi: pushing bad_gzgz");
if ((foo = WS_Alloc(req->ws, sizeof(*foo))) == NULL) {
VSLb(req->vsl, SLT_Error,
"Insufficient workspace for ESI gzip data");
return (-1);
}
INIT_OBJ(foo, BAD_FOO_MAGIC);
foo->pecx = pecx;
XXXAZ(VDP_Push(req, &bad_gzgz, foo));
}
else if (tree->isgzip && !i) {
VSLdbg(req, "push_vdps_NOesi: pushing bad_pretend_gz");
/* Non-Gzip'ed include in gzip'ed parent */
XXXAZ(VDP_Push(req, &bad_pretend_gz, pecx));
}
else {
VSLdbg(req, "push_vdps_NOesi: pushing nothing");
/* Anything else goes straight through */
}
/* XXX debugging */
CHECK_OBJ_NOTNULL(req->vdc, VDP_CTX_MAGIC);
VTAILQ_FOREACH(vdpe, &req->vdc->vdp, list) {
CHECK_OBJ_NOTNULL(vdpe, VDP_ENTRY_MAGIC);
VSLdbgv(req, "push_vdps_NOesi: VDP list: %s", vdpe->vdp->name);
}
return (0);
}
......@@ -1585,7 +1768,7 @@ vdp_pesi_init(struct req *req, void **priv)
AZ(pecx->woken);
req->transport_priv = pesi;
return (push_vdps(req));
return (push_vdps_esi(req));
}
static int v_matchproto_(vdp_fini_f)
......@@ -1705,6 +1888,19 @@ vdp_pesi_bytes(struct req *req, enum vdp_action act, void **priv,
node = pecx->node;
CHECK_OBJ_NOTNULL(node, NODE_MAGIC);
/*
* XXX call from topreq delivery
*
* TODO: take pesi out of the VDPs
*/
if (node->type == T_SUBREQ) {
VSLdbg(req, "ved_vdp: T_SUBREQ");
AN(node->subreq.done);
AN(node->subreq.topreq);
req->acct.resp_bodybytes += len;
return (VDP_bytes(node->subreq.topreq, act, ptr, len));
}
/*
* XXX fails for e00031.vtc and e00033.vtc:
*
......@@ -1715,6 +1911,9 @@ vdp_pesi_bytes(struct req *req, enum vdp_action act, void **priv,
*/
assert(node->state == ST_OPEN || node->state == ST_PRIVATE);
/*
* XXX avoid being pushed for anything but ESI
*/
if (!ObjHasAttr(req->wrk, req->objcore, OA_ESIDATA)) {
VSLdbg(req, "ved_vdp: no ESI data, pushing");
req->acct.resp_bodybytes += len;
......@@ -1820,7 +2019,7 @@ vdp_pesi_bytes(struct req *req, enum vdp_action act, void **priv,
VSLdbgv(req,
"ved_vdp: pushing VDP tree=%p",
tree);
XXXAZ(push_vdps(req));
XXXAZ(push_vdps_esi(req));
}
Debug("INCL [%s][%s] END\n", q, pecx->p);
pecx->p = r + 1;
......@@ -1962,6 +2161,7 @@ vped_deliver(struct req *req, struct boc *boc, int wantbody)
int i;
struct pesi *pesi;
struct bytes_tree *tree;
struct node *node;
VSLdbgv(req, "vped_deliver: req=%p boc=%p wantbody=%d", req, boc,
wantbody);
......@@ -1982,6 +2182,61 @@ vped_deliver(struct req *req, struct boc *boc, int wantbody)
if (boc == NULL && ObjGetLen(req->wrk, req->objcore) == 0)
return;
/* XXX WIP / STILL HACKY */
if (!ObjHasAttr(req->wrk, req->objcore, OA_ESIDATA)) {
XXXAZ(push_vdps_NOesi(req));
node = pesi->pecx->node;
/* XXX TODO CHANGE NODE TYPES -- this is a NEXUS atm */
assert(node->type == T_NEXUS);
assert(node->state == ST_PRIVATE);
assert(node->nexus.npending_private == 0);
Lck_Lock(&tree->tree_lock);
// XXX we should not need to lock here once node types are
// cleaned up
node->type = T_SUBREQ;
node->state = ST_DATA;
/* our caller, cnt_transmit(), releases the refs to boc and oc
* when we return, so in order to hand this request to the
* topreq thread, we need to gain one more
*/
node->subreq.req = req;
node->subreq.boc = HSH_RefBoc(req->objcore);
HSH_Ref(req->objcore);
node->subreq.oc = req->objcore;
/*
* also, cnt_transmit calls HSH_Cancel which we need to postpone
*/
node->subreq.oc_flags_saved = req->objcore->flags;
if ((req->objcore->flags & OC_F_FINAL) != 0) {
req->objcore->flags &= ~OC_F_FINAL;
}
AZ(pthread_cond_init(&node->subreq.cond, NULL));
// XXX rename
pesi->keep_req = 1;
if (node->parent == NULL ||
node->parent->state != ST_PRIVATE)
AZ(pthread_cond_signal(&tree->cond));
Lck_Unlock(&tree->tree_lock);
/*
* to be done in the other thread:
* - VDP_DeliverObj()
* - VDP_close()
*
* from ved_task():
* - req_fini()
*/
return;
}
/* XXX needed ??? */
i = ObjCheckFlag(req->wrk, req->objcore, OF_GZIPED);
VSLdbgv(req, "vped_deliver: OF_GZIPED=%d tree->isgzip=%d RES_ESI=%d",
i, tree->isgzip, req->res_mode & RES_ESI);
......@@ -1996,7 +2251,7 @@ vped_deliver(struct req *req, struct boc *boc, int wantbody)
return;
}
}
XXXAZ(push_vdps(req));
XXXAZ(push_vdps_esi(req));
(void)VDP_DeliverObj(req);
VDP_close(req);
}
......