Rework fellow_cache_obj_iter and read ahead

Issue #41 has shown a deadlock scenario where various object iterators
would wait for memory.

While reviewing this issue, we noticed a couple of shortcomings in the
existing code:

* fellow_cache_seg_ref_in() would always wait for allocation requests
  for readahead segments. Yet, under memory pressure, we should not
  wait for readahead memory at all.

* fellow_cache_obj_iter() would hold onto already sent segments even
  while waiting for synchronous I/O and memory allocations.

To address these shortcomings and further optimize the code, parts of
fellow_cache_obj_iter() and all of the readahead code have been
rewritten. The improvements comprise the following:

* For read ahead, we now use asynchronous memory allocations. If they
  succeed right away, we also issue I/O right away, but if allocations
  are delayed, we continue delivery and check back later. With any
  luck, the allocations will have succeeded by then (see the sketch
  after this list).

* We decouple memory allocations from specific segments and only care
  about the right size of the allocation. Because many segments will
  be of chunk_bytes size, this will allow more efficient use of
  available asynchronous allocations.

* We now also dereference already sent segments whenever we need to
  wait for anything, be it a memory allocation or I/O. This should
  help overall efficiency and reduce memory pressure, because already
  sent segments can be LRUd earlier.

  The drawback is that we flush the VDP pipeline more often (we need
  to flush before we can dereference segments).
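
The check-back pattern from the first bullet, as a minimal sketch. The
helpers used here (ra_try_alloc(), ra_issue_read(),
deliver_one_segment()) are hypothetical stand-ins for illustration
only, not the actual buddy/fellow API, and error handling is omitted:

  #include <stddef.h>

  /* hypothetical: returns memory if the asynchronous allocation has
   * completed, NULL if it is still pending */
  void *ra_try_alloc(size_t size);
  /* hypothetical: starts an asynchronous disk read into mem */
  void ra_issue_read(void *mem, size_t size);
  /* hypothetical: delivers the next segment that is already in core */
  void deliver_one_segment(void);

  static void
  readahead_step(size_t segsize)
  {
      void *mem = ra_try_alloc(segsize);

      if (mem != NULL) {
          /* allocation succeeded right away: issue I/O right away */
          ra_issue_read(mem, segsize);
          return;
      }
      /* allocation delayed: continue delivery and check back on the
       * next pass, by which time the allocation may have completed */
      deliver_one_segment();
  }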

We also cap the readahead parameter at the equivalent of 1/16 of
memory in order to avoid inefficiencies from single requests holding
too much of the memory cache hostage.

An additional hard cap of 31 is required to keep the default ESI
depth supported with the default stack size of varnish-cache.
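
For reference, a sketch of the cap arithmetic (the authoritative check
is the stvfe_tune_check() hunk further down; the example figures are
illustrative, not defaults):

  #include <stddef.h>

  /* readahead may use at most 1/16 of the memory cache, i.e. at most
   * memsz >> (chunk_exponent + 4) chunks, and never more than 31 */
  static unsigned
  readahead_cap(size_t memsz, unsigned chunk_exponent, unsigned readahead)
  {
      size_t max = memsz >> (chunk_exponent + 4);

      if (max > 31)
          max = 31;
      if (readahead > max)
          readahead = (unsigned)max;
      return (readahead);
  }

  /* example: memsz = 1 GiB, chunk_exponent = 20 (1 MiB chunks):
   * (1 << 30) >> 24 = 64 chunks, so the hard cap of 31 applies */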
parent c8b01760
......@@ -25,6 +25,19 @@ fellow
* Improved code coverage and added Coverity for additional linting.
* Added an absolute maximum of 31 and a dynamic maximum to the readahead
parameter to avoid single object deliveries holding more than 1/16
of the available memory cache.
* The readahead implementation has been changed to only run when no
more than half (rounded down) of the configured read ahead segments
are already available.
* The ``readahead`` parameter default has been changed from 2 to 5 to
enable the efficiency improvement from the aforementioned change: as
5 / 2 = 2, read ahead now triggers once every 2 segments instead of
for every segment.
* Added a dynamic minimum to the dsk_reserve_chunks parameter to
always keep the reserve at 2MB minimum. This is required for stable
operation of LRU when the log is full.
......
......@@ -3579,6 +3579,67 @@ fellow_cache_seg_deref(struct fellow_cache_seg * const *segs, unsigned n)
AZ(pthread_mutex_unlock(&fco->mtx));
}
struct fcoi_deref {
unsigned magic;
#define FCOI_DEREF_MAGIC 0x2a16ec74
unsigned n, max;
struct fellow_cache_seg **segs;
objiterate_f *func;
void *priv;
};
static inline void
fcoi_add(struct fcoi_deref *fcoid, struct fellow_cache_seg *fcs)
{
CHECK_OBJ_NOTNULL(fcoid, FCOI_DEREF_MAGIC);
AN(fcoid->segs);
assert(fcoid->n < fcoid->max);
fcoid->segs[fcoid->n++] = fcs;
}
/*
* deref when we know a flush has just happened or is not needed
*/
static inline void
fcoi_deref(struct fcoi_deref *fcoid)
{
CHECK_OBJ_NOTNULL(fcoid, FCOI_DEREF_MAGIC);
if (fcoid->n == 0)
return;
AN(fcoid->segs);
assert(fcoid->n <= fcoid->max);
fellow_cache_seg_deref(fcoid->segs, fcoid->n);
fcoid->n = 0;
}
/*
* when deref'ing from _obj_iter(), we need to flush first.
* this function wraps the deref'ing in a struct
*/
static int
fellow_cache_obj_iter_flush_deref(struct fcoi_deref *fcoid)
{
int r;
CHECK_OBJ_NOTNULL(fcoid, FCOI_DEREF_MAGIC);
if (fcoid->n == 0)
return (0);
AN(fcoid->func);
r = fcoid->func(fcoid->priv, OBJ_ITER_FLUSH, NULL, (size_t)0);
fcoi_deref(fcoid);
return (r);
}
static const char *
fellow_cache_seg_check(struct fellow_cache_seg *fcs)
{
......@@ -3796,6 +3857,214 @@ fellow_cache_seg_ref_in(struct fellow_cache *fc, enum fellow_cache_io_e type,
fellow_cache_seg_ref_in(fc, type, racesegs, racen);
}
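/*
 * reqs[] holds two allocation request sets: NREQS selects the set
 * collecting requests issued on the current pass, OREQS the set filled
 * on the previous pass, whose allocations may have completed by now.
 * The two roles are swapped at the end of fellow_cache_obj_readahead().
 */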
#define NREQS (newreqs ? 1 : 0)
#define OREQS (newreqs ? 0 : 1)
/* return 1 if allocation has been assigned */
static inline int
fellow_cache_obj_readahead_assign_or_request(
const struct fellow_cache *fc,
struct fellow_cache_seg *fcs,
struct buddy_returns *rets,
struct buddy_reqs reqs[2], unsigned newreqs)
{
struct buddy_ptr_extent mem;
size_t sz;
int r;
assert(fcs->state == FCS_DISK);
AZ(fcs->alloc.ptr);
sz = fellow_rndup(fc->ffd, fcs->disk_seg->seg.size);
mem = buddy_get_next_ptr_extent(&reqs[OREQS]);
while (mem.ptr != NULL && mem.size != sz) {
DBG("%zu != %zu", mem.size, sz);
AN(buddy_return_ptr_extent(rets, &mem));
mem = buddy_get_next_ptr_extent(&reqs[OREQS]);
}
if (mem.ptr != NULL) {
DBG("success %p", mem.ptr);
fcs->alloc = mem;
return (1);
}
DBG("fail %u", 0);
r = buddy_req_extent(&reqs[NREQS], sz, 0);
if (r == 0)
assert(errno == ENOSPC);
return (0);
}
#undef NREQS
#undef OREQS
#define NREQS (*newreqs ? 1 : 0)
#define OREQS (*newreqs ? 0 : 1)
static void
fellow_cache_obj_readahead(
struct fellow_cache *fc,
struct fcscursor *rac, struct fellow_cache_obj *fco,
struct fellow_cache_seg *ra[], const unsigned mod /* length of ra */,
unsigned *ranp, const unsigned ranto, unsigned need,
struct fcoi_deref *fcoid,
struct buddy_reqs reqs[2], unsigned *newreqs)
{
struct fellow_cache_seg *fcs;
struct fcscursor c;
unsigned ran, ckp, u, n, needdisk, ion;
/* jump target for need == 1 and sync allocation */
again:
needdisk = 0;
ion = 0;
CHECK_OBJ_NOTNULL(fc, FELLOW_CACHE_MAGIC);
AN(rac);
CHECK_OBJ_NOTNULL(fco, FELLOW_CACHE_OBJ_MAGIC);
AN(ra);
AN(mod);
AN(ranp);
ran = *ranp;
assert(need <= 1);
AN(fcoid);
AN(reqs);
AN(newreqs);
assert(*newreqs <= 1);
struct fellow_lru_chgbatch lcb[1] =
FELLOW_LRU_CHGBATCH_INIT(lcb, fco, 64);
struct buddy_returns *rets =
BUDDY_RETURNS_STK(fc->membuddy, BUDDY_RETURNS_MAX);
assert(ranto >= ran);
n = (ranto - ran) + 1;
struct fellow_cache_seg *iosegs[n];
memset(iosegs, 0, sizeof iosegs);
c = *rac;
// unlocked check
if (! (ran < ranto &&
(fcs = FCSC_NEXT(&c)) != NULL &&
(need || FCOS(fcs->state) > FCOS_BUSY)))
return;
c = *rac;
// update available allocations
(void) buddy_alloc_async_ready(&reqs[OREQS]);
BUDDY_REQS_PRI(&reqs[NREQS], FEP_GET);
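/*
 * ckp records the first read-ahead position for which no memory was
 * available; the read-ahead cursor is not advanced past it, so those
 * segments are revisited on a later pass.
 */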
ckp = ranto;
AZ(pthread_mutex_lock(&fco->mtx));
/* with need, we enter the loop also for FCOS_BUSY, but only once
*
* after the loop, needdisk signifies if we need the radisk
*/
while (ran < ranto &&
(fcs = FCSC_NEXT(&c)) != NULL &&
(need || FCOS(fcs->state) > FCOS_BUSY)) {
if (need)
assert(FCOS(fcs->state) >= FCOS_BUSY);
else
assert(FCOS(fcs->state) > FCOS_BUSY);
assert(fcs->fco == fco);
/* ref all incore-ish, remember _DISK */
switch (fcs->state) {
case FCS_READFAIL:
INCOMPL();
break;
case FCS_BUSY:
case FCS_READING:
case FCS_WRITING:
case FCS_CHECK:
case FCS_INCORE:
goto ref;
case FCS_DISK:
break;
default:
WRONG("_readahead fcs->state");
}
assert(fcs->state == FCS_DISK);
if (fellow_cache_obj_readahead_assign_or_request(fc, fcs,
rets, reqs, *newreqs) == 0) {
if (ran < ckp)
ckp = ran;
// end the loop to sync wait for single allocation
if (need > needdisk) {
needdisk = need;
need = 0;
// because sync wait, raise pri
BUDDY_REQS_PRI(&reqs[NREQS], FEP_NEW);
break;
}
goto ref;
}
// ref for IO
(void) fellow_cache_seg_ref_locked(NULL, fcs);
fellow_cache_seg_transition_locked_notincore(fcs, FCS_READING);
iosegs[ion++] = fcs;
ref:
need = 0;
/*
* for goto again, references are already taken
*/
if (ra[ran % mod] == fcs)
AN(fcs->refcnt);
else {
(void) fellow_cache_seg_ref_locked(lcb, fcs);
AZ(ra[ran % mod]);
ra[ran % mod] = fcs;
}
ran++;
}
AN(FCO_REFCNT(fco));
FCO_REFCNT(fco) += ion;
fellow_cache_lru_chgbatch_apply(lcb);
AZ(pthread_mutex_unlock(&fco->mtx));
if (ckp < ran)
ran = ckp;
if (ion)
fellow_cache_seg_async_read(fc, iosegs, ion);
// final assertions & advance read-ahead cursor
for (u = *ranp; u < ran; u++) {
fcs = FCSC_NEXT(rac);
assert(fcs == ra[u % mod]);
CHECK_OBJ_NOTNULL(fcs, FELLOW_CACHE_SEG_MAGIC);
AN(fcs->refcnt);
}
*ranp = ran;
uint8_t rdy = buddy_alloc_async(&reqs[NREQS]);
buddy_alloc_async_done(&reqs[OREQS]);
buddy_return(rets);
// swap OREQS <=> NREQS
*newreqs = OREQS;
// if allocation succeeded immediately, kick off I/O
if (rdy)
goto again;
if (needdisk) {
AZ(need);
(void) fellow_cache_obj_iter_flush_deref(fcoid);
AN(buddy_alloc_async_wait(&reqs[OREQS]));
goto again;
}
}
/*
* ra[] is a ring of pointers to fcses which we (potentially)
* read ahead.
......@@ -3818,23 +4087,39 @@ fellow_cache_obj_iter(struct fellow_cache *fc, struct fellow_cache_obj *fco,
struct fellow_cache_res fcr;
unsigned readahead = fc->tune->readahead;
unsigned mod = readahead + 1;
struct fellow_cache_seg *fcsnext, *fcs, *fcsra,
*ra[mod], *raio[mod], *deref[mod];
struct fellow_cache_seg *fcsnext, *fcs, *ra[mod], *deref[mod];
struct fcscursor c, rac;
unsigned n = 0, ran = 0, raion, derefn = 0, flags, flush;
unsigned need, n = 0, ran = 0, flags, flush;
struct fcoi_deref fcoid[1];
const char *err;
ssize_t sz;
int ret2;
// 56 bytes per i_reqalloc + 176 bytes per reqs
// 56 * 8 + 2 * 176 = 800
struct buddy_reqs reqs[2] = {
BUDDY_REQS_LIT(fc->membuddy, 3),
BUDDY_REQS_LIT(fc->membuddy, 5)
};
unsigned newreqs = 0;
// stack usage
assert(readahead <= 31);
fcr.status = fcr_ok; // also if func() != 0
fcr.r.integer = 0;
CHECK_OBJ_NOTNULL(fco, FELLOW_CACHE_OBJ_MAGIC);
memset(ra, 0, sizeof ra);
memset(raio, 0, sizeof raio);
memset(deref, 0, sizeof deref);
INIT_OBJ(fcoid, FCOI_DEREF_MAGIC);
fcoid->max = mod;
fcoid->segs = deref;
fcoid->func = func;
fcoid->priv = priv;
fcsc_init(&c, &fco->seglist);
rac = c;
......@@ -3846,62 +4131,40 @@ fellow_cache_obj_iter(struct fellow_cache *fc, struct fellow_cache_obj *fco,
* that we do not read past the last busy segment
*/
assert(FCOS(fcs->state) >= FCOS_BUSY);
assert(fcs->fco == fco);
if (ra[n % mod] == NULL) {
assert(n == ran);
fcsra = FCSC_NEXT(&rac);
assert(fcsra == fcs);
ran++;
need = ran == n ? 1 : 0;
fellow_cache_seg_ref_in(fc, FCIO_SYNC, &fcs, 1);
ra[n % mod] = fcs;
assert(ran >= n);
if (ran - n <= readahead / 2 + need) {
DBG("(ran - n) %u <= %u + %u", ran - n,
readahead / 2, need);
fellow_cache_obj_readahead(fc, &rac, fco, ra, mod, &ran,
n + readahead + need, need, fcoid,
reqs, &newreqs);
}
DBG("ran %u", ran);
assert(ran > n);
assert(ra[n % mod] == fcs);
/* only read ahead if both the current segment and the next
* are > BUSY
*/
raion = 0;
if (FCOS(fcs->state) > FCOS_BUSY) {
while (ran <= n + readahead &&
(fcsnext = fcsc_peek(&rac)) != NULL &&
FCOS(fcsnext->state) > FCOS_BUSY) {
fcsra = FCSC_NEXT(&rac);
assert(fcsra == fcsnext); // because peek
raio[raion++] = fcsra;
TAKEZN(ra[ran % mod], fcsra);
ran++;
}
if (raion) {
fellow_cache_seg_ref_in(fc, FCIO_ASYNC,
raio, raion);
}
}
// aborted busy segment
if (fcs->state == FCS_USABLE)
break;
AZ(fcr.r.integer);
if (fcs->state == FCS_READING) {
/* We can not get here for readahead == 0, because of
* the _ref_in(FCIO_SYNC) above
*
* last round, we checked if this fcs was in core and
* flushed if it was not. Because we hold a reference,
* it can not go back from INCORE/CHECK to BUSY or
* READING, so the deref list needs to be empty
/* before we wait, do some useful work
* and free memory
*/
AN(readahead);
AZ(derefn);
fcr.r.integer =
fellow_cache_obj_iter_flush_deref(fcoid);
if (fcr.r.integer)
break;
AZ(pthread_mutex_lock(&fcs->fco->mtx));
while (fcs->state == FCS_READING)
fellow_cache_seg_wait_locked(fcs);
AZ(pthread_mutex_unlock(&fcs->fco->mtx));
}
// aborted busy segment
if (fcs->state == FCS_USABLE)
break;
err = fellow_cache_seg_check(fcs);
if (err != NULL) {
fcr = FCR_IOFAIL(err);
......@@ -3926,8 +4189,10 @@ fellow_cache_obj_iter(struct fellow_cache *fc, struct fellow_cache_obj *fco,
flags |= OBJ_ITER_END;
flush = OBJ_ITER_FLUSH;
}
else if (derefn == mod - 1 || fcsnext->state != FCS_INCORE)
else if (fcoid->n == fcoid->max - 1 ||
fcsnext->state != FCS_INCORE) {
flush = OBJ_ITER_FLUSH;
}
assert(ra[n % mod] == fcs);
ra[n % mod] = NULL;
......@@ -3943,7 +4208,7 @@ fellow_cache_obj_iter(struct fellow_cache *fc, struct fellow_cache_obj *fco,
fcr.r.integer = func(priv, flags | flush, fcs->alloc.ptr, sz);
if (final) {
AZ(derefn);
AZ(fcoid->n);
AN(flags & OBJ_ITER_FLUSH);
/* if the opportunistic free fails, the segment will
* get deleted when the object is */
......@@ -3955,14 +4220,10 @@ fellow_cache_obj_iter(struct fellow_cache *fc, struct fellow_cache_obj *fco,
AN(fellow_cache_seg_deref_locked(NULL, fcs));
}
AZ(pthread_mutex_unlock(&fcs->fco->mtx));
} else if (flush) {
assert(derefn < mod);
deref[derefn++] = fcs;
fellow_cache_seg_deref(deref, derefn);
derefn = 0;
} else {
assert(derefn < mod);
deref[derefn++] = fcs;
fcoi_add(fcoid, fcs);
if (flush)
fcoi_deref(fcoid);
}
assert(fcr.status == fcr_ok);
if (fcr.r.integer)
......@@ -3979,8 +4240,11 @@ fellow_cache_obj_iter(struct fellow_cache *fc, struct fellow_cache_obj *fco,
while (mod--)
AZ(ra[mod]);
if (derefn)
fellow_cache_seg_deref(deref, derefn);
fcoi_deref(fcoid);
buddy_alloc_async_done(&reqs[0]);
buddy_alloc_async_done(&reqs[1]);
if ((flags & OBJ_ITER_END) == 0 &&
(fcr.status == fcr_ok && fcr.r.integer == 0)) {
......
......@@ -88,10 +88,21 @@ stvfe_tune_check(struct stvfe_tune *tune)
l = (unsigned)sz;
if (tune->mem_reserve_chunks > l) {
fprintf(stderr,"fellow: mem_reserve_chunks limited to %u "
"(less than 1/8 of memory size)\n", l);
"(less than 1/8 of memory size per lru)\n", l);
tune->mem_reserve_chunks = l;
}
sz = tune->memsz >> (tune->chunk_exponent + 4);
if (tune->readahead > sz) {
assert(sz <= UINT_MAX);
l = (unsigned)sz;
fprintf(stderr,"fellow: readahead limited to "
"%u chunks * %zu chunk_bytes (%u chunk_exponent)"
" be less than 1/16 of memory\n",
l, (size_t)1 << tune->chunk_exponent, tune->chunk_exponent);
tune->readahead = l;
}
// 2MB
if (tune->chunk_exponent < 21U) {
l = 1U << (21U - tune->chunk_exponent);
......
......@@ -48,7 +48,8 @@ TUNE(unsigned, mem_reserve_chunks, 1, 0, UINT_MAX);
TUNE(size_t, objsize_hint, 256 * 1024, 4096, SIZE_MAX);
TUNE(size_t, objsize_max, 0, 0, SIZE_MAX);
TUNE(size_t, discard_immediate, 256 * 1024, 4096, SIZE_MAX);
TUNE(unsigned, readahead, 2, 0, UINT_MAX);
// 31 is safe max for stack usage, further limited by memsz
TUNE(unsigned, readahead, 5, 0, 31);
TUNE(unsigned, io_batch_min, 8, 1, UINT_MAX);
// right now, the io ring size is hardcoded to 1024, so 512 is half that
TUNE(unsigned, io_batch_max, 512, 1, UINT_MAX);
......
......@@ -672,14 +672,29 @@ fellow storage can be fine tuned:
* *readahead*
- unit: scalar
- default: 2
- default: 5
- minimum: 0
- maximum: 31 or 1/16th of *memsize*
specifies how many additional segments of an object's body should be
staged into memory asynchronously before being required. This
parameter helps keep response times low and throughput high for
objects which are not already present in the memory cache.
The maximum is the lower of 31 or the value corresponding to 1/16th
of *memsize* divided by *chunk_bytes*.
Read ahead triggers whenever the number of read ahead segments is at
readahead / 2 (rounded down) or less. Thus, for the default value of
5, read ahead will, after the initial read of 5 segments, read 2
segments whenever 2 segments have been sent.
Note that, on a system with a decently sized memory cache, no disk
IO will happen for most requests. When segments are still in memory
cache, read ahead only references them. Disk IO is only needed for
segments which are accessed for the first time after a cache load or
LRU eviction.
* *discard_immediate*
- unit: bytes
......
......@@ -608,14 +608,29 @@ fellow storage can be fine tuned:
* *readahead*
- unit: scalar
- default: 2
- default: 5
- minimum: 0
- maximum: 31 or 1/16th of *memsize*
specifies how many additional segments of an object's body should be
staged into memory asynchronously before being required. This
parameter helps keep response times low and throughput high for
objects which are not already present in the memory cache.
The maximum is the lower of 31 or the value corresponding to 1/16th
of *memsize* divided by *chunk_bytes*.
Read ahead triggers whenever the number of read ahead segments is at
readahead / 2 (rounded down) or less. Thus, for the default value of
5, read ahead will, after the initial read of 5 segments, read 2
segments whenever 2 segments have been sent.
Note that, on a system with a decently sized memory cache, no disk
IO will happen for most requests. When segments are still in memory
cache, read ahead only references them. Disk IO is only needed for
segments which are accessed for the first time after a cache load or
LRU eviction.
* *discard_immediate*
- unit: bytes
......