Fix: Optimize FCO LRU eviction

Fix a regression from 44d788bf:

While we do want to reduce the critical region holding the lru mtx,
we cannot release the fco mtx before we have completed the
transaction on it with respect to LRU state.

Because we might need to undo the LRU removal of the FCO, we need
to keep the fco mtx held until we know whether the mutate attempt
succeeded.

Otherwise another thread can race us and change the state under
our feet.
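
A minimal, self-contained sketch of the required ordering (toy types,
with a hypothetical try_mutate() standing in for stvfe_mutate(); this
is not the actual fellow_cache.c code):

#include <pthread.h>
#include <assert.h>
#include <stdbool.h>

struct toy_obj {
    pthread_mutex_t mtx;    /* stands in for fco->mtx */
    bool            on_lru; /* stands in for fcs->fcs_onlru */
};

static pthread_mutex_t lru_mtx = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for stvfe_mutate(): may fail */
static bool try_mutate(struct toy_obj *o) { (void)o; return (false); }

static void
evict(struct toy_obj *o)
{
    pthread_mutex_lock(&o->mtx);
    if (try_mutate(o)) {
        /* success: the object has left the LRU for good */
        pthread_mutex_unlock(&o->mtx);
        return;
    }
    /* Failure: the LRU removal has to be undone. Keep o->mtx held
     * while re-taking the lru mtx, otherwise a concurrent delete
     * can flip on_lru in the gap.
     */
    pthread_mutex_lock(&lru_mtx);
    assert(o->on_lru);          /* safe: o->mtx was never released */
    /* ... re-insert o at the LRU tail here ... */
    pthread_mutex_unlock(&o->mtx);
    pthread_mutex_unlock(&lru_mtx);
}

int
main(void)
{
    struct toy_obj o = { PTHREAD_MUTEX_INITIALIZER, true };

    evict(&o);
    return (0);
}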

In this case, we raced fellow_cache_obj_delete():

 #9  0x00007f2711972fd6 in __GI___assert_fail (
     assertion=assertion@entry=0x7f27116f3ec1 "(fcs->fcs_onlru) != 0",
     file=file@entry=0x7f27116f31f8 "fellow_cache.c", line=line@entry=3145,
     function=function@entry=0x7f27116f6b50 <__PRETTY_FUNCTION__.13829> "fellow_cache_lru_work") at assert.c:101
 #10 0x00007f27116bd1db in fellow_cache_lru_work (wrk=wrk@entry=0x7edb0a8135d0, lru=lru@entry=0x7edb4421eb10)
     at fellow_cache.c:3145
 #11 0x00007f27116bd7c7 in fellow_cache_lru_thread (wrk=0x7edb0a8135d0, priv=0x7edb4421eb10)
     at fellow_cache.c:3322
 #12 0x000056544bcc06cb in wrk_bgthread (arg=0x7edb3a6e0900) at cache/cache_wrk.c:104
 #13 0x00007f2711b39609 in start_thread (arg=<optimized out>) at pthread_create.c:477
 #14 0x00007f2711a5e133 in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

(gdb) p *fcs
$1 = {magic = 25208, state = FCO_INCORE, fcs_onlru = 0, fco_infdb = 0, lcb_add = 0, lcb_remove = 0,
  fco_lru_mutate = 0, refcnt = 0, lru_list = {vtqe_next = 0x7eebee7a00a0, vtqe_prev = 0x7edb4421eb48},
  fco = 0x7ef07cf3c000, disk_seg = 0x7ef1cd8e8008, alloc = {ptr = 0x7ef1cd8e8000, size = 4096}, len = 0}
(gdb) p *fcs->fco
$2 = {magic = 2206029151, logstate = FCOL_DELETED, lru = 0x7edb4421eb10, fco_mem = {ptr = 0x7ef07cf3c000,
    bits = 13 '\r', magic = 4294193151}, mtx = pthread_mutex_t = {Type = Normal,
    Status = Acquired, possibly with waiters, Owner ID = 543234, Robust = No, Shared = No, Protocol = None},
  cond = pthread_cond_t = {Threads known to still execute a wait function = 0, Clock ID = CLOCK_REALTIME,
    Shared = No}, oc = 0x7ed3c82b0b00, fdb = {fdb = 2493649440769}, fdb_entry = {rbe_link = {0x7eebec72c000,
      0x0, 0x0}}, fdo_fcs = {magic = 25208, state = FCO_INCORE, fcs_onlru = 0, fco_infdb = 0, lcb_add = 0,
    lcb_remove = 0, fco_lru_mutate = 0, refcnt = 0, lru_list = {vtqe_next = 0x7eebee7a00a0,
      vtqe_prev = 0x7edb4421eb48}, fco = 0x7ef07cf3c000, disk_seg = 0x7ef1cd8e8008, alloc = {
      ptr = 0x7ef1cd8e8000, size = 4096}, len = 0}, aa_esidata_seg = {magic = 25208, state = FCS_USABLE,
    fcs_onlru = 0, fco_infdb = 0, lcb_add = 0, lcb_remove = 0, fco_lru_mutate = 0, refcnt = 0, lru_list = {
      vtqe_next = 0x0, vtqe_prev = 0x0}, fco = 0x7ef07cf3c000, disk_seg = 0x7ef1cd8e80f0, alloc = {ptr = 0x0,
      size = 0}, len = 0}, seglist = {magic = 3403082203, lsegs = 122, fdsl = 0x7ef1cd8e8178, fdsl_sz = 0,
    fcsl_sz = 0, next = 0x0, segs = 0x7ef07cf3c148}}

racing thread:

 Thread 3478 (Thread 0x7f2705d84700 (LWP 543234)):
 #0  __lll_lock_wait (futex=futex@entry=0x7edb4421eb20, private=0) at lowlevellock.c:52
 #1  0x00007f2711b3c0a3 in __GI___pthread_mutex_lock (mutex=mutex@entry=0x7edb4421eb20) at ../nptl/pthread_mutex_lock.c:80
 #2  0x00007f27116ab718 in fellow_cache_lru_chgbatch_apply (lcb=lcb@entry=0x7f2705d813f0) at fellow_cache.c:1104
 #3  0x00007f27116bf7b0 in fellow_cache_obj_delete (fc=0x7f27112ed000, fco=<optimized out>, fco@entry=0x7ef07cf3c000, hash=hash@entry=0x7ed6fb0c69b0 "b5*\371\064\062j\362\212Ze礤(X0լ\266\216JL&\231\223\302\031\315\365\277\n") at fellow_cache.c:4808
 #4  0x00007f271167eec2 in sfemem_free (wrk=wrk@entry=0x7f2705d825d0, memoc=memoc@entry=0x7ed3c82b0b00) at fellow_storage.c:543
 #5  0x00007f271167f365 in sfemem_objfree (wrk=0x7f2705d825d0, memoc=0x7ed3c82b0b00) at fellow_storage.c:577
 #6  0x000056544bc964aa in ObjFreeObj (wrk=wrk@entry=0x7f2705d825d0, oc=0x7ed3c82b0b00) at cache/cache_obj.c:412
 #7  0x000056544bc8ce8f in HSH_DerefObjCore (wrk=0x7f2705d825d0, ocp=ocp@entry=0x7f2705d82360, rushmax=rushmax@entry=0) at cache/cache_hash.c:1059
 #8  0x000056544bc81530 in exp_expire (now=1691019717.3146894, ep=0x7f2711246280) at cache/cache_expire.c:360
@@ -3124,27 +3124,28 @@ fellow_cache_lru_work(struct worker *wrk, struct fellow_cache_lru *lru)
r = stvfe_mutate(wrk, lru, oc);
if (r) {
// success
// mutate was successful
AZ(fcs->fco_lru_mutate);
}
else {
// will be put back on LRU below
AN(fcs->fcs_onlru);
AN(fcs->fco_lru_mutate);
fcs->fco_lru_mutate = 0;
}
AZ(pthread_mutex_unlock(&fco->mtx));
if (r) {
/* mutate was successful
* VSC_C_main->n_lru_nuked++; // XXX per lru ?
*/
AZ(pthread_mutex_unlock(&fco->mtx));
break;
}
/* mutate has failed
*
* FCO will be put back on LRU below when we have the
* lru lock again
*/
AZ(pthread_mutex_lock(&lru->lru_mtx));
// mutate has failed
AN(fcs->fcs_onlru);
AZ(fcs->fco_lru_mutate);
AN(fcs->fco_lru_mutate);
fcs->fco_lru_mutate = 0;
VTAILQ_INSERT_TAIL(&lru->lru_head, fcs, lru_list);
AZ(pthread_mutex_unlock(&fco->mtx));
// re-start because we let go of lru_mtx
fcss = VTAILQ_FIRST(&lru->lru_head);
oc = NULL;
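
For contrast, a standalone sketch of the hazard class the old ordering
allowed (again toy code, not fellow code): the evicting thread decides
on the LRU undo, drops the object mutex too early, and a deleting
thread, playing the role of fellow_cache_obj_delete(), changes the
state in the gap:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t obj_mtx = PTHREAD_MUTEX_INITIALIZER;
static bool on_lru = true;      /* stands in for fcs->fcs_onlru */

static void *
delete_thread(void *priv)
{
    (void)priv;
    /* Runs in the window where the evicting thread has (wrongly)
     * released obj_mtx already.
     */
    pthread_mutex_lock(&obj_mtx);
    on_lru = false;             /* the deleting thread's change */
    pthread_mutex_unlock(&obj_mtx);
    return (NULL);
}

int
main(void)
{
    pthread_t t;
    bool need_undo;

    /* Evicting thread, broken ordering: decide, unlock, act later. */
    pthread_mutex_lock(&obj_mtx);
    need_undo = on_lru;             /* "must go back on the LRU" */
    pthread_mutex_unlock(&obj_mtx); /* transaction not finished yet */

    /* The gap: another thread gets the object mutex first. */
    pthread_create(&t, NULL, delete_thread, NULL);
    pthread_join(t, NULL);

    pthread_mutex_lock(&obj_mtx);
    if (need_undo && !on_lru)
        printf("state changed under our feet: assertion would fire\n");
    pthread_mutex_unlock(&obj_mtx);
    return (0);
}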