Add lru_exponent parameter

parent 4ed3cf53
...@@ -21,6 +21,11 @@ fellow ...@@ -21,6 +21,11 @@ fellow
.. https://gitlab.com/uplex/varnish/slash/-/commit/ .. https://gitlab.com/uplex/varnish/slash/-/commit/
* To cater for massively parallel systems with dozens of CPUs, the
parameter ``lru_exponent`` has been introduced to scale the number
of LRU lists (and corresponding eviction threads) between 1 and 64
(corresponding to ``lru_exponent = 0`` to ``lru_exponent = 6``).
* The allocation policy for disk regions has been improved. This * The allocation policy for disk regions has been improved. This
should reduce fragmentation and pressure on LRU as well as improve should reduce fragmentation and pressure on LRU as well as improve
response times (`a0e8e8f779f4ad8569ccc9c3b7eaee08dc79cfa4`_). response times (`a0e8e8f779f4ad8569ccc9c3b7eaee08dc79cfa4`_).
......
...@@ -35,6 +35,12 @@ recommendations for optimal fellow storage performance ...@@ -35,6 +35,12 @@ recommendations for optimal fellow storage performance
Note that a fellow storage using any of the `xxhash`_ hashes can Note that a fellow storage using any of the `xxhash`_ hashes can
only be loaded by an instance with `xxhash`_ support compiled in. only be loaded by an instance with `xxhash`_ support compiled in.
* On big systems with many CPUs, ``lru_exponent`` can be tuned to
achieve maximum performance with hundreds of thousands of requests per
second.
Reasonable values are yet to be determined experimentally.
compiling compiling
~~~~~~~~~ ~~~~~~~~~
......
...@@ -689,7 +689,6 @@ struct fellow_busy { ...@@ -689,7 +689,6 @@ struct fellow_busy {
struct fellow_cache_lrus { struct fellow_cache_lrus {
unsigned magic; unsigned magic;
#define FELLOW_CACHE_LRUS_MAGIC 0xadad56fb #define FELLOW_CACHE_LRUS_MAGIC 0xadad56fb
uint8_t exponent;
pthread_mutex_t mtx; pthread_mutex_t mtx;
struct fellow_cache_lru *lru[1 << MAX_NLRU_EXPONENT]; struct fellow_cache_lru *lru[1 << MAX_NLRU_EXPONENT];
}; };
...@@ -747,6 +746,7 @@ fellow_cache_get_lru(struct fellow_cache *fc, uint64_t n) ...@@ -747,6 +746,7 @@ fellow_cache_get_lru(struct fellow_cache *fc, uint64_t n)
{ {
struct fellow_cache_lrus *lrus; struct fellow_cache_lrus *lrus;
struct fellow_cache_lru *lru; struct fellow_cache_lru *lru;
struct stvfe_tune *tune;
uint8_t exponent; uint8_t exponent;
pthread_t thr; pthread_t thr;
size_t i; size_t i;
...@@ -754,8 +754,10 @@ fellow_cache_get_lru(struct fellow_cache *fc, uint64_t n) ...@@ -754,8 +754,10 @@ fellow_cache_get_lru(struct fellow_cache *fc, uint64_t n)
CHECK_OBJ_NOTNULL(fc, FELLOW_CACHE_MAGIC); CHECK_OBJ_NOTNULL(fc, FELLOW_CACHE_MAGIC);
lrus = fc->lrus; lrus = fc->lrus;
CHECK_OBJ_NOTNULL(lrus, FELLOW_CACHE_LRUS_MAGIC); CHECK_OBJ_NOTNULL(lrus, FELLOW_CACHE_LRUS_MAGIC);
tune = fc->tune;
CHECK_OBJ_NOTNULL(tune, STVFE_TUNE_MAGIC);
exponent = lrus->exponent; exponent = tune->lru_exponent;
assert(exponent <= MAX_NLRU_EXPONENT); assert(exponent <= MAX_NLRU_EXPONENT);
i = exponent ? fib(n, exponent) : 0; i = exponent ? fib(n, exponent) : 0;
......
...@@ -83,6 +83,7 @@ stvfe_tune_check(struct stvfe_tune *tune) ...@@ -83,6 +83,7 @@ stvfe_tune_check(struct stvfe_tune *tune)
} }
sz = tune->memsz >> (tune->chunk_exponent + 3); sz = tune->memsz >> (tune->chunk_exponent + 3);
sz >>= tune->lru_exponent;
assert(sz <= UINT_MAX); assert(sz <= UINT_MAX);
l = (unsigned)sz; l = (unsigned)sz;
if (tune->mem_reserve_chunks > l) { if (tune->mem_reserve_chunks > l) {
......
...@@ -42,6 +42,7 @@ TUNE(float, log_rewrite_ratio, 0.5, 0.001, FLT_MAX); ...@@ -42,6 +42,7 @@ TUNE(float, log_rewrite_ratio, 0.5, 0.001, FLT_MAX);
// reserve chunk is the larger of chunk_exponent and result from logbuffer size // reserve chunk is the larger of chunk_exponent and result from logbuffer size
TUNE(unsigned, chunk_exponent, 20 /* 1MB*/, 12 /* 4KB */, 30 /* 1GB */); TUNE(unsigned, chunk_exponent, 20 /* 1MB*/, 12 /* 4KB */, 30 /* 1GB */);
TUNE(uint8_t, wait_table_exponent, 10, 6, 32); TUNE(uint8_t, wait_table_exponent, 10, 6, 32);
TUNE(uint8_t, lru_exponent, 0, 0, 6);
TUNE(unsigned, dsk_reserve_chunks, 4, 2, UINT_MAX); TUNE(unsigned, dsk_reserve_chunks, 4, 2, UINT_MAX);
TUNE(unsigned, mem_reserve_chunks, 1, 0, UINT_MAX); TUNE(unsigned, mem_reserve_chunks, 1, 0, UINT_MAX);
TUNE(size_t, objsize_hint, 256 * 1024, 4096, SIZE_MAX); TUNE(size_t, objsize_hint, 256 * 1024, 4096, SIZE_MAX);
......
...@@ -479,8 +479,8 @@ will be used (which might fail if insufficient memory is available). ...@@ -479,8 +479,8 @@ will be used (which might fail if insufficient memory is available).
.. _xfellow.tune(): .. _xfellow.tune():
STRING xfellow.tune([INT logbuffer_size], [DURATION logbuffer_flush_interval], [REAL log_rewrite_ratio], [INT chunk_exponent], [BYTES chunk_bytes], [INT wait_table_exponent], [INT dsk_reserve_chunks], [INT mem_reserve_chunks], [BYTES objsize_hint], [BYTES objsize_max], [INT cram], [INT readahead], [BYTES discard_immediate], [INT io_batch_min], [INT io_batch_max], [ENUM hash_obj], [ENUM hash_log], [ENUM ioerr_obj], [ENUM ioerr_log], [ENUM allocerr_obj], [ENUM allocerr_log]) STRING xfellow.tune([INT logbuffer_size], [DURATION logbuffer_flush_interval], [REAL log_rewrite_ratio], [INT chunk_exponent], [BYTES chunk_bytes], [INT wait_table_exponent], [INT lru_exponent], [INT dsk_reserve_chunks], [INT mem_reserve_chunks], [BYTES objsize_hint], [BYTES objsize_max], [INT cram], [INT readahead], [BYTES discard_immediate], [INT io_batch_min], [INT io_batch_max], [ENUM hash_obj], [ENUM hash_log], [ENUM ioerr_obj], [ENUM ioerr_log], [ENUM allocerr_obj], [ENUM allocerr_log])
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
:: ::
...@@ -491,6 +491,7 @@ STRING xfellow.tune([INT logbuffer_size], [DURATION logbuffer_flush_interval], [ ...@@ -491,6 +491,7 @@ STRING xfellow.tune([INT logbuffer_size], [DURATION logbuffer_flush_interval], [
[INT chunk_exponent], [INT chunk_exponent],
[BYTES chunk_bytes], [BYTES chunk_bytes],
[INT wait_table_exponent], [INT wait_table_exponent],
[INT lru_exponent],
[INT dsk_reserve_chunks], [INT dsk_reserve_chunks],
[INT mem_reserve_chunks], [INT mem_reserve_chunks],
[BYTES objsize_hint], [BYTES objsize_hint],
...@@ -587,6 +588,24 @@ fellow storage can be fine tuned: ...@@ -587,6 +588,24 @@ fellow storage can be fine tuned:
disk. Once an object is read, its body data is read in parallel disk. Once an object is read, its body data is read in parallel
independent of this limit. independent of this limit.
* *lru_exponent*
TL;DR: 2-logarithm of number of LRU lists
- unit: number of LRU lists as a power of two
- default: 0
- minimum: 0
- maximum: 6
On large systems, with mostly memory bound access, the LRU
list becomes the main contender as segments are removed and
re-added from/to LRU frequently.
A single LRU (``lru_exponent=0``) is most fair: only the absolute
least recently used segment is ever evicted. But more LRUs reduce
contention on the LRU lists significantly and improve parallelism of
evictions.
* *dsk_reserve_chunks* * *dsk_reserve_chunks*
- unit: scalar - unit: scalar
...@@ -612,10 +631,10 @@ fellow storage can be fine tuned: ...@@ -612,10 +631,10 @@ fellow storage can be fine tuned:
- minimum: 0 - minimum: 0
- maximum: memsize / 8 / chunk_bytes - maximum: memsize / 8 / chunk_bytes
specifies a number of chunks to reserve in memory. The reserve is specifies a number of chunks to reserve in memory per LRU. The
used to provide memory for new objects or objects staged from disk reserve is used to provide memory for new objects or objects staged
to memory when memory is otherwise full. It can help reduce from disk to memory when memory is otherwise full. It can help
latencies in these situations at the expense of some memory reduce latencies in these situations at the expense of some memory
unavailable for caching. unavailable for caching.
The value is capped such that the number of reserved chunks times The value is capped such that the number of reserved chunks times
...@@ -824,8 +843,8 @@ Can only be called from ``vcl_init {}``. ...@@ -824,8 +843,8 @@ Can only be called from ``vcl_init {}``.
.. _slash.tune_fellow(): .. _slash.tune_fellow():
STRING tune_fellow(STEVEDORE storage, [INT logbuffer_size], [DURATION logbuffer_flush_interval], [REAL log_rewrite_ratio], [INT chunk_exponent], [BYTES chunk_bytes], [INT wait_table_exponent], [INT dsk_reserve_chunks], [INT mem_reserve_chunks], [BYTES objsize_hint], [BYTES objsize_max], [INT cram], [INT readahead], [BYTES discard_immediate], [INT io_batch_min], [INT io_batch_max], [ENUM hash_obj], [ENUM hash_log], [ENUM ioerr_obj], [ENUM ioerr_log], [ENUM allocerr_obj], [ENUM allocerr_log]) STRING tune_fellow(STEVEDORE storage, [INT logbuffer_size], [DURATION logbuffer_flush_interval], [REAL log_rewrite_ratio], [INT chunk_exponent], [BYTES chunk_bytes], [INT wait_table_exponent], [INT lru_exponent], [INT dsk_reserve_chunks], [INT mem_reserve_chunks], [BYTES objsize_hint], [BYTES objsize_max], [INT cram], [INT readahead], [BYTES discard_immediate], [INT io_batch_min], [INT io_batch_max], [ENUM hash_obj], [ENUM hash_log], [ENUM ioerr_obj], [ENUM ioerr_log], [ENUM allocerr_obj], [ENUM allocerr_log])
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
:: ::
...@@ -837,6 +856,7 @@ STRING tune_fellow(STEVEDORE storage, [INT logbuffer_size], [DURATION logbuffer_ ...@@ -837,6 +856,7 @@ STRING tune_fellow(STEVEDORE storage, [INT logbuffer_size], [DURATION logbuffer_
[INT chunk_exponent], [INT chunk_exponent],
[BYTES chunk_bytes], [BYTES chunk_bytes],
[INT wait_table_exponent], [INT wait_table_exponent],
[INT lru_exponent],
[INT dsk_reserve_chunks], [INT dsk_reserve_chunks],
[INT mem_reserve_chunks], [INT mem_reserve_chunks],
[BYTES objsize_hint], [BYTES objsize_hint],
......
...@@ -430,6 +430,7 @@ $Method STRING .tune( ...@@ -430,6 +430,7 @@ $Method STRING .tune(
[ INT chunk_exponent ], [ INT chunk_exponent ],
[ BYTES chunk_bytes ], [ BYTES chunk_bytes ],
[ INT wait_table_exponent ], [ INT wait_table_exponent ],
[ INT lru_exponent ],
[ INT dsk_reserve_chunks ], [ INT dsk_reserve_chunks ],
[ INT mem_reserve_chunks ], [ INT mem_reserve_chunks ],
[ BYTES objsize_hint ], [ BYTES objsize_hint ],
...@@ -525,6 +526,24 @@ fellow storage can be fine tuned: ...@@ -525,6 +526,24 @@ fellow storage can be fine tuned:
disk. Once an object is read, its body data is read in parallel disk. Once an object is read, its body data is read in parallel
independent of this limit. independent of this limit.
* *lru_exponent*
TL;DR: 2-logarithm of number of LRU lists
- unit: number of LRU lists as a power of two
- default: 0
- minimum: 0
- maximum: 6
On large systems, with mostly memory bound access, the LRU
list becomes the main contender as segments are removed and
re-added from/to LRU frequently.
A single LRU (``lru_exponent=0``) is most fair: only the absolute
least recently used segment is ever evicted. But more LRUs reduce
contention on the LRU lists significantly and improve parallelism of
evictions.
* *dsk_reserve_chunks* * *dsk_reserve_chunks*
- unit: scalar - unit: scalar
...@@ -550,10 +569,10 @@ fellow storage can be fine tuned: ...@@ -550,10 +569,10 @@ fellow storage can be fine tuned:
- minimum: 0 - minimum: 0
- maximum: memsize / 8 / chunk_bytes - maximum: memsize / 8 / chunk_bytes
specifies a number of chunks to reserve in memory. The reserve is specifies a number of chunks to reserve in memory per LRU. The
used to provide memory for new objects or objects staged from disk reserve is used to provide memory for new objects or objects staged
to memory when memory is otherwise full. It can help reduce from disk to memory when memory is otherwise full. It can help
latencies in these situations at the expense of some memory reduce latencies in these situations at the expense of some memory
unavailable for caching. unavailable for caching.
The value is capped such that the number of reserved chunks times The value is capped such that the number of reserved chunks times
...@@ -759,6 +778,7 @@ $Function STRING tune_fellow( ...@@ -759,6 +778,7 @@ $Function STRING tune_fellow(
[ INT chunk_exponent ], [ INT chunk_exponent ],
[ BYTES chunk_bytes ], [ BYTES chunk_bytes ],
[ INT wait_table_exponent ], [ INT wait_table_exponent ],
[ INT lru_exponent ],
[ INT dsk_reserve_chunks ], [ INT dsk_reserve_chunks ],
[ INT mem_reserve_chunks ], [ INT mem_reserve_chunks ],
[ BYTES objsize_hint ], [ BYTES objsize_hint ],
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment