Commit 105a1327 authored by Geoff Simmons

Perfect hashing uses 64-bit FNV-1a with xor folding.

Theoretically, this reduces the probability of collisions. Benchmarks
don't show much of a difference.
parent 883a6d61
......@@ -51,7 +51,8 @@ struct ph {
unsigned *tbl;
size_t minlen;
size_t maxlen;
int32_t mask;
unsigned mask;
unsigned bits;
};
struct bucket {
......@@ -59,36 +60,35 @@ struct bucket {
int n;
};
#define FNV32_OFFSET_BASIS (0x811c9dc5)
#define FNV32_PRIME (0x01000193)
#define FNV64_OFFSET_BASIS (UINT64_C(0xcbf29ce484222325))
#define FNV64_PRIME (UINT64_C(0x00000100000001b3))

/*
 * FNV-1a 64-bit with xor-folding.
 *
 * h is the starting state of the hash; 0 selects the standard 64-bit
 * FNV offset basis, any other value is used as the initial state as is.
 * s is the NUL-terminated string to hash.
 * bits is the shift distance used for the fold; it must be less than 64,
 * since shifting a 64-bit value by 64 or more is undefined.
 * mask is ANDed with the folded value, so the result never exceeds mask.
 *
 * Returns ((h64 >> bits) ^ h64) & mask, where h64 is the 64-bit FNV-1a
 * state after all bytes of s have been mixed in.
 */
static inline uint32_t
hash(uint32_t h, const char *s, unsigned bits, unsigned mask)
{
	uint64_t h64 = h;

	if (h64 == 0)
		h64 = FNV64_OFFSET_BASIS;
	while (*s) {
		/*
		 * Go through unsigned char: where plain char is signed, a
		 * direct conversion of a byte >= 0x80 to uint64_t would
		 * sign-extend and flip the upper 56 bits of the state.
		 */
		h64 ^= (uint64_t)(unsigned char)*s++;
		h64 *= FNV64_PRIME;
	}
	return ((uint32_t)(((h64 >> bits) ^ h64) & mask));
}
/*
* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
* https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious
*
* lg() returns the number of times n can be shifted right by one bit
* before it becomes zero, i.e. floor(log2(n)) for n > 0, and 0 for n == 0.
* It does not round up; callers that need the next power of two have to
* adjust the result themselves.
*/
static int
nxtpow2(int n)
static unsigned
lg(unsigned n)
{
n--;
n |= n >> 1;
n |= n >> 2;
n |= n >> 4;
n |= n >> 8;
n |= n >> 16;
return (++n);
unsigned lg = 0;
/* one increment for every shift that still leaves a set bit in n */
while (n >>= 1)
lg++;
return (lg);
}
/*
......@@ -114,13 +114,16 @@ PH_Generate(char * const * const strings, unsigned n)
int *found = NULL;
struct bucket *buckets = NULL;
int32_t *inter = NULL;
unsigned sz, mask, i, *tbl = NULL;
unsigned bits, sz, mask, i, *tbl = NULL;
struct ph *ph = NULL;
AN(strings);
assert(n > 0 && n < 1 << 30);
sz = nxtpow2(n);
bits = lg(n);
if (n != (unsigned)(1 << bits))
bits++;
sz = 1 << bits;
if (sz < MINSZ)
sz = MINSZ;
mask = sz - 1;
......@@ -142,7 +145,7 @@ PH_Generate(char * const * const strings, unsigned n)
for (i = 0; i < n; i++) {
size_t len = strlen(strings[i]);
uint32_t h = hash(FNV32_OFFSET_BASIS, strings[i]) & mask;
uint32_t h = hash(0, strings[i], bits, mask);
assert(h < sz);
errno = 0;
......@@ -170,8 +173,8 @@ PH_Generate(char * const * const strings, unsigned n)
memset(found, 0, sz * sizeof(*found));
for (int j = 0; j < bucket->n && m < UINT32_MAX; j++) {
uint32_t h = hash(m, strings[bucket->idx[j]])
& mask;
uint32_t h = hash(m, strings[bucket->idx[j]], bits,
mask);
assert(h < sz);
if (tbl[h] != UINT_MAX || found[h] != 0) {
m++;
......@@ -186,10 +189,9 @@ PH_Generate(char * const * const strings, unsigned n)
goto exit;
}
inter[hash(FNV32_OFFSET_BASIS, strings[bucket->idx[0]]) & mask]
= m;
inter[hash(0, strings[bucket->idx[0]], bits, mask)] = m;
for (int j = 0; j < bucket->n; j++)
tbl[hash(m, strings[bucket->idx[j]]) & mask]
tbl[hash(m, strings[bucket->idx[j]], bits, mask)]
= bucket->idx[j];
}
......@@ -207,7 +209,7 @@ PH_Generate(char * const * const strings, unsigned n)
struct bucket *bucket = &buckets[i];
tbl[found[n]] = bucket->idx[0];
inter[hash(FNV32_OFFSET_BASIS, strings[bucket->idx[0]]) & mask]
inter[hash(0, strings[bucket->idx[0]], bits, mask)]
= -found[n] - 1;
}
......@@ -218,6 +220,7 @@ PH_Generate(char * const * const strings, unsigned n)
ph->inter = inter;
ph->tbl = tbl;
ph->mask = mask;
ph->bits = bits;
ph->minlen = min;
ph->maxlen = max;
......@@ -261,11 +264,11 @@ PH_Lookup(const struct ph * const restrict ph,
if (len > ph->maxlen)
return (UINT_MAX);
h = ph->inter[hash(FNV32_OFFSET_BASIS, subject) & ph->mask];
h = ph->inter[hash(0, subject, ph->bits, ph->mask)];
if (h < 0)
idx = ph->tbl[-h - 1];
else
idx = ph->tbl[hash(h, subject) & ph->mask];
idx = ph->tbl[hash(h, subject, ph->bits, ph->mask)];
if (idx == UINT_MAX || strcmp(subject, strings[idx]) != 0)
return (UINT_MAX);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment