Commit 29eb1857 authored by Geoff Simmons

Perfect hashing uses 64-bit FNV-1a with xor folding.

Theoretically, this reduces the probability of collisions. Benchmarks
don't show much of a difference.
parent ff908979
...@@ -51,7 +51,8 @@ struct ph { ...@@ -51,7 +51,8 @@ struct ph {
unsigned *tbl; unsigned *tbl;
size_t minlen; size_t minlen;
size_t maxlen; size_t maxlen;
int32_t mask; unsigned mask;
unsigned bits;
}; };
struct bucket { struct bucket {
...@@ -59,36 +60,35 @@ struct bucket { ...@@ -59,36 +60,35 @@ struct bucket {
int n; int n;
}; };
#define FNV32_OFFSET_BASIS (0x811c9dc5) #define FNV64_OFFSET_BASIS (0xcbf29ce484222325)
#define FNV32_PRIME (0x01000193) #define FNV64_PRIME (0x00000100000001b3)
/* FNV-1a */ /* FNV-1a 64-bit with xor-folding */
static inline uint32_t static inline uint32_t
hash(uint32_t h, const char *s) hash(uint32_t h, const char *s, unsigned bits, unsigned mask)
{ {
if (h == 0) uint64_t h64 = h;
h = FNV32_OFFSET_BASIS;
if (h64 == 0)
h64 = FNV64_OFFSET_BASIS;
while (*s) { while (*s) {
h ^= (uint32_t)*s++; h64 ^= (uint64_t)*s++;
h *= FNV32_PRIME; h64 *= FNV64_PRIME;
} }
return (h); return (((h64 >> bits) ^ h64) & mask);
} }
/* /*
* https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogObvious
*/ */
static int static unsigned
nxtpow2(int n) lg(unsigned n)
{ {
n--; unsigned lg = 0;
n |= n >> 1; while (n >>= 1)
n |= n >> 2; lg++;
n |= n >> 4; return (lg);
n |= n >> 8;
n |= n >> 16;
return (++n);
} }
/* /*
...@@ -114,13 +114,16 @@ PH_Generate(char * const * const strings, unsigned n) ...@@ -114,13 +114,16 @@ PH_Generate(char * const * const strings, unsigned n)
int *found = NULL; int *found = NULL;
struct bucket *buckets = NULL; struct bucket *buckets = NULL;
int32_t *inter = NULL; int32_t *inter = NULL;
unsigned sz, mask, i, *tbl = NULL; unsigned bits, sz, mask, i, *tbl = NULL;
struct ph *ph = NULL; struct ph *ph = NULL;
AN(strings); AN(strings);
assert(n > 0 && n < 1 << 30); assert(n > 0 && n < 1 << 30);
sz = nxtpow2(n); bits = lg(n);
if (n != (unsigned)(1 << bits))
bits++;
sz = 1 << bits;
if (sz < MINSZ) if (sz < MINSZ)
sz = MINSZ; sz = MINSZ;
mask = sz - 1; mask = sz - 1;
...@@ -142,7 +145,7 @@ PH_Generate(char * const * const strings, unsigned n) ...@@ -142,7 +145,7 @@ PH_Generate(char * const * const strings, unsigned n)
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
size_t len = strlen(strings[i]); size_t len = strlen(strings[i]);
uint32_t h = hash(FNV32_OFFSET_BASIS, strings[i]) & mask; uint32_t h = hash(0, strings[i], bits, mask);
assert(h < sz); assert(h < sz);
errno = 0; errno = 0;
...@@ -170,8 +173,8 @@ PH_Generate(char * const * const strings, unsigned n) ...@@ -170,8 +173,8 @@ PH_Generate(char * const * const strings, unsigned n)
memset(found, 0, sz * sizeof(*found)); memset(found, 0, sz * sizeof(*found));
for (int j = 0; j < bucket->n && m < UINT32_MAX; j++) { for (int j = 0; j < bucket->n && m < UINT32_MAX; j++) {
uint32_t h = hash(m, strings[bucket->idx[j]]) uint32_t h = hash(m, strings[bucket->idx[j]], bits,
& mask; mask);
assert(h < sz); assert(h < sz);
if (tbl[h] != UINT_MAX || found[h] != 0) { if (tbl[h] != UINT_MAX || found[h] != 0) {
m++; m++;
...@@ -186,10 +189,9 @@ PH_Generate(char * const * const strings, unsigned n) ...@@ -186,10 +189,9 @@ PH_Generate(char * const * const strings, unsigned n)
goto exit; goto exit;
} }
inter[hash(FNV32_OFFSET_BASIS, strings[bucket->idx[0]]) & mask] inter[hash(0, strings[bucket->idx[0]], bits, mask)] = m;
= m;
for (int j = 0; j < bucket->n; j++) for (int j = 0; j < bucket->n; j++)
tbl[hash(m, strings[bucket->idx[j]]) & mask] tbl[hash(m, strings[bucket->idx[j]], bits, mask)]
= bucket->idx[j]; = bucket->idx[j];
} }
...@@ -207,7 +209,7 @@ PH_Generate(char * const * const strings, unsigned n) ...@@ -207,7 +209,7 @@ PH_Generate(char * const * const strings, unsigned n)
struct bucket *bucket = &buckets[i]; struct bucket *bucket = &buckets[i];
tbl[found[n]] = bucket->idx[0]; tbl[found[n]] = bucket->idx[0];
inter[hash(FNV32_OFFSET_BASIS, strings[bucket->idx[0]]) & mask] inter[hash(0, strings[bucket->idx[0]], bits, mask)]
= -found[n] - 1; = -found[n] - 1;
} }
...@@ -218,6 +220,7 @@ PH_Generate(char * const * const strings, unsigned n) ...@@ -218,6 +220,7 @@ PH_Generate(char * const * const strings, unsigned n)
ph->inter = inter; ph->inter = inter;
ph->tbl = tbl; ph->tbl = tbl;
ph->mask = mask; ph->mask = mask;
ph->bits = bits;
ph->minlen = min; ph->minlen = min;
ph->maxlen = max; ph->maxlen = max;
...@@ -261,11 +264,11 @@ PH_Lookup(const struct ph * const restrict ph, ...@@ -261,11 +264,11 @@ PH_Lookup(const struct ph * const restrict ph,
if (len > ph->maxlen) if (len > ph->maxlen)
return (UINT_MAX); return (UINT_MAX);
h = ph->inter[hash(FNV32_OFFSET_BASIS, subject) & ph->mask]; h = ph->inter[hash(0, subject, ph->bits, ph->mask)];
if (h < 0) if (h < 0)
idx = ph->tbl[-h - 1]; idx = ph->tbl[-h - 1];
else else
idx = ph->tbl[hash(h, subject) & ph->mask]; idx = ph->tbl[hash(h, subject, ph->bits, ph->mask)];
if (idx == UINT_MAX || strcmp(subject, strings[idx]) != 0) if (idx == UINT_MAX || strcmp(subject, strings[idx]) != 0)
return (UINT_MAX); return (UINT_MAX);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment