Commit 3baf5288 authored by Geoff Simmons

For PH, set a key vector length for each secondary hash.

The hash inner loop iterates no more often than necessary.

We also set minimum and maximum string lengths for each secondary hash,
so that misses can be detected more quickly.
parent ac199d52
......@@ -62,6 +62,9 @@ struct hash {
uint64_t *k;
uint64_t addend;
uint32_t *tbl;
size_t minlen;
size_t maxlen;
size_t l;
};
union tbl_t {
......@@ -75,9 +78,6 @@ struct ph {
struct hash *h1;
union tbl_t *tbl;
struct vbitmap *collision;
size_t l;
size_t minlen;
size_t maxlen;
};
struct bucket {
......@@ -99,11 +99,18 @@ hash(const struct hash * const restrict hash,
uint32_t tail[2] = {0};
size_t l = (len / 8) * 2;
if (len < hash->minlen || len > hash->maxlen)
return (UINT_MAX);
assert((len >> 2) <= hash->l);
for (e = s + l; s < e; k += 2, s += 2)
h += (*s + *k) * (*(s+1) + *(k+1));
memcpy(tail, e, (void *)(subject + len) - (void *)e);
h += (*tail + *k) * (*(tail+1) + *(k+1));
/* When the subject length is not an exact multiple of 8. */
if ((len & 0x07) != 0) {
memcpy(tail, e, (void *)(subject + len) - (void *)e);
h += (*tail + *k) * (*(tail+1) + *(k+1));
}
return ((h >> 32) & hash->mask);
}
......@@ -197,6 +204,9 @@ PH_Generate(char * const * const strings, unsigned n)
if (h1->k == NULL)
goto exit;
h1->addend = rnd64();
h1->minlen = min;
h1->maxlen = max;
h1->l = l;
/*
* XXX the literature sometimes says that the keys should all be
* odd, since repeated multiplication makes the result tend to 0,
......@@ -252,18 +262,29 @@ PH_Generate(char * const * const strings, unsigned n)
if (bhsh == NULL)
goto exit;
tbl[i].h2 = bhsh;
bhsh->tbl = malloc(bsz * sizeof(*bhsh->tbl));
if (bhsh->tbl == NULL)
goto exit;
bhsh->k = malloc(l * sizeof(*bhsh->k));
if (bhsh->k == NULL)
goto exit;
memset(bhsh->tbl, 0xff, bsz * sizeof(*bhsh->tbl));
bhsh->mask = bsz - 1;
bhsh->minlen = SIZE_MAX;
bhsh->maxlen = 0;
for (int j = 0; j < bucket->n; j++) {
size_t len = strlen(strings[bucket->idx[j]]);
if (len < bhsh->minlen)
bhsh->minlen = len;
if (len > bhsh->maxlen)
bhsh->maxlen = len;
}
bhsh->l = ((bhsh->maxlen + 7) / 8) * 2;
bhsh->k = malloc(bhsh->l * sizeof(*bhsh->k));
if (bhsh->k == NULL)
goto exit;
bhsh->addend = rnd64();
/* XXX as above, unsure about |1 here */
for (unsigned j = 0; j < l; j++)
for (unsigned j = 0; j < bhsh->l; j++)
bhsh->k[j] = rnd64() | 1;
for (int j = 0; j < bucket->n; j++) {
......@@ -275,7 +296,7 @@ PH_Generate(char * const * const strings, unsigned n)
memset(bhsh->tbl, 0xff,
bsz * sizeof(*bhsh->tbl));
bhsh->addend = rnd64();
for (unsigned k = 0; k < l; k++)
for (unsigned k = 0; k < bhsh->l; k++)
bhsh->k[k] = rnd64() | 1;
continue;
}
......@@ -287,12 +308,9 @@ PH_Generate(char * const * const strings, unsigned n)
ALLOC_OBJ(ph, PH_MAGIC);
if (ph == NULL)
goto exit;
ph->l = l;
ph->h1 = h1;
ph->tbl = tbl;
ph->collision = collision;
ph->minlen = min;
ph->maxlen = max;
exit:
AN(buckets);
......@@ -336,7 +354,7 @@ PH_Lookup(const struct ph * const restrict ph,
{
size_t len;
uint32_t h;
unsigned idx;
unsigned idx = UINT_MAX;
if (ph == NULL)
return (UINT_MAX);
......@@ -348,19 +366,16 @@ PH_Lookup(const struct ph * const restrict ph,
AN(subject);
len = strlen(subject);
if (len < ph->minlen)
return (UINT_MAX);
if (len > ph->maxlen)
return (UINT_MAX);
assert((len >> 2) <= ph->l);
h = hash(ph->h1, subject, len);
idx = ph->tbl[h].idx;
if (h != UINT_MAX)
idx = ph->tbl[h].idx;
if (vbit_test(ph->collision, h)) {
struct hash *h2 = ph->tbl[h].h2;
CHECK_OBJ_NOTNULL(h2, HASH_MAGIC);
AN(h2->tbl);
h = hash(h2, subject, len);
if (h == UINT_MAX)
return (UINT_MAX);
idx = h2->tbl[h];
}
......@@ -380,14 +395,15 @@ PH_Dump(struct ph *ph, char **strings)
}
CHECK_OBJ(ph, PH_MAGIC);
CHECK_OBJ_NOTNULL(ph->h1, HASH_MAGIC);
AN(strings);
VSB_printf(sb, "minlen = %zu\n", ph->minlen);
VSB_printf(sb, "maxlen = %zu\n", ph->maxlen);
VSB_printf(sb, "l = %zu\n", ph->l);
VSB_printf(sb, "minlen = %zu\n", ph->h1->minlen);
VSB_printf(sb, "maxlen = %zu\n", ph->h1->maxlen);
VSB_printf(sb, "l = %zu\n", ph->h1->l);
VSB_printf(sb, "h1->mask = 0x%0x\n", ph->h1->mask);
VSB_printf(sb, "h1->addend = 0x%0lx\n", ph->h1->addend);
for (unsigned i = 0; i < ph->l; i++)
for (unsigned i = 0; i < ph->h1->l; i++)
VSB_printf(sb, "h1->k[%u] = 0x%0lx\n", i, ph->h1->k[i]);
for (unsigned i = 0; i <= ph->h1->mask; i++) {
VSB_printf(sb, "\n");
......@@ -405,9 +421,12 @@ PH_Dump(struct ph *ph, char **strings)
if (h2 == NULL)
continue;
CHECK_OBJ(h2, HASH_MAGIC);
VSB_printf(sb, "tbl[%u].h2->l = %zu\n", i, h2->l);
VSB_printf(sb, "tbl[%u].h2->minlen = %zu\n", i, h2->minlen);
VSB_printf(sb, "tbl[%u].h2->maxlen = %zu\n", i, h2->maxlen);
VSB_printf(sb, "tbl[%u].h2->mask = 0x%0x\n", i, h2->mask);
VSB_printf(sb, "tbl[%u].h2->addend = 0x%0lx\n", i, h2->addend);
for (unsigned j = 0; j < ph->l; j++)
for (unsigned j = 0; j < h2->l; j++)
VSB_printf(sb, "tbl[%u].h2->k[%u] = 0x%0lx\n", i, j,
h2->k[j]);
for (unsigned j = 0; j <= h2->mask; j++) {
......@@ -440,9 +459,9 @@ PH_Stats(const struct ph * const restrict ph,
AN(strings);
stats->buckets = ph->h1->mask + 1;
stats->klen = ph->l;
stats->minlen = ph->minlen;
stats->maxlen = ph->maxlen;
stats->klen = ph->h1->l;
stats->minlen = ph->h1->minlen;
stats->maxlen = ph->h1->maxlen;
stats->h2buckets_min = UINT64_MAX;
stats->h2strings_min = UINT64_MAX;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment