Commit 666548fb authored by Geoff Simmons

Add a perfect hash implementation, for full matches only.

Cannot be used for prefix matches.
parent 0c4dd4f8
......@@ -11,7 +11,9 @@ libvmod_selector_la_SOURCES = \
patricia.c \
qp.h \
qp.c \
popcnt_compat.h
popcnt_compat.h \
ph.h \
ph.c
nodist_libvmod_selector_la_SOURCES = \
vcc_if.c \
......@@ -27,6 +29,8 @@ vmod_selector.c patricia.c: patricia.h
qp.c: qp.h popcnt_compat.h
ph.c: ph.h
vmod_selector.lo: $(nodist_libvmod_selector_la_SOURCES)
vcc_if.h vmod_selector.rst vmod_selector.man.rst: vcc_if.c
......
/*-
* Copyright (c) 2020 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Geoffrey Simmons <geoffrey.simmons@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vdef.h"
#include "vas.h"
#include "miniobj.h"
#include "ph.h"
/*
 * There is a non-zero probability that no perfect hash can be found,
 * infinitesimal for moderate to large sets, but not impossible for small
 * sets, if the tables are about as small as the set. Tests have never
 * shown hash generation failure when the tables have this minimum size.
 *
 * MINSZ is the lower bound that PH_Generate() applies to the table size.
 */
#define MINSZ (1 << 8)
/*
 * A generated perfect hash. inter and tbl are allocated by PH_Generate()
 * with the same number of elements (mask + 1) and released by PH_Free().
 */
struct ph {
/* Object magic, set by ALLOC_OBJ() and checked against PH_MAGIC. */
unsigned magic;
#define PH_MAGIC 0x00cd8c1d
/*
 * Intermediate table, indexed by the first-level hash of a string ANDed
 * with mask. A negative value v refers directly to slot -v - 1 of tbl;
 * any other value is passed to hash() as the seed of the second-level
 * hash (hash() treats a seed of 0 as the FNV offset basis).
 */
int32_t *inter;
/*
 * Maps a slot to an index into the caller's array of strings. Slots that
 * hold no string contain UINT_MAX.
 */
unsigned *tbl;
/* Table size minus one; hash values are ANDed with it to get an index. */
int32_t mask;
};
/*
 * Scratch structure used only while generating a hash: the set of strings
 * whose first-level hash selects the same table position.
 */
struct bucket {
/* Indices into the strings array of the members; grown with realloc(). */
unsigned *idx;
/* Number of entries in idx. */
int n;
};
#define FNV32_OFFSET_BASIS (0x811c9dc5)
#define FNV32_PRIME (0x01000193)

/*
 * 32-bit FNV-1a hash of the NUL-terminated string s, starting from the
 * state h. A start state of 0 is replaced by the FNV offset basis, so
 * that hash(0, s) == hash(FNV32_OFFSET_BASIS, s).
 *
 * The string is read as unsigned bytes. Converting a plain char directly
 * to uint32_t would sign-extend bytes >= 0x80 where char is signed, and
 * XOR 0xFFFFFFxx into the state instead of the byte value; the result
 * would then differ from FNV-1a and depend on the platform's char type.
 */
static inline uint32_t
hash(uint32_t h, const char *s)
{
	const unsigned char *p = (const unsigned char *)s;

	if (h == 0)
		h = FNV32_OFFSET_BASIS;
	for (; *p != '\0'; p++) {
		h ^= *p;
		h *= FNV32_PRIME;
	}
	return (h);
}
/*
 * Round n up to the next power of two (n itself if it already is one),
 * by smearing the highest set bit of n - 1 into all lower positions and
 * then adding one.
 *
 * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
 */
static int
nxtpow2(int n)
{
	int v = n - 1;

	for (int shift = 1; shift <= 16; shift <<= 1)
		v |= v >> shift;
	return (v + 1);
}
/*
* Sort buckets by length in *descending* order, which is the opposite of
* the usual sense of qsort(). So:
*
* b1 > b2 -> return < 0
* b1 < b2 -> return > 0;
*/
static int
len_desc_cmp(const void *b1, const void *b2)
{
AN(b1);
AN(b2);
return ( ((struct bucket *)b2)->n - ((struct bucket *)b1)->n);
}
/*
 * Generate a perfect hash for the n strings in the array strings.
 *
 * The table size is n rounded up to a power of two, but at least MINSZ.
 * Every string is first hashed with the FNV offset basis to select a
 * bucket. Buckets are then handled from largest to smallest:
 *
 * - For a bucket with more than one member, seeds 1, 2, ... are tried
 *   until the seeded hash sends all members to distinct slots of tbl
 *   that are still free. The seed is stored in inter at the bucket's
 *   first-level position.
 *
 * - The member of a single-element bucket is put in one of the slots
 *   left over, and inter gets -slot - 1 at the first-level position.
 *
 * PH_Lookup() interprets negative values in inter as direct slot
 * references, so a seed must be a positive int32_t. The search therefore
 * gives up when the seed reaches INT32_MAX.
 *
 * The strings must be pairwise distinct. Identical strings collide under
 * every seed, so the seed search would run to exhaustion and fail with
 * ERANGE.
 *
 * Returns the new object, to be released with PH_Free(), or NULL if an
 * allocation failed or, with errno set to ERANGE, if no seed was found
 * for some bucket.
 */
struct ph *
PH_Generate(char * const * const strings, unsigned n)
{
	int *found = NULL;
	struct bucket *buckets = NULL;
	int32_t *inter = NULL;
	unsigned sz, mask, i, nfree, *tbl = NULL;
	struct ph *ph = NULL;

	AN(strings);
	assert(n > 0 && n < 1 << 30);
	sz = nxtpow2(n);
	if (sz < MINSZ)
		sz = MINSZ;
	mask = sz - 1;

	errno = 0;
	buckets = calloc(sz, sizeof(*buckets));
	if (buckets == NULL)
		return (NULL);
	tbl = malloc(sz * sizeof(*tbl));
	if (tbl == NULL)
		goto exit;
	/* All bits set: every slot starts out as UINT_MAX, meaning free. */
	memset(tbl, 0xff, sz * sizeof(*tbl));
	inter = calloc(sz, sizeof(*inter));
	if (inter == NULL)
		goto exit;
	found = malloc(sz * sizeof(*found));
	if (found == NULL)
		goto exit;

	/* Distribute the string indices over the first-level buckets. */
	for (i = 0; i < n; i++) {
		uint32_t h = hash(FNV32_OFFSET_BASIS, strings[i]) & mask;
		unsigned *idx;

		assert(h < sz);
		errno = 0;
		/*
		 * Assign through a temporary: if realloc() fails, the old
		 * array is still owned by the bucket and is freed at exit.
		 */
		idx = realloc(buckets[h].idx,
			      (buckets[h].n + 1) * sizeof(*idx));
		if (idx == NULL)
			goto exit;
		buckets[h].idx = idx;
		buckets[h].idx[buckets[h].n] = i;
		buckets[h].n++;
	}

	qsort(buckets, sz, sizeof(*buckets), len_desc_cmp);

	/* Buckets with collisions: search a seed that separates them. */
	for (i = 0; i < sz && buckets[i].n > 1; i++) {
		struct bucket *bucket = &buckets[i];
		uint32_t m = 1;

		AN(bucket->idx);
		/* found[] marks the slots claimed with the current seed. */
		memset(found, 0, sz * sizeof(*found));
		for (int j = 0; j < bucket->n && m < (uint32_t)INT32_MAX;
		     j++) {
			uint32_t h = hash(m, strings[bucket->idx[j]]) & mask;

			assert(h < sz);
			if (tbl[h] != UINT_MAX || found[h] != 0) {
				/* Collision: restart with the next seed. */
				m++;
				j = -1;
				memset(found, 0, sz * sizeof(*found));
				continue;
			}
			found[h] = 1;
		}
		if (m == (uint32_t)INT32_MAX) {
			errno = ERANGE;
			goto exit;
		}
		inter[hash(FNV32_OFFSET_BASIS, strings[bucket->idx[0]]) & mask]
		    = (int32_t)m;
		for (int j = 0; j < bucket->n; j++)
			tbl[hash(m, strings[bucket->idx[j]]) & mask]
			    = bucket->idx[j];
	}

	/* Reuse found[] for a list of free indices in tbl. */
	memset(found, 0, sz * sizeof(*found));
	nfree = 0;
	for (unsigned j = 0; j < sz; j++)
		if (tbl[j] == UINT_MAX)
			found[nfree++] = j;

	/*
	 * Continuing with the value of i from the for loop above, now at
	 * buckets[i].n <= 1. Each single-element bucket takes the next
	 * free slot.
	 */
	for (unsigned k = 0; i < sz && buckets[i].n > 0; k++, i++) {
		struct bucket *bucket = &buckets[i];

		assert(k < nfree);
		tbl[found[k]] = bucket->idx[0];
		inter[hash(FNV32_OFFSET_BASIS, strings[bucket->idx[0]]) & mask]
		    = -found[k] - 1;
	}

	errno = 0;
	ALLOC_OBJ(ph, PH_MAGIC);
	if (ph == NULL)
		goto exit;
	/* Ownership of inter and tbl passes to the new object. */
	ph->inter = inter;
	ph->tbl = tbl;
	ph->mask = mask;

 exit:
	AN(buckets);
	for (i = 0; i < sz; i++)
		free(buckets[i].idx);
	free(buckets);
	free(found);
	if (ph == NULL) {
		free(inter);
		free(tbl);
	}
	return (ph);
}
/*
 * Look up subject in the perfect hash ph that was generated for strings.
 *
 * The first-level hash of subject selects an entry of inter. A negative
 * entry names a slot of tbl directly; any other entry is the seed for a
 * second hash that yields the slot. The slot holds a candidate index
 * into strings, which counts as a match only if the string at that
 * index is equal to subject.
 *
 * Returns the index of the matching string, or UINT_MAX if ph is NULL
 * or no string matches.
 */
unsigned
PH_Lookup(const struct ph * const restrict ph,
	  char * const restrict * const restrict strings,
	  const char * const restrict subject)
{
	int32_t seed;
	unsigned slot, candidate;

	CHECK_OBJ_ORNULL(ph, PH_MAGIC);
	AN(strings);
	AN(subject);
	if (ph == NULL)
		return (UINT_MAX);

	seed = ph->inter[hash(FNV32_OFFSET_BASIS, subject) & ph->mask];
	if (seed >= 0)
		slot = hash(seed, subject) & ph->mask;
	else
		slot = -seed - 1;

	candidate = ph->tbl[slot];
	if (candidate != UINT_MAX && strcmp(subject, strings[candidate]) == 0)
		return (candidate);
	return (UINT_MAX);
}
void
PH_Free(struct ph *ph)
{
if (ph == NULL)
return;
free(ph->inter);
free(ph->tbl);
FREE_OBJ(ph);
}
/*-
* Copyright (c) 2020 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Geoffrey Simmons <geoffrey.simmons@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include "vsb.h"
/* Opaque perfect hash object; the definition is private to ph.c. */
struct ph;
/*
 * Generate a perfect hash for the n strings in the array strings.
 * n must be greater than 0 and less than 2^30 (enforced by assertion).
 * Returns NULL on failure; errno is set to ERANGE if no hash could be
 * found. The result is released with PH_Free().
 */
struct ph * PH_Generate(char * const * const strings, unsigned n);
/*
 * Return the index in strings of the string equal to subject, or
 * UINT_MAX if there is none or if ph is NULL. The candidate index comes
 * from the tables built by PH_Generate(), so strings is expected to be
 * the array that ph was generated from.
 */
unsigned PH_Lookup(const struct ph * const restrict ph,
char * const restrict * const restrict strings,
const char * const restrict subject);
/* Free a perfect hash and its tables. A NULL pointer is ignored. */
void PH_Free(struct ph *ph);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment