Commit 1f7815f1 authored by Geoff Simmons's avatar Geoff Simmons

Implement perfect hashing based on universal hashing.

Universal hashing has a sounder theoretical basis; in particular, it
doesn't have the dubious minimum hash table size below which a
perfect hash may not be possible, and which was set by trial and error.

For nearly all test data, universal hashing performs at least as
well. It is especially better for sets with longer strings,
since the subject string is cast as an array of uint32_t, so the
hash is computed in fewer operations.

The only exception I've noticed is /usr/share/dict/words, which now
appears to have more collisions than under the previous approach.
But it appears likely that this only becomes an issue for sets that
are much larger than are probable for VCL use cases (in the 100,000
range), and if all of the sets' elements are tested for matches
about equally often (whereas real-world usage patterns tend to
match a subset much more frequently).
parent b956c988
......@@ -13,7 +13,8 @@ libvmod_selector_la_SOURCES = \
qp.c \
popcnt_compat.h \
ph.h \
ph.c
ph.c \
rnd.h
nodist_libvmod_selector_la_SOURCES = \
vcc_if.c \
......@@ -29,7 +30,7 @@ vmod_selector.c patricia.c: patricia.h
qp.c: qp.h popcnt_compat.h
ph.c: ph.h
ph.c: ph.h rnd.h
vmod_selector.lo: $(nodist_libvmod_selector_la_SOURCES)
......
This diff is collapsed.
......@@ -26,18 +26,65 @@
* SUCH DAMAGE.
*/
/* Interface for perfect hashing */
#include <stdint.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include "vsb.h"
/*
* A perfect hash comprises a struct ph and a table of strings, both of
* which are owned by a VMOD object. Successful lookups return the index
* of a string in the table.
*/
struct ph;
/*
* Initialize perfect hashing. Supplies a seed for random number
* generation, which should be obtained from an entropy source. Only
* needs to be called once.
*/
void PH_Init(uint32_t seed[4]);
/*
* Generate a perfect hash from a table of strings with n elements.
* strings MAY NOT be NULL, and SHALL NOT contain duplicates. n MUST be >
* 0 and <= 2^31.
*
* Returns non-NULL on success, NULL on error, except that PH_Generate()
* will probably not terminate if strings contains duplicates.
*
* On error, errno is set. errno == ERANGE if n is out of range, or may
* be set for other errors (probably ENOMEM for malloc failures).
*/
struct ph * PH_Generate(char * const * const strings, unsigned n);
/*
* Return the index of subject in the table strings, with which ph was
* generated.
*
* ph MUST be generated for strings previously by PH_Generate() (or NULL).
* strings and subject MAY NOT be NULL.
*
* Returns the index of subject in strings, or UINT_MAX if subject is not
* in strings or if ph is NULL.
*/
unsigned PH_Lookup(const struct ph * const restrict ph,
char * const restrict * const restrict strings,
const char * const restrict subject);
/*
* Return a string dump of ph as generated for strings.
*
* Returns an empty buffer if ph is NULL. If ph is non-NULL, strings MAY
* NOT be NULL.
*/
struct vsb * PH_Dump(struct ph *ph, char **strings);
/*
* Free ph. Silently does nothing if ph is NULL.
*/
void PH_Free(struct ph *ph);
/*-
* Copyright (c) 2020 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Geoffrey Simmons <geoffrey.simmons@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* It isn't cryptography, but the theoretical guarantees of universal
* hashing depend on the randomness of the keys, so we need a good RNG.
*
* This is an implementation of KISS99, from George Marsaglia's post to
* sci.stat.math and sci.math on January 20, 1999. It passes all of the
* tests in TestU01.
*
* It is *not* thread-safe, due to the static state variables.
*
* http://www.ciphersbyritter.com/NEWS4/RANDC.HTM#36A5FC62.17C9CC33@stat.fsu.edu
* https://www.iro.umontreal.ca/~lecuyer/myftp/papers/testu01.pdf
*/
#include <stdint.h>
/*
 * Generator state: two multiply-with-carry lanes (mwc1, mwc2), a 3-shift
 * register (jsr) and a linear congruential generator (jcong).
 *
 * These are file-scope statics, so the generator is not thread-safe.
 */
static uint32_t mwc1, mwc2, jsr, jcong;

/* Multipliers of the two multiply-with-carry lanes. */
#define RND_MULT1	36969U
#define RND_MULT2	18000U

/*
 * Seeds from Marsaglia's KISS99 post, substituted for seed words that
 * would leave a component of the generator stuck.
 */
#define RND_DFLT_MWC1	362436069U
#define RND_DFLT_MWC2	521288629U
#define RND_DFLT_JSR	123456789U

/*
 * One multiply-with-carry step with multiplier n: the low 16 bits of x
 * are the current value, the high 16 bits are the carry.
 */
#define MWC(n, x) ((n) * ((x) & 65535) + ((x) >> 16))

/*
 * Return seed if it is usable for the multiply-with-carry lane with
 * multiplier mult, otherwise dflt.
 *
 * A seed is unusable if the MWC step maps it to itself, since the lane
 * would then never change. That is the case for 0 and for
 * (mult << 16) - 1.
 */
static inline uint32_t
rnd_mwc_seed(uint32_t mult, uint32_t seed, uint32_t dflt)
{
	if (MWC(mult, seed) == seed)
		return (dflt);
	return (seed);
}

/*
 * Seed the generator.
 *
 * seed[0] and seed[1] seed the multiply-with-carry lanes, seed[2] the
 * shift register and seed[3] the congruential generator. seed MUST point
 * to at least 4 elements.
 *
 * Seed words that would freeze a component (a fixed point of an MWC
 * lane, or 0 for the shift register, which can never leave 0 under
 * xor/shift) are replaced by fixed default values. Any value is
 * acceptable for the congruential generator.
 */
static inline void
rnd_init(uint32_t seed[4])
{
	mwc1 = rnd_mwc_seed(RND_MULT1, seed[0], RND_DFLT_MWC1);
	mwc2 = rnd_mwc_seed(RND_MULT2, seed[1], RND_DFLT_MWC2);
	jsr = (seed[2] != 0 ? seed[2] : RND_DFLT_JSR);
	jcong = seed[3];
}

/*
 * Advance all components of the generator by one step and return the
 * next 32-bit value, which combines them as (MWC ^ CONG) + SHR3.
 */
static inline uint32_t
rnd_nxt(void)
{
	uint32_t mwc;

	/* Multiply-with-carry: concatenate the low halves of both lanes. */
	mwc1 = MWC(RND_MULT1, mwc1);
	mwc2 = MWC(RND_MULT2, mwc2);
	mwc = (mwc1 << 16) + mwc2;

	/* 3-shift register. */
	jsr ^= jsr << 17;
	jsr ^= jsr >> 13;
	jsr ^= jsr << 5;

	/* Linear congruential generator, modulo 2^32 by wraparound. */
	jcong = 69069 * jcong + 1234567;

	return ((mwc ^ jcong) + jsr);
}
......@@ -106,22 +106,26 @@ usage(const char *argv, int status)
int
main(int argc, char *argv[])
{
FILE *stringsf = stdin, *csv = NULL;
FILE *stringsf = stdin, *csv = NULL, *urandom = NULL;;
char **strings = NULL, **inputs = NULL, *line, *inputf = NULL,
*csvf = NULL;
*csvf = NULL, *dumpf = NULL;
size_t lineln = LINE_MAX;
ssize_t readln;
unsigned n = 0, ninputs = 0;
struct timespec before, after, start, finish;
uint64_t ns = 0, iters, matches;
uint32_t seed[4];
int opt, do_shuf = 0, do_iters = ITERATIONS;
struct rusage rusage;
while ((opt = getopt(argc, argv, "hsc:i:n:")) != -1) {
while ((opt = getopt(argc, argv, "hsc:d:i:n:")) != -1) {
switch (opt) {
case 'c':
csvf = optarg;
break;
case 'd':
dumpf = optarg;
break;
case 'h':
usage(argv[0], EXIT_SUCCESS);
break;
......@@ -242,6 +246,26 @@ main(int argc, char *argv[])
printf("Clock resolution %ld ns\n",
before.tv_sec * BILLION + before.tv_nsec);
printf("\nInitializing perfect hashing ...\n");
errno = 0;
if ((urandom = fopen("/dev/urandom", "r")) == NULL) {
fprintf(stderr, "Cannot open /dev/urandom: %s\n",
strerror(errno));
exit(EXIT_FAILURE);
}
(void)fread(seed, sizeof(uint32_t), 4, urandom);
if (ferror(urandom)) {
fprintf(stderr, "Error reading /dev/urandom: %s\n",
strerror(errno));
exit(EXIT_FAILURE);
}
if (fclose(urandom) != 0) {
fprintf(stderr, "Error closing /dev/urandom: %s\n",
strerror(errno));
exit(EXIT_FAILURE);
}
PH_Init(seed);
printf("\nBuilding perfect hash ...\n");
errno = 0;
(void)clock_gettime(CLOCK, &before);
......@@ -262,6 +286,34 @@ main(int argc, char *argv[])
printf("Generated for %u strings in %.9f s, mean %lu ns/string\n", n,
ns * 1e-9, ns / n);
if (dumpf != NULL) {
FILE *df;
struct vsb *vsb;
printf("\nDumping hash to %s ...\n", dumpf);
vsb = PH_Dump(ph, strings);
CHECK_OBJ_NOTNULL(vsb, VSB_MAGIC);
errno = 0;
if ((df = fopen(dumpf, "w")) == NULL) {
fprintf(stderr, "Cannot open %s: %s\n", dumpf,
strerror(errno));
exit(EXIT_FAILURE);
}
if (fwrite(VSB_data(vsb), 1, VSB_len(vsb), df)
!= (unsigned)VSB_len(vsb)) {
fprintf(stderr, "Error writing to %s: %s\n", dumpf,
strerror(errno));
exit(EXIT_FAILURE);
}
VSB_destroy(&vsb);
if (fclose(df) != 0) {
fprintf(stderr, "Error closing %s: %s\n", dumpf,
strerror(errno));
exit(EXIT_FAILURE);
}
printf("... done.\n");
}
if (do_iters == 0)
exit(EXIT_SUCCESS);
......
......@@ -39,6 +39,7 @@
#include "vcl.h"
#include "vre.h"
#include "vbm.h"
#include "vrnd.h"
#include "cache/cache_director.h"
/*
......@@ -117,6 +118,7 @@ vmod_event(VRT_CTX, struct vmod_priv *priv, enum vcl_event_e e)
{
struct vsc_head *vsc_head;
struct vsc_entry *vsc_entry;
uint32_t seed[4];
ASSERT_CLI();
CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC);
......@@ -132,6 +134,10 @@ vmod_event(VRT_CTX, struct vmod_priv *priv, enum vcl_event_e e)
vsc_head = priv->priv;
switch(e) {
case VCL_EVENT_LOAD:
AZ(VRND_RandomCrypto(seed, sizeof(seed)));
PH_Init(seed);
break;
case VCL_EVENT_DISCARD:
while (!VSLIST_EMPTY(vsc_head)) {
vsc_entry = VSLIST_FIRST(vsc_head);
......@@ -155,7 +161,7 @@ vmod_event(VRT_CTX, struct vmod_priv *priv, enum vcl_event_e e)
}
break;
default:
assert(e == VCL_EVENT_LOAD);
WRONG("Illegal event type");
}
return 0;
}
......@@ -405,8 +411,8 @@ vmod_set_compile(VRT_CTX, struct VPFX(selector_set) *set)
errno = 0;
if ((set->hash = PH_Generate(members, set->nmembers)) == NULL) {
if (errno == ERANGE)
VFAIL(ctx, "%s.compile(): perfect hash cannot be "
"generated for this set", set->vcl_name);
VFAIL(ctx, "%s.compile(): too many strings in the set",
set->vcl_name);
else
VFAIL(ctx, "%s.compile() failed: %s", set->vcl_name,
strerror(errno));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment