Commit bb903e03, authored by Geoff Simmons

Add the QP interface as a possible replacement for patricia tries.

QP stands for "quadbit patricia tries", inspired by the work of Tony Finch:
https://dotat.at/prog/qp/README.html

Radix 16 tries, examining a nibble at a time, to make the tries
smaller and reduce pointer chasing.
parent 9c27ee9f
......@@ -78,6 +78,9 @@ AC_FUNC_REALLOC
AC_TYPE_SIZE_T
AC_TYPE_UINT64_T
# Check if gcc has this builtin (clang has a has_builtin() macro).
AX_GCC_BUILTIN(__builtin_popcount)
# --enable-stack-protector
AC_ARG_ENABLE(stack-protector,
AS_HELP_STRING([--enable-stack-protector],[enable stack protector (default is YES)]),
......
......@@ -8,7 +8,10 @@ vmod_LTLIBRARIES = libvmod_selector.la
libvmod_selector_la_SOURCES = \
vmod_selector.c \
patricia.h \
patricia.c
patricia.c \
qp.h \
qp.c \
popcnt_compat.h
nodist_libvmod_selector_la_SOURCES = \
vcc_if.c \
......@@ -22,6 +25,8 @@ dist_man_MANS = vmod_selector.3
vmod_selector.c patricia.c: patricia.h
qp.c: qp.h popcnt_compat.h
vmod_selector.lo: $(nodist_libvmod_selector_la_SOURCES)
vcc_if.h vmod_selector.rst vmod_selector.man.rst: vcc_if.c
......
/*-
* Copyright (c) 2020 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Geoffrey Simmons <geoffrey.simmons@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#ifndef __has_builtin
#define __has_builtin(b) 0
#endif
#if defined(__POPCNT__) && \
(HAVE___BUILTIN_POPCOUNT || __has_builtin(__builtin_popcount))
/*
 * Count the bits set in a 16-bit value using the compiler builtin.
 *
 * The parameter is uint16_t, exactly as in the table-driven fallback
 * below, so that both variants of popcount() return the same result for
 * every argument regardless of how the code was built.
 */
static inline int
popcount(uint16_t x)
{
	return (__builtin_popcount(x));
}
#else
/*
 * Number of bits set in each possible byte value. The table is generated
 * by the B2/B4/B6 helper macros (construction from the Stanford Bit
 * Twiddling Hacks); the helpers are undefined again right after use so
 * that they do not leak into files that include this header.
 */
static const unsigned char popcnt_tbl[256] =
{
#	define B2(n) (n), (n)+1, (n)+1, (n)+2
#	define B4(n) B2(n), B2((n)+1), B2((n)+1), B2((n)+2)
#	define B6(n) B4(n), B4((n)+1), B4((n)+1), B4((n)+2)
	B6(0), B6(1), B6(1), B6(2)
#	undef B6
#	undef B4
#	undef B2
};

/*
 * Count the bits set in a 16-bit value by adding the table entries for
 * its high and low bytes.
 */
static inline int
popcount(uint16_t n)
{
	return (popcnt_tbl[n >> 8] + popcnt_tbl[n & 0xff]);
}
#endif
/*-
* Copyright (c) 2020 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Geoffrey Simmons <geoffrey.simmons@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Inspired by Varnish hash_critbit.c, tarsnap's Patricia implementation,
* and Tony Finch's qp tries.
*/
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "vdef.h"
#include "vas.h"
#include "miniobj.h"
#include "qp.h"
#include "popcnt_compat.h"
/*
 * "Assert nibble": passes (x) with everything but its low four bits to
 * AZ(), i.e. checks that x is in the range 0..15. AZ() comes from vas.h
 * and is presumably the usual assert-zero macro.
 */
#define ANIB(x) AZ((x) & ~0x0f)
/*
 * A trie node. A node does not own any string data; it refers to a
 * segment of one of the strings in the array passed to the QP_*()
 * functions.
 */
struct pt_y {
unsigned magic;
#define QP_Y_MAGIC 0x6dfde24a
/* Index in the strings array of a string containing this node's segment */
unsigned idx;
/*
 * Child nodes, or NULL for a leaf. Holds popcount(bitmap) entries,
 * ordered by ascending nibble value.
 */
struct pt_y **branch;
/* Offset of the segment within strings[idx] */
unsigned short off;
/* Length of the segment */
unsigned short len;
/* Bit n is set iff there is a child for nibble value n */
uint16_t bitmap;
/*
 * Which nibble of the byte at offset off+len selects the child:
 * non-zero for the high nibble, zero for the low nibble.
 */
unsigned char hilo;
};
/*
 * Allocate a node without branches for the segment of strings[idx]
 * that starts at off and is len bytes long.
 *
 * Returns NULL with errno set to ERANGE if len does not fit in the
 * node's length field, and NULL if the allocation itself fails.
 */
static struct pt_y *
y_alloc(unsigned idx, unsigned short off, size_t len)
{
	struct pt_y *node = NULL;

	if (len <= USHRT_MAX) {
		errno = 0;
		ALLOC_OBJ(node, QP_Y_MAGIC);
		if (node != NULL) {
			node->idx = idx;
			node->off = off;
			node->len = (unsigned short) len;
			/* A fresh node is a leaf. */
			AZ(node->branch);
			AZ(node->bitmap);
			AZ(node->hilo);
		}
	} else {
		errno = ERANGE;
	}
	return (node);
}
/*
 * Allocate a leaf for the remainder of a string: b is the start of the
 * string and c the position within it at which the leaf's segment
 * begins; the segment runs to the end of the string.
 *
 * Returns NULL with errno set to ERANGE if the offset of c from b does
 * not fit in the node's offset field (rather than silently truncating
 * it), and otherwise whatever y_alloc() returns.
 */
static inline struct pt_y *
y_leaf_alloc(unsigned idx, unsigned char *c, unsigned char *b)
{
	uintptr_t off = (uintptr_t)(c - b);

	/* A (bogus) negative difference shows up here as a huge value. */
	if (off > USHRT_MAX) {
		errno = ERANGE;
		return (NULL);
	}
	return y_alloc(idx, (unsigned short)off, strlen((char *)c));
}
/*
 * Resize the branch array of y to make room for the child selected by
 * bitmap, which must have exactly one bit set. The caller must already
 * have set that bit in y->bitmap, so the array is resized to
 * popcount(y->bitmap) entries. If children for higher nibbles exist,
 * they are moved up by one slot, leaving the slot for the new child
 * free; the caller stores the child there.
 *
 * Returns 0 on success and -1 if realloc() fails.
 *
 * NOTE(review): on failure y->branch is overwritten with NULL, so the
 * previous array is lost. This is left as it is on purpose: QP_Insert()
 * frees nodes it has already stored in the array when this function
 * fails, so keeping the old array here would leave dangling entries
 * reachable from the node. A proper fix has to change the callers as
 * well.
 */
static int
y_realloc_branch(struct pt_y * const y, uint16_t bitmap)
{
	int len;
	uint16_t prev, higher;
	uint8_t loidx;

	assert(popcount(bitmap) == 1);
	AN(y->bitmap & bitmap);
	len = popcount(y->bitmap);
	assert(len <= 16);

	errno = 0;
	y->branch = realloc(y->branch, len * sizeof(*y->branch));
	if (y->branch == NULL)
		return (-1);

	/*
	 * If the previous bitmap had bits above the new one, the entries
	 * for those bits have to move up by one. Unsigned arithmetic
	 * throughout, so that bit 15 needs no special treatment.
	 */
	prev = y->bitmap & ~bitmap;
	higher = prev & ~((bitmap << 1) - 1);
	if (higher == 0)
		return (0);
	assert(bitmap != 0x8000);

	/*
	 * The slot of the new child is the number of previous children
	 * with a lower nibble; that is also the current index of the
	 * lowest child that has to move.
	 */
	loidx = popcount(prev & (bitmap - 1));
	memmove(&y->branch[loidx + 1], &y->branch[loidx],
	    (len - loidx - 1) * sizeof(*y->branch));
	return (0);
}
/*
 * Create a copy of y0 that covers only the tail of y0's segment,
 * starting len bytes into it; len must be less than y0's segment
 * length. The copy takes over y0's bitmap, nibble selector and branch
 * array pointer (the array itself is not copied).
 *
 * Returns NULL with errno set to ERANGE if the offset of the tail does
 * not fit in the node's offset field (rather than silently truncating
 * it), and NULL if y_alloc() fails.
 */
static struct pt_y *
y_dup(struct pt_y *y0, unsigned short len)
{
	struct pt_y *y;
	unsigned off;

	assert(len < y0->len);
	off = (unsigned)y0->off + len;
	if (off > USHRT_MAX) {
		errno = ERANGE;
		return (NULL);
	}
	y = y_alloc(y0->idx, (unsigned short)off, y0->len - len);
	if (y == NULL)
		return (NULL);
	y->bitmap = y0->bitmap;
	y->hilo = y0->hilo;
	y->branch = y0->branch;
	return (y);
}
/*
 * Return the bitmap bit that byte c selects at node y: bit number n,
 * where n is the high nibble of c if y->hilo is set and the low nibble
 * otherwise.
 */
static inline uint16_t
getbits(const struct pt_y * const restrict y, unsigned char c)
{
	const unsigned shift = y->hilo << 2;
	const unsigned nibble = (c >> shift) & 0x0f;

	return (1 << nibble);
}
/*
 * Return the position in y's branch array that belongs to the given
 * bitmap bit: the number of bits set in y->bitmap below that bit.
 */
static inline uint8_t
getidx(const struct pt_y * const restrict y, uint16_t bitmap)
{
	const uint16_t below = y->bitmap & (bitmap - 1);

	return (popcount(below));
}
/*
 * Insert strings[idx] into the trie whose root pointer is *root. If the
 * trie is empty (*root == NULL), the string becomes the root leaf.
 *
 * Returns 0 on success. Returns -1 on failure with errno set: EINVAL if
 * the string is already in the trie, ERANGE if y_alloc() rejects a
 * segment as too long, and otherwise whatever the failed allocation
 * left in errno.
 */
int
QP_Insert(struct pt_y * * restrict root, unsigned idx,
	  char * const restrict * const restrict strings)
{
	struct pt_y *y;
	unsigned char *c, *b;

	AN(root);
	CHECK_OBJ_ORNULL(*root, QP_Y_MAGIC);
	AN(strings);
	AN(strings[idx]);

	if (*root == NULL) {
		*root = y_alloc(idx, 0, strlen(strings[idx]));
		if (*root == NULL)
			return (-1);
		return (0);
	}

	y = *root;
	b = (unsigned char *)strings[idx];
	c = b;
	errno = 0;
	for (;;) {
		unsigned short i;
		unsigned char *s;
		uint16_t bitmap, oldbit;
		uint8_t n;
		struct pt_y *y_new, *y_old, **branch;

		CHECK_OBJ(y, QP_Y_MAGIC);

		/* Advance over the part of the node's segment that matches. */
		s = (unsigned char *)(strings[y->idx] + y->off);
		for (i = 0; *c != '\0' && i < y->len && s[i] == *c; i++)
			c++;

		if (s[i] == '\0' && *c == '\0') {
			/*
			 * The string to be inserted is already in the
			 * trie.
			 */
			assert(i == y->len);
			errno = EINVAL;
			return (-1);
		}

		if (i == y->len && y->branch != NULL) {
			/*
			 * The string to be inserted has a prefix that is
			 * already in the trie.
			 */
			AN(y->bitmap);
			bitmap = getbits(y, *c);
			if ((y->bitmap & bitmap) != 0) {
				/*
				 * Other strings in the trie have the same
				 * prefix, follow the branch.
				 */
				n = getidx(y, bitmap);
				ANIB(n);
				AN(y->branch[n]);
				y = y->branch[n];
				continue;
			}

			/* The branch slot is unoccupied, add a new leaf. */
			y_new = y_leaf_alloc(idx, c, b);
			if (y_new == NULL)
				return (-1);
			y->bitmap |= bitmap;
			/*
			 * NOTE(review): if this fails, y_realloc_branch()
			 * has already replaced y->branch with NULL, so the
			 * node has lost its children.
			 */
			if (y_realloc_branch(y, bitmap) != 0) {
				FREE_OBJ(y_new);
				return (-1);
			}
			n = getidx(y, bitmap);
			ANIB(n);
			y->branch[n] = y_new;
			return (0);
		}

		/*
		 * The new string diverges from the current node, which
		 * therefore has to be split at offset i of its segment.
		 */
		assert(s[i] != *c);
		if (y->branch == NULL) {
			/* Current node is a leaf. */
			AZ(y->bitmap);
		} else {
			/*
			 * A node with branches whose whole segment matched
			 * was handled above, so the mismatch lies within
			 * the segment.
			 */
			AN(y->bitmap);
			assert(i < y->len);
			AN(s[i]);
		}

		/*
		 * Allocate everything that is needed before the current
		 * node is modified, so that a failed allocation leaves the
		 * trie exactly as it was.
		 */
		y_new = y_leaf_alloc(idx, c, b);
		if (y_new == NULL)
			return (-1);
		y_old = NULL;
		if (s[i] != '\0') {
			/*
			 * The rest of the current segment, with the
			 * current branches if any, moves down into a copy.
			 */
			y_old = y_dup(y, i);
			if (y_old == NULL) {
				FREE_OBJ(y_new);
				return (-1);
			}
		}
		errno = 0;
		branch = malloc((y_old == NULL ? 1 : 2) * sizeof(*branch));
		if (branch == NULL) {
			if (y_old != NULL) {
				FREE_OBJ(y_old);
			}
			FREE_OBJ(y_new);
			return (-1);
		}

		/* Branch on the high nibble if it differs, else the low. */
		y->hilo = ((s[i] ^ *c) & 0xf0) != 0;
		bitmap = getbits(y, *c);
		y->bitmap = bitmap;
		y->branch = branch;
		if (y_old == NULL) {
			/*
			 * The current node is a proper prefix, so
			 * we're done after adding the leaf.
			 */
			assert(i == y->len);
			AZ(getidx(y, bitmap));
			y->branch[0] = y_new;
			return (0);
		}

		/*
		 * The two bytes differ in the selected nibble, so the new
		 * leaf and the moved-down node get distinct slots.
		 */
		oldbit = getbits(y, s[i]);
		assert(oldbit != bitmap);
		y->bitmap |= oldbit;
		n = getidx(y, bitmap);
		ANIB(n);
		y->branch[n] = y_new;
		n = getidx(y, oldbit);
		ANIB(n);
		y->branch[n] = y_old;
		y->len = i;
		return (0);
	}
}
/*
 * Look up subject in the trie at root. Returns the index in strings of
 * the string equal to subject, or UINT_MAX if there is none (in
 * particular if the trie is empty).
 */
unsigned
QP_Lookup(const struct pt_y * const restrict root,
	  char * const restrict * const restrict strings,
	  const char * const restrict subject)
{
	const struct pt_y *node = root;
	size_t subjlen;

	AN(strings);
	AN(subject);
	if (node == NULL)
		return UINT_MAX;

	/*
	 * Descend by looking only at the branching nibbles; the one
	 * candidate found that way is verified by the string comparison
	 * at the end.
	 */
	subjlen = strlen(subject);
	while (1) {
		size_t end;
		uint16_t bit;
		uint8_t slot;

		CHECK_OBJ(node, QP_Y_MAGIC);
		end = node->off + node->len;
		if (end > subjlen)
			return UINT_MAX;
		if (node->branch == NULL)
			break;
		bit = getbits(node, subject[end]);
		if ((node->bitmap & bit) == 0)
			break;
		slot = getidx(node, bit);
		ANIB(slot);
		node = node->branch[slot];
		AN(node);
	}

	if (strcmp(subject, strings[node->idx]) != 0)
		return (UINT_MAX);
	return node->idx;
}
/*
 * Recursive worker for QP_Prefixes(): visit the subtree at y and record
 * in match every string in it that is a prefix of subject, whose length
 * is len.
 *
 * Returns 0 on success and -1 if a match was found but match->indices
 * already holds match->limit entries.
 */
static int
qp_search(const struct pt_y * const restrict y,
	  char * const restrict * const restrict strings,
	  const unsigned char * restrict subject, size_t len,
	  struct match_data * const restrict match)
{
	size_t l;
	int branches;

	if (y == NULL)
		return (0);
	CHECK_OBJ(y, QP_Y_MAGIC);

	/* Nothing at or below this node matches unless its segment does. */
	l = y->off + y->len;
	if (l > len)
		return (0);
	if (y->len > 0
	    && memcmp(subject + y->off, strings[y->idx] + y->off, y->len) != 0)
		return (0);

	if (strings[y->idx][l] == '\0') {
		/* A string ends at this node, so it is a prefix. */
		if (match->n == match->limit)
			return (-1);
		match->indices[match->n] = y->idx;
		match->n++;
		if (y->idx < match->min)
			match->min = y->idx;
		if (y->idx > match->max)
			match->max = y->idx;
		if (l == len) {
			/* The prefix is the whole subject. */
			match->exact = y->idx;
			return (0);
		}
	}

	if (y->branch == NULL)
		return (0);
	AN(y->bitmap);
	branches = popcount(y->bitmap);
	for (int i = 0; i < branches; i++)
		if (qp_search(y->branch[i], strings, subject, len, match) != 0)
			return (-1);
	return (0);
}
/*
 * Find all strings in the trie at root that are prefixes of subject,
 * and store the results in match, which is reset first. match->indices
 * must be set and match->limit must be non-zero.
 *
 * Returns the result of the search: 0 on success, -1 if there are more
 * matches than match->limit allows.
 */
int
QP_Prefixes(const struct pt_y * const restrict root,
	    char * const restrict * const restrict strings,
	    const char * const restrict subject,
	    struct match_data * const restrict match)
{
	CHECK_OBJ_NOTNULL(match, MATCH_DATA_MAGIC);
	AN(match->indices);
	AN(match->limit);
	AN(strings);
	AN(subject);

	/* Start from "no matches". */
	match->n = 0;
	match->max = 0;
	match->min = UINT_MAX;
	match->exact = UINT_MAX;

	return (qp_search(root, strings, (const unsigned char *)subject,
	    strlen(subject), match));
}
/*
 * Free the node y, and recursively all nodes below it, together with
 * their branch arrays. A NULL pointer is accepted and ignored.
 */
void
QP_Free(struct pt_y *y)
{
	if (y == NULL)
		return;
	CHECK_OBJ(y, QP_Y_MAGIC);

	if (y->branch != NULL) {
		struct pt_y **child, **end;

		AN(y->bitmap);
		end = y->branch + popcount(y->bitmap);
		for (child = y->branch; child < end; child++) {
			AN(*child);
			QP_Free(*child);
		}
		free(y->branch);
	}
	FREE_OBJ(y);
}
/*
 * Append a textual description of node y to sb, followed by the
 * descriptions of all nodes below it in depth-first order.
 */
static void
qp_print_tree(struct pt_y *y, struct vsb *sb, char **strings)
{
	int nbranches;

	CHECK_OBJ_NOTNULL(y, QP_Y_MAGIC);
	CHECK_OBJ_NOTNULL(sb, VSB_MAGIC);
	nbranches = popcount(y->bitmap);

	/* %p requires a pointer to void, hence the casts. */
	VSB_printf(sb, "node = %p\n", (void *)y);
	VSB_printf(sb, "idx = %u\n", y->idx);
	VSB_printf(sb, "off = %u\n", y->off);
	VSB_printf(sb, "len = %u\n", y->len);
	AN(strings[y->idx]);
	VSB_printf(sb, "strings[idx] = %s\n", strings[y->idx]);
	VSB_printf(sb, "strings[idx][0]..[off] = %.*s\n", y->off,
	    strings[y->idx]);
	VSB_printf(sb, "strings[idx][off]..[off+len] = %.*s\n", y->len,
	    strings[y->idx] + y->off);
	VSB_printf(sb, "bitmap = 0x%04x\n", y->bitmap);
	VSB_printf(sb, "hilo = %d\n", y->hilo);
	VSB_printf(sb, "branch = %p\n", (void *)y->branch);
	VSB_printf(sb, "branches = %d\n", nbranches);
	if (y->bitmap != 0) {
		VSB_printf(sb, "next nibbles = ");
		for (int i = 0; i < 16; i++)
			if (y->bitmap & (1 << i))
				VSB_printf(sb, "%x ", i);
		VSB_printf(sb, "\n");
		AN(y->branch);
		for (int i = 0; i < nbranches; i++)
			VSB_printf(sb, "branch[%d] = %p\n", i,
			    (void *)y->branch[i]);
	}
	VSB_printf(sb, "\n");

	/* nbranches is 0 for a leaf, so this does nothing then. */
	for (int i = 0; i < nbranches; i++)
		qp_print_tree(y->branch[i], sb, strings);
}
/*
 * Return a newly created, finished VSB containing a textual dump of the
 * trie at root; for an empty trie only the root pointer is printed.
 * strings must be non-NULL unless the trie is empty.
 *
 * NOTE(review): the VSB is created here and handed to the caller, who
 * presumably has to destroy it -- confirm against the callers.
 */
struct vsb *
QP_Dump(struct pt_y *root, char **strings)
{
	struct vsb *sb = VSB_new_auto();

	/* %p requires a pointer to void, hence the cast. */
	VSB_printf(sb, "root = %p\n\n", (void *)root);
	if (root != NULL) {
		AN(strings);
		qp_print_tree(root, sb, strings);
	}
	VSB_finish(sb);
	return (sb);
}
/*
 * Recursive worker for QP_Stats(): account for node y, which has depth
 * parent nodes above it, and for all nodes below it.
 *
 * Depth statistics are taken over the nodes at which a string ends,
 * with the root counting as depth 1. Fanout statistics are taken over
 * the nodes that have branches.
 *
 * NOTE(review): this helper has external linkage although qp.h does not
 * declare it; it could probably be static -- confirm that no other file
 * uses it.
 */
void
qp_stats(const struct pt_y * const restrict y,
	 char * const restrict * const restrict strings,
	 struct qp_stats * const restrict stats, unsigned depth)
{
	uint8_t fanout;

	if (y == NULL)
		return;
	CHECK_OBJ(y, QP_Y_MAGIC);

	depth++;
	stats->nodes++;
	if (strings[y->idx][y->off + y->len] == '\0') {
		/* A string ends at this node. */
		if (depth < stats->dmin)
			stats->dmin = depth;
		if (depth > stats->dmax)
			stats->dmax = depth;
		/* Running mean; terms + 1 samples including this one. */
		stats->davg += (depth - stats->davg) / (stats->terms + 1.);
		stats->terms++;
	}

	if (y->bitmap == 0) {
		AZ(y->branch);
		stats->leaves++;
		return;
	}

	AN(y->branch);
	fanout = popcount(y->bitmap);
	assert(fanout <= 16);
	if (fanout < stats->fmin)
		stats->fmin = fanout;
	if (fanout > stats->fmax)
		stats->fmax = fanout;
	/*
	 * Running mean over the nodes with branches. The number of
	 * samples including this one is the number of nodes seen so far
	 * (this node was counted above) minus the leaves among them.
	 */
	stats->favg +=
	    (fanout - stats->favg) / (double)(stats->nodes - stats->leaves);

	for (int i = 0; i < fanout; i++)
		qp_stats(y->branch[i], strings, stats, depth);
}
/*
 * Compute statistics about the trie at root and store them in stats,
 * which is reset first. For an empty trie the minimum fields keep their
 * initial value UINT64_MAX and everything else except nodesz stays 0.
 */
void
QP_Stats(const struct pt_y * const restrict root,
	 char * const restrict * const restrict strings,
	 struct qp_stats * const restrict stats)
{
	CHECK_OBJ_NOTNULL(stats, QP_STATS_MAGIC);

	stats->nodesz = sizeof(*root);

	/* Counters and maxima start at zero, minima at the largest value. */
	stats->nodes = stats->leaves = stats->terms = 0;
	stats->dmax = stats->fmax = 0;
	stats->dmin = stats->fmin = UINT64_MAX;
	stats->davg = stats->favg = 0.;

	qp_stats(root, strings, stats, 0);
}
/*-
* Copyright (c) 2020 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Geoffrey Simmons <geoffrey.simmons@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include "vsb.h"
struct pt_y;
/*
 * Result of a prefix search with QP_Prefixes(). The caller provides the
 * indices array and its capacity in limit; the search fills in the
 * rest.
 */
struct match_data {
unsigned magic;
#define MATCH_DATA_MAGIC 0x0d9a845e
/* Receives the indices of the matching strings */
unsigned *indices;
/* Capacity of indices; the search fails if it would be exceeded */
unsigned limit;
/* Number of entries stored in indices */
unsigned n;
/* Index of the string equal to the whole subject, or UINT_MAX if none */
unsigned exact;
/* Smallest index in indices; UINT_MAX if there are no matches */
unsigned min;
/* Largest index in indices; 0 if there are no matches */
unsigned max;
};
/*
 * Statistics about a trie, filled in by QP_Stats().
 *
 * NOTE(review): this header uses uint64_t but does not include
 * <stdint.h> itself; qp.c includes it before this header. Confirm that
 * every other includer does too, or that vsb.h provides it.
 */
struct qp_stats {
unsigned magic;
#define QP_STATS_MAGIC 0x06d2b30c
/* Total number of nodes */
uint64_t nodes;
/* Number of nodes without branches */
uint64_t leaves;
/* Number of nodes at which a string ends */
uint64_t terms;
/* Size of a node structure in bytes */
uint64_t nodesz;
/* Minimum, maximum and mean depth of the nodes at which a string ends */
uint64_t dmin;
uint64_t dmax;
double davg;
/* Minimum, maximum and mean number of branches of nodes with branches */
uint64_t fmin;
uint64_t fmax;
double favg;
};
/*
 * Insert strings[idx] into the trie at *root (*root == NULL is the
 * empty trie). Returns 0 on success, or -1 with errno set on failure;
 * errno is EINVAL if the string is already in the trie.
 */
int QP_Insert(struct pt_y * * restrict root, unsigned idx,
char * const restrict * const restrict strings);
/*
 * Return the index in strings of the string equal to subject, or
 * UINT_MAX if the trie does not contain it.
 */
unsigned QP_Lookup(const struct pt_y * const restrict root,
char * const restrict * const restrict strings,
const char * const restrict subject);
/*
 * Collect in match all strings in the trie that are prefixes of
 * subject. Returns 0 on success, or -1 if there are more matches than
 * match->limit allows.
 */
int QP_Prefixes(const struct pt_y * const restrict root,
char * const restrict * const restrict strings,
const char * const restrict subject,
struct match_data * const restrict match);
/* Compute statistics about the trie and store them in stats. */
void QP_Stats(const struct pt_y * const restrict root,
char * const restrict * const restrict strings,
struct qp_stats * const restrict stats);
/* Free the trie at y and everything below it; y may be NULL. */
void QP_Free(struct pt_y *y);
/* Return a finished VSB containing a textual dump of the trie. */
struct vsb * QP_Dump(struct pt_y *root, char **strings);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment