Commit 83364942 authored by Geoff Simmons

Implement QP prefix searching without recursion.

The new algorithm improves efficiency with iteration in place of
recursion, and in a number of other ways:

- Avoid searches into dead-end branches. Previously, all branches were
traversed in order to handle the overlapping prefix case -- for example,
"foo" and "foobar" both in the set. Now we just search the tree for a
match, but before descending into the next branch, we check whether the
current prefix also matches a terminating node at any of the other branches.

- Only do string comparisons when we hit a terminating node.

- Mark terminating nodes with a flag in the tree, so that we don't go
looking for the null byte in the strings table during the search.

While we're here, rename the flag for the nibble search from hilo to
hinib -- it is non-zero if and only if we inspect the most significant
nibble at that node. Also remove some dead code from QP_Insert().
parent fa021199
......@@ -52,7 +52,8 @@ struct qp_y {
unsigned short off;
unsigned short len;
uint16_t bitmap;
unsigned char hilo;
unsigned int hinib:1;
unsigned int term:1;
};
static struct qp_y *
......@@ -75,15 +76,18 @@ y_alloc(unsigned idx, unsigned short off, size_t len)
y->len = (unsigned short) len;
AZ(y->branch);
AZ(y->bitmap);
AZ(y->hilo);
AZ(y->hinib);
AZ(y->term);
return (y);
}
/*
 * Allocate a leaf node for index idx via y_alloc(). The node's offset is
 * the distance of c from the base pointer b, and its length is strlen(c),
 * i.e. it covers the rest of the string starting at c.
 *
 * NOTE(review): this is a diff rendering without +/- markers. The direct
 * "return y_alloc(...)" statement appears to be the pre-change body, and
 * the statements after it (which also set y->term) the post-change body --
 * confirm against the repository.
 *
 * NOTE(review): callers elsewhere in this file NULL-check the result of
 * y_alloc() and of this function (e.g. "if (y_new == NULL)"), so y may be
 * NULL here; "y->term = 1" would then dereference NULL. Confirm and guard.
 */
static inline struct qp_y *
y_leaf_alloc(unsigned idx, unsigned char *c, unsigned char *b)
{
return y_alloc(idx, (unsigned short)(uintptr_t)(c - b),
strlen((char *)c));
struct qp_y *y = y_alloc(idx, (unsigned short)(uintptr_t)(c - b),
strlen((char *)c));
/* A leaf extends to the end of its string, so flag it as terminating. */
y->term = 1;
return (y);
}
static int
......@@ -133,7 +137,8 @@ y_dup(struct qp_y *y0, unsigned short len)
return (NULL);
y->bitmap = y0->bitmap;
y->hilo = y0->hilo;
y->hinib = y0->hinib;
y->term = y0->term;
y->branch = y0->branch;
return (y);
}
......@@ -141,7 +146,7 @@ y_dup(struct qp_y *y0, unsigned short len)
/*
 * Return a mask with exactly one bit set: the bit whose position is the
 * value (0..15) of the nibble of c that node y inspects. The high nibble is
 * used when the node's flag is non-zero, the low nibble otherwise.
 *
 * NOTE(review): both the old (y->hilo) and the new (y->hinib) declaration
 * of "shift" are shown because this is a diff view without +/- markers;
 * only one of them can be present in the compiled file.
 */
static inline uint16_t
getbits(const struct qp_y * const restrict y, unsigned char c)
{
unsigned shift = y->hilo << 2;
unsigned shift = y->hinib << 2;
/* shift is 0 or 4, so mask selects the low or the high nibble of c. */
unsigned mask = 0x0f << shift;
return (1 << ((c & mask) >> shift));
}
......@@ -168,6 +173,7 @@ QP_Insert(struct qp_y * * restrict root, unsigned idx,
*root = y_alloc(idx, 0, strlen(strings[idx]));
if (*root == NULL)
return (-1);
(*root)->term = 1;
return (0);
}
......@@ -251,7 +257,7 @@ QP_Insert(struct qp_y * * restrict root, unsigned idx,
}
}
y->hilo = ((s[i] ^ *c) & 0xf0) != 0;
y->hinib = ((s[i] ^ *c) & 0xf0) != 0;
bitmap = getbits(y, *c);
y->bitmap = bitmap;
......@@ -276,6 +282,8 @@ QP_Insert(struct qp_y * * restrict root, unsigned idx,
/*
* Move the current node down as a branch.
*/
if (y->branch != NULL)
y->term = 0;
bitmap = getbits(y, s[i]);
y->bitmap |= bitmap;
if (y_realloc_branch(y, bitmap) != 0) {
......@@ -292,62 +300,43 @@ QP_Insert(struct qp_y * * restrict root, unsigned idx,
}
AN(y->bitmap);
if (i < y->len) {
AN(s[i]);
y_new = y_leaf_alloc(idx, c, b);
if (y_new == NULL)
return (-1);
y_old = y_dup(y, i);
if (y_old == NULL) {
FREE_OBJ(y_new);
return (-1);
}
y->hilo = ((s[i] ^ *c) & 0xf0) != 0;
bitmap = getbits(y, *c);
y->bitmap = bitmap;
errno = 0;
y->branch = malloc(sizeof(*y->branch));
if (y->branch == NULL) {
FREE_OBJ(y_new);
return (-1);
}
AZ(getidx(y, bitmap));
y->branch[0] = y_new;
bitmap = getbits(y, s[i]);
y->bitmap |= bitmap;
if (y_realloc_branch(y, bitmap) != 0) {
FREE_OBJ(y_old);
FREE_OBJ(y_new);
return (-1);
}
n = getidx(y, bitmap);
ANIB(n);
y->branch[n] = y_old;
y->len = i;
return (0);
assert(i < y->len);
AN(s[i]);
y_new = y_leaf_alloc(idx, c, b);
if (y_new == NULL)
return (-1);
y_old = y_dup(y, i);
if (y_old == NULL) {
FREE_OBJ(y_new);
return (-1);
}
/*
* The branch slot is unoccupied, add a new leaf.
*/
y->hinib = ((s[i] ^ *c) & 0xf0) != 0;
y->term = 0;
bitmap = getbits(y, *c);
AZ(y->bitmap & bitmap);
y_new = y_leaf_alloc(idx, c, b);
if (y_new == NULL)
y->bitmap = bitmap;
errno = 0;
y->branch = malloc(sizeof(*y->branch));
if (y->branch == NULL) {
FREE_OBJ(y_new);
return (-1);
}
AZ(getidx(y, bitmap));
y->branch[0] = y_new;
bitmap = getbits(y, s[i]);
y->bitmap |= bitmap;
if (y_realloc_branch(y, bitmap) != 0) {
FREE_OBJ(y_old);
FREE_OBJ(y_new);
return (-1);
}
n = getidx(y, bitmap);
ANIB(n);
y->branch[n] = y_new;
y->branch[n] = y_old;
y->len = i;
return (0);
}
}
......@@ -391,48 +380,20 @@ QP_Lookup(const struct qp_y * const restrict root,
return (UINT_MAX);
}
/*
 * NOTE(review): diff view without +/- markers. The lines below interleave
 * the recursive qp_search() with the helper update_match(); as shown they
 * do not form one compilable function. Per the commit message the
 * recursion is replaced by iteration, so the qp_search() lines are
 * presumably the removed side -- confirm against the repository.
 */
static int
qp_search(const struct qp_y * const restrict y,
char * const restrict * const restrict strings,
const unsigned char * restrict subject, size_t len,
struct match_data * const restrict match)
/*
 * update_match(): record idx as a matching prefix in *match.
 *
 * Returns -1 without recording anything when match->n has already reached
 * match->limit. Otherwise appends idx to match->indices, increments
 * match->n, widens match->min and match->max to include idx, sets
 * match->exact to idx when l == len, and returns 0.
 *
 * QP_Prefixes passes the subject length as len and the end of the matched
 * node (off + len of that node) as l, so l == len means the whole subject
 * was matched.
 */
static inline int
update_match(struct match_data * const match, unsigned idx, size_t len,
size_t l)
{
size_t l;
int branches;
if (y == NULL)
return (0);
CHECK_OBJ(y, QP_Y_MAGIC);
l = y->off + y->len;
if (l > len)
return (0);
if (y->len > 0
&& memcmp(subject + y->off, strings[y->idx] + y->off, y->len) != 0)
return (0);
/* Recursive version: find the end of a string by looking for its NUL. */
if (strings[y->idx][l] == '\0') {
if (match->n == match->limit)
return (-1);
match->indices[match->n] = y->idx;
match->n++;
if (y->idx < match->min)
match->min = y->idx;
if (y->idx > match->max)
match->max = y->idx;
if (l == len) {
match->exact = y->idx;
return (0);
}
}
if (y->branch == NULL)
return (0);
AN(y->bitmap);
branches = popcount(y->bitmap);
/* Recursive version: descend into every branch of this node. */
for (int i = 0; i < branches; i++)
if (qp_search(y->branch[i], strings, subject, len, match) != 0)
return (-1);
/* No room left in match->indices: report failure to the caller. */
if (match->n == match->limit)
return (-1);
match->indices[match->n] = idx;
match->n++;
if (idx < match->min)
match->min = idx;
if (idx > match->max)
match->max = idx;
if (l == len)
match->exact = idx;
return (0);
}
......@@ -451,12 +412,68 @@ QP_Prefixes(const struct qp_y * const restrict root,
AN(subject);
match->n = 0;
if (root == NULL)
return (0);
match->min = UINT_MAX;
match->max = 0;
match->exact = UINT_MAX;
len = strlen(subject);
return (qp_search(root, strings, (unsigned char *)subject, len, match));
for (const struct qp_y *y = root;;) {
size_t l;
uint16_t bitmap;
int idx = -1, branches;
CHECK_OBJ(y, QP_Y_MAGIC);
l = y->off + y->len;
if (l > len)
return (0);
if (y->term) {
if (strncmp(subject, strings[y->idx], l) != 0)
return (0);
if (update_match(match, y->idx, len, l) != 0)
return (-1);
if (l == len)
return (0);
}
if (y->branch == NULL)
return (0);
bitmap = getbits(y, subject[l]);
if ((y->bitmap & bitmap) != 0) {
idx = getidx(y, bitmap);
ANIB(idx);
}
/*
* Before we advance to the next branch, check if the
* current prefix matches any of the other branches from
* this node (this is the overlapping prefix case).
*/
branches = popcount(y->bitmap);
for (int i = 0; i < branches; i++) {
const struct qp_y *yy;
if (i == idx)
continue;
yy = y->branch[i];
CHECK_OBJ_NOTNULL(yy, QP_Y_MAGIC);
if (yy->off + yy-> len != l)
continue;
if (yy->term) {
if (strncmp(subject, strings[yy->idx], l) != 0)
return (0);
assert(l != len);
if (update_match(match, yy->idx, len, l) != 0)
return (-1);
}
}
if (idx == -1)
return (0);
y = y->branch[idx];
AN(y);
}
}
void
......@@ -493,7 +510,8 @@ qp_print_tree(struct qp_y *y, struct vsb *sb, char **strings)
VSB_printf(sb, "strings[idx][off]..[off+len] = %.*s\n", y->len,
strings[y->idx] + y->off);
VSB_printf(sb, "bitmap = 0x%04x\n", y->bitmap);
VSB_printf(sb, "hilo = %d\n", y->hilo);
VSB_printf(sb, "hinib = %d\n", y->hinib);
VSB_printf(sb, "term = %d\n", y->term);
VSB_printf(sb, "branch = %p\n", y->branch);
VSB_printf(sb, "branches = %d\n", popcount(y->bitmap));
if (y->bitmap != 0) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment