Add body matches

parent 2a771ff7
......@@ -115,15 +115,16 @@ never changes during the lifetime of VCL, use ``match``.
.. _re.regex():
new xregex = re.regex(STRING, INT limit, INT limit_recursion)
-------------------------------------------------------------
new xregex = re.regex(STRING, INT limit, INT limit_recursion, BOOL forbody)
---------------------------------------------------------------------------
::
new xregex = re.regex(
STRING,
INT limit=1000,
INT limit_recursion=1000
INT limit_recursion=1000,
BOOL forbody=0
)
Description
......@@ -139,6 +140,10 @@ Description
per-object defaults for the respective parameters of the
`xregex.match()`_ method.
The optional parameter ``forbody`` must be set to true if the
`xregex.match_body()`_ method is to be called on the
object.
Example
``new myregex = re.regex("\bmax-age\s*=\s*(\d+)");``
......@@ -172,6 +177,73 @@ Description
Example
``if (myregex.match(beresp.http.Surrogate-Control)) { # ...``
.. _xregex.match_body():
BOOL xregex.match_body(ENUM which, INT limit, INT limit_recursion)
------------------------------------------------------------------
::
BOOL xregex.match_body(
ENUM {req_body, bereq_body, resp_body} which,
INT limit=0,
INT limit_recursion=0
)
.. _multi segment matching: https://pcre.org/current/doc/html/pcre2partial.html#SEC4
Description
Like `xregex.match()`_, except that it operates on the named body.
For a regular expression to be used with this method, it needs
to be constructed with the ``forbody`` flag set in the
`re.regex()`_ constructor. Calling this method when the flag
was unset results in a VCL failure.
PCRE2 `multi segment matching`_ is used to implement this
method to reduce memory requirements. In particular, unlike
implementations in other vmods, this implementation does *not*
read the full body object into a contiguous memory region. It
might, however, require roughly as much workspace as the combined
size of all body segments spanned by the match.
Under ideal conditions, when the pattern spans only a single
segment of a cached object, the `xregex.match_body()`_ method
does not create copies of the body data.
When used with a ``req_body`` or ``bereq_body`` *which*
argument, this method consumes the request body. If it is to
be used again (for example, to send it to a backend), it
should first be cached by calling
``std.cache_req_body(<size>)``.
Example::
sub vcl_init {
new pattern = re.regex("(a|b)=([^&]*).*&(a|b)=([^&]*)",
forbody=true);
}
sub vcl_recv {
if (pattern.match_body(req_body)) {
return (synth(42200));
}
}
sub vcl_synth {
if (resp.status == 42200) {
set resp.http.n1 = pattern.backref(1, "");
set resp.http.v1 = pattern.backref(2, "");
set resp.http.n2 = pattern.backref(3, "");
set resp.http.v2 = pattern.backref(4, "");
set resp.body = "";
return (deliver);
}
}
# response contains first parameter named a or b from the body as n1,
# first value as v1, and the second parameter and value as n2
# and v2
.. _xregex.backref():
STRING xregex.backref(INT, STRING fallback)
......
varnishtest ".match_body(*req_body)"
# Exercises xregex.match_body() on req_body (client side, vcl_recv) and on
# bereq_body (backend side, vcl_backend_fetch). The xid thresholds in the
# VCL switch behaviour between repetitions of client c1: first the body is
# matched as received, then after std.cache_req_body(), then the request is
# passed so the match happens on the backend side.
varnish v1 -vcl {
import re from "${vmod_topbuild}/src/.libs/libvmod_re.so";
import std;
backend none none;
sub vcl_init {
# forbody=true is needed for .match_body() to be usable on this object
new pattern = re.regex("(a|b)=([^&]*).*&(a|b)=([^&]*)",
forbody=true);
}
sub vcl_synth {
# client-side match: expose the four capture groups as response headers
if (resp.status == 200) {
set resp.http.n1 = pattern.backref(1, "");
set resp.http.v1 = pattern.backref(2, "");
set resp.http.n2 = pattern.backref(3, "");
set resp.http.v2 = pattern.backref(4, "");
set resp.body = "";
return (deliver);
}
}
sub vcl_backend_fetch {
# adjust if number of reqs in c1 changes
if (std.integer(bereq.xid) > 1019) {
std.cache_req_body(1KB);
}
# record the match result in a header so vcl_backend_error can read it
set bereq.http.matched = pattern.match_body(bereq_body);
}
sub vcl_backend_error {
# the backend is "none", so passed requests end up here; a successful
# bereq_body match is turned into a 200 carrying the backrefs
if (bereq.http.matched == "true") {
set beresp.status = 200;
set beresp.http.n1 = pattern.backref(1, "");
set beresp.http.v1 = pattern.backref(2, "");
set beresp.http.n2 = pattern.backref(3, "");
set beresp.http.v2 = pattern.backref(4, "");
set beresp.body = "";
return (deliver);
}
set beresp.status = 400;
}
# std.integer can go away after v-c
# 1d670e2f04a4ff13615c7e6a4ee800e4ba8ebaf2
sub vcl_recv {
# later requests skip the client-side match and go to the backend side
# adjust if number of reqs in c1 changes
if (std.integer(req.xid) > 1009) {
return (pass);
}
# adjust if number of reqs in c1 changes
if (std.integer(req.xid) > 1005) {
std.cache_req_body(1KB);
}
if (pattern.match_body(req_body)) {
return (synth(200));
}
return (synth(400));
}
} -start
# The same four requests are sent in every repetition: no body, a body that
# does not match, a matching body sent with Content-Length, and the same
# matching body split over several chunks.
client c1 -repeat 4 {
txreq
rxresp
expect resp.status == 400
txreq -body "wontmatch"
rxresp
expect resp.status == 400
# simple case - all in one, C-l
txreq -body "saldkhaskdhsaksa=123&sadsadjhsakdh82378e3d&b=43875643543"
rxresp
expect resp.status == 200
expect resp.http.n1 == "a"
expect resp.http.v1 == "123"
expect resp.http.n2 == "b"
expect resp.http.v2 == "43875643543"
# chunked case - the matching text is spread across chunk boundaries
txreq -nolen -hdr "Transfer-encoding: chunked"
chunked "saldkhaskdhs"
chunked "aksa="
chunked "12"
chunked "3&sadsadjhsakdh82378e3d&b=43"
chunked "875643543"
chunkedlen 0
rxresp
expect resp.status == 200
expect resp.http.n1 == "a"
expect resp.http.v1 == "123"
expect resp.http.n2 == "b"
expect resp.http.v2 == "43875643543"
} -run
varnishtest ".match_body(resp.body)"
# Exercises xregex.match_body(resp_body) in vcl_deliver. Server s1 answers
# four requests in order: empty body, non-matching body, matching body with
# Content-Length, and the same matching body sent chunked.
server s1 {
rxreq
txresp
rxreq
txresp -body "wontmatch"
# simple case - all in one, C-l
rxreq
txresp -body "saldkhaskdhsaksa=123&sadsadjhsakdh82378e3d&b=43875643543"
# chunked case - the matching text is spread across chunk boundaries
rxreq
txresp -nolen -hdr "Transfer-encoding: chunked"
chunked "saldkhaskdhs"
chunked "aksa="
chunked "12"
chunked "3&sadsadjhsakdh82378e3d&b=43"
chunked "875643543"
chunkedlen 0
} -start
varnish v1 -vcl+backend {
import re from "${vmod_topbuild}/src/.libs/libvmod_re.so";
import std;
sub vcl_init {
# forbody=true is needed for .match_body() to be usable on this object
new pattern = re.regex("(a|b)=([^&]*).*&(a|b)=([^&]*)",
forbody=true);
}
sub vcl_deliver {
# no match: signal it to the client with a 400 status
if (! pattern.match_body(resp_body)) {
set resp.status = 400;
return (deliver);
}
# match: expose the four capture groups as response headers
set resp.http.n1 = pattern.backref(1, "");
set resp.http.v1 = pattern.backref(2, "");
set resp.http.n2 = pattern.backref(3, "");
set resp.http.v2 = pattern.backref(4, "");
}
} -start
# Distinct URLs are requested, one per scripted s1 response above.
client c1 {
txreq -url "/1"
rxresp
expect resp.status == 400
txreq -url "/2"
rxresp
expect resp.status == 400
txreq -url "/3"
rxresp
expect resp.status == 200
expect resp.http.n1 == "a"
expect resp.http.v1 == "123"
expect resp.http.n2 == "b"
expect resp.http.v2 == "43875643543"
txreq -url "/4"
rxresp
expect resp.status == 200
expect resp.http.n1 == "a"
expect resp.http.v1 == "123"
expect resp.http.n2 == "b"
expect resp.http.v2 == "43875643543"
} -run
varnishtest ".match_body coverage"
# Covers the failure path: the regex is constructed WITHOUT forbody, so
# calling .match_body() on it is expected to fail the VCL transaction.
varnish v1 -vcl {
import re from "${vmod_topbuild}/src/.libs/libvmod_re.so";
backend none none;
sub vcl_init {
# deliberately no forbody=true here
new pattern = re.regex("(a|b)=([^&]*).*&(a|b)=([^&]*)");
}
sub vcl_recv {
pattern.match_body(req_body);
}
} -start
client c1 {
txreq
rxresp
# the VCL failure surfaces to the client as a 503 "VCL failed"
expect resp.status == 503
expect resp.reason == "VCL failed"
} -run
......@@ -99,13 +99,15 @@ re_compile(const char *pattern, unsigned options, char *errbuf,
VCL_VOID
vmod_regex__init(VRT_CTX, struct vmod_re_regex **rep, const char *vcl_name,
VCL_STRING pattern, VCL_INT limit, VCL_INT limit_recursion)
VCL_STRING pattern, VCL_INT limit, VCL_INT limit_recursion,
VCL_BOOL forbody)
{
struct vmod_re_regex *re;
vre_t *vre;
char errbuf[VRE_ERROR_LEN];
int erroffset;
const char *error;
unsigned options = 0;
CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC);
AN(rep);
......@@ -125,7 +127,9 @@ vmod_regex__init(VRT_CTX, struct vmod_re_regex **rep, const char *vcl_name,
return;
}
vre = re_compile(pattern, 0, errbuf, sizeof errbuf, &erroffset);
if (forbody)
options |= PCRE2_PARTIAL_HARD;
vre = re_compile(pattern, options, errbuf, sizeof errbuf, &erroffset);
if (vre == NULL) {
VRT_fail(ctx, "vmod re: error compiling regex \"%s\" in %s "
"constructor: %s (at offset %d)", pattern, vcl_name,
......@@ -133,6 +137,11 @@ vmod_regex__init(VRT_CTX, struct vmod_re_regex **rep, const char *vcl_name,
return;
}
/* duplication with varnish-cache */
if (forbody)
(void) pcre2_jit_compile(VRE_unpack(vre),
PCRE2_JIT_PARTIAL_HARD | PCRE2_JIT_COMPLETE);
ALLOC_OBJ(re, VMOD_RE_REGEX_MAGIC);
AN(re);
re->vre = vre;
......@@ -161,6 +170,7 @@ init_task(VRT_CTX, struct vmod_priv *task)
{
ov_t *ov;
AN(task);
AZ(task->priv);
task->priv = WS_Alloc(ctx->ws, sizeof(*ov));
if (task->priv == NULL) {
......@@ -349,6 +359,234 @@ vmod_regex_match(VRT_CTX, struct vmod_re_regex *re, VCL_STRING subject,
> PCRE2_ERROR_NOMATCH);
}
/*
 * State carried across match_iter_f() invocations while a body is
 * matched segment by segment. Set up by vmod_regex_match_body().
 */
struct re_iter_priv {
unsigned magic;
#define RE_ITER_PRIV_MAGIC 0x04383ab8
/* PCRE2 match options handed to match() for the next segment */
uint32_t options;
/*
 * Result of the most recent match() call; initialized to
 * PCRE2_ERROR_NOMATCH before iteration starts.
 */
int s;
/* declares the ctx member accessed as reip->ctx */
VRT_CTX;
/* compiled expression handed to match() */
const vre_t *vre;
/* start offset handed to match(); set to ovector[0] on a first partial match */
PCRE2_SIZE startoffset;
/* per-task private whose ->priv holds the ov_t with the match offsets */
struct vmod_priv *task;
/* limits handed to match() */
const struct vre_limits *vre_limits;
/*
 * Begin and end of the segment data accumulated in the workspace
 * reservation after a partial match; both NULL while nothing is buffered.
 */
char *b, *e;
};
/*
 * Body iterator callback: match one body segment (ptr, len) against the
 * regex described by the struct re_iter_priv passed in priv.
 *
 * The state in reip->s drives the behaviour:
 *  - > PCRE2_ERROR_NOMATCH: a match was already found, the segment is ignored
 *  - PCRE2_ERROR_NOMATCH:   nothing buffered, the segment is matched in place
 *  - PCRE2_ERROR_PARTIAL:   the segment is appended to the data buffered in
 *                           the workspace reservation and the whole buffer
 *                           is matched again
 *
 * On a complete match, the matched text is (unless it can stay in place)
 * copied to the start of the workspace reservation, the reservation is
 * released with the match length and the ovector offsets are rebased to
 * the start of the match.
 *
 * Returns 0 when the segment was processed, 1 when match() reported an
 * error other than NOMATCH/PARTIAL, and -1 when the workspace reservation
 * is too small. (How the body iterator reacts to non-zero return values is
 * not visible here - presumably it stops iterating; confirm against the
 * objiterate_f contract.)
 */
static int v_matchproto_(objiterate_f)
match_iter_f(void *priv, unsigned flush, const void *ptr, ssize_t len)
{
struct re_iter_priv *reip;
VCL_STRING subject;
void *p;
ov_t *ov;
int i;
CAST_OBJ_NOTNULL(reip, priv, RE_ITER_PRIV_MAGIC);
/* flush is only used unconditionally further down; silences unused warnings */
(void) flush;
#ifdef ITERDBG
/*
 * NOTE(review): "%.*s" expects an int precision, but len is a ssize_t -
 * this debug-only format may need a cast.
 */
VSLb(reip->ctx->vsl, SLT_Debug, "flush=%u, s=%d, ptr=%.*s, len=%zd",
flush, reip->s, len, ptr, len);
#endif
/* already have a match ? */
if (reip->s > PCRE2_ERROR_NOMATCH)
return (0);
if (reip->s == PCRE2_ERROR_NOMATCH) {
/* nothing buffered: match directly on the segment handed in */
AZ(reip->startoffset);
AZ(reip->b);
AZ(reip->e);
subject = ptr;
}
else if (reip->s == PCRE2_ERROR_PARTIAL) {
/* previous segment(s) matched partially: append and re-match the buffer */
AN(reip->e);
assert(reip->e >= reip->b);
if ((reip->e - reip->b) + len >
WS_ReservationSize(reip->ctx->ws)) {
errmsg(reip->ctx, "vmod re: insufficient workspace "
"while iterating (append)");
return (-1);
}
memcpy(reip->e, ptr, len);
reip->e += len;
/* from here on, len is the length of the whole buffered subject */
len = (reip->e - reip->b);
subject = reip->b;
}
else {
WRONG("match error should have been latched "
"in previous iteration");
}
/* last segment: no more data will follow, so ask for a final verdict */
if (flush & OBJ_ITER_END)
reip->options &= ~PCRE2_PARTIAL_HARD;
reip->s = match(reip->ctx, reip->vre, subject, len,
reip->startoffset, reip->options, reip->task,
reip->vre_limits);
#ifdef ITERDBG
VSLb(reip->ctx->vsl, SLT_Debug, "match=%d, subject=%.*s, len=%zd",
reip->s, len, subject, len);
#endif
/* any later segment is not at the beginning of the body */
reip->options |= PCRE2_NOTBOL;
/* match error (anything below PARTIAL): leave it latched in reip->s */
if (reip->s < PCRE2_ERROR_PARTIAL)
return (1);
if (reip->s == PCRE2_ERROR_NOMATCH) {
/* forget any buffered data, the next segment starts afresh */
reip->startoffset = 0;
reip->b = 0;
reip->e = 0;
return (0);
}
CAST_OBJ_NOTNULL(ov, reip->task->priv, OV_MAGIC);
/* still partial and already buffering: the segment was appended above */
if (reip->s == PCRE2_ERROR_PARTIAL && reip->b != NULL)
return (0);
if (reip->s == PCRE2_ERROR_PARTIAL) {
/* first partial match: start buffering with a copy of this segment */
if (len > WS_ReservationSize(reip->ctx->ws)) {
errmsg(reip->ctx, "vmod re: insufficient workspace "
"for partial copy");
return (-1);
}
reip->b = WS_Reservation(reip->ctx->ws);
memcpy(reip->b, ptr, len);
reip->e = reip->b + len;
/* re-matching the buffer starts where the partial match began */
reip->startoffset = ov->ovector[0];
return (0);
}
assert(reip->s > PCRE2_ERROR_NOMATCH);
if (reip->b == NULL && (flush & OBJ_ITER_FLUSH) == 0) {
/* no need to copy */
WS_Release(reip->ctx->ws, 0);
return (0);
}
/* only the matched text, ovector[0]..ovector[1], is kept */
len = ov->ovector[1] - ov->ovector[0];
assert(len >= 0);
/*
 * NOTE(review): on this error path reip->s stays > PCRE2_ERROR_NOMATCH, so
 * vmod_regex_match_body() still returns true although ov->subject was not
 * repointed to a copy - confirm that errmsg() makes this harmless.
 */
if (reip->b == NULL && len > WS_ReservationSize(reip->ctx->ws)) {
errmsg(reip->ctx, "vmod re: insufficient workspace "
"for match copy");
WS_Release(reip->ctx->ws, 0);
return (-1);
}
ov->subject = p = WS_Reservation(reip->ctx->ws);
if (reip->b == NULL)
memcpy(p, subject + ov->ovector[0], len);
else
/* subject is the reservation itself here, source and target may overlap */
memmove(p, subject + ov->ovector[0], len);
WS_Release(reip->ctx->ws, len);
/* we have copied subject from start of match, fix all offsets */
len = ov->ovector[0];
for (i = 0; i < reip->s * 2; i++) {
assert(ov->ovector[i] >= len);
ov->ovector[i] -= len;
}
return (0);
}
/*
 * xregex.match_body(): match the regex against the body selected by
 * `which` (req_body, bereq_body or resp_body).
 *
 * The body is walked segment by segment with match_iter_f() while the
 * whole workspace is reserved as scratch space. The regex must have been
 * constructed with forbody=true, otherwise the VCL fails.
 *
 * limit / limit_recursion are handed to get_limits() together with the
 * regex object to obtain the limits used for matching.
 *
 * Returns 1 if the body matched, 0 on no match or on any error.
 */
VCL_BOOL
vmod_regex_match_body(VRT_CTX, struct vmod_re_regex *re, VCL_ENUM which,
    VCL_INT limit, VCL_INT limit_recursion)
{
	struct re_iter_priv iter[1];
	struct vre_limits limits_buf;
	struct vmod_priv *priv_task;
	uint32_t argopts;
	int matched;

	CHECK_OBJ_NOTNULL(ctx, VRT_CTX_MAGIC);
	CHECK_OBJ_NOTNULL(re, VMOD_RE_REGEX_MAGIC);
	AN(re->vre);

	/* the constructor marks body regexen via the PCRE2_PARTIAL_HARD bit */
	AZ(pcre2_pattern_info(VRE_unpack(re->vre), PCRE2_INFO_ARGOPTIONS,
	    &argopts));
	if (!(argopts & PCRE2_PARTIAL_HARD)) {
		VRT_fail(ctx, "vmod re: .match_body() requires "
		    "construction with forbody=true");
		return (0);
	}

	priv_task = VRT_priv_task(ctx, re);
	if (priv_task == NULL) {
		errmsg(ctx, "vmod re: no priv - out of workspace?");
		return (0);
	}

	/*
	 * init_task() allocates from ctx->ws, so it is run before the
	 * workspace gets reserved below (presumably a requirement of the
	 * workspace API - keep this order).
	 */
	init_task(ctx, priv_task);
	if (priv_task->priv == NULL)
		return (0);

	if (! WS_ReserveAll(ctx->ws)) {
		errmsg(ctx, "vmod re: no workspace reservation possible");
		return (0);
	}

	/* iteration state: nothing matched yet, partial matching enabled */
	INIT_OBJ(iter, RE_ITER_PRIV_MAGIC);
	iter->options = PCRE2_PARTIAL_HARD;
	iter->s = PCRE2_ERROR_NOMATCH;
	iter->ctx = ctx;
	iter->vre = re->vre;
	iter->task = priv_task;
	iter->vre_limits = get_limits(re, &limits_buf, limit, limit_recursion);

	/* iterate over whichever body was asked for, if it is available */
	if (which == VENUM(req_body)) {
		if (ctx->req != NULL) {
			(void) VRB_Iterate(ctx->req->wrk, ctx->vsl, ctx->req,
			    match_iter_f, iter);
		} else {
			errmsg(ctx, "vmod re: .match_body(which = req_body) "
			    "called but no request body found");
		}
	}
	else if (which == VENUM(bereq_body)) {
		if (ctx->bo != NULL && ctx->bo->req != NULL) {
			(void) VRB_Iterate(ctx->bo->wrk, ctx->vsl, ctx->bo->req,
			    match_iter_f, iter);
		} else {
			errmsg(ctx, "vmod re: .match_body(which = bereq_body) "
			    "called but no backend request body found");
		}
	}
	else if (which == VENUM(resp_body)) {
		if (ctx->req != NULL && ctx->req->objcore != NULL) {
			(void) ObjIterate(ctx->req->wrk, ctx->req->objcore,
			    iter, match_iter_f, 0);
		} else {
			errmsg(ctx, "vmod re: .match_body(which = resp_body) "
			    "called but no response body found");
		}
	}

	/*
	 * XXX core code does not send OBJ_ITER_END reliably: if the last
	 * segment left us with a partial match, force a final verdict with
	 * an empty end-of-body segment.
	 */
	if (iter->s == PCRE2_ERROR_PARTIAL)
		(void) match_iter_f(iter, OBJ_ITER_END, "", 0);
	assert(iter->s != PCRE2_ERROR_PARTIAL);

	matched = (iter->s > PCRE2_ERROR_NOMATCH);
	memset(iter, 0, sizeof *iter);

	/* on a match, match_iter_f() has already released the reservation */
	if (matched)
		return (1);

	/* error or no match: give the untouched reservation back */
	WS_Release(ctx->ws, 0);
	return (0);
}
VCL_STRING
vmod_regex_backref(VRT_CTX, struct vmod_re_regex *re, VCL_INT refnum,
VCL_STRING fallback)
......
......@@ -109,7 +109,7 @@ since it re-uses the compiled expression obtained at VCL
initialization. So if you are matching against a fixed pattern that
never changes during the lifetime of VCL, use ``match``.
$Object regex(STRING, INT limit=1000, INT limit_recursion=1000)
$Object regex(STRING, INT limit=1000, INT limit_recursion=1000, BOOL forbody=0)
Description
Create a regex object with the given regular expression. The
......@@ -124,6 +124,10 @@ Description
per-object defaults for the respective parameters of the
`xregex.match()`_ method.
The optional parameter ``forbody`` must be set to true if the
`xregex.match_body()`_ method is to be called on the
object.
Example
``new myregex = re.regex("\bmax-age\s*=\s*(\d+)");``
......@@ -150,6 +154,63 @@ Description
Example
``if (myregex.match(beresp.http.Surrogate-Control)) { # ...``
$Method BOOL .match_body(ENUM {req_body, bereq_body, resp_body } which,
INT limit=0, INT limit_recursion=0)
.. _multi segment matching: https://pcre.org/current/doc/html/pcre2partial.html#SEC4
Description
Like `xregex.match()`_, except that it operates on the named body.
For a regular expression to be used with this method, it needs
to be constructed with the ``forbody`` flag set in the
`re.regex()`_ constructor. Calling this method when the flag
was unset results in a VCL failure.
PCRE2 `multi segment matching`_ is used to implement this
method to reduce memory requirements. In particular, unlike
implementations in other vmods, this implementation does *not*
read the full body object into a contiguous memory region. It
might, however, require roughly as much workspace as the combined
size of all body segments spanned by the match.
Under ideal conditions, when the pattern spans only a single
segment of a cached object, the `xregex.match_body()`_ method
does not create copies of the body data.
When used with a ``req_body`` or ``bereq_body`` *which*
argument, this method consumes the request body. If it is to
be used again (for example, to send it to a backend), it
should first be cached by calling
``std.cache_req_body(<size>)``.
Example::
sub vcl_init {
new pattern = re.regex("(a|b)=([^&]*).*&(a|b)=([^&]*)",
forbody=true);
}
sub vcl_recv {
if (pattern.match_body(req_body)) {
return (synth(42200));
}
}
sub vcl_synth {
if (resp.status == 42200) {
set resp.http.n1 = pattern.backref(1, "");
set resp.http.v1 = pattern.backref(2, "");
set resp.http.n2 = pattern.backref(3, "");
set resp.http.v2 = pattern.backref(4, "");
set resp.body = "";
return (deliver);
}
}
# response contains first parameter named a or b from the body as n1,
# first value as v1, and the second parameter and value as n2
# and v2
$Method STRING .backref(INT, STRING fallback="**BACKREF METHOD FAILED**")
Description
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment