Fix url parsing for zipflow.subreqs_from_body()

05699e62 · Nils Goroll · 3d299460 · 05699e62 · 05699e62 · 05699e62
Unverified Commit 05699e62 authored Sep 09, 2023 by Nils Goroll
Hide whitespace changes
Inline Side-by-side

Showing with 141 additions and 43 deletions

vmod_zipflow.vcc src/vmod_zipflow.vcc +10 -1

sub-body.vtc src/vtc/sub-body.vtc +24 -0

zfr_iter.c src/zfr_iter.c +107 -42

No files found.
--- a/src/vmod_zipflow.vcc
+++ b/src/vmod_zipflow.vcc
@@ -70,13 +70,22 @@ separated by any whitespace (``\\r\\n\\t\\s``)
 * ``http://``\ *host*\ *url*
 * ``https://``\ *host*\ *url*
 * ``//``\ *host*\ *url*
-* ``url``
+* *url*

 with *host* containing any non-whitespace character except for ``/``
 and *url* starting with ``/`` and run a sub request for each token as
 if ``subreq(``\ *url*\ ``, ``\ *host*\ ``)`` was invoked, but not
 using any workspace memory.

+.. _RFC 3986: https://www.ietf.org/rfc/rfc3986.txt
+
+*host* may not be empty. The syntactic ambiguity prevents *urls*
+starting with ``///``.
+
+(Note: *host* and *url* are used in varnish-cache terminology. In `RFC
+3986`_ parlance, *host* is called authority and *url* is a path. A
+reference starting with ``//`` is called a network-path reference.)
+
 This function can only be called from the top level, that is, not from
 a sub request.


--- a/src/vtc/sub-body.vtc
+++ b/src/vtc/sub-body.vtc
@@ -43,10 +43,32 @@ varnish v1 -vcl {
 	}
 } -start

+logexpect l1 -v v1 -g request -q "ReqURL ~ \"/REQ/first/file\"" {
+	fail add *	ReqURL	"valid"
+	expect	 * *	ReqURL	"/fromvcl"
+	expect	 * *	ReqURL	"/FIRST/FROM/RESP/file"
+	expect	 * *	ReqURL	"/file1"
+	expect	 * *	ReqURL	"/file3"
+	expect	 * *	ReqURL	"/file4"
+	expect	 * *	ReqURL	"/REQ/first/file"
+	expect	 * *	ReqURL	"/path/file1"
+	expect	 * *	ReqURL	"/file3"
+	expect	 * *	ReqURL	"/file4"
+	expect	 * *	ReqURL	"/another/file2"
+	expect	 * *	ReqURL	"/file3"
+	expect	 * *	ReqURL	"/file4"
+	fail clear
+} -start
+
 client c1 {
 	txreq -body {
 	/REQ/first/file
 	http://thishost/path/file1
+
+	in valid
+	https://invalid
+	// https://
+
 	https://thishost/another/file2
 	//thishost/file3
 	}
@@ -58,5 +80,7 @@ client c1 {
 	expect resp.status == 500
 } -run

+logexpect l1 -wait
+
 # all default
 shell "curl --data-raw \"/REQ/CURL/first http://thishost/path/file1 https://thishost/another/file2 //thishost/xxx/file3\" -so t.zip -H 'Host: ${v1_addr}' http://${v1_addr}:${v1_port}/ && unzip -Z t.zip"
--- a/src/zfr_iter.c
+++ b/src/zfr_iter.c
@@ -56,6 +56,23 @@ pincr(const char **p, size_t l)
 	    pincr(&p, strlen(s))					\
 	    )

+#ifdef TEST_DRIVER
+static const char *DBG_begin;
+#define DBG_BEGIN(p) DBG_begin = (p)
+
+#define DBG_INVALID(p, why) do {					\
+		printf("invalid " why " >%.*s<\n",			\
+		    (int)pdiff(DBG_begin, p), DBG_begin);		\
+	} while (0)
+
+#define DBG_VALID(p) \
+	printf("valid >%.*s<\n", (int)pdiff(DBG_begin, p), DBG_begin)
+#else
+#define DBG_BEGIN(p)		(void)0
+#define DBG_INVALID(p, why)	(void)0
+#define DBG_VALID(p)		(void)0
+#endif
+
 int
 zfr_iter(void *priv, unsigned flush, const void *ptr, ssize_t alen)
 {
@@ -106,44 +123,55 @@ zfr_iter(void *priv, unsigned flush, const void *ptr, ssize_t alen)
 			p++;
 			continue;
 		}
-		if (tok(p, e, "https://") ||
-		    tok(p, e, "http://")) {
-			p -= 2;
-			// no need to keep this prefix in the buffer
-			pp = p;
-		}
+		DBG_BEGIN(p);
 		h = NULL;
-		if (tok(p, e, "//")) {
-			h = u = p;
-			p -= 2;
-			while (u < e && *u != '/')
-				u++;
-		} else
-			u = p;
+		if (tok(p, e, "//") ||
+		    tok(p, e, "http://") ||
+		    tok(p, e, "https://")) {
+			// sufficient to keep "//" in buffer
+			pp = p - 2;
+			h = p;
+			while (p < e && !vct_islws(*p) &&  *p != '/')
+				p++;
+		}
+		u = p;

+		// skip over whatever non-whitespace
 		while (p < e && !vct_islws(*p))
 			p++;

-		if (*u != '/')
+		// continue in next chunk
+		if (p == e && (! (flush & OBJ_ITER_END)))
 			break;

-		if (p < e || flush & OBJ_ITER_END) {
-			/* match! */
-			hh = NULL;
-			if (h) {
-				assert(u > h);
-				hh = strndup(h, pdiff(h, u));
-				AN(hh);
-			}
-			assert(p > u);
-			uu = strndup(u, pdiff(u, p));
-			AN(uu);
-			AN(zis->func);
-			zis->func(zis->priv, uu, hh);
-			free(hh);
-			free(uu);
-			pp = p;
+		// token complete, checkpoint
+		pp = p;
+
+		if (*u != '/') {
+			DBG_INVALID(p, "non-url");
+			continue;
+		}
+
+		if (h != NULL && h == u) {
+			DBG_INVALID(p, "no-host");
+			continue;
 		}
+
+		/* match! */
+		DBG_VALID(p);
+		hh = NULL;
+		if (h) {
+			assert(u > h);
+			hh = strndup(h, pdiff(h, u));
+			AN(hh);
+		}
+		assert(p > u);
+		uu = strndup(u, pdiff(u, p));
+		AN(uu);
+		AN(zis->func);
+		zis->func(zis->priv, uu, hh);
+		free(hh);
+		free(uu);
 	}

 	assert(e >= pp);
@@ -182,7 +210,7 @@ struct expect {
 	const char *u, *h, **pfx;
 };

-const char *pfx_host[7] = {
+static const char *pfx_host[7] = {
 	"https://",
 	"http://",
 	"//",
@@ -192,15 +220,33 @@ const char *pfx_host[7] = {
 	NULL
 };

-const char *pfx_nohost[2] = {
+static const char *pfx_nohost[2] = {
 	"",
 	NULL
 };

-static struct expect testcase[7] = {
+// incomplete tokens to skip
+static const char *invalid[10] = {
+	// lone prefixes
+	"https://",
+	"http:///",
+	"//",
+	"///",
+	"///a",
+	"//a",
+	// partials
+	"http",
+	"invalid",
+	"a",
+	NULL
+};
+
+static struct expect testcase[9] = {
 	{ .u = "/url", .h = "host", .pfx = pfx_host },
 	{ .u = "/",    .h = "host", .pfx = pfx_host },
 	{ .u = "///",  .h = "ho-t", .pfx = pfx_host },
+	{ .u = "/",    .h = "h",    .pfx = pfx_host },
+	{ .u = "///",  .h = "h",    .pfx = pfx_host },
 	{ .u = "/url", .h = NULL,   .pfx = pfx_nohost },
 	{ .u = "/",    .h = NULL,   .pfx = pfx_nohost },
 	{ .u = "/a",   .h = NULL,   .pfx = pfx_nohost },
@@ -231,13 +277,17 @@ cb_want(void *priv, const char *u, const char *h)

 	if (e->u == NULL)
 		AZ(u);
-	else
+	else {
+		AN(u);
 		AZ(strcmp(e->u, u));
+	}

 	if (e->h == NULL)
 		AZ(h);
-	else
+	else {
+		AN(h);
 		AZ(strcmp(e->h, h));
+	}

 	p->count++;
 }
@@ -274,7 +324,7 @@ int
 main(void) {
 	const struct expect *want;
 	struct zfr_iter_s zis[1];
-	const char **p, *t;
+	const char **p, *t, **inv = invalid;
 	char s[256], u[64];
 	unsigned n;
 	size_t l;
@@ -284,7 +334,6 @@ main(void) {
 	t = s;
 	assert(tok(t, strchr(s, '\0'), "https://"));

-
 	for (want = testcase; want->u != NULL; want++) {
 		INIT_OBJ(zis, ZFR_ITER_MAGIC);
 		zis->priv = NULL;
@@ -293,16 +342,22 @@ main(void) {

 		n = 0;
 		*s = '\0';
-		for (p = want->pfx; *p != NULL; p++) {
+		for (p = want->pfx; *p != NULL; p++, inv++) {
+			AN(inv);
+			if (*inv == NULL)
+				inv = invalid;
 			AN(p);
 			AN(*p);
-			if (**p == '\0')
-				l = snprintf(u, sizeof u, "\t%s", want->u);
+			if (**p == '\0') {
+				l = snprintf(u, sizeof u, " %s\t%s",
+				    *inv, want->u);
+			}
 			else {
-				l = snprintf(u, sizeof u, "\t%s%s%s",
-				    *p, want->h, want->u);
+				l = snprintf(u, sizeof u, " %s\t%s%s%s",
+				    *inv, *p, want->h, want->u);
 			}
 			assert(l < sizeof u);
+			inv++;

 			if (sizeof s - strlen(s) - 1 < l)
 				break;
@@ -310,6 +365,16 @@ main(void) {
 			n++;

 			t_steps(want, n, s);
+
+			if (*inv == NULL)
+				continue;
+
+			if (sizeof s - strlen(s) - 2 < strlen(*inv))
+				break;
+			(void) strcat(s, "\r");
+			(void) strcat(s, *inv);
+			t_steps(want, n, s);
+			inv++;
 		}
 	}