Add JSON string tools & UTF-8 decoder

parent f59e1985
......@@ -24,6 +24,7 @@ stamp-h1
# test suite
vsbjson-test
*.log
*.trs
......
[submodule "hoehrmann-utf8"]
path = hoehrmann-utf8
url = https://git.sr.ht/~slink/hoehrmann-utf8
hoehrmann-utf8 @ 690b8ebc
Subproject commit 690b8ebc02c5b50d5d73238b9dfb569b76afc9ba
......@@ -6,13 +6,29 @@ vmod_LTLIBRARIES = \
libvmod_j.la
libvmod_j_la_LDFLAGS = $(VMOD_LDFLAGS)
libvmod_j_la_SOURCES = vmod_j.c
libvmod_j_la_SOURCES = \
vsbjson.c \
vsbjson.h \
vmod_j.c
nodist_libvmod_j_la_SOURCES = \
vcc_j_if.c \
vcc_j_if.h
@BUILD_VMOD_J@
noinst_PROGRAMS = \
vsbjson-test
vsbjson_test_CFLAGS = $(AM_CFLAGS)
vsbjson_test_LDFLAGS = $(VARNISHAPI_LIBS)
vsbjson_test_SOURCES = \
vsbjson.c \
vsbjson.h \
vsbjson-test.c
# Test suite
AM_TESTS_ENVIRONMENT = \
......@@ -25,6 +41,7 @@ AM_VTC_LOG_FLAGS = \
-p vmod_path="$(abs_builddir)/.libs:$(vmoddir):$(VARNISHAPI_VMODDIR)"
TESTS = \
vsbjson-test \
vtc/vmod_j.vtc
# Documentation
......
......@@ -34,6 +34,7 @@
#include <cache/cache.h>
#include <vsb.h>
#include "vsbjson.h"
#include "vcc_j_if.h"
static int
......
/*-
* Copyright 2023 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Nils Goroll <nils.goroll@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#include <stdio.h>
#include <string.h>
#include <vdef.h>
#include <vas.h>
#include <vsb.h>
#include <assert.h>
#include "vsbjson.h"
/*
 * Assert that is_jquot() rejects the string s and reports the offending
 * byte at offset o from the start of s.
 *
 * Requires a `const char *err` variable in the calling scope.
 *
 * s is bound to a local exactly once: callers pass string literals, and
 * whether two identical string literals share storage is unspecified, so
 * comparing err against a second expansion of s would not be portable.
 */
#define notquot(s, o) do {						\
		const char *notquot_s_ = (s);				\
		assert(is_jquot(notquot_s_, &err) == 0);		\
		assert(err == notquot_s_ + (o));			\
	} while (0)
/*
 * Exercise is_jquot() with well-formed and malformed JSON string bodies.
 * For malformed input, the reported error position is checked as well.
 */
static void
t_is_jquot(void)
{
	const char *err = NULL;

	// well-formed: unescaped text, including multi-byte UTF-8
	assert(is_jquot("", NULL));
	assert(is_jquot("a", NULL));
	assert(is_jquot("a€", NULL));
	assert(is_jquot("🐰", NULL));

	// well-formed: two-character and \uXXXX escapes
	assert(is_jquot("xx\\\"", NULL));
	assert(is_jquot("xx\\ta", NULL));
	assert(is_jquot("x\\t\\ub33Fx\\u0000yy", NULL));
	assert(is_jquot("\\uAFFE\\u1234", NULL));
	assert(is_jquot("x\\t\\ub33F", NULL));
	assert(is_jquot("x\\t\\ub33Fx", NULL));

	// malformed: raw control characters
	notquot("\x01", 0);
	notquot("a\x1f", 1);

	// malformed: dangling backslash / unknown escape
	notquot("\\", 1);
	notquot("\\x", 1);

	// malformed: \u followed by fewer than four hex digits
	notquot("\\u", 2);
	notquot("\\us", 2);
	notquot("\\ua", 3);
	notquot("\\uab", 4);
	notquot("\\uabc", 5);
	notquot("\\u", 2);
}
/*
 * Unquote the JSON string body `from` into a 20 byte fixed buffer and
 * require the result to equal `want`.
 */
static void
junquot(const char *from, const char *want)
{
	struct vsb vsbs[1], *vsb;
	const char *have;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);

	AN(vsbjunquot(vsb, from, NULL));
	AZ(VSB_finish(vsb));

	have = VSB_data(vsb);
	if (strcmp(want, have) != 0) {
		printf("want: \"%s\" have: \"%s\"\n",
		    want, have);
		WRONG("unquote");
	}
	VSB_fini(vsb);
}
/*
 * Require that unquoting `from` fails because the 20 byte buffer
 * overflows: vsbjunquot() returns 0, VSB_finish() reports an error and
 * no parse error position is set.
 */
static void
junquotoverflow(const char *from)
{
	struct vsb vsbs[1], *vsb;
	const char *err = NULL;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);

	AZ(vsbjunquot(vsb, from, &err));
	AN(VSB_finish(vsb));
	VSB_fini(vsb);

	// an overflow is not a parse error
	AZ(err);
}
/*
 * Require that unquoting `from` fails with a parse error reported at
 * offset `o` from the start of `from`, and without a VSB error.
 */
static void
junquoterr(const char *from, unsigned o)
{
	struct vsb vsbs[1], *vsb;
	const char *want_err = from + o;
	const char *err = NULL;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);

	AZ(vsbjunquot(vsb, from, &err));
	AZ(VSB_finish(vsb));
	VSB_fini(vsb);

	assert(err == want_err);
}
/*
 * Encode the UTF-8 string `from` as ASCII JSON into a 20 byte fixed
 * buffer and require the result to equal `want`. Then check the reverse
 * direction: unquoting `want` must give back `from`.
 */
static void
jascii(const char *from, const char *want)
{
	struct vsb vsbs[1], *vsb;
	const char *have;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);

	AN(vsbjascii(vsb, from, NULL));
	AZ(VSB_finish(vsb));

	have = VSB_data(vsb);
	if (strcmp(want, have) != 0) {
		printf("want: \"%s\" have: \"%s\"\n",
		    want, have);
		WRONG("expectation");
	}
	VSB_fini(vsb);

	// round trip
	junquot(want, from);
}
/*
 * Require that vsbjascii() rejects `p` as invalid UTF-8, reporting the
 * error at offset `off` from the start of `p`, without a VSB error.
 */
static void
notutf(const char *p, unsigned off)
{
	struct vsb *vsb, vsbs[1];
	/*
	 * Initialized like in the sibling helpers: vsbjascii() only sets
	 * *err for UTF errors, so the comparison below must never read an
	 * indeterminate pointer.
	 */
	const char *err = NULL;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);
	AZ(vsbjascii(vsb, p, &err));
	AZ(VSB_finish(vsb));
	assert(err == p + off);
	VSB_fini(vsb);
}
/*
 * Require that encoding `p` as ASCII JSON fails because the 20 byte
 * buffer overflows: vsbjascii() returns 0 without setting an error
 * position and VSB_finish() reports an error.
 */
static void
jasciioverflow(const char *p)
{
	struct vsb vsbs[1], *vsb;
	const char *err = NULL;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);

	AZ(vsbjascii(vsb, p, &err));
	// an overflow is not a UTF error
	AZ(err);

	AN(VSB_finish(vsb));
	VSB_fini(vsb);
}
/*
 * Exercise vsbjascii(): escapes below 0x80, \uXXXX encoding of multi-byte
 * UTF-8 around the surrogate range, invalid UTF-8 and buffer overflow.
 */
static void
t_vsbjascii(void)
{
/* UTF-8 byte sequences of code points next to the surrogate range */
#define U0800 "\xe0\xa0\x80"	// U+0800, first three-byte code point
#define UD7FF "\xed\x9f\xbf"	// U+D7FF, last one below the surrogates
#define UE000 "\xee\x80\x80"	// U+E000, first one above the surrogates

	// plain ASCII and two-character escapes
	jascii("", "");
	jascii("abc", "abc");
	jascii("\"", "\\\"");
	jascii("a\"b", "a\\\"b");
	jascii("\t", "\\t");
	jascii("\tbcde", "\\tbcde");

	// control characters without a short escape
	jascii("\x01", "\\u0001");
	jascii("xx\x01", "xx\\u0001");

	// basic multilingual plane and surrogate pairs
	jascii("xx€Z", "xx\\u20acZ");
	jascii("Z🐰", "Z\\ud83d\\udc30");

	// boundaries of the surrogate range: alone, as suffix, as prefix
	jascii(U0800, "\\u0800");
	jascii(UD7FF, "\\ud7ff");
	jascii(UE000, "\\ue000");

	jascii("ab" U0800, "ab\\u0800");
	jascii("ab" UD7FF, "ab\\ud7ff");
	jascii("ab" UE000, "ab\\ue000");

	jascii(U0800 "cd", "\\u0800cd");
	jascii(UD7FF "cd", "\\ud7ffcd");
	jascii(UE000 "cd", "\\ue000cd");

	// highest code point 0x10ffff
	jascii("\xf4\x8f\xbf\xbf","\\udbff\\udfff");

	// stray continuation byte
	notutf("\x81", 0);
	notutf("x\x81", 1);

	// exactly fills the buffer ...
	jascii("🐰€ ", "\\ud83d\\udc30\\u20ac ");
	// ... and one more byte of output does not fit
	jasciioverflow("🐰€  ");
	jasciioverflow("€  🐰");
	jasciioverflow("🐰  €");
	jasciioverflow("🐰€\t");

#undef U0800
#undef UD7FF
#undef UE000
}
/*
 * Apply minimal JSON quoting to `from` into a 20 byte fixed buffer and
 * require the result to equal `want`. Then check the reverse direction:
 * unquoting `want` must give back `from`.
 */
static void
jminimal(const char *from, const char *want)
{
	struct vsb vsbs[1], *vsb;
	const char *have;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);

	AN(vsbjminimal(vsb, from));
	AZ(VSB_finish(vsb));

	have = VSB_data(vsb);
	if (strcmp(want, have) != 0) {
		printf("want: \"%s\" have: \"%s\"\n",
		    want, have);
		WRONG("expectation");
	}
	VSB_fini(vsb);

	// round trip
	junquot(want, from);
}
/*
 * Require that minimal quoting of `p` fails because the 20 byte buffer
 * overflows: vsbjminimal() returns 0 and VSB_finish() reports an error.
 */
static void
jminimaloverflow(const char *p)
{
	struct vsb vsbs[1], *vsb;
	char buf[20];

	vsb = VSB_init(vsbs, buf, sizeof buf);
	assert(vsb != NULL);

	AZ(vsbjminimal(vsb, p));
	AN(VSB_finish(vsb));

	VSB_fini(vsb);
}
/*
 * Exercise vsbjminimal(): only characters which require quoting are
 * escaped, multi-byte UTF-8 is passed through, overflow is detected.
 */
static void
t_vsbjminimal(void)
{
#define R4 "🐰🐰🐰🐰"	// 16 bytes of UTF-8

	// plain ASCII and two-character escapes
	jminimal("", "");
	jminimal("abc", "abc");
	jminimal("\"", "\\\"");
	jminimal("a\"b", "a\\\"b");
	jminimal("\t", "\\t");
	jminimal("\tbcde", "\\tbcde");

	// control characters without a short escape
	jminimal("\x01", "\\u0001");
	jminimal("xx\x01", "xx\\u0001");

	// multi-byte UTF-8 stays as is
	jminimal("xx€\\€Z", "xx€\\\\€Z");
	jminimal("Z🐰", "Z🐰");

	// exactly fills the buffer ...
	jminimal(R4 "€", R4 "€");
	// ... and anything more does not fit
	jminimaloverflow(R4 "€ ");
	jminimaloverflow(R4 "€\t");
	jminimaloverflow(R4 "🐰");

#undef R4
}
/*
 * Exercise vsbjunquot(): escape decoding, error positions for truncated
 * or invalid \u escapes and surrogate pairs, and buffer overflow.
 */
static void
t_vsbjunquot(void)
{
/* UTF-8 byte sequences expected for \u0800 and \ue000 */
#define U0800 "\xe0\xa0\x80"
#define UE000 "\xee\x80\x80"
#define B "\\ud83d\\udc30" // 🐰
#define E "\\u20ac" // €

	// successful decoding
	junquot("", "");
	junquot("\\t€", "\t€");
	junquot("\\u0022", "\"");
	junquot("\\u0800X", U0800 "X");
	junquot("\\ue000X", UE000 "X");

	// unknown escape and truncated \u
	junquoterr("\\x", 1);
	junquoterr("\\u", 2);
	junquoterr("\\u1", 3);
	junquoterr("\\u12", 4);
	junquoterr("\\u123", 5);

	// high surrogate without a valid low surrogate following
	junquoterr("\\ud801", 6);
	junquoterr("\\ud801\\", 7);
	junquoterr("\\ud801\\u", 8);
	junquoterr("\\ud801\\u1", 9);
	junquoterr("\\ud801\\u12", 10);
	junquoterr("\\ud801\\u123", 11);
	junquoterr("\\ud801\\u1234", 12);

	// low surrogate
	junquoterr("\\udc00", 6);

	// exactly fills the buffer ...
	junquot(B B B B E, "🐰🐰🐰🐰€");
	// ... and anything more does not fit
	junquotoverflow(B B B B E " ");
	junquotoverflow(B B B B "    ");
	junquotoverflow(B B B B E "\\t");
	junquotoverflow(B B B B E "\\u0020");
	junquotoverflow(B B B B E "\\u07ff");
	junquotoverflow(" " B B B B E);
	junquotoverflow(" " B B B E B);

#undef B
#undef E
#undef U0800
#undef UE000
}
int
main(void)
{
t_is_jquot();
t_vsbjascii();
t_vsbjminimal();
t_vsbjunquot();
return (0);
}
/*-
* Copyright 2023 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Nils Goroll <nils.goroll@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#include <string.h>
#include <stddef.h>
#include <sys/types.h>
#include <vdef.h>
#include <vas.h>
#include "vsbjson.h"
#include "hoehrmann-utf8/src/hoehrmann-utf8.h"
/* add json string
*
* XXX various UTF-8 cases still need handling
* - overlong
* - use of surrogate
* ...
*
* quotes:
* " \ / \b \f \n \r \t
* 22 5c 2f
*/
/* <0x80 values needing quoting */
/*
 * Quoting table, indexed by the raw byte: a non-zero entry is the
 * character to write after a backslash for the two-character escape of
 * that byte.
 *
 * '/' may be escaped in JSON, but does not have to be, so it has no entry.
 *
 * Both tables are constant data set up with designated initializers, so
 * they need no load-time initialization and cannot be modified.
 */
static const uint8_t jquot[256] = {
	['"']	= '"',
	['\\']	= '\\',
	['\b']	= 'b',
	['\f']	= 'f',
	['\n']	= 'n',
	['\r']	= 'r',
	['\t']	= 't',
};

/*
 * Unquoting table, indexed by the character following a backslash: a
 * non-zero entry is the byte which the two-character escape stands for.
 */
static const uint8_t junquot[256] = {
	['"']	= '"',
	['\\']	= '\\',
	['/']	= '/',
	['b']	= '\b',
	['f']	= '\f',
	['n']	= '\n',
	['r']	= '\r',
	['t']	= '\t',
};
/*
 * Record the error position p in *err, unless the caller passed no err
 * pointer. Always returns 0, so that it can be used directly as the
 * failure return value.
 */
static inline int
jerr(const uint8_t *p, const char **err)
{
	if (err != NULL)
		*err = (const char *)p;

	return (0);
}
/* Result of unhex16(): the parsed value and the number of digits used */
struct unhex16_r {
	uint32_t	value;
	unsigned	len;
};

/*
 * Parse up to four hexadecimal digits (either case) starting at p.
 *
 * Parsing stops at the first byte which is not a hex digit, which
 * includes a terminating NUL. len is the number of digits parsed; the
 * first digit always goes to bits 12..15 of value, so value is only the
 * complete 16 bit number if len is 4.
 */
static inline struct unhex16_r
unhex16(const uint8_t *p)
{
	struct unhex16_r res = { 0, 0 };
	uint8_t c, nibble;

	while (res.len < 4) {
		c = p[res.len];
		if (c >= '0' && c <= '9')
			nibble = c - '0';
		else if (c >= 'a' && c <= 'f')
			nibble = c - 'a' + 0xa;
		else if (c >= 'A' && c <= 'F')
			nibble = c - 'A' + 0xa;
		else
			break;	// not a hex digit
		res.value |= (uint32_t)nibble << (12 - 4 * res.len);
		res.len++;
	}
	return (res);
}
/*
 * JSON to utf-8
 *
 * Append the unquoted form of the JSON string body pa to vsb. Bytes
 * outside escapes are copied verbatim, two-character escapes are looked up
 * in junquot and \uXXXX escapes, including surrogate pairs, are written
 * as UTF-8.
 *
 * Returns 1 on success and 0 on failure. For a malformed escape, *err is
 * set (if err is not NULL) to the position where parsing stopped. If
 * writing to the vsb fails, *err is left untouched.
 */
int
vsbjunquot(struct vsb *vsb, const char *pa, const char **err)
{
	const uint32_t lo10 = (1 << 10) - 1;	// payload of a surrogate
	const uint32_t lo6 = (1 << 6) - 1;	// payload of a continuation
	const uint8_t *p = (const uint8_t *)pa;
	const uint8_t *run;
	struct unhex16_r hex;
	uint32_t cp;
	uint8_t enc[4];

	for (;;) {
		// copy everything up to the next backslash
		run = p;
		while (*p != '\0' && *p != '\\')
			p++;
		if (p != run && VSB_bcat(vsb, run, p - run))
			return (0);
		if (*p == '\0')
			return (1);

		assert(*p == '\\');
		p++;

		// two-character escape
		if (junquot[*p] != 0) {
			if (VSB_putc(vsb, junquot[*p]))
				return (0);
			p++;
			continue;
		}

		// anything else needs to be \uXXXX
		if (*p != 'u')
			return (jerr(p, err));
		p++;
		hex = unhex16(p);
		p += hex.len;
		if (hex.len != 4)
			return (jerr(p, err));
		cp = hex.value;

		if (cp >= 0xd800 && cp < 0xdc00) {
			// high surrogate: a low surrogate escape must follow
			if (*p != '\\')
				return (jerr(p, err));
			p++;
			if (*p != 'u')
				return (jerr(p, err));
			p++;
			hex = unhex16(p);
			p += hex.len;
			if (hex.len != 4 ||
			    hex.value < 0xdc00 || hex.value > 0xdfff)
				return (jerr(p, err));
			cp = 0x10000 +
			    (((cp & lo10) << 10) | (hex.value & lo10));
		}

		if (cp < 0x80) {
			if (VSB_putc(vsb, (uint8_t)cp))
				return (0);
			continue;
		}

		// only a lone low surrogate can get here
		if (cp >= 0xd800 && cp <= 0xdfff)
			return (jerr(p, err));

		if (cp < 0x800) {
			enc[0] = 0xc0 | (cp >> 6);
			enc[1] = 0x80 | (cp & lo6);
			if (VSB_bcat(vsb, enc, 2))
				return (0);
		} else if (cp < 0x10000) {
			enc[0] = 0xe0 | (cp >> 12);
			enc[1] = 0x80 | ((cp >> 6) & lo6);
			enc[2] = 0x80 | (cp & lo6);
			if (VSB_bcat(vsb, enc, 3))
				return (0);
		} else if (cp < 0x110000) {
			enc[0] = 0xf0 | (cp >> 18);
			enc[1] = 0x80 | ((cp >> 12) & lo6);
			enc[2] = 0x80 | ((cp >> 6) & lo6);
			enc[3] = 0x80 | (cp & lo6);
			if (VSB_bcat(vsb, enc, 4))
				return (0);
		} else
			WRONG("impossible high code point");
	}
}
/*
 * json-encode characters below 0x80
 *
 * A byte with an entry in jquot is written as a backslash followed by that
 * entry, any other byte below 0x20 as \u00XX.
 *
 * Returns 1 if an escape for c was written, 0 if c does not need quoting
 * here (nothing is written in this case). Return values of the VSB calls
 * are ignored on purpose, the caller is expected to check VSB_error().
 */
static inline int
vsbjx80(struct vsb *vsb, const uint8_t c)
{
	uint8_t esc;

	esc = (c <= 0x7e) ? jquot[c] : 0;
	if (esc != 0) {
		(void)VSB_putc(vsb, '\\');
		(void)VSB_putc(vsb, esc);
	} else if (c < 0x20)
		(void)VSB_printf(vsb, "\\u%04x", c);
	else
		return (0);

	return (1);
}
/* utf-8 to 7-bit / ASCII JSON
 *
 * Bytes from 0x20 to 0x7f without a jquot entry are copied verbatim,
 * other bytes below 0x80 are escaped by vsbjx80() and multi-byte UTF-8
 * sequences are written as \uXXXX, using a surrogate pair for code points
 * from 0x10000.
 *
 * returns 0 for VSB or UTF error
 * for UTF error, *err gets set (if err is not NULL) to the rejected byte,
 * or to the terminating NUL if the input ends within a multi-byte
 * sequence. for VSB error, *err is left untouched
 */
int
vsbjascii(struct vsb *vsb, const char *pa, const char **err)
{
	const uint8_t *p = (const uint8_t *)pa;
	const uint8_t *s;
	uint32_t state, code;

	while (*p) {
		s = p;
		/*
		 * The whole 7-bit range is covered here: 0x7f (DEL) needs no
		 * quoting and is not handled by vsbjx80(), so it must not
		 * end this run.
		 */
		while (*p < 0x80 && *p >= 0x20 && jquot[*p] == 0)
			p++;
		if (s != p && VSB_bcat(vsb, s, p - s))
			return (0);
		if (! *p)
			break;
		if (vsbjx80(vsb, *p)) {
			// no error checking in vsbjx80
			if (VSB_error(vsb))
				return (0);
			p++;
			continue;
		}
		assert(*p >= 0x80);

		// decode exactly one multi-byte sequence
		state = UTF8_ACCEPT;
		while (*p) {
			switch (utf8_decode(&state, &code, *p++)) {
			case UTF8_REJECT:
				// p has already been advanced past the bad byte
				return (jerr(p - 1, err));
			case UTF8_ACCEPT:
				/* we do not decode for < 0x80 and
				 * overlong codes are rejected
				 */
				assert(code >= 0x80);
				assert(code < 0x110000);
				if (code < 0x10000) {
					if (VSB_printf(vsb, "\\u%04x", code))
						return (0);
					break;
				}
				code -= 0x10000;
				if (VSB_printf(vsb, "\\u%04x\\u%04x",
				    0xd800 | (code >> 10),
				    0xdc00 | (code & ((1<<10) - 1))))
					return (0);
				break;
			default:
				// sequence not complete yet
				continue;
			}
			break;
		}
		/*
		 * The loop above also ends when the string does: if the
		 * decoder is still within a sequence, the input is truncated
		 * and must not be reported as success.
		 */
		if (state != UTF8_ACCEPT)
			return (jerr(p, err));
	}
	return (1);
}
/*
 * minimal JSON quoting
 *
 * Append pa to vsb, escaping only bytes below 0x20 and bytes with a
 * jquot entry; all other bytes, including everything above 0x7e, are
 * copied verbatim without UTF-8 validation.
 *
 * Returns 1 on success and 0 if writing to the vsb failed.
 */
int
vsbjminimal(struct vsb *vsb, const char *pa)
{
	const uint8_t *p = (const uint8_t *)pa;
	const uint8_t *run;

	for (;;) {
		// longest run which needs no quoting (ends at NUL at latest)
		run = p;
		while (! (*p <= 0x7e && (*p < 0x20 || jquot[*p] != 0)))
			p++;
		if (p != run && VSB_bcat(vsb, run, p - run))
			return (0);
		if (*p == '\0')
			return (1);

		// the run ended at a byte which vsbjx80() must escape
		AN(vsbjx80(vsb, *p));
		if (VSB_error(vsb))
			return (0);
		p++;
	}
}
/*
 * Check if pa is a well-formed JSON string body, that is, the part
 * between the enclosing double quotes.
 *
 * Rejected are bytes below 0x20, an unescaped double quote, a backslash
 * not followed by one of " \ / b f n r t u, and \u not followed by four
 * hex digits. Bytes from 0x80 are accepted without UTF-8 validation.
 *
 * Returns 1 if well-formed. Otherwise returns 0 and, if err is not NULL,
 * sets *err to the offending byte, which can be the terminating NUL.
 */
int
is_jquot(const char *pa, const char **err)
{
	const uint8_t *p = (const uint8_t *)pa;
	unsigned u;

	for (; *p; p++) {
		if (*p < 0x20 || *p == '"')
			return (jerr(p, err));
		if (*p != '\\')
			continue;
		p++;
		switch (*p) {
		case '"':
		case '\\':
		case '/':
		case 'b':
		case 'f':
		case 'n':
		case 'r':
		case 't':
			// continues the for loop, which steps over the escape
			continue;
		case 'u':
			break;
		default:
			// includes a NUL directly after the backslash
			return (jerr(p, err));
		}
		assert(*p == 'u');
		// p ends up on the last hex digit, the for loop steps over it
		for (u = 0; u < 4; u++) {
			p++;
			if (! ((*p >= '0' && *p <= '9') ||
			    (*p >= 'a' && *p <= 'f') ||
			    (*p >= 'A' && *p <= 'F')))
				return (jerr(p, err));
		}
	}
	return (1);
}
/*-
* Copyright 2023 UPLEX Nils Goroll Systemoptimierung
* All rights reserved
*
* Author: Nils Goroll <nils.goroll@uplex.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.