chroma/include/vendor/tomlc17.c
NotAShelf c84819b3e8
build: add tomlc17 dep
Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I7aa52879362f01cc2e61fe391f6ff4576a6a6964
2026-05-01 13:10:25 +03:00

2915 lines
74 KiB
C
Vendored
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/* Copyright (c) 2024-2026, CK Tan.
* https://github.com/cktan/tomlc17/blob/main/LICENSE
*/
#include "tomlc17.h"
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// All-zero datum. Used in this file to reset datums and returned by the
// lookup functions (toml_get, toml_seek) when nothing is found.
const toml_datum_t DATUM_ZERO = {0};
// Library-wide options currently in effect. The initializer is the same one
// used by toml_default_option(); toml_set_option() replaces it wholesale.
static toml_option_t toml_option = {0, realloc, free};
// Allocation wrappers: every heap request in this file goes through the
// allocator callbacks stored in toml_option.
#define MALLOC(n) toml_option.mem_realloc(0, n)
#define REALLOC(p, n) toml_option.mem_realloc(p, n)
#define FREE(p) toml_option.mem_free(p)
// DO(x): evaluate x and return -1 from the enclosing function if it is
// non-zero. The trailing "else (void)0" forces a ';' after the macro and
// keeps it safe inside an unbraced if/else.
#define DO(x) \
if (x) \
return -1; \
else \
(void)0
// Copy the NUL-terminated string src into dst, where dst can hold dstsz
// bytes including the terminating NUL.
// Return 0 on success, -1 if src (plus its NUL) does not fit or dstsz is
// not positive. On failure dst is left untouched.
static inline int copystring(char *dst, int dstsz, const char *src) {
  // Keep the length in size_t: storing strlen() in an int could truncate or
  // overflow for very long inputs and defeat the bounds check below.
  size_t srcsz = strlen(src) + 1;
  if (dstsz <= 0 || srcsz > (size_t)dstsz) {
    return -1;
  }
  memcpy(dst, src, srcsz);
  return 0;
}
/*
* Error buffer: a pointer/length pair describing the character buffer that
* error messages are formatted into (see SETERROR).
*/
typedef struct ebuf_t ebuf_t;
struct ebuf_t {
char *ptr; // start of the message buffer
int len; // capacity of the buffer in bytes
};
/*
 * Format an error message into ebuf[]. Always return -1 so callers can
 * write "return SETERROR(...)".
 *
 * If lineno is non-zero the message is prefixed with "(line N) ". The
 * message is truncated to fit ebuf. When ebuf has no storage (NULL pointer
 * or non-positive length) nothing is written.
 */
static int SETERROR(ebuf_t ebuf, int lineno, const char *fmt, ...) {
  // Without storage there is nothing to format into. Bail out before any
  // pointer arithmetic or strlen() on memory that was never written.
  if (!ebuf.ptr || ebuf.len <= 0) {
    return -1;
  }
  va_list args;
  va_start(args, fmt);
  char *p = ebuf.ptr;
  char *q = p + ebuf.len;
  if (lineno) {
    snprintf(p, (size_t)(q - p), "(line %d) ", lineno);
    // snprintf always NUL-terminates inside [p, q), so p stays below q.
    p += strlen(p);
  }
  vsnprintf(p, (size_t)(q - p), fmt, args);
  va_end(args);
  return -1;
}
/*
* Memory pool. One big block is allocated once and handed out piecemeal
* by pool_alloc(). Individual allocations are never returned; the whole
* pool is released at once by pool_destroy().
*/
typedef struct pool_t pool_t;
struct pool_t {
int max; // size of buf[]
int top; // offset of first free byte in buf[]
char buf[1]; // first byte starts here; pool_create() allocates max bytes
};
/**
 * Create a zero-filled memory pool with room for N bytes. A non-positive N
 * is replaced by a small minimum size.
 *
 * Return the memory pool on success, or NULL if out of memory.
 */
static pool_t *pool_create(int N) {
  if (N <= 0) {
    N = 100; // minimum
  }
  // Compute the allocation size in size_t. Adding the header size to a large
  // N in an int could overflow and pass a bogus length to memset().
  size_t totalsz = sizeof(pool_t) + (size_t)N;
  pool_t *pool = MALLOC(totalsz);
  if (!pool) {
    return NULL;
  }
  memset(pool, 0, totalsz);
  pool->max = N;
  return pool;
}
/**
 * Release a memory pool together with everything handed out from it.
 */
static void pool_destroy(pool_t *pool) {
  FREE(pool);
}
/**
 * Allocate n bytes from pool. Return the memory allocated on
 * success, or NULL if the pool does not have n free bytes left or n is
 * negative.
 */
static char *pool_alloc(pool_t *pool, int n) {
  // Compare against the remaining space rather than computing top + n:
  // the sum can overflow int for a huge request, and a negative n would
  // otherwise move top backwards into memory already handed out.
  if (n < 0 || n > pool->max - pool->top) {
    return NULL;
  }
  char *ret = pool->buf + pool->top;
  pool->top += n;
  return ret;
}
/* This is a string view: a pointer plus an explicit length. */
typedef struct span_t span_t;
struct span_t {
const char *ptr; // first character of the view
int len; // number of characters in the view
};
/* Represents a multi-part (dotted) key such as a.b.c, one span per part. */
#define KEYPARTMAX 10 // maximum number of parts in one key
typedef struct keypart_t keypart_t;
struct keypart_t {
int nspan; // number of valid entries in span[]
span_t span[KEYPARTMAX]; // the key parts, in order
};
// UTF-8 <-> code point conversion helpers; their definitions are outside
// this section of the file. toml_parse() treats a negative return from
// utf8_to_ucs() as an invalid sequence and a non-negative return as the
// number of bytes consumed.
static int utf8_to_ucs(const char *s, int len, uint32_t *ret);
static int ucs_to_utf8(uint32_t code, char buf[4]);
// flags for toml_datum_t::flag.
#define FLAG_INLINED 1 // datum belongs to an inline array or inline table
#define FLAG_STDEXPR 2 // table was created while handling a [table] header
#define FLAG_EXPLICIT 4 // table has been explicitly defined
// Maximum levels of brackets and braces to prevent
// stack overflow during recursive descent of the parser.
#define BRACKET_LEVEL_MAX 30
#define BRACE_LEVEL_MAX 30
// Round x up to the nearest multiple of 8 (x itself when already aligned).
static inline size_t align8(size_t x) {
  const size_t mask = 7;
  return (x + mask) & ~mask;
}
// Token types produced by the scanner.
enum toktyp_t {
TOK_DOT = 1, // .
TOK_EQUAL, // =
TOK_COMMA, // ,
TOK_LBRACK, // [
TOK_LLBRACK, // [[
TOK_RBRACK, // ]
TOK_RRBRACK, // ]]
TOK_LBRACE, // {
TOK_RBRACE, // }
TOK_LIT, // unquoted literal; accepted by parse_key() as a simple key
TOK_STRING, // "string"
TOK_MLSTRING, // """multi-line-string"""
TOK_LITSTRING, // 'lit-string'
TOK_MLLITSTRING, // '''multi-line-lit-string'''
TOK_TIME,
TOK_DATE,
TOK_DATETIME,
TOK_DATETIMETZ,
TOK_INTEGER,
TOK_FLOAT,
TOK_BOOL,
TOK_ENDL, // end of line
TOK_FIN = -5000, // EOF
};
typedef enum toktyp_t toktyp_t;
typedef struct scanner_t scanner_t;
/* Remember the current state of a scanner so it can be rewound later
* (see scan_mark / scan_restore). */
typedef struct scanner_state_t scanner_state_t;
struct scanner_state_t {
scanner_t *sp; // the scanner this snapshot was taken from
const char *cur; // points into scanner_t::src[]
int lineno; // current line number
};
// A scan token: its type, the line it was found on, the source text it
// covers, and (depending on the type) a decoded value.
typedef struct token_t token_t;
struct token_t {
toktyp_t toktyp; // kind of token
int lineno; // line number used when reporting errors for this token
span_t str; // text of the token
// values represented by str
union {
const char *escp; // point to an esc char in str
int64_t int64; // value of a TOK_INTEGER
double fp64; // value of a TOK_FLOAT
bool b1; // value of a TOK_BOOL
struct {
// validity depends on toktyp for TIME, DATE, DATETIME, DATETIMETZ
int year, month, day, hour, minute, sec, usec;
int tz; // +- minutes
} tsval;
} u;
};
// Scanner object: walks the source text and produces tokens.
struct scanner_t {
const char *src; // src[] is a NUL-terminated string
const char *endp; // end of src[]. always pointing at a NUL char.
const char *cur; // current char in src[]
int lineno; // line number of current char
char *errmsg; // set to ebuf.ptr if there was an error
ebuf_t ebuf; // buffer to store error message
int bracket_level; // count depth of [ ]
int brace_level; // count depth of { }
};
// Scanner entry points; their definitions are outside this section.
// scan_init binds a scanner to src[0..len) and to an error buffer.
static void scan_init(scanner_t *sp, const char *src, int len, char *errbuf,
int errbufsz);
// Fetch the next token. The parser calls scan_key() where a key or table
// header may appear and scan_value() where a value is expected. Both are
// treated by callers as returning non-zero on error.
static int scan_key(scanner_t *sp, token_t *tok);
static int scan_value(scanner_t *sp, token_t *tok);
// Snapshot the scanner position (scan_mark) and rewind to a snapshot
// (scan_restore); used to un-read a token that was scanned ahead.
static scanner_state_t scan_mark(scanner_t *sp);
static void scan_restore(scanner_t *sp, scanner_state_t state);
#ifndef min
// Smaller of two ints (b when they are equal).
static inline int min(int a, int b) {
  if (a < b) {
    return a;
  }
  return b;
}
#endif
// Copy up to dstsz - 1 chars from the current position of the scanner
// to dst, and always terminate dst[] with a NUL if dstsz > 0.
// When the scanner is already at the end of the source, dst becomes the
// empty string.
static void scan_copystr(scanner_t *sp, char *dst, int dstsz) {
  assert(dstsz > 0);
  if (dstsz <= 0) {
    return; // no room even for the NUL (only reachable with NDEBUG)
  }
  int len = min(sp->endp - sp->cur, dstsz - 1); // account for NUL
  if (len < 0) {
    len = 0;
  }
  if (len > 0) {
    memcpy(dst, sp->cur, len);
  }
  // Terminate unconditionally: previously dst was left untouched (and thus
  // possibly unterminated) when there was nothing to copy.
  dst[len] = '\0';
}
// Parser object: the scanner plus the document tree being built.
typedef struct parser_t parser_t;
struct parser_t {
scanner_t scanner; // token source
toml_datum_t toptab; // top table
toml_datum_t *curtab; // current table
pool_t *pool; // memory pool for strings
ebuf_t ebuf; // buffer to store last error message
};
// Linear search for key among the keys of table tab.
// Returns the index of the matching entry, or -1 when the key is absent.
static int tab_find(toml_datum_t *tab, span_t key) {
  assert(tab->type == TOML_TABLE);
  const int count = tab->u.tab.size;
  int idx = 0;
  while (idx < count) {
    // Compare lengths first so memcmp only runs on same-sized keys.
    if (tab->u.tab.len[idx] == key.len &&
        memcmp(tab->u.tab.key[idx], key.ptr, key.len) == 0) {
      return idx;
    }
    idx++;
  }
  return -1;
}
// Make sure key has a slot in table tab and return a pointer to that slot.
// An existing key yields its current value slot. A new key is appended with
// its value set to DATUM_ZERO, for the caller to fill in.
// Returns NULL (with *reason set) if memory cannot be obtained.
// The key text is not copied: the table stores key.ptr as given.
static toml_datum_t *tab_emplace(toml_datum_t *tab, span_t key,
                                 const char **reason) {
  assert(tab->type == TOML_TABLE);
  const int found = tab_find(tab, key);
  if (found >= 0) {
    return &tab->u.tab.value[found];
  }

  // Grow the three parallel arrays. Capacity is kept at a multiple of 8.
  const int count = tab->u.tab.size;
  char **keys = REALLOC(tab->u.tab.key, sizeof(*keys) * align8(count + 1));
  int *lens = REALLOC(tab->u.tab.len, sizeof(*lens) * align8(count + 1));
  toml_datum_t *vals =
      REALLOC(tab->u.tab.value, sizeof(*vals) * align8(count + 1));

  // Every array that did get reallocated must be stored back right away,
  // even if a sibling failed, because its old memory area is gone.
  if (keys != NULL) {
    tab->u.tab.key = (const char **)keys;
  }
  if (lens != NULL) {
    tab->u.tab.len = lens;
  }
  if (vals != NULL) {
    tab->u.tab.value = vals;
  }
  if (!(keys && lens && vals)) {
    // size is unchanged, so the table is still consistent.
    *reason = "out of memory";
    return NULL;
  }

  // All arrays have room for one more entry: append the key.
  tab->u.tab.key[count] = (char *)key.ptr;
  tab->u.tab.len[count] = key.len;
  tab->u.tab.value[count] = DATUM_ZERO;
  tab->u.tab.size = count + 1;
  return &tab->u.tab.value[count];
}
// Insert (newkey, newvalue) into table tab.
// Returns 0 on success. Returns -1 with *reason describing the problem
// when memory runs out or the key already holds a value.
static int tab_add(toml_datum_t *tab, span_t newkey, toml_datum_t newvalue,
                   const char **reason) {
  assert(tab->type == TOML_TABLE);
  toml_datum_t *slot = tab_emplace(tab, newkey, reason);
  if (slot == NULL) {
    return -1;
  }
  // A freshly created slot has type 0; anything else is a pre-existing key.
  if (slot->type != 0) {
    *reason = "duplicate key";
    return -1;
  }
  *slot = newvalue;
  return 0;
}
// Append one element, initialised to DATUM_ZERO, to array arr.
// Returns a pointer to the new element for the caller to fill in, or NULL
// with *reason set when memory cannot be obtained (arr is then unchanged).
static toml_datum_t *arr_emplace(toml_datum_t *arr, const char **reason) {
  assert(arr->type == TOML_ARRAY);
  const int count = arr->u.arr.size;
  // Capacity is kept at a multiple of 8 elements.
  toml_datum_t *grown =
      REALLOC(arr->u.arr.elem, sizeof(*grown) * align8(count + 1));
  if (grown == NULL) {
    *reason = "out of memory";
    return NULL;
  }
  grown[count] = DATUM_ZERO;
  arr->u.arr.elem = grown;
  arr->u.arr.size = count + 1;
  return &grown[count];
}
// ------------------- parser section
// Forward declarations. Callers in this file treat a non-zero return from
// any of these as failure.
// parse_norm: normalise a string-like token into a span (definition is
// outside this section).
static int parse_norm(parser_t *pp, token_t tok, span_t *ret_span);
// parse_val: turn a value token (scalar, inline array, inline table) into a
// datum.
static int parse_val(parser_t *pp, token_t tok, toml_datum_t *ret);
// Handlers for the three kinds of top-level expression: key = value,
// [table] and [[array-of-tables]].
static int parse_keyvalue_expr(parser_t *pp, token_t tok);
static int parse_std_table_expr(parser_t *pp, token_t tok);
static int parse_array_table_expr(parser_t *pp, token_t tok);
// Build an otherwise-zero datum of type ty. For the four timestamp types
// every timestamp field starts out as -1.
static toml_datum_t mkdatum(toml_type_t ty) {
  toml_datum_t datum = {0};
  datum.type = ty;
  switch (ty) {
  case TOML_DATE:
  case TOML_TIME:
  case TOML_DATETIME:
  case TOML_DATETIMETZ:
    datum.u.ts.year = -1;
    datum.u.ts.month = -1;
    datum.u.ts.day = -1;
    datum.u.ts.hour = -1;
    datum.u.ts.minute = -1;
    datum.u.ts.second = -1;
    datum.u.ts.usec = -1;
    datum.u.ts.tz = -1;
    break;
  default:
    break;
  }
  return datum;
}
// Release all heap arrays owned by datum and its descendants, then reset
// datum to DATUM_ZERO. Only tables and arrays own heap memory here.
static void datum_free(toml_datum_t *datum) {
  switch (datum->type) {
  case TOML_TABLE: {
    const int count = datum->u.tab.size;
    for (int i = 0; i < count; i++) {
      datum_free(&datum->u.tab.value[i]);
    }
    FREE(datum->u.tab.key);
    FREE(datum->u.tab.len);
    FREE(datum->u.tab.value);
    break;
  }
  case TOML_ARRAY: {
    const int count = datum->u.arr.size;
    for (int i = 0; i < count; i++) {
      datum_free(&datum->u.arr.elem[i]);
    }
    FREE(datum->u.arr.elem);
    break;
  }
  default:
    // scalar types hold no heap arrays of their own
    break;
  }
  *datum = DATUM_ZERO;
}
// Recursively copy src into *dst. String contents are duplicated into
// pool; table and array storage is allocated afresh.
// NOTE(review): table keys are not duplicated. The copy's key pointers
// alias the key storage of src, so src's storage must outlive the copy.
// Returns 0 on success. On failure returns -1, sets *reason where one is
// known, and leaves *dst freed and zeroed.
static int datum_copy(toml_datum_t *dst, toml_datum_t src, pool_t *pool,
                      const char **reason) {
  *dst = mkdatum(src.type);

  if (src.type == TOML_STRING) {
    // Copy the text including its terminating NUL.
    dst->u.str.ptr = pool_alloc(pool, src.u.str.len + 1);
    if (!dst->u.str.ptr) {
      *reason = "out of memory";
      goto bail;
    }
    dst->u.str.len = src.u.str.len;
    memcpy((char *)dst->u.str.ptr, src.u.str.ptr, src.u.str.len + 1);
    return 0;
  }

  if (src.type == TOML_TABLE) {
    for (int i = 0; i < src.u.tab.size; i++) {
      span_t key = {src.u.tab.key[i], src.u.tab.len[i]};
      toml_datum_t *slot = tab_emplace(dst, key, reason);
      if (slot == NULL ||
          datum_copy(slot, src.u.tab.value[i], pool, reason) != 0) {
        goto bail;
      }
    }
    return 0;
  }

  if (src.type == TOML_ARRAY) {
    for (int i = 0; i < src.u.arr.size; i++) {
      toml_datum_t *slot = arr_emplace(dst, reason);
      if (slot == NULL ||
          datum_copy(slot, src.u.arr.elem[i], pool, reason) != 0) {
        goto bail;
      }
    }
    return 0;
  }

  // Every other type is plain data: a struct copy is a full copy.
  *dst = src;
  return 0;

bail:
  datum_free(dst);
  return -1;
}
// True when datum is an array whose elements are all tables. An empty
// array also qualifies.
static inline bool is_array_of_tables(toml_datum_t datum) {
  if (datum.type != TOML_ARRAY) {
    return false;
  }
  for (int i = 0; i < datum.u.arr.size; i++) {
    if (datum.u.arr.elem[i].type != TOML_TABLE) {
      return false;
    }
  }
  return true;
}
// Merge src into *dst. Returns 0 on success, -1 otherwise (with *reason
// set where a reason is known).
//   - both tables: merge key by key (recursively for keys present in both)
//   - both arrays and src is an array of tables: append copies of src's
//     elements to dst
//   - anything else, including differing types: dst is replaced by a copy
//     of src
static int datum_merge(toml_datum_t *dst, toml_datum_t src, pool_t *pool,
                       const char **reason) {
  const bool same_type = (dst->type == src.type);

  if (same_type && src.type == TOML_TABLE) {
    for (int i = 0; i < src.u.tab.size; i++) {
      span_t key = {src.u.tab.key[i], src.u.tab.len[i]};
      toml_datum_t *slot = tab_emplace(dst, key, reason);
      if (slot == NULL) {
        return -1;
      }
      int rc;
      if (slot->type) {
        // key already present in dst: merge into the existing value
        rc = datum_merge(slot, src.u.tab.value[i], pool, reason);
      } else {
        // brand new key: take a copy of src's value
        datum_free(slot);
        rc = datum_copy(slot, src.u.tab.value[i], pool, reason);
      }
      if (rc) {
        return -1;
      }
    }
    return 0;
  }

  if (same_type && src.type == TOML_ARRAY && is_array_of_tables(src)) {
    for (int i = 0; i < src.u.arr.size; i++) {
      toml_datum_t *slot = arr_emplace(dst, reason);
      if (slot == NULL) {
        return -1;
      }
      if (datum_copy(slot, src.u.arr.elem[i], pool, reason)) {
        return -1;
      }
    }
    return 0;
  }

  // Override: drop whatever dst held and replace it with a copy of src.
  datum_free(dst);
  return datum_copy(dst, src, pool, reason);
}
// True when the calendar fields (year, month, day) of a and b agree.
static bool datum_equiv_same_date(toml_datum_t a, toml_datum_t b) {
  return a.u.ts.year == b.u.ts.year && a.u.ts.month == b.u.ts.month &&
         a.u.ts.day == b.u.ts.day;
}

// True when the clock fields (hour, minute, second, usec) of a and b agree.
static bool datum_equiv_same_clock(toml_datum_t a, toml_datum_t b) {
  return a.u.ts.hour == b.u.ts.hour && a.u.ts.minute == b.u.ts.minute &&
         a.u.ts.second == b.u.ts.second && a.u.ts.usec == b.u.ts.usec;
}

// Structural comparison of two datums. Returns true when they have the
// same type and equal content. Two NaN floats count as equal.
// NOTE(review): tables are compared entry by entry in stored order, so two
// tables holding the same keys in a different order compare as different.
static bool datum_equiv(toml_datum_t a, toml_datum_t b) {
  if (a.type != b.type) {
    return false;
  }
  switch (a.type) {
  case TOML_STRING:
    if (a.u.str.len != b.u.str.len) {
      return false;
    }
    return memcmp(a.u.str.ptr, b.u.str.ptr, a.u.str.len) == 0;
  case TOML_INT64:
    return a.u.int64 == b.u.int64;
  case TOML_FP64:
    if (a.u.fp64 == b.u.fp64) {
      return true;
    }
    return isnan(a.u.fp64) && isnan(b.u.fp64);
  case TOML_BOOLEAN:
    return !!a.u.boolean == !!b.u.boolean;
  case TOML_DATE:
    return datum_equiv_same_date(a, b);
  case TOML_TIME:
    return datum_equiv_same_clock(a, b);
  case TOML_DATETIME:
    return datum_equiv_same_date(a, b) && datum_equiv_same_clock(a, b);
  case TOML_DATETIMETZ:
    return datum_equiv_same_date(a, b) && datum_equiv_same_clock(a, b) &&
           a.u.ts.tz == b.u.ts.tz;
  case TOML_ARRAY: {
    const int count = a.u.arr.size;
    if (count != b.u.arr.size) {
      return false;
    }
    for (int i = 0; i < count; i++) {
      if (!datum_equiv(a.u.arr.elem[i], b.u.arr.elem[i])) {
        return false;
      }
    }
    return true;
  }
  case TOML_TABLE: {
    const int count = a.u.tab.size;
    if (count != b.u.tab.size) {
      return false;
    }
    for (int i = 0; i < count; i++) {
      const int keylen = a.u.tab.len[i];
      if (keylen != b.u.tab.len[i] ||
          memcmp(a.u.tab.key[i], b.u.tab.key[i], keylen) != 0) {
        return false;
      }
      if (!datum_equiv(a.u.tab.value[i], b.u.tab.value[i])) {
        return false;
      }
    }
    return true;
  }
  default:
    // unknown type: never equivalent
    return false;
  }
}
/**
 * Override values in r1 using r2. Return a new result. All results
 * (i.e., r1, r2 and the returned result) must be freed using toml_free()
 * after use.
 *
 * On failure the returned result has ok == 0, an error message in errmsg,
 * and owns no memory.
 *
 * NOTE(review): table keys in the returned result are not duplicated; they
 * point into the storage of r1 / r2 (see datum_copy and datum_merge), so
 * keep r1 and r2 alive for as long as the merged result is in use.
 *
 * LOGIC:
 *   ret = copy of r1
 *   for each item x in r2:
 *     if x is not in ret:
 *       override
 *     elif x in ret is NOT of the same type:
 *       override
 *     elif x is an array of tables:
 *       append r2.x to ret.x
 *     elif x is a table:
 *       merge r2.x to ret.x
 *     else:
 *       override
 */
toml_result_t toml_merge(const toml_result_t *r1, const toml_result_t *r2) {
  const char *reason = "";
  toml_result_t ret = {0};
  pool_t *pool = 0;
  if (!r1->ok) {
    reason = "param error: r1 not ok";
    goto bail;
  }
  if (!r2->ok) {
    reason = "param error: r2 not ok";
    goto bail;
  }
  {
    pool_t *r1pool = (pool_t *)r1->__internal;
    pool_t *r2pool = (pool_t *)r2->__internal;
    // The new pool must hold the strings of both inputs. Guard the sum so
    // it cannot overflow int before it reaches pool_create().
    if (r1pool->top > INT_MAX - r2pool->top) {
      reason = "out of memory";
      goto bail;
    }
    pool = pool_create(r1pool->top + r2pool->top);
    if (!pool) {
      reason = "out of memory";
      goto bail;
    }
  }
  // Make a copy of r1
  if (datum_copy(&ret.toptab, r1->toptab, pool, &reason)) {
    goto bail;
  }
  // Merge r2 into the result
  if (datum_merge(&ret.toptab, r2->toptab, pool, &reason)) {
    goto bail;
  }
  ret.ok = 1;
  ret.__internal = pool;
  return ret;
bail:
  // Release whatever part of the tree was built before the failure. This
  // matches toml_parse(), and stops a failed merge from leaking the table
  // arrays or returning pointers into the pool destroyed below.
  datum_free(&ret.toptab);
  pool_destroy(pool);
  snprintf(ret.errmsg, sizeof(ret.errmsg), "%s", reason);
  return ret;
}
/**
 * Compare two parse results. Returns true only when both results are ok
 * and their top tables are equivalent according to datum_equiv().
 */
bool toml_equiv(const toml_result_t *r1, const toml_result_t *r2) {
  if (!r1->ok || !r2->ok) {
    return false;
  }
  return datum_equiv(r1->toptab, r2->toptab);
}
/**
 * Look up key in a table datum. Returns the value stored under the key,
 * or DATUM_ZERO when datum is not a table or the key is absent.
 * Keys are compared with strcmp().
 */
toml_datum_t toml_get(toml_datum_t datum, const char *key) {
  if (datum.type != TOML_TABLE) {
    return DATUM_ZERO;
  }
  const int count = datum.u.tab.size;
  for (int i = 0; i < count; i++) {
    if (strcmp(datum.u.tab.key[i], key) == 0) {
      return datum.u.tab.value[i];
    }
  }
  return DATUM_ZERO;
}
/**
 * Follow a dotted path (for example "a.b.c") starting at table. Returns
 * the value found at the end of the path, or DATUM_ZERO when table is not
 * a table, an intermediate part is missing or is not a table, or the path
 * (with its NUL) is longer than the 256-byte internal buffer.
 *
 * Note: the path is split on every DOT; the key parts must not have any
 * escape chars.
 */
toml_datum_t toml_seek(toml_datum_t table, const char *multipart_key) {
  if (table.type != TOML_TABLE) {
    return DATUM_ZERO;
  }
  // Work on a private, writable copy so dots can be replaced by NULs.
  char path[256];
  if (copystring(path, sizeof(path), multipart_key) != 0) {
    return DATUM_ZERO; // path too long: report as not found
  }
  toml_datum_t cur = table;
  char *part = path;
  for (;;) {
    char *dot = strchr(part, '.');
    if (dot == NULL) {
      // last part: look it up in the table reached so far
      return toml_get(cur, part);
    }
    // intermediate part: it must name another table to descend into
    *dot = '\0';
    cur = toml_get(cur, part);
    if (cur.type != TOML_TABLE) {
      return DATUM_ZERO;
    }
    part = dot + 1;
  }
}
/**
* Return the default options.
*/
toml_option_t toml_default_option(void) {
toml_option_t opt = {0, realloc, free};
return opt;
}
/**
 * Replace the options currently in effect with opt.
 */
void toml_set_option(toml_option_t opt) {
  toml_option = opt;
}
/**
 * Release everything owned by a result: the table/array storage of its
 * tree and the memory pool behind it.
 */
void toml_free(toml_result_t result) {
  pool_t *pool = (pool_t *)result.__internal;
  datum_free(&result.toptab);
  pool_destroy(pool);
}
/**
 * Open the file named fname, parse it as a toml document and close it
 * again. If the file cannot be opened, the returned result carries the
 * fopen error text in errmsg.
 */
toml_result_t toml_parse_file_ex(const char *fname) {
  FILE *fp = fopen(fname, "r");
  if (fp != NULL) {
    toml_result_t parsed = toml_parse_file(fp);
    fclose(fp);
    return parsed;
  }
  toml_result_t failed = {0};
  snprintf(failed.errmsg, sizeof(failed.errmsg), "fopen %s: %s", fname,
           strerror(errno));
  return failed;
}
/**
 * Read the rest of the stream fp into memory and parse it as a toml
 * document. The stream is not closed. On a read error, an allocation
 * failure or an over-large file, the returned result carries a message in
 * errmsg.
 */
toml_result_t toml_parse_file(FILE *fp) {
  toml_result_t result = {0};
  char *buf = 0;
  int top, max; // index into buf[]
  top = max = 0;
  // Read file into memory. The body runs at least once so that buf is
  // always allocated: with a stream already at EOF on entry, a
  // "while (!feof(fp))" loop would be skipped and buf[top] below would
  // dereference a null pointer.
  do {
    assert(top <= max);
    if (top == max) {
      // need to extend buf[]
      int64_t tmpmax64 = (int64_t)max * 3 / 2 + 1000;
      int tmpmax = (tmpmax64 > INT_MAX - 1) ? INT_MAX - 1 : (int)tmpmax64;
      if (tmpmax == INT_MAX - 1) {
        snprintf(result.errmsg, sizeof(result.errmsg), "file is too big");
        FREE(buf);
        return result;
      }
      // add an extra byte for terminating NUL
      char *tmp = REALLOC(buf, tmpmax + 1);
      if (!tmp) {
        snprintf(result.errmsg, sizeof(result.errmsg), "out of memory");
        FREE(buf);
        return result;
      }
      buf = tmp;
      max = tmpmax;
    }
    errno = 0;
    top += fread(buf + top, 1, max - top, fp);
    if (ferror(fp)) {
      snprintf(result.errmsg, sizeof(result.errmsg), "%s",
               errno ? strerror(errno) : "Error reading file");
      FREE(buf);
      return result;
    }
  } while (!feof(fp));
  buf[top] = 0; // NUL terminator
  result = toml_parse(buf, top);
  FREE(buf);
  return result;
}
/**
 * Parse a toml document held in memory.
 *
 * src must point to len bytes of text followed by a terminating NUL
 * (src[len] == 0). On success the result has ok set and owns the parsed
 * tree. On failure ok is false and errmsg describes the problem.
 */
toml_result_t toml_parse(const char *src, int len) {
  toml_result_t result = {0};
  parser_t parser = {0};
  parser_t *pp = &parser;
  // Reject arguments that would make the src[len] probe below read out of
  // bounds, or make the pool size (len + 10) overflow int.
  if (!src || len < 0 || len > INT_MAX - 10) {
    snprintf(result.errmsg, sizeof(result.errmsg),
             "invalid src[] or len argument");
    goto bail;
  }
  // Check that src is NUL terminated.
  if (src[len]) {
    snprintf(result.errmsg, sizeof(result.errmsg),
             "src[] must be NUL terminated");
    goto bail;
  }
  // If user insists, check that src[] is a valid utf8 string.
  if (toml_option.check_utf8) {
    int line = 1; // keeps track of line number
    for (int i = 0; i < len;) {
      uint32_t ch;
      int n = utf8_to_ucs(src + i, len - i, &ch);
      if (n < 0) {
        snprintf(result.errmsg, sizeof(result.errmsg),
                 "invalid UTF8 char on line %d", line);
        goto bail;
      }
      if (0xD800 <= ch && ch <= 0xDFFF) {
        // explicitly prohibit surrogates (non-scalar unicode code point)
        snprintf(result.errmsg, sizeof(result.errmsg),
                 "invalid UTF8 char \\u%04x on line %d", ch, line);
        goto bail;
      }
      line += (ch == '\n' ? 1 : 0);
      i += n;
    }
  }
  // Initialize parser
  pp->toptab = mkdatum(TOML_TABLE);
  pp->curtab = &pp->toptab;
  pp->ebuf.ptr = result.errmsg; // parse error will be printed into pp->ebuf
  pp->ebuf.len = sizeof(result.errmsg);
  // Alloc memory pool
  pp->pool =
      pool_create(len + 10); // add some extra bytes for NUL term and safety
  if (!pp->pool) {
    snprintf(result.errmsg, sizeof(result.errmsg), "out of memory");
    goto bail;
  }
  // Initialize scanner. Scan error will be printed into pp->ebuf.
  scan_init(&pp->scanner, src, len, pp->ebuf.ptr, pp->ebuf.len);
  // Keep parsing until FIN
  for (;;) {
    token_t tok;
    if (scan_key(&pp->scanner, &tok)) {
      goto bail;
    }
    // break on FIN
    if (tok.toktyp == TOK_FIN) {
      break;
    }
    switch (tok.toktyp) {
    case TOK_ENDL: // skip blank lines
      continue;
    case TOK_LBRACK:
      if (parse_std_table_expr(pp, tok)) {
        goto bail;
      }
      break;
    case TOK_LLBRACK:
      if (parse_array_table_expr(pp, tok)) {
        goto bail;
      }
      break;
    default:
      // non-blank line: parse an expression
      if (parse_keyvalue_expr(pp, tok)) {
        goto bail;
      }
      break;
    }
    // each expression must be followed by newline
    if (scan_key(&pp->scanner, &tok)) {
      goto bail;
    }
    if (tok.toktyp == TOK_FIN || tok.toktyp == TOK_ENDL) {
      continue;
    }
    SETERROR(pp->ebuf, tok.lineno, "ENDL expected");
    goto bail;
  }
  // return result; ownership of the tree and the pool moves to the caller
  result.ok = true;
  result.toptab = pp->toptab;
  result.__internal = (void *)pp->pool;
  return result;
bail:
  // return error: release everything built so far
  datum_free(&pp->toptab);
  pool_destroy(pp->pool);
  result.ok = false;
  if (result.errmsg[0] == '\0') {
    // Every failure path is expected to have set a message already; this
    // is a fallback for release builds.
    assert(0);
    snprintf(result.errmsg, sizeof(result.errmsg), "Error near line %d\n",
             pp->scanner.lineno);
  }
  return result;
}
// Build a string datum from a string-like token (LITSTRING, LIT,
// MLLITSTRING, MLSTRING or STRING). The text comes from parse_norm().
// Returns 0 on success, -1 if normalisation fails.
static int token_to_string(parser_t *pp, token_t tok, toml_datum_t *ret) {
  *ret = mkdatum(TOML_STRING);
  span_t text;
  if (parse_norm(pp, tok, &text)) {
    return -1;
  }
  ret->u.str.len = text.len;
  ret->u.str.ptr = (char *)text.ptr;
  return 0;
}
// Build a timestamp datum from a TIME, DATE, DATETIME or DATETIMETZ token.
// All timestamp fields are copied from the token as they are.
// Returns 0 on success, -1 (after asserting) for any other token type.
static int token_to_timestamp(parser_t *pp, token_t tok, toml_datum_t *ret) {
  (void)pp; // unused; kept for a uniform converter signature
  toml_type_t ty;
  switch (tok.toktyp) {
  case TOK_TIME:
    ty = TOML_TIME;
    break;
  case TOK_DATE:
    ty = TOML_DATE;
    break;
  case TOK_DATETIME:
    ty = TOML_DATETIME;
    break;
  case TOK_DATETIMETZ:
    ty = TOML_DATETIMETZ;
    break;
  default:
    assert(0 && "unexpected token type");
    return -1;
  }
  toml_datum_t stamp = mkdatum(ty);
  stamp.u.ts.year = tok.u.tsval.year;
  stamp.u.ts.month = tok.u.tsval.month;
  stamp.u.ts.day = tok.u.tsval.day;
  stamp.u.ts.hour = tok.u.tsval.hour;
  stamp.u.ts.minute = tok.u.tsval.minute;
  stamp.u.ts.second = tok.u.tsval.sec;
  stamp.u.ts.usec = tok.u.tsval.usec;
  stamp.u.ts.tz = tok.u.tsval.tz;
  *ret = stamp;
  return 0;
}
// Build an integer datum from a TOK_INTEGER token. Always returns 0.
static int token_to_int64(parser_t *pp, token_t tok, toml_datum_t *ret) {
  (void)pp; // unused; kept for a uniform converter signature
  assert(tok.toktyp == TOK_INTEGER);
  toml_datum_t datum = mkdatum(TOML_INT64);
  datum.u.int64 = tok.u.int64;
  *ret = datum;
  return 0;
}
// Build a floating-point datum from a TOK_FLOAT token. Always returns 0.
static int token_to_fp64(parser_t *pp, token_t tok, toml_datum_t *ret) {
  (void)pp; // unused; kept for a uniform converter signature
  assert(tok.toktyp == TOK_FLOAT);
  toml_datum_t datum = mkdatum(TOML_FP64);
  datum.u.fp64 = tok.u.fp64;
  *ret = datum;
  return 0;
}
// Build a boolean datum from a TOK_BOOL token. Always returns 0.
static int token_to_boolean(parser_t *pp, token_t tok, toml_datum_t *ret) {
  (void)pp; // unused; kept for a uniform converter signature
  assert(tok.toktyp == TOK_BOOL);
  toml_datum_t datum = mkdatum(TOML_BOOLEAN);
  datum.u.boolean = tok.u.b1;
  *ret = datum;
  return 0;
}
// Parse a (possibly dotted) key whose first token is tok, storing the
// normalised parts in *ret_keypart. Returns 0 on success, -1 otherwise.
//
//   key        = simple-key | dotted-key
//   simple-key = STRING | LITSTRING | LIT
//   dotted-key = simple-key (DOT simple-key)+
//
// On return the scanner is positioned just after the last key part: a
// token that turns out not to be a DOT is pushed back.
static int parse_key(parser_t *pp, token_t tok, keypart_t *ret_keypart) {
  ret_keypart->nspan = 0;
  if (!(tok.toktyp == TOK_STRING || tok.toktyp == TOK_LITSTRING ||
        tok.toktyp == TOK_LIT)) {
    return SETERROR(pp->ebuf, tok.lineno, "missing key");
  }

  span_t *parts = ret_keypart->span;
  // First part.
  if (parse_norm(pp, tok, &parts[0])) {
    return SETERROR(pp->ebuf, tok.lineno,
                    "unable to normalize string; probably a unicode issue");
  }
  int count = 1;

  // Remaining parts, each introduced by a DOT.
  for (;;) {
    scanner_state_t before = scan_mark(&pp->scanner);
    if (scan_key(&pp->scanner, &tok)) {
      return -1;
    }
    if (tok.toktyp != TOK_DOT) {
      // Not part of the key: un-read the token and stop.
      scan_restore(&pp->scanner, before);
      break;
    }
    if (scan_key(&pp->scanner, &tok)) {
      return -1;
    }
    if (!(tok.toktyp == TOK_STRING || tok.toktyp == TOK_LITSTRING ||
          tok.toktyp == TOK_LIT)) {
      return SETERROR(pp->ebuf, tok.lineno, "expects a string in dotted-key");
    }
    if (count >= KEYPARTMAX) {
      return SETERROR(pp->ebuf, tok.lineno, "too many key parts");
    }
    if (parse_norm(pp, tok, &parts[count])) {
      return -1;
    }
    count++;
  }

  ret_keypart->nspan = count;
  return 0;
}
// Walk down from toptab along the parts of keypart and return the table
// reached at the end, or NULL (with an error message set) on failure.
//   - a missing part is created as a new empty table (tagged FLAG_STDEXPR
//     when stdtabexpr is true)
//   - a part naming a table is entered
//   - a part naming an array enters the array's last element, which must
//     be a table
//   - a part naming anything else is an error
static toml_datum_t *descend_keypart(parser_t *pp, int lineno,
                                     toml_datum_t *toptab, keypart_t *keypart,
                                     bool stdtabexpr) {
  toml_datum_t *cur = toptab;
  for (int i = 0; i < keypart->nspan; i++) {
    const span_t part = keypart->span[i];
    const int pos = tab_find(cur, part);

    if (pos < 0) {
      // Unknown part: create an empty table for it and step into it.
      const char *reason;
      toml_datum_t fresh = mkdatum(TOML_TABLE);
      if (stdtabexpr) {
        fresh.flag |= FLAG_STDEXPR;
      }
      if (tab_add(cur, part, fresh, &reason)) {
        SETERROR(pp->ebuf, lineno, "%s", reason);
        return NULL;
      }
      // tab_add appended the entry, so it is the last one.
      cur = &cur->u.tab.value[cur->u.tab.size - 1];
      continue;
    }

    toml_datum_t *hit = &cur->u.tab.value[pos];
    switch (hit->type) {
    case TOML_TABLE:
      cur = hit;
      break;
    case TOML_ARRAY: {
      if (hit->u.arr.size <= 0) {
        SETERROR(pp->ebuf, lineno, "array %s has no elements", part.ptr);
        return NULL;
      }
      toml_datum_t *last = &hit->u.arr.elem[hit->u.arr.size - 1];
      if (last->type != TOML_TABLE) {
        SETERROR(pp->ebuf, lineno, "array %s must be array of tables",
                 part.ptr);
        return NULL;
      }
      cur = last;
      break;
    }
    default:
      // The key exists but does not lead to a table.
      SETERROR(pp->ebuf, lineno, "cannot locate table at key %s", part.ptr);
      return NULL;
    }
  }
  return cur;
}
// OR flag into datum and into every datum nested below it (array elements
// and table values, at any depth).
static void set_flag_recursive(toml_datum_t *datum, uint32_t flag) {
  datum->flag |= flag;
  if (datum->type == TOML_ARRAY) {
    const int count = datum->u.arr.size;
    for (int i = 0; i < count; i++) {
      set_flag_recursive(&datum->u.arr.elem[i], flag);
    }
  } else if (datum->type == TOML_TABLE) {
    const int count = datum->u.tab.size;
    for (int i = 0; i < count; i++) {
      set_flag_recursive(&datum->u.tab.value[i], flag);
    }
  }
}
// Parse an inline array; tok is the opening '[' already consumed by the
// caller. Elements are separated by commas; newlines between tokens are
// ignored and a trailing comma before ']' is accepted.
// On success *ret_datum holds the array, with FLAG_INLINED set on it and
// on everything inside it, and 0 is returned. On failure -1 is returned
// and *ret_datum may hold a partially built array.
static int parse_inline_array(parser_t *pp, token_t tok,
                              toml_datum_t *ret_datum) {
  assert(tok.toktyp == TOK_LBRACK);
  *ret_datum = mkdatum(TOML_ARRAY);
  bool expect_comma = false;

  for (;;) {
    // Fetch the next token that is not a newline.
    for (;;) {
      if (scan_value(&pp->scanner, &tok)) {
        return -1;
      }
      if (tok.toktyp != TOK_ENDL) {
        break;
      }
    }

    if (tok.toktyp == TOK_RBRACK) {
      break; // end of array
    }

    if (tok.toktyp == TOK_COMMA) {
      if (!expect_comma) {
        return SETERROR(pp->ebuf, tok.lineno,
                        "syntax error while parsing array: unexpected comma");
      }
      expect_comma = false;
      continue;
    }

    // A value token: it must not directly follow another value.
    if (expect_comma) {
      return SETERROR(pp->ebuf, tok.lineno,
                      "syntax error while parsing array: missing comma");
    }

    toml_datum_t item = DATUM_ZERO;
    if (parse_val(pp, tok, &item)) {
      datum_free(&item);
      return -1;
    }

    const char *reason;
    toml_datum_t *slot = arr_emplace(ret_datum, &reason);
    if (slot == NULL) {
      datum_free(&item);
      return SETERROR(pp->ebuf, tok.lineno, "while parsing array: %s", reason);
    }
    *slot = item;
    expect_comma = true;
  }

  // Mark the array and everything in it as inline.
  set_flag_recursive(ret_datum, FLAG_INLINED);
  return 0;
}
// Parse an inline table; tok is the opening '{' already consumed by the
// caller. Each entry is "key = value", where key may be dotted; entries
// are separated by commas. Following TOML v1.1, newlines between entries
// and a trailing comma before '}' are accepted.
// On success *ret_datum holds the table, with FLAG_INLINED set on it and
// on everything inside it, and 0 is returned. On failure -1 is returned
// and *ret_datum may hold a partially built table.
static int parse_inline_table(parser_t *pp, token_t tok,
toml_datum_t *ret_datum) {
assert(tok.toktyp == TOK_LBRACE);
*ret_datum = mkdatum(TOML_TABLE);
// need_comma: a value was just parsed, so the next token must be ',' or '}'.
// was_comma: the previous significant token was a comma.
bool need_comma = 0;
bool was_comma = 0;
// loop until RBRACE
for (;;) {
DO(scan_key(&pp->scanner, &tok));
// Got an RBRACE: done!
if (tok.toktyp == TOK_RBRACE) {
if (was_comma) {
// A comma directly before RBRACE used to be rejected ("extra comma
// before closing brace"); it is allowed for v1.1, so this branch is
// intentionally a no-op.
(void)0;
}
break;
}
// Got a comma: check if it is expected.
if (tok.toktyp == TOK_COMMA) {
if (need_comma) {
need_comma = 0, was_comma = 1;
continue;
}
return SETERROR(pp->ebuf, tok.lineno, "unexpected comma");
}
// Newlines used to be rejected inside an inline table ("unexpected
// newline"); they are allowed in v1.1, so they are simply skipped here.
if (tok.toktyp == TOK_ENDL) {
continue;
}
// Not a comma, but need a comma: error!
if (need_comma) {
return SETERROR(pp->ebuf, tok.lineno, "missing comma");
}
// Get the keyparts
keypart_t keypart = {0};
int keylineno = tok.lineno;
DO(parse_key(pp, tok, &keypart));
// Descend to one keypart before last: the last part is the key to add,
// the preceding parts name the (possibly new) sub-table that receives it.
span_t lastkeypart = keypart.span[--keypart.nspan];
toml_datum_t *tab =
descend_keypart(pp, keylineno, ret_datum, &keypart, false);
if (!tab) {
return -1;
}
// If tab is a previously declared inline table: error.
if (tab->flag & FLAG_INLINED) {
return SETERROR(pp->ebuf, tok.lineno, "inline table cannot be extended");
}
// We are explicitly defining it now.
tab->flag |= FLAG_EXPLICIT;
// match EQUAL
DO(scan_value(&pp->scanner, &tok));
if (tok.toktyp != TOK_EQUAL) {
if (tok.toktyp == TOK_ENDL) {
return SETERROR(pp->ebuf, tok.lineno, "unexpected newline");
} else {
return SETERROR(pp->ebuf, tok.lineno, "missing '='");
}
}
// obtain the value
DO(scan_value(&pp->scanner, &tok));
toml_datum_t value = DATUM_ZERO;
if (parse_val(pp, tok, &value)) {
datum_free(&value);
return -1;
}
// Add the value to tab. On failure the value is still owned here and
// must be released.
const char *reason;
if (tab_add(tab, lastkeypart, value, &reason)) {
datum_free(&value);
return SETERROR(pp->ebuf, tok.lineno, "%s", reason);
}
need_comma = 1, was_comma = 0;
}
// Mark the table and everything in it as inline, so later expressions
// cannot extend it.
set_flag_recursive(ret_datum, FLAG_INLINED);
return 0;
}
// Convert the value starting at token tok into a datum, dispatching on
// the token type.
//   val = string / boolean / array / inline-table / date-time / float /
//         integer
// Returns 0 on success, -1 otherwise; a token that cannot start a value
// is reported as "missing value".
static int parse_val(parser_t *pp, token_t tok, toml_datum_t *ret) {
  *ret = DATUM_ZERO; // initialize
  const toktyp_t kind = tok.toktyp;

  if (kind == TOK_STRING || kind == TOK_MLSTRING || kind == TOK_LITSTRING ||
      kind == TOK_MLLITSTRING) {
    return token_to_string(pp, tok, ret);
  }
  if (kind == TOK_TIME || kind == TOK_DATE || kind == TOK_DATETIME ||
      kind == TOK_DATETIMETZ) {
    return token_to_timestamp(pp, tok, ret);
  }
  if (kind == TOK_INTEGER) {
    return token_to_int64(pp, tok, ret);
  }
  if (kind == TOK_FLOAT) {
    return token_to_fp64(pp, tok, ret);
  }
  if (kind == TOK_BOOL) {
    return token_to_boolean(pp, tok, ret);
  }
  if (kind == TOK_LBRACK) { // inline-array
    return parse_inline_array(pp, tok, ret);
  }
  if (kind == TOK_LBRACE) { // inline-table
    return parse_inline_table(pp, tok, ret);
  }
  return SETERROR(pp->ebuf, tok.lineno, "missing value");
}
// Parse a standard table expression, and set the curtab of the parser
// to the table referenced. A standard table expression is a line
// like [a.b.c.d].
// Returns 0 on success, -1 otherwise (with an error message set).
static int parse_std_table_expr(parser_t *pp, token_t tok) {
// std-table = [ key ]
// The [ has already been consumed by the caller.
assert(tok.toktyp == TOK_LBRACK); // [ eaten by caller
// Read the first keypart
DO(scan_key(&pp->scanner, &tok));
// Extract the keypart[]
int keylineno = tok.lineno;
keypart_t keypart;
DO(parse_key(pp, tok, &keypart));
// Eat the ]
DO(scan_key(&pp->scanner, &tok));
if (tok.toktyp != TOK_RBRACK) {
return SETERROR(pp->ebuf, tok.lineno, "missing right-bracket");
}
// Split off the last keypart: the leading parts are walked (and created
// if needed) below, the last part names the table being defined.
span_t lastkeypart = keypart.span[--keypart.nspan];
// Descend keypart from the toptab.
toml_datum_t *tab =
descend_keypart(pp, keylineno, &pp->toptab, &keypart, true);
if (!tab) {
return -1;
}
// Look for the last keypart in the final tab
int j = tab_find(tab, lastkeypart);
if (j < 0) {
// If not found: add it, unless the parent is an inline table.
if (tab->flag & FLAG_INLINED) {
return SETERROR(pp->ebuf, keylineno, "inline table cannot be extended");
}
const char *reason;
toml_datum_t newtab = mkdatum(TOML_TABLE);
newtab.flag |= FLAG_STDEXPR;
if (tab_add(tab, lastkeypart, newtab, &reason)) {
return SETERROR(pp->ebuf, keylineno, "%s", reason);
}
// this is the new tab (tab_add appended it as the last entry)
tab = &tab->u.tab.value[tab->u.tab.size - 1];
} else {
// Found: check for errors
tab = &tab->u.tab.value[j];
if (tab->flag & FLAG_EXPLICIT) {
/*
This is not OK:
[x.y.z]
[x.y.z]
but this is OK:
[x.y.z]
[x]
*/
return SETERROR(pp->ebuf, keylineno, "table defined more than once");
}
if (!(tab->flag & FLAG_STDEXPR)) {
/*
[t1] # OK
t2.t3.v = 0 # OK
[t1.t2] # should FAIL - t2 was non-explicit but was not
created by std-table-expr
*/
return SETERROR(pp->ebuf, keylineno, "table defined before");
}
}
// Set explicit flag on tab
tab->flag |= FLAG_EXPLICIT;
// Set tab as curtab of the parser
pp->curtab = tab;
return 0;
}
// Parse an array table expression, and set the curtab of the parser
// to the table referenced. An array table expression is a line
// like [[a.b.c.d]].
//
// tok must be the '[[' token that the caller already consumed.
// All keyparts except the last are resolved from pp->toptab, creating
// intermediate tables as needed; the last keypart must name (or becomes) an
// array, to which a new empty table is appended.
// Returns 0 on success, -1 on failure (error text goes to pp->ebuf).
//
// NOTE(review): unlike parse_std_table_expr(), no FLAG_INLINED check is made
// on `tab` before the tab_add() calls in this function. Confirm whether
// tab_add() itself refuses to extend inline tables.
static int parse_array_table_expr(parser_t *pp, token_t tok) {
// array-table = [[ key ]]
assert(tok.toktyp == TOK_LLBRACK); // [[ ate by caller
// Read the first keypart
DO(scan_key(&pp->scanner, &tok));
int keylineno = tok.lineno;
keypart_t keypart;
DO(parse_key(pp, tok, &keypart));
// eat the ]]
token_t rrb;
DO(scan_key(&pp->scanner, &rrb));
if (rrb.toktyp != TOK_RRBRACK) {
return SETERROR(pp->ebuf, rrb.lineno, "missing ']]'");
}
// remove the last keypart from keypart[]
span_t lastkeypart = keypart.span[--keypart.nspan];
// descend the key from the toptab
toml_datum_t *tab = &pp->toptab;
for (int i = 0; i < keypart.nspan; i++) {
span_t curkey = keypart.span[i];
int j = tab_find(tab, curkey);
if (j < 0) {
// If not found: add a new (key,tab) pair
const char *reason;
toml_datum_t newtab = mkdatum(TOML_TABLE);
newtab.flag |= FLAG_STDEXPR;
if (tab_add(tab, curkey, newtab, &reason)) {
return SETERROR(pp->ebuf, keylineno, "%s", reason);
}
// continue the descent inside the table that was just added
tab = &tab->u.tab.value[tab->u.tab.size - 1];
continue;
}
// Found: get the value
toml_datum_t *value = &tab->u.tab.value[j];
// If value is table, then point to that table and continue descent.
if (value->type == TOML_TABLE) {
tab = value;
continue;
}
// If value is an array of table, point to the last element of the array and
// continue descent.
if (value->type == TOML_ARRAY) {
if (value->flag & FLAG_INLINED) {
return SETERROR(pp->ebuf, keylineno, "cannot expand array %s",
curkey.ptr);
}
if (value->u.arr.size <= 0) {
return SETERROR(pp->ebuf, keylineno, "array %s has no elements",
curkey.ptr);
}
value = &value->u.arr.elem[value->u.arr.size - 1];
if (value->type != TOML_TABLE) {
return SETERROR(pp->ebuf, keylineno, "array %s must be array of tables",
curkey.ptr);
}
tab = value;
continue;
}
// The key exists but is neither a table nor an array: cannot descend.
return SETERROR(pp->ebuf, keylineno, "cannot locate table at key %s",
curkey.ptr);
}
// For the final keypart, make sure entry at key is an array of tables
const char *reason;
int idx = tab_find(tab, lastkeypart);
if (idx == -1) {
// If not found, add an array of table.
if (tab_add(tab, lastkeypart, mkdatum(TOML_ARRAY), &reason)) {
return SETERROR(pp->ebuf, keylineno, "%s", reason);
}
// look the key up again to obtain the index of the new entry
idx = tab_find(tab, lastkeypart);
assert(idx >= 0);
}
// Check that this is an array.
if (tab->u.tab.value[idx].type != TOML_ARRAY) {
return SETERROR(pp->ebuf, keylineno, "entry must be an array");
}
// Add an empty table to the array
toml_datum_t *arr = &tab->u.tab.value[idx];
if (arr->flag & FLAG_INLINED) {
return SETERROR(pp->ebuf, keylineno, "cannot extend a static array");
}
toml_datum_t *pelem = arr_emplace(arr, &reason);
if (!pelem) {
return SETERROR(pp->ebuf, keylineno, "%s", reason);
}
*pelem = mkdatum(TOML_TABLE);
// Set the last element of this array as curtab of the parser
pp->curtab = &arr->u.arr.elem[arr->u.arr.size - 1];
assert(pp->curtab->type == TOML_TABLE);
return 0;
}
// Parse a key/value expression of the form `key = value`, where key may be
// dotted (a.b.c). A toml doc is just a list of expressions.
//
// tok is the first token of the key. Leading keyparts are resolved relative
// to pp->curtab, creating intermediate tables when they do not exist; the
// value is then stored under the last keypart.
// Returns 0 on success, -1 on failure (error text goes to pp->ebuf).
//
// NOTE(review): FLAG_INLINED is only tested on the final table after the
// descent loop, so an intermediate table may be added by tab_add() before
// that test runs. Confirm whether tab_add() guards inline tables itself.
static int parse_keyvalue_expr(parser_t *pp, token_t tok) {
// Obtain the key
int keylineno = tok.lineno;
keypart_t keypart;
DO(parse_key(pp, tok, &keypart));
// match the '='
DO(scan_key(&pp->scanner, &tok));
if (tok.toktyp != TOK_EQUAL) {
return SETERROR(pp->ebuf, tok.lineno, "expect '='");
}
// Locate the last table using keypart[]
const char *reason;
toml_datum_t *tab = pp->curtab;
for (int i = 0; i < keypart.nspan - 1; i++) {
int j = tab_find(tab, keypart.span[i]);
if (j < 0) {
// Missing intermediate table. For i == 0 `tab` is curtab itself, which
// may always receive new keys; deeper explicit tables may not.
if (i > 0 && (tab->flag & FLAG_EXPLICIT)) {
return SETERROR(
pp->ebuf, keylineno,
"cannot extend a previously defined table using dotted expression");
}
toml_datum_t newtab = mkdatum(TOML_TABLE);
if (tab_add(tab, keypart.span[i], newtab, &reason)) {
return SETERROR(pp->ebuf, keylineno, "%s", reason);
}
// descend into the table that was just added
tab = &tab->u.tab.value[tab->u.tab.size - 1];
continue;
}
toml_datum_t *value = &tab->u.tab.value[j];
if (value->type == TOML_TABLE) {
tab = value;
continue;
}
if (value->type == TOML_ARRAY) {
return SETERROR(pp->ebuf, keylineno,
"encountered previously declared array '%s'",
keypart.span[i].ptr);
}
return SETERROR(pp->ebuf, keylineno, "cannot locate table at '%s'",
keypart.span[i].ptr);
}
// Check for disallowed situations.
if (tab->flag & FLAG_INLINED) {
return SETERROR(pp->ebuf, keylineno, "inline table cannot be extended");
}
if (keypart.nspan > 1 && (tab->flag & FLAG_EXPLICIT)) {
return SETERROR(
pp->ebuf, keylineno,
"cannot extend a previously defined table using dotted expression");
}
// Obtain the value
DO(scan_value(&pp->scanner, &tok));
toml_datum_t newval = DATUM_ZERO;
if (parse_val(pp, tok, &newval)) {
// release whatever parse_val() may have partially built
datum_free(&newval);
return -1;
}
// Add a new key/value for tab.
if (tab_add(tab, keypart.span[keypart.nspan - 1], newval, &reason)) {
// the value was not stored in the table, so free it here
datum_free(&newval);
return SETERROR(pp->ebuf, keylineno, "%s", reason);
}
return 0;
}
// Normalize a LIT/STRING/MLSTRING/LITSTRING/MLLITSTRING token
//   -> unescape all escaped chars
//
// The token text is copied into a buffer allocated from pp->pool and
// NUL-terminated; *ret_span describes that buffer (len excludes the NUL).
// For TOK_STRING/TOK_MLSTRING the escape sequences are decoded in place,
// starting at the first backslash recorded by the scanner in tok.u.escp.
// Decoding in place is safe because every escape sequence is at least as
// long as the bytes it decodes to.
//
// \u / \U escapes must denote a Unicode scalar value: surrogates
// (U+D800..U+DFFF) and values above U+10FFFF are rejected.
//
// Returns 0 on success, -1 on failure (error text goes to pp->ebuf).
static int parse_norm(parser_t *pp, token_t tok, span_t *ret_span) {
  // Allocate a buffer to store the normalized string. Add one
  // extra-byte for terminating NUL.
  char *p = pool_alloc(pp->pool, tok.str.len + 1);
  if (!p) {
    return SETERROR(pp->ebuf, tok.lineno, "out of memory");
  }

  // Copy from token string into buffer
  memcpy(p, tok.str.ptr, tok.str.len);
  p[tok.str.len] = 0; // additional NUL term for safety
  ret_span->ptr = p;
  ret_span->len = tok.str.len;

  switch (tok.toktyp) {
  case TOK_LIT:
  case TOK_LITSTRING:
  case TOK_MLLITSTRING:
    // no need to handle escape chars
    return 0;
  case TOK_STRING:
  case TOK_MLSTRING:
    // need to handle escape chars
    break;
  default:
    return SETERROR(pp->ebuf, 0, "internal: arg must be a string");
  }

  // if there is no escape char, then done!
  if (!tok.u.escp) {
    return 0; // success
  }

  // p points to the backslash
  p += (tok.u.escp - tok.str.ptr);
  assert(p - ret_span->ptr == tok.u.escp - tok.str.ptr);
  assert(*p == '\\');

  // Normalize the escaped chars. p is the read cursor, dst the write cursor;
  // dst never overtakes p.
  char *dst = p;
  while (*p) {
    if (*p != '\\') {
      *dst++ = *p++;
      continue;
    }
    switch (p[1]) {
    case '"':
    case '\\':
      *dst++ = p[1];
      p += 2;
      continue;
    case 'b':
      *dst++ = '\b';
      p += 2;
      continue;
    case 't':
      *dst++ = '\t';
      p += 2;
      continue;
    case 'n':
      *dst++ = '\n';
      p += 2;
      continue;
    case 'f':
      *dst++ = '\f';
      p += 2;
      continue;
    case 'r':
      *dst++ = '\r';
      p += 2;
      continue;
    case 'e':
      *dst++ = '\033';
      p += 2;
      continue;
    case 'x': {
      char buf[3];
      memcpy(buf, p + 2, 2);
      buf[2] = 0;
      // There is no need to check for two hex digits here because
      // the scanner already checked it.
      uint32_t ucs = (uint32_t)strtoul(buf, 0, 16);
      int n = ucs_to_utf8(ucs, dst);
      if (n < 0) {
        return SETERROR(pp->ebuf, tok.lineno, "error converting UCS %s to UTF8",
                        buf);
      }
      dst += n;
      p += 2 + 2; // \xNN
      continue;
    }
    case 'u':
    case 'U': {
      char buf[9];
      int sz = (p[1] == 'u' ? 4 : 8);
      memcpy(buf, p + 2, sz);
      buf[sz] = 0;
      // There is no need to check for 4 or 8 hex digits here because
      // the scanner already checked it.
      // Parse as unsigned so that 8 hex digits (up to 0xFFFFFFFF) are never
      // narrowed into a signed 32-bit value.
      unsigned long ucs = strtoul(buf, 0, 16);
      if (0xD800 <= ucs && ucs <= 0xDFFF) {
        // explicitly prohibit surrogates (non-scalar unicode code point)
        return SETERROR(pp->ebuf, tok.lineno, "invalid UTF8 char \\u%04x",
                        (unsigned)ucs);
      }
      if (ucs > 0x10FFFF) {
        // beyond the last Unicode code point: not a scalar value
        return SETERROR(pp->ebuf, tok.lineno, "error converting UCS %s to UTF8",
                        buf);
      }
      int n = ucs_to_utf8((uint32_t)ucs, dst);
      if (n < 0) {
        return SETERROR(pp->ebuf, tok.lineno, "error converting UCS %s to UTF8",
                        buf);
      }
      dst += n;
      p += 2 + sz; // \uNNNN or \UNNNNNNNN
      continue;
    }
    case ' ':
    case '\t':
    case '\r':
      // line-ending backslash
      // --- allow for extra whitespace chars after backslash
      // --- skip until newline
      p++;                     // skip the escape char
      p += strspn(p, " \t\r"); // skip whitespaces
      if (*p != '\n') {
        return SETERROR(pp->ebuf, tok.lineno,
                        "unexpected char after line-ending backslash");
      }
      // fallthru
    case '\n':
      // skip all whitespaces including newline
      p++;
      p += strspn(p, " \t\r\n");
      continue;
    default:
      return SETERROR(pp->ebuf, tok.lineno,
                      "internal: unknown escape char \\%c", p[1]);
    }
  }
  *dst = 0;
  ret_span->len = dst - ret_span->ptr;
  return 0;
}
// ===================================================================
// == SCANNER SECTION
// ===================================================================
// Consume and return the next char of the input, or TOK_FIN when the
// scanner is at the end. A "\r\n" pair is consumed as one unit and reported
// as '\n'. The line counter advances whenever '\n' is returned.
static int scan_get(scanner_t *sp) {
  int ch = TOK_FIN;
  const char *next = sp->cur;
  if (next < sp->endp) {
    ch = *next++;
    const bool is_crlf = (ch == '\r') && (next < sp->endp) && (*next == '\n');
    if (is_crlf) {
      ch = *next++;
    }
  }
  sp->cur = next;
  if (ch == '\n') {
    sp->lineno += 1;
  }
  return ch;
}
// Peek (without consuming) and report whether the next char equals ch.
// When ch is '\n', a pending "\r\n" pair also counts as a match.
static inline bool scan_match(scanner_t *sp, int ch) {
  const char *at = sp->cur;
  const bool exact = (at < sp->endp) && (*at == ch);
  return exact || ((ch == '\n') && (at + 1 < sp->endp) && (at[0] == '\r') &&
                   (at[1] == '\n'));
}
// Peek and report whether the next char is one of the chars listed in the
// NUL-terminated string accept[].
static bool scan_matchany(scanner_t *sp, const char *accept) {
  const char *cand = accept;
  while (*cand != '\0') {
    if (scan_match(sp, *cand)) {
      return true;
    }
    cand++;
  }
  return false;
}
// Peek and report whether the next n chars are all equal to ch.
// ch must not be '\n' (the "\r\n" folding done by scan_match is not handled).
static inline bool scan_nmatch(scanner_t *sp, int ch, int n) {
  assert(ch != '\n'); // not handled
  // Compare against the remaining length instead of forming sp->cur + n,
  // which could point beyond the end of the buffer.
  if (sp->endp - sp->cur < n) {
    return false;
  }
  const char *p = sp->cur;
  int i;
  for (i = 0; i < n && p[i] == ch; i++)
    ;
  return i == n;
}
// Build a token of type typ that starts at the scanner's current position
// and carries the scanner's current line number. All other fields are zero.
static inline token_t mktoken(scanner_t *sp, toktyp_t typ) {
  token_t fresh = {.toktyp = typ, .lineno = sp->lineno};
  fresh.str.ptr = sp->cur;
  return fresh;
}
// Shorthand for the scan_* helpers. Each macro expands to a call that uses
// a variable named `sp`, so they can only be used where such a scanner
// pointer is in scope.
//   S_GET()      consume and return the next char
//   S_MATCH(ch)  peek: is the next char ch?
//   S_MATCHn(ch) peek: are the next n (3, 4 or 6) chars all ch?
#define S_GET() scan_get(sp)
#define S_MATCH(ch) scan_match(sp, (ch))
#define S_MATCH3(ch) scan_nmatch(sp, (ch), 3)
#define S_MATCH4(ch) scan_nmatch(sp, (ch), 4)
#define S_MATCH6(ch) scan_nmatch(sp, (ch), 6)
// Return true if ch is a printable ASCII char (0x20..0x7e) or has the 0x80
// bit set (a byte of a multi-byte UTF-8 sequence).
// Written as explicit range checks rather than isprint(): the result does
// not depend on the locale, and ch may be any int (isprint() is undefined
// for values that are neither EOF nor representable as unsigned char).
static inline bool is_valid_char(int ch) {
  return (0x20 <= ch && ch <= 0x7e) || (ch & 0x80);
}
// Return true if ch is an ASCII hexadecimal digit (0-9, a-f, A-F).
// Uses plain range checks so that any int is accepted safely: callers pass
// the raw result of S_GET(), which can be TOK_FIN or a negative char value,
// and toupper() is undefined for such arguments.
static inline bool is_hex_char(int ch) {
  return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') ||
         ('A' <= ch && ch <= 'F');
}
// Reset *sp and point it at the input src[0..len). The byte at src[len]
// must be NUL (asserted). Scanner errors are formatted into
// errbuf[0..errbufsz). Line numbering starts at 1.
static void scan_init(scanner_t *sp, const char *src, int len, char *errbuf,
                      int errbufsz) {
  memset(sp, 0, sizeof(*sp));
  sp->src = src;
  sp->cur = src;
  sp->endp = src + len;
  assert(src[len] == '\0');
  sp->lineno = 1;
  sp->ebuf = (ebuf_t){.ptr = errbuf, .len = errbufsz};
}
// Scan a multi-line basic string ("""...""") starting at the opening quotes.
// On success *tok is a TOK_MLSTRING whose str covers the raw text between
// the delimiters (a newline directly after the opening """ is skipped), and
// tok->u.escp points at the first backslash in that text, or is NULL when
// there is none. Escape sequences are only validated here, not decoded.
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_multiline_string(scanner_t *sp, token_t *tok) {
assert(S_MATCH3('"'));
S_GET(), S_GET(), S_GET(); // skip opening """
// According to spec: trim first newline after """
if (S_MATCH('\n')) {
S_GET();
}
*tok = mktoken(sp, TOK_MLSTRING);
// scan until terminating """
const char *escp = NULL;
while (1) {
if (S_MATCH3('"')) {
if (S_MATCH4('"')) {
// special case... """abcd """" -> (abcd ")
// but sequences of 3 or more double quotes are not allowed
if (S_MATCH6('"')) {
return SETERROR(sp->ebuf, sp->lineno,
"detected sequences of 3 or more double quotes");
} else {
; // no problem
// 4 or 5 quotes ahead: the S_GET() below consumes one of them as
// content and the loop re-evaluates the rest.
}
} else {
break; // found terminating """
}
}
int ch = S_GET();
if (ch == TOK_FIN) {
return SETERROR(sp->ebuf, sp->lineno, "unterminated \"\"\"");
}
// If non-escaped char ...
if (ch != '\\') {
if (!(is_valid_char(ch) || (ch && strchr(" \t\n", ch)))) {
return SETERROR(sp->ebuf, sp->lineno, "invalid char in string");
}
continue;
}
// ch is backslash
// remember only the first backslash of the string
if (!escp) {
escp = sp->cur - 1;
assert(*escp == '\\');
}
// handle escape char
ch = S_GET();
if (ch && strchr("btnfre\"\\", ch)) {
// skip \", \\, \b, \e, \f, \n, \r, \t
continue;
}
// top = number of hex digits required after \x, \u or \U (0 otherwise)
int top = 0;
switch (ch) {
case 'x':
top = 2;
break;
case 'u':
top = 4;
break;
case 'U':
top = 8;
break;
default:
break;
}
if (top) {
for (int i = 0; i < top; i++) {
if (!is_hex_char(S_GET())) {
return SETERROR(sp->ebuf, sp->lineno,
"expect %d hex digits after \\%c", top, ch);
}
}
continue;
}
// handle line-ending backslash
if (ch == ' ' || ch == '\t') {
// Although the spec does not allow for whitespace following a
// line-ending backslash, some standard tests expect it.
// Skip whitespace till EOL.
while (ch != TOK_FIN && ch && strchr(" \t", ch)) {
ch = S_GET();
}
if (ch != '\n') {
// Got a backslash followed by whitespace, followed by some char
// before newline
return SETERROR(sp->ebuf, sp->lineno, "bad escape char in string");
}
// fallthru
}
if (ch == '\n') {
// got a line-ending backslash
// - skip all whitespaces
while (scan_matchany(sp, " \t\n")) {
S_GET();
}
continue;
}
return SETERROR(sp->ebuf, sp->lineno, "bad escape char in string");
}
// The token text ends just before the closing """.
tok->str.len = sp->cur - tok->str.ptr;
tok->u.escp = escp;
assert(S_MATCH3('"'));
S_GET(), S_GET(), S_GET();
return 0;
}
// Scan a basic string ("...") starting at the opening quote. If the input
// actually starts with """, the work is delegated to
// scan_multiline_string().
// On success *tok is a TOK_STRING whose str covers the raw text between the
// quotes, and tok->u.escp points at the first backslash in that text, or is
// NULL when there is none. Escape sequences are only validated here, not
// decoded.
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_string(scanner_t *sp, token_t *tok) {
assert(S_MATCH('"'));
if (S_MATCH3('"')) {
return scan_multiline_string(sp, tok);
}
S_GET(); // skip opening "
// scan until closing "
*tok = mktoken(sp, TOK_STRING);
const char *escp = NULL;
while (!S_MATCH('"')) {
int ch = S_GET();
if (ch == TOK_FIN) {
return SETERROR(sp->ebuf, sp->lineno, "unterminated string");
}
// If non-escaped char ...
if (ch != '\\') {
if (!(is_valid_char(ch) || ch == ' ' || ch == '\t')) {
return SETERROR(sp->ebuf, sp->lineno, "invalid char in string");
}
continue;
}
// ch is backslash
// remember only the first backslash of the string
if (!escp) {
escp = sp->cur - 1;
assert(*escp == '\\');
}
// handle escape char
ch = S_GET();
if (ch && strchr("btnfre\"\\", ch)) {
// skip \b, \t, \n, \f, \r, \e, \", \\ .
continue;
}
// top = number of hex digits required after \x, \u or \U
int top = 0;
switch (ch) {
case 'x':
top = 2;
break;
case 'u':
top = 4;
break;
case 'U':
top = 8;
break;
default:
return SETERROR(sp->ebuf, sp->lineno, "bad escape char in string");
}
for (int i = 0; i < top; i++) {
if (!is_hex_char(S_GET())) {
return SETERROR(sp->ebuf, sp->lineno, "expect %d hex digits after \\%c",
top, ch);
}
}
}
// The token text ends just before the closing quote.
tok->str.len = sp->cur - tok->str.ptr;
tok->u.escp = escp;
assert(S_MATCH('"'));
S_GET(); // skip the terminating "
return 0;
}
// Scan a multi-line literal string ('''...''') starting at the opening
// quotes. On success *tok is a TOK_MLLITSTRING whose str covers the raw text
// between the delimiters (a newline directly after the opening ''' is
// skipped). No escape processing applies to literal strings.
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_multiline_litstring(scanner_t *sp, token_t *tok) {
assert(S_MATCH3('\''));
S_GET(), S_GET(), S_GET(); // skip opening '''
// According to spec: trim first newline after '''
if (S_MATCH('\n')) {
S_GET();
}
// scan until terminating '''
*tok = mktoken(sp, TOK_MLLITSTRING);
while (1) {
if (S_MATCH3('\'')) {
if (S_MATCH4('\'')) {
// special case... '''abcd '''' -> (abcd ')
// but sequences of 3 or more single quotes are not allowed
if (S_MATCH6('\'')) {
return SETERROR(sp->ebuf, sp->lineno,
"sequences of 3 or more single quotes");
} else {
; // no problem
// 4 or 5 quotes ahead: the S_GET() below consumes one of them as
// content and the loop re-evaluates the rest.
}
} else {
break; // found terminating '''
}
}
int ch = S_GET();
if (ch == TOK_FIN) {
return SETERROR(sp->ebuf, sp->lineno,
"unterminated multiline lit string");
}
if (!(is_valid_char(ch) || (ch && strchr(" \t\n", ch)))) {
return SETERROR(sp->ebuf, sp->lineno, "invalid char in string");
}
}
// The token text ends just before the closing '''.
tok->str.len = sp->cur - tok->str.ptr;
assert(S_MATCH3('\''));
S_GET(), S_GET(), S_GET();
return 0;
}
// Scan a literal string ('...') starting at the opening quote. Input that
// starts with ''' is handed to scan_multiline_litstring() instead.
// On success *tok is a TOK_LITSTRING whose str covers the raw text between
// the quotes. Returns 0 on success, -1 on error (message in sp->ebuf).
static int scan_litstring(scanner_t *sp, token_t *tok) {
  assert(S_MATCH('\''));
  if (S_MATCH3('\'')) {
    return scan_multiline_litstring(sp, tok);
  }

  S_GET(); // skip opening '
  *tok = mktoken(sp, TOK_LITSTRING);

  // consume chars up to (but not including) the closing '
  for (;;) {
    if (S_MATCH('\'')) {
      break;
    }
    const int c = S_GET();
    if (c == TOK_FIN) {
      return SETERROR(sp->ebuf, sp->lineno, "unterminated string");
    }
    if (c != '\t' && !is_valid_char(c)) {
      return SETERROR(sp->ebuf, sp->lineno, "invalid char in string");
    }
  }

  tok->str.len = sp->cur - tok->str.ptr;
  assert(S_MATCH('\''));
  S_GET(); // skip closing '
  return 0;
}
// Return true if (year, month, day) is a valid calendar date: year >= 1,
// month in 1..12, and day within the length of that month (February has 29
// days in leap years).
static bool is_valid_date(int year, int month, int day) {
  static const int month_len[12] = {31, 28, 31, 30, 31, 30,
                                    31, 31, 30, 31, 30, 31};
  if (year < 1 || month < 1 || month > 12) {
    return false;
  }
  const bool leap = (year % 400 == 0) || (year % 4 == 0 && year % 100 != 0);
  int last_day = month_len[month - 1];
  if (month == 2 && leap) {
    last_day++;
  }
  return day >= 1 && day <= last_day;
}
// Return true if hour is in 0..23, minute and sec are in 0..59, and usec is
// not negative.
static bool is_valid_time(int hour, int minute, int sec, int usec) {
  const bool hour_ok = (hour >= 0) && (hour <= 23);
  const bool minute_ok = (minute >= 0) && (minute <= 59);
  const bool sec_ok = (sec >= 0) && (sec <= 59);
  return hour_ok && minute_ok && sec_ok && (usec >= 0);
}
// Return true if the timezone offset, given in minutes east or west of UTC,
// has a magnitude whose hour part is in 0..23.
static bool is_valid_timezone(int minute) {
  const int magnitude = (minute < 0) ? -minute : minute;
  const int hours = magnitude / 60;
  const int rest = magnitude % 60;
  return (0 <= hours && hours <= 23) && (0 <= rest && rest < 60);
}
// Read an int (without signs) from the string p.
// Consumes the leading run of decimal digits, stores its value in *ret and
// returns the number of digits consumed (0 when p does not start with a
// digit, in which case *ret is set to 0).
// If the value does not fit in an int, returns 0 and leaves *ret untouched.
static int read_int(const char *p, int *ret) {
  const char *start = p;
  int val = 0;
  // Cast before isdigit(): passing a negative char value is undefined.
  for (; isdigit((unsigned char)*p); p++) {
    const int digit = *p - '0';
    // Check before multiplying so that the overflow is detected exactly,
    // instead of relying on the wrapped result happening to be negative.
    if (val > (INT_MAX - digit) / 10) {
      return 0; // overflowed
    }
    val = val * 10 + digit;
  }
  *ret = val;
  return (int)(p - start);
}
// Read a date as YYYY-MM-DD from p[]. Return #bytes consumed (always 10 on
// success), or 0 if p[] does not have exactly that shape. Field values are
// stored as they are read; no range checks are done here.
static int read_date(const char *p, int *year, int *month, int *day) {
  const char *cursor = p;

  // YYYY-
  if (read_int(cursor, year) != 4 || cursor[4] != '-') {
    return 0;
  }
  cursor += 5;

  // MM-
  if (read_int(cursor, month) != 2 || cursor[2] != '-') {
    return 0;
  }
  cursor += 3;

  // DD
  if (read_int(cursor, day) != 2) {
    return 0;
  }
  cursor += 2;

  assert(cursor - p == 10);
  return cursor - p;
}
// Read a time as HH:MM[:SS[.subsec]] from p[]. Return #bytes consumed, or 0
// if p[] is not a well-formed time.
//
// All outputs are reset to 0 first. Seconds are optional; when present they
// may be followed by '.' and one or more fractional digits. The first six
// fractional digits are stored in *usec (microseconds); any further digits
// are consumed and discarded, i.e. extra precision is truncated rather than
// rounded. No range checks are done here.
static int read_time(const char *p, int *hour, int *minute, int *second,
                     int *usec) {
  const char *pp = p;
  int n;
  *hour = *minute = *second = *usec = 0;

  // scan hours
  n = read_int(p, hour);
  if (n != 2 || p[2] != ':') {
    return 0;
  }
  p += 3;

  // scan minutes
  n = read_int(p, minute);
  if (n != 2) {
    return 0;
  }
  if (p[2] != ':') {
    // seconds are optional in v1.1
    p += 2;
    return p - pp;
  }
  p += 3;

  // scan seconds
  n = read_int(p, second);
  if (n != 2) {
    return 0;
  }
  p += 2;
  if (*p != '.') {
    return p - pp;
  }
  p++; // skip the period
  if (!isdigit((unsigned char)*p)) {
    // trailing period
    return 0;
  }

  // Accumulate up to six digits as microseconds.
  int micro_factor = 100000;
  while (isdigit((unsigned char)*p) && micro_factor) {
    *usec += (*p - '0') * micro_factor;
    micro_factor /= 10;
    p++;
  }
  // Precision beyond microseconds is still part of the token: consume the
  // remaining digits and drop them (truncate), so they are not left behind
  // in the input.
  while (isdigit((unsigned char)*p)) {
    p++;
  }
  return p - pp;
}
// Read a timezone designator from p[]: either 'Z'/'z' (reported as +00:00)
// or a sign followed by HH:MM. Return #bytes consumed, or 0 if p[] does not
// hold a timezone. tzhour and tzminute must be exactly two digits each.
// Note that *tzsign receives the first char of p[] even when 0 is returned.
static int read_tzone(const char *p, char *tzsign, int *tzhour, int *tzminute) {
  const char *start = p;

  // Default values
  *tzsign = '+';
  *tzhour = 0;
  *tzminute = 0;

  // Zulu time: done, tz is +00:00.
  if (*p == 'Z' || *p == 'z') {
    return 1;
  }

  // Sign
  const char sign = *p++;
  *tzsign = sign;
  if (sign != '+' && sign != '-') {
    return 0;
  }

  // HH:
  if (read_int(p, tzhour) != 2 || p[2] != ':') {
    return 0;
  }
  p += 3;

  // MM
  if (read_int(p, tzminute) != 2) {
    return 0;
  }
  p += 2;

  return p - start;
}
// Scan a local time (no date, no timezone) at the current position.
// On success *tok is a TOK_TIME whose tsval holds hour/minute/sec/usec, with
// year, month, day and tz set to -1, and the scanner is advanced past the
// time. Returns 0 on success, -1 on error (message in sp->ebuf).
static int scan_time(scanner_t *sp, token_t *tok) {
  const int lineno = sp->lineno;

  // Work on a local copy of the upcoming input.
  char buffer[20];
  scan_copystr(sp, buffer, sizeof(buffer));

  int hour, minute, sec, usec;
  const int len = read_time(buffer, &hour, &minute, &sec, &usec);
  if (len == 0) {
    return SETERROR(sp->ebuf, lineno, "invalid time");
  }
  if (!is_valid_time(hour, minute, sec, usec)) {
    return SETERROR(sp->ebuf, lineno, "invalid time");
  }

  *tok = mktoken(sp, TOK_TIME);
  tok->str.len = len;
  sp->cur += len;

  // time fields
  tok->u.tsval.hour = hour;
  tok->u.tsval.minute = minute;
  tok->u.tsval.sec = sec;
  tok->u.tsval.usec = usec;
  // fields that do not apply to a bare time
  tok->u.tsval.year = -1;
  tok->u.tsval.month = -1;
  tok->u.tsval.day = -1;
  tok->u.tsval.tz = -1;
  return 0;
}
// Scan a TIME, DATE, DATETIME or DATETIMETZ at the current position.
// The token type reflects how much was recognized: a date alone is
// TOK_DATE, a date followed by a time is TOK_DATETIME, and a trailing
// timezone makes it TOK_DATETIMETZ. Components that are absent are stored
// as -1 in tok->u.tsval; tz is stored as a signed offset in minutes.
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_timestamp(scanner_t *sp, token_t *tok) {
int year, month, day, hour, minute, sec, usec, tz;
year = month = day = hour = minute = sec = usec = tz = -1;
int n;
// make a copy of sp->cur into buffer to ensure NUL terminated string
char buffer[80];
scan_copystr(sp, buffer, sizeof(buffer));
toktyp_t toktyp = TOK_FIN;
int lineno = sp->lineno;
// See if this a TIME only
const char *p = buffer;
if (isdigit(p[0]) && isdigit(p[1]) && p[2] == ':') {
n = read_time(buffer, &hour, &minute, &sec, &usec);
if (!n) {
return SETERROR(sp->ebuf, lineno, "invalid time");
}
toktyp = TOK_TIME;
p += n;
goto done;
}
// Try reading a DATE
n = read_date(p, &year, &month, &day);
if (!n) {
return SETERROR(sp->ebuf, lineno, "invalid date");
}
toktyp = TOK_DATE;
p += n;
// Check if there is no time component in addition
// (a time component is a 'T', 't' or space separator followed by "HH:")
if (!((p[0] == 'T' || p[0] == ' ' || p[0] == 't') && isdigit(p[1]) &&
isdigit(p[2]) && p[3] == ':')) {
goto done; // no TIME component. we are done.
}
// Read the TIME
// (p += 1 skips the separator between date and time)
n = read_time(p += 1, &hour, &minute, &sec, &usec);
if (!n) {
return SETERROR(sp->ebuf, lineno, "invalid timestamp");
}
toktyp = TOK_DATETIME;
p += n;
// Read the (optional) timezone
char tzsign;
int tzhour, tzminute;
n = read_tzone(p, &tzsign, &tzhour, &tzminute);
if (n == 0) {
goto done; // datetime only
}
toktyp = TOK_DATETIMETZ;
p += n;
// Check tzminute range. This must be done here instead of is_valid_timezone()
// because we combine tzhour and tzminute into tz (by minutes only).
if (!(0 <= tzminute && tzminute < 60)) {
return SETERROR(sp->ebuf, lineno, "invalid timezone");
}
tz = (tzhour * 60 + tzminute) * (tzsign == '-' ? -1 : 1);
goto done; // datetimetz
done:
// Build the token and advance the scanner by the number of bytes that were
// consumed from the local copy.
*tok = mktoken(sp, toktyp);
n = p - buffer;
tok->str.len = n;
sp->cur += n;
tok->u.tsval.year = year;
tok->u.tsval.month = month;
tok->u.tsval.day = day;
tok->u.tsval.hour = hour;
tok->u.tsval.minute = minute;
tok->u.tsval.sec = sec;
tok->u.tsval.usec = usec;
tok->u.tsval.tz = tz;
// Do some error checks based on type
switch (tok->toktyp) {
case TOK_TIME:
if (!is_valid_time(hour, minute, sec, usec)) {
return SETERROR(sp->ebuf, lineno, "invalid time");
}
break;
case TOK_DATE:
if (!is_valid_date(year, month, day)) {
return SETERROR(sp->ebuf, lineno, "invalid date");
}
break;
case TOK_DATETIME:
case TOK_DATETIMETZ:
if (!is_valid_date(year, month, day)) {
return SETERROR(sp->ebuf, lineno, "invalid date");
}
if (!is_valid_time(hour, minute, sec, usec)) {
return SETERROR(sp->ebuf, lineno, "invalid time");
}
if (tok->toktyp == TOK_DATETIMETZ && !is_valid_timezone(tz)) {
return SETERROR(sp->ebuf, lineno, "invalid timezone");
}
break;
default:
assert(0);
return SETERROR(sp->ebuf, lineno, "internal error");
}
return 0;
}
// Given a toml number (int and float) in buffer[]:
// 1. squeeze out '_'
// 2. check for syntax restrictions
//
// buffer[] is modified in place: underscores are removed and the uppercase
// letters A-Z are converted to lowercase. base is the radix of the number
// (callers pass 2, 8, 10 or 16).
// Returns 0 on success. Returns -1 on a syntax violation, with *reason set
// to a static message describing it.
static int process_numstr(char *buffer, int base, const char **reason) {
// squeeze out _
// q is the write cursor, i the read index; q never runs ahead of i.
char *q = strchr(buffer, '_');
if (q) {
for (int i = q - buffer; buffer[i]; i++) {
if (buffer[i] != '_') {
*q++ = buffer[i];
continue;
}
// buffer[i] is an underscore: both neighbours must be digits (hex digits
// are also accepted when base is 16). A leading underscore has no left
// neighbour and is therefore rejected.
int left = (i == 0) ? 0 : buffer[i - 1];
int right = buffer[i + 1];
if (!isdigit(left) && !(base == 16 && is_hex_char(left))) {
*reason = "underscore only allowed between digits";
return -1;
}
if (!isdigit(right) && !(base == 16 && is_hex_char(right))) {
*reason = "underscore only allowed between digits";
return -1;
}
}
*q = 0;
}
// decimal points must be surrounded by digits. Also, convert to lowercase.
for (int i = 0; buffer[i]; i++) {
if (buffer[i] == '.') {
if (i == 0 || !isdigit(buffer[i - 1]) || !isdigit(buffer[i + 1])) {
*reason = "decimal point must be surrounded by digits";
return -1;
}
} else if ('A' <= buffer[i] && buffer[i] <= 'Z') {
buffer[i] = tolower(buffer[i]);
}
}
if (base == 10) {
// check for leading 0: '+01' is an error!
q = buffer;
q += (*q == '+' || *q == '-') ? 1 : 0;
if (q[0] == '0' && isdigit(q[1])) {
*reason = "leading 0 in numbers";
return -1;
}
}
return 0;
}
// Scan a float at the current position: an optionally signed "nan" or "inf",
// or a run of the chars "_0123456789eE.+-" that strtod() accepts in full
// once process_numstr() has validated and cleaned it.
// On success *tok is a TOK_FLOAT carrying the value in u.fp64 and the
// scanner is advanced past the number.
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_float(scanner_t *sp, token_t *tok) {
char buffer[50]; // need to accommodate "9_007_199_254_740_991.0"
scan_copystr(sp, buffer, sizeof(buffer));
int lineno = sp->lineno;
char *p = buffer;
p += (*p == '+' || *p == '-') ? 1 : 0;
if (0 == memcmp(p, "nan", 3) || (0 == memcmp(p, "inf", 3))) {
p += 3;
} else {
p += strspn(p, "_0123456789eE.+-");
}
// len is the length of the number as written in the input (underscores
// included); it is what the scanner advances by below.
int len = p - buffer;
buffer[len] = 0;
const char *reason;
if (process_numstr(buffer, 10, &reason)) {
return SETERROR(sp->ebuf, lineno, "%s", reason);
}
errno = 0;
char *q;
double fp64 = strtod(buffer, &q);
// fail on a range error, on trailing unparsed chars, or if nothing parsed
if (errno || *q || q == buffer) {
return SETERROR(sp->ebuf, lineno, "error parsing float");
}
*tok = mktoken(sp, TOK_FLOAT);
tok->u.fp64 = fp64;
tok->str.len = len;
sp->cur += len;
return 0;
}
// Scan a number (integer or float) at the current position.
//   - "0x", "0o" and "0b" prefixes select a hex, octal or binary integer.
//   - An optionally signed text starting with 'i' or 'n' (inf/nan) is handed
//     to scan_float().
//   - Anything else is first tried as a decimal integer; if strtoll() stops
//     at '.', 'e' or 'E', the text is rescanned by scan_float().
// On success *tok is a TOK_INTEGER (value in u.int64) or whatever
// scan_float() produced, and the scanner is advanced past the number.
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_number(scanner_t *sp, token_t *tok) {
const char *reason;
char buffer[50]; // need to accommodate "9_007_199_254_740_991.0"
scan_copystr(sp, buffer, sizeof(buffer));
char *p = buffer;
int lineno = sp->lineno;
// process %0x, %0o or %0b integers
if (p[0] == '0') {
const char *span = 0;
int base = 0;
switch (p[1]) {
case 'x':
base = 16;
span = "_0123456789abcdefABCDEF";
break;
case 'o':
base = 8;
span = "_01234567";
break;
case 'b':
base = 2;
span = "_01";
break;
}
if (base) {
p += 2;
p += strspn(p, span);
int len = p - buffer;
buffer[len] = 0;
// validate and clean the digits that follow the two-char prefix
if (process_numstr(buffer + 2, base, &reason)) {
return SETERROR(sp->ebuf, lineno, "%s", reason);
}
// use strtoll to obtain the value
*tok = mktoken(sp, TOK_INTEGER);
char *q;
errno = 0;
tok->u.int64 = strtoll(buffer + 2, &q, base);
if (errno || *q || q == buffer + 2) {
return SETERROR(sp->ebuf, lineno, "error parsing integer");
}
tok->str.len = len;
sp->cur += len;
return 0;
}
}
// handle inf/nan
if (*p == '+' || *p == '-') {
p++;
}
if (*p == 'i' || *p == 'n') {
return scan_float(sp, tok);
}
// regular int or float
p = buffer;
p += strspn(p, "0123456789_+-.eE");
int len = p - buffer;
buffer[len] = 0;
if (process_numstr(buffer, 10, &reason)) {
return SETERROR(sp->ebuf, lineno, "%s", reason);
}
*tok = mktoken(sp, TOK_INTEGER);
char *q;
errno = 0;
tok->u.int64 = strtoll(buffer, &q, 10);
if (errno || *q || q == buffer) {
// Not a plain integer. If strtoll() stopped at a float-only char, rescan
// as float; the scanner position has not been advanced yet.
if (*q && strchr(".eE", *q)) {
return scan_float(sp, tok); // try to fit a float
}
return SETERROR(sp->ebuf, lineno, "error parsing integer");
}
tok->str.len = len;
sp->cur += len;
return 0;
}
// Scan the literal `true` or `false` at the current position. The literal
// must be followed by the end of the copied input or by one of the chars
// "# \r\n\t,}]". On success *tok is a TOK_BOOL carrying the value in u.b1
// and the scanner is advanced past the literal.
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_bool(scanner_t *sp, token_t *tok) {
  char buffer[10];
  scan_copystr(sp, buffer, sizeof(buffer));
  const int lineno = sp->lineno;

  bool val;
  int len;
  if (strncmp(buffer, "true", 4) == 0) {
    val = true;
    len = 4;
  } else if (strncmp(buffer, "false", 5) == 0) {
    val = false;
    len = 5;
  } else {
    return SETERROR(sp->ebuf, lineno, "invalid boolean value");
  }

  // The char right after the literal must terminate the value.
  const char follow = buffer[len];
  if (follow != '\0' && strchr("# \r\n\t,}]", follow) == NULL) {
    return SETERROR(sp->ebuf, lineno, "invalid boolean value");
  }

  *tok = mktoken(sp, TOK_BOOL);
  tok->u.b1 = val;
  tok->str.len = len;
  sp->cur += len;
  return 0;
}
// Check if the next token may be TIME, i.e. the input p[0..endp) starts with
// two digits followed by ':'.
// The chars are cast to unsigned char before isdigit() because p may point
// at arbitrary bytes (e.g. UTF-8), and isdigit() is undefined for negative
// char values.
static inline bool test_time(const char *p, const char *endp) {
  return endp - p > 2 && isdigit((unsigned char)p[0]) &&
         isdigit((unsigned char)p[1]) && p[2] == ':';
}
// Check if the next token may be DATE, i.e. the input p[0..endp) starts with
// four digits followed by '-'.
// The remaining length is compared directly (rather than forming &p[4],
// which may lie beyond the buffer), and chars are cast to unsigned char
// before isdigit() because p may point at arbitrary bytes.
static inline bool test_date(const char *p, const char *endp) {
  return endp - p > 4 && isdigit((unsigned char)p[0]) &&
         isdigit((unsigned char)p[1]) && isdigit((unsigned char)p[2]) &&
         isdigit((unsigned char)p[3]) && p[4] == '-';
}
// Check if the next token may be BOOL, i.e. the input p[0..endp) is not
// empty and starts with 't' or 'f'.
static inline bool test_bool(const char *p, const char *endp) {
  if (p >= endp) {
    return false;
  }
  switch (*p) {
  case 't':
  case 'f':
    return true;
  default:
    return false;
  }
}
// Check if the next token may be NUMBER: the input p[0..endp) either starts
// with one of the chars "0123456789+-._", or holds at least three chars and
// starts with "nan" or "inf".
static bool test_number(const char *p, const char *endp) {
  const bool nonempty = p < endp;
  if (nonempty && *p != '\0' && strchr("0123456789+-._", *p) != NULL) {
    return true;
  }
  return (&p[2] < endp) &&
         (memcmp(p, "nan", 3) == 0 || memcmp(p, "inf", 3) == 0);
}
// Scan a literal that is not a string. The upcoming input is classified by
// a cheap look-ahead and handed to the matching scanner; the checks are
// ordered so that times and dates are recognized before plain numbers.
// Returns whatever the chosen scanner returns, or -1 with "invalid value"
// if nothing matches.
static int scan_nonstring_literal(scanner_t *sp, token_t *tok) {
  const int lineno = sp->lineno;
  const char *at = sp->cur;
  const char *end = sp->endp;

  if (test_time(at, end)) {
    return scan_time(sp, tok);
  } else if (test_date(at, end)) {
    return scan_timestamp(sp, tok);
  } else if (test_bool(at, end)) {
    return scan_bool(sp, tok);
  } else if (test_number(at, end)) {
    return scan_number(sp, tok);
  }
  return SETERROR(sp->ebuf, lineno, "invalid value");
}
// Scan a literal: the longest run of alphanumeric chars, '_' and '-' at the
// current position. *tok becomes a TOK_LIT covering that run (which may be
// empty) and the scanner is advanced past it. Always returns 0.
static int scan_literal(scanner_t *sp, token_t *tok) {
  *tok = mktoken(sp, TOK_LIT);
  const char *p = sp->cur;
  // Cast before isalnum(): the input may contain bytes >= 0x80, which are
  // negative when char is signed, and isalnum() is undefined for those.
  while (p < sp->endp &&
         (isalnum((unsigned char)*p) || *p == '_' || *p == '-')) {
    p++;
  }
  tok->str.len = p - tok->str.ptr;
  sp->cur = p;
  return 0;
}
// Capture the scanner's current position and line number so that they can
// later be reinstated with scan_restore().
static scanner_state_t scan_mark(scanner_t *sp) {
  scanner_state_t saved = {.sp = sp, .cur = sp->cur, .lineno = sp->lineno};
  return saved;
}
// Rewind the scanner to the position and line number captured in mark.
// mark must have been taken from this same scanner (asserted).
static void scan_restore(scanner_t *sp, scanner_state_t mark) {
  assert(mark.sp == sp);
  sp->lineno = mark.lineno;
  sp->cur = mark.cur;
}
// Return the next token in *tok.
// Spaces, tabs and comments are skipped. At the end of input the token type
// is TOK_FIN. keymode selects how ambiguous input is read:
//   - in keymode, "[[" and "]]" are single tokens and bare text is scanned
//     by scan_literal();
//   - otherwise '[' and ']' are always single-char tokens and bare text is
//     scanned by scan_nonstring_literal().
// Returns 0 on success, -1 on error (message formatted into sp->ebuf).
static int scan_next(scanner_t *sp, bool keymode, token_t *tok) {
// chars that map directly to a token type; all other entries are 0
static const toktyp_t map[128] = {
['\n'] = TOK_ENDL, ['.'] = TOK_DOT, ['='] = TOK_EQUAL,
[','] = TOK_COMMA, ['{'] = TOK_LBRACE, ['}'] = TOK_RBRACE};
again:
*tok = mktoken(sp, TOK_FIN);
int ch = S_GET();
if (ch == TOK_FIN) {
return 0;
}
tok->str.len = 1;
if (0 <= ch && ch < 128 && map[ch]) {
// map simple char to token type and done
tok->toktyp = map[ch];
return 0;
}
// handle char that require logic
switch (ch) {
case ' ':
case '\t':
goto again; // skip whitespace
case '#':
// comment: skip until newline
// (the newline itself is not consumed by this loop)
while (!S_MATCH('\n')) {
ch = S_GET();
if (ch == TOK_FIN)
break;
// reject control chars; tab (0x09) is the only one permitted
if ((0 <= ch && ch <= 0x8) || (0x0a <= ch && ch <= 0x1f) ||
(ch == 0x7f)) {
return SETERROR(sp->ebuf, sp->lineno, "bad control char in comment");
}
}
goto again; // skip comment
case '[':
tok->toktyp = TOK_LBRACK;
if (keymode && S_MATCH('[')) {
S_GET();
tok->toktyp = TOK_LLBRACK;
tok->str.len = 2;
}
break;
case ']':
tok->toktyp = TOK_RBRACK;
if (keymode && S_MATCH(']')) {
S_GET();
tok->toktyp = TOK_RRBRACK;
tok->str.len = 2;
}
break;
case '"':
// un-get the quote: scan_string() expects to start on it
sp->cur--;
DO(scan_string(sp, tok));
break;
case '\'':
// un-get the quote: scan_litstring() expects to start on it
sp->cur--;
DO(scan_litstring(sp, tok));
break;
default:
// un-get the char so that the literal scanners see it
sp->cur--;
DO(keymode ? scan_literal(sp, tok) : scan_nonstring_literal(sp, tok));
break;
}
return 0;
}
// Check for stack overflow due to excessive number of brackets or braces.
// Opening '[' / '{' tokens raise the matching nesting counter in *sp and
// closing ']' / '}' tokens lower it; other tokens are ignored.
// Returns 0 normally, or -1 with "stack overflow" in sp->ebuf when a counter
// exceeds BRACKET_LEVEL_MAX / BRACE_LEVEL_MAX.
static int check_overflow(scanner_t *sp, token_t *tok) {
  const toktyp_t kind = tok->toktyp;

  if (kind == TOK_LBRACK) {
    sp->bracket_level++;
    if (sp->bracket_level > BRACKET_LEVEL_MAX) {
      return SETERROR(sp->ebuf, sp->lineno, "stack overflow");
    }
  } else if (kind == TOK_RBRACK) {
    sp->bracket_level--;
  } else if (kind == TOK_LBRACE) {
    sp->brace_level++;
    if (sp->brace_level > BRACE_LEVEL_MAX) {
      return SETERROR(sp->ebuf, sp->lineno, "stack overflow");
    }
  } else if (kind == TOK_RBRACE) {
    sp->brace_level--;
  }
  return 0;
}
// Scan the next token in key mode (see scan_next) and apply the nesting
// check. Once any scan has failed, sp->errmsg is set and every later call
// returns -1 immediately without scanning.
// Returns 0 on success, -1 on error.
static int scan_key(scanner_t *sp, token_t *tok) {
  if (sp->errmsg) {
    return -1;
  }
  int rc = scan_next(sp, true, tok);
  if (rc == 0) {
    rc = check_overflow(sp, tok);
  }
  if (rc != 0) {
    sp->errmsg = sp->ebuf.ptr;
    return -1;
  }
  return 0;
}
// Scan the next token in value mode (see scan_next) and apply the nesting
// check. Once any scan has failed, sp->errmsg is set and every later call
// returns -1 immediately without scanning.
// Returns 0 on success, -1 on error.
static int scan_value(scanner_t *sp, token_t *tok) {
  if (sp->errmsg) {
    return -1;
  }
  int rc = scan_next(sp, false, tok);
  if (rc == 0) {
    rc = check_overflow(sp, tok);
  }
  if (rc != 0) {
    sp->errmsg = sp->ebuf.ptr;
    return -1;
  }
  return 0;
}
/**
 * Convert a char in utf8 into UCS, and store it in *ret.
 * Return #bytes consumed or -1 on failure.
 *
 * orig points at the first byte of the sequence and len is the number of
 * bytes available. Sequences of 1 to 4 bytes are supported:
 *
 *   0x00000000 - 0x0000007F:  0xxxxxxx
 *   0x00000080 - 0x000007FF:  110xxxxx 10xxxxxx
 *   0x00000800 - 0x0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
 *   0x00010000 - 0x001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Failure (-1) is returned when len is too short for the sequence, when the
 * lead byte is not one of the forms above (5- and 6-byte forms are not
 * supported), or when a continuation byte is not of the form 10xxxxxx.
 * *ret is only written on success.
 *
 * Note: only the byte structure is validated; overlong encodings are not
 * rejected.
 */
static int utf8_to_ucs(const char *orig, int len, uint32_t *ret) {
  // Validate len before touching orig[0].
  if (len < 1) {
    return -1;
  }
  const unsigned char *buf = (const unsigned char *)orig;
  const unsigned lead = buf[0];

  // Classify the lead byte: total sequence length and its payload bits.
  int need;
  uint32_t v;
  if (0 == (lead >> 7)) {
    *ret = lead;
    return 1;
  } else if (0x6 == (lead >> 5)) {
    need = 2;
    v = lead & 0x1f;
  } else if (0xE == (lead >> 4)) {
    need = 3;
    v = lead & 0x0f;
  } else if (0x1E == (lead >> 3)) {
    need = 4;
    v = lead & 0x07;
  } else {
    return -1;
  }
  if (len < need) {
    return -1;
  }

  // Fold in the continuation bytes, 6 payload bits each.
  for (int j = 1; j < need; j++) {
    const unsigned cont = buf[j];
    if (0x2 != (cont >> 6)) {
      return -1;
    }
    v = (v << 6) | (cont & 0x3f);
  }
  *ret = v;
  return need;
}
/**
 * Encode a UCS code point as UTF-8.
 *
 * @param code  The code point to encode.
 * @param buf   Output buffer; the caller must provide room for at least
 *              4 bytes. No NUL terminator is written.
 * @return      Number of bytes written to buf (1..4), or -1 if code is
 *              above U+10FFFF and therefore cannot be encoded.
 *
 * The UCS code values 0xd800-0xdfff (UTF-16 surrogates) as well as 0xfffe
 * and 0xffff (UCS noncharacters) should not appear in conforming UTF-8
 * streams. However, they are implicitly allowed by TOML, see
 *   https://github.com/toml-lang/toml-test/issues/165
 * so they are deliberately NOT rejected here and are encoded with the
 * regular 3-byte form.
 */
static int ucs_to_utf8(uint32_t code, char buf[4]) {
  // Write through unsigned char so that storing byte values above 0x7f
  // does not depend on the signedness of plain char.
  unsigned char *out = (unsigned char *)buf;

  /* 0x00000000 - 0x0000007F:
     0xxxxxxx
  */
  if (code <= 0x7F) {
    out[0] = (unsigned char)code;
    return 1;
  }

  /* 0x00000080 - 0x000007FF:
     110xxxxx 10xxxxxx
  */
  if (code <= 0x000007FF) {
    out[0] = (unsigned char)(0xc0 | (code >> 6));
    out[1] = (unsigned char)(0x80 | (code & 0x3f));
    return 2;
  }

  /* 0x00000800 - 0x0000FFFF:
     1110xxxx 10xxxxxx 10xxxxxx
  */
  if (code <= 0x0000FFFF) {
    out[0] = (unsigned char)(0xe0 | (code >> 12));
    out[1] = (unsigned char)(0x80 | ((code >> 6) & 0x3f));
    out[2] = (unsigned char)(0x80 | (code & 0x3f));
    return 3;
  }

  /* 0x00010000 - 0x0010FFFF:
     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

     The 4-byte form could carry values up to 0x1FFFFF, but Unicode ends
     at 0x10FFFF, so anything above that is refused.
  */
  if (code <= 0x0010FFFF) {
    out[0] = (unsigned char)(0xf0 | (code >> 18));
    out[1] = (unsigned char)(0x80 | ((code >> 12) & 0x3f));
    out[2] = (unsigned char)(0x80 | ((code >> 6) & 0x3f));
    out[3] = (unsigned char)(0x80 | (code & 0x3f));
    return 4;
  }

  // 5- and 6-byte forms are not supported (and would not fit in buf).
  return -1;
}