#include "dtob_internal.h"
#include "html_rules.h"
#include <strings.h>
/* Read cursor over an in-memory markup document, shared by the XML and HTML
 * parsing paths in this file. Field order matters: parse_markup() fills the
 * struct with a positional initializer. */
typedef struct {
const char *buf; /* input text being parsed */
size_t pos; /* index of the next unread byte in buf */
size_t len; /* number of bytes in buf */
int html; /* nonzero: apply the HTML-specific rules; zero: plain XML */
} XmlReader;
/* Nonzero once the cursor has reached (or passed) the end of the input. */
static int xr_eof(XmlReader *r)
{
    return r->len <= r->pos;
}
/* Return the byte under the cursor without advancing; '\0' at end of input. */
static char xr_peek(XmlReader *r)
{
    if (xr_eof(r)) {
        return '\0';
    }
    return r->buf[r->pos];
}
/* Return the byte under the cursor and step past it; at end of input the
 * cursor is left alone and '\0' is returned. */
static char xr_consume(XmlReader *r)
{
    if (xr_eof(r)) {
        return '\0';
    }
    char taken = r->buf[r->pos];
    r->pos++;
    return taken;
}
/* Advance the cursor past any run of spaces, tabs, line feeds and carriage
 * returns. Stops at the first other byte or at end of input. */
static void xr_skip_ws(XmlReader *r)
{
    for (; !xr_eof(r); r->pos++) {
        switch (r->buf[r->pos]) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
            continue; /* still whitespace: let the for-step advance */
        default:
            return;
        }
    }
}
/* If the unread input begins with the literal s, step over it and return 1.
 * Otherwise leave the cursor untouched and return 0. */
static int xr_match(XmlReader *r, const char *s)
{
    const size_t n = strlen(s);
    const int hit = r->pos + n <= r->len &&
                    memcmp(r->buf + r->pos, s, n) == 0;

    if (hit) {
        r->pos += n;
    }
    return hit;
}
/* Report (1/0) whether the unread input begins with the literal s.
 * Pure lookahead: the cursor never moves. */
static int xr_starts_with(XmlReader *r, const char *s)
{
    const size_t n = strlen(s);

    if (r->len < r->pos + n) {
        return 0; /* not enough input left to hold s */
    }
    return !memcmp(r->buf + r->pos, s, n);
}
/*
 * Encode the Unicode scalar value cp as UTF-8 into out.
 *
 * out must have room for at least 4 bytes; no terminator is written.
 * Returns the number of bytes written (1..4), or 0 when cp cannot be encoded:
 * values at or above 0x110000 and the surrogate range U+D800..U+DFFF, which
 * is not valid in UTF-8. Nothing is written in the 0 case.
 */
static size_t utf8_encode(uint32_t cp, char *out)
{
    /* Surrogates are UTF-16 artifacts; emitting them yields ill-formed UTF-8. */
    if (cp >= 0xD800 && cp <= 0xDFFF) {
        return 0;
    }
    if (cp < 0x80) {
        out[0] = (char)cp;
        return 1;
    }
    if (cp < 0x800) {
        out[0] = (char)(0xC0 | (cp >> 6));
        out[1] = (char)(0x80 | (cp & 0x3F));
        return 2;
    }
    if (cp < 0x10000) {
        out[0] = (char)(0xE0 | (cp >> 12));
        out[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
        out[2] = (char)(0x80 | (cp & 0x3F));
        return 3;
    }
    if (cp < 0x110000) {
        out[0] = (char)(0xF0 | (cp >> 18));
        out[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
        out[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
        out[3] = (char)(0x80 | (cp & 0x3F));
        return 4;
    }
    return 0;
}
/*
 * Read the digits of a numeric character reference ("&#..." / "&#x...") in
 * the given base (10 or 16) and return the code point they spell.
 *
 * Scanning stops at the first byte that is not a digit of that base; that
 * byte is left unread so it is not lost from the surrounding text. A ';'
 * directly after the digits is consumed. The value saturates at 0x110000 so
 * that overlong references cannot wrap around into a valid code point.
 */
static uint32_t xr_read_charref_digits(XmlReader *r, unsigned base)
{
    const uint32_t out_of_range = 0x110000; /* first value utf8_encode rejects */
    uint32_t cp = 0;

    for (;;) {
        char c = xr_peek(r); /* '\0' at end of input, which is not a digit */
        uint32_t digit;

        if (c >= '0' && c <= '9') {
            digit = (uint32_t)(c - '0');
        } else if (base == 16 && c >= 'a' && c <= 'f') {
            digit = 10u + (uint32_t)(c - 'a');
        } else if (base == 16 && c >= 'A' && c <= 'F') {
            digit = 10u + (uint32_t)(c - 'A');
        } else {
            break;
        }

        xr_consume(r);
        cp = cp * base + digit;
        if (cp > out_of_range) {
            cp = out_of_range;
        }
    }
    if (xr_peek(r) == ';') {
        xr_consume(r);
    }
    return cp;
}

/*
 * Decode one character/entity reference and append its UTF-8 form to a
 * growable output buffer.
 *
 * r        reader positioned just after the '&' (callers consume it first)
 * out      address of the heap buffer; may be replaced by realloc()
 * out_pos  number of bytes used in *out; advanced by the bytes appended
 * out_cap  allocated size of *out; updated when the buffer grows
 *
 * Recognised forms: the five XML entities, decimal and hexadecimal numeric
 * references, and the named entities in the table below. Anything else
 * appends a literal '&' and leaves the reader where it was, so the entity
 * name is then read as ordinary text. Numeric references that utf8_encode()
 * rejects append nothing.
 *
 * Returns 1 on success. Returns 0 only if the buffer could not be grown; in
 * that case *out, *out_pos, *out_cap and the reader are all left unchanged.
 */
static int xr_decode_entity(XmlReader *r, char **out, size_t *out_pos, size_t *out_cap)
{
    /* Guarantee at least 8 free bytes up front: nothing below appends more
     * than 4, so no further growth check is needed. */
    if (*out_pos + 8 >= *out_cap) {
        size_t new_cap = *out_cap * 2 + 16;
        char *grown = realloc(*out, new_cap);
        if (!grown) {
            return 0; /* old buffer is still valid and still owned by caller */
        }
        *out = grown;
        *out_cap = new_cap;
    }

    if (xr_match(r, "amp;"))  { (*out)[(*out_pos)++] = '&';  return 1; }
    if (xr_match(r, "lt;"))   { (*out)[(*out_pos)++] = '<';  return 1; }
    if (xr_match(r, "gt;"))   { (*out)[(*out_pos)++] = '>';  return 1; }
    if (xr_match(r, "quot;")) { (*out)[(*out_pos)++] = '"';  return 1; }
    if (xr_match(r, "apos;")) { (*out)[(*out_pos)++] = '\''; return 1; }

    /* "#x" must be tried before the bare "#" of the decimal form. */
    if (xr_match(r, "#x") || xr_match(r, "#X")) {
        uint32_t cp = xr_read_charref_digits(r, 16);
        *out_pos += utf8_encode(cp, *out + *out_pos);
        return 1;
    }
    if (xr_match(r, "#")) {
        uint32_t cp = xr_read_charref_digits(r, 10);
        *out_pos += utf8_encode(cp, *out + *out_pos);
        return 1;
    }

    /* Named entities. Each name includes its terminating ';', so one name
     * being a prefix of another (e.g. "sup" / "sup1") cannot mis-match. */
    static const struct { const char *name; uint32_t cp; } named[] = {
        {"nbsp;", 0x00A0}, {"iexcl;", 0x00A1}, {"cent;", 0x00A2},
        {"pound;", 0x00A3}, {"curren;", 0x00A4}, {"yen;", 0x00A5},
        {"brvbar;", 0x00A6}, {"sect;", 0x00A7}, {"uml;", 0x00A8},
        {"copy;", 0x00A9}, {"ordf;", 0x00AA}, {"laquo;", 0x00AB},
        {"not;", 0x00AC}, {"shy;", 0x00AD}, {"reg;", 0x00AE},
        {"macr;", 0x00AF}, {"deg;", 0x00B0}, {"plusmn;", 0x00B1},
        {"sup2;", 0x00B2}, {"sup3;", 0x00B3}, {"acute;", 0x00B4},
        {"micro;", 0x00B5}, {"para;", 0x00B6}, {"middot;", 0x00B7},
        {"cedil;", 0x00B8}, {"sup1;", 0x00B9}, {"ordm;", 0x00BA},
        {"raquo;", 0x00BB}, {"frac14;", 0x00BC}, {"frac12;", 0x00BD},
        {"frac34;", 0x00BE}, {"iquest;", 0x00BF},
        {"Agrave;", 0x00C0}, {"Aacute;", 0x00C1}, {"Acirc;", 0x00C2},
        {"Atilde;", 0x00C3}, {"Auml;", 0x00C4}, {"Aring;", 0x00C5},
        {"AElig;", 0x00C6}, {"Ccedil;", 0x00C7}, {"Egrave;", 0x00C8},
        {"Eacute;", 0x00C9}, {"Ecirc;", 0x00CA}, {"Euml;", 0x00CB},
        {"Igrave;", 0x00CC}, {"Iacute;", 0x00CD}, {"Icirc;", 0x00CE},
        {"Iuml;", 0x00CF}, {"ETH;", 0x00D0}, {"Ntilde;", 0x00D1},
        {"Ograve;", 0x00D2}, {"Oacute;", 0x00D3}, {"Ocirc;", 0x00D4},
        {"Otilde;", 0x00D5}, {"Ouml;", 0x00D6}, {"times;", 0x00D7},
        {"Oslash;", 0x00D8}, {"Ugrave;", 0x00D9}, {"Uacute;", 0x00DA},
        {"Ucirc;", 0x00DB}, {"Uuml;", 0x00DC}, {"Yacute;", 0x00DD},
        {"THORN;", 0x00DE}, {"szlig;", 0x00DF},
        {"agrave;", 0x00E0}, {"aacute;", 0x00E1}, {"acirc;", 0x00E2},
        {"atilde;", 0x00E3}, {"auml;", 0x00E4}, {"aring;", 0x00E5},
        {"aelig;", 0x00E6}, {"ccedil;", 0x00E7}, {"egrave;", 0x00E8},
        {"eacute;", 0x00E9}, {"ecirc;", 0x00EA}, {"euml;", 0x00EB},
        {"igrave;", 0x00EC}, {"iacute;", 0x00ED}, {"icirc;", 0x00EE},
        {"iuml;", 0x00EF}, {"eth;", 0x00F0}, {"ntilde;", 0x00F1},
        {"ograve;", 0x00F2}, {"oacute;", 0x00F3}, {"ocirc;", 0x00F4},
        {"otilde;", 0x00F5}, {"ouml;", 0x00F6}, {"divide;", 0x00F7},
        {"oslash;", 0x00F8}, {"ugrave;", 0x00F9}, {"uacute;", 0x00FA},
        {"ucirc;", 0x00FB}, {"uuml;", 0x00FC}, {"yacute;", 0x00FD},
        {"thorn;", 0x00FE}, {"yuml;", 0x00FF},
        {"OElig;", 0x0152}, {"oelig;", 0x0153}, {"Scaron;", 0x0160},
        {"scaron;", 0x0161}, {"Yuml;", 0x0178},
        {"fnof;", 0x0192}, {"circ;", 0x02C6}, {"tilde;", 0x02DC},
        {"ensp;", 0x2002}, {"emsp;", 0x2003}, {"thinsp;", 0x2009},
        {"zwnj;", 0x200C}, {"zwj;", 0x200D}, {"lrm;", 0x200E},
        {"rlm;", 0x200F}, {"ndash;", 0x2013}, {"mdash;", 0x2014},
        {"lsquo;", 0x2018}, {"rsquo;", 0x2019}, {"sbquo;", 0x201A},
        {"ldquo;", 0x201C}, {"rdquo;", 0x201D}, {"bdquo;", 0x201E},
        {"dagger;", 0x2020}, {"Dagger;", 0x2021}, {"bull;", 0x2022},
        {"hellip;", 0x2026}, {"permil;", 0x2030}, {"lsaquo;", 0x2039},
        {"rsaquo;", 0x203A}, {"euro;", 0x20AC}, {"trade;", 0x2122},
        {"larr;", 0x2190}, {"uarr;", 0x2191}, {"rarr;", 0x2192},
        {"darr;", 0x2193}, {"harr;", 0x2194}, {"lArr;", 0x21D0},
        {"rArr;", 0x21D2}, {"hArr;", 0x21D4},
        {"forall;", 0x2200}, {"part;", 0x2202}, {"exist;", 0x2203},
        {"empty;", 0x2205}, {"nabla;", 0x2207}, {"isin;", 0x2208},
        {"notin;", 0x2209}, {"ni;", 0x220B}, {"prod;", 0x220F},
        {"sum;", 0x2211}, {"minus;", 0x2212}, {"lowast;", 0x2217},
        {"radic;", 0x221A}, {"prop;", 0x221D}, {"infin;", 0x221E},
        {"ang;", 0x2220}, {"and;", 0x2227}, {"or;", 0x2228},
        {"cap;", 0x2229}, {"cup;", 0x222A}, {"int;", 0x222B},
        {"there4;", 0x2234}, {"sim;", 0x223C}, {"cong;", 0x2245},
        {"asymp;", 0x2248}, {"ne;", 0x2260}, {"equiv;", 0x2261},
        {"le;", 0x2264}, {"ge;", 0x2265}, {"sub;", 0x2282},
        {"sup;", 0x2283}, {"sube;", 0x2286}, {"supe;", 0x2287},
        {"oplus;", 0x2295}, {"otimes;", 0x2297}, {"perp;", 0x22A5},
        {"sdot;", 0x22C5}, {"lceil;", 0x2308}, {"rceil;", 0x2309},
        {"lfloor;", 0x230A}, {"rfloor;", 0x230B}, {"lang;", 0x2329},
        {"rang;", 0x232A}, {"loz;", 0x25CA}, {"spades;", 0x2660},
        {"clubs;", 0x2663}, {"hearts;", 0x2665}, {"diams;", 0x2666},
        {NULL, 0}
    };
    for (int ni = 0; named[ni].name; ni++) {
        if (xr_match(r, named[ni].name)) {
            *out_pos += utf8_encode(named[ni].cp, *out + *out_pos);
            return 1;
        }
    }

    /* Not a reference we know: keep the ampersand as literal text. */
    (*out)[(*out_pos)++] = '&';
    return 1;
}
/* Skip the remainder of a comment. The caller has already stepped over the
 * opening "<!--"; the cursor ends just past the closing "-->", or at end of
 * input if the comment is never closed. */
static void xr_skip_comment(XmlReader *r)
{
    for (; !xr_eof(r); r->pos++) {
        if (xr_match(r, "-->")) {
            return;
        }
    }
}
/* Skip the remainder of a processing instruction. The caller has already
 * stepped over "<?"; the cursor ends just past "?>", or at end of input if
 * no terminator is found. */
static void xr_skip_pi(XmlReader *r)
{
    /* xr_match() itself steps over the terminator when it is found. */
    while (!xr_eof(r) && !xr_match(r, "?>")) {
        r->pos++;
    }
}
/* Skip a "<!...>" declaration whose "<!" the caller has already consumed.
 * Angle brackets are counted so that declarations nested inside it are
 * skipped as part of it; stops after the matching '>' or at end of input. */
static void xr_skip_doctype(XmlReader *r)
{
    for (int open = 1; open > 0 && !xr_eof(r); ) {
        switch (xr_consume(r)) {
        case '<':
            open++;
            break;
        case '>':
            open--;
            break;
        default:
            break;
        }
    }
}
/* Read a CDATA section body. The caller has already stepped over
 * "<![CDATA["; everything up to (not including) "]]>" is handed to
 * dtob_raw() unmodified. If the section is never closed, the rest of the
 * input is used. */
static DtobValue *xr_parse_cdata(XmlReader *r)
{
    const size_t begin = r->pos;

    for (; !xr_eof(r); r->pos++) {
        if (!xr_starts_with(r, "]]>")) {
            continue;
        }
        const size_t body_len = r->pos - begin;
        r->pos += 3; /* step over "]]>" */
        return dtob_raw((const uint8_t *)r->buf + begin, body_len);
    }
    return dtob_raw((const uint8_t *)r->buf + begin, r->len - begin);
}
/*
 * Read character data up to (not including) the next '<' or end of input,
 * decoding entity references on the way, and return it via json_string().
 *
 * Returns NULL if memory for the temporary buffer cannot be obtained; the
 * callers in this file already treat a NULL node as "no more content".
 */
static DtobValue *xr_parse_text(XmlReader *r)
{
    size_t cap = 64;
    size_t pos = 0;
    char *buf = malloc(cap);

    if (!buf) {
        fprintf(stderr, "xml: out of memory\n");
        return NULL;
    }
    while (!xr_eof(r) && xr_peek(r) != '<') {
        if (xr_peek(r) == '&') {
            xr_consume(r);
            if (!xr_decode_entity(r, &buf, &pos, &cap)) {
                goto oom;
            }
        } else {
            if (pos + 1 >= cap) {
                /* Grow through a temporary so buf is not lost on failure. */
                char *grown = realloc(buf, cap * 2);
                if (!grown) {
                    goto oom;
                }
                buf = grown;
                cap *= 2;
            }
            buf[pos++] = xr_consume(r);
        }
    }
    {
        DtobValue *v = json_string(buf, pos);
        free(buf);
        return v;
    }

oom:
    fprintf(stderr, "xml: out of memory\n");
    free(buf);
    return NULL;
}
/* Return 1 if c may appear in a tag or attribute name as recognised by this
 * reader (ASCII letters, ASCII digits, '-', '_', '.', ':'), otherwise 0. */
static int is_name_char(char c)
{
    switch (c) {
    case '-':
    case '_':
    case '.':
    case ':':
        return 1;
    default:
        break;
    }
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c >= 'A' && c <= 'Z') {
        return 1;
    }
    return c >= 'a' && c <= 'z';
}
/*
 * Read a run of name characters (see is_name_char) at the cursor and return
 * it as a freshly malloc'd, NUL-terminated string that the caller must free.
 *
 * out_len  if non-NULL, receives the name length (terminator excluded);
 *          only written when a name is returned.
 *
 * Returns NULL, with the cursor where it started, when no name character is
 * present or when the copy cannot be allocated.
 */
static char *xr_parse_name(XmlReader *r, size_t *out_len)
{
    size_t start = r->pos;

    while (!xr_eof(r) && is_name_char(xr_peek(r))) {
        r->pos++;
    }

    size_t len = r->pos - start;
    if (len == 0) {
        return NULL;
    }

    char *name = malloc(len + 1);
    if (!name) {
        /* Keep "NULL means nothing was consumed" true on failure as well. */
        r->pos = start;
        return NULL;
    }
    memcpy(name, r->buf + start, len);
    name[len] = '\0';
    if (out_len) {
        *out_len = len;
    }
    return name;
}
/*
 * Read a quoted attribute value. The byte at the cursor must be '"' or '\'';
 * the value runs to the matching quote (or end of input), with entity
 * references decoded.
 *
 * out_len  receives the number of value bytes (terminator excluded); only
 *          written when a value is returned.
 *
 * Returns a malloc'd buffer the caller must free, NUL-terminated to match
 * xr_parse_unquoted_attr_value(). Returns NULL if the first byte is not a
 * quote (that byte has then already been consumed) or if memory runs out.
 */
static char *xr_parse_attr_value(XmlReader *r, size_t *out_len)
{
    char quote = xr_consume(r);
    if (quote != '"' && quote != '\'') {
        return NULL;
    }

    size_t cap = 64;
    size_t pos = 0;
    char *buf = malloc(cap);
    if (!buf) {
        fprintf(stderr, "xml: out of memory\n");
        return NULL;
    }

    while (!xr_eof(r) && xr_peek(r) != quote) {
        if (xr_peek(r) == '&') {
            xr_consume(r);
            if (!xr_decode_entity(r, &buf, &pos, &cap)) {
                goto oom;
            }
        } else {
            if (pos + 1 >= cap) {
                /* Grow through a temporary so buf is not lost on failure. */
                char *grown = realloc(buf, cap * 2);
                if (!grown) {
                    goto oom;
                }
                buf = grown;
                cap *= 2;
            }
            buf[pos++] = xr_consume(r);
        }
    }
    if (xr_peek(r) == quote) {
        xr_consume(r);
    }

    /* Both append paths above keep pos < cap, so this slot is in bounds. */
    buf[pos] = '\0';
    *out_len = pos;
    return buf;

oom:
    fprintf(stderr, "xml: out of memory\n");
    free(buf);
    return NULL;
}
/*
 * Read an unquoted attribute value (used for HTML input only).
 *
 * The value ends at whitespace, at '>', or at a '/' that is immediately
 * followed by '>' (so "<br class=x/>" still parses as self-closing). Any
 * other '/' belongs to the value, which keeps values such as href=/a/b or
 * src=http://host/path intact. Entity references are not decoded here.
 *
 * out_len  receives the value length (terminator excluded); only written
 *          when a value is returned.
 *
 * Returns a malloc'd, NUL-terminated copy the caller must free, or NULL when
 * the value is empty or the copy cannot be allocated.
 */
static char *xr_parse_unquoted_attr_value(XmlReader *r, size_t *out_len)
{
    size_t start = r->pos;

    while (!xr_eof(r)) {
        char c = xr_peek(r);
        if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '>') {
            break;
        }
        /* Only the '/' of a closing "/>" terminates the value. */
        if (c == '/' && r->pos + 1 < r->len && r->buf[r->pos + 1] == '>') {
            break;
        }
        r->pos++;
    }

    size_t len = r->pos - start;
    if (len == 0) {
        return NULL;
    }

    char *buf = malloc(len + 1);
    if (!buf) {
        fprintf(stderr, "xml: out of memory\n");
        return NULL;
    }
    memcpy(buf, r->buf + start, len);
    buf[len] = '\0';
    *out_len = len;
    return buf;
}
/*
 * Read the body of a raw-text element: everything up to the closing tag for
 * `tag` is taken verbatim (no entity decoding, no nested markup).
 *
 * A closing tag is "</" + tag (compared case-insensitively) followed by '>'
 * or whitespace; it is consumed through its '>'. If none is found, the rest
 * of the input becomes the body. The body is returned via json_string().
 */
static DtobValue *xr_parse_raw_content(XmlReader *r, const char *tag)
{
    const size_t name_len = strlen(tag);
    const size_t begin = r->pos;

    for (; !xr_eof(r); r->pos++) {
        const char *here = r->buf + r->pos;

        if (here[0] != '<') {
            continue;
        }
        /* Need "</", the name, and one following byte inside the input. */
        if (r->pos + 2 + name_len >= r->len) {
            continue;
        }
        if (here[1] != '/') {
            continue;
        }
        if (strncasecmp(here + 2, tag, name_len) != 0) {
            continue;
        }

        const char after = here[2 + name_len];
        const int ends_name = after == '>' || after == ' ' || after == '\t' ||
                              after == '\n' || after == '\r';
        if (!ends_name) {
            continue; /* a longer name that merely starts with tag */
        }

        const size_t body_len = r->pos - begin;
        r->pos += 2 + name_len;
        while (!xr_eof(r) && xr_peek(r) != '>') {
            r->pos++;
        }
        if (xr_peek(r) == '>') {
            xr_consume(r);
        }
        return json_string(r->buf + begin, body_len);
    }
    return json_string(r->buf + begin, r->len - begin);
}
/*
 * Read the body of an escapable raw-text element: markup is not interpreted,
 * but entity references are decoded. Reading stops at the closing tag for
 * `tag` ("</" + tag, case-insensitive, followed by '>' or whitespace), which
 * is consumed through its '>', or at end of input.
 *
 * Returns the decoded body via json_string(), or NULL if memory for the
 * temporary buffer cannot be obtained.
 */
static DtobValue *xr_parse_escapable_raw_content(XmlReader *r, const char *tag)
{
    size_t tag_len = strlen(tag);
    size_t cap = 64;
    size_t pos = 0;
    char *buf = malloc(cap);

    if (!buf) {
        fprintf(stderr, "xml: out of memory\n");
        return NULL;
    }
    while (!xr_eof(r)) {
        if (r->buf[r->pos] == '<' && r->pos + 2 + tag_len < r->len &&
            r->buf[r->pos + 1] == '/' &&
            strncasecmp(r->buf + r->pos + 2, tag, tag_len) == 0) {
            char after = r->buf[r->pos + 2 + tag_len];
            if (after == '>' || after == ' ' || after == '\t' ||
                after == '\n' || after == '\r') {
                r->pos += 2 + tag_len;
                while (!xr_eof(r) && xr_peek(r) != '>') {
                    r->pos++;
                }
                if (xr_peek(r) == '>') {
                    xr_consume(r);
                }
                break;
            }
        }
        if (xr_peek(r) == '&') {
            xr_consume(r);
            if (!xr_decode_entity(r, &buf, &pos, &cap)) {
                goto oom;
            }
        } else {
            if (pos + 1 >= cap) {
                /* Grow through a temporary so buf is not lost on failure. */
                char *grown = realloc(buf, cap * 2);
                if (!grown) {
                    goto oom;
                }
                buf = grown;
                cap *= 2;
            }
            buf[pos++] = xr_consume(r);
        }
    }
    {
        DtobValue *v = json_string(buf, pos);
        free(buf);
        return v;
    }

oom:
    fprintf(stderr, "xml: out of memory\n");
    free(buf);
    return NULL;
}
/*
 * Look ahead at an opening tag without moving the cursor and return its name
 * exactly as written in the input.
 *
 * Returns NULL when the cursor is not at '<', when the tag is a closing tag
 * ("</"), when no name characters follow, or when the name has 64 or more
 * characters. The result points into a function-local static buffer that
 * the next call overwrites.
 */
static const char *xr_peek_tag_name(XmlReader *r)
{
    static char name_buf[64];

    if (xr_peek(r) != '<') {
        return NULL;
    }

    const size_t first = r->pos + 1;
    if (first < r->len && r->buf[first] == '/') {
        return NULL;
    }

    size_t n = 0;
    while (first + n < r->len && is_name_char(r->buf[first + n])) {
        n++;
    }
    if (n == 0 || n >= sizeof(name_buf)) {
        return NULL;
    }

    memcpy(name_buf, r->buf + first, n);
    name_buf[n] = '\0';
    return name_buf;
}
/* Forward declaration: xr_parse_node() and xr_parse_element() call each other. */
static DtobValue *xr_parse_element(XmlReader *r);
/*
 * Parse the next content node at the cursor and return it.
 *
 * Comments, processing instructions and "<!...>" declarations are skipped.
 * They are handled in a loop rather than by recursion, so a long run of them
 * in the input cannot exhaust the call stack.
 *
 * Returns:
 *   - a text value (xr_parse_text) when the next content is character data;
 *   - a raw value (xr_parse_cdata) for "<![CDATA[" in XML mode;
 *   - an element value (xr_parse_element) for an opening tag;
 *   - NULL at end of input or when the cursor is at a closing tag "</",
 *     which is left unread for the enclosing element to handle.
 */
static DtobValue *xr_parse_node(XmlReader *r)
{
    for (;;) {
        if (r->html) {
            /* HTML: drop whitespace only when it directly precedes markup;
             * whitespace that leads into text is kept as part of the text. */
            size_t saved = r->pos;
            xr_skip_ws(r);
            if (xr_eof(r)) {
                return NULL;
            }
            if (xr_peek(r) != '<') {
                r->pos = saved;
            }
        } else {
            xr_skip_ws(r);
            if (xr_eof(r)) {
                return NULL;
            }
        }

        if (xr_peek(r) != '<') {
            return xr_parse_text(r);
        }

        if (xr_starts_with(r, "<!--")) {
            r->pos += 4;
            xr_skip_comment(r);
            continue;
        }
        if (xr_starts_with(r, "<?")) {
            r->pos += 2;
            xr_skip_pi(r);
            continue;
        }
        /* Must be tested before the generic "<!" case below. */
        if (!r->html && xr_starts_with(r, "<![CDATA[")) {
            r->pos += 9;
            return xr_parse_cdata(r);
        }
        if (xr_starts_with(r, "<!")) {
            r->pos += 2;
            xr_skip_doctype(r);
            continue;
        }
        if (xr_starts_with(r, "</")) {
            return NULL;
        }
        return xr_parse_element(r);
    }
}
/*
 * Consume a closing tag. The cursor must be at "</".
 *
 * XML mode: the tag is consumed through its '>' without checking the name.
 * HTML mode: the tag is consumed only if its name equals `tag`
 * (case-insensitively); otherwise the cursor is put back at "</" so the
 * closing tag is still there for whoever looks next.
 */
static void xr_consume_close_tag(XmlReader *r, const char *tag)
{
    const size_t rewind_to = r->pos;

    r->pos += 2; /* step over "</" */
    const size_t name_at = r->pos;
    while (!xr_eof(r) && is_name_char(xr_peek(r))) {
        r->pos++;
    }
    const size_t n = r->pos - name_at;

    if (r->html) {
        const int same_name = n == strlen(tag) &&
                              strncasecmp(r->buf + name_at, tag, n) == 0;
        if (!same_name) {
            r->pos = rewind_to;
            return;
        }
    }

    /* Discard anything between the name and '>', then the '>' itself. */
    while (!xr_eof(r) && xr_peek(r) != '>') {
        r->pos++;
    }
    if (xr_peek(r) == '>') {
        xr_consume(r);
    }
}
/*
 * Parse one element, starting at its '<', and return it as a kvset with
 * three entries:
 *   "t"  the tag name as a string,
 *   "a"  a kvset of attributes (name -> string value, or json_true() for an
 *        attribute written without '='),
 *   "c"  an array of child nodes.
 *
 * In XML mode any namespace prefix (text up to the first ':') is dropped
 * from the tag name and from the names of attributes that have a value. In
 * HTML mode names are passed through html_normalize_tag(), and the rules in
 * html_rules.h decide whether the element has no content, raw-text content,
 * escapable raw-text content, or is closed implicitly by a following tag.
 *
 * Returns NULL (after printing a message) when no tag name follows '<'.
 */
static DtobValue *xr_parse_element(XmlReader *r)
{
/* Step over the '<' that both callers have already checked for. */
xr_consume(r);
size_t tag_len;
char *tag = xr_parse_name(r, &tag_len);
if (!tag) {
fprintf(stderr, "xml: expected tag name\n");
return NULL;
}
/* NOTE(review): presumably case-folds the name in place; confirm in html_rules.h. */
if (r->html) html_normalize_tag(tag, tag_len);
DtobValue *attrs = dtob_kvset();
xr_skip_ws(r);
/* Attribute list: runs until '>' or '/', end of input, or something that is not a name. */
while (!xr_eof(r) && xr_peek(r) != '>' && xr_peek(r) != '/') {
size_t aname_len;
char *aname = xr_parse_name(r, &aname_len);
if (!aname) break;
if (r->html) html_normalize_tag(aname, aname_len);
xr_skip_ws(r);
if (xr_peek(r) == '=') {
xr_consume(r);
xr_skip_ws(r);
size_t vlen;
char *val;
char next = xr_peek(r);
/* Unquoted values are accepted for HTML only; XML always takes the quoted path. */
if (r->html && next != '"' && next != '\'') {
val = xr_parse_unquoted_attr_value(r, &vlen);
} else {
val = xr_parse_attr_value(r, &vlen);
}
/* A NULL value (empty or malformed) drops the attribute altogether. */
if (val) {
const char *key = aname;
if (!r->html) {
/* XML: store the attribute under its local name, without the prefix. */
char *colon = strchr(aname, ':');
if (colon) key = colon + 1;
}
dtob_kvset_put(attrs, key, json_string(val, vlen));
free(val);
}
} else {
/* No '=': record the attribute as boolean true under its full name. */
dtob_kvset_put(attrs, aname, json_true());
}
free(aname);
xr_skip_ws(r);
}
/* tag_clean points into tag (past any XML prefix); tag must outlive every use of it. */
const char *tag_clean = tag;
if (!r->html) {
char *colon = strchr(tag, ':');
if (colon) tag_clean = colon + 1;
}
int self_closing = 0;
if (xr_match(r, "/>")) {
self_closing = 1;
} else if (xr_peek(r) == '>') {
xr_consume(r);
}
DtobValue *elem = dtob_kvset();
dtob_kvset_put(elem, "t", json_string(tag_clean, strlen(tag_clean)));
dtob_kvset_put(elem, "a", attrs);
DtobValue *children = dtob_array();
if (!self_closing && r->html && html_is_void(tag_clean)) {
/* HTML element for which html_is_void() is true: no content is parsed. */
} else if (!self_closing && r->html && html_is_raw_text(tag_clean)) {
/* Content is taken verbatim up to the matching closing tag. */
DtobValue *raw = xr_parse_raw_content(r, tag_clean);
if (raw) dtob_array_push(children, raw);
} else if (!self_closing && r->html && html_is_escapable_raw_text(tag_clean)) {
/* Content is taken as text with entity references decoded. */
DtobValue *raw = xr_parse_escapable_raw_content(r, tag_clean);
if (raw) dtob_array_push(children, raw);
} else if (!self_closing) {
/* Ordinary content: collect child nodes until a closing tag or end of input. */
while (!xr_eof(r)) {
if (xr_starts_with(r, "</")) {
/* In HTML mode a closing tag with a different name is left unread. */
xr_consume_close_tag(r, tag_clean);
break;
}
if (r->html) {
/* Stop, leaving the upcoming tag unread, when it implicitly closes this element. */
const char *upcoming = xr_peek_tag_name(r);
if (upcoming && html_implicitly_closes(tag_clean, upcoming))
break;
}
DtobValue *child = xr_parse_node(r);
if (!child) break;
if (r->html && child->type == JSON_STRING) {
/* HTML: a text child made only of space/tab/CR/LF is collapsed to one space. */
int all_ws = 1;
for (size_t i = 0; i < child->data_len; i++) {
char c = (char)child->data[i];
if (c != ' ' && c != '\t' && c != '\n' && c != '\r') {
all_ws = 0;
break;
}
}
if (all_ws) {
if (child->data_len > 0) {
/* Edited in place: overwrite the first byte and shorten the length. */
child->data[0] = ' ';
child->data_len = 1;
} else {
/* Empty text carries nothing: free it and do not add a child. */
dtob_free(child);
continue;
}
}
}
dtob_array_push(children, child);
}
}
/* HTML: try once more to consume this element's own closing tag if it is next. */
if (!self_closing && r->html && xr_starts_with(r, "</"))
xr_consume_close_tag(r, tag_clean);
dtob_kvset_put(elem, "c", children);
free(tag);
return elem;
}
/*
 * Parse a NUL-terminated markup document and return its root element.
 *
 * xml   the document text; a NULL pointer is reported as "no root element"
 * html  nonzero to parse with the HTML rules, zero for XML
 *
 * Leading whitespace, comments, processing instructions and "<!...>"
 * declarations are skipped; the first element after them is the root.
 * Returns NULL, after printing a message, when no root element is found.
 * Anything following the root element is not examined.
 */
static DtobValue *parse_markup(const char *xml, int html)
{
    if (!xml) {
        /* strlen(NULL) is undefined; treat a missing document as empty. */
        fprintf(stderr, "xml: no root element found\n");
        return NULL;
    }

    XmlReader r = { xml, 0, strlen(xml), html };

    /* Skip the prolog. */
    while (!xr_eof(&r)) {
        xr_skip_ws(&r);
        if (xr_eof(&r)) {
            break;
        }
        if (xr_starts_with(&r, "<!--")) {
            r.pos += 4;
            xr_skip_comment(&r);
        } else if (xr_starts_with(&r, "<?")) {
            r.pos += 2;
            xr_skip_pi(&r);
        } else if (xr_starts_with(&r, "<!")) {
            r.pos += 2;
            xr_skip_doctype(&r);
        } else {
            break;
        }
    }

    if (xr_eof(&r) || xr_peek(&r) != '<') {
        fprintf(stderr, "xml: no root element found\n");
        return NULL;
    }
    return xr_parse_element(&r);
}
/*
 * Parse a NUL-terminated XML document (parse_markup with the HTML rules off)
 * and return its root element, or NULL if parsing yields no root element.
 * NOTE(review): the caller presumably releases the result with dtob_free();
 * confirm against the public header.
 */
DtobValue *dtob_from_xml(const char *xml)
{
return parse_markup(xml, 0);
}
/*
 * Parse a NUL-terminated HTML document (parse_markup with the HTML rules on)
 * and return its root element, or NULL if parsing yields no root element.
 * NOTE(review): the caller presumably releases the result with dtob_free();
 * confirm against the public header.
 */
DtobValue *dtob_from_html(const char *html)
{
return parse_markup(html, 1);
}