fig 1.0.0

Parse, edit, and convert config files while preserving comments. Supports JSON, YAML, TOML, and more.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
//! XML reader: parses XML into the shared fig AST.
//!
//! Data model (config-oriented, reader-only):
//!   * An element becomes a `mapping`; a text-only element with no attributes
//!     becomes a bare `string`; an empty element becomes `null`.
//!   * Attributes are folded into the element's mapping under `@`-prefixed keys,
//!     and mixed/sibling text under `#text`. This is collision-proof: `@` and `#`
//!     are illegal as the first character of an XML name, so these synthetic keys
//!     can never clash with a real element/attribute name.
//!   * Repeated child elements of the same name collapse into a `sequence`
//!     (first-appearance order). Interleaving of differently-named repeats is not
//!     preserved — that is the deferred mixed-content side-table.
//!   * The document's single root element yields a one-entry root mapping
//!     `{ rootName: ... }`.
//!   * Values stay raw strings — no type inference. Predefined and numeric
//!     entities are decoded; CDATA is literal text. Whitespace-only text between
//!     element siblings is dropped; text inside a text-only element is verbatim.
//!
//! Entry order within an element mapping: attributes (source order), then child
//! elements (first-appearance order), then `#text` (if any).

const Parser = @This();

const std = @import("std");
const build_options = @import("build_options");
const AST = @import("../ast/ast.zig");
const Document = @import("../document.zig");
const Type = @import("xml.zig").Type;
const Span = @import("../util/span.zig");
const Tokenizer = @import("tokenizer.zig");

const Id = AST.Node.Id;

allocator: std.mem.Allocator,
version: Type = .XML_1_0,
source: []const u8 = "",
tokens: []const Tokenizer.Token = &.{},
pos: usize = 0,
nodes: std.ArrayList(AST.Node) = .empty,
spans: std.ArrayList(Span) = .empty,
owned_strings: std.ArrayList([]const u8) = .empty,

pub const ParseError = error{
    MissingRootElement,
    MultipleRootElements,
    ContentOutsideRoot,
    MismatchedTag,
    DuplicateAttribute,
    UnsupportedEntity,
    UnexpectedToken,
} || Tokenizer.TokenizeError;

/// One parsed child element: its name (a borrowed source slice) and the AST node
/// id of its value.
const Child = struct { name: []const u8, value: Id };

/// One run of character data; `literal` marks CDATA (no entity decoding, never
/// dropped as insignificant whitespace).
const TextRun = struct { raw: []const u8, literal: bool };

pub fn parse(allocator: std.mem.Allocator, input: []const u8, format: Type) ParseError!Document {
    var self: Parser = .{ .allocator = allocator, .version = format, .source = input };
    errdefer {
        for (self.owned_strings.items) |s| allocator.free(s);
        self.owned_strings.deinit(allocator);
        self.nodes.deinit(allocator);
        self.spans.deinit(allocator);
    }

    var tokenizer: Tokenizer = .{ .allocator = allocator, .str = input };
    const tokens = try tokenizer.tokenize();
    defer allocator.free(tokens);
    self.tokens = tokens;

    // Prolog: only insignificant whitespace may precede the root element.
    try self.skipWhitespaceText();
    if (self.curKind() != .lt) return error.MissingRootElement;
    const root_el = try self.parseElement();

    // Epilog: only insignificant whitespace may follow it.
    try self.skipWhitespaceText();
    switch (self.curKind()) {
        .eof => {},
        .lt, .lt_slash => return error.MultipleRootElements,
        else => return error.UnexpectedToken,
    }

    // Wrap the root element in the one-entry root mapping `{ name: value }`.
    const whole = Span.init(0, input.len);
    const key_id = try self.stringNode(try self.dupe(root_el.name), whole);
    const kv = try self.addNode(.{ .keyvalue = .{ .key = key_id, .value = root_el.value } }, whole);
    const root_id = try self.buildMapping(&.{kv}, whole);

    const nodes = try self.nodes.toOwnedSlice(allocator);
    const spans = try self.spans.toOwnedSlice(allocator);
    const owned = try self.owned_strings.toOwnedSlice(allocator);
    return .{
        .source = input,
        .ast = .{ .allocator = allocator, .owned_strings = owned, .root = root_id, .nodes = nodes },
        .node_spans = spans,
    };
}

/// Parse one element (positioned at its opening `lt`) through its end tag,
/// returning its name and the AST node id representing its value.
fn parseElement(self: *Parser) ParseError!Child {
    const sp = self.cur().span; // start-tag span; reused for this element's nodes
    self.advance(); // lt
    if (self.curKind() != .name) return error.UnexpectedToken;
    const el_name = self.curText();
    self.advance();

    // Attributes → `@name` keyvalue entries.
    var attrs: std.ArrayList(Id) = .empty;
    defer attrs.deinit(self.allocator);
    var attr_names: std.ArrayList([]const u8) = .empty;
    defer attr_names.deinit(self.allocator);
    while (self.curKind() == .name) {
        const aname = self.curText();
        self.advance();
        for (attr_names.items) |n| if (std.mem.eql(u8, n, aname)) return error.DuplicateAttribute;
        try attr_names.append(self.allocator, aname);

        if (self.curKind() != .eq) return error.UnexpectedToken;
        self.advance();
        if (self.curKind() != .attr_value) return error.UnexpectedToken;
        const raw = self.curText();
        self.advance();

        const key_id = try self.prefixedStringNode('@', aname, sp);
        const val_id = try self.decodedStringNode(raw, sp);
        try attrs.append(self.allocator, try self.addNode(.{ .keyvalue = .{ .key = key_id, .value = val_id } }, sp));
    }

    // Empty-element tag `<.../>`: no content.
    if (self.curKind() == .slash_gt) {
        self.advance();
        return .{ .name = el_name, .value = try self.containerOrEmpty(attrs.items, sp) };
    }
    if (self.curKind() != .gt) return error.UnexpectedToken;
    self.advance();

    // Content until the end tag.
    var children: std.ArrayList(Child) = .empty;
    defer children.deinit(self.allocator);
    var runs: std.ArrayList(TextRun) = .empty;
    defer runs.deinit(self.allocator);
    content: while (true) {
        switch (self.curKind()) {
            .char_data => {
                try runs.append(self.allocator, .{ .raw = self.curText(), .literal = false });
                self.advance();
            },
            .cdata => {
                try runs.append(self.allocator, .{ .raw = self.curText(), .literal = true });
                self.advance();
            },
            .lt => try children.append(self.allocator, try self.parseElement()),
            .lt_slash => break :content,
            .eof => return error.UnclosedTag,
            else => return error.UnexpectedToken,
        }
    }

    // End tag: `</name>` whose name must match.
    self.advance(); // lt_slash
    if (self.curKind() != .name) return error.UnexpectedToken;
    if (!std.mem.eql(u8, self.curText(), el_name)) return error.MismatchedTag;
    self.advance();
    if (self.curKind() != .gt) return error.UnexpectedToken;
    self.advance();

    return .{ .name = el_name, .value = try self.assemble(attrs.items, children.items, runs.items, sp) };
}

/// Build an element's value from its attributes, child elements, and text runs.
fn assemble(self: *Parser, attrs: []const Id, children: []const Child, runs: []const TextRun, sp: Span) ParseError!Id {
    const has_attrs = attrs.len > 0;
    const has_elems = children.len > 0;

    // Concatenate significant text. In element/attribute context, whitespace-only
    // char_data is dropped; CDATA and non-whitespace char_data are kept. In a
    // pure text-only element, all text is kept verbatim.
    var textbuf: std.ArrayList(u8) = .empty;
    defer textbuf.deinit(self.allocator);
    for (runs) |r| {
        if (r.literal) {
            try textbuf.appendSlice(self.allocator, r.raw);
        } else if ((!has_attrs and !has_elems) or !isWhitespaceOnly(r.raw)) {
            try self.decodeInto(&textbuf, r.raw);
        }
    }

    if (!has_attrs and !has_elems) {
        if (textbuf.items.len == 0) return self.addNode(.null_, sp);
        return self.stringNode(try self.dupe(textbuf.items), sp);
    }

    var entries: std.ArrayList(Id) = .empty;
    defer entries.deinit(self.allocator);
    try entries.appendSlice(self.allocator, attrs);
    try self.buildChildEntries(children, &entries, sp);
    if (textbuf.items.len > 0) {
        const key_id = try self.prefixedStringNode('#', "text", sp);
        const val_id = try self.stringNode(try self.dupe(textbuf.items), sp);
        try entries.append(self.allocator, try self.addNode(.{ .keyvalue = .{ .key = key_id, .value = val_id } }, sp));
    }
    return self.buildMapping(entries.items, sp);
}

/// Value for an element with no content: a mapping of its attributes, or `null`
/// when it has none.
fn containerOrEmpty(self: *Parser, attrs: []const Id, sp: Span) ParseError!Id {
    if (attrs.len == 0) return self.addNode(.null_, sp);
    return self.buildMapping(attrs, sp);
}

/// Group children by name into mapping entries: a name seen once becomes a plain
/// `name: value` entry; a repeated name becomes `name: [values…]`.
fn buildChildEntries(self: *Parser, children: []const Child, entries: *std.ArrayList(Id), sp: Span) ParseError!void {
    for (children, 0..) |child, i| {
        // Skip names already emitted at an earlier occurrence.
        var seen = false;
        for (children[0..i]) |prev| {
            if (std.mem.eql(u8, prev.name, child.name)) {
                seen = true;
                break;
            }
        }
        if (seen) continue;

        var vals: std.ArrayList(Id) = .empty;
        defer vals.deinit(self.allocator);
        for (children[i..]) |c| {
            if (std.mem.eql(u8, c.name, child.name)) try vals.append(self.allocator, c.value);
        }
        const value_id = if (vals.items.len == 1)
            vals.items[0]
        else
            try self.buildSequence(vals.items, sp);

        const key_id = try self.stringNode(try self.dupe(child.name), sp);
        try entries.append(self.allocator, try self.addNode(.{ .keyvalue = .{ .key = key_id, .value = value_id } }, sp));
    }
}

// ── entity decoding ──────────────────────────────────────────────────────────

/// Decode XML entity references in `raw` (predefined `&amp; &lt; &gt; &quot;
/// &apos;` and numeric `&#dd; / &#xhh;`) into `buf`. Any other named reference
/// is rejected — DTD-defined entities are out of scope.
fn decodeInto(self: *Parser, buf: *std.ArrayList(u8), raw: []const u8) ParseError!void {
    var i: usize = 0;
    while (i < raw.len) {
        if (raw[i] != '&') {
            try buf.append(self.allocator, raw[i]);
            i += 1;
            continue;
        }
        const semi = std.mem.indexOfScalarPos(u8, raw, i + 1, ';') orelse return error.UnsupportedEntity;
        const ent = raw[i + 1 .. semi];
        if (ent.len == 0) return error.UnsupportedEntity;
        if (ent[0] == '#') {
            const cp = parseCharRef(ent) orelse return error.UnsupportedEntity;
            var utf8: [4]u8 = undefined;
            const n = std.unicode.utf8Encode(cp, &utf8) catch return error.UnsupportedEntity;
            try buf.appendSlice(self.allocator, utf8[0..n]);
        } else {
            const repl: u8 = if (std.mem.eql(u8, ent, "amp"))
                '&'
            else if (std.mem.eql(u8, ent, "lt"))
                '<'
            else if (std.mem.eql(u8, ent, "gt"))
                '>'
            else if (std.mem.eql(u8, ent, "quot"))
                '"'
            else if (std.mem.eql(u8, ent, "apos"))
                '\''
            else
                return error.UnsupportedEntity;
            try buf.append(self.allocator, repl);
        }
        i = semi + 1;
    }
}

/// Parse a numeric character reference body (`#1234` or `#x1F600`) to a codepoint.
fn parseCharRef(ent: []const u8) ?u21 {
    if (ent.len < 2) return null;
    const hex = ent[1] == 'x' or ent[1] == 'X';
    const digits = if (hex) ent[2..] else ent[1..];
    if (digits.len == 0) return null;
    return std.fmt.parseInt(u21, digits, if (hex) 16 else 10) catch null;
}

// ── node construction ────────────────────────────────────────────────────────

fn addNode(self: *Parser, kind: AST.Node.Kind, span: Span) ParseError!Id {
    const id: Id = @intCast(self.nodes.items.len);
    try self.nodes.append(self.allocator, .{ .id = id, .kind = kind, .next_sibling = null });
    try self.spans.append(self.allocator, span);
    return id;
}

fn stringNode(self: *Parser, owned: []const u8, span: Span) ParseError!Id {
    return self.addNode(.{ .string = owned }, span);
}

/// A string node whose value is `prefix ++ name`, owned by the AST.
fn prefixedStringNode(self: *Parser, prefix: u8, name: []const u8, span: Span) ParseError!Id {
    const buf = try self.allocator.alloc(u8, name.len + 1);
    buf[0] = prefix;
    @memcpy(buf[1..], name);
    return self.stringNode(try self.intern(buf), span);
}

/// A string node whose value is `raw` with entities decoded, owned by the AST.
fn decodedStringNode(self: *Parser, raw: []const u8, span: Span) ParseError!Id {
    var buf: std.ArrayList(u8) = .empty;
    defer buf.deinit(self.allocator);
    try self.decodeInto(&buf, raw);
    return self.stringNode(try self.dupe(buf.items), span);
}

fn buildSequence(self: *Parser, items: []const Id, span: Span) ParseError!Id {
    self.link(items);
    return self.addNode(.{ .sequence = if (items.len == 0) null else items[0] }, span);
}

fn buildMapping(self: *Parser, entries: []const Id, span: Span) ParseError!Id {
    self.link(entries);
    return self.addNode(.{ .mapping = if (entries.len == 0) null else entries[0] }, span);
}

/// Chain `ids` as siblings; terminate the last.
fn link(self: *Parser, ids: []const Id) void {
    if (ids.len == 0) return;
    for (ids[0 .. ids.len - 1], ids[1..]) |cur_id, next_id| {
        self.nodes.items[cur_id].next_sibling = next_id;
    }
    self.nodes.items[ids[ids.len - 1]].next_sibling = null;
}

/// Register an already-allocated buffer with the AST's owned strings.
fn intern(self: *Parser, owned: []u8) ParseError![]const u8 {
    errdefer self.allocator.free(owned);
    try self.owned_strings.append(self.allocator, owned);
    return owned;
}

/// Copy `bytes` into AST-owned storage.
fn dupe(self: *Parser, bytes: []const u8) ParseError![]const u8 {
    return self.intern(try self.allocator.dupe(u8, bytes));
}

// ── cursor + helpers ─────────────────────────────────────────────────────────

fn cur(self: *const Parser) Tokenizer.Token {
    return self.tokens[self.pos];
}

fn curKind(self: *const Parser) Tokenizer.Kind {
    return self.tokens[self.pos].kind;
}

fn curText(self: *const Parser) []const u8 {
    return self.tokens[self.pos].source(self.source);
}

fn advance(self: *Parser) void {
    self.pos += 1;
}

/// Consume insignificant (whitespace-only) char_data; a non-whitespace run
/// outside the root element is an error.
fn skipWhitespaceText(self: *Parser) ParseError!void {
    while (self.curKind() == .char_data) {
        if (!isWhitespaceOnly(self.curText())) return error.ContentOutsideRoot;
        self.advance();
    }
}

fn isWhitespaceOnly(s: []const u8) bool {
    for (s) |c| {
        if (c != ' ' and c != '\t' and c != '\r' and c != '\n') return false;
    }
    return true;
}

// ── tests ────────────────────────────────────────────────────────────────────

const testing = std.testing;

/// Parse `src` and assert its JSON serialization equals `expected`.
fn expectJson(src: []const u8, expected: []const u8) !void {
    // These reader tests verify the parsed shape by serializing to JSON; skip
    // them when JSON is gated out of the build (the parser itself is unaffected).
    if (comptime !build_options.lang_json) return error.SkipZigTest;
    var doc = try parse(testing.allocator, src, .XML_1_0);
    defer doc.deinit(testing.allocator);
    var buf: [1024]u8 = undefined;
    var w = std.Io.Writer.fixed(&buf);
    try doc.ast.serialize(&w, .json);
    try testing.expectEqualStrings(expected, w.buffered());
}

test "text-only element" {
    try expectJson("<a>hi</a>",
        \\{
        \\  "a": "hi"
        \\}
        \\
    );
}

test "empty element is null" {
    try expectJson("<a/>",
        \\{
        \\  "a": null
        \\}
        \\
    );
    try expectJson("<a></a>",
        \\{
        \\  "a": null
        \\}
        \\
    );
}

test "nested elements" {
    try expectJson("<r><a>1</a><b>2</b></r>",
        \\{
        \\  "r": {
        \\    "a": "1",
        \\    "b": "2"
        \\  }
        \\}
        \\
    );
}

test "attribute folds to @key" {
    try expectJson(
        \\<a x="1"/>
    ,
        \\{
        \\  "a": {
        \\    "@x": "1"
        \\  }
        \\}
        \\
    );
}

test "repeated elements collapse to a sequence" {
    try expectJson("<r><i>a</i><i>b</i></r>",
        \\{
        \\  "r": {
        \\    "i": [
        \\      "a",
        \\      "b"
        \\    ]
        \\  }
        \\}
        \\
    );
}

test "attributes plus text use #text" {
    try expectJson(
        \\<a x="1">hi</a>
    ,
        \\{
        \\  "a": {
        \\    "@x": "1",
        \\    "#text": "hi"
        \\  }
        \\}
        \\
    );
}

test "attribute and child of same name do not collide" {
    try expectJson(
        \\<x id="1"><id>2</id></x>
    ,
        \\{
        \\  "x": {
        \\    "@id": "1",
        \\    "id": "2"
        \\  }
        \\}
        \\
    );
}

test "predefined and numeric entities decode" {
    try expectJson("<a>x &amp; y &#65; &#x42;</a>",
        \\{
        \\  "a": "x & y A B"
        \\}
        \\
    );
}

test "cdata is literal text" {
    try expectJson("<a><![CDATA[<b>&z]]></a>",
        \\{
        \\  "a": "<b>&z"
        \\}
        \\
    );
}

test "whitespace between element siblings is dropped" {
    try expectJson("<r>\n  <a>1</a>\n  <b>2</b>\n</r>",
        \\{
        \\  "r": {
        \\    "a": "1",
        \\    "b": "2"
        \\  }
        \\}
        \\
    );
}

test "leading/trailing prolog whitespace allowed" {
    try expectJson("\n  <a/>\n",
        \\{
        \\  "a": null
        \\}
        \\
    );
}

test "errors" {
    try testing.expectError(error.MismatchedTag, parse(testing.allocator, "<a></b>", .XML_1_0));
    try testing.expectError(error.MultipleRootElements, parse(testing.allocator, "<a/><b/>", .XML_1_0));
    try testing.expectError(error.DuplicateAttribute, parse(testing.allocator, "<a x=\"1\" x=\"2\"/>", .XML_1_0));
    try testing.expectError(error.UnsupportedEntity, parse(testing.allocator, "<a>&bogus;</a>", .XML_1_0));
    try testing.expectError(error.ContentOutsideRoot, parse(testing.allocator, "junk<a/>", .XML_1_0));
    try testing.expectError(error.MissingRootElement, parse(testing.allocator, "   ", .XML_1_0));
    try testing.expectError(error.UnclosedTag, parse(testing.allocator, "<a><b>", .XML_1_0));
}