fig 1.0.0 - Docs.rs

//! The parser turns a JSON-formatted []const u8 into an AST.
//! It uses the Tokenizer to tokenize the string, and then converts
//! the token slice into an AST incrementally.
//!
//! This parser temporarily allocates and frees memory for the tokenizer
//! and for the in-progress containers, including ArrayLists for `Node`s,
//! `Span`s, `OpenContainer`s, and (JSON5/JSONC only) comments.
//!
//! Decoded string escape allocations are transferred into the returned AST's
//! `owned_strings` slice and must be freed with `ast.deinit();`

const Parser = @This();

const std = @import("std");
const builtin = @import("builtin");
const testing = std.testing;
const log = std.log.scoped(.parser);
const Unicode = @import("../util/util.zig").Unicode;
const AST = @import("../ast/ast.zig");
const Document = @import("../document.zig");
const Tokenizer = @import("tokenizer.zig").Tokenizer;
const Token = @import("../token.zig").Token(Tokenizer.Kind);
const Type = @import("json.zig").Type;
const Span = @import("../util/span.zig");

/// The two container shapes the parser can have open.
const ContainerKind = enum { array, object };
/// Either an array or object in the process of being parsed.
/// One entry sits on `container_stack` from the opening delimiter until the
/// matching close delimiter pops it.
const OpenContainer = struct {
    /// Id of the container's own node in `nodes`.
    id: AST.Node.Id,
    kind: ContainerKind,
    /// First attached child; `attachChild` also writes it into the container
    /// node's `sequence` / `mapping` payload.
    first_child: ?AST.Node.Id = null,
    /// Most recently attached child; its `next_sibling` is patched when the
    /// next child is attached.
    last_child: ?AST.Node.Id = null,
    /// Key node recorded by `beginKey` and consumed by `finishValue` once the
    /// entry's value is complete.
    pending_key: ?AST.Node.Id = null,
};

// State
/// Current grammar position; selects which tokens `parse_once` accepts next.
state: State = .ExpectValue,
/// Every AST node created so far. A node's id is its index in this list.
nodes: std.ArrayList(AST.Node) = .empty,
/// Source span of each node, appended in lockstep with `nodes`.
node_spans: std.ArrayList(Span) = .empty,
/// Containers that have been opened but not yet closed, innermost last.
container_stack: std.ArrayList(OpenContainer) = .empty,
/// Heap-allocated decoded strings (only strings that contained escapes).
owned_strings: std.ArrayList([]const u8) = .empty,

// Comment layer (JSON5/JSONC only — strict JSON never tokenizes a comment).
// `node_comments` grows in lockstep with `nodes`. `pending_leading` buffers
// own-line comments until the next node claims them (in `addNode`). Comment text
// borrows `input` (comments carry no escapes). Materialized only when
// `comments_seen`.
node_comments: std.ArrayList(AST.NodeComments) = .empty,
pending_leading: std.ArrayList(AST.Comment) = .empty,
/// The most recently completed value node — the candidate a same-line trailing
/// comment binds to. Set by `finishValue`; reset to null by whitespace that
/// contains a newline, or once a trailing comment has been attached. A comma
/// does NOT reset it: `handleComment` relies on `endsLine` to tell
/// `1, // trail` (trailing) apart from `1, /*c*/ b` (leading of `b`).
last_value_id: ?AST.Node.Id = null,
/// Set as soon as any comment is attached to a node; gates whether
/// `node_comments` is handed to the returned AST.
comments_seen: bool = false,

/// Id of the top-level value; set by `finishValue` when no container is open.
root: ?AST.Node.Id = null,

// Initial fields
/// Allocator used for every list, decoded string, and comment run.
allocator: std.mem.Allocator,
/// Which JSON dialect is being parsed. Gates the JSON5-only grammar
/// (unquoted keys, trailing commas, `Infinity`/`NaN`, single-quoted strings).
format: Type = .JSON,

/// Grammar-level failures the parser can report.
const ParseError = error{ UnclosedObject, UnclosedArray, UnclosedString, InvalidBool, InvalidNumber, UnexpectedToken, InvalidUnicodeEscape };
/// Everything the parsing helpers can return: grammar errors plus allocation failure.
const ParserError = ParseError || std.mem.Allocator.Error;

/// Parser states; each one names what the state machine accepts next.
const State = enum {
    /// A value is required (initial state; also after `,` in a strict-JSON array).
    ExpectValue,

    /// Just after `[`, or after `,` in a JSON5 array: a value or `]`.
    ExpectArrayValueOrEnd,
    /// After an array element: `,` or `]`.
    ExpectArrayCommaOrEnd,

    /// Just after `{`, or after `,` in a JSON5 object: a key or `}`.
    ExpectObjectKeyOrEnd,
    /// After `,` in a strict-JSON object: a key is required.
    ExpectObjectKey,
    /// After a key: `:`.
    ExpectObjectColon,
    /// After `:`: the entry's value.
    ExpectObjectValue,
    /// After an entry's value: `,` or `}`.
    ExpectObjectCommaOrEnd,

    /// The root value is complete; only the end-of-file token is accepted.
    ExpectEndOfFile,
};

/// Translates the literal text `true` / `false` into a `bool`.
/// Any other input yields `error.InvalidBool`.
pub fn getBool(slice: []const u8) ParseError!bool {
    const is_true = std.mem.eql(u8, slice, "true");
    if (!is_true and !std.mem.eql(u8, slice, "false")) return error.InvalidBool;
    return is_true;
}

/// Strips the surrounding quotes from a string token and decodes its escapes.
///
/// `slice` is the raw token text including both quotes. Strict JSON accepts
/// only `"`; JSON5 also accepts `'`. The closing quote must match the opening
/// one, otherwise `UnclosedString` is returned.
///
/// What the returned slice points at:
/// - no backslash in the body: a sub-slice of `slice` (nothing is allocated);
/// - escapes decoded: a fresh allocation that is also recorded in
///   `self.owned_strings`, which is what eventually frees it;
/// - a `\u` escape that is an unpaired surrogate: the undecoded body of
///   `slice`, because a lone surrogate cannot be encoded as UTF-8.
///
/// Errors: `UnclosedString` for a truncated escape, `UnexpectedToken` for an
/// escape the active dialect does not allow, `InvalidUnicodeEscape` for
/// malformed hex digits, and `OutOfMemory`.
pub fn getString(self: *Parser, slice: []const u8) ParserError![]const u8 {
    const json5 = self.format == .JSON5;
    // JSON5 strings may also be single-quoted; the closing quote must match.
    const quote: u8 = if (slice.len >= 1) slice[0] else 0;
    const valid_quote = quote == '"' or (json5 and quote == '\'');
    if (slice.len < 2 or !valid_quote or slice[slice.len - 1] != quote) {
        return ParseError.UnclosedString;
    }
    const inner = slice[1 .. slice.len - 1];

    // Fast path: no escapes, can safely point into source.
    if (std.mem.indexOfScalar(u8, inner, '\\') == null) return inner;

    // String contains escapes, so we need to allocate a new decoded string.
    // One `defer` covers every exit: errors, the raw-surrogate early returns,
    // and success (where `toOwnedSlice` has already emptied the list).
    var decoded: std.ArrayList(u8) = .empty;
    defer decoded.deinit(self.allocator);

    var i: usize = 0;
    while (i < inner.len) {
        const c = inner[i];
        if (c != '\\') {
            try decoded.append(self.allocator, c);
            i += 1;
            continue;
        }

        // Step onto the character that follows the backslash.
        i += 1;
        if (i >= inner.len) return ParseError.UnclosedString;

        switch (inner[i]) {
            '"' => try decoded.append(self.allocator, '"'), // double quote
            '\\' => try decoded.append(self.allocator, '\\'), // backslash
            '/' => try decoded.append(self.allocator, '/'), // slash
            'b' => try decoded.append(self.allocator, 0x08), // backspace
            'f' => try decoded.append(self.allocator, 0x0c), // formfeed
            'n' => try decoded.append(self.allocator, '\n'), // newline
            'r' => try decoded.append(self.allocator, '\r'), // return
            't' => try decoded.append(self.allocator, '\t'), // tab
            // JSON5-only escapes.
            '\'' => if (json5) try decoded.append(self.allocator, '\'') else return ParseError.UnexpectedToken,
            'v' => if (json5) try decoded.append(self.allocator, 0x0b) else return ParseError.UnexpectedToken,
            '0' => if (json5) try decoded.append(self.allocator, 0x00) else return ParseError.UnexpectedToken,
            'x' => { // \xHH hex escape (one code point U+00HH)
                if (!json5) return ParseError.UnexpectedToken;
                if (i + 2 >= inner.len) return ParseError.UnclosedString;
                const byte = hexEscapeValue(u8, inner[i + 1 .. i + 3]) orelse return ParseError.InvalidUnicodeEscape;
                var xbuf: [4]u8 = undefined;
                const xwritten = std.unicode.utf8Encode(byte, &xbuf) catch return ParseError.InvalidUnicodeEscape;
                try decoded.appendSlice(self.allocator, xbuf[0..xwritten]);
                i += 2;
            },
            // Line continuations: a backslash before a line terminator emits
            // nothing (the source line wraps). CRLF counts as one terminator.
            '\n' => {
                if (!json5) return ParseError.UnexpectedToken;
            },
            '\r' => {
                if (!json5) return ParseError.UnexpectedToken;
                if (i + 1 < inner.len and inner[i + 1] == '\n') i += 1;
            },
            'u' => { // unicode
                // JSON \u escapes encode one UTF-16 code unit in 4 hex chars.
                if (i + 4 >= inner.len) return ParseError.UnclosedString;
                const first_unit = hexEscapeValue(u16, inner[i + 1 .. i + 5]) orelse return ParseError.InvalidUnicodeEscape;
                var codepoint: u21 = first_unit;
                i += 4; // `i` now sits on the last hex digit of this escape.

                // If the escape contains an unpaired surrogate, preserve the
                // raw source representation rather than failing. JSONTestSuite
                // treats these as implementation-defined `i_` cases, and the
                // AST cannot losslessly normalize them into UTF-8.
                if (Unicode.isHighSurrogate(codepoint)) {
                    // A low surrogate needs six more bytes: `\uXXXX`.
                    if (i + 6 >= inner.len) return inner;
                    if (inner[i + 1] != '\\' or inner[i + 2] != 'u') return inner;
                    const low_unit = hexEscapeValue(u16, inner[i + 3 .. i + 7]) orelse return ParseError.InvalidUnicodeEscape;
                    if (!Unicode.isLowSurrogate(low_unit)) return inner;
                    codepoint = 0x10000 + ((@as(u21, first_unit) - 0xD800) << 10) + (@as(u21, low_unit) - 0xDC00);
                    i += 6;
                } else if (Unicode.isLowSurrogate(codepoint)) {
                    return inner;
                }

                var buf: [4]u8 = undefined;
                const written = std.unicode.utf8Encode(codepoint, &buf) catch return ParseError.InvalidUnicodeEscape;
                try decoded.appendSlice(self.allocator, buf[0..written]);
            },
            // JSON5 NonEscapeCharacter: any other escaped char is itself
            // (`\q` -> `q`). Strict JSON rejects unknown escapes.
            else => if (json5) try decoded.append(self.allocator, inner[i]) else return ParseError.UnexpectedToken,
        }
        i += 1;
    }
    const owned = try decoded.toOwnedSlice(self.allocator);
    // Until the append below succeeds nothing else owns `owned`.
    errdefer self.allocator.free(owned);

    try self.owned_strings.append(self.allocator, owned);
    return owned;
}

/// Parses `digits` as a fixed-width hexadecimal escape payload, or returns
/// null when any character is not a hex digit.
/// The explicit digit check matters: `std.fmt.parseInt` on its own also
/// accepts a leading sign and `_` digit separators, which would let inputs
/// such as `\u+041` or `\u1_2F` through as valid escapes.
fn hexEscapeValue(comptime T: type, digits: []const u8) ?T {
    for (digits) |digit| {
        if (!std.ascii.isHex(digit)) return null;
    }
    return std.fmt.parseInt(T, digits, 16) catch null;
}

/// Classifies a number token as integer or float while keeping its source
/// text verbatim in `raw`. More than one `.` yields `error.InvalidNumber`.
pub fn getNumber(slice: []const u8) ParseError!AST.Node.Kind.Number {
    // JSON5 hexadecimal integers (`0xC8`, optionally signed) have neither dot
    // nor exponent, and their `e`/`E` are digits — so they are recognised
    // before the dot/exponent test below can misread them as floats.
    const has_sign = slice.len > 0 and (slice[0] == '+' or slice[0] == '-');
    const digits = if (has_sign) slice[1..] else slice;
    const is_hex = digits.len >= 2 and digits[0] == '0' and (digits[1] == 'x' or digits[1] == 'X');
    if (is_hex) return .{ .raw = slice, .kind = .integer };

    var dot_count: usize = 0;
    for (slice) |char| dot_count += @intFromBool(char == '.');
    if (dot_count > 1) return error.InvalidNumber;

    const has_exponent = std.mem.indexOfAny(u8, slice, "eE") != null;
    return .{
        .raw = slice,
        .kind = if (dot_count == 1 or has_exponent) .float else .integer,
    };
}

/// Main entry point when only the AST is wanted: parses `input` and returns
/// the AST, releasing the span table that `parse` produces alongside it.
pub fn parseAbstract(allocator: std.mem.Allocator, input: []const u8, format: Type) !AST {
    const document = try parse(allocator, input, format);
    // The caller never sees the spans, so they are freed on the way out.
    defer allocator.free(document.node_spans);
    return document.ast;
}

/// Parses `input` in the given dialect with a throwaway parser instance and
/// returns the resulting `Document`. The parser's scratch state is released
/// before returning, on success and on error alike.
pub fn parse(allocator: std.mem.Allocator, input: []const u8, format: Type) !Document {
    var instance = Parser{ .allocator = allocator };
    defer instance.deinit();
    const document = try instance.parse_once(input, format);
    return document;
}

/// Tokenizes `input` and runs the state machine over the token slice,
/// building nodes, spans and (JSON5/JSONC) comment attachments, then moves the
/// finished buffers out of the parser into the returned `Document`.
///
/// Whitespace and comment tokens are handled before the state switch and never
/// change `state`. Any token the current state does not accept yields
/// `ParseError.UnexpectedToken`, as does input that never produced a root value.
///
/// On error, whatever has not been moved out stays in `self` and is released
/// by `deinit`; buffers already moved out are released by the `errdefer`s here.
fn parse_once(self: *Parser, input: []const u8, kind: Type) !Document {
    self.format = kind;
    var tokenizer: Tokenizer = .{
        .allocator = self.allocator,
        .str = input,
        .kind = kind,
    };

    const tokens = try tokenizer.tokenize();
    defer self.allocator.free(tokens);

    // Each Document.Node has an id, a kind, and a next_sibling ID.
    // We produce them from the tokens.

    for (tokens, 0..) |token, i| {
        switch (token.kind) {
            // A newline closes the previous value's trailing-comment window: a
            // comment on the next line leads the next node instead.
            .whitespace => {
                if (std.mem.indexOfScalar(u8, input[token.span.start..token.span.end], '\n') != null)
                    self.last_value_id = null;
                continue;
            },
            .comment => {
                try self.handleComment(input, tokens, i);
                continue;
            },
            else => {},
        }

        switch (self.state) {
            .ExpectValue, .ExpectObjectValue => try self.consumeValueToken(input, token),

            // Same as a plain value position, except `]` may close the array.
            .ExpectArrayValueOrEnd => switch (token.kind) {
                .close_bracket => {
                    const id = try self.closeContainer(token.span.end);
                    try self.finishValue(id);
                },
                else => try self.consumeValueToken(input, token),
            },
            .ExpectArrayCommaOrEnd => switch (token.kind) {
                .close_bracket => {
                    const id = try self.closeContainer(token.span.end);
                    try self.finishValue(id);
                },
                // JSON5 permits a trailing comma: route to the state that also
                // accepts `]`. Strict JSON must then see a value, so `[1,]`
                // stays an error.
                .comma => self.state = if (self.format == .JSON5) .ExpectArrayValueOrEnd else .ExpectValue,
                else => return ParseError.UnexpectedToken,
            },

            .ExpectObjectKeyOrEnd => switch (token.kind) {
                .string, .identifier, .true_, .false_, .null_ => try self.beginKey(input, token),
                .close_brace => {
                    const id = try self.closeContainer(token.span.end);
                    try self.finishValue(id);
                },
                else => return ParseError.UnexpectedToken,
            },
            .ExpectObjectKey => switch (token.kind) {
                .string, .identifier, .true_, .false_, .null_ => try self.beginKey(input, token),
                else => return ParseError.UnexpectedToken,
            },
            .ExpectObjectColon => switch (token.kind) {
                .colon => self.state = .ExpectObjectValue,
                else => return ParseError.UnexpectedToken,
            },
            .ExpectObjectCommaOrEnd => switch (token.kind) {
                .close_brace => {
                    const id = try self.closeContainer(token.span.end);
                    try self.finishValue(id);
                },
                // JSON5 permits a trailing comma before `}`.
                .comma => self.state = if (self.format == .JSON5) .ExpectObjectKeyOrEnd else .ExpectObjectKey,
                else => return ParseError.UnexpectedToken,
            },

            .ExpectEndOfFile => if (token.kind != .end_of_file) return ParseError.UnexpectedToken,
        }
    }

    // Token loop completed.
    // NOTE(review): comments still buffered in `pending_leading` here (e.g. an
    // own-line comment after the root value) are never attached to any node;
    // `deinit` just releases the buffer. Confirm that dropping them is intended.
    const root = self.root orelse return ParseError.UnexpectedToken;

    // Move the buffers out of the parser one by one. Each moved buffer gets an
    // errdefer so that a later allocation failure cannot strand it.
    const nodes = try self.nodes.toOwnedSlice(self.allocator);
    errdefer self.allocator.free(nodes);
    self.nodes = .empty;

    const node_spans = try self.node_spans.toOwnedSlice(self.allocator);
    errdefer self.allocator.free(node_spans);
    self.node_spans = .empty;

    const owned_strings = try self.owned_strings.toOwnedSlice(self.allocator);
    // Once moved out, `deinit` no longer sees these strings, so a failure in
    // the comment materialization below must free them here.
    errdefer {
        for (owned_strings) |string| self.allocator.free(string);
        self.allocator.free(owned_strings);
    }
    self.owned_strings = .empty;

    var ast: AST = .{
        .allocator = self.allocator,
        .owned_strings = owned_strings,
        .root = root,
        .nodes = nodes,
    };
    // Materialized last: this is the final fallible step. Hand the per-node
    // comment records to the AST only when comments were actually attached;
    // otherwise the list stays in the parser and `deinit` releases it.
    if (self.comments_seen) {
        ast.node_comments = try self.node_comments.toOwnedSlice(self.allocator);
        self.node_comments = .empty;
    }

    return .{
        .source = input,
        .ast = ast,
        .node_spans = node_spans,
    };
}

/// Handles one token at a position where a value must start: opens a container
/// for `{` / `[`, or creates a scalar node and completes it via `finishValue`.
/// Any other token yields `ParseError.UnexpectedToken`.
fn consumeValueToken(self: *Parser, input: []const u8, token: Token) !void {
    switch (token.kind) {
        .open_brace => {
            const id = try self.addNode(.{ .mapping = null }, token.span);
            try self.openContainer(.object, id);
            self.state = .ExpectObjectKeyOrEnd;
        },
        .open_bracket => {
            const id = try self.addNode(.{ .sequence = null }, token.span);
            try self.openContainer(.array, id);
            self.state = .ExpectArrayValueOrEnd;
        },
        .null_, .true_, .false_, .string, .number, .identifier => {
            const id = try self.addTokenNode(input, token);
            try self.finishValue(id);
        },
        else => return ParseError.UnexpectedToken,
    }
}

/// Releases everything the parser still owns. Safe after a successful parse
/// (the buffers handed to the `Document` were emptied in the parser) and after
/// a failed one (everything built so far is still here and is freed).
pub fn deinit(self: *Parser) void {
    self.container_stack.deinit(self.allocator);
    self.nodes.deinit(self.allocator);
    self.node_spans.deinit(self.allocator);
    // Decoded strings are individually heap-allocated by `getString`.
    for (self.owned_strings.items) |string| {
        self.allocator.free(string);
    }
    self.owned_strings.deinit(self.allocator);
    // After a successful parse with comments, these records moved to the AST
    // and the list is empty; otherwise they are freed here. Both the `leading`
    // run (from `claimLeading`) and the `dangling` run (from `claimDangling` /
    // `appendDangling`) are allocated slices. Comment text borrows `input`.
    for (self.node_comments.items) |nc| {
        self.allocator.free(nc.leading);
        self.allocator.free(nc.dangling);
    }
    self.node_comments.deinit(self.allocator);
    self.pending_leading.deinit(self.allocator);
}

// ===============
// PARSING HELPERS
// ===============

/// Appends a new node (with its span and an empty comment record) and returns
/// its id, which is its index in `nodes`. Used for every node kind; container
/// nodes are appended when `[` / `{` is seen and patched later.
fn addNode(self: *Parser, kind: AST.Node.Kind, span: Span) !AST.Node.Id {
    const new_id: AST.Node.Id = @intCast(self.nodes.items.len);
    const node: AST.Node = .{
        .id = new_id,
        .kind = kind,
        .next_sibling = null, // patched by `attachChild` if a sibling follows
    };

    // The three lists grow together so an id indexes all of them.
    try self.nodes.append(self.allocator, node);
    try self.node_spans.append(self.allocator, span);
    try self.node_comments.append(self.allocator, .{});

    // Buffered leading comments go to the first node created after them — a
    // key, a value, or a container. By the time `finishValue` creates the
    // keyvalue pair node the buffer is already drained, so nothing happens
    // for that node.
    try self.claimLeading(new_id);
    return new_id;
}

/// Creates the node for a scalar token: derives the node kind from the token
/// text, then appends it with the token's span.
fn addTokenNode(self: *Parser, input: []const u8, token: Token) !AST.Node.Id {
    const node_kind = try self.tokenKind(input, token);
    return self.addNode(node_kind, token.span);
}

/// Maps a scalar token to the AST node kind it denotes. Tokens that cannot be
/// a value yield `ParseError.UnexpectedToken`.
fn tokenKind(self: *Parser, input: []const u8, token: Token) ParserError!AST.Node.Kind {
    const text = token.source(input);
    switch (token.kind) {
        .null_ => return .null_,
        .true_, .false_ => return .{ .boolean = try getBool(text) },
        .string => return .{ .string = try self.getString(text) },
        .number => {
            // `Infinity` / `NaN` take precedence over ordinary number parsing.
            if (specialNumber(text)) |special| return special;
            return .{ .number = try getNumber(text) };
        },
        .identifier => {
            // A bare identifier is only a value when it spells `Infinity`/`NaN`.
            if (specialNumber(text)) |special| return special;
            return ParseError.UnexpectedToken;
        },
        else => return ParseError.UnexpectedToken,
    }
}

/// Recognises `Infinity` and `NaN`, each optionally prefixed by `+` or `-`,
/// and wraps the full text (sign included) in an extended `number_special`
/// node kind. Returns null for anything else.
fn specialNumber(raw: []const u8) ?AST.Node.Kind {
    const signed = raw.len > 0 and (raw[0] == '+' or raw[0] == '-');
    const word = if (signed) raw[1..] else raw;

    const is_special = std.mem.eql(u8, word, "Infinity") or std.mem.eql(u8, word, "NaN");
    if (!is_special) return null;
    return .{ .extended = .{ .kind = .number_special, .text = raw } };
}

/// Creates the key node for an object entry, remembers it on the innermost
/// open container, and moves on to expecting `:`. Accepts a quoted string in
/// any dialect; identifiers and bare keywords only in JSON5.
fn beginKey(self: *Parser, input: []const u8, token: Token) ParserError!void {
    const quoted = token.kind == .string;
    if (!quoted and self.format != .JSON5) return ParseError.UnexpectedToken;

    const key_id = try self.addKeyNode(input, token);

    const open = self.container_stack.items;
    open[open.len - 1].pending_key = key_id;
    self.state = .ExpectObjectColon;
}

/// Appends a string node for an object key. Quoted keys go through
/// `getString`; identifier and keyword keys use their source text as-is.
fn addKeyNode(self: *Parser, input: []const u8, token: Token) ParserError!AST.Node.Id {
    const source_text = token.source(input);
    const key_text: []const u8 = switch (token.kind) {
        .string => try self.getString(source_text),
        .identifier, .true_, .false_, .null_ => source_text,
        else => return ParseError.UnexpectedToken,
    };
    return self.addNode(.{ .string = key_text }, token.span);
}

/// Links `child_id` onto the end of `parent`'s child chain. The first child is
/// stored in the container node's own kind payload; later children are chained
/// through the previous child's `next_sibling`.
fn attachChild(self: *Parser, parent: *OpenContainer, child_id: AST.Node.Id) void {
    if (parent.first_child == null) {
        parent.first_child = child_id;
        self.nodes.items[parent.id].kind = switch (parent.kind) {
            .array => .{ .sequence = child_id },
            .object => .{ .mapping = child_id },
        };
    } else {
        const previous = parent.last_child.?;
        self.nodes.items[previous].next_sibling = child_id;
    }
    parent.last_child = child_id;
}

/// Called once a value (scalar or closed container) is complete. Records it as
/// the trailing-comment candidate, then either makes it the root, attaches it
/// to the enclosing array, or pairs it with the pending key of the enclosing
/// object. Updates `state` to what may follow.
fn finishValue(self: *Parser, value_id: AST.Node.Id) !void {
    // The value node anchors a same-line trailing comment, for array elements
    // and object entries alike.
    self.last_value_id = value_id;

    const depth = self.container_stack.items.len;
    if (depth == 0) {
        // No enclosing container: this value is the whole document.
        self.root = value_id;
        self.state = .ExpectEndOfFile;
        return;
    }

    const parent = &self.container_stack.items[depth - 1];
    if (parent.kind == .array) {
        self.attachChild(parent, value_id);
        self.state = .ExpectArrayCommaOrEnd;
        return;
    }

    // Object entry: wrap key and value in a keyvalue node spanning both.
    const key_id = parent.pending_key orelse return ParseError.UnexpectedToken;
    parent.pending_key = null;

    const pair_span: Span = .{
        .start = self.node_spans.items[key_id].start,
        .end = self.node_spans.items[value_id].end,
    };
    const pair_id = try self.addNode(
        .{ .keyvalue = .{ .key = key_id, .value = value_id } },
        pair_span,
    );

    self.attachChild(parent, pair_id);
    self.state = .ExpectObjectCommaOrEnd;
}

// ── comments ────────────────────────────────────────────────────────────────

/// Decides where the comment at `tokens[i]` belongs and attaches it.
///
/// When the comment is the last thing on its line (`endsLine`):
/// - if a value has just completed with no newline since (`last_value_id`
///   set), the comment binds to that value — as `dangling` when the value is a
///   multi-line container being closed on this line, otherwise as `trailing`;
/// - else, if it directly follows an opening `[` / `{` on the same line, it
///   becomes the `trailing` comment of that innermost open container.
/// In every other case it is buffered as a leading comment for the next node.
/// The `endsLine` test is what separates `1, // trail` (trailing) from
/// `1, /*c*/ b` (leading of `b`).
fn handleComment(self: *Parser, input: []const u8, tokens: []const Token, i: usize) !void {
    const comment = parseComment(tokens[i].source(input));

    if (endsLine(input, tokens, i)) {
        if (self.last_value_id) |value_id| {
            self.last_value_id = null; // one trailing per value
            // A comment on the CLOSING line of a multi-line container belongs
            // at the bottom of its body, so it joins the `dangling` run;
            // `trailing` is kept for the opening line. Inline containers and
            // scalars keep the same-line `trailing`.
            const comment_start = tokens[i].span.start;
            if (self.multilineContainer(input, value_id, comment_start)) {
                try self.appendDangling(value_id, comment);
            } else {
                self.setTrailing(value_id, comment);
            }
            return;
        }

        // `[ // note` / `{ // note` — the comment rides the line the container
        // opened on, so it trails the container value.
        const open = self.container_stack.items;
        if (open.len > 0 and afterOpenDelimiter(input, tokens, i)) {
            self.setTrailing(open[open.len - 1].id, comment);
            return;
        }
    }

    try self.pending_leading.append(self.allocator, comment);
}

/// Reports whether nothing but whitespace follows `tokens[i]` on its line
/// before a newline, a comma, a closing delimiter, or end of input. Any other
/// token first (a key, a value, another comment) means the comment at `i`
/// leads that content instead, and the result is false.
fn endsLine(input: []const u8, tokens: []const Token, i: usize) bool {
    const following = tokens[@min(i + 1, tokens.len)..];
    for (following) |next| {
        switch (next.kind) {
            .comma, .close_brace, .close_bracket, .end_of_file => return true,
            .whitespace => {
                const text = input[next.span.start..next.span.end];
                if (std.mem.indexOfScalar(u8, text, '\n') != null) return true;
            },
            else => return false,
        }
    }
    return true;
}

/// Reports whether, scanning backwards from `tokens[i]` over same-line
/// whitespace only, the first other token is an opening `[` or `{`. This
/// identifies a comment sharing the line a container opened on (`[ // c`).
fn afterOpenDelimiter(input: []const u8, tokens: []const Token, i: usize) bool {
    var remaining = i;
    while (remaining > 0) : (remaining -= 1) {
        const prev = tokens[remaining - 1];
        switch (prev.kind) {
            .open_bracket, .open_brace => return true,
            .whitespace => {
                const text = input[prev.span.start..prev.span.end];
                // Crossing a newline means we left the comment's line.
                if (std.mem.indexOfScalar(u8, text, '\n') != null) return false;
            },
            else => return false,
        }
    }
    return false;
}

/// Turns a comment token's raw bytes into an `AST.Comment`: drops the comment
/// markers, trims surrounding whitespace, and records whether it was a line or
/// block comment. The returned text is a sub-slice of `raw`.
fn parseComment(raw: []const u8) AST.Comment {
    const is_block = raw.len >= 2 and raw[1] == '*';
    if (!is_block) {
        // `// …` to end of line.
        return .{ .text = std.mem.trim(u8, raw[2..], " \t\r"), .style = .line };
    }
    // `/* … */` — the tokenizer guarantees the closing `*/`.
    const body = raw[2 .. raw.len - 2];
    return .{ .text = std.mem.trim(u8, body, " \t\r\n"), .style = .block };
}

/// Moves any buffered leading comments onto node `id` as its `leading` run;
/// the node's comment record takes ownership of the slice. Does nothing when
/// the buffer is empty.
fn claimLeading(self: *Parser, id: AST.Node.Id) !void {
    const buffered = self.pending_leading.items.len;
    if (buffered > 0) {
        const run = try self.pending_leading.toOwnedSlice(self.allocator);
        self.pending_leading = .empty;
        self.node_comments.items[id].leading = run;
        self.comments_seen = true;
    }
}

/// Stores `c` as node `id`'s trailing comment (replacing any previous one) and
/// marks that comments exist.
fn setTrailing(self: *Parser, id: AST.Node.Id, c: AST.Comment) void {
    const record = &self.node_comments.items[id];
    record.trailing = c;
    self.comments_seen = true;
}

/// Reports whether node `id` is a sequence or mapping with at least one
/// newline between the start of its span and `comment_start` — i.e. a
/// container that opened on an earlier line than the comment. Scalars and
/// single-line containers return false.
fn multilineContainer(self: *Parser, input: []const u8, id: AST.Node.Id, comment_start: usize) bool {
    const is_container = switch (self.nodes.items[id].kind) {
        .sequence, .mapping => true,
        else => false,
    };
    if (!is_container) return false;

    const open_offset = self.node_spans.items[id].start;
    if (open_offset >= comment_start) return false;

    const between = input[open_offset..comment_start];
    return std.mem.indexOfScalar(u8, between, '\n') != null;
}

/// Adds `c` to the end of node `id`'s `dangling` run by allocating a run one
/// element longer, copying the old entries, and freeing the old run. Used for
/// a comment on a container's closing line.
fn appendDangling(self: *Parser, id: AST.Node.Id, c: AST.Comment) !void {
    const record = &self.node_comments.items[id];
    const previous = record.dangling;

    const extended = try self.allocator.alloc(AST.Comment, previous.len + 1);
    @memcpy(extended[0..previous.len], previous);
    extended[previous.len] = c;

    self.allocator.free(previous);
    record.dangling = extended;
    self.comments_seen = true;
}

/// Moves any buffered comments that no node claimed onto container `id` as its
/// `dangling` run; the record takes ownership of the slice. Does nothing when
/// the buffer is empty.
fn claimDangling(self: *Parser, id: AST.Node.Id) !void {
    const buffered = self.pending_leading.items.len;
    if (buffered > 0) {
        const run = try self.pending_leading.toOwnedSlice(self.allocator);
        self.pending_leading = .empty;
        self.node_comments.items[id].dangling = run;
        self.comments_seen = true;
    }
}

/// Pushes a bookkeeping entry for a container whose node was already added to
/// `nodes`, making it the innermost open container.
fn openContainer(self: *Parser, kind: ContainerKind, node_id: AST.Node.Id) !void {
    const entry: OpenContainer = .{ .id = node_id, .kind = kind };
    try self.container_stack.append(self.allocator, entry);
}

/// Pops the innermost open container, extends its span to `span_end`, gives it
/// any still-buffered comments as its `dangling` run, and returns its node id.
/// Yields `ParseError.UnexpectedToken` when no container is open.
fn closeContainer(self: *Parser, span_end: usize) !AST.Node.Id {
    const closed = self.container_stack.pop() orelse return ParseError.UnexpectedToken;
    const node_id = closed.id;
    self.node_spans.items[node_id].end = span_end;
    try self.claimDangling(node_id);
    return node_id;
}

// =======
// Testing
// =======

/// Parses `input` as strict JSON and fails the test unless the resulting AST
/// compares equal to `expected` via `AST.eql`. The parsed AST is released
/// before returning.
fn testParser(input: []const u8, expected: AST) !void {
    var actual = try Parser.parseAbstract(testing.allocator, input, .JSON);
    defer actual.deinit();

    const matches = expected.eql(actual);
    try testing.expect(matches);
}

/// Asserts that parsing `input` as strict JSON fails with `expected_error`.
/// If parsing unexpectedly succeeds, the AST is released and the test fails.
fn testParserError(input: []const u8, expected_error: anyerror) !void {
    var parsed = Parser.parseAbstract(testing.allocator, input, .JSON) catch |err| {
        try testing.expectEqual(expected_error, err);
        return;
    };
    // Reaching this point means the parse succeeded: clean up, then fail.
    defer parsed.deinit();
    try testing.expect(false);
}

test "simple JSON document" {
    // Expected node table for a sequence holding one mapping with one entry:
    // the two containers, then the key, the value, and the keyvalue pair that
    // links them.
    const expected_nodes = [_]AST.Node{
        .{ .id = 0, .kind = .{ .sequence = 1 }, .next_sibling = null },
        .{ .id = 1, .kind = .{ .mapping = 4 }, .next_sibling = null },
        .{ .id = 2, .kind = .{ .string = "hello" }, .next_sibling = null },
        .{ .id = 3, .kind = .{ .string = "world" }, .next_sibling = null },
        .{ .id = 4, .kind = .{ .keyvalue = .{ .key = 2, .value = 3 } }, .next_sibling = null },
    };
    const input =
        \\[{"hello":"world"}]
    ;
    try testParser(input, .{
        .allocator = testing.allocator,
        .root = 0,
        .nodes = &expected_nodes,
    });
}

test "decodes JSON string escapes" {
    // Every simple (non-\u) JSON escape, and the bytes each must decode to.
    const input = "\"quote: \\\" slash: \\\\ newline: \\n tab: \\t backspace: \\b formfeed: \\f slash: \\/\"";
    const expected = "quote: \" slash: \\ newline: \n tab: \t backspace: \x08 formfeed: \x0c slash: /";

    var ast = try Parser.parseAbstract(testing.allocator, input, .JSON);
    defer ast.deinit();

    const root_kind = ast.nodes[ast.root].kind;
    if (root_kind != .string) return error.TestUnexpectedResult;

    try testing.expectEqualSlices(u8, expected, root_kind.string);
}

test "decodes JSON unicode escapes" {
    // Covers a one-byte, a two-byte, and a surrogate-pair (four-byte) code point.
    const input = "\"A: \\u0041 latin: \\u00E9 clef: \\uD834\\uDD1E\"";
    const expected = "A: A latin: é clef: 𝄞";

    var ast = try Parser.parseAbstract(testing.allocator, input, .JSON);
    defer ast.deinit();

    const root_kind = ast.nodes[ast.root].kind;
    if (root_kind != .string) return error.TestUnexpectedResult;

    try testing.expectEqualSlices(u8, expected, root_kind.string);
}

test "decodes escaped object keys" {
    // The key is written with a \u escape for 'l'; lookup uses the decoded form.
    const input = "{\"he\\u006clo\":1}";

    var ast = try Parser.parseAbstract(testing.allocator, input, .JSON);
    defer ast.deinit();

    const found = try ast.getValByPath(&.{.{ .key = "hello" }});
    if (found.kind != .number) return error.TestUnexpectedResult;

    try testing.expectEqualSlices(u8, "1", found.kind.number.raw);
}

test "preserves unpaired unicode surrogate escapes as raw strings" {
    // Each input is a JSON string containing a surrogate escape without a valid
    // partner; the expected node text is the escape sequence left undecoded.
    const Case = struct { input: []const u8, raw: []const u8 };
    const cases = [_]Case{
        .{ .input = "\"\\uD800\"", .raw = "\\uD800" },
        .{ .input = "\"\\uDC00\"", .raw = "\\uDC00" },
        .{ .input = "\"\\uD800x\"", .raw = "\\uD800x" },
        .{ .input = "\"\\uD800\\u0041\"", .raw = "\\uD800\\u0041" },
    };

    for (cases) |case| {
        const expected_nodes = [_]AST.Node{
            .{ .id = 0, .kind = .{ .string = case.raw }, .next_sibling = null },
        };
        try testParser(case.input, .{
            .allocator = testing.allocator,
            .root = 0,
            .nodes = &expected_nodes,
        });
    }
}

test "UTF-8 BOM before document is ignored" {
    // A byte-order mark followed by an empty object yields just the empty mapping.
    const input = "\xEF\xBB\xBF{}";
    const expected_nodes = [_]AST.Node{
        .{ .id = 0, .kind = .{ .mapping = null }, .next_sibling = null },
    };
    try testParser(input, .{
        .allocator = testing.allocator,
        .root = 0,
        .nodes = &expected_nodes,
    });
}

test "object trailing comma is rejected" {
    // Strict JSON: a comma directly before the closing brace is an error.
    const input = "{\"a\":1,}";
    try testParserError(input, error.UnexpectedToken);
}

// ── JSON5 ────────────────────────────────────────────────────────────────────

/// Parses `input` in JSON5 mode using the testing allocator. The caller is
/// responsible for calling `deinit` on the returned AST.
fn parseJson5(input: []const u8) !AST {
    const ast = try Parser.parseAbstract(testing.allocator, input, .JSON5);
    return ast;
}

test "json5: trailing commas accepted (and still rejected in strict JSON)" {
    // JSON5 array with a trailing comma: two items.
    var array_ast = try parseJson5("[1,2,]");
    defer array_ast.deinit();
    const array_items = countItems(array_ast, array_ast.root);
    try testing.expectEqual(@as(usize, 2), array_items);

    // JSON5 object with a trailing comma: one entry.
    var object_ast = try parseJson5("{a:1,}");
    defer object_ast.deinit();
    const object_items = countItems(object_ast, object_ast.root);
    try testing.expectEqual(@as(usize, 1), object_items);

    // Strict JSON keeps rejecting both.
    const strict_inputs = [_][]const u8{ "[1,2,]", "{\"a\":1,}" };
    for (strict_inputs) |strict_input| {
        try testParserError(strict_input, error.UnexpectedToken);
    }
}

test "json5: leading comma is still rejected" {
    // A comma with no preceding element stays an error even in JSON5.
    const inputs = [_][]const u8{ "[,1]", "[,]" };
    for (inputs) |input| {
        try testJson5Error(input, error.UnexpectedToken);
    }
}

test "json5: unquoted and keyword object keys" {
    // Identifier-style keys, including ones spelled like keywords and literals,
    // must all resolve to their number values.
    const keys = .{ "hello", "$_$9", "while", "null" };

    var ast = try parseJson5("{ hello: 1, $_$9: 2, while: 3, null: 4 }");
    defer ast.deinit();

    inline for (keys) |key| {
        const found = try ast.getValByPath(&.{.{ .key = key }});
        const is_number = found.kind == .number;
        try testing.expect(is_number);
    }
}

test "json5: single-quoted strings, escapes, and line continuation" {
    // Escaped single quote inside a single-quoted string.
    var quoted = try parseJson5("'I can\\'t'");
    defer quoted.deinit();
    const quoted_text = quoted.nodes[quoted.root].kind.string;
    try testing.expectEqualSlices(u8, "I can't", quoted_text);

    // Backslash-newline joins the two lines without inserting anything.
    var continued = try parseJson5("'line 1 \\\nline 2'");
    defer continued.deinit();
    const continued_text = continued.nodes[continued.root].kind.string;
    try testing.expectEqualSlices(u8, "line 1 line 2", continued_text);
}

test "json5: Infinity and NaN become extended number_special" {
    const literals = .{ "Infinity", "-Infinity", "+Infinity", "NaN" };
    inline for (literals) |literal| {
        var ast = try parseJson5(literal);
        defer ast.deinit();

        // The root must be an `extended` node tagged number_special whose text
        // is the literal exactly as written.
        const root_kind = ast.nodes[ast.root].kind;
        try testing.expect(root_kind == .extended);
        try testing.expect(root_kind.extended.kind == .number_special);
        try testing.expectEqualSlices(u8, literal, root_kind.extended.text);
    }
}

test "json5: hexadecimal, leading/trailing point, and signed numbers" {
    // Each case pairs a JSON5 number literal with the number node it must yield.
    const Case = struct { input: []const u8, want: AST.Node.Kind.Number };
    const cases = [_]Case{
        .{ .input = "0xC8", .want = .{ .raw = "0xC8", .kind = .integer } },
        .{ .input = "0xc8e4", .want = .{ .raw = "0xc8e4", .kind = .integer } },
        .{ .input = "+15", .want = .{ .raw = "+15", .kind = .integer } },
        .{ .input = ".5", .want = .{ .raw = ".5", .kind = .float } },
        .{ .input = "5.", .want = .{ .raw = "5.", .kind = .float } },
    };

    inline for (cases) |case| {
        var ast = try parseJson5(case.input);
        defer ast.deinit();

        const got = ast.nodes[ast.root].kind.number;
        try testing.expectEqual(case.want.kind, got.kind);
        try testing.expectEqualSlices(u8, case.want.raw, got.raw);
    }
}

test "json5: octal and lone-decimal forms are rejected" {
    // Malformed number literals and the error each one must produce.
    const Case = struct { input: []const u8, err: anyerror };
    const cases = [_]Case{
        .{ .input = "010", .err = error.LeadingZero },
        .{ .input = "0x", .err = error.UnexpectedToken },
        .{ .input = ".", .err = error.UnexpectedToken },
        .{ .input = "+098", .err = error.LeadingZero },
    };
    for (cases) |case| {
        try testJson5Error(case.input, case.err);
    }
}

/// Asserts that parsing `input` in JSON5 mode fails with `expected_error`.
/// If parsing unexpectedly succeeds, the AST is released and the test fails.
fn testJson5Error(input: []const u8, expected_error: anyerror) !void {
    var parsed = Parser.parseAbstract(testing.allocator, input, .JSON5) catch |err| {
        try testing.expectEqual(expected_error, err);
        return;
    };
    // Reaching this point means the parse succeeded: clean up, then fail.
    defer parsed.deinit();
    try testing.expect(false);
}

test "json5: captures leading, trailing, line and block comments" {
    const input =
        \\{
        \\  // leading on a
        \\  a: 1, // trailing on a
        \\  /* block before b */
        \\  b: [
        \\    2 /* trailing block on 2 */,
        \\    3 // trailing on 3
        \\  ]
        \\}
    ;
    var ast = try parseJson5(input);
    defer ast.deinit();

    const root = ast.nodes[ast.root];
    const first_entry = ast.nodes[root.kind.mapping.?];

    // Entry `a`: the leading comment binds to the key, the trailing one to the value.
    const entry_a = first_entry.kind.keyvalue;
    try testing.expectEqualStrings("leading on a", ast.comments(entry_a.key).leading[0].text);
    try testing.expectEqualStrings("trailing on a", ast.comments(entry_a.value).trailing.?.text);

    // Entry `b`: the block comment on its own line leads the key.
    const entry_b = ast.nodes[first_entry.next_sibling.?].kind.keyvalue;
    const b_leading = ast.comments(entry_b.key).leading[0];
    try testing.expectEqualStrings("block before b", b_leading.text);
    try testing.expect(b_leading.style == .block);

    // Elements of `b`'s array each carry their own trailing comment.
    const array_node = ast.nodes[entry_b.value];
    const elem_two = ast.nodes[array_node.kind.sequence.?];
    const elem_three = ast.nodes[elem_two.next_sibling.?];

    const two_trailing = ast.comments(elem_two.id).trailing.?;
    try testing.expectEqualStrings("trailing block on 2", two_trailing.text);
    try testing.expect(two_trailing.style == .block);

    const three_trailing = ast.comments(elem_three.id).trailing.?;
    try testing.expectEqualStrings("trailing on 3", three_trailing.text);
}

test "json5: post-comma comment leads the next entry, not trails the previous" {
    // The classic ambiguity: in `1, /*c*/ b` the comma ends 1's trailing window,
    // so the comment must attach to `b` as a leading comment.
    var ast = try parseJson5("{ a: 1, /* c */ b: 2 }");
    defer ast.deinit();

    const first_entry = ast.nodes[ast.nodes[ast.root].kind.mapping.?];
    const entry_a = first_entry.kind.keyvalue;
    const entry_b = ast.nodes[first_entry.next_sibling.?].kind.keyvalue;

    const a_has_no_trailing = ast.comments(entry_a.value).trailing == null;
    try testing.expect(a_has_no_trailing);

    const b_leading_text = ast.comments(entry_b.key).leading[0].text;
    try testing.expectEqualStrings("c", b_leading_text);
}

test "json5: comment-free document carries no comment table" {
    // With no comments in the input, the AST's comment table must stay empty.
    var ast = try parseJson5("{ a: 1, b: [2, 3] }");
    defer ast.deinit();

    const table_len: usize = ast.node_comments.len;
    try testing.expectEqual(@as(usize, 0), table_len);
}

/// Counts the direct children of `container` by walking the sibling chain
/// that starts at its first child. Returns 0 when `container` is neither a
/// sequence nor a mapping, or when it has no children.
fn countItems(ast: AST, container: AST.Node.Id) usize {
    var next = switch (ast.nodes[container].kind) {
        .sequence, .mapping => |first_child| first_child,
        else => return 0,
    };

    var count: usize = 0;
    while (next) |child| {
        count += 1;
        next = ast.nodes[child].next_sibling;
    }
    return count;
}