//! The parser turns a TOML-formatted []const u8 into an AST.
//!
//! Supported statements: `key = value` (bare, quoted, and dotted keys),
//! `[table]` headers, and `[[array.of.tables]]` headers. Values are scalars
//! (strings, integers, floats, booleans, datetimes), arrays, and inline tables.
//!
//! Strings are decoded: the result borrows `source` when no transformation is
//! needed, otherwise it is an allocation tracked in `owned_strings`. Numbers are
//! stored in canonical form (integers as decimal, digit-group underscores
//! stripped; e.g. 0xFF → 255), while datetimes keep their source text verbatim.
//! The original text of every node remains recoverable through its span.
const Parser = @This();
const std = @import("std");
const testing = std.testing;
const AST = @import("../ast/ast.zig");
const Document = @import("../document.zig");
const Type = @import("toml.zig").Type;
const Span = @import("../util/span.zig");
const Tokenizer = @import("tokenizer.zig");
const Token = Tokenizer.Token;
/// Allocator used for every allocation the parser makes.
allocator: std.mem.Allocator,
/// TOML version being parsed; set from `format` at the start of `parseOnce`.
version: Type = .TOML_1_0,
/// The input text. Token spans index into it, and bare keys, undecoded strings
/// and comment text are slices of it.
source: []const u8 = "",
/// Token stream produced by the tokenizer; freed when `parseOnce` returns.
tokens: []const Token = &.{},
/// Cursor: index into `tokens` of the next token to examine.
pos: usize = 0,
/// AST nodes under construction; a node's id is its index in this list.
nodes: std.ArrayList(AST.Node) = .empty,
/// Source span of each node, index-aligned with `nodes`.
spans: std.ArrayList(Span) = .empty,
/// Heap allocations (decoded strings, canonicalized numbers) that are handed to
/// the AST on success and freed by `parse` on failure.
owned_strings: std.ArrayList([]const u8) = .empty,
// Comment layer. Comments are captured in `skipInline`/`skipBlank` (the trivia
// skippers): a comment on the same line as a just-parsed value (`last_value_id`
// set) trails it; one on its own line buffers as `pending_leading`, claimed onto
// the next entry's key in `appendKeyValue`. A newline resets the trailing
// window. Text borrows `source`. `pending_leading` is reserved to `tokens.len`
// once so capture cannot fail. Materialized only when `comments_seen`.
node_comments: std.ArrayList(AST.NodeComments) = .empty,
pending_leading: std.ArrayList(AST.Comment) = .empty,
last_value_id: ?AST.Node.Id = null,
comments_seen: bool = false,
/// The mapping that bare/dotted `key = value` lines attach to. Starts at root,
/// repointed by each `[table]` header.
current_table: AST.Node.Id = 0,
/// Per-mapping provenance, used to enforce TOML's table/key conflict rules.
/// Absent ⇒ a value node or the root.
table_meta: std.AutoHashMapUnmanaged(AST.Node.Id, TableMeta) = .empty,
/// How a table (mapping) node came to exist — determines whether a later header
/// or dotted key may target/extend it. Lookups that find no entry in
/// `table_meta` fall back to `TableMeta{}` (every flag false).
const TableMeta = struct {
/// Defined by its own `[header]` (or `[[array]]` element). Cannot be
/// redefined by a header, nor extended by dotted keys from another line.
explicit: bool = false,
/// Created as the value of a dotted-key segment (`a.b = 1` makes `a`).
/// A later `[a]` header targeting it is rejected as a duplicate.
dotted: bool = false,
/// Created only as an intermediate on a header path (`[a.b.c]` makes
/// `a`, `a.b`). May still be promoted to `explicit` by its own header.
implicit: bool = false,
/// A sequence node created by `[[array.of.tables]]`. Only such sequences
/// may be navigated into (last element) or extended; a static `= [...]`
/// array has no meta and is therefore closed.
aot: bool = false,
/// A mapping from an inline `{ ... }` table — fully defined and closed; no
/// header or dotted key may extend it.
inline_table: bool = false,
};
/// One segment of a (possibly dotted) key: `str` is the decoded key text and
/// `span` is the span of the token it came from.
const KeySeg = struct { str: []const u8, span: Span };
/// Errors raised by the parser itself (as opposed to the tokenizer or the
/// allocator).
pub const ParseError = error{
NotImplemented,
// A token of the wrong kind appeared where the grammar requires another.
UnexpectedToken,
UnclosedString,
// Unknown or malformed backslash escape in a basic string.
BadEscape,
// A \u / \U / \x escape that is not a valid Unicode scalar value.
InvalidUnicode,
InvalidNumber,
InvalidDatetime,
InvalidKey,
// Also used for every table/key redefinition conflict.
DuplicateKey,
// Something other than trivia followed a statement on the same line.
TrailingContent,
// The input as a whole is not valid UTF-8.
InvalidUtf8,
};
/// Everything `parse` can return: parser, tokenizer, and allocation errors.
pub const ParserError = ParseError || Tokenizer.TokenizeError || std.mem.Allocator.Error;
/// Parse `input` as TOML of the given `format` and return the resulting
/// Document. If parsing fails, the partially built node list, span list and
/// owned strings are released before the error is propagated.
pub fn parse(allocator: std.mem.Allocator, input: []const u8, format: Type) ParserError!Document {
    var state: Parser = .{ .allocator = allocator };
    // Runs only when the call below returns an error.
    errdefer {
        state.nodes.deinit(allocator);
        state.spans.deinit(allocator);
        for (state.owned_strings.items) |owned| allocator.free(owned);
        state.owned_strings.deinit(allocator);
    }
    return state.parseOnce(input, format);
}
/// Parse `input` and return only the AST. The Document's `node_spans` slice is
/// freed here, since the caller never receives it.
pub fn parseAbstract(allocator: std.mem.Allocator, input: []const u8, format: Type) ParserError!AST {
    const document = try parse(allocator, input, format);
    defer allocator.free(document.node_spans);
    return document.ast;
}
/// Run one complete parse of `input` with this parser instance.
///
/// Validates UTF-8, tokenizes, parses every top-level statement, then moves
/// the node, span, owned-string and (when any comment was seen) comment lists
/// into the returned Document. Temporary state (tokens, `table_meta`,
/// `pending_leading`, unclaimed `node_comments`) is released on every exit.
/// On error, lists not yet converted to slices are left for the caller
/// (`parse`) to free; slices already taken are freed here.
fn parseOnce(self: *Parser, input: []const u8, format: Type) ParserError!Document {
    self.version = format;
    self.source = input;
    // A TOML file must be valid UTF-8 (catches all bad-utf8 fixtures at once).
    if (!std.unicode.utf8ValidateSlice(input)) return error.InvalidUtf8;

    var tokenizer: Tokenizer = .{ .allocator = self.allocator, .str = input, .version = format };
    self.tokens = try tokenizer.tokenize();
    defer self.allocator.free(self.tokens);
    defer self.table_meta.deinit(self.allocator);

    // Reserve so trivia-skipping can buffer leading comments without failing.
    try self.pending_leading.ensureTotalCapacity(self.allocator, self.tokens.len);
    defer self.pending_leading.deinit(self.allocator);

    // On success the table is moved into the AST and this list emptied; on any
    // error path it (and any owned comment slices) are freed here.
    defer {
        for (self.node_comments.items) |nc| {
            self.allocator.free(nc.leading);
            self.allocator.free(nc.dangling);
        }
        self.node_comments.deinit(self.allocator);
    }

    // Root mapping is node 0.
    const root_id = try self.addNode(.{ .mapping = null }, Span.init(0, input.len));
    self.current_table = root_id;

    self.skipBlank();
    while (!self.atEnd()) {
        switch (self.peek().kind) {
            .key => try self.parseKeyValue(),
            .open_bracket => try self.parseTableHeader(),
            .double_open_bracket => try self.parseArrayTable(),
            else => return error.UnexpectedToken,
        }
        try self.requireLineEnd();
        self.skipBlank();
    }

    // End-of-file orphan comments dangle off the table they sit in (the last one
    // `current_table` points at). Mid-file orphans are instead claimed as the
    // next key's / header's leading.
    try self.claimDangling(self.current_table);

    const nodes = try self.nodes.toOwnedSlice(self.allocator);
    errdefer self.allocator.free(nodes);
    const spans = try self.spans.toOwnedSlice(self.allocator);
    errdefer self.allocator.free(spans);
    const owned = try self.owned_strings.toOwnedSlice(self.allocator);
    // `owned_strings` is empty from here on, so the caller's error cleanup can
    // no longer reach these allocations; release them here if a later step
    // fails.
    errdefer {
        for (owned) |s| self.allocator.free(s);
        self.allocator.free(owned);
    }

    var ast: AST = .{
        .allocator = self.allocator,
        .root = root_id,
        .nodes = nodes,
        .owned_strings = owned,
    };
    if (self.comments_seen) {
        ast.node_comments = try self.node_comments.toOwnedSlice(self.allocator);
        self.node_comments = .empty;
    }
    return .{
        .source = input,
        .ast = ast,
        .node_spans = spans,
    };
}
/// Append a node of `kind` covering `span` and return its id. The id is the
/// node's index; `spans` and `node_comments` each receive a matching entry so
/// the three lists stay index-aligned.
fn addNode(self: *Parser, kind: AST.Node.Kind, span: Span) ParserError!AST.Node.Id {
    const gpa = self.allocator;
    const next_id: AST.Node.Id = @intCast(self.nodes.items.len);
    try self.nodes.append(gpa, .{ .id = next_id, .kind = kind });
    try self.spans.append(gpa, span);
    try self.node_comments.append(gpa, .{});
    return next_id;
}
// ── Token cursor ────────────────────────────────────────────────────────────
/// Return the token at the cursor without consuming it.
fn peek(self: *Parser) Token {
return self.tokens[self.pos];
}
/// Return the token at the cursor and step past it. The cursor never moves
/// beyond the final token, so calling this at the end keeps returning that
/// last token.
fn advance(self: *Parser) Token {
    const current = self.tokens[self.pos];
    const has_next = self.pos + 1 < self.tokens.len;
    if (has_next) self.pos += 1;
    return current;
}
/// True when the token at the cursor is the end-of-file marker.
fn atEnd(self: *Parser) bool {
return self.peek().kind == .end_of_file;
}
/// Consume whitespace and comment tokens, stopping at the first token of any
/// other kind (newlines included). Each comment is passed to `captureComment`;
/// because it sits on the current line it may trail a just-parsed value.
fn skipInline(self: *Parser) void {
    while (true) {
        const tok = self.peek();
        switch (tok.kind) {
            .whitespace => {},
            .comment => self.captureComment(tok),
            else => return,
        }
        self.pos += 1;
    }
}
/// Consume whitespace, comments, and newlines, stopping at the first token of
/// any other kind. A newline clears `last_value_id`, closing the
/// trailing-comment window so that later comments lead the next entry instead.
fn skipBlank(self: *Parser) void {
    while (true) {
        const tok = self.peek();
        switch (tok.kind) {
            .whitespace => {},
            .comment => self.captureComment(tok),
            .newline => self.last_value_id = null,
            else => return,
        }
        self.pos += 1;
    }
}
// ── Comments ─────────────────────────────────────────────────────────────────
/// Record the comment in `tok`. When a value was just parsed on this line
/// (`last_value_id` is set) the comment becomes that value's trailing comment
/// and the window closes; otherwise it is buffered in `pending_leading`.
fn captureComment(self: *Parser, tok: Token) void {
    const comment: AST.Comment = .{
        .text = commentText(self.tokenText(tok)),
        .style = .line,
    };
    const target = self.last_value_id orelse {
        // capacity reserved in parseOnce
        self.pending_leading.appendAssumeCapacity(comment);
        return;
    };
    self.node_comments.items[target].trailing = comment;
    self.comments_seen = true;
    self.last_value_id = null; // one trailing per value
}
/// Return the text of a comment token: drop one leading `#` if present, then
/// trim spaces, tabs and carriage returns from both ends. The result is a
/// sub-slice of `raw`.
fn commentText(raw: []const u8) []const u8 {
    const has_hash = raw.len > 0 and raw[0] == '#';
    const start: usize = if (has_hash) 1 else 0;
    return std.mem.trim(u8, raw[start..], " \t\r");
}
/// Attach the buffered leading comments to node `id` as a freshly allocated
/// slice and empty the buffer (its reserved capacity is kept). Does nothing
/// when no comments are buffered.
fn claimLeading(self: *Parser, id: AST.Node.Id) ParserError!void {
    const buffered = self.pending_leading.items;
    if (buffered.len == 0) return;
    const copy = try self.allocator.dupe(AST.Comment, buffered);
    self.pending_leading.clearRetainingCapacity();
    self.node_comments.items[id].leading = copy;
    self.comments_seen = true;
}
/// Attach buffered orphan comments — ones that no key or header followed — to
/// table `id` as its `dangling` run, then empty the buffer. Called once at end
/// of file; mid-file orphans lead the next key/header instead. Does nothing
/// when no comments are buffered.
fn claimDangling(self: *Parser, id: AST.Node.Id) ParserError!void {
    const buffered = self.pending_leading.items;
    if (buffered.len == 0) return;
    const copy = try self.allocator.dupe(AST.Comment, buffered);
    self.pending_leading.clearRetainingCapacity();
    self.node_comments.items[id].dangling = copy;
    self.comments_seen = true;
}
/// Enforce that a statement is followed only by trivia and then a newline or
/// end of file; anything else yields `error.TrailingContent`. The newline
/// itself is not consumed.
fn requireLineEnd(self: *Parser) ParserError!void {
    self.skipInline();
    const kind = self.peek().kind;
    if (kind == .newline or kind == .end_of_file) return;
    return error.TrailingContent;
}
/// Return the bytes of `source` covered by `tok`'s span. The result borrows
/// `source`; nothing is copied.
fn tokenText(self: *Parser, tok: Token) []const u8 {
return self.source[tok.span.start..tok.span.end];
}
// ── Statements ──────────────────────────────────────────────────────────────
/// Parse `[table.path]` and repoint `current_table`. The path is resolved from
/// the root; intermediates may pass through any existing table, the final must
/// be new (→ explicit) or an implicit path-table (→ promoted to explicit).
/// Returns `error.DuplicateKey` when the final segment names a non-table value
/// or a table that is already explicit, dotted, or inline.
fn parseTableHeader(self: *Parser) ParserError!void {
_ = self.advance(); // '['
self.skipInline();
var segs: std.ArrayList(KeySeg) = .empty;
defer segs.deinit(self.allocator);
try self.parseKeyPath(&segs);
self.skipInline();
if (self.peek().kind != .close_bracket) return error.UnexpectedToken;
_ = self.advance();
if (segs.items.len == 0) return error.InvalidKey;
// Header paths always resolve from the root (node 0), never from
// `current_table`.
const cur = try self.navigateHeaderPath(0, segs.items[0 .. segs.items.len - 1]);
const final = segs.items[segs.items.len - 1];
if (self.lookupChild(cur, final.str)) |child| {
if (self.nodes.items[child].kind != .mapping) return error.DuplicateKey;
const m = self.table_meta.get(child) orelse TableMeta{};
if (m.explicit or m.dotted or m.inline_table) return error.DuplicateKey;
// Promotion: the stored meta is replaced, so only `explicit` remains set.
try self.table_meta.put(self.allocator, child, .{ .explicit = true });
self.current_table = child;
} else {
self.current_table = try self.createTable(cur, final, .{ .explicit = true });
}
}
/// Parse `[[array.of.tables]]`: navigate the path from root (intermediates like
/// a header), then create-or-extend the final array-of-tables, appending a fresh
/// element table that becomes `current_table`. An existing final segment that
/// is not a sequence created by `[[...]]` yields `error.DuplicateKey`.
fn parseArrayTable(self: *Parser) ParserError!void {
_ = self.advance(); // '[['
self.skipInline();
var segs: std.ArrayList(KeySeg) = .empty;
defer segs.deinit(self.allocator);
try self.parseKeyPath(&segs);
self.skipInline();
if (self.peek().kind != .double_close_bracket) return error.UnexpectedToken;
_ = self.advance();
if (segs.items.len == 0) return error.InvalidKey;
const cur = try self.navigateHeaderPath(0, segs.items[0 .. segs.items.len - 1]);
const final = segs.items[segs.items.len - 1];
if (self.lookupChild(cur, final.str)) |child| {
const m = self.table_meta.get(child) orelse TableMeta{};
// A static `= [...]` array has no `aot` meta and cannot be extended.
if (self.nodes.items[child].kind != .sequence or !m.aot) return error.DuplicateKey;
self.current_table = try self.appendArrayElement(child);
} else {
// First occurrence: create the sequence, register it under its key, mark
// it as an array-of-tables, then add its first element.
const seq_id = try self.addNode(.{ .sequence = null }, final.span);
try self.appendKeyValue(cur, final, seq_id);
try self.table_meta.put(self.allocator, seq_id, .{ .aot = true });
self.current_table = try self.appendArrayElement(seq_id);
}
}
/// Resolve the intermediate segments of a header / array-of-tables path,
/// starting at `start`, and return the mapping the final segment belongs in.
/// A segment that already exists is entered via `descend` (so an
/// array-of-tables resolves to its last element); a missing one is created as
/// an implicit table.
fn navigateHeaderPath(self: *Parser, start: AST.Node.Id, intermediates: []const KeySeg) ParserError!AST.Node.Id {
    var table = start;
    for (intermediates) |seg| {
        table = if (self.lookupChild(table, seg.str)) |existing|
            try self.descend(existing)
        else
            try self.createTable(table, seg, .{ .implicit = true });
    }
    return table;
}
/// Map an existing path node to the mapping that navigation continues from:
/// a plain table is returned as-is, an array-of-tables yields its last
/// element. Inline tables, static arrays and non-table values cannot be
/// entered and produce `error.DuplicateKey`.
fn descend(self: *Parser, child: AST.Node.Id) ParserError!AST.Node.Id {
    const meta = self.table_meta.get(child) orelse TableMeta{};
    switch (self.nodes.items[child].kind) {
        .mapping => {
            if (meta.inline_table) return error.DuplicateKey;
            return child;
        },
        .sequence => {
            if (!meta.aot) return error.DuplicateKey;
            return self.lastElement(child);
        },
        else => return error.DuplicateKey,
    }
}
/// Return the last element of sequence `seq_id` by following `next_sibling`
/// links from its first element. An empty sequence yields
/// `error.DuplicateKey`.
fn lastElement(self: *Parser, seq_id: AST.Node.Id) ParserError!AST.Node.Id {
    const first = self.nodes.items[seq_id].kind.sequence orelse return error.DuplicateKey;
    var tail = first;
    while (true) {
        tail = self.nodes.items[tail].next_sibling orelse return tail;
    }
}
/// Create an empty mapping, link it as the new last element of sequence
/// `seq_id`, and return its id. The element is given the sequence's span.
fn appendArrayElement(self: *Parser, seq_id: AST.Node.Id) ParserError!AST.Node.Id {
    // Copy the span out before `addNode` grows the span list.
    const seq_span = self.spans.items[seq_id];
    const elem = try self.addNode(.{ .mapping = null }, seq_span);

    const nodes = self.nodes.items;
    var tail = nodes[seq_id].kind.sequence orelse {
        // Empty sequence: the new element becomes its head.
        nodes[seq_id].kind = .{ .sequence = elem };
        return elem;
    };
    while (nodes[tail].next_sibling) |next| tail = next;
    nodes[tail].next_sibling = elem;
    return elem;
}
/// Parse a `key = value` line (the key possibly dotted), attaching to
/// `current_table`. Dotted intermediates create/extend dotted tables but may
/// not descend into an explicitly-defined table (TOML forbids using dotted keys
/// to append to a `[table]`) nor through a non-table value. A final key that
/// already exists in its table yields `error.DuplicateKey`.
fn parseKeyValue(self: *Parser) ParserError!void {
var segs: std.ArrayList(KeySeg) = .empty;
defer segs.deinit(self.allocator);
// `parseKeyPath` either fails or appends at least one segment, so `segs` is
// non-empty below.
try self.parseKeyPath(&segs);
self.skipInline();
if (self.peek().kind != .equals) return error.UnexpectedToken;
_ = self.advance(); // '='
self.skipInline();
const cur = try self.navigateDottedPath(self.current_table, segs.items[0 .. segs.items.len - 1]);
const final = segs.items[segs.items.len - 1];
// The duplicate check runs before the value is parsed.
if (self.lookupChild(cur, final.str) != null) return error.DuplicateKey;
const value_id = try self.parseValue();
// The value is now the trailing-comment candidate for the rest of this line
// (a `# comment` after it, captured by the upcoming `requireLineEnd`).
self.last_value_id = value_id;
try self.appendKeyValue(cur, final, value_id);
}
/// Resolve the intermediate segments of a dotted key, starting at `start`, and
/// return the mapping the final segment belongs in. Missing segments are
/// created as dotted tables. An existing segment may be entered only when it
/// is a mapping that is neither explicitly defined nor an inline table;
/// anything else yields `error.DuplicateKey`, because dotted keys may not
/// append to a `[table]`.
fn navigateDottedPath(self: *Parser, start: AST.Node.Id, intermediates: []const KeySeg) ParserError!AST.Node.Id {
    var table = start;
    for (intermediates) |seg| {
        const existing = self.lookupChild(table, seg.str) orelse {
            table = try self.createTable(table, seg, .{ .dotted = true });
            continue;
        };
        if (self.nodes.items[existing].kind != .mapping) return error.DuplicateKey;
        const meta = self.table_meta.get(existing) orelse TableMeta{};
        if (meta.explicit or meta.inline_table) return error.DuplicateKey;
        table = existing;
    }
    return table;
}
/// Read a dotted key path (`a.b.c`) into `segs`, one decoded segment per key
/// token. The cursor must be on the first `.key` token; parsing ends when the
/// next significant token after a segment is not a dot. A non-key token where
/// a segment is expected yields `error.UnexpectedToken`.
fn parseKeyPath(self: *Parser, segs: *std.ArrayList(KeySeg)) ParserError!void {
    var more = true;
    while (more) {
        const tok = self.peek();
        if (tok.kind != .key) return error.UnexpectedToken;
        _ = self.advance();
        const text = try self.decodeKey(tok);
        try segs.append(self.allocator, .{ .str = text, .span = tok.span });
        self.skipInline();
        more = self.peek().kind == .dot;
        if (more) {
            _ = self.advance(); // '.'
            self.skipInline();
        }
    }
}
/// Search mapping `map_id` for an entry whose key equals `key` and return the
/// id of that entry's value node, or null when there is none. Entries are
/// scanned in order along the `next_sibling` chain.
fn lookupChild(self: *Parser, map_id: AST.Node.Id, key: []const u8) ?AST.Node.Id {
    const nodes = self.nodes.items;
    var entry = nodes[map_id].kind.mapping;
    while (entry) |entry_id| {
        const pair = nodes[entry_id].kind.keyvalue;
        const name = nodes[pair.key].kind.string;
        if (std.mem.eql(u8, name, key)) return pair.value;
        entry = nodes[entry_id].next_sibling;
    }
    return null;
}
/// Append a `key = value` entry to the end of mapping `map_id`. Creates the
/// key node and the keyvalue node; the entry's span runs from the start of the
/// key to the end of the value.
fn appendKeyValue(self: *Parser, map_id: AST.Node.Id, key: KeySeg, value_id: AST.Node.Id) ParserError!void {
    const key_id = try self.addNode(.{ .string = key.str }, key.span);
    // Any comment block buffered above this line binds to the key node. Table
    // headers reach this point through `createTable`, so their leading
    // comments bind the same way.
    try self.claimLeading(key_id);

    const entry_span = Span.init(key.span.start, self.spans.items[value_id].end);
    const kv_id = try self.addNode(
        .{ .keyvalue = .{ .key = key_id, .value = value_id } },
        entry_span,
    );

    const nodes = self.nodes.items;
    var tail = nodes[map_id].kind.mapping orelse {
        // Empty mapping: this entry becomes its head.
        nodes[map_id].kind = .{ .mapping = kv_id };
        return;
    };
    while (nodes[tail].next_sibling) |next| tail = next;
    nodes[tail].next_sibling = kv_id;
}
/// Add an empty table named `key` to mapping `parent`, store `meta` as its
/// provenance in `table_meta`, and return the id of the new mapping node.
fn createTable(self: *Parser, parent: AST.Node.Id, key: KeySeg, meta: TableMeta) ParserError!AST.Node.Id {
    const table_id = try self.addNode(.{ .mapping = null }, key.span);
    try self.appendKeyValue(parent, key, table_id);
    try self.table_meta.put(self.allocator, table_id, meta);
    return table_id;
}
/// Parse the value at the cursor and return the id of the node created for it.
/// Scalars consume a single token; `[` and `{` delegate to `parseArray` and
/// `parseInlineTable`. Any other token yields `error.UnexpectedToken`.
fn parseValue(self: *Parser) ParserError!AST.Node.Id {
const tok = self.peek();
switch (tok.kind) {
.string => {
_ = self.advance();
const decoded = try self.decodeString(self.tokenText(tok));
return self.addNode(.{ .string = decoded }, tok.span);
},
.number => {
_ = self.advance();
const raw = self.tokenText(tok);
const kind = try classifyNumber(raw);
// Store the value in canonical, format-independent form (decimal,
// no underscores) so TOML→JSON/YAML conversion is direct. The
// original source text is still recoverable via node_spans, so a
// future round-trip editor loses nothing.
const canon = switch (kind) {
.integer => try self.canonicalInt(raw),
.float => try self.canonicalFloat(raw),
};
return self.addNode(.{ .number = .{ .raw = canon, .kind = kind } }, tok.span);
},
.datetime => {
_ = self.advance();
const raw = self.tokenText(tok);
// Datetimes are validated and classified but stored as source text.
const shape = try self.classifyDatetime(raw);
return self.addNode(.{ .extended = .{ .text = raw, .kind = shape } }, tok.span);
},
.boolean => {
_ = self.advance();
// Only the first byte is inspected: 't' means true, anything else false.
return self.addNode(.{ .boolean = self.tokenText(tok)[0] == 't' }, tok.span);
},
.open_bracket => return self.parseArray(),
.open_brace => return self.parseInlineTable(),
else => return error.UnexpectedToken,
}
}
/// Parse a `[ value, value, ... ]` array. Arrays may span lines and carry a
/// trailing comma; elements are heterogeneous. The cursor must be on the `[`.
/// Returns the id of the sequence node, whose elements are chained through
/// `next_sibling`.
fn parseArray(self: *Parser) ParserError!AST.Node.Id {
const start = self.peek().span.start;
_ = self.advance(); // '['
// Provisional one-byte span; replaced once the closing ']' is reached.
const seq_id = try self.addNode(.{ .sequence = null }, Span.init(start, start + 1));
var last: ?AST.Node.Id = null;
while (true) {
self.skipBlankFlow();
// Reached directly for an empty array, or after a trailing comma.
if (self.peek().kind == .close_bracket) break;
const elem = try self.parseValue();
if (last) |l| {
self.nodes.items[l].next_sibling = elem;
} else {
self.nodes.items[seq_id].kind = .{ .sequence = elem };
}
last = elem;
self.skipBlankFlow();
switch (self.peek().kind) {
.comma => _ = self.advance(),
.close_bracket => break,
else => return error.UnexpectedToken,
}
}
const end = self.peek().span.end;
_ = self.advance(); // ']'
self.spans.items[seq_id] = Span.init(start, end);
return seq_id;
}
/// Parse a `{ key = value, ... }` inline table. TOML 1.0 inline tables are
/// single-line (no newlines inside) and have no trailing comma; under TOML 1.1
/// both are accepted. The cursor must be on the `{`. Returns the id of the
/// mapping node, which is recorded in `table_meta` as an inline table.
fn parseInlineTable(self: *Parser) ParserError!AST.Node.Id {
// TOML 1.1 permits newlines and a trailing comma inside inline tables; 1.0
// requires everything on one line with no trailing comma.
const allow_nl = self.version == .TOML_1_1;
const start = self.peek().span.start;
_ = self.advance(); // '{'
// Provisional one-byte span; replaced once the closing '}' is reached.
const map_id = try self.addNode(.{ .mapping = null }, Span.init(start, start + 1));
try self.table_meta.put(self.allocator, map_id, .{ .inline_table = true });
self.skipFlowWs(allow_nl);
if (self.peek().kind != .close_brace) {
while (true) {
try self.parseInlineEntry(map_id);
self.skipFlowWs(allow_nl);
switch (self.peek().kind) {
.comma => {
_ = self.advance();
self.skipFlowWs(allow_nl);
if (self.peek().kind == .close_brace) {
if (!allow_nl) return error.UnexpectedToken; // trailing comma (1.0)
break;
}
},
.close_brace => break,
else => return error.UnexpectedToken,
}
}
}
const end = self.peek().span.end;
if (self.peek().kind != .close_brace) return error.UnexpectedToken;
_ = self.advance(); // '}'
self.spans.items[map_id] = Span.init(start, end);
return map_id;
}
/// One `key = value` (key possibly dotted) inside an inline table. The entry
/// is attached to `table_id`, with dotted intermediates resolved relative to
/// it. A final key that already exists yields `error.DuplicateKey`.
fn parseInlineEntry(self: *Parser, table_id: AST.Node.Id) ParserError!void {
const allow_nl = self.version == .TOML_1_1;
var segs: std.ArrayList(KeySeg) = .empty;
defer segs.deinit(self.allocator);
// Inline-table keys are lexed in value mode, so a bare key arrives as a
// .number/.boolean/.datetime token; reinterpret it by position.
while (true) {
const tok = self.peek();
// Decode before advancing so a bad key token is reported without being
// consumed.
const key = try self.decodeInlineKey(tok);
_ = self.advance();
try segs.append(self.allocator, .{ .str = key, .span = tok.span });
self.skipFlowWs(allow_nl);
if (self.peek().kind != .dot) break;
_ = self.advance();
self.skipFlowWs(allow_nl);
}
if (self.peek().kind != .equals) return error.UnexpectedToken;
_ = self.advance();
self.skipFlowWs(allow_nl);
const cur = try self.navigateDottedPath(table_id, segs.items[0 .. segs.items.len - 1]);
const final = segs.items[segs.items.len - 1];
if (self.lookupChild(cur, final.str) != null) return error.DuplicateKey;
const value_id = try self.parseValue();
try self.appendKeyValue(cur, final, value_id);
}
/// Turn a token found in key position inside an inline table into key text.
/// Quoted keys arrive as `.string` tokens and are decoded; `.key` tokens are
/// used as-is; `.number`/`.boolean`/`.datetime` tokens are accepted as bare
/// keys only when non-empty and made solely of bare-key characters
/// (`error.InvalidKey` otherwise). Any other token kind yields
/// `error.UnexpectedToken`.
fn decodeInlineKey(self: *Parser, tok: Token) ParserError![]const u8 {
    switch (tok.kind) {
        .string => return self.decodeString(self.tokenText(tok)), // quoted key
        .key => return self.tokenText(tok),
        .number, .boolean, .datetime => {
            const text = self.tokenText(tok);
            if (text.len == 0) return error.InvalidKey;
            for (text) |c| {
                if (!Tokenizer.isBareKeyChar(c)) return error.InvalidKey;
            }
            return text;
        },
        else => return error.UnexpectedToken,
    }
}
/// Skip whitespace, comments, and newlines — for inside a multi-line array.
/// Currently a plain alias for `skipBlank`, so a newline here also clears
/// `last_value_id`.
fn skipBlankFlow(self: *Parser) void {
self.skipBlank();
}
/// Skip inline-table separators: whitespace + comments, plus newlines when the
/// version permits them inside inline tables (1.1). `allow_nl` selects between
/// `skipBlank` (newlines skipped) and `skipInline` (newlines left in place).
fn skipFlowWs(self: *Parser, allow_nl: bool) void {
if (allow_nl) self.skipBlank() else self.skipInline();
}
// ── Key / string decoding ───────────────────────────────────────────────────
/// Return the text of a `.key` token. A key that starts with `"` or `'` is
/// decoded as a string; any other key is bare and returned as a slice of
/// `source`. An empty token yields `error.InvalidKey`.
fn decodeKey(self: *Parser, tok: Token) ParserError![]const u8 {
    const raw = self.tokenText(tok);
    if (raw.len == 0) return error.InvalidKey;
    const quoted = raw[0] == '"' or raw[0] == '\'';
    return if (quoted) self.decodeString(raw) else raw;
}
/// Decode a string token (delimiters included) in any of the four TOML forms:
/// literal, multi-line literal, basic, multi-line basic. Literal strings and
/// single-line basic strings without a backslash are returned as slices of
/// `raw`; everything else goes through `decodeBasic`, whose result is an
/// allocation owned via `owned_strings`. Input shorter than two bytes yields
/// `error.UnclosedString`.
fn decodeString(self: *Parser, raw: []const u8) ParserError![]const u8 {
    if (raw.len < 2) return error.UnclosedString;
    const quote = raw[0];
    const is_literal = quote == '\'';
    const is_triple = raw.len >= 6 and raw[1] == quote and raw[2] == quote;

    if (is_triple) {
        // Multi-line forms drop a newline that directly follows the opener.
        const multi = trimLeadingNewline(raw[3 .. raw.len - 3]);
        return if (is_literal) multi else self.decodeBasic(multi, true);
    }

    const single = raw[1 .. raw.len - 1];
    if (is_literal) return single; // literal: no escapes
    if (std.mem.indexOfScalar(u8, single, '\\') == null) return single;
    return self.decodeBasic(single, false);
}
/// Drop a single newline (`\n` or `\r\n`) from the start of a multi-line
/// string body, as happens when the opening delimiter is immediately followed
/// by a line break. Any other input is returned unchanged.
fn trimLeadingNewline(inner: []const u8) []const u8 {
    const skip: usize = if (inner.len >= 1 and inner[0] == '\n')
        1
    else if (inner.len >= 2 and inner[0] == '\r' and inner[1] == '\n')
        2
    else
        0;
    return inner[skip..];
}
/// Decode the body of a basic string (delimiters already removed), expanding
/// backslash escapes. `multiline` enables the line-ending backslash, which is
/// only legal in `"""` strings. The result is always a fresh allocation that
/// is recorded in `owned_strings`. Unknown or truncated escapes yield
/// `error.BadEscape`.
fn decodeBasic(self: *Parser, inner: []const u8, multiline: bool) ParserError![]const u8 {
var out: std.ArrayList(u8) = .empty;
errdefer out.deinit(self.allocator);
var i: usize = 0;
while (i < inner.len) {
const c = inner[i];
if (c != '\\') {
try out.append(self.allocator, c);
i += 1;
continue;
}
// A backslash must be followed by at least one more byte.
if (i + 1 >= inner.len) return error.BadEscape;
const n = inner[i + 1];
switch (n) {
'b' => try out.append(self.allocator, 0x08),
't' => try out.append(self.allocator, '\t'),
'n' => try out.append(self.allocator, '\n'),
'f' => try out.append(self.allocator, 0x0c),
'r' => try out.append(self.allocator, '\r'),
'"' => try out.append(self.allocator, '"'),
'\\' => try out.append(self.allocator, '\\'),
// `appendUnicode` returns the index just past the hex digits; the `- 2`
// cancels the shared `i += 2` after the switch.
'u' => i = try self.appendUnicode(&out, inner, i + 2, 4) - 2,
'U' => i = try self.appendUnicode(&out, inner, i + 2, 8) - 2,
// TOML 1.1: \e is ESC (U+001B); \xHH is shorthand for \u00HH.
'e' => {
if (self.version == .TOML_1_0) return error.BadEscape;
try out.append(self.allocator, 0x1b);
},
'x' => {
if (self.version == .TOML_1_0) return error.BadEscape;
i = try self.appendUnicode(&out, inner, i + 2, 2) - 2;
},
' ', '\t', '\n', '\r' => {
if (!multiline) return error.BadEscape;
// Line-ending backslash: `\` + optional whitespace + newline
// trims all following whitespace up to the next content.
var j = i + 1;
while (j < inner.len and (inner[j] == ' ' or inner[j] == '\t')) j += 1;
// Only spaces/tabs may sit between the backslash and the line break.
if (j >= inner.len or (inner[j] != '\n' and inner[j] != '\r')) return error.BadEscape;
while (j < inner.len and (inner[j] == ' ' or inner[j] == '\t' or inner[j] == '\n' or inner[j] == '\r')) j += 1;
i = j;
// `i` already points at the next content byte; skip the shared `i += 2`.
continue;
},
else => return error.BadEscape,
}
// Step over the backslash and the escape letter.
i += 2;
}
const slice = try out.toOwnedSlice(self.allocator);
errdefer self.allocator.free(slice);
try self.owned_strings.append(self.allocator, slice);
return slice;
}
/// Decode exactly `n` hex digits at `inner[at..]` as a Unicode scalar value and
/// append its UTF-8 encoding to `out`; returns the index just past the digits.
///
/// Errors: `error.BadEscape` when fewer than `n` bytes remain;
/// `error.InvalidUnicode` when any of the `n` bytes is not a hex digit, or the
/// value is a surrogate or above U+10FFFF.
fn appendUnicode(self: *Parser, out: *std.ArrayList(u8), inner: []const u8, at: usize, n: usize) ParserError!usize {
    if (at + n > inner.len) return error.BadEscape;
    const digits = inner[at .. at + n];
    // `std.fmt.parseInt` also accepts a leading sign and `_` separators, which
    // would let `\u+041` or `\u0_41` through; require pure hex digits first.
    for (digits) |c| {
        if (!std.ascii.isHex(c)) return error.InvalidUnicode;
    }
    const cp = std.fmt.parseInt(u21, digits, 16) catch return error.InvalidUnicode;
    if (cp > 0x10FFFF or (cp >= 0xD800 and cp <= 0xDFFF)) return error.InvalidUnicode;
    var buf: [4]u8 = undefined;
    const len = std.unicode.utf8Encode(cp, &buf) catch return error.InvalidUnicode;
    try out.appendSlice(self.allocator, buf[0..len]);
    return at + n;
}
// ── Number validation / classification ──────────────────────────────────────
// The type of the `kind` field of an AST number node; the code below produces
// its `.integer` and `.float` values.
const NumberKind = @FieldType(AST.Node.Kind.Number, "kind");
/// Validate a number token's text and classify it as integer or float.
/// Accepts the special floats (`inf`/`nan`, optionally signed), unsigned
/// `0x`/`0o`/`0b` integers, and optionally signed decimal integers and floats
/// with `_` digit separators. Anything else yields `error.InvalidNumber`.
fn classifyNumber(raw: []const u8) ParserError!NumberKind {
if (raw.len == 0) return error.InvalidNumber;
// Special floats.
if (eqAny(raw, &.{ "inf", "+inf", "-inf", "nan", "+nan", "-nan" })) return .float;
// Radix-prefixed integers (no sign permitted).
if (raw.len >= 2 and raw[0] == '0') switch (raw[1]) {
'x' => return if (validUnderscored(raw[2..], isHex)) .integer else error.InvalidNumber,
'o' => return if (validUnderscored(raw[2..], isOctal)) .integer else error.InvalidNumber,
'b' => return if (validUnderscored(raw[2..], isBinary)) .integer else error.InvalidNumber,
else => {},
};
var body = raw;
if (body[0] == '+' or body[0] == '-') body = body[1..];
if (body.len == 0) return error.InvalidNumber;
// Split mantissa / exponent.
var mantissa = body;
var exponent: ?[]const u8 = null;
if (std.mem.indexOfAny(u8, body, "eE")) |e| {
mantissa = body[0..e];
exponent = body[e + 1 ..];
}
// Mantissa: int part, optional `.fraction`.
var int_part = mantissa;
var frac_part: ?[]const u8 = null;
if (std.mem.indexOfScalar(u8, mantissa, '.')) |d| {
int_part = mantissa[0..d];
frac_part = mantissa[d + 1 ..];
}
if (!validDecimalInt(int_part)) return error.InvalidNumber;
// A fraction or an exponent makes the literal a float.
var is_float = false;
if (frac_part) |f| {
// An empty fraction (`1.`) fails here because `validUnderscored` rejects
// empty input.
if (!validUnderscored(f, isDecimal)) return error.InvalidNumber;
is_float = true;
}
if (exponent) |e| {
var exp = e;
if (exp.len > 0 and (exp[0] == '+' or exp[0] == '-')) exp = exp[1..];
// Unlike the integer part, exponent digits may have leading zeros.
if (!validUnderscored(exp, isDecimal)) return error.InvalidNumber;
is_float = true;
}
return if (is_float) .float else .integer;
}
/// True when `s` is a decimal integer literal: digits with optional `_`
/// separators and no leading zero unless the literal is the single digit `0`.
/// Applies to standalone integers and to the integer part of a float, so `01`
/// and `03.14` are both rejected.
fn validDecimalInt(s: []const u8) bool {
    const has_leading_zero = s.len > 1 and s[0] == '0';
    return validUnderscored(s, isDecimal) and !has_leading_zero;
}
/// True when `s` is non-empty, consists only of `_` and characters accepted by
/// `pred`, and every `_` has a `pred` character on both sides (so no leading,
/// trailing, or doubled underscore).
fn validUnderscored(s: []const u8, comptime pred: fn (u8) bool) bool {
    if (s.len == 0 or s[0] == '_' or s[s.len - 1] == '_') return false;
    var prev: u8 = 0; // previous byte; 0 stands for "none yet"
    for (s) |c| {
        if (c == '_') {
            if (prev == '_') return false;
        } else if (!pred(c)) {
            return false;
        }
        prev = c;
    }
    return true;
}
/// ASCII decimal digit `0`-`9`.
fn isDecimal(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}

/// ASCII hexadecimal digit, either case.
fn isHex(c: u8) bool {
    return switch (c) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}

/// ASCII octal digit `0`-`7`.
fn isOctal(c: u8) bool {
    return switch (c) {
        '0'...'7' => true,
        else => false,
    };
}

/// ASCII binary digit `0` or `1`.
fn isBinary(c: u8) bool {
    return switch (c) {
        '0', '1' => true,
        else => false,
    };
}
/// True when `s` is byte-for-byte equal to at least one entry of `options`.
fn eqAny(s: []const u8, options: []const []const u8) bool {
    var i: usize = 0;
    while (i < options.len) : (i += 1) {
        if (std.mem.eql(u8, s, options[i])) return true;
    }
    return false;
}
/// Canonicalize an integer literal (any radix, underscores, sign) to a decimal
/// string. Returns `raw` unchanged when it is already canonical, otherwise a
/// copy recorded in `owned_strings`. `raw` is expected to have passed
/// `classifyNumber`. A value that does not fit in `i64` yields
/// `error.InvalidNumber`.
fn canonicalInt(self: *Parser, raw: []const u8) ParserError![]const u8 {
    // `std.fmt.parseInt` skips `_` separators itself and, with base 0, honours
    // the 0x/0o/0b prefix, so no scratch copy is needed and the literal's
    // length is not capped.
    const value = std.fmt.parseInt(i64, raw, 0) catch return error.InvalidNumber;
    var out: [24]u8 = undefined; // enough for any i64 in decimal, sign included
    const text = std.fmt.bufPrint(&out, "{d}", .{value}) catch return error.InvalidNumber;
    if (std.mem.eql(u8, text, raw)) return raw;
    return self.intern(text);
}
/// Canonicalize a float literal: special values map to `inf`/`-inf`/`nan`, and
/// digit-group underscores are stripped. Returns a static string for the
/// special values, `raw` itself when it contains no underscore, and otherwise
/// a copy recorded in `owned_strings`. Literals of any length are accepted.
fn canonicalFloat(self: *Parser, raw: []const u8) ParserError![]const u8 {
    if (eqAny(raw, &.{ "inf", "+inf" })) return "inf";
    if (std.mem.eql(u8, raw, "-inf")) return "-inf";
    if (eqAny(raw, &.{ "nan", "+nan", "-nan" })) return "nan";

    // Count the bytes that survive stripping so the copy can be sized exactly.
    var kept: usize = 0;
    for (raw) |c| {
        if (c != '_') kept += 1;
    }
    if (kept == raw.len) return raw;

    const stripped = try self.allocator.alloc(u8, kept);
    errdefer self.allocator.free(stripped);
    var n: usize = 0;
    for (raw) |c| {
        if (c == '_') continue;
        stripped[n] = c;
        n += 1;
    }
    try self.owned_strings.append(self.allocator, stripped);
    return stripped;
}
/// Duplicate `s` on the heap and record the copy in `owned_strings`. Returns
/// the copy; if recording fails the copy is freed before the error propagates.
fn intern(self: *Parser, s: []const u8) ParserError![]const u8 {
    const gpa = self.allocator;
    const copy = try gpa.dupe(u8, s);
    errdefer gpa.free(copy);
    try self.owned_strings.append(gpa, copy);
    return copy;
}
// ── Datetime validation / classification ────────────────────────────────────
// `Shape` is an alias for the whole `ExtKind` enum, but `classifyDatetime` only
// ever returns its four datetime members: `.local_time`, `.local_date`,
// `.local_datetime` and `.offset_datetime`. (TOML never produces the
// enum/char-literal ExtKinds — those are ZON.)
const Shape = AST.Node.Kind.Extended.ExtKind;
/// Validate a datetime token and report which of the four datetime shapes it
/// is. Any malformed input yields `error.InvalidDatetime`.
fn classifyDatetime(self: *Parser, raw: []const u8) ParserError!Shape {
    const date_len = 10; // "YYYY-MM-DD"
    const offset_len = 6; // "±HH:MM"

    // A colon in the third position means the token starts with HH:MM — time only.
    if (raw.len >= 3 and raw[2] == ':') {
        try self.validateTime(raw);
        return .local_time;
    }

    // Everything else must begin with a full date.
    if (raw.len < date_len) return error.InvalidDatetime;
    try validateDate(raw[0..date_len]);
    if (raw.len == date_len) return .local_date;

    // A single separator sits between the date and the time.
    switch (raw[date_len]) {
        'T', 't', ' ' => {},
        else => return error.InvalidDatetime,
    }
    const tail = raw[date_len + 1 ..];

    // Trailing Z/z marks an offset datetime.
    if (tail.len > 0) {
        const last = tail[tail.len - 1];
        if (last == 'Z' or last == 'z') {
            try self.validateTime(tail[0 .. tail.len - 1]);
            return .offset_datetime;
        }
    }

    // A numeric ±HH:MM offset occupies the final six bytes.
    if (tail.len >= offset_len) {
        const offset = tail[tail.len - offset_len ..];
        if ((offset[0] == '+' or offset[0] == '-') and offset[3] == ':') {
            try validateOffset(offset);
            try self.validateTime(tail[0 .. tail.len - offset_len]);
            return .offset_datetime;
        }
    }

    // No offset: the whole tail is the time.
    try self.validateTime(tail);
    return .local_datetime;
}
/// Decode the two ASCII decimal digits at `s[at]` and `s[at + 1]` into 0-99.
///
/// Returns 255 when either byte is not a decimal digit or the pair does not
/// fit inside `s`. 255 lies above every limit the datetime validators compare
/// against (12, 31, 23, 59, 60), so their range checks reject it. A caller that
/// does not range-check the result must verify the digits itself.
fn twoDigit(s: []const u8, at: usize) u8 {
    const invalid: u8 = 255;
    if (s.len < 2 or at > s.len - 2) return invalid;
    // Wrapping subtraction maps every non-digit byte to a value above 9, so
    // arbitrary input cannot overflow the arithmetic below.
    const tens = s[at] -% '0';
    const ones = s[at + 1] -% '0';
    if (tens > 9 or ones > 9) return invalid;
    return tens * 10 + ones;
}
/// Validate a `YYYY-MM-DD` date: exact length and dashes, decimal digits in
/// every other position, month 01-12, and a day that exists in that month of
/// that year. Returns `error.InvalidDatetime` otherwise.
fn validateDate(s: []const u8) ParserError!void {
    if (s.len != 10 or s[4] != '-' or s[7] != '-') return error.InvalidDatetime;
    // Check the digits before decoding them. The year has no range check of its
    // own, so a stray byte there would otherwise reach the digit arithmetic
    // unvalidated.
    for (s, 0..) |c, i| {
        if (i == 4 or i == 7) continue;
        if (!isDecimal(c)) return error.InvalidDatetime;
    }
    const year = @as(u16, twoDigit(s, 0)) * 100 + twoDigit(s, 2);
    const month = twoDigit(s, 5);
    const day = twoDigit(s, 8);
    if (month < 1 or month > 12) return error.InvalidDatetime;
    if (day < 1 or day > daysInMonth(year, month)) return error.InvalidDatetime;
}
/// Number of days in `month` (1-12) of `year`; 0 for any other month value.
fn daysInMonth(year: u16, month: u8) u8 {
    // Month lengths for a non-leap year, January first.
    const lengths = [12]u8{ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
    if (month < 1 or month > 12) return 0;
    if (month == 2 and isLeapYear(year)) return 29;
    return lengths[month - 1];
}
/// Gregorian leap-year rule: divisible by 400, or divisible by 4 but not by 100.
fn isLeapYear(year: u16) bool {
    if (year % 400 == 0) return true;
    if (year % 100 == 0) return false;
    return year % 4 == 0;
}
/// Validate `HH:MM[:SS[.fraction]]`: hour 00-23, minute 00-59, second 00-60
/// (60 admits a leap second), and a fraction of one or more decimal digits.
/// The bare `HH:MM` form is rejected when `version` is TOML 1.0 and accepted
/// otherwise. Any violation yields `error.InvalidDatetime`.
fn validateTime(self: *Parser, s: []const u8) ParserError!void {
    if (s.len < 5 or s[2] != ':') return error.InvalidDatetime;
    const hh = twoDigit(s, 0);
    const mm = twoDigit(s, 3);
    if (hh > 23 or mm > 59) return error.InvalidDatetime;

    switch (s.len) {
        // Bare HH:MM — seconds omitted.
        5 => {
            if (self.version == .TOML_1_0) return error.InvalidDatetime;
            return;
        },
        // Too short to hold a complete ":SS".
        6, 7 => return error.InvalidDatetime,
        else => {},
    }

    // The separator is checked first; the seconds are only decoded after it.
    if (s[5] != ':' or twoDigit(s, 6) > 60) return error.InvalidDatetime;

    // Whatever follows HH:MM:SS must be '.' plus at least one digit.
    const fraction = s[8..];
    if (fraction.len == 0) return;
    if (fraction.len < 2 or fraction[0] != '.') return error.InvalidDatetime;
    for (fraction[1..]) |digit| {
        if (!isDecimal(digit)) return error.InvalidDatetime;
    }
}
/// Validate a six-byte `±HH:MM` offset: colon in place, hour 00-23, minute
/// 00-59. The sign byte itself is not inspected here.
fn validateOffset(s: []const u8) ParserError!void {
    if (s.len != 6 or s[3] != ':') return error.InvalidDatetime;
    const hours = twoDigit(s, 1);
    if (hours > 23) return error.InvalidDatetime;
    const minutes = twoDigit(s, 4);
    if (minutes > 59) return error.InvalidDatetime;
}
// ── Tests ───────────────────────────────────────────────────────────────────
test "parses empty / comment-only documents" {
    // Sources without any statement must still produce a mapping root.
    const sources = [_][]const u8{ "", "# c\n", "\n\n \n" };
    for (sources) |source| {
        var document = try parse(testing.allocator, source, .TOML_1_0);
        defer document.deinit(testing.allocator);
        const root_id = document.ast.root;
        try testing.expect(document.ast.nodes[root_id].kind == .mapping);
    }
}
test "parses scalar key/value pairs" {
    // One root-level entry per scalar kind.
    const source =
        \\name = "Tom"
        \\count = 42
        \\pi = 3.14
        \\hex = 0xDEAD_beef
        \\flag = true
        \\when = 1979-05-27T07:32:00Z
        \\
    ;
    var document = try parse(testing.allocator, source, .TOML_1_0);
    defer document.deinit(testing.allocator);
    const tree = &document.ast;

    const name_node = try tree.getValByPath(&.{.{ .key = "name" }});
    try testing.expectEqualStrings("Tom", name_node.kind.string);

    const count_node = try tree.getValByPath(&.{.{ .key = "count" }});
    try testing.expect(count_node.kind.number.kind == .integer);

    const pi_node = try tree.getValByPath(&.{.{ .key = "pi" }});
    try testing.expect(pi_node.kind.number.kind == .float);

    const when_node = try tree.getValByPath(&.{.{ .key = "when" }});
    try testing.expect(when_node.kind.extended.kind == .offset_datetime);
}
test "datetime shapes" {
    // Each literal is parsed as the value of `d` and must classify as `want`.
    const Case = struct { literal: []const u8, want: Shape };
    const table = [_]Case{
        .{ .literal = "1979-05-27T07:32:00Z", .want = .offset_datetime },
        .{ .literal = "1979-05-27T07:32:00", .want = .local_datetime },
        .{ .literal = "1979-05-27 07:32:00", .want = .local_datetime },
        .{ .literal = "1979-05-27", .want = .local_date },
        .{ .literal = "07:32:00", .want = .local_time },
        .{ .literal = "00:32:00.999999", .want = .local_time },
    };
    for (table) |case| {
        const source = try std.fmt.allocPrint(testing.allocator, "d = {s}\n", .{case.literal});
        defer testing.allocator.free(source);

        var document = try parse(testing.allocator, source, .TOML_1_0);
        defer document.deinit(testing.allocator);

        const value = try document.ast.getValByPath(&.{.{ .key = "d" }});
        try testing.expectEqual(case.want, value.kind.extended.kind);
    }
}
test "rejects bad scalars" {
    const rejected = [_][]const u8{
        "x = 01\n", // leading zero
        "x = 1__2\n", // doubled underscore
        "x = 0x\n", // empty hex
        "x = 1979-13-01\n", // month over
        "x = 2021-02-29\n", // not a leap year
        "x = 00:00:61\n", // second over
        "x = \"a\nb\"\n", // newline in single-line string
        "x = \"\\q\"\n", // bad escape
        "a = 1 b = 2\n", // trailing content
    };
    for (rejected) |source| {
        // Any parse error counts as the expected rejection.
        var document = parse(testing.allocator, source, .TOML_1_0) catch continue;
        document.deinit(testing.allocator);
        std.debug.print("expected rejection: {s}\n", .{source});
        return error.ExpectedParseFailure;
    }
}
test "rejects duplicate root keys" {
    // Defining the same root key twice must fail with DuplicateKey.
    const outcome = parse(testing.allocator, "a = 1\na = 2\n", .TOML_1_0);
    try testing.expectError(error.DuplicateKey, outcome);
}
test "tables and dotted keys build nested mappings" {
    // The nested header comes first, its parent table afterwards.
    const source =
        \\[server.tcp]
        \\port = 80
        \\opts.timeout = 30
        \\
        \\[server]
        \\name = "main"
        \\
    ;
    var document = try parse(testing.allocator, source, .TOML_1_0);
    defer document.deinit(testing.allocator);
    const tree = &document.ast;

    const port_node = try tree.getValByPath(&.{ .{ .key = "server" }, .{ .key = "tcp" }, .{ .key = "port" } });
    try testing.expect(port_node.kind.number.kind == .integer);

    const timeout_node = try tree.getValByPath(&.{ .{ .key = "server" }, .{ .key = "tcp" }, .{ .key = "opts" }, .{ .key = "timeout" } });
    try testing.expectEqualStrings("30", timeout_node.kind.number.raw);

    const name_node = try tree.getValByPath(&.{ .{ .key = "server" }, .{ .key = "name" } });
    try testing.expectEqualStrings("main", name_node.kind.string);
}
test "implicit table promoted to explicit is allowed" {
    // `[a.b.c]` creates `a` implicitly; a later `[a]` header must still parse.
    const source = "[a.b.c]\nx = 1\n\n[a]\ny = 2\n";
    var document = try parse(testing.allocator, source, .TOML_1_0);
    defer document.deinit(testing.allocator);
}
test "arrays, inline tables, and arrays-of-tables" {
    const source =
        \\nums = [1, 2, 3]
        \\nested = [[1, 2], ["a", "b"]]
        \\point = { x = 1, y = 2 }
        \\
        \\[[fruit]]
        \\name = "apple"
        \\
        \\[[fruit]]
        \\name = "pear"
        \\
    ;
    var document = try parse(testing.allocator, source, .TOML_1_0);
    defer document.deinit(testing.allocator);
    const tree = &document.ast;

    // Flat array element.
    const second_num = try tree.getValByPath(&.{ .{ .key = "nums" }, .{ .index = 1 } });
    try testing.expectEqualStrings("2", second_num.kind.number.raw);

    // Element of a nested array.
    const nested_str = try tree.getValByPath(&.{ .{ .key = "nested" }, .{ .index = 1 }, .{ .index = 0 } });
    try testing.expectEqualStrings("a", nested_str.kind.string);

    // Inline-table member.
    const point_y = try tree.getValByPath(&.{ .{ .key = "point" }, .{ .key = "y" } });
    try testing.expectEqualStrings("2", point_y.kind.number.raw);

    // Second entry of the array of tables.
    const second_fruit = try tree.getValByPath(&.{ .{ .key = "fruit" }, .{ .index = 1 }, .{ .key = "name" } });
    try testing.expectEqualStrings("pear", second_fruit.kind.string);
}
test "inline-table dotted keys" {
    // Two dotted keys inside one inline table share the intermediate `b`.
    const source = "a = { b.c = 1, b.d = 2 }\n";
    var document = try parse(testing.allocator, source, .TOML_1_0);
    defer document.deinit(testing.allocator);

    const leaf = try document.ast.getValByPath(&.{ .{ .key = "a" }, .{ .key = "b" }, .{ .key = "c" } });
    try testing.expectEqualStrings("1", leaf.kind.number.raw);
}
test "rejects inline-table and array errors" {
    const rejected = [_][]const u8{
        "a = { b = 1, }\n", // trailing comma (1.0)
        "a = { b = 1\n c = 2 }\n", // newline inside inline table (1.0)
        "a = {b=1}\n[a.c]\nx=1\n", // extend a closed inline table
        "a = [1, 2\n", // unclosed array
        "a = { b = 1, b = 2 }\n", // duplicate inline key
    };
    for (rejected) |source| {
        // Any parse error counts as the expected rejection.
        var document = parse(testing.allocator, source, .TOML_1_0) catch continue;
        document.deinit(testing.allocator);
        std.debug.print("expected rejection: {s}\n", .{source});
        return error.ExpectedParseFailure;
    }
}
test "TOML 1.1 features: optional seconds, \\e/\\x escapes, inline-table newlines" {
    const source =
        \\t = 13:37
        \\esc = "\e\x41"
        \\tbl = {
        \\ a = 1,
        \\ b = 2,
        \\}
        \\
    ;
    var document = try parse(testing.allocator, source, .TOML_1_1);
    defer document.deinit(testing.allocator);
    const tree = &document.ast;

    // Time written without seconds.
    const time_node = try tree.getValByPath(&.{.{ .key = "t" }});
    try testing.expect(time_node.kind.extended.kind == .local_time);

    // `\e` decodes to ESC and `\x41` to 'A'.
    const escaped = try tree.getValByPath(&.{.{ .key = "esc" }});
    try testing.expectEqualStrings("\x1bA", escaped.kind.string);

    // Inline table spread over several lines with a trailing comma.
    const member_b = try tree.getValByPath(&.{ .{ .key = "tbl" }, .{ .key = "b" } });
    try testing.expectEqualStrings("2", member_b.kind.number.raw);
}
test "1.1-only constructs are rejected under 1.0" {
    const version_gated = [_][]const u8{
        "t = 13:37\n", // optional seconds
        "s = \"\\e\"\n", // \e escape
        "s = \"\\x41\"\n", // \x escape
        "t = { a = 1,\n b = 2 }\n", // newline in inline table
        "t = { a = 1, }\n", // trailing comma
    };
    for (version_gated) |source| {
        // The construct must parse under 1.1 …
        var accepted = try parse(testing.allocator, source, .TOML_1_1);
        accepted.deinit(testing.allocator);

        // … and fail, with any error, under 1.0.
        var document = parse(testing.allocator, source, .TOML_1_0) catch continue;
        document.deinit(testing.allocator);
        std.debug.print("expected 1.0 rejection: {s}\n", .{source});
        return error.ExpectedParseFailure;
    }
}
test "rejects table/key conflicts" {
    const rejected = [_][]const u8{
        "[a]\nb = 1\n\n[a.b]\nc = 2\n", // value used as table
        "[a]\n\n[a]\n", // duplicate table
        "a.b = 1\na.b.c = 2\n", // dotted key through a value
        "[a.b.c]\nz = 9\n\n[a]\nb.c.t = 1\n", // dotted-append to explicit table
    };
    for (rejected) |source| {
        // Any parse error counts as the expected rejection.
        var document = parse(testing.allocator, source, .TOML_1_0) catch continue;
        document.deinit(testing.allocator);
        std.debug.print("expected rejection: {s}\n", .{source});
        return error.ExpectedParseFailure;
    }
}