// fig 1.0.0 - Docs.rs

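//! JSON tokenizer. Turns a `[]const u8` of JSON, JSONC, or JSON5 text into a slice of Tokens.
//!
//! Allocates an expanding ArrayList of tokens. On success the caller owns the
//! returned slice; on error the list is freed before the error is returned.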

pub const Tokenizer = @This();

const std = @import("std");
const builtin = @import("builtin");
const log = std.log.scoped(.tokenizer);
const testing = std.testing;
const JsonFormat = @import("json.zig").Type;
const Span = @import("../util/span.zig");
pub const Token = @import("../token.zig").Token(Kind);

pub const Kind = enum {
    // Structural punctuation
    /// `{`
    open_brace,
    /// `}`
    close_brace,
    /// `[`
    open_bracket,
    /// `]`
    close_bracket,

    /// `:`
    colon,
    /// `,`
    comma,
    /// Zero-width marker for the end of the input.
    end_of_file,

    // Keyword literals
    /// `true`
    true_,
    /// `false`
    false_,
    /// `null`
    null_,

    // Variable-length tokens
    string,
    number,
    comment,
    whitespace,
    /// JSON5 only: an unquoted ECMAScript IdentifierName. Used as an object key,
    /// or as a value when it spells `Infinity` / `NaN`.
    identifier,

    /// Byte length of a fixed-width token kind, or null when the kind has no
    /// fixed width (strings, numbers, comments, whitespace, identifiers).
    pub fn len(self: Kind) ?usize {
        switch (self) {
            .string, .number, .comment, .whitespace, .identifier => return null,
            .end_of_file => return 0,
            .open_brace,
            .close_brace,
            .open_bracket,
            .close_bracket,
            .colon,
            .comma,
            => return 1,
            .true_, .null_ => return 4,
            .false_ => return 5,
        }
    }
};

/// Error set shared by every scanning routine in this file.
/// NOTE(review): several members (e.g. `MissingToken`, `MissingCloseBrace`,
/// `MissingOpenQuote`, `MissingColon`, `MissingCloseBracket`) are not raised by
/// the code visible here — presumably used elsewhere; confirm before pruning.
const TokenizeError = error{ UnexpectedToken, MissingToken, OutOfMemory, UnexpectedSlash, MissingCloseBrace, MissingOpenQuote, MissingColon, MissingCloseBracket, LeadingZero, UnclosedString, UnexpectedEndOfInput, UnclosedComment };

// State
/// Tokens accumulated so far; `tokenize` hands them to the caller as an owned
/// slice on success and deinitializes the list on error.
tokens: std.ArrayList(Token) = .empty,
/// Byte offset of the cursor into `str`.
index: usize = 0,

// Initial fields
/// Allocator used for the `tokens` list (and therefore for the returned slice).
allocator: std.mem.Allocator,
/// Input text to tokenize.
str: []const u8 = "",
/// Input dialect. Defaults to JSONC; `JSON` disables comments and `JSON5`
/// enables the extended JSON5 lexical forms.
kind: JsonFormat = JsonFormat.JSONC,

/// Tokenize `self.str` from the cursor to the end of input and return every
/// token, terminated by a zero-width `end_of_file` token.
///
/// A leading UTF-8 byte-order mark is skipped. Whitespace and comments are
/// emitted as tokens rather than discarded. On success the caller owns the
/// returned slice and must free it with `self.allocator`; on error the partial
/// token list is released before the error propagates.
pub fn tokenize(self: *Tokenizer) ![]const Token {
    errdefer self.tokens.deinit(self.allocator);
    // Reserve room for one token per input byte plus the end_of_file marker.
    try self.tokens.ensureTotalCapacity(self.allocator, self.str.len + 1);

    const bom = "\xEF\xBB\xBF";
    if (std.mem.startsWith(u8, self.str, bom)) self.index = bom.len;

    const is_json5 = self.kind == JsonFormat.JSON5;
    while (self.char()) |c| {
        const token: Token = scan: {
            // JSON5 widens the alphabet: unquoted identifiers (including the
            // bare `Infinity`/`NaN` values), single-quoted strings, numbers
            // starting with `+` or `.`, and vertical tab / form feed whitespace.
            if (is_json5) {
                if (isIdentStart(c)) break :scan try self.identifierOrKeyword();
                switch (c) {
                    '\'' => break :scan try self.string('\''),
                    '+', '.' => break :scan try self.number(),
                    0x0b, 0x0c => break :scan try self.getWhitespace(),
                    else => {},
                }
            }

            // Dispatch shared by every format.
            const at = self.index;
            break :scan switch (c) {
                '{' => Token.init(.open_brace, .init(at, at + 1)),
                '}' => Token.init(.close_brace, .init(at, at + 1)),
                '[' => Token.init(.open_bracket, .init(at, at + 1)),
                ']' => Token.init(.close_bracket, .init(at, at + 1)),
                ':' => Token.init(.colon, .init(at, at + 1)),
                ',' => Token.init(.comma, .init(at, at + 1)),
                't' => try self.findLiteral(.true_),
                'f' => try self.findLiteral(.false_),
                'n' => try self.findLiteral(.null_),
                '"' => try self.string('"'),
                '/' => try self.comment(),
                '0'...'9', '-' => try self.number(),
                ' ', '\t', '\n', '\r' => try self.getWhitespace(),
                else => return TokenizeError.UnexpectedToken,
            };
        };
        try self.addToken(token);
    }

    try self.addToken(Token.fixed(.end_of_file, self.index));
    return try self.tokens.toOwnedSlice(self.allocator);
}

/// Check that the keyword spelled by `kind` (`true`, `false`, or `null`) sits
/// at the cursor and return its fixed-width token. The cursor is not moved.
/// Returns `UnexpectedToken` when the text differs or `kind` is not one of the
/// three keyword literals.
fn findLiteral(self: *const Tokenizer, kind: Token.Kind) TokenizeError!Token {
    return switch (kind) {
        .null_ => if (self.matches("null"))
            Token.fixed(.null_, self.index)
        else
            TokenizeError.UnexpectedToken,
        .true_ => if (self.matches("true"))
            Token.fixed(.true_, self.index)
        else
            TokenizeError.UnexpectedToken,
        .false_ => if (self.matches("false"))
            Token.fixed(.false_, self.index)
        else
            TokenizeError.UnexpectedToken,
        else => error.UnexpectedToken,
    };
}

/// Consume the run of whitespace at the cursor and return it as a single
/// `whitespace` token. The cursor is advanced past the run.
///
/// The accepted bytes depend on the format: JSON and JSONC allow only space,
/// tab, LF and CR (the RFC 8259 `ws` set); JSON5 additionally allows vertical
/// tab (0x0b) and form feed (0x0c). This mirrors the dispatch in `tokenize`,
/// which only routes 0x0b / 0x0c here in JSON5 mode, so a strict document can
/// no longer smuggle those bytes through in the middle of a whitespace run.
///
/// Must only be called with the cursor on a whitespace byte; an empty run is
/// treated as unreachable.
fn getWhitespace(self: *Tokenizer) TokenizeError!Token {
    const json5 = self.kind == JsonFormat.JSON5;
    const start = self.index;
    while (self.char()) |c| {
        const is_space = switch (c) {
            ' ', '\t', '\n', '\r' => true,
            0x0b, 0x0c => json5,
            else => false,
        };
        if (!is_space) break;
        self.index += 1;
    }
    // Callers dispatch here only after seeing a whitespace byte for the
    // current format, so at least one byte has been consumed.
    if (start == self.index) unreachable;
    return .init(.whitespace, .init(start, self.index));
}

// =====================
// CONVENIENCE FUNCTIONS
// =====================

/// Byte under the cursor, or null once the cursor has reached the end of input.
fn char(self: *const Tokenizer) ?u8 {
    return if (self.index < self.str.len) self.str[self.index] else null;
}

/// Append `token` to the token list and move the cursor to the token's end.
/// The cursor is left untouched if the append fails.
fn addToken(self: *Tokenizer, token: Token) TokenizeError!void {
    const resume_at = token.span.end;
    try self.tokens.append(self.allocator, token);
    self.index = resume_at;
}

/// Report whether the input, starting at the cursor, begins with `str`.
/// Does not move the cursor. An empty `str` always matches.
fn matches(self: *const Tokenizer, str: []const u8) bool {
    const remaining = self.str[self.index..];
    return std.mem.startsWith(u8, remaining, str);
}

/// True for the ASCII decimal digits `0` through `9`.
fn isDigit(c: u8) bool {
    return switch (c) {
        '0'...'9' => true,
        else => false,
    };
}

/// ASCII subset of an ECMAScript IdentifierStart: `$`, `_`, and the letters
/// `a`-`z` / `A`-`Z`. Unicode and `\u` escapes in identifiers are out of scope
/// (the suite's `todo/` cases).
fn isIdentStart(c: u8) bool {
    return switch (c) {
        'a'...'z', 'A'...'Z', '_', '$' => true,
        else => false,
    };
}

/// True for bytes that may continue an identifier: a decimal digit or anything
/// accepted by `isIdentStart`.
fn isIdentPart(c: u8) bool {
    if (isDigit(c)) return true;
    return isIdentStart(c);
}

/// Consume the run of identifier bytes at the cursor and classify it.
/// The reserved value keywords `true`, `false`, and `null` become their
/// dedicated token kinds; every other word (`Infinity`, `NaN`, bare object
/// keys) becomes an `identifier` token. The cursor ends just past the run.
fn identifierOrKeyword(self: *Tokenizer) TokenizeError!Token {
    const begin = self.index;
    var end = begin;
    while (end < self.str.len and isIdentPart(self.str[end])) end += 1;
    self.index = end;

    const Reserved = struct { text: []const u8, kind: Token.Kind };
    const reserved = [_]Reserved{
        .{ .text = "true", .kind = .true_ },
        .{ .text = "false", .kind = .false_ },
        .{ .text = "null", .kind = .null_ },
    };

    const word = self.str[begin..end];
    const kind: Token.Kind = for (reserved) |entry| {
        if (std.mem.eql(u8, word, entry.text)) break entry.kind;
    } else .identifier;

    return .init(kind, .init(begin, end));
}

/// True for ASCII hexadecimal digits: `0`-`9`, `a`-`f`, and `A`-`F`.
fn isHexDigit(c: u8) bool {
    return switch (c) {
        '0'...'9', 'a'...'f', 'A'...'F' => true,
        else => false,
    };
}

// ========================
// TERMINAL TOKEN FUNCTIONS
// ========================

/// Scan a quoted string that starts at the cursor and is closed by `delimiter`.
/// The returned `string` token spans both quotes; the contents may be empty.
///
/// Escapes are skipped, not decoded. Outside JSON5 only the JSON escapes
/// (`\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, `\uXXXX`) are accepted.
/// In JSON5 any escaped unit is consumed, including a `\` + CRLF line
/// continuation; validation is left to the parser.
///
/// Errors: `UnexpectedToken` for a raw control byte (0x00-0x1f), an unknown
/// escape, or a non-hex digit in `\u`; `UnclosedString` when the input ends
/// before the closing quote.
fn string(self: *Tokenizer, delimiter: u8) TokenizeError!Token {
    const lenient_escapes = self.kind == JsonFormat.JSON5;
    const open = self.index;
    self.index += 1; // step over the opening quote

    while (self.char()) |c| {
        if (c == delimiter) {
            self.index += 1; // include the closing quote in the span
            return .init(.string, .init(open, self.index));
        }
        if (c < 0x20) return TokenizeError.UnexpectedToken;
        if (c != '\\') {
            self.index += 1;
            continue;
        }

        // Escape sequence: step over the backslash, then inspect what follows.
        self.index += 1;
        const escaped = self.char() orelse return TokenizeError.UnclosedString;

        if (lenient_escapes) {
            // Consume the escaped unit so `\'` or `\<newline>` cannot end the
            // string; a CR LF pair counts as one line continuation.
            self.index += 1;
            if (escaped == '\r' and self.char() == '\n') self.index += 1;
            continue;
        }

        switch (escaped) {
            '"', '\\', '/', 'b', 'f', 'n', 'r', 't', 'u' => self.index += 1,
            else => return TokenizeError.UnexpectedToken,
        }
        if (escaped == 'u') {
            // `\u` must be followed by exactly four hex digits.
            var remaining: usize = 4;
            while (remaining > 0) : (remaining -= 1) {
                const hex = self.char() orelse return TokenizeError.UnclosedString;
                if (!isHexDigit(hex)) return TokenizeError.UnexpectedToken;
                self.index += 1;
            }
        }
    }
    return error.UnclosedString;
}

/// Scan a number at the cursor and return it as a `number` token.
/// In JSON5 mode this delegates to `numberJson5`. Otherwise the strict JSON
/// grammar applies: optional `-`, an integer part with no leading zero, an
/// optional fraction, and an optional exponent.
///
/// Errors: `LeadingZero` for `0` followed by a digit; `UnexpectedEndOfInput`
/// when the input ends where a digit is required; `UnexpectedToken` when a
/// non-digit appears where a digit is required.
fn number(self: *Tokenizer) TokenizeError!Token {
    if (self.kind == JsonFormat.JSON5) return self.numberJson5();
    const begin = self.index;

    // Optional minus sign.
    if (self.char() == '-') self.index += 1;

    // Integer part: a lone zero, or a non-zero digit followed by more digits.
    const lead = self.char() orelse return TokenizeError.UnexpectedEndOfInput;
    if (lead == '0') {
        self.index += 1;
        if (self.char()) |next| {
            if (isDigit(next)) return TokenizeError.LeadingZero;
        }
    } else if (isDigit(lead)) {
        self.index += 1;
        while (self.char()) |d| : (self.index += 1) {
            if (!isDigit(d)) break;
        }
    } else {
        return TokenizeError.UnexpectedToken;
    }

    // Fraction: `.` followed by at least one digit.
    if (self.char() == '.') {
        self.index += 1;
        const first = self.char() orelse return TokenizeError.UnexpectedEndOfInput;
        if (!isDigit(first)) return TokenizeError.UnexpectedToken;
        self.index += 1;
        while (self.char()) |d| : (self.index += 1) {
            if (!isDigit(d)) break;
        }
    }

    // Exponent: `e`/`E`, optional sign, then at least one digit.
    const marker = self.char();
    if (marker == 'e' or marker == 'E') {
        self.index += 1;
        const sign = self.char();
        if (sign == '+' or sign == '-') self.index += 1;

        const first = self.char() orelse return TokenizeError.UnexpectedEndOfInput;
        if (!isDigit(first)) return TokenizeError.UnexpectedToken;
        self.index += 1;
        while (self.char()) |d| : (self.index += 1) {
            if (!isDigit(d)) break;
        }
    }

    return .init(.number, .init(begin, self.index));
}

/// JSON5 number: optional `+`/`-` sign, then one of
///   * `Infinity` / `NaN`              (non-finite; the parser lifts these to an
///                                      extended `number_special` node)
///   * `0x`-prefixed hexadecimal integer
///   * a decimal with an optional leading or trailing `.` and optional exponent
/// Leading zeros on a decimal integer stay illegal (no octal), matching JSON.
///
/// Returns a `number` token spanning the sign through the last consumed byte.
///
/// Errors:
///   * `LeadingZero` — `0` immediately followed by another decimal digit.
///   * `UnexpectedToken` — bare `0x`, no digits at all (lone sign or `.`), or a
///     non-digit where the exponent's first digit is required.
///   * `UnexpectedEndOfInput` — the input ends where the exponent's first digit
///     is required. This split matches the strict-JSON `number` scanner, so an
///     input such as `1ex` is no longer reported as a premature end of input.
fn numberJson5(self: *Tokenizer) TokenizeError!Token {
    const start = self.index;

    if (self.char() == '+' or self.char() == '-') self.index += 1;

    if (self.matches("Infinity")) {
        self.index += "Infinity".len;
        return .init(.number, .init(start, self.index));
    }
    if (self.matches("NaN")) {
        self.index += "NaN".len;
        return .init(.number, .init(start, self.index));
    }

    // Hexadecimal integer: `0x`/`0X` then at least one hex digit.
    if (self.char() == '0' and (self.peek(1) == 'x' or self.peek(1) == 'X')) {
        self.index += 2;
        const hex_start = self.index;
        while (self.char()) |c| {
            if (!isHexDigit(c)) break;
            self.index += 1;
        }
        if (self.index == hex_start) return TokenizeError.UnexpectedToken; // bare `0x`
        return .init(.number, .init(start, self.index));
    }

    var digits_seen = false;

    // Integer part. A leading zero may not be followed by another digit (octal).
    if (self.char() == '0') {
        self.index += 1;
        digits_seen = true;
        if (self.char()) |c| if (isDigit(c)) return TokenizeError.LeadingZero;
    } else {
        while (self.char()) |c| {
            if (!isDigit(c)) break;
            self.index += 1;
            digits_seen = true;
        }
    }

    // Fractional part. Leading (`.5`) and trailing (`5.`) points are both legal.
    if (self.char() == '.') {
        self.index += 1;
        while (self.char()) |c| {
            if (!isDigit(c)) break;
            self.index += 1;
            digits_seen = true;
        }
    }

    // A lone `.` (or sign with no digits at all) is not a number.
    if (!digits_seen) return TokenizeError.UnexpectedToken;

    // Exponent: `e`/`E`, optional sign, then at least one digit.
    if (self.char()) |c| {
        if (c == 'e' or c == 'E') {
            self.index += 1;
            if (self.char()) |sign| {
                if (sign == '+' or sign == '-') self.index += 1;
            }
            // Distinguish "input ended" from "wrong byte", as `number` does.
            const first_exponent = self.char() orelse return TokenizeError.UnexpectedEndOfInput;
            if (!isDigit(first_exponent)) return TokenizeError.UnexpectedToken;

            while (self.char()) |d| {
                if (!isDigit(d)) break;
                self.index += 1;
            }
        }
    }

    return .init(.number, .init(start, self.index));
}

/// Byte located `n` positions past the cursor, or null when that position is
/// at or beyond the end of input. The cursor is not moved.
fn peek(self: *const Tokenizer, n: usize) ?u8 {
    const target = self.index + n;
    return if (target < self.str.len) self.str[target] else null;
}

/// Scan a comment that starts at the cursor and return it as a `comment` token.
///
/// `//` comments run up to, but not including, the next LF or CR (or the end
/// of input). `/* ... */` comments run through the closing `*/`.
///
/// Errors: `UnexpectedSlash` when the format is strict JSON, when the slash is
/// the last byte of input, or when it is not followed by `/` or `*`;
/// `UnclosedComment` when a block comment has no closing `*/`.
fn comment(self: *Tokenizer) TokenizeError!Token {
    // Strict JSON has no comment syntax.
    if (self.kind == JsonFormat.JSON) return error.UnexpectedSlash;

    const start = self.index;
    const body_start = start + 2;
    // A slash as the very last byte cannot open a comment.
    if (body_start > self.str.len) return TokenizeError.UnexpectedSlash;

    switch (self.str[start + 1]) {
        '/' => {
            // A line comment ends at any line terminator; JSON5/ES allow a
            // bare CR (and CRLF) to close it, not just LF.
            self.index = std.mem.indexOfAnyPos(u8, self.str, body_start, "\n\r") orelse self.str.len;
            return .init(.comment, .init(start, self.index));
        },
        '*' => {
            if (std.mem.indexOfPos(u8, self.str, body_start, "*/")) |close| {
                self.index = close + 2;
                return .init(.comment, .init(start, self.index));
            }
            // No terminator: park the cursor on the last byte examined.
            self.index = @max(body_start, self.str.len - 1);
            return TokenizeError.UnclosedComment;
        },
        else => return TokenizeError.UnexpectedSlash,
    }
}

// =======
// Testing
// =======

// Run tests standalone with
// `zig build test -Dtest-filter=tokenizer --summary all`

/// Test shorthand: build a token of `kind` covering bytes `start..end`.
fn tok(kind: Token.Kind, start: usize, end: usize) Token {
    return .init(kind, .init(start, end));
}

/// Tokenize `input` with the default format and require the resulting tokens
/// to equal `expected` exactly.
fn testTokenizer(input: []const u8, expected: []const Token) !void {
    var tokenizer = Tokenizer{ .allocator = testing.allocator, .str = input };
    const actual = try tokenizer.tokenize();
    defer testing.allocator.free(actual);
    try testing.expectEqualSlices(Token, expected, actual);
}

/// Tokenize `input` as `format` and require the call to fail with
/// `expected_error`. A successful tokenization fails the test.
fn testTokenizerError(input: []const u8, format: JsonFormat, expected_error: anyerror) !void {
    var tokenizer = Tokenizer{
        .allocator = testing.allocator,
        .str = input,
        .kind = format,
    };

    const tokens = tokenizer.tokenize() catch |err| {
        return testing.expectEqual(expected_error, err);
    };
    // Reaching this point means tokenization unexpectedly succeeded.
    defer testing.allocator.free(tokens);
    try testing.expect(false);
}

// A compact array of two strings: only structural and string tokens are
// produced, followed by the zero-width end_of_file token.
test "array no whitespace" {
    try testTokenizer(
        \\["hello","there"]
    , &.{
        tok(.open_bracket, 0, 1),
        tok(.string, 1, 8),
        tok(.comma, 8, 9),
        tok(.string, 9, 16),
        tok(.close_bracket, 16, 17),
        tok(.end_of_file, 17, 17),
    });
}

// Whitespace is emitted as tokens, and a run of several spaces collapses into
// one token (see the 12..14 span). Checked for both an array and an object.
test "whitespace" {
    try testTokenizer(" [ \"hello\" ,  \"there\" ] ", &.{
        tok(.whitespace, 0, 1),
        tok(.open_bracket, 1, 2),
        tok(.whitespace, 2, 3),
        tok(.string, 3, 10),
        tok(.whitespace, 10, 11),
        tok(.comma, 11, 12),
        tok(.whitespace, 12, 14),
        tok(.string, 14, 21),
        tok(.whitespace, 21, 22),
        tok(.close_bracket, 22, 23),
        tok(.whitespace, 23, 24),
        tok(.end_of_file, 24, 24),
    });
    try testTokenizer(" { \"hello\" :  \"there\" } ", &.{
        tok(.whitespace, 0, 1),
        tok(.open_brace, 1, 2),
        tok(.whitespace, 2, 3),
        tok(.string, 3, 10),
        tok(.whitespace, 10, 11),
        tok(.colon, 11, 12),
        tok(.whitespace, 12, 14),
        tok(.string, 14, 21),
        tok(.whitespace, 21, 22),
        tok(.close_brace, 22, 23),
        tok(.whitespace, 23, 24),
        tok(.end_of_file, 24, 24),
    });
}

// An object whose single value is an array, with irregular spacing around the
// comma inside the array.
test "object with array" {
    try testTokenizer(
        \\{"array": ["hello" ,  "there"]}
    , &.{
        tok(.open_brace, 0, 1),
        tok(.string, 1, 8),
        tok(.colon, 8, 9),
        tok(.whitespace, 9, 10),
        tok(.open_bracket, 10, 11),
        tok(.string, 11, 18),
        tok(.whitespace, 18, 19),
        tok(.comma, 19, 20),
        tok(.whitespace, 20, 22),
        tok(.string, 22, 29),
        tok(.close_bracket, 29, 30),
        tok(.close_brace, 30, 31),
        tok(.end_of_file, 31, 31),
    });
}

// One of each primitive value: the three keyword literals, a string, and an
// integer, each mapped to its own token kind.
test "primitives" {
    try testTokenizer(
        \\[true, false, null, "string", 40334]
    , &.{
        tok(.open_bracket, 0, 1),
        tok(.true_, 1, 5),
        tok(.comma, 5, 6),
        tok(.whitespace, 6, 7),
        tok(.false_, 7, 12),
        tok(.comma, 12, 13),
        tok(.whitespace, 13, 14),
        tok(.null_, 14, 18),
        tok(.comma, 18, 19),
        tok(.whitespace, 19, 20),
        tok(.string, 20, 28),
        tok(.comma, 28, 29),
        tok(.whitespace, 29, 30),
        tok(.number, 30, 35),
        tok(.close_bracket, 35, 36),
        tok(.end_of_file, 36, 36),
    });
}

// Valid number shapes: plain and negative integers, zero, fractions, and
// exponents with and without an explicit sign.
test "numbers" {
    try testTokenizer(
        \\[1,-1,0,0.2,12e+3,1e10,0e1,-0.2,1e+10]
    , &.{
        tok(.open_bracket, 0, 1),
        tok(.number, 1, 2),
        tok(.comma, 2, 3),
        tok(.number, 3, 5),
        tok(.comma, 5, 6),
        tok(.number, 6, 7),
        tok(.comma, 7, 8),
        tok(.number, 8, 11),
        tok(.comma, 11, 12),
        tok(.number, 12, 17),
        tok(.comma, 17, 18),
        tok(.number, 18, 22),
        tok(.comma, 22, 23),
        tok(.number, 23, 26),
        tok(.comma, 26, 27),
        tok(.number, 27, 31),
        tok(.comma, 31, 32),
        tok(.number, 32, 37),
        tok(.close_bracket, 37, 38),
        tok(.end_of_file, 38, 38),
    });
}

// Empty containers produce just the open/close pair and end_of_file.
test "empty object/array" {
    try testTokenizer(
        \\[]
    , &.{
        tok(.open_bracket, 0, 1),
        tok(.close_bracket, 1, 2),
        tok(.end_of_file, 2, 2),
    });
    try testTokenizer(
        \\{}
    , &.{
        tok(.open_brace, 0, 1),
        tok(.close_brace, 1, 2),
        tok(.end_of_file, 2, 2),
    });
}

// Each input is a proper prefix of `true`, `false`, or `null`.
test "truncated literals are rejected" {
    const inputs = [_][]const u8{ "tru", "fals", "n" };
    for (inputs) |input| {
        try testTokenizerError(input, .JSONC, error.UnexpectedToken);
    }
}

// Outside JSON5, strings accept only the JSON escapes and no raw control bytes.
test "strings reject invalid JSON escapes and control bytes" {
    const Case = struct { input: []const u8, expected: anyerror };
    const cases = [_]Case{
        .{ .input = "\"\\x\"", .expected = error.UnexpectedToken },
        .{ .input = "\"line\nbreak\"", .expected = error.UnexpectedToken },
        .{ .input = "\"\\u12", .expected = error.UnclosedString },
        .{ .input = "\"\\u12g4\"", .expected = error.UnexpectedToken },
    };
    for (cases) |case| {
        try testTokenizerError(case.input, .JSONC, case.expected);
    }
}

// Malformed numbers under the strict grammar, with the specific error each
// one must raise.
test "strict JSON numbers reject invalid forms" {
    const Case = struct { input: []const u8, expected: anyerror };
    const cases = [_]Case{
        .{ .input = "-", .expected = error.UnexpectedEndOfInput },
        .{ .input = "-.2", .expected = error.UnexpectedToken },
        .{ .input = "1.", .expected = error.UnexpectedEndOfInput },
        .{ .input = "1e+", .expected = error.UnexpectedEndOfInput },
        .{ .input = "-012", .expected = error.LeadingZero },
        .{ .input = "012", .expected = error.LeadingZero },
    };
    for (cases) |case| {
        try testTokenizerError(case.input, .JSONC, case.expected);
    }
}

// Comment handling. `testTokenizer` uses the tokenizer's default format
// (JSONC), so comments are accepted there; the final case switches to strict
// JSON, where a slash is rejected.
test "JSONC comments" {
    // Line comment: the token stops before the newline, which is then emitted
    // as whitespace.
    try testTokenizer("[// hi\n1]", &.{
        tok(.open_bracket, 0, 1),
        tok(.comment, 1, 6),
        tok(.whitespace, 6, 7),
        tok(.number, 7, 8),
        tok(.close_bracket, 8, 9),
        tok(.end_of_file, 9, 9),
    });

    // A lone slash at the end of input is not a comment.
    try testTokenizerError("/", .JSONC, error.UnexpectedSlash);
    //multiline test
    try testTokenizer("/* hi */", &.{ tok(.comment, 0, 8), tok(.end_of_file, 8, 8) });
    // multiline inline test
    try testTokenizer("{\"hello\":/* hi */\"world\"}",
        &.{
            tok(.open_brace, 0, 1),
            tok(.string, 1, 8),
            tok(.colon, 8, 9),
            tok(.comment, 9, 17),
            tok(.string, 17, 24),
            tok(.close_brace, 24, 25),
            tok(.end_of_file, 25, 25)});
    // Block comment spanning several lines: the whole comment, newlines
    // included, is a single token.
    try testTokenizer( // multiline comment test
        \\{
        \\  "hello": "world"
        \\/*
        \\hello, this is  a
        \\multiline comment
        \\*/
        \\}
        ,
        &.{
            tok(.open_brace, 0, 1),
            tok(.whitespace, 1, 4),
            tok(.string, 4, 11),
            tok(.colon, 11, 12),
            tok(.whitespace, 12, 13),
            tok(.string, 13, 20),
            tok(.whitespace, 20, 21),
            tok(.comment, 21, 62),
            tok(.whitespace, 62, 63),
            tok(.close_brace, 63, 64),
            tok(.end_of_file, 64, 64)});
    // Strict JSON rejects comments outright.
    try testTokenizerError("// hi", .JSON, error.UnexpectedSlash);
}