// rustik-highlight 0.1.0
//
// Rustik code highlighter.
//! Fast line-oriented JSON tokenization.
//!
//! The TextMate JSON grammar is expensive for large blobs because it drives a
//! recursive regex stack. This module keeps JSON highlighting bounded and
//! allocation-light by scanning one line at a time.

use crate::{ScopeId, ScopeSpan};

// NOTE: each `ScopeId` below doubles as an index into `SCOPES`; keep the two
// lists in sync when adding scopes.

/// Scope id for JSON property names (object keys).
pub const PROPERTY_NAME: ScopeId = ScopeId::new(0);
/// Scope id for JSON string delimiters (the opening and closing `"`).
pub const STRING_PUNCTUATION: ScopeId = ScopeId::new(1);
/// Scope id for JSON string content (between the quotes, excluding them).
pub const STRING: ScopeId = ScopeId::new(2);
/// Scope id for JSON numbers.
pub const NUMBER: ScopeId = ScopeId::new(3);
/// Scope id for JSON constants such as `true`, `false`, and `null`.
pub const CONSTANT: ScopeId = ScopeId::new(4);

/// Scope names emitted by [`tokenize_line_into`], indexed by `ScopeId`.
pub const SCOPES: &[&str] = &[
    "support.type.property-name.json",
    "punctuation.definition.string.json",
    "string.quoted.double.json",
    "constant.numeric.json",
    "constant.language.json",
];

/// Tokenizes one JSON line into caller-owned storage.
///
/// `spans` is cleared and refilled; span offsets are byte positions into
/// `line` after any trailing `\r`/`\n` has been stripped. The scanner is
/// deliberately lenient (it highlights, it does not validate).
pub fn tokenize_line_into(line: &str, spans: &mut Vec<ScopeSpan>) {
    spans.clear();
    let src = line.trim_end_matches(['\r', '\n']).as_bytes();
    let mut cursor = 0;

    while cursor < src.len() {
        let rest = &src[cursor..];
        cursor = match src[cursor] {
            b'"' => {
                let close = string_end(src, cursor + 1);
                // A string directly followed by a colon is an object key.
                let content_scope = if string_is_key(src, close) {
                    PROPERTY_NAME
                } else {
                    STRING
                };
                push(spans, cursor, cursor + 1, STRING_PUNCTUATION);
                push(spans, cursor + 1, close, content_scope);
                if close < src.len() {
                    // Closing quote present on this line.
                    push(spans, close, close + 1, STRING_PUNCTUATION);
                    close + 1
                } else {
                    // Unterminated string: the content span runs to end of line.
                    close
                }
            }
            b'-' | b'0'..=b'9' => {
                let after = number_end(src, cursor);
                push(spans, cursor, after, NUMBER);
                after
            }
            b't' if rest.starts_with(b"true") => {
                push(spans, cursor, cursor + 4, CONSTANT);
                cursor + 4
            }
            b'f' if rest.starts_with(b"false") => {
                push(spans, cursor, cursor + 5, CONSTANT);
                cursor + 5
            }
            b'n' if rest.starts_with(b"null") => {
                push(spans, cursor, cursor + 4, CONSTANT);
                cursor + 4
            }
            _ => cursor + 1,
        };
    }
}

/// Adds a scope span covering `start..end`.
///
/// Empty or inverted ranges are silently dropped so callers need not guard.
fn push(spans: &mut Vec<ScopeSpan>, start: usize, end: usize, scope: ScopeId) {
    if end <= start {
        return;
    }
    spans.push(ScopeSpan { start, end, scope });
}

/// Finds the closing quote for a JSON string, honoring backslash escapes.
///
/// `pos` is the byte index of the first content byte (just past the opening
/// quote). Returns the index of the closing `"`, or `bytes.len()` when the
/// string is unterminated on this line.
fn string_end(bytes: &[u8], mut pos: usize) -> usize {
    let len = bytes.len();
    while pos < len {
        match bytes[pos] {
            b'"' => return pos,
            // An escape consumes the introducer and its payload byte.
            b'\\' => pos += 2,
            _ => pos += 1,
        }
    }
    // Clamp: a trailing lone backslash could have stepped one past the end.
    len
}

/// Returns whether the string whose closing quote sits at `end` is followed
/// (after optional ASCII whitespace) by an object colon, i.e. is a key.
fn string_is_key(bytes: &[u8], end: usize) -> bool {
    // Start just past the closing quote; saturate so an unterminated string
    // (end == bytes.len()) cannot overflow.
    let after = end.saturating_add(1);
    bytes
        .get(after..)
        .unwrap_or(&[])
        .iter()
        .copied()
        .find(|byte| !byte.is_ascii_whitespace())
        == Some(b':')
}

/// Returns the byte position after a JSON number-like token starting at `start`.
///
/// Lenient by design: shapes strict JSON rejects (a bare `-`, `1.`, `2e`)
/// are still consumed, which is acceptable for highlighting.
fn number_end(bytes: &[u8], start: usize) -> usize {
    let mut pos = start;
    // Optional sign.
    if bytes.get(pos) == Some(&b'-') {
        pos += 1;
    }
    // Integer part.
    pos = skip_digits(bytes, pos);
    // Optional fractional part.
    if bytes.get(pos) == Some(&b'.') {
        pos = skip_digits(bytes, pos + 1);
    }
    // Optional exponent with optional sign.
    if matches!(bytes.get(pos), Some(b'e' | b'E')) {
        pos += 1;
        if matches!(bytes.get(pos), Some(b'+' | b'-')) {
            pos += 1;
        }
        pos = skip_digits(bytes, pos);
    }
    pos
}

/// Advances `pos` past a (possibly empty) run of ASCII digits.
fn skip_digits(bytes: &[u8], mut pos: usize) -> usize {
    while bytes.get(pos).is_some_and(|byte| byte.is_ascii_digit()) {
        pos += 1;
    }
    pos
}

#[cfg(test)]
mod tests {
    use super::*;

    /// True when some emitted span carries `scope` and covers exactly `text`.
    fn has_scope(line: &str, spans: &[ScopeSpan], scope: ScopeId, text: &str) -> bool {
        spans
            .iter()
            .any(|span| span.scope == scope && &line[span.start..span.end] == text)
    }

    #[test]
    fn tokenizes_keys_strings_numbers_and_constants() {
        let line = r#"  "a": "b\"c", "n": 12.5e-1, "ok": true"#;
        let mut spans = Vec::new();

        tokenize_line_into(line, &mut spans);

        assert!(has_scope(line, &spans, PROPERTY_NAME, "a"));
        assert!(has_scope(line, &spans, STRING, r#"b\"c"#));
        assert!(has_scope(line, &spans, NUMBER, "12.5e-1"));
        assert!(has_scope(line, &spans, CONSTANT, "true"));
    }

    #[test]
    fn keeps_unterminated_strings_on_the_same_line() {
        let line = r#"  "a": "unterminated"#;
        let mut spans = Vec::new();

        tokenize_line_into(line, &mut spans);

        assert!(has_scope(line, &spans, STRING, "unterminated"));
    }
}