Skip to main content

rustik_highlight/
json.rs

1//! Fast line-oriented JSON tokenization.
2//!
3//! The TextMate JSON grammar is expensive for large blobs because it drives a
4//! recursive regex stack. This module keeps JSON highlighting bounded and
5//! allocation-light by scanning one line at a time.
6
7use crate::{ScopeId, ScopeSpan};
8
/// Scope id for JSON property names.
///
/// Each `ScopeId` below is the index of its scope name in [`SCOPES`]; the
/// two lists must be kept in sync.
pub const PROPERTY_NAME: ScopeId = ScopeId::new(0);
/// Scope id for JSON string delimiters (the opening and closing quotes).
pub const STRING_PUNCTUATION: ScopeId = ScopeId::new(1);
/// Scope id for JSON string content.
pub const STRING: ScopeId = ScopeId::new(2);
/// Scope id for JSON numbers.
pub const NUMBER: ScopeId = ScopeId::new(3);
/// Scope id for JSON constants such as `true`, `false`, and `null`.
pub const CONSTANT: ScopeId = ScopeId::new(4);

/// Scope names emitted by [`tokenize_line_into`], indexed by the `ScopeId`
/// constants above.
pub const SCOPES: &[&str] = &[
    "support.type.property-name.json",
    "punctuation.definition.string.json",
    "string.quoted.double.json",
    "constant.numeric.json",
    "constant.language.json",
];
28
29/// Tokenizes one JSON line into caller-owned storage.
30pub fn tokenize_line_into(line: &str, spans: &mut Vec<ScopeSpan>) {
31    spans.clear();
32    let line = line.trim_end_matches(['\r', '\n']);
33    let bytes = line.as_bytes();
34    let mut pos = 0;
35
36    while pos < bytes.len() {
37        match bytes[pos] {
38            b'"' => {
39                let end = string_end(bytes, pos + 1);
40                let content_scope = if string_is_key(bytes, end) {
41                    PROPERTY_NAME
42                } else {
43                    STRING
44                };
45                push(spans, pos, pos + 1, STRING_PUNCTUATION);
46                push(spans, pos + 1, end, content_scope);
47
48                if end < bytes.len() {
49                    push(spans, end, end + 1, STRING_PUNCTUATION);
50                    pos = end + 1;
51                } else {
52                    pos = end;
53                }
54            }
55            b'-' | b'0'..=b'9' => {
56                let end = number_end(bytes, pos);
57                push(spans, pos, end, NUMBER);
58                pos = end;
59            }
60            b't' if bytes[pos..].starts_with(b"true") => {
61                push(spans, pos, pos + 4, CONSTANT);
62                pos += 4;
63            }
64            b'f' if bytes[pos..].starts_with(b"false") => {
65                push(spans, pos, pos + 5, CONSTANT);
66                pos += 5;
67            }
68            b'n' if bytes[pos..].starts_with(b"null") => {
69                push(spans, pos, pos + 4, CONSTANT);
70                pos += 4;
71            }
72            _ => pos += 1,
73        }
74    }
75}
76
77/// Adds a non-empty JSON scope span.
78fn push(spans: &mut Vec<ScopeSpan>, start: usize, end: usize, scope: ScopeId) {
79    if start < end {
80        spans.push(ScopeSpan { start, end, scope });
81    }
82}
83
/// Finds the closing quote for a JSON string, honoring backslash escapes.
///
/// `pos` points at the first content byte (just past the opening quote).
/// Returns the index of the closing `"`, or `bytes.len()` when the string
/// is unterminated.
fn string_end(bytes: &[u8], mut pos: usize) -> usize {
    while pos < bytes.len() {
        match bytes[pos] {
            b'"' => return pos,
            // Jump over the backslash and whatever byte it escapes.
            b'\\' => pos += 2,
            _ => pos += 1,
        }
    }
    // A trailing backslash can step one past the end; clamp to the length.
    pos.min(bytes.len())
}
98
/// Reports whether the string whose closing quote sits at `end` names an
/// object key, i.e. the first non-whitespace byte after it is a `:`.
fn string_is_key(bytes: &[u8], end: usize) -> bool {
    let rest = bytes.get(end.saturating_add(1)..).unwrap_or_default();
    let next = rest.iter().copied().find(|b| !b.is_ascii_whitespace());
    next == Some(b':')
}
107
/// Returns the byte position just past a JSON number-like token: optional
/// sign, integer digits, optional fraction, optional exponent. The scan is
/// permissive and does not validate the token.
fn number_end(bytes: &[u8], start: usize) -> usize {
    // Consumes a (possibly empty) run of ASCII digits starting at `pos`.
    fn digits(bytes: &[u8], mut pos: usize) -> usize {
        while bytes.get(pos).is_some_and(|b| b.is_ascii_digit()) {
            pos += 1;
        }
        pos
    }

    let mut pos = start;
    if bytes.get(pos) == Some(&b'-') {
        pos += 1;
    }
    pos = digits(bytes, pos);
    if bytes.get(pos) == Some(&b'.') {
        pos = digits(bytes, pos + 1);
    }
    if matches!(bytes.get(pos), Some(b'e' | b'E')) {
        pos += 1;
        if matches!(bytes.get(pos), Some(b'+' | b'-')) {
            pos += 1;
        }
        pos = digits(bytes, pos);
    }
    pos
}
133
#[cfg(test)]
mod tests {
    use super::*;

    /// True when some produced span carries `scope` and covers exactly
    /// `text` within `line`.
    fn has_span(line: &str, spans: &[ScopeSpan], scope: ScopeId, text: &str) -> bool {
        spans
            .iter()
            .any(|span| span.scope == scope && &line[span.start..span.end] == text)
    }

    #[test]
    fn tokenizes_keys_strings_numbers_and_constants() {
        let line = r#"  "a": "b\"c", "n": 12.5e-1, "ok": true"#;
        let mut spans = Vec::new();

        tokenize_line_into(line, &mut spans);

        assert!(has_span(line, &spans, PROPERTY_NAME, "a"));
        assert!(has_span(line, &spans, STRING, r#"b\"c"#));
        assert!(has_span(line, &spans, NUMBER, "12.5e-1"));
        assert!(has_span(line, &spans, CONSTANT, "true"));
    }

    #[test]
    fn keeps_unterminated_strings_on_the_same_line() {
        let line = r#"  "a": "unterminated"#;
        let mut spans = Vec::new();

        tokenize_line_into(line, &mut spans);

        assert!(has_span(line, &spans, STRING, "unterminated"));
    }
}