Skip to main content

glum_lib/
highlight.rs

1//! Lightweight token-based syntax highlighting for code blocks.
2//!
//! Intentionally simple: per-language keyword and type sets, plus generic
//! rules for line comments, strings, numbers, and function calls. Good enough
//! to give code readable structure without dragging in a full regex grammar
//! engine.
6//!
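//! Supported languages (by fenced info string): `rust`/`rs`, `python`/`py`,
//! `javascript`/`js`/`jsx`/`typescript`/`ts`/`tsx`, `go`,
//! `bash`/`sh`/`shell`/`zsh`, `json`, `yaml`/`yml`, `toml`/`ini`,
//! `html`/`xml`/`svg`, `c`/`h`, `cpp`/`c++`/`hpp`/`cxx`/`hxx`, `java`.
//! Matching is case-insensitive and only the first comma- or
//! whitespace-separated token of the info string is considered.
//! Unknown languages fall back to plain code-style spans.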
11//!
//! The highlighter operates per-line (so it can be wired into paged display).
//! No state is carried across lines: block comments (`/* ... */`) are not
//! recognized, and a string left open at the end of a line is styled through
//! the end of that line only. This is a deliberate tradeoff for simplicity and
//! avoids pathological inputs that would make a stateful scanner slow.
16
17use ratatui::style::Style;
18use ratatui::text::Span;
19
20use crate::theme::Theme;
21
22/// Produce styled spans for a single line of code in the given language.
23pub fn highlight_line(line: &str, lang: &str, theme: Theme) -> Vec<Span<'static>> {
24    let grammar = Grammar::for_lang(lang);
25    if grammar.is_none() {
26        return vec![Span::styled(line.to_string(), theme.code_style())];
27    }
28    let grammar = grammar.unwrap();
29    scan(line, grammar, theme)
30}
31
/// Static description of how to tokenize one language (or language family).
#[derive(Debug, Clone, Copy)]
struct Grammar {
    /// Words styled as keywords. Checked before `types`.
    keywords: &'static [&'static str],
    /// Words styled as built-in / well-known type names.
    types: &'static [&'static str],
    /// Line-comment prefixes (tried in order).
    line_comments: &'static [&'static str],
    /// String delimiters; both ends are the same character.
    strings: &'static [char],
    /// Whether `keywords` and `types` are matched case-sensitively. When
    /// `false`, words are compared ignoring ASCII case.
    case_sensitive: bool,
    /// Permit a function-call highlight: `identifier(` → identifier colored as fn.
    fn_call_highlight: bool,
}
46
47impl Grammar {
48    fn for_lang(raw: &str) -> Option<&'static Self> {
49        let lang = raw.trim().to_ascii_lowercase();
50        let key = lang
51            .split(|c: char| c == ',' || c.is_whitespace())
52            .next()
53            .unwrap_or("");
54        match key {
55            "rust" | "rs" => Some(&RUST),
56            "python" | "py" => Some(&PYTHON),
57            "js" | "javascript" | "jsx" | "ts" | "typescript" | "tsx" => Some(&JS),
58            "go" => Some(&GO),
59            "bash" | "sh" | "shell" | "zsh" => Some(&BASH),
60            "json" => Some(&JSON),
61            "yaml" | "yml" => Some(&YAML),
62            "toml" | "ini" => Some(&TOML),
63            "html" | "xml" | "svg" => Some(&HTML),
64            "c" | "h" => Some(&C),
65            "cpp" | "c++" | "hpp" | "cxx" | "hxx" => Some(&CPP),
66            "java" => Some(&JAVA),
67            _ => None,
68        }
69    }
70}
71
72static RUST: Grammar = Grammar {
73    keywords: &[
74        "as", "async", "await", "break", "const", "continue", "crate", "dyn", "else", "enum",
75        "extern", "false", "fn", "for", "if", "impl", "in", "let", "loop", "match", "mod", "move",
76        "mut", "pub", "ref", "return", "self", "Self", "static", "struct", "super", "trait",
77        "true", "type", "union", "unsafe", "use", "where", "while", "yield",
78    ],
79    types: &[
80        "bool", "char", "f32", "f64", "i8", "i16", "i32", "i64", "i128", "isize", "u8", "u16",
81        "u32", "u64", "u128", "usize", "str", "String", "Vec", "Option", "Result", "Box", "Rc",
82        "Arc", "HashMap", "BTreeMap",
83    ],
84    line_comments: &["//"],
85    strings: &['"'],
86    case_sensitive: true,
87    fn_call_highlight: true,
88};
89
90static PYTHON: Grammar = Grammar {
91    keywords: &[
92        "False", "None", "True", "and", "as", "assert", "async", "await", "break", "class",
93        "continue", "def", "del", "elif", "else", "except", "finally", "for", "from", "global",
94        "if", "import", "in", "is", "lambda", "nonlocal", "not", "or", "pass", "raise", "return",
95        "try", "while", "with", "yield", "match", "case",
96    ],
97    types: &[
98        "int", "float", "str", "bool", "list", "dict", "tuple", "set", "bytes",
99    ],
100    line_comments: &["#"],
101    strings: &['"', '\''],
102    case_sensitive: true,
103    fn_call_highlight: true,
104};
105
106static JS: Grammar = Grammar {
107    keywords: &[
108        "async",
109        "await",
110        "break",
111        "case",
112        "catch",
113        "class",
114        "const",
115        "continue",
116        "debugger",
117        "default",
118        "delete",
119        "do",
120        "else",
121        "enum",
122        "export",
123        "extends",
124        "false",
125        "finally",
126        "for",
127        "function",
128        "if",
129        "import",
130        "in",
131        "instanceof",
132        "interface",
133        "let",
134        "new",
135        "null",
136        "of",
137        "return",
138        "static",
139        "super",
140        "switch",
141        "this",
142        "throw",
143        "true",
144        "try",
145        "type",
146        "typeof",
147        "undefined",
148        "var",
149        "void",
150        "while",
151        "with",
152        "yield",
153    ],
154    types: &[
155        "boolean", "number", "string", "object", "symbol", "bigint", "any", "unknown", "never",
156        "void",
157    ],
158    line_comments: &["//"],
159    strings: &['"', '\'', '`'],
160    case_sensitive: true,
161    fn_call_highlight: true,
162};
163
164static GO: Grammar = Grammar {
165    keywords: &[
166        "break",
167        "case",
168        "chan",
169        "const",
170        "continue",
171        "default",
172        "defer",
173        "else",
174        "fallthrough",
175        "for",
176        "func",
177        "go",
178        "goto",
179        "if",
180        "import",
181        "interface",
182        "map",
183        "package",
184        "range",
185        "return",
186        "select",
187        "struct",
188        "switch",
189        "type",
190        "var",
191        "true",
192        "false",
193        "nil",
194    ],
195    types: &[
196        "bool",
197        "byte",
198        "rune",
199        "string",
200        "error",
201        "int",
202        "int8",
203        "int16",
204        "int32",
205        "int64",
206        "uint",
207        "uint8",
208        "uint16",
209        "uint32",
210        "uint64",
211        "uintptr",
212        "float32",
213        "float64",
214        "complex64",
215        "complex128",
216        "any",
217    ],
218    line_comments: &["//"],
219    strings: &['"', '`'],
220    case_sensitive: true,
221    fn_call_highlight: true,
222};
223
224static BASH: Grammar = Grammar {
225    keywords: &[
226        "if", "then", "else", "elif", "fi", "case", "esac", "for", "select", "while", "until",
227        "do", "done", "function", "in", "time", "return", "break", "continue", "export", "local",
228        "readonly", "source", "alias", "unset", "trap",
229    ],
230    types: &[],
231    line_comments: &["#"],
232    strings: &['"', '\''],
233    case_sensitive: true,
234    fn_call_highlight: false,
235};
236
237static JSON: Grammar = Grammar {
238    keywords: &["true", "false", "null"],
239    types: &[],
240    line_comments: &[],
241    strings: &['"'],
242    case_sensitive: true,
243    fn_call_highlight: false,
244};
245
246static YAML: Grammar = Grammar {
247    keywords: &["true", "false", "null", "yes", "no", "on", "off"],
248    types: &[],
249    line_comments: &["#"],
250    strings: &['"', '\''],
251    case_sensitive: false,
252    fn_call_highlight: false,
253};
254
255static TOML: Grammar = Grammar {
256    keywords: &["true", "false"],
257    types: &[],
258    line_comments: &["#"],
259    strings: &['"', '\''],
260    case_sensitive: true,
261    fn_call_highlight: false,
262};
263
264static HTML: Grammar = Grammar {
265    keywords: &[],
266    types: &[],
267    line_comments: &[],
268    strings: &['"', '\''],
269    case_sensitive: false,
270    fn_call_highlight: false,
271};
272
273static C: Grammar = Grammar {
274    keywords: &[
275        "auto", "break", "case", "char", "const", "continue", "default", "do", "double", "else",
276        "enum", "extern", "float", "for", "goto", "if", "inline", "int", "long", "register",
277        "restrict", "return", "short", "signed", "sizeof", "static", "struct", "switch", "typedef",
278        "union", "unsigned", "void", "volatile", "while",
279    ],
280    types: &[
281        "int8_t",
282        "int16_t",
283        "int32_t",
284        "int64_t",
285        "uint8_t",
286        "uint16_t",
287        "uint32_t",
288        "uint64_t",
289        "size_t",
290        "ssize_t",
291        "ptrdiff_t",
292        "intptr_t",
293        "uintptr_t",
294        "bool",
295        "FILE",
296    ],
297    line_comments: &["//"],
298    strings: &['"', '\''],
299    case_sensitive: true,
300    fn_call_highlight: true,
301};
302
303static CPP: Grammar = Grammar {
304    keywords: &[
305        "alignas",
306        "alignof",
307        "and",
308        "auto",
309        "bool",
310        "break",
311        "case",
312        "catch",
313        "char",
314        "class",
315        "const",
316        "constexpr",
317        "continue",
318        "decltype",
319        "default",
320        "delete",
321        "do",
322        "double",
323        "else",
324        "enum",
325        "explicit",
326        "export",
327        "extern",
328        "false",
329        "final",
330        "float",
331        "for",
332        "friend",
333        "goto",
334        "if",
335        "inline",
336        "int",
337        "long",
338        "mutable",
339        "namespace",
340        "new",
341        "noexcept",
342        "not",
343        "nullptr",
344        "operator",
345        "override",
346        "private",
347        "protected",
348        "public",
349        "register",
350        "return",
351        "short",
352        "signed",
353        "sizeof",
354        "static",
355        "struct",
356        "switch",
357        "template",
358        "this",
359        "throw",
360        "true",
361        "try",
362        "typedef",
363        "typeid",
364        "typename",
365        "union",
366        "unsigned",
367        "using",
368        "virtual",
369        "void",
370        "volatile",
371        "while",
372    ],
373    types: &[
374        "int8_t",
375        "int16_t",
376        "int32_t",
377        "int64_t",
378        "uint8_t",
379        "uint16_t",
380        "uint32_t",
381        "uint64_t",
382        "size_t",
383        "string",
384        "vector",
385        "map",
386        "unordered_map",
387        "set",
388    ],
389    line_comments: &["//"],
390    strings: &['"', '\''],
391    case_sensitive: true,
392    fn_call_highlight: true,
393};
394
395static JAVA: Grammar = Grammar {
396    keywords: &[
397        "abstract",
398        "assert",
399        "boolean",
400        "break",
401        "byte",
402        "case",
403        "catch",
404        "char",
405        "class",
406        "const",
407        "continue",
408        "default",
409        "do",
410        "double",
411        "else",
412        "enum",
413        "extends",
414        "final",
415        "finally",
416        "float",
417        "for",
418        "goto",
419        "if",
420        "implements",
421        "import",
422        "instanceof",
423        "int",
424        "interface",
425        "long",
426        "native",
427        "new",
428        "null",
429        "package",
430        "private",
431        "protected",
432        "public",
433        "return",
434        "short",
435        "static",
436        "strictfp",
437        "super",
438        "switch",
439        "synchronized",
440        "this",
441        "throw",
442        "throws",
443        "transient",
444        "true",
445        "false",
446        "try",
447        "void",
448        "volatile",
449        "while",
450        "var",
451    ],
452    types: &[
453        "String", "Integer", "Long", "Double", "Float", "Boolean", "List", "Map", "Set", "Object",
454    ],
455    line_comments: &["//"],
456    strings: &['"', '\''],
457    case_sensitive: true,
458    fn_call_highlight: true,
459};
460
/// Peek the character that starts at byte index `i` of `line`.
///
/// Returns `None` at end-of-string, and also when `i` lies past the end or
/// inside a multi-byte character, so a bad offset can never panic the
/// scanner.
fn char_at(line: &str, i: usize) -> Option<char> {
    // `str::get` is the checked form of `line[i..]`.
    line.get(i..)?.chars().next()
}
465
466fn scan(line: &str, g: &'static Grammar, theme: Theme) -> Vec<Span<'static>> {
467    let mut spans: Vec<Span<'static>> = Vec::new();
468    let base = theme.code_style();
469    let mut i = 0usize;
470
471    // Fast path for comment-only lines.
472    for prefix in g.line_comments {
473        let trimmed = line.trim_start();
474        if trimmed.starts_with(prefix) {
475            let indent_len = line.len() - trimmed.len();
476            if indent_len > 0 {
477                spans.push(Span::styled(line[..indent_len].to_string(), base));
478            }
479            spans.push(Span::styled(
480                line[indent_len..].to_string(),
481                theme.comment_style(),
482            ));
483            return spans;
484        }
485    }
486
487    while i < line.len() {
488        // Detect start of comment mid-line.
489        let mut matched_comment = false;
490        for prefix in g.line_comments {
491            if line[i..].starts_with(prefix) {
492                spans.push(Span::styled(line[i..].to_string(), theme.comment_style()));
493                i = line.len();
494                matched_comment = true;
495                break;
496            }
497        }
498        if matched_comment {
499            break;
500        }
501
502        // `i` is always on a char boundary: we advance only by whole chars
503        // (`ch.len_utf8()`) or by boundary-preserving offsets from sub-scanners.
504        let Some(ch) = char_at(line, i) else { break };
505        let ch_len = ch.len_utf8();
506
507        // String literal.
508        if g.strings.contains(&ch) {
509            let (span, end) = read_string(line, i, ch, theme);
510            spans.push(span);
511            i = end;
512            continue;
513        }
514
515        // Number literal.
516        if ch.is_ascii_digit()
517            || (ch == '.' && char_at(line, i + 1).is_some_and(|c| c.is_ascii_digit()))
518        {
519            let (span, end) = read_number(line, i, theme);
520            spans.push(span);
521            i = end;
522            continue;
523        }
524
525        // Identifier / keyword / type / function-call.
526        if is_ident_start(ch) {
527            let start = i;
528            let mut j = i;
529            while let Some(c) = char_at(line, j) {
530                if !is_ident_continue(c) {
531                    break;
532                }
533                j += c.len_utf8();
534            }
535            let word = &line[start..j];
536            let style = classify_word(word, g, theme);
537            let final_style =
538                if g.fn_call_highlight && style == base && char_at(line, j) == Some('(') {
539                    theme.fn_style()
540                } else {
541                    style
542                };
543            spans.push(Span::styled(word.to_string(), final_style));
544            i = j;
545            continue;
546        }
547
548        // Otherwise, accumulate default-styled runs until the next interesting char.
549        let start = i;
550        let mut j = i;
551        while let Some(c) = char_at(line, j) {
552            if g.strings.contains(&c)
553                || is_ident_start(c)
554                || c.is_ascii_digit()
555                || g.line_comments.iter().any(|p| line[j..].starts_with(*p))
556            {
557                break;
558            }
559            j += c.len_utf8();
560        }
561        if j == start {
562            // No progress — force at least one char of progress to guarantee termination.
563            j = start + ch_len;
564        }
565        spans.push(Span::styled(line[start..j].to_string(), base));
566        i = j;
567    }
568
569    if spans.is_empty() {
570        spans.push(Span::styled(String::new(), base));
571    }
572    spans
573}
574
575fn read_string(line: &str, start: usize, delim: char, theme: Theme) -> (Span<'static>, usize) {
576    let mut i = start + delim.len_utf8();
577    let mut escape = false;
578    while let Some(c) = char_at(line, i) {
579        let cl = c.len_utf8();
580        if escape {
581            escape = false;
582            i += cl;
583            continue;
584        }
585        if c == '\\' {
586            escape = true;
587            i += cl;
588            continue;
589        }
590        if c == delim {
591            i += cl;
592            return (
593                Span::styled(line[start..i].to_string(), theme.string_style()),
594                i,
595            );
596        }
597        i += cl;
598    }
599    // Unterminated on this line — style through end.
600    (
601        Span::styled(line[start..].to_string(), theme.string_style()),
602        line.len(),
603    )
604}
605
606fn read_number(line: &str, start: usize, theme: Theme) -> (Span<'static>, usize) {
607    let mut i = start;
608    let mut saw_dot = false;
609    let mut saw_e = false;
610    // Hex, oct, bin prefixes.
611    if line[i..].starts_with("0x") || line[i..].starts_with("0X") {
612        i += 2;
613        while let Some(c) = char_at(line, i) {
614            if c.is_ascii_hexdigit() || c == '_' {
615                i += c.len_utf8();
616            } else {
617                break;
618            }
619        }
620    } else if line[i..].starts_with("0b") || line[i..].starts_with("0B") {
621        i += 2;
622        while let Some(c) = char_at(line, i) {
623            if matches!(c, '0' | '1' | '_') {
624                i += c.len_utf8();
625            } else {
626                break;
627            }
628        }
629    } else {
630        while let Some(c) = char_at(line, i) {
631            if c.is_ascii_digit() || c == '_' {
632                i += c.len_utf8();
633            } else if c == '.' && !saw_dot && !saw_e {
634                saw_dot = true;
635                i += 1;
636            } else if (c == 'e' || c == 'E') && !saw_e {
637                saw_e = true;
638                i += 1;
639                if matches!(char_at(line, i), Some('+' | '-')) {
640                    i += 1;
641                }
642            } else {
643                break;
644            }
645        }
646    }
647    // Optional numeric suffix (e.g. 10u32, 1.0f64).
648    while let Some(c) = char_at(line, i) {
649        if is_ident_continue(c) {
650            i += c.len_utf8();
651        } else {
652            break;
653        }
654    }
655    (
656        Span::styled(line[start..i].to_string(), theme.number_style()),
657        i,
658    )
659}
660
/// Whether `c` may begin an identifier: an ASCII letter or underscore.
fn is_ident_start(c: char) -> bool {
    matches!(c, '_' | 'a'..='z' | 'A'..='Z')
}
664
/// Whether `c` may continue an identifier: an ASCII letter, ASCII digit, or
/// underscore.
fn is_ident_continue(c: char) -> bool {
    matches!(c, '_' | '0'..='9' | 'a'..='z' | 'A'..='Z')
}
668
669fn classify_word(word: &str, g: &'static Grammar, theme: Theme) -> Style {
670    let cmp: Box<dyn Fn(&&&str) -> bool> = if g.case_sensitive {
671        Box::new(|k: &&&str| **k == word)
672    } else {
673        let lw = word.to_ascii_lowercase();
674        Box::new(move |k: &&&str| k.eq_ignore_ascii_case(&lw))
675    };
676    if g.keywords.iter().any(|k| cmp(&k)) {
677        return theme.keyword_style();
678    }
679    if g.types.iter().any(|k| cmp(&k)) {
680        return theme.type_style();
681    }
682    theme.code_style()
683}
684
#[cfg(test)]
mod tests {
    use super::*;
    use crate::theme::{Theme, ThemeName};

    fn plain() -> Theme {
        Theme::resolve(ThemeName::Plain)
    }

    /// Concatenate span contents back into the text they cover.
    fn joined(spans: &[Span<'static>]) -> String {
        spans.iter().map(|s| s.content.as_ref()).collect()
    }

    /// Whether some span's content is exactly `text`.
    fn has_span(spans: &[Span<'static>], text: &str) -> bool {
        spans.iter().any(|s| s.content.as_ref() == text)
    }

    #[test]
    fn unknown_lang_passthrough() {
        let spans = highlight_line("hello world", "klingon", plain());
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].content, "hello world");
    }

    #[test]
    fn rust_keyword_and_string() {
        let line = r#"let x = "hi";"#;
        let spans = highlight_line(line, "rust", plain());
        assert_eq!(joined(&spans), r#"let x = "hi";"#);
        assert!(has_span(&spans, "let"));
        assert!(has_span(&spans, r#""hi""#));
    }

    #[test]
    fn python_comment() {
        let spans = highlight_line("x = 1  # comment", "python", plain());
        assert_eq!(joined(&spans), "x = 1  # comment");
    }

    #[test]
    fn hex_number() {
        let spans = highlight_line("let n = 0xFF;", "rust", plain());
        assert_eq!(joined(&spans), "let n = 0xFF;");
        assert!(has_span(&spans, "0xFF"));
    }

    #[test]
    fn unterminated_string_does_not_panic() {
        let spans = highlight_line(r#"let s = "oops"#, "rust", plain());
        assert_eq!(joined(&spans), r#"let s = "oops"#);
    }

    #[test]
    fn fn_call_highlighted() {
        let spans = highlight_line("println!(foo())", "rust", plain());
        assert_eq!(joined(&spans), "println!(foo())");
    }

    #[test]
    fn handles_multibyte_chars_without_panicking() {
        // A trailing multi-byte ellipsis inside code should not crash the scanner.
        let line = "if let Err(e) = auth::validate_token(token, &state.conf\u{2026}";
        let spans = highlight_line(line, "rust", plain());
        assert_eq!(joined(&spans), line);
    }

    #[test]
    fn handles_cjk_and_emoji_in_comments() {
        let line = "let x = 1; // 日本語 🎉 comment";
        let spans = highlight_line(line, "rust", plain());
        assert_eq!(joined(&spans), line);
    }

    #[test]
    fn handles_multibyte_in_string_literal() {
        let line = r#"let s = "héllo — world";"#;
        let spans = highlight_line(line, "rust", plain());
        assert_eq!(joined(&spans), line);
    }
}
763}