fresh/primitives/
highlight_engine.rs

1//! Unified highlighting engine
2//!
3//! This module provides a unified abstraction over different highlighting backends:
4//! - TextMate grammars via syntect (default for highlighting)
5//! - Tree-sitter (available via explicit preference, also used for non-highlighting features)
6//!
7//! # Backend Selection
8//! By default, syntect/TextMate is used for syntax highlighting because it provides
9//! broader language coverage. Tree-sitter language detection is still performed
10//! to support non-highlighting features like auto-indentation and semantic highlighting.
11//!
12//! # Non-Highlighting Features
13//! Even when using TextMate for highlighting, tree-sitter `Language` is detected
14//! and available via `.language()` for:
15//! - Auto-indentation (via IndentCalculator)
16//! - Semantic highlighting (variable scope tracking)
17//! - Other syntax-aware features
18
19use crate::model::buffer::Buffer;
20use crate::primitives::grammar::GrammarRegistry;
21use crate::primitives::highlighter::{
22    highlight_color, HighlightCategory, HighlightSpan, Highlighter, Language,
23};
24use crate::view::theme::Theme;
25use std::ops::Range;
26use std::path::Path;
27use std::sync::Arc;
28use syntect::parsing::SyntaxSet;
29
30/// Map TextMate scope to highlight category
31fn scope_to_category(scope: &str) -> Option<HighlightCategory> {
32    let scope_lower = scope.to_lowercase();
33
34    // Comments - highest priority
35    if scope_lower.starts_with("comment") {
36        return Some(HighlightCategory::Comment);
37    }
38
39    // Strings
40    if scope_lower.starts_with("string") {
41        return Some(HighlightCategory::String);
42    }
43
44    // Markdown/markup scopes - handle before generic keyword/punctuation checks
45    // See: https://macromates.com/manual/en/language_grammars (TextMate scope naming)
46    // Headings: markup.heading and entity.name.section (used by syntect's markdown grammar)
47    if scope_lower.starts_with("markup.heading") || scope_lower.starts_with("entity.name.section") {
48        return Some(HighlightCategory::Keyword); // Headers styled like keywords (bold, prominent)
49    }
50    // Bold: markup.bold
51    if scope_lower.starts_with("markup.bold") {
52        return Some(HighlightCategory::Constant); // Bold styled like constants (bright)
53    }
54    // Italic: markup.italic
55    if scope_lower.starts_with("markup.italic") {
56        return Some(HighlightCategory::Variable); // Italic styled like variables
57    }
58    // Inline code and code blocks: markup.raw, markup.inline.raw
59    if scope_lower.starts_with("markup.raw") || scope_lower.starts_with("markup.inline.raw") {
60        return Some(HighlightCategory::String); // Code styled like strings
61    }
62    // Links: markup.underline.link
63    if scope_lower.starts_with("markup.underline.link") {
64        return Some(HighlightCategory::Function); // Links styled like functions (distinct color)
65    }
66    // Generic underline (often links)
67    if scope_lower.starts_with("markup.underline") {
68        return Some(HighlightCategory::Function);
69    }
70    // Block quotes: markup.quote
71    if scope_lower.starts_with("markup.quote") {
72        return Some(HighlightCategory::Comment); // Quotes styled like comments (subdued)
73    }
74    // Lists: markup.list
75    if scope_lower.starts_with("markup.list") {
76        return Some(HighlightCategory::Operator); // List markers styled like operators
77    }
78    // Strikethrough: markup.strikethrough
79    if scope_lower.starts_with("markup.strikethrough") {
80        return Some(HighlightCategory::Comment); // Strikethrough styled subdued
81    }
82
83    // Keywords
84    if scope_lower.starts_with("keyword.control")
85        || scope_lower.starts_with("keyword.other")
86        || scope_lower.starts_with("keyword.declaration")
87        || scope_lower.starts_with("keyword")
88    {
89        // keyword.operator should map to Operator, not Keyword
90        if !scope_lower.starts_with("keyword.operator") {
91            return Some(HighlightCategory::Keyword);
92        }
93    }
94
95    // Operators (including keyword.operator)
96    if scope_lower.starts_with("keyword.operator") || scope_lower.starts_with("punctuation") {
97        return Some(HighlightCategory::Operator);
98    }
99
100    // Functions
101    if scope_lower.starts_with("entity.name.function")
102        || scope_lower.starts_with("support.function")
103        || scope_lower.starts_with("meta.function-call")
104        || scope_lower.starts_with("variable.function")
105    {
106        return Some(HighlightCategory::Function);
107    }
108
109    // Types
110    if scope_lower.starts_with("entity.name.type")
111        || scope_lower.starts_with("entity.name.class")
112        || scope_lower.starts_with("entity.name.struct")
113        || scope_lower.starts_with("entity.name.enum")
114        || scope_lower.starts_with("entity.name.interface")
115        || scope_lower.starts_with("entity.name.trait")
116        || scope_lower.starts_with("support.type")
117        || scope_lower.starts_with("support.class")
118        || scope_lower.starts_with("storage.type")
119    {
120        return Some(HighlightCategory::Type);
121    }
122
123    // Storage modifiers (pub, static, const as keywords)
124    if scope_lower.starts_with("storage.modifier") {
125        return Some(HighlightCategory::Keyword);
126    }
127
128    // Constants and numbers
129    if scope_lower.starts_with("constant.numeric")
130        || scope_lower.starts_with("constant.language.boolean")
131    {
132        return Some(HighlightCategory::Number);
133    }
134    if scope_lower.starts_with("constant") {
135        return Some(HighlightCategory::Constant);
136    }
137
138    // Variables
139    if scope_lower.starts_with("variable.parameter")
140        || scope_lower.starts_with("variable.other")
141        || scope_lower.starts_with("variable.language")
142    {
143        return Some(HighlightCategory::Variable);
144    }
145
146    // Properties / object keys
147    if scope_lower.starts_with("entity.name.tag")
148        || scope_lower.starts_with("support.other.property")
149        || scope_lower.starts_with("meta.object-literal.key")
150        || scope_lower.starts_with("variable.other.property")
151        || scope_lower.starts_with("variable.other.object.property")
152    {
153        return Some(HighlightCategory::Property);
154    }
155
156    // Attributes (decorators, annotations)
157    if scope_lower.starts_with("entity.other.attribute")
158        || scope_lower.starts_with("meta.attribute")
159        || scope_lower.starts_with("entity.name.decorator")
160    {
161        return Some(HighlightCategory::Attribute);
162    }
163
164    // Generic variable fallback
165    if scope_lower.starts_with("variable") {
166        return Some(HighlightCategory::Variable);
167    }
168
169    None
170}
171
/// Preference for which highlighting backend to use.
///
/// The preference is consumed by the `HighlightEngine::for_file_with_*preference`
/// constructors when they pick a backend for a path.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum HighlighterPreference {
    /// Use TextMate/syntect for highlighting (default).
    /// Tree-sitter language is still detected for other features (indentation, semantic
    /// highlighting), and tree-sitter is used as a fallback when no TextMate grammar is found.
    #[default]
    Auto,
    /// Force tree-sitter for highlighting (useful for testing/comparison).
    /// No TextMate fallback is attempted in this mode.
    TreeSitter,
    /// Explicitly use a TextMate grammar (handled exactly like `Auto`).
    TextMate,
}
184
/// Unified highlighting engine supporting multiple backends.
///
/// Each variant boxes its backend, so the enum itself stays small regardless of
/// which backend is active. The default value is `None` (no highlighting).
#[derive(Default)]
pub enum HighlightEngine {
    /// Tree-sitter based highlighting (built-in languages)
    TreeSitter(Box<Highlighter>),
    /// TextMate grammar based highlighting (via syntect)
    TextMate(Box<TextMateEngine>),
    /// No highlighting available
    #[default]
    None,
}
196
/// TextMate highlighting engine wrapper.
///
/// This struct handles the lifetime complexities of syntect by storing
/// the shared syntax set and an index into it rather than a reference to a syntax.
pub struct TextMateEngine {
    /// Shared set of loaded TextMate syntaxes.
    syntax_set: Arc<SyntaxSet>,
    /// Index into `syntax_set.syntaxes()` of the syntax used by this engine.
    syntax_index: usize,
    /// Spans from the most recent parse, if any, together with the byte range they cover.
    cache: Option<TextMateCache>,
    /// Buffer length observed when `cache` was filled; a differing length marks the cache stale.
    last_buffer_len: usize,
    /// Tree-sitter language for non-highlighting features (indentation, semantic highlighting).
    /// Even when using syntect for highlighting, we track the language for other features.
    ts_language: Option<Language>,
}
210
/// Result of the most recent TextMate parse, kept so repeated viewport requests
/// inside the same region do not re-parse.
#[derive(Debug, Clone)]
struct TextMateCache {
    /// Byte range of the buffer that was parsed (viewport plus surrounding context).
    range: Range<usize>,
    /// Category spans produced for `range`, using absolute buffer byte offsets.
    spans: Vec<CachedSpan>,
}
216
217#[derive(Debug, Clone)]
218struct CachedSpan {
219    range: Range<usize>,
220    category: crate::primitives::highlighter::HighlightCategory,
221}
222
/// Maximum bytes to parse in a single operation (1 MiB).
///
/// `TextMateEngine::highlight_viewport` returns no spans at all when the requested
/// window (viewport plus context) is larger than this.
const MAX_PARSE_BYTES: usize = 1024 * 1024;
225
226impl TextMateEngine {
227    /// Create a new TextMate engine for the given syntax
228    pub fn new(syntax_set: Arc<SyntaxSet>, syntax_index: usize) -> Self {
229        Self {
230            syntax_set,
231            syntax_index,
232            cache: None,
233            last_buffer_len: 0,
234            ts_language: None,
235        }
236    }
237
    /// Create a new TextMate engine with a tree-sitter language for non-highlighting features.
    ///
    /// `syntax_index` is an index into `syntax_set.syntaxes()`. `ts_language` is stored
    /// as-is and later exposed through [`TextMateEngine::language`]. The cache starts empty.
    pub fn with_language(
        syntax_set: Arc<SyntaxSet>,
        syntax_index: usize,
        ts_language: Option<Language>,
    ) -> Self {
        Self {
            syntax_set,
            syntax_index,
            cache: None,
            last_buffer_len: 0,
            ts_language,
        }
    }
252
    /// Get the tree-sitter language (for indentation, semantic highlighting, etc.).
    ///
    /// Returns `None` when the engine was created without one.
    pub fn language(&self) -> Option<&Language> {
        self.ts_language.as_ref()
    }
257
258    /// Highlight the visible viewport range
259    ///
260    /// `context_bytes` controls how far before/after the viewport to parse for accurate
261    /// highlighting of multi-line constructs (strings, comments, nested blocks).
262    pub fn highlight_viewport(
263        &mut self,
264        buffer: &Buffer,
265        viewport_start: usize,
266        viewport_end: usize,
267        theme: &Theme,
268        context_bytes: usize,
269    ) -> Vec<HighlightSpan> {
270        use syntect::parsing::{ParseState, ScopeStack};
271
272        // Check cache validity
273        if let Some(cache) = &self.cache {
274            if cache.range.start <= viewport_start
275                && cache.range.end >= viewport_end
276                && self.last_buffer_len == buffer.len()
277            {
278                return cache
279                    .spans
280                    .iter()
281                    .filter(|span| {
282                        span.range.start < viewport_end && span.range.end > viewport_start
283                    })
284                    .map(|span| HighlightSpan {
285                        range: span.range.clone(),
286                        color: highlight_color(span.category, theme),
287                    })
288                    .collect();
289            }
290        }
291
292        // Cache miss - parse viewport region
293        let parse_start = viewport_start.saturating_sub(context_bytes);
294        let parse_end = (viewport_end + context_bytes).min(buffer.len());
295
296        if parse_end <= parse_start || parse_end - parse_start > MAX_PARSE_BYTES {
297            return Vec::new();
298        }
299
300        let syntax = &self.syntax_set.syntaxes()[self.syntax_index];
301        let mut state = ParseState::new(syntax);
302        let mut spans = Vec::new();
303
304        // Get content
305        let content = buffer.slice_bytes(parse_start..parse_end);
306        let content_str = match std::str::from_utf8(&content) {
307            Ok(s) => s,
308            Err(_) => return Vec::new(),
309        };
310
311        // Parse line by line - manually track line boundaries to handle CRLF correctly
312        // str::lines() strips both \n and \r\n, losing the distinction
313        let content_bytes = content_str.as_bytes();
314        let mut pos = 0;
315        let mut current_offset = parse_start;
316        let mut current_scopes = ScopeStack::new();
317
318        while pos < content_bytes.len() {
319            let line_start = pos;
320            let mut line_end = pos;
321
322            // Scan for line ending (find \n or \r\n or end of content)
323            while line_end < content_bytes.len() {
324                if content_bytes[line_end] == b'\n' {
325                    line_end += 1;
326                    break;
327                } else if content_bytes[line_end] == b'\r' {
328                    if line_end + 1 < content_bytes.len() && content_bytes[line_end + 1] == b'\n' {
329                        line_end += 2; // CRLF
330                    } else {
331                        line_end += 1; // CR only
332                    }
333                    break;
334                }
335                line_end += 1;
336            }
337
338            // Get the line content and actual byte length
339            let line_bytes = &content_bytes[line_start..line_end];
340            let actual_line_byte_len = line_bytes.len();
341
342            // Create line string for syntect - strip CR if present, ensure single \n
343            let line_str = match std::str::from_utf8(line_bytes) {
344                Ok(s) => s,
345                Err(_) => {
346                    pos = line_end;
347                    current_offset += actual_line_byte_len;
348                    continue;
349                }
350            };
351
352            // Remove trailing \r\n or \n, then add single \n for syntect
353            let line_content = line_str.trim_end_matches(&['\r', '\n'][..]);
354            let line_for_syntect = if line_end < content_bytes.len() || line_str.ends_with('\n') {
355                format!("{}\n", line_content)
356            } else {
357                line_content.to_string()
358            };
359
360            let ops = match state.parse_line(&line_for_syntect, &self.syntax_set) {
361                Ok(ops) => ops,
362                Err(_) => {
363                    pos = line_end;
364                    current_offset += actual_line_byte_len;
365                    continue;
366                }
367            };
368
369            // Convert operations to spans
370            // Note: syntect offsets are relative to line_for_syntect, but we need
371            // to map them to the actual buffer positions
372            let mut syntect_offset = 0;
373            let line_content_len = line_content.len();
374
375            for (op_offset, op) in ops {
376                // Handle any text before this operation (but only within content, not newline)
377                let clamped_op_offset = op_offset.min(line_content_len);
378                if clamped_op_offset > syntect_offset {
379                    if let Some(category) = Self::scope_stack_to_category(&current_scopes) {
380                        let byte_start = current_offset + syntect_offset;
381                        let byte_end = current_offset + clamped_op_offset;
382                        if byte_start < byte_end {
383                            spans.push(CachedSpan {
384                                range: byte_start..byte_end,
385                                category,
386                            });
387                        }
388                    }
389                }
390                syntect_offset = clamped_op_offset;
391
392                let _ = current_scopes.apply(&op);
393            }
394
395            // Handle remaining text on line (content only, not line ending)
396            if syntect_offset < line_content_len {
397                if let Some(category) = Self::scope_stack_to_category(&current_scopes) {
398                    let byte_start = current_offset + syntect_offset;
399                    let byte_end = current_offset + line_content_len;
400                    if byte_start < byte_end {
401                        spans.push(CachedSpan {
402                            range: byte_start..byte_end,
403                            category,
404                        });
405                    }
406                }
407            }
408
409            // Advance by actual byte length (including real line terminator)
410            pos = line_end;
411            current_offset += actual_line_byte_len;
412        }
413
414        // Merge adjacent spans
415        Self::merge_adjacent_spans(&mut spans);
416
417        // Update cache
418        self.cache = Some(TextMateCache {
419            range: parse_start..parse_end,
420            spans: spans.clone(),
421        });
422        self.last_buffer_len = buffer.len();
423
424        // Filter and resolve colors
425        spans
426            .into_iter()
427            .filter(|span| span.range.start < viewport_end && span.range.end > viewport_start)
428            .map(|span| HighlightSpan {
429                range: span.range,
430                color: highlight_color(span.category, theme),
431            })
432            .collect()
433    }
434
435    /// Map scope stack to highlight category
436    fn scope_stack_to_category(scopes: &syntect::parsing::ScopeStack) -> Option<HighlightCategory> {
437        for scope in scopes.as_slice().iter().rev() {
438            let scope_str = scope.build_string();
439            if let Some(cat) = scope_to_category(&scope_str) {
440                return Some(cat);
441            }
442        }
443        None
444    }
445
446    /// Merge adjacent spans with same category
447    fn merge_adjacent_spans(spans: &mut Vec<CachedSpan>) {
448        if spans.len() < 2 {
449            return;
450        }
451
452        let mut write_idx = 0;
453        for read_idx in 1..spans.len() {
454            if spans[write_idx].category == spans[read_idx].category
455                && spans[write_idx].range.end == spans[read_idx].range.start
456            {
457                spans[write_idx].range.end = spans[read_idx].range.end;
458            } else {
459                write_idx += 1;
460                if write_idx != read_idx {
461                    spans[write_idx] = spans[read_idx].clone();
462                }
463            }
464        }
465        spans.truncate(write_idx + 1);
466    }
467
468    /// Invalidate cache for edited range
469    pub fn invalidate_range(&mut self, edit_range: Range<usize>) {
470        if let Some(cache) = &self.cache {
471            if edit_range.start < cache.range.end && edit_range.end > cache.range.start {
472                self.cache = None;
473            }
474        }
475    }
476
    /// Invalidate the whole cache unconditionally; the next viewport request re-parses.
    pub fn invalidate_all(&mut self) {
        self.cache = None;
    }
481
    /// Get the name of the syntax this engine highlights with.
    ///
    /// # Panics
    /// Panics if the stored syntax index is out of range for the syntax set.
    pub fn syntax_name(&self) -> &str {
        &self.syntax_set.syntaxes()[self.syntax_index].name
    }
486}
487
488impl HighlightEngine {
    /// Create a highlighting engine for a file.
    ///
    /// Uses the `Auto` preference: syntect/TextMate is preferred for highlighting, with
    /// tree-sitter as a fallback when no TextMate grammar is found. The tree-sitter
    /// language is detected either way for other features (indentation, semantic
    /// highlighting).
    pub fn for_file(path: &Path, registry: &GrammarRegistry) -> Self {
        Self::for_file_with_preference(path, registry, HighlighterPreference::Auto)
    }
496
    /// Create a highlighting engine for a file, using language configuration for detection.
    ///
    /// This method checks the provided languages configuration for filename and extension
    /// matches before falling back to built-in detection. This allows users to configure
    /// custom filename patterns (like PKGBUILD for bash) that will be respected for
    /// syntax highlighting.
    ///
    /// Equivalent to `for_file_with_languages_and_preference` with the `Auto` preference.
    pub fn for_file_with_languages(
        path: &Path,
        registry: &GrammarRegistry,
        languages: &std::collections::HashMap<String, crate::config::LanguageConfig>,
    ) -> Self {
        Self::for_file_with_languages_and_preference(
            path,
            registry,
            languages,
            HighlighterPreference::Auto,
        )
    }
515
516    /// Create a highlighting engine with explicit preference and language configuration.
517    pub fn for_file_with_languages_and_preference(
518        path: &Path,
519        registry: &GrammarRegistry,
520        languages: &std::collections::HashMap<String, crate::config::LanguageConfig>,
521        preference: HighlighterPreference,
522    ) -> Self {
523        match preference {
524            // Auto now defaults to TextMate for highlighting (syntect has broader coverage)
525            // but still detects tree-sitter language for indentation/semantic features
526            HighlighterPreference::Auto | HighlighterPreference::TextMate => {
527                Self::textmate_for_file_with_languages(path, registry, languages)
528            }
529            HighlighterPreference::TreeSitter => {
530                if let Some(lang) = Language::from_path(path) {
531                    if let Ok(highlighter) = Highlighter::new(lang) {
532                        return Self::TreeSitter(Box::new(highlighter));
533                    }
534                }
535                Self::None
536            }
537        }
538    }
539
540    /// Create a highlighting engine with explicit preference
541    pub fn for_file_with_preference(
542        path: &Path,
543        registry: &GrammarRegistry,
544        preference: HighlighterPreference,
545    ) -> Self {
546        match preference {
547            // Auto now defaults to TextMate for highlighting (syntect has broader coverage)
548            // but still detects tree-sitter language for indentation/semantic features
549            HighlighterPreference::Auto | HighlighterPreference::TextMate => {
550                Self::textmate_for_file(path, registry)
551            }
552            HighlighterPreference::TreeSitter => {
553                if let Some(lang) = Language::from_path(path) {
554                    if let Ok(highlighter) = Highlighter::new(lang) {
555                        return Self::TreeSitter(Box::new(highlighter));
556                    }
557                }
558                Self::None
559            }
560        }
561    }
562
563    /// Create a TextMate engine for a file, falling back to tree-sitter if no TextMate grammar
564    fn textmate_for_file(path: &Path, registry: &GrammarRegistry) -> Self {
565        let syntax_set = registry.syntax_set_arc();
566
567        // Detect tree-sitter language for non-highlighting features
568        let ts_language = Language::from_path(path);
569
570        // Find syntax by file extension
571        if let Some(syntax) = registry.find_syntax_for_file(path) {
572            // Find the index of this syntax in the set
573            if let Some(index) = syntax_set
574                .syntaxes()
575                .iter()
576                .position(|s| s.name == syntax.name)
577            {
578                return Self::TextMate(Box::new(TextMateEngine::with_language(
579                    syntax_set,
580                    index,
581                    ts_language,
582                )));
583            }
584        }
585
586        // No TextMate grammar found - fall back to tree-sitter if available
587        // This handles languages like TypeScript that syntect doesn't include by default
588        if let Some(lang) = ts_language {
589            if let Ok(highlighter) = Highlighter::new(lang) {
590                tracing::debug!(
591                    "No TextMate grammar for {:?}, falling back to tree-sitter",
592                    path.extension()
593                );
594                return Self::TreeSitter(Box::new(highlighter));
595            }
596        }
597
598        Self::None
599    }
600
601    /// Create a TextMate engine for a file with language configuration support
602    fn textmate_for_file_with_languages(
603        path: &Path,
604        registry: &GrammarRegistry,
605        languages: &std::collections::HashMap<String, crate::config::LanguageConfig>,
606    ) -> Self {
607        let syntax_set = registry.syntax_set_arc();
608
609        // Detect tree-sitter language for non-highlighting features
610        let ts_language = Language::from_path(path);
611
612        // Find syntax by file extension, checking languages config first
613        if let Some(syntax) = registry.find_syntax_for_file_with_languages(path, languages) {
614            // Find the index of this syntax in the set
615            if let Some(index) = syntax_set
616                .syntaxes()
617                .iter()
618                .position(|s| s.name == syntax.name)
619            {
620                return Self::TextMate(Box::new(TextMateEngine::with_language(
621                    syntax_set,
622                    index,
623                    ts_language,
624                )));
625            }
626        }
627
628        // No TextMate grammar found - fall back to tree-sitter if available
629        // This handles languages like TypeScript that syntect doesn't include by default
630        if let Some(lang) = ts_language {
631            if let Ok(highlighter) = Highlighter::new(lang) {
632                tracing::debug!(
633                    "No TextMate grammar for {:?}, falling back to tree-sitter",
634                    path.extension()
635                );
636                return Self::TreeSitter(Box::new(highlighter));
637            }
638        }
639
640        Self::None
641    }
642
    /// Highlight the visible viewport.
    ///
    /// `context_bytes` controls how far before/after the viewport to parse for accurate
    /// highlighting of multi-line constructs (strings, comments, nested blocks).
    ///
    /// Forwards all arguments to the active backend; returns an empty vector when no
    /// backend is available.
    pub fn highlight_viewport(
        &mut self,
        buffer: &Buffer,
        viewport_start: usize,
        viewport_end: usize,
        theme: &Theme,
        context_bytes: usize,
    ) -> Vec<HighlightSpan> {
        match self {
            Self::TreeSitter(h) => {
                h.highlight_viewport(buffer, viewport_start, viewport_end, theme, context_bytes)
            }
            Self::TextMate(h) => {
                h.highlight_viewport(buffer, viewport_start, viewport_end, theme, context_bytes)
            }
            Self::None => Vec::new(),
        }
    }
665
    /// Invalidate cache for an edited range.
    ///
    /// Forwards `edit_range` to the active backend; does nothing when there is no backend.
    pub fn invalidate_range(&mut self, edit_range: Range<usize>) {
        match self {
            Self::TreeSitter(h) => h.invalidate_range(edit_range),
            Self::TextMate(h) => h.invalidate_range(edit_range),
            Self::None => {}
        }
    }
674
    /// Invalidate the entire cache of the active backend; does nothing when there is
    /// no backend.
    pub fn invalidate_all(&mut self) {
        match self {
            Self::TreeSitter(h) => h.invalidate_all(),
            Self::TextMate(h) => h.invalidate_all(),
            Self::None => {}
        }
    }
683
    /// Check if this engine has highlighting available, i.e. it is any variant other
    /// than `Self::None`.
    pub fn has_highlighting(&self) -> bool {
        !matches!(self, Self::None)
    }
688
    /// Get a short description of the active backend: `"tree-sitter"`, `"textmate"`
    /// or `"none"`.
    pub fn backend_name(&self) -> &str {
        match self {
            Self::TreeSitter(_) => "tree-sitter",
            Self::TextMate(_) => "textmate",
            Self::None => "none",
        }
    }
697
698    /// Get the language/syntax name if available
699    pub fn syntax_name(&self) -> Option<&str> {
700        match self {
701            Self::TreeSitter(_) => None, // Tree-sitter doesn't expose name easily
702            Self::TextMate(h) => Some(h.syntax_name()),
703            Self::None => None,
704        }
705    }
706
    /// Get the tree-sitter Language for non-highlighting features.
    ///
    /// Returns the language even when using TextMate for highlighting (if one was
    /// detected for the file); returns `None` when there is no backend.
    pub fn language(&self) -> Option<&Language> {
        match self {
            Self::TreeSitter(h) => Some(h.language()),
            Self::TextMate(h) => h.language(),
            Self::None => None,
        }
    }
716}
717
718/// Highlight a code string using syntect (for markdown code blocks, hover popups, etc.)
719/// Returns spans with byte ranges relative to the input string.
720///
721/// This uses TextMate grammars via syntect which provides broader language coverage
722/// than tree-sitter (~150+ languages vs ~17).
723pub fn highlight_string(
724    code: &str,
725    lang_hint: &str,
726    registry: &GrammarRegistry,
727    theme: &Theme,
728) -> Vec<HighlightSpan> {
729    use syntect::parsing::{ParseState, ScopeStack};
730
731    // Find syntax by language token (handles aliases like "py" -> Python)
732    let syntax = match registry.syntax_set().find_syntax_by_token(lang_hint) {
733        Some(s) => s,
734        None => return Vec::new(),
735    };
736
737    let syntax_set = registry.syntax_set();
738    let mut state = ParseState::new(syntax);
739    let mut spans = Vec::new();
740    let mut current_scopes = ScopeStack::new();
741    let mut current_offset = 0;
742
743    // Parse line by line
744    for line in code.split_inclusive('\n') {
745        let line_start = current_offset;
746        let line_len = line.len();
747
748        // Remove trailing newline for syntect, then add it back
749        let line_content = line.trim_end_matches(&['\r', '\n'][..]);
750        let line_for_syntect = if line.ends_with('\n') {
751            format!("{}\n", line_content)
752        } else {
753            line_content.to_string()
754        };
755
756        let ops = match state.parse_line(&line_for_syntect, syntax_set) {
757            Ok(ops) => ops,
758            Err(_) => {
759                current_offset += line_len;
760                continue;
761            }
762        };
763
764        let mut syntect_offset = 0;
765        let line_content_len = line_content.len();
766
767        for (op_offset, op) in ops {
768            let clamped_op_offset = op_offset.min(line_content_len);
769            if clamped_op_offset > syntect_offset {
770                if let Some(category) = scope_stack_to_category(&current_scopes) {
771                    let byte_start = line_start + syntect_offset;
772                    let byte_end = line_start + clamped_op_offset;
773                    if byte_start < byte_end {
774                        spans.push(HighlightSpan {
775                            range: byte_start..byte_end,
776                            color: highlight_color(category, theme),
777                        });
778                    }
779                }
780            }
781            syntect_offset = clamped_op_offset;
782            let _ = current_scopes.apply(&op);
783        }
784
785        // Handle remaining text on line
786        if syntect_offset < line_content_len {
787            if let Some(category) = scope_stack_to_category(&current_scopes) {
788                let byte_start = line_start + syntect_offset;
789                let byte_end = line_start + line_content_len;
790                if byte_start < byte_end {
791                    spans.push(HighlightSpan {
792                        range: byte_start..byte_end,
793                        color: highlight_color(category, theme),
794                    });
795                }
796            }
797        }
798
799        current_offset += line_len;
800    }
801
802    // Merge adjacent spans with same color
803    merge_adjacent_highlight_spans(&mut spans);
804
805    spans
806}
807
808/// Map scope stack to highlight category (for highlight_string)
809fn scope_stack_to_category(scopes: &syntect::parsing::ScopeStack) -> Option<HighlightCategory> {
810    for scope in scopes.as_slice().iter().rev() {
811        let scope_str = scope.build_string();
812        if let Some(cat) = scope_to_category(&scope_str) {
813            return Some(cat);
814        }
815    }
816    None
817}
818
819/// Merge adjacent spans with same color
820fn merge_adjacent_highlight_spans(spans: &mut Vec<HighlightSpan>) {
821    if spans.len() < 2 {
822        return;
823    }
824
825    let mut write_idx = 0;
826    for read_idx in 1..spans.len() {
827        if spans[write_idx].color == spans[read_idx].color
828            && spans[write_idx].range.end == spans[read_idx].range.start
829        {
830            spans[write_idx].range.end = spans[read_idx].range.end;
831        } else {
832            write_idx += 1;
833            if write_idx != read_idx {
834                spans[write_idx] = spans[read_idx].clone();
835            }
836        }
837    }
838    spans.truncate(write_idx + 1);
839}
840
#[cfg(test)]
mod tests {
    use super::*;
    use crate::view::theme;

    /// The default highlighter preference is `Auto`.
    #[test]
    fn test_highlighter_preference_default() {
        let pref = HighlighterPreference::default();
        assert_eq!(pref, HighlighterPreference::Auto);
    }

    /// A default-constructed engine has no highlighting backend.
    #[test]
    fn test_highlight_engine_default() {
        let engine = HighlightEngine::default();
        assert!(!engine.has_highlighting());
        assert_eq!(engine.backend_name(), "none");
    }

    /// Backend selection by file extension, with tree-sitter language detection
    /// expected in every case.
    #[test]
    fn test_textmate_backend_selection() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        // Languages with TextMate grammars use TextMate for highlighting.
        for name in ["test.rs", "test.py", "test.js"] {
            let engine = HighlightEngine::for_file(Path::new(name), &registry);
            assert_eq!(engine.backend_name(), "textmate", "backend for {}", name);
            // Tree-sitter language should still be detected for other features.
            assert!(
                engine.language().is_some(),
                "tree-sitter language for {}",
                name
            );
        }

        // TypeScript falls back to tree-sitter (syntect doesn't include TS by default).
        for name in ["test.ts", "test.tsx"] {
            let engine = HighlightEngine::for_file(Path::new(name), &registry);
            assert_eq!(engine.backend_name(), "tree-sitter", "backend for {}", name);
            assert!(
                engine.language().is_some(),
                "tree-sitter language for {}",
                name
            );
        }
    }

    /// An explicit `TreeSitter` preference overrides the TextMate default.
    #[test]
    fn test_tree_sitter_explicit_preference() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        // Force tree-sitter for highlighting.
        let engine = HighlightEngine::for_file_with_preference(
            Path::new("test.rs"),
            &registry,
            HighlighterPreference::TreeSitter,
        );
        assert_eq!(engine.backend_name(), "tree-sitter");
    }

    /// Building an engine for an unrecognised extension must not panic.
    #[test]
    fn test_unknown_extension() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        let engine = HighlightEngine::for_file(Path::new("test.unknown_xyz_123"), &registry);
        // The backend might be "none" or something syntect matched; the only
        // requirement here is that querying it does not panic.
        let _ = engine.backend_name();
    }

    /// Regression test: calling `highlight_viewport` with an empty buffer and a
    /// non-zero viewport range previously caused a subtraction overflow panic.
    ///
    /// The bug occurred when:
    /// - the buffer is empty (len = 0)
    /// - viewport_start > context_bytes (so parse_start > 0 after saturating_sub)
    /// - parse_end = min(viewport_end + context_bytes, buffer.len()) = 0
    /// - parse_end - parse_start underflows (0 - positive)
    #[test]
    fn test_highlight_viewport_empty_buffer_no_panic() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        let mut engine = HighlightEngine::for_file(Path::new("test.rs"), &registry);

        let buffer = Buffer::from_str("", 0);
        let theme = Theme::load_builtin(theme::THEME_LIGHT).unwrap();

        // The specific case that triggered the overflow:
        // viewport_start=100, context_bytes=10 => parse_start=90, parse_end=0.
        if let HighlightEngine::TextMate(ref mut tm) = engine {
            // Small context_bytes so parse_start remains > 0.
            let spans = tm.highlight_viewport(&buffer, 100, 200, &theme, 10);
            assert!(spans.is_empty());
        } else {
            // Without this branch the regression check would silently pass
            // without ever exercising the TextMate code path.
            panic!("Expected TextMate engine for .rs file");
        }
    }

    /// Test that TextMateEngine produces correct byte offsets for CRLF content.
    /// This is a regression test for a bug where using str::lines() caused 1-byte
    /// offset drift per line because it strips line terminators.
    #[test]
    fn test_textmate_engine_crlf_byte_offsets() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        let mut engine = HighlightEngine::for_file(Path::new("test.java"), &registry);

        // Three CRLF-terminated lines of 8 bytes each ("public" + "\r\n"), so the
        // keyword occupies these half-open byte ranges:
        //   line 1: 0..6
        //   line 2: 8..14  (7..13 would mean offset drift)
        //   line 3: 16..22 (14..20 would mean offset drift)
        let content = b"public\r\npublic\r\npublic\r\n";
        let buffer = Buffer::from_bytes(content.to_vec());
        let theme = Theme::load_builtin(theme::THEME_LIGHT).unwrap();

        if let HighlightEngine::TextMate(ref mut tm) = engine {
            // Highlight the entire content.
            let spans = tm.highlight_viewport(&buffer, 0, content.len(), &theme, 0);

            // Collected once; reused by the coverage check and every failure message.
            let ranges: Vec<_> = spans.iter().map(|s| &s.range).collect();

            // True when some single span fully covers start..end.
            let has_span_at = |start: usize, end: usize| -> bool {
                ranges.iter().any(|r| r.start <= start && r.end >= end)
            };

            // Line 1: "public" at bytes 0..6
            assert!(
                has_span_at(0, 6),
                "Should have span covering bytes 0-6 (line 1 'public'). Spans: {:?}",
                ranges
            );

            // Line 2: "public" at bytes 8..14 (after "public\r\n")
            assert!(
                has_span_at(8, 14),
                "Should have span covering bytes 8-14 (line 2 'public'). \
                 If this fails, CRLF offset drift is occurring. Spans: {:?}",
                ranges
            );

            // Line 3: "public" at bytes 16..22 (after two "public\r\n")
            assert!(
                has_span_at(16, 22),
                "Should have span covering bytes 16-22 (line 3 'public'). \
                 If this fails, CRLF offset drift is occurring. Spans: {:?}",
                ranges
            );
        } else {
            panic!("Expected TextMate engine for .java file");
        }
    }

    /// git-rebase-todo files should use the Git Rebase Todo grammar.
    #[test]
    fn test_git_rebase_todo_highlighting() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        let engine = HighlightEngine::for_file(Path::new("git-rebase-todo"), &registry);
        assert_eq!(engine.backend_name(), "textmate");
        assert!(engine.has_highlighting());
    }

    /// COMMIT_EDITMSG should use the Git Commit Message grammar; MERGE_MSG
    /// should also work.
    #[test]
    fn test_git_commit_message_highlighting() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        for name in ["COMMIT_EDITMSG", "MERGE_MSG"] {
            let engine = HighlightEngine::for_file(Path::new(name), &registry);
            assert_eq!(engine.backend_name(), "textmate", "backend for {}", name);
            assert!(engine.has_highlighting(), "highlighting for {}", name);
        }
    }

    /// .gitignore should use the Gitignore grammar; .dockerignore should also work.
    #[test]
    fn test_gitignore_highlighting() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        for name in [".gitignore", ".dockerignore"] {
            let engine = HighlightEngine::for_file(Path::new(name), &registry);
            assert_eq!(engine.backend_name(), "textmate", "backend for {}", name);
            assert!(engine.has_highlighting(), "highlighting for {}", name);
        }
    }

    /// .gitconfig should use the Git Config grammar; .gitmodules should also work.
    #[test]
    fn test_gitconfig_highlighting() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        for name in [".gitconfig", ".gitmodules"] {
            let engine = HighlightEngine::for_file(Path::new(name), &registry);
            assert_eq!(engine.backend_name(), "textmate", "backend for {}", name);
            assert!(engine.has_highlighting(), "highlighting for {}", name);
        }
    }

    /// .gitattributes should use the Git Attributes grammar.
    #[test]
    fn test_gitattributes_highlighting() {
        let registry =
            GrammarRegistry::load(&crate::primitives::grammar::LocalGrammarLoader::new());

        let engine = HighlightEngine::for_file(Path::new(".gitattributes"), &registry);
        assert_eq!(engine.backend_name(), "textmate");
        assert!(engine.has_highlighting());
    }
}