Skip to main content

highlight_spans/
lib.rs

1use std::collections::HashMap;
2
3use thiserror::Error;
4use tree_sitter::StreamingIterator;
5use tree_sitter_highlight::{HighlightConfiguration, HighlightEvent, Highlighter as TsHighlighter};
6
// Raw foreign declaration of the SQL grammar entry point. It is wrapped into a
// `LanguageFn` by `SQL_LANGUAGE` through `LanguageFn::from_raw`.
// NOTE(review): the symbol is presumably provided by the vendored
// tree-sitter-sql parser compiled alongside this crate — confirm in the build script.
unsafe extern "C" {
    /// Returns the SQL Tree-sitter language handle from the vendored parser.
    fn tree_sitter_sql() -> *const ();
}
11
// Markdown is wired up from two `tree_sitter_md` exports: the main language and
// the inline language, each paired with its own highlight and injection query.
const MARKDOWN_LANGUAGE: tree_sitter_language::LanguageFn = tree_sitter_md::LANGUAGE;
const MARKDOWN_INLINE_LANGUAGE: tree_sitter_language::LanguageFn = tree_sitter_md::INLINE_LANGUAGE;
const MARKDOWN_HIGHLIGHTS_QUERY: &str = tree_sitter_md::HIGHLIGHT_QUERY_BLOCK;
const MARKDOWN_INJECTIONS_QUERY: &str = tree_sitter_md::INJECTION_QUERY_BLOCK;
const MARKDOWN_INLINE_HIGHLIGHTS_QUERY: &str = tree_sitter_md::HIGHLIGHT_QUERY_INLINE;
const MARKDOWN_INLINE_INJECTIONS_QUERY: &str = tree_sitter_md::INJECTION_QUERY_INLINE;
// XML language entry point and highlight query, both taken from `tree_sitter_xml`.
const XML_LANGUAGE: tree_sitter_language::LanguageFn = tree_sitter_xml::LANGUAGE_XML;
const XML_HIGHLIGHTS_QUERY: &str = tree_sitter_xml::XML_HIGHLIGHT_QUERY;
// Injection query used when XML is the host language. Both patterns match an
// `element` whose start-tag and end-tag names equal "Implementation" and set the
// injected language to "objectscript"; the first captures the `CData` inside a
// `CDSect` as the injected content, the second captures plain `CharData`.
const XML_IMPLEMENTATION_INJECTIONS_QUERY: &str = r#"
(
  element
    (STag (Name) @_start_tag)
    (content (CDSect (CData) @injection.content))
    (ETag (Name) @_end_tag)
  (#eq? @_start_tag "Implementation")
  (#eq? @_end_tag "Implementation")
  (#set! injection.language "objectscript")
)
(
  element
    (STag (Name) @_start_tag)
    (content (CharData) @injection.content)
    (ETag (Name) @_end_tag)
  (#eq? @_start_tag "Implementation")
  (#eq? @_end_tag "Implementation")
  (#set! injection.language "objectscript")
)
"#;
40
// SAFETY: NOTE(review) — `from_raw` is handed the `tree_sitter_sql` symbol declared
// in the `extern "C"` block of this file. Soundness presumably relies on that
// symbol being the generated Tree-sitter entry point of the vendored SQL parser
// and returning a valid language pointer; confirm against the vendored sources.
/// SQL language entry point built from the raw `tree_sitter_sql` symbol.
const SQL_LANGUAGE: tree_sitter_language::LanguageFn =
    unsafe { tree_sitter_language::LanguageFn::from_raw(tree_sitter_sql) };
/// SQL highlight query, embedded at compile time from the vendored grammar directory.
const SQL_HIGHLIGHTS_QUERY: &str = include_str!("../vendor/tree-sitter-sql/queries/highlights.scm");
44
/// Languages that can be requested for highlighting.
///
/// Use [`Grammar::from_name`] to resolve a user-supplied name or alias and
/// [`Grammar::canonical_name`] for the reverse direction.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Grammar {
    /// ObjectScript; canonical name `"objectscript"`.
    ObjectScript,
    /// SQL; canonical name `"sql"`.
    Sql,
    /// Python; canonical name `"python"`.
    Python,
    /// Markdown; canonical name `"markdown"`.
    Markdown,
    /// MDX; canonical name `"mdx"`. Currently highlighted with the SQL
    /// configuration as a fallback.
    Mdx,
    /// XML; canonical name `"xml"`.
    Xml,
}
54
/// Canonical grammar names, exposed through [`Grammar::supported_names`].
///
/// Each entry is the `canonical_name` of one [`Grammar`] variant; keep the two
/// in sync when adding a grammar.
const SUPPORTED_GRAMMARS: [&str; 6] = ["objectscript", "sql", "python", "markdown", "mdx", "xml"];
56
57impl Grammar {
58    /// Parses a grammar name or alias into a [`Grammar`] value.
59    ///
60    /// The input is normalized to lowercase alphanumeric characters, so values
61    /// such as `"ObjectScript"`, `"objectscript-playground"`, and `"os"` are accepted.
62    #[must_use]
63    pub fn from_name(input: &str) -> Option<Self> {
64        let normalized = normalize_language_name(input);
65        grammar_from_normalized_name(&normalized)
66    }
67
68    /// Returns the canonical lowercase name for this grammar.
69    #[must_use]
70    pub fn canonical_name(self) -> &'static str {
71        match self {
72            Self::ObjectScript => "objectscript",
73            Self::Sql => "sql",
74            Self::Python => "python",
75            Self::Markdown => "markdown",
76            Self::Mdx => "mdx",
77            Self::Xml => "xml",
78        }
79    }
80
81    /// Returns the canonical grammar names accepted by the CLI-facing APIs.
82    #[must_use]
83    pub fn supported_names() -> &'static [&'static str] {
84        &SUPPORTED_GRAMMARS
85    }
86}
87
/// A highlight attribute: a capture name together with its numeric id.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Attr {
    /// Identifier that spans use (via `attr_id`) to refer to this attribute.
    pub id: usize,
    /// Capture name without the leading `@` (for example `keyword`).
    pub capture_name: String,
}

impl Attr {
    /// Builds the theme lookup key for this capture, i.e. the capture name
    /// prefixed with `@` (for example `"@keyword"`).
    #[must_use]
    pub fn theme_key(&self) -> String {
        let mut key = String::with_capacity(self.capture_name.len() + 1);
        key.push('@');
        key.push_str(&self.capture_name);
        key
    }
}
101
/// A highlighted byte range of the source buffer.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub struct Span {
    /// Id of the [`Attr`] this range is highlighted with.
    pub attr_id: usize,
    /// Byte offset where the range starts.
    pub start_byte: usize,
    /// Byte offset where the range ends; spans with `start_byte >= end_byte`
    /// are never recorded.
    pub end_byte: usize,
}
108
/// Outcome of a highlight pass: the attribute table plus the spans that use it.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct HighlightResult {
    /// Attribute table; each [`Span::attr_id`] refers to an entry's `id`.
    pub attrs: Vec<Attr>,
    /// Highlighted byte ranges of the source buffer.
    pub spans: Vec<Span>,
}
114
/// Errors raised while building a [`SpanHighlighter`] or highlighting a buffer.
#[derive(Debug, Error)]
pub enum HighlightError {
    /// A highlight or injection query failed to compile.
    #[error("failed to build highlight configuration: {0}")]
    Query(#[from] tree_sitter::QueryError),
    /// The Tree-sitter highlighter reported an error.
    #[error("highlighting failed: {0}")]
    Highlight(#[from] tree_sitter_highlight::Error),
    /// A parser rejected the language it was asked to use.
    #[error("failed to configure parser language: {0}")]
    Language(#[from] tree_sitter::LanguageError),
    /// The parser produced no tree while looking for injection regions.
    #[error("failed to parse source for injection analysis")]
    Parse,
}
126
/// Highlighter that turns source buffers into flat, attribute-tagged byte spans.
///
/// Build it once with [`SpanHighlighter::new`]; it holds one highlight
/// configuration per grammar and the compiled queries used to find injections.
pub struct SpanHighlighter {
    // Underlying tree-sitter-highlight engine; needs `&mut` access to run.
    highlighter: TsHighlighter,
    // Capture table shared by every configuration below.
    attrs: Vec<Attr>,
    // One highlight configuration per grammar (Markdown has block and inline).
    objectscript: HighlightConfiguration,
    sql: HighlightConfiguration,
    python: HighlightConfiguration,
    markdown: HighlightConfiguration,
    markdown_inline: HighlightConfiguration,
    xml: HighlightConfiguration,
    // Injection query for ObjectScript hosts, with the indices of its
    // `injection.content` / `injection.language` captures when it declares them.
    objectscript_injection_query: tree_sitter::Query,
    objectscript_injection_content_capture: Option<u32>,
    objectscript_injection_language_capture: Option<u32>,
    // Same trio for XML hosts.
    xml_injection_query: tree_sitter::Query,
    xml_injection_content_capture: Option<u32>,
    xml_injection_language_capture: Option<u32>,
}
143
/// A byte range of the host source that is highlighted with another grammar.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
struct InjectionRegion {
    // Grammar used to highlight the region.
    grammar: Grammar,
    // Byte offsets into the host source; regions are only created with
    // `start_byte < end_byte`.
    start_byte: usize,
    end_byte: usize,
}
150
151impl SpanHighlighter {
152    /// Creates a highlighter configured for all supported grammars and injections.
153    ///
154    /// This preloads Tree-sitter highlight configurations for ObjectScript, SQL,
155    /// Python, and Markdown variants, and builds a unified capture table.
156    ///
157    /// # Errors
158    ///
159    /// Returns an error if any grammar query cannot be compiled or if parser
160    /// language configuration fails.
161    pub fn new() -> Result<Self, HighlightError> {
162        let objectscript_language: tree_sitter::Language =
163            tree_sitter_objectscript_playground::LANGUAGE_OBJECTSCRIPT.into();
164        let mut objectscript = new_config(
165            objectscript_language.clone(),
166            "objectscript",
167            tree_sitter_objectscript_playground::HIGHLIGHTS_QUERY,
168            tree_sitter_objectscript_playground::INJECTIONS_QUERY,
169        )?;
170        let mut sql = new_config(SQL_LANGUAGE.into(), "sql", SQL_HIGHLIGHTS_QUERY, "")?;
171        let mut python = new_config(
172            tree_sitter_python::LANGUAGE.into(),
173            "python",
174            tree_sitter_python::HIGHLIGHTS_QUERY,
175            "",
176        )?;
177        let mut markdown = new_config(
178            MARKDOWN_LANGUAGE.into(),
179            "markdown",
180            MARKDOWN_HIGHLIGHTS_QUERY,
181            MARKDOWN_INJECTIONS_QUERY,
182        )?;
183        let mut markdown_inline = new_config(
184            MARKDOWN_INLINE_LANGUAGE.into(),
185            "markdown_inline",
186            MARKDOWN_INLINE_HIGHLIGHTS_QUERY,
187            MARKDOWN_INLINE_INJECTIONS_QUERY,
188        )?;
189        let xml_language: tree_sitter::Language = XML_LANGUAGE.into();
190        let mut xml = new_config(xml_language.clone(), "xml", XML_HIGHLIGHTS_QUERY, "")?;
191        let objectscript_injection_query = tree_sitter::Query::new(
192            &objectscript_language,
193            tree_sitter_objectscript_playground::INJECTIONS_QUERY,
194        )?;
195        let (objectscript_injection_content_capture, objectscript_injection_language_capture) =
196            injection_capture_indices(&objectscript_injection_query);
197        let xml_injection_query =
198            tree_sitter::Query::new(&xml_language, XML_IMPLEMENTATION_INJECTIONS_QUERY)?;
199        let (xml_injection_content_capture, xml_injection_language_capture) =
200            injection_capture_indices(&xml_injection_query);
201
202        let mut recognized = Vec::<String>::new();
203        let mut capture_index_by_name = HashMap::<String, usize>::new();
204        for config in [
205            &objectscript,
206            &sql,
207            &python,
208            &markdown,
209            &markdown_inline,
210            &xml,
211        ] {
212            for name in config.names() {
213                if capture_index_by_name.contains_key(*name) {
214                    continue;
215                }
216                let id = recognized.len();
217                let owned = (*name).to_string();
218                capture_index_by_name.insert(owned.clone(), id);
219                recognized.push(owned);
220            }
221        }
222        let recognized_refs = recognized.iter().map(String::as_str).collect::<Vec<_>>();
223        objectscript.configure(&recognized_refs);
224        sql.configure(&recognized_refs);
225        python.configure(&recognized_refs);
226        markdown.configure(&recognized_refs);
227        markdown_inline.configure(&recognized_refs);
228        xml.configure(&recognized_refs);
229        let attrs = recognized
230            .into_iter()
231            .enumerate()
232            .map(|(id, capture_name)| Attr { id, capture_name })
233            .collect::<Vec<_>>();
234
235        Ok(Self {
236            highlighter: TsHighlighter::new(),
237            attrs,
238            objectscript,
239            sql,
240            python,
241            markdown,
242            markdown_inline,
243            xml,
244            objectscript_injection_query,
245            objectscript_injection_content_capture,
246            objectscript_injection_language_capture,
247            xml_injection_query,
248            xml_injection_content_capture,
249            xml_injection_language_capture,
250        })
251    }
252
253    /// Highlights a source buffer and returns capture attributes plus byte spans.
254    ///
255    /// When `flavor` is [`Grammar::ObjectScript`], language injections are resolved
256    /// and applied to injected regions (for example embedded SQL blocks). When
257    /// `flavor` is [`Grammar::Xml`], ObjectScript injections are applied to
258    /// recognized XML embedded-code regions (for example `<Implementation>` bodies).
259    ///
260    /// # Errors
261    ///
262    /// Returns an error if Tree-sitter highlighting fails or if injection parsing
263    /// cannot be completed.
264    pub fn highlight(
265        &mut self,
266        source: &[u8],
267        flavor: Grammar,
268    ) -> Result<HighlightResult, HighlightError> {
269        let mut result = self.highlight_base(source, flavor)?;
270        if flavor == Grammar::ObjectScript {
271            self.apply_objectscript_injections(source, &mut result)?;
272        } else if flavor == Grammar::Xml {
273            self.apply_xml_injections(source, &mut result)?;
274        }
275        Ok(result)
276    }
277
278    /// Runs the base Tree-sitter highlight pass for a single grammar.
279    ///
280    /// Unlike [`Self::highlight`], this does not apply post-processing for
281    /// host-language injection regions.
282    ///
283    /// # Errors
284    ///
285    /// Returns an error if Tree-sitter fails to emit highlight events.
286    fn highlight_base(
287        &mut self,
288        source: &[u8],
289        flavor: Grammar,
290    ) -> Result<HighlightResult, HighlightError> {
291        let config = match flavor {
292            Grammar::ObjectScript => &self.objectscript,
293            Grammar::Sql => &self.sql,
294            Grammar::Python => &self.python,
295            Grammar::Markdown => &self.markdown,
296            // InterSystems MDX is OLAP query syntax; use SQL highlighting as a temporary fallback.
297            Grammar::Mdx => &self.sql,
298            Grammar::Xml => &self.xml,
299        };
300
301        let attrs = self.attrs.clone();
302
303        let injections = InjectionConfigs {
304            objectscript: &self.objectscript,
305            sql: &self.sql,
306            python: &self.python,
307            markdown: &self.markdown,
308            markdown_inline: &self.markdown_inline,
309            xml: &self.xml,
310        };
311
312        let events = self
313            .highlighter
314            .highlight(config, source, None, move |language_name| {
315                injections.resolve(language_name)
316            })?;
317        let mut spans = Vec::new();
318        let mut active_stack = Vec::new();
319
320        for event in events {
321            match event? {
322                HighlightEvent::HighlightStart(highlight) => active_stack.push(highlight.0),
323                HighlightEvent::HighlightEnd => {
324                    active_stack.pop();
325                }
326                HighlightEvent::Source { start, end } => {
327                    if let Some(&attr_id) = active_stack.last() {
328                        push_merged(
329                            &mut spans,
330                            Span {
331                                attr_id,
332                                start_byte: start,
333                                end_byte: end,
334                            },
335                        );
336                    }
337                }
338            }
339        }
340
341        Ok(HighlightResult { attrs, spans })
342    }
343
344    /// Highlights line-oriented input by joining lines with `\n`.
345    ///
346    /// # Errors
347    ///
348    /// Returns the same errors as [`Self::highlight`].
349    pub fn highlight_lines<S: AsRef<str>>(
350        &mut self,
351        lines: &[S],
352        flavor: Grammar,
353    ) -> Result<HighlightResult, HighlightError> {
354        let source = lines
355            .iter()
356            .map(AsRef::as_ref)
357            .collect::<Vec<_>>()
358            .join("\n");
359        self.highlight(source.as_bytes(), flavor)
360    }
361
362    /// Replaces ObjectScript injection regions in `base` with injected highlights.
363    ///
364    /// This method removes spans from injected byte ranges and merges spans produced
365    /// by the injected language highlighter.
366    ///
367    /// # Errors
368    ///
369    /// Returns an error if injection discovery or nested highlighting fails.
370    fn apply_objectscript_injections(
371        &mut self,
372        source: &[u8],
373        base: &mut HighlightResult,
374    ) -> Result<(), HighlightError> {
375        let injections = self.find_objectscript_injections(source)?;
376        self.apply_injections(source, base, injections)
377    }
378
379    /// Replaces XML injection regions in `base` with injected highlights.
380    ///
381    /// This currently targets XML regions where ObjectScript appears in
382    /// `<Implementation>` bodies.
383    fn apply_xml_injections(
384        &mut self,
385        source: &[u8],
386        base: &mut HighlightResult,
387    ) -> Result<(), HighlightError> {
388        let injections = self.find_xml_injections(source)?;
389        self.apply_injections(source, base, injections)
390    }
391
392    /// Applies already-discovered injection regions by replacing base spans.
393    fn apply_injections(
394        &mut self,
395        source: &[u8],
396        base: &mut HighlightResult,
397        injections: Vec<InjectionRegion>,
398    ) -> Result<(), HighlightError> {
399        if injections.is_empty() {
400            return Ok(());
401        }
402
403        let mut attrs = base.attrs.clone();
404        let mut attr_ids_by_name = attrs
405            .iter()
406            .map(|attr| (attr.capture_name.clone(), attr.id))
407            .collect::<HashMap<_, _>>();
408        let mut injected_spans = Vec::new();
409
410        for injection in &injections {
411            let nested_source = &source[injection.start_byte..injection.end_byte];
412            let nested = self.highlight_base(nested_source, injection.grammar)?;
413            let remap = remap_attr_ids(&nested.attrs, &mut attrs, &mut attr_ids_by_name);
414            for span in nested.spans {
415                let Some(&mapped_attr_id) = remap.get(span.attr_id) else {
416                    continue;
417                };
418                injected_spans.push(Span {
419                    attr_id: mapped_attr_id,
420                    start_byte: span.start_byte + injection.start_byte,
421                    end_byte: span.end_byte + injection.start_byte,
422                });
423            }
424        }
425
426        let mut spans = exclude_ranges(
427            &base.spans,
428            &injections
429                .iter()
430                .map(|inj| (inj.start_byte, inj.end_byte))
431                .collect::<Vec<_>>(),
432        );
433        spans.extend(injected_spans);
434
435        base.attrs = attrs;
436        base.spans = normalize_spans(spans);
437        Ok(())
438    }
439
440    /// Finds non-overlapping ObjectScript injection regions in the source buffer.
441    ///
442    /// # Errors
443    ///
444    /// Returns an error if parsing or query execution for injection analysis fails.
445    fn find_objectscript_injections(
446        &self,
447        source: &[u8],
448    ) -> Result<Vec<InjectionRegion>, HighlightError> {
449        let objectscript_language: tree_sitter::Language =
450            tree_sitter_objectscript_playground::LANGUAGE_OBJECTSCRIPT.into();
451        self.find_injections(
452            source,
453            &objectscript_language,
454            &self.objectscript_injection_query,
455            self.objectscript_injection_content_capture,
456            self.objectscript_injection_language_capture,
457        )
458    }
459
460    /// Finds non-overlapping XML injection regions in the source buffer.
461    ///
462    /// # Errors
463    ///
464    /// Returns an error if parsing or query execution for injection analysis fails.
465    fn find_xml_injections(&self, source: &[u8]) -> Result<Vec<InjectionRegion>, HighlightError> {
466        let xml_language: tree_sitter::Language = XML_LANGUAGE.into();
467        self.find_injections(
468            source,
469            &xml_language,
470            &self.xml_injection_query,
471            self.xml_injection_content_capture,
472            self.xml_injection_language_capture,
473        )
474    }
475
476    /// Finds and normalizes non-overlapping injection regions for a host grammar.
477    ///
478    /// # Errors
479    ///
480    /// Returns an error if parsing or query execution for injection analysis fails.
481    fn find_injections(
482        &self,
483        source: &[u8],
484        language: &tree_sitter::Language,
485        query: &tree_sitter::Query,
486        content_capture: Option<u32>,
487        language_capture: Option<u32>,
488    ) -> Result<Vec<InjectionRegion>, HighlightError> {
489        let mut parser = tree_sitter::Parser::new();
490        parser.set_language(language)?;
491        let tree = parser.parse(source, None).ok_or(HighlightError::Parse)?;
492        let mut cursor = tree_sitter::QueryCursor::new();
493
494        let mut injections = Vec::new();
495        let mut matches = cursor.matches(query, tree.root_node(), source);
496        while let Some(mat) = matches.next() {
497            let Some(injection) = self.injection_region_for_match(
498                query,
499                content_capture,
500                language_capture,
501                source,
502                &mat,
503            ) else {
504                continue;
505            };
506            injections.push(injection);
507        }
508
509        if injections.is_empty() {
510            return Ok(injections);
511        }
512
513        injections.sort_by(|a, b| {
514            a.start_byte
515                .cmp(&b.start_byte)
516                .then(b.end_byte.cmp(&a.end_byte))
517                .then((a.grammar as u8).cmp(&(b.grammar as u8)))
518        });
519        injections.dedup_by(|a, b| {
520            a.grammar == b.grammar && a.start_byte == b.start_byte && a.end_byte == b.end_byte
521        });
522
523        let mut non_overlapping = Vec::with_capacity(injections.len());
524        let mut last_end = 0usize;
525        for injection in injections {
526            if injection.start_byte < last_end {
527                continue;
528            }
529            last_end = injection.end_byte;
530            non_overlapping.push(injection);
531        }
532        Ok(non_overlapping)
533    }
534
535    /// Converts a query match to an [`InjectionRegion`] when captures are complete.
536    ///
537    /// Returns `None` when language or content captures are missing, unknown, or empty.
538    fn injection_region_for_match<'a>(
539        &self,
540        query: &tree_sitter::Query,
541        content_capture: Option<u32>,
542        language_capture: Option<u32>,
543        source: &'a [u8],
544        mat: &tree_sitter::QueryMatch<'a, 'a>,
545    ) -> Option<InjectionRegion> {
546        let mut language_name = None;
547        let mut content_node = None;
548
549        for capture in mat.captures {
550            let index = Some(capture.index);
551            if index == language_capture {
552                language_name = capture.node.utf8_text(source).ok();
553            } else if index == content_capture {
554                content_node = Some(capture.node);
555            }
556        }
557
558        for prop in query.property_settings(mat.pattern_index) {
559            match prop.key.as_ref() {
560                "injection.language" => {
561                    if language_name.is_none() {
562                        language_name = prop.value.as_ref().map(std::convert::AsRef::as_ref);
563                    }
564                }
565                "injection.self" | "injection.parent" => {
566                    if language_name.is_none() {
567                        language_name = Some("objectscript");
568                    }
569                }
570                _ => {}
571            }
572        }
573
574        let grammar = language_name.and_then(Grammar::from_name)?;
575        let content_node = content_node?;
576        let start_byte = content_node.start_byte();
577        let end_byte = content_node.end_byte();
578        if start_byte >= end_byte {
579            return None;
580        }
581
582        Some(InjectionRegion {
583            grammar,
584            start_byte,
585            end_byte,
586        })
587    }
588}
589
/// Borrowed view of every highlight configuration, used to resolve the language
/// names that the highlighter asks for while processing injections.
struct InjectionConfigs<'a> {
    objectscript: &'a HighlightConfiguration,
    sql: &'a HighlightConfiguration,
    python: &'a HighlightConfiguration,
    markdown: &'a HighlightConfiguration,
    // Returned for the normalized name `markdowninline`.
    markdown_inline: &'a HighlightConfiguration,
    xml: &'a HighlightConfiguration,
}
598
599impl<'a> InjectionConfigs<'a> {
600    /// Resolves an injected language name to a highlight configuration.
601    ///
602    /// Unknown language names return `None` so Tree-sitter skips injection highlighting.
603    fn resolve(&self, language_name: &str) -> Option<&'a HighlightConfiguration> {
604        let normalized = normalize_language_name(language_name);
605        if normalized == "markdowninline" {
606            return Some(self.markdown_inline);
607        }
608
609        let grammar = grammar_from_normalized_name(&normalized)?;
610        match grammar {
611            Grammar::ObjectScript => Some(self.objectscript),
612            Grammar::Sql => Some(self.sql),
613            Grammar::Python => Some(self.python),
614            Grammar::Markdown => Some(self.markdown),
615            Grammar::Mdx => Some(self.sql),
616            Grammar::Xml => Some(self.xml),
617        }
618    }
619}
620
/// Reduces a language name to its lowercase ASCII letters and digits.
///
/// Every other character (punctuation, whitespace, non-ASCII text) is dropped,
/// so `"T-SQL"` becomes `"tsql"`.
fn normalize_language_name(input: &str) -> String {
    let mut normalized = String::with_capacity(input.len());
    for ch in input.chars() {
        if ch.is_ascii_alphanumeric() {
            normalized.push(ch.to_ascii_lowercase());
        }
    }
    normalized
}
630
631/// Maps a normalized language name to a supported [`Grammar`].
632fn grammar_from_normalized_name(normalized: &str) -> Option<Grammar> {
633    match normalized {
634        "objectscript" | "os" | "playground" | "objectscriptplayground" => {
635            Some(Grammar::ObjectScript)
636        }
637        "sql" | "tsql" | "plsql" | "mysql" | "postgres" | "postgresql" => Some(Grammar::Sql),
638        "python" | "py" => Some(Grammar::Python),
639        "markdown" | "md" | "gfm" => Some(Grammar::Markdown),
640        "mdx" => Some(Grammar::Mdx),
641        "xml" => Some(Grammar::Xml),
642        _ => None,
643    }
644}
645
646/// Locates `injection.content` and `injection.language` captures in a query.
647fn injection_capture_indices(query: &tree_sitter::Query) -> (Option<u32>, Option<u32>) {
648    let mut content_capture = None;
649    let mut language_capture = None;
650    for (idx, name) in query.capture_names().iter().enumerate() {
651        let idx = Some(idx as u32);
652        match *name {
653            "injection.content" => content_capture = idx,
654            "injection.language" => language_capture = idx,
655            _ => {}
656        }
657    }
658    (content_capture, language_capture)
659}
660
661/// Builds and configures a Tree-sitter highlight configuration.
662///
663/// # Errors
664///
665/// Returns an error when the highlight or injection query is invalid for the
666/// provided language.
667fn new_config(
668    language: tree_sitter::Language,
669    language_name: &str,
670    highlights: &str,
671    injections: &str,
672) -> Result<HighlightConfiguration, tree_sitter::QueryError> {
673    let mut config =
674        HighlightConfiguration::new(language, language_name, highlights, injections, "")?;
675    let recognized = config
676        .names()
677        .iter()
678        .map(|name| (*name).to_string())
679        .collect::<Vec<_>>();
680    let recognized_refs = recognized.iter().map(String::as_str).collect::<Vec<_>>();
681    config.configure(&recognized_refs);
682    Ok(config)
683}
684
685/// Pushes a span into `spans`, merging with the previous span when adjacent and
686/// sharing the same attribute id.
687fn push_merged(spans: &mut Vec<Span>, next: Span) {
688    if next.start_byte >= next.end_byte {
689        return;
690    }
691
692    if let Some(last) = spans.last_mut() {
693        if last.attr_id == next.attr_id && last.end_byte == next.start_byte {
694            last.end_byte = next.end_byte;
695            return;
696        }
697    }
698
699    spans.push(next);
700}
701
702/// Remaps incoming attribute ids to ids in the destination attribute table.
703///
704/// Existing destination ids are reused by capture name; new capture names are appended.
705fn remap_attr_ids(
706    incoming: &[Attr],
707    attrs: &mut Vec<Attr>,
708    attr_ids_by_name: &mut HashMap<String, usize>,
709) -> Vec<usize> {
710    let mut remap = vec![0usize; incoming.len()];
711    for attr in incoming {
712        let mapped_attr_id = if let Some(&mapped_attr_id) = attr_ids_by_name.get(&attr.capture_name)
713        {
714            mapped_attr_id
715        } else {
716            let mapped_attr_id = attrs.len();
717            attrs.push(Attr {
718                id: mapped_attr_id,
719                capture_name: attr.capture_name.clone(),
720            });
721            attr_ids_by_name.insert(attr.capture_name.clone(), mapped_attr_id);
722            mapped_attr_id
723        };
724        if let Some(slot) = remap.get_mut(attr.id) {
725            *slot = mapped_attr_id;
726        }
727    }
728    remap
729}
730
731/// Removes byte `ranges` from `spans`, splitting spans as needed.
732fn exclude_ranges(spans: &[Span], ranges: &[(usize, usize)]) -> Vec<Span> {
733    if ranges.is_empty() {
734        return spans.to_vec();
735    }
736
737    let mut out: Vec<Span> = Vec::with_capacity(spans.len());
738    let mut range_idx = 0usize;
739    for span in spans {
740        while range_idx < ranges.len() && ranges[range_idx].1 <= span.start_byte {
741            range_idx += 1;
742        }
743
744        let mut cursor = span.start_byte;
745        let mut idx = range_idx;
746        while idx < ranges.len() {
747            let (range_start, range_end) = ranges[idx];
748            if range_start >= span.end_byte {
749                break;
750            }
751
752            if range_end <= cursor {
753                idx += 1;
754                continue;
755            }
756
757            if cursor < range_start {
758                push_merged(
759                    &mut out,
760                    Span {
761                        attr_id: span.attr_id,
762                        start_byte: cursor,
763                        end_byte: range_start.min(span.end_byte),
764                    },
765                );
766            }
767
768            if range_end >= span.end_byte {
769                cursor = span.end_byte;
770                break;
771            }
772
773            cursor = range_end;
774            idx += 1;
775        }
776
777        if cursor < span.end_byte {
778            push_merged(
779                &mut out,
780                Span {
781                    attr_id: span.attr_id,
782                    start_byte: cursor,
783                    end_byte: span.end_byte,
784                },
785            );
786        }
787    }
788    out
789}
790
791/// Sorts spans and enforces a non-overlapping, merge-friendly representation.
792fn normalize_spans(mut spans: Vec<Span>) -> Vec<Span> {
793    spans.sort_by(|a, b| {
794        a.start_byte
795            .cmp(&b.start_byte)
796            .then(a.end_byte.cmp(&b.end_byte))
797            .then(a.attr_id.cmp(&b.attr_id))
798    });
799
800    let mut out: Vec<Span> = Vec::with_capacity(spans.len());
801    for mut span in spans {
802        if let Some(last) = out.last() {
803            if span.start_byte < last.end_byte {
804                if span.end_byte <= last.end_byte {
805                    continue;
806                }
807                span.start_byte = last.end_byte;
808            }
809        }
810        push_merged(&mut out, span);
811    }
812    out
813}
814
#[cfg(test)]
mod tests {
    use super::{Grammar, HighlightResult, SpanHighlighter};

    /// Reports whether any span in `result` is tagged with the attribute named
    /// `capture_name` and covers exactly the bytes `expected_text` in `source`.
    ///
    /// Returns `false` when no attribute with that capture name exists.
    fn has_capture_for_text(
        result: &HighlightResult,
        source: &[u8],
        capture_name: &str,
        expected_text: &[u8],
    ) -> bool {
        // Resolve the capture name to its attribute id first; spans only carry the id.
        let Some(wanted_id) = result
            .attrs
            .iter()
            .find(|candidate| candidate.capture_name == capture_name)
            .map(|candidate| candidate.id)
        else {
            return false;
        };

        // Only spans carrying the wanted attribute have their byte range compared.
        result
            .spans
            .iter()
            .filter(|span| span.attr_id == wanted_id)
            .any(|span| source[span.start_byte..span.end_byte] == *expected_text)
    }

    /// Builds a fresh `SpanHighlighter` and highlights `source` using `grammar`.
    ///
    /// Panics with "failed to build highlighter" if construction fails, and with
    /// `failure_message` if highlighting fails. `#[track_caller]` keeps the
    /// reported panic location inside the calling test.
    #[track_caller]
    fn highlight_or_panic(
        source: &[u8],
        grammar: Grammar,
        failure_message: &str,
    ) -> HighlightResult {
        let mut highlighter = SpanHighlighter::new().expect("failed to build highlighter");
        highlighter
            .highlight(source, grammar)
            .expect(failure_message)
    }

    /// A numeric literal in an ObjectScript method body must be captured as `number`.
    #[test]
    fn highlights_numeric_literal_as_number() {
        let input = br#"
Class Demo.Highlight
{
  ClassMethod Main()
  {
    set x = 42
  }
}
"#;
        let highlighted = highlight_or_panic(input, Grammar::ObjectScript, "failed to highlight");

        assert!(
            has_capture_for_text(&highlighted, input, "number", b"42"),
            "expected highlighted span for numeric literal"
        );
    }

    /// `Grammar::from_name` must resolve each listed name and reject an unknown one.
    #[test]
    fn parses_supported_grammar_aliases() {
        let expectations = [
            ("objectscript", Grammar::ObjectScript),
            ("SQL", Grammar::Sql),
            ("py", Grammar::Python),
            ("md", Grammar::Markdown),
            ("mdx", Grammar::Mdx),
            ("xml", Grammar::Xml),
        ];

        for (name, grammar) in expectations {
            assert_eq!(Grammar::from_name(name), Some(grammar));
        }

        assert!(Grammar::from_name("unknown").is_none());
    }

    /// `SELECT` in a plain SQL statement must be captured as `keyword`.
    #[test]
    fn highlights_sql_keyword() {
        let input = b"SELECT 42 FROM Demo";
        let highlighted = highlight_or_panic(input, Grammar::Sql, "failed to highlight SQL");

        assert!(
            has_capture_for_text(&highlighted, input, "keyword", b"SELECT"),
            "expected SELECT to be highlighted as keyword"
        );
    }

    /// The body of a `%SQLQuery` class query must yield a `keyword` span for `SELECT`
    /// when the surrounding text is highlighted as ObjectScript.
    #[test]
    fn objectscript_sqlquery_body_is_highlighted_as_sql() {
        let input = br#"
Class Test
{
  Query ListEmployees() As %SQLQuery
  {
SELECT ID,Name FROM Employee
  }
}
"#;
        let highlighted = highlight_or_panic(
            input,
            Grammar::ObjectScript,
            "failed to highlight ObjectScript with SQL injection",
        );

        assert!(
            has_capture_for_text(&highlighted, input, "keyword", b"SELECT"),
            "expected SQL SELECT in %SQLQuery body to be highlighted as keyword"
        );
    }

    /// An integer literal in Python source must be captured as `number`.
    #[test]
    fn highlights_python_number() {
        let input = b"def f(x):\n    return x + 1\n";
        let highlighted = highlight_or_panic(input, Grammar::Python, "failed to highlight Python");

        assert!(
            has_capture_for_text(&highlighted, input, "number", b"1"),
            "expected numeric literal to be highlighted in Python"
        );
    }

    /// The text of a Markdown ATX heading must be captured as `text.title`.
    #[test]
    fn highlights_markdown_heading() {
        let input = b"# Heading\n";
        let highlighted =
            highlight_or_panic(input, Grammar::Markdown, "failed to highlight Markdown");

        assert!(
            has_capture_for_text(&highlighted, input, "text.title", b"Heading"),
            "expected heading text to be highlighted in Markdown"
        );
    }

    /// Highlighting with `Grammar::Mdx` must currently produce SQL-style `keyword`
    /// captures, i.e. MDX falls back to SQL keyword highlighting.
    #[test]
    fn mdx_falls_back_to_sql_keyword_highlighting() {
        let input = b"SELECT 1 FROM Cube";
        let highlighted =
            highlight_or_panic(input, Grammar::Mdx, "failed to highlight MDX fallback");

        assert!(
            has_capture_for_text(&highlighted, input, "keyword", b"SELECT"),
            "expected MDX fallback to highlight SQL keywords"
        );
    }

    /// ObjectScript placed in a CDATA section of an XML `<Implementation>` element
    /// must be highlighted, so its numeric literal shows up as `number`.
    #[test]
    fn xml_implementation_cdata_is_highlighted_as_objectscript() {
        let input = br#"
<Export>
  <Class name="Demo.Sample">
    <Method name="Run">
      <Implementation><![CDATA[
 set x = 42
]]></Implementation>
    </Method>
  </Class>
</Export>
"#;
        let highlighted = highlight_or_panic(
            input,
            Grammar::Xml,
            "failed to highlight XML with ObjectScript injection",
        );

        assert!(
            has_capture_for_text(&highlighted, input, "number", b"42"),
            "expected injected ObjectScript numeric literal to be highlighted"
        );
    }
}