Skip to main content

arborium_highlight/
lib.rs

1//! Unified syntax highlighting for arborium.
2//!
3//! This crate provides the core highlighting engine that works with both:
4//! - **Statically linked Rust grammars**: For CLI tools and servers
5//! - **Dynamically loaded WASM plugins**: For browser contexts
6//!
7//! # Why Async in a Highlighting Library?
8//!
9//! You might wonder why a syntax highlighting library has async code. The answer
10//! is **browser support**.
11//!
12//! - **Parsing is synchronous**: Tree-sitter parsing cannot be async—it's a
13//!   fundamentally synchronous operation that walks the syntax tree.
14//!
15//! - **Getting a grammar can be async**: In browser contexts, grammar plugins
16//!   are loaded from a CDN via JavaScript's dynamic `import()`. This is
17//!   inherently async since it involves network requests and WASM instantiation.
18//!
19//! In native Rust, grammars are statically linked, so the provider returns
20//! immediately. But the trait is async to support both use cases with the same
21//! code.
22//!
23//! # Architecture
24//!
25//! The highlighting system is built around two key traits:
26//!
27//! - [`Grammar`]: What a grammar can do — parse text and return spans
28//! - [`GrammarProvider`]: How grammars are obtained — this is where sync vs async differs
29//!
30//! ## The Sync-in-Async-Clothing Pattern
31//!
32//! The core highlighting logic (including injection handling) is written **once**
33//! as async code in `HighlighterCore`. Two wrappers provide the sync and async APIs:
34//!
35//! - [`SyncHighlighter`]: Polls the async future **once** and panics if it yields.
36//!   This is safe for native Rust where providers return immediately.
37//!
38//! - [`AsyncHighlighter`]: Actually awaits provider calls. Use this for browser/WASM
39//!   contexts where grammar loading involves network requests.
40//!
41//! This design ensures both environments share the exact same injection-handling
42//! logic, avoiding subtle bugs from duplicated code.
43//!
44//! ## When to Use Which
45//!
46//! | Context | Highlighter | Provider Example |
47//! |---------|-------------|------------------|
48//! | Native Rust | [`SyncHighlighter`] | `StaticProvider` (grammars compiled in) |
49//! | Browser WASM | [`AsyncHighlighter`] | `JsGrammarProvider` (loads from CDN) |
50//!
51//! # Quick Start
52//!
53//! ```rust,ignore
54//! use arborium_highlight::{SyncHighlighter, Grammar, GrammarProvider, ParseResult, Span};
55//! use arborium_highlight::{HighlightConfig, HtmlFormat};
56//!
57//! // Define your grammar (implements Grammar trait)
58//! struct MyGrammar { /* ... */ }
59//! impl Grammar for MyGrammar {
60//!     fn parse(&mut self, text: &str) -> ParseResult {
61//!         // Parse and return spans + injections
62//!         ParseResult::default()
63//!     }
64//! }
65//!
66//! // Define your provider (implements GrammarProvider trait)
67//! struct MyProvider { /* ... */ }
68//! impl GrammarProvider for MyProvider {
69//!     type Grammar = MyGrammar;
70//!     async fn get(&mut self, language: &str) -> Option<&mut Self::Grammar> {
71//!         // Return grammar for language
72//!         None
73//!     }
74//! }
75//!
76//! // Use with default configuration (custom elements: <a-k>, <a-f>, etc.)
77//! let mut highlighter = SyncHighlighter::new(MyProvider { /* ... */ });
78//! let html = highlighter.highlight("rust", "fn main() {}")?;
79//! // Output: <a-k>fn</a-k> <a-f>main</a-f>() {}
80//!
81//! // Or use class-based output for compatibility with existing CSS
82//! let config = HighlightConfig {
83//!     html_format: HtmlFormat::ClassNames,
84//!     ..Default::default()
85//! };
86//! let mut highlighter = SyncHighlighter::with_config(MyProvider { /* ... */ }, config);
87//! let html = highlighter.highlight("rust", "fn main() {}")?;
88//! // Output: <span class="keyword">fn</span> <span class="function">main</span>() {}
89//! ```
90//!
91//! # HTML Output Formats
92//!
93//! Arborium supports multiple HTML output formats via [`HtmlFormat`]:
94//!
95//! - **`CustomElements`** (default): Compact custom elements like `<a-k>`, `<a-f>`, etc.
96//! - **`CustomElementsWithPrefix(prefix)`**: Custom elements with your prefix, e.g., `<code-k>`
97//! - **`ClassNames`**: Traditional `<span class="keyword">` for compatibility
98//! - **`ClassNamesWithPrefix(prefix)`**: Namespaced classes like `<span class="arb-keyword">`
99//!
100//! See [`HtmlFormat`] for examples and use cases.
101
102mod render;
103mod types;
104
105#[cfg(feature = "tree-sitter")]
106pub mod tree_sitter;
107
108pub use render::{
109    AnsiOptions, ThemedSpan, html_escape, spans_to_ansi, spans_to_ansi_with_options, spans_to_html,
110    spans_to_themed, write_spans_as_ansi, write_spans_as_html,
111};
112pub use types::{HighlightError, Injection, ParseResult, Span};
113
114#[cfg(feature = "tree-sitter")]
115pub use tree_sitter::{CompiledGrammar, GrammarConfig, GrammarError, ParseContext};
116
117// Backward compatibility aliases
118#[cfg(feature = "tree-sitter")]
119#[doc(hidden)]
120pub use tree_sitter::{TreeSitterGrammarConfig, TreeSitterGrammarError};
121
122use std::future::Future;
123use std::task::{Context, Poll, RawWaker, RawWakerVTable, Waker};
124
/// A grammar that can parse text and produce highlight spans.
///
/// This is implemented by:
/// - Tree-sitter based parsers (for Rust)
/// - WASM plugin wrappers (for browser)
/// - Mock implementations (for testing)
///
/// # Implementation Notes
///
/// Parsing is always synchronous. The async part of highlighting is *getting* the grammar,
/// not using it. This is because tree-sitter parsing is inherently synchronous.
///
/// `parse` takes `&mut self`, so an implementation is free to keep and update
/// internal state between calls.
pub trait Grammar {
    /// Parse `text` and return its highlight spans plus injection points
    /// (ranges that should be highlighted with another language's grammar).
    ///
    /// The highlighter treats the offsets in the returned [`ParseResult`] as
    /// byte offsets relative to the start of `text`: injection ranges are used
    /// to slice `text`, and spans produced for an injected range are shifted by
    /// that range's start before being merged into the document's spans.
    ///
    /// This is always synchronous - the async part is *getting* the grammar,
    /// not using it.
    fn parse(&mut self, text: &str) -> ParseResult;
}
143
/// Provides grammars for languages.
///
/// This trait abstracts over how grammars are obtained:
///
/// - **Static (Rust)**: Grammars are statically linked. `get()` returns
///   immediately without awaiting.
///
/// - **Dynamic (WASM)**: Grammars are loaded as WASM plugins. `get()` may
///   need to fetch and instantiate a plugin, which is async.
///
/// # Implementation Notes
///
/// For sync contexts (Rust CLI tools, servers), implement `get()` to return
/// immediately. The `SyncHighlighter` wrapper will panic if `get()` yields.
///
/// For async contexts (WASM/browser), `get()` can await plugin loading.
/// Use `AsyncHighlighter` wrapper.
///
/// A single highlight call may invoke `get()` several times: once for the
/// primary language and once for every injection it processes.
pub trait GrammarProvider {
    /// The grammar type this provider returns.
    type Grammar: Grammar;

    /// Get a grammar for a language.
    ///
    /// Returns `None` if the language is not supported. For the primary
    /// language the highlighter turns `None` into
    /// `HighlightError::UnsupportedLanguage`; for an injected language the
    /// injection is skipped silently.
    ///
    /// The returned reference borrows the provider mutably, so only one
    /// grammar can be in use at a time.
    ///
    /// # Sync vs Async
    ///
    /// This is an async method, but for sync providers (static Rust grammars),
    /// it should return `Ready` immediately without yielding. The caller
    /// (SyncHighlighter) will poll once and panic if it gets `Pending`.
    ///
    /// # Send Bound
    ///
    /// On native targets, the future must be `Send` for compatibility with
    /// async runtimes. On WASM, `Send` is not required (single-threaded).
    #[cfg(not(target_arch = "wasm32"))]
    fn get(&mut self, language: &str) -> impl Future<Output = Option<&mut Self::Grammar>> + Send;

    /// Get a grammar for a language (WASM version without Send bound).
    ///
    /// Same contract as the native declaration; only the `Send` requirement
    /// on the returned future is dropped.
    #[cfg(target_arch = "wasm32")]
    fn get(&mut self, language: &str) -> impl Future<Output = Option<&mut Self::Grammar>>;
}
186
/// HTML output format for syntax highlighting.
///
/// The default is [`HtmlFormat::CustomElements`]; the `Default` impl is derived
/// from the `#[default]` marker on that variant.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum HtmlFormat {
    /// Custom elements with default prefix: `<a-k>`, `<a-f>`, etc. (default)
    ///
    /// This is the most compact format and leverages custom HTML elements.
    ///
    /// # Example
    /// ```html
    /// <a-k>fn</a-k> <a-f>main</a-f>()
    /// ```
    #[default]
    CustomElements,

    /// Custom elements with custom prefix: `<prefix-k>`, `<prefix-f>`, etc.
    ///
    /// Useful for branding or avoiding conflicts with other custom elements.
    ///
    /// # Example
    /// ```html
    /// <!-- With prefix "code" -->
    /// <code-k>fn</code-k> <code-f>main</code-f>()
    /// ```
    CustomElementsWithPrefix(String),

    /// Traditional class-based spans: `<span class="keyword">`, etc.
    ///
    /// Compatible with existing tooling that expects class-based markup.
    ///
    /// # Example
    /// ```html
    /// <span class="keyword">fn</span> <span class="function">main</span>()
    /// ```
    ClassNames,

    /// Class-based spans with custom prefix: `<span class="prefix-keyword">`, etc.
    ///
    /// Useful for namespacing CSS classes.
    ///
    /// # Example
    /// ```html
    /// <!-- With prefix "arb" -->
    /// <span class="arb-keyword">fn</span> <span class="arb-function">main</span>()
    /// ```
    ClassNamesWithPrefix(String),
}
238
239/// Configuration for highlighting.
240#[derive(Debug, Clone)]
241pub struct HighlightConfig {
242    /// Maximum depth for processing language injections.
243    ///
244    /// - `0`: No injections (just primary language)
245    /// - `3`: Default, handles most cases
246    /// - Higher: For deeply nested content
247    pub max_injection_depth: u32,
248
249    /// HTML output format (custom elements vs class-based spans).
250    pub html_format: HtmlFormat,
251}
252
253impl Default for HighlightConfig {
254    fn default() -> Self {
255        Self {
256            max_injection_depth: 3,
257            html_format: HtmlFormat::default(),
258        }
259    }
260}
261
262/// Internal async implementation - handles all the hard work.
263///
264/// The core logic is written once as async, then wrapped by `SyncHighlighter`
265/// (which polls once and panics if it yields) and `AsyncHighlighter` (which
266/// actually awaits).
267struct HighlighterCore<P: GrammarProvider> {
268    provider: P,
269    config: HighlightConfig,
270}
271
272impl<P: GrammarProvider> HighlighterCore<P> {
273    fn new(provider: P) -> Self {
274        Self {
275            provider,
276            config: HighlightConfig::default(),
277        }
278    }
279
280    fn with_config(provider: P, config: HighlightConfig) -> Self {
281        Self { provider, config }
282    }
283
284    /// Highlight and return raw spans for the full document,
285    /// including any recursively processed injections.
286    async fn highlight_spans(
287        &mut self,
288        language: &str,
289        source: &str,
290    ) -> Result<Vec<Span>, HighlightError> {
291        // 1. Get the primary grammar
292        let grammar = self
293            .provider
294            .get(language)
295            .await
296            .ok_or_else(|| HighlightError::UnsupportedLanguage(language.into()))?;
297
298        // 2. Parse the primary language
299        let result = grammar.parse(source);
300
301        // 3. Collect all spans (including from injections)
302        let mut all_spans = result.spans;
303
304        // 4. Process injections recursively
305        if self.config.max_injection_depth > 0 {
306            self.process_injections(
307                source,
308                result.injections,
309                0,
310                self.config.max_injection_depth,
311                &mut all_spans,
312            )
313            .await;
314        }
315
316        Ok(all_spans)
317    }
318
319    /// The main highlight function - written once, used by both wrappers.
320    async fn highlight(&mut self, language: &str, source: &str) -> Result<String, HighlightError> {
321        let spans = self.highlight_spans(language, source).await?;
322        Ok(spans_to_html(source, spans, &self.config.html_format))
323    }
324
325    /// Process injections recursively.
326    async fn process_injections(
327        &mut self,
328        source: &str,
329        injections: Vec<Injection>,
330        base_offset: u32,
331        remaining_depth: u32,
332        all_spans: &mut Vec<Span>,
333    ) {
334        if remaining_depth == 0 {
335            return;
336        }
337
338        for injection in injections {
339            let start = injection.start as usize;
340            let end = injection.end as usize;
341
342            if end <= source.len() && start < end {
343                // Try to get grammar for injected language
344                if let Some(inj_grammar) = self.provider.get(&injection.language).await {
345                    let injected_text = &source[start..end];
346                    let result = inj_grammar.parse(injected_text);
347
348                    // Adjust offsets and add spans
349                    let adjusted_spans: Vec<Span> = result
350                        .spans
351                        .into_iter()
352                        .map(|mut s| {
353                            s.start += base_offset + injection.start;
354                            s.end += base_offset + injection.start;
355                            s
356                        })
357                        .collect();
358                    all_spans.extend(adjusted_spans);
359
360                    // Recurse into nested injections
361                    if !result.injections.is_empty() {
362                        // Box the recursive call to avoid infinite type size
363                        Box::pin(self.process_injections(
364                            injected_text,
365                            result.injections,
366                            base_offset + injection.start,
367                            remaining_depth - 1,
368                            all_spans,
369                        ))
370                        .await;
371                    }
372                }
373                // If grammar not available, skip this injection silently
374            }
375        }
376    }
377}
378
379/// Synchronous highlighter for Rust contexts.
380///
381/// Uses a sync provider where `get()` returns immediately.
382/// Panics if the provider ever yields (returns Pending).
383///
384/// # Example
385///
386/// ```rust,ignore
387/// use arborium_highlight::{SyncHighlighter, StaticProvider};
388///
389/// let mut highlighter = SyncHighlighter::new(StaticProvider::new());
390/// let html = highlighter.highlight("rust", "fn main() {}")?;
391/// ```
392pub struct SyncHighlighter<P: GrammarProvider> {
393    core: HighlighterCore<P>,
394}
395
396impl<P: GrammarProvider> SyncHighlighter<P> {
397    /// Create a new synchronous highlighter with default configuration.
398    pub fn new(provider: P) -> Self {
399        Self {
400            core: HighlighterCore::new(provider),
401        }
402    }
403
404    /// Create a new synchronous highlighter with custom configuration.
405    pub fn with_config(provider: P, config: HighlightConfig) -> Self {
406        Self {
407            core: HighlighterCore::with_config(provider, config),
408        }
409    }
410
411    /// Get a mutable reference to the underlying provider.
412    pub fn provider_mut(&mut self) -> &mut P {
413        &mut self.core.provider
414    }
415
416    /// Highlight source code synchronously and return HTML.
417    ///
418    /// # Panics
419    ///
420    /// Panics if the provider's `get()` method yields (returns Pending).
421    /// This indicates a bug - sync providers should never yield.
422    pub fn highlight(&mut self, language: &str, source: &str) -> Result<String, HighlightError> {
423        let future = self.core.highlight(language, source);
424
425        // Pin the future on the stack
426        let mut future = std::pin::pin!(future);
427
428        // Create a no-op waker (we're not actually async)
429        let waker = noop_waker();
430        let mut cx = Context::from_waker(&waker);
431
432        // Poll once - sync providers complete immediately
433        match future.as_mut().poll(&mut cx) {
434            Poll::Ready(result) => result,
435            Poll::Pending => {
436                panic!(
437                    "SyncHighlighter: provider yielded. Use AsyncHighlighter for async providers."
438                )
439            }
440        }
441    }
442
443    /// Highlight source code synchronously and return ANSI-colored text
444    /// using the provided theme.
445    ///
446    /// This uses the same span computation as HTML output but renders
447    /// with ANSI escape sequences.
448    pub fn highlight_to_ansi(
449        &mut self,
450        language: &str,
451        source: &str,
452        theme: &arborium_theme::Theme,
453    ) -> Result<String, HighlightError> {
454        self.highlight_to_ansi_with_options(language, source, theme, &AnsiOptions::default())
455    }
456
457    /// Highlight source code synchronously and return ANSI-colored text
458    /// using the provided theme and explicit ANSI rendering options.
459    pub fn highlight_to_ansi_with_options(
460        &mut self,
461        language: &str,
462        source: &str,
463        theme: &arborium_theme::Theme,
464        options: &AnsiOptions,
465    ) -> Result<String, HighlightError> {
466        let future = self.core.highlight_spans(language, source);
467
468        let mut future = std::pin::pin!(future);
469        let waker = noop_waker();
470        let mut cx = Context::from_waker(&waker);
471
472        match future.as_mut().poll(&mut cx) {
473            Poll::Ready(Ok(spans)) => Ok(spans_to_ansi_with_options(source, spans, theme, options)),
474            Poll::Ready(Err(e)) => Err(e),
475            Poll::Pending => {
476                panic!(
477                    "SyncHighlighter: provider yielded. Use AsyncHighlighter for async providers."
478                )
479            }
480        }
481    }
482}
483
484/// Asynchronous highlighter for WASM/browser contexts.
485///
486/// Uses an async provider where `get()` may need to load plugins.
487///
488/// # Example
489///
490/// ```rust,ignore
491/// use arborium_highlight::{AsyncHighlighter, WasmPluginProvider};
492///
493/// let mut highlighter = AsyncHighlighter::new(WasmPluginProvider::new());
494/// let html = highlighter.highlight("rust", "fn main() {}").await?;
495/// ```
496pub struct AsyncHighlighter<P: GrammarProvider> {
497    core: HighlighterCore<P>,
498}
499
500impl<P: GrammarProvider> AsyncHighlighter<P> {
501    /// Create a new asynchronous highlighter with default configuration.
502    pub fn new(provider: P) -> Self {
503        Self {
504            core: HighlighterCore::new(provider),
505        }
506    }
507
508    /// Create a new asynchronous highlighter with custom configuration.
509    pub fn with_config(provider: P, config: HighlightConfig) -> Self {
510        Self {
511            core: HighlighterCore::with_config(provider, config),
512        }
513    }
514
515    /// Get a mutable reference to the underlying provider.
516    pub fn provider_mut(&mut self) -> &mut P {
517        &mut self.core.provider
518    }
519
520    /// Highlight source code asynchronously.
521    pub async fn highlight(
522        &mut self,
523        language: &str,
524        source: &str,
525    ) -> Result<String, HighlightError> {
526        self.core.highlight(language, source).await
527    }
528}
529
/// Build a waker whose every operation does nothing, for polling a future
/// synchronously when no wake-up will ever be needed.
fn noop_waker() -> Waker {
    /// `clone` entry: hand back another identical no-op raw waker.
    fn clone_raw(_data: *const ()) -> RawWaker {
        new_raw()
    }

    /// Shared `wake` / `wake_by_ref` / `drop` entry: nothing to do.
    fn do_nothing(_data: *const ()) {}

    /// A raw waker with a null data pointer and the all-no-op vtable.
    fn new_raw() -> RawWaker {
        const VTABLE: RawWakerVTable =
            RawWakerVTable::new(clone_raw, do_nothing, do_nothing, do_nothing);
        RawWaker::new(std::ptr::null(), &VTABLE)
    }

    // SAFETY: none of the vtable functions dereference the (null) data pointer
    // or own any resource, so cloning, waking and dropping are trivially sound
    // from any thread.
    unsafe { Waker::from_raw(new_raw()) }
}
542
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Mock provider for testing - sync, returns immediately
    struct MockProvider {
        grammars: HashMap<&'static str, MockGrammar>,
    }

    impl MockProvider {
        /// Build a provider from `(language, canned parse result)` pairs.
        fn new<const N: usize>(entries: [(&'static str, ParseResult); N]) -> Self {
            Self {
                grammars: entries
                    .map(|(language, result)| (language, MockGrammar { result }))
                    .into(),
            }
        }
    }

    impl GrammarProvider for MockProvider {
        type Grammar = MockGrammar;

        // A single definition serves every target: the body never awaits, and
        // the same `async fn` was previously duplicated verbatim behind both
        // the wasm32 and non-wasm32 cfg gates.
        async fn get(&mut self, language: &str) -> Option<&mut Self::Grammar> {
            self.grammars.get_mut(language)
        }
    }

    /// Grammar that ignores its input and always returns the same result.
    struct MockGrammar {
        result: ParseResult,
    }

    impl Grammar for MockGrammar {
        fn parse(&mut self, _text: &str) -> ParseResult {
            self.result.clone()
        }
    }

    /// Span fixture with `pattern_index` fixed at 0.
    fn span(start: u32, end: u32, capture: &'static str) -> Span {
        Span {
            start,
            end,
            capture: capture.into(),
            pattern_index: 0,
        }
    }

    /// Parse result holding only spans (no injections).
    fn spans_only(spans: Vec<Span>) -> ParseResult {
        ParseResult {
            spans,
            injections: vec![],
        }
    }

    #[test]
    fn test_basic_highlighting() {
        let provider = MockProvider::new([("test", spans_only(vec![span(0, 2, "keyword")]))]);

        let mut highlighter = SyncHighlighter::new(provider);
        let html = highlighter.highlight("test", "fn").unwrap();
        assert_eq!(html, "<a-k>fn</a-k>");
    }

    #[test]
    fn test_injection() {
        let outer = ParseResult {
            spans: vec![],
            injections: vec![Injection {
                start: 0,
                end: 5,
                language: "inner".into(),
                include_children: false,
            }],
        };
        let inner = spans_only(vec![span(0, 5, "string")]);
        let provider = MockProvider::new([("outer", outer), ("inner", inner)]);

        let mut highlighter = SyncHighlighter::new(provider);
        let html = highlighter.highlight("outer", "hello").unwrap();
        assert_eq!(html, "<a-s>hello</a-s>");
    }

    #[test]
    fn test_unsupported_language() {
        let provider = MockProvider::new([]);

        let mut highlighter = SyncHighlighter::new(provider);
        let result = highlighter.highlight("unknown", "code");
        assert!(matches!(
            result,
            Err(HighlightError::UnsupportedLanguage(_))
        ));
    }

    #[test]
    fn test_reuse_with_shorter_text() {
        // Regression test: reusing a highlighter with a shorter string
        // after a longer string should not panic with slice bounds errors.
        // This tests that we don't incorrectly use cached tree state.
        let provider = MockProvider::new([("test", spans_only(vec![span(0, 2, "keyword")]))]);

        let mut highlighter = SyncHighlighter::new(provider);

        // First: longer string
        let first = highlighter.highlight("test", "longer string here");
        assert!(first.is_ok());

        // Second: shorter string - should neither panic nor fail
        let second = highlighter.highlight("test", "short");
        assert!(second.is_ok());
    }

    #[test]
    fn test_span_coalescing() {
        // Adjacent spans whose captures render to the same tag are expected
        // to be merged into a single element.
        let spans = vec![span(0, 3, "keyword"), span(3, 7, "keyword.function")];
        let html = spans_to_html("keyword", spans, &HtmlFormat::default());
        assert_eq!(html, "<a-k>keyword</a-k>");
    }
}