Skip to main content

lex_analysis/
semantic_tokens.rs

1//! This is the semantic token collector, which editors use for syntax highlighting.
2//! It's worth going over the general approach.
3//!
4//! Semantic Tokens and Editor Highlighting Architecture
5//!
//!     1. LSP emits semantic tokens using our format's native terminology (e.g., `Verbatim`,
//! `Annotation`, etc.). The LSP declares a token legend at initialization and emits tokens as
//! indices into that legend—it has no knowledge of editor-specific theming.
9//!     2. Editor plugins map our token types to the editor's theme primitives. This lets users
10//! leverage their existing theme choices while our core LSP code remains editor-agnostic.
11//!     
12//! Editor-Specific Mapping
13//!
14//!     VSCode — declarative mapping in `package.json`:
15//!         "semanticTokenScopes": [{
16//!         "language": "ourformat",
17//!         "scopes": {
18//!         "Verbatim": ["markup.inline.raw"],
19//!         "Heading": ["markup.heading"],
20//!         "Emphasis": ["markup.italic"]
21//!         }
22//!         }]
23//!     :: javascript
24//!
25//!     We map to TextMate scopes (`markup.*`) as they have broad theme support and are a natural
26//! fit for markup.
27//!
28//!     Neovim — imperative mapping in the plugin:
29//!         vim.api.nvim_set_hl(0, '@lsp.type.Verbatim', { link = '@markup.raw' })
//!         vim.api.nvim_set_hl(0, '@lsp.type.Heading', { link = '@markup.heading' })
31//!         vim.api.nvim_set_hl(0, '@lsp.type.Emphasis', { link = '@markup.italic' })
32//!     :: lua
33//!     We link to treesitter's `@markup.*` groups for equivalent theme coverage.
34//!     Benefits:
35//!         - LSP speaks our format's semantics—no impedance mismatch
36//!         - Users get syntax highlighting that respects their theme
37//!         - Mapping logic is isolated to editor plugins; adding a new editor doesn't touch the LSP
38//!
39//! The file editors/vscode/themes/lex-light.json has the recommended theming for Lex to be used in
40//! tests and so forth.
41use lex_core::lex::ast::inline_positions::{walk_text_content_positions, InlinePositionVisitor};
42use lex_core::lex::ast::{
43    Annotation, ContentItem, Definition, Document, List, ListItem, Paragraph, Range, Session,
44    Table, TextContent, Verbatim,
45};
46use lex_core::lex::inlines::{ReferenceInline, ReferenceType};
47
/// Every kind of semantic token the collector can emit.
///
/// Structural kinds cover headers, markers and labels; inline kinds cover formatted spans and
/// the delimiters around them. The name reported to editors for each kind comes from
/// [`LexSemanticTokenKind::as_str`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LexSemanticTokenKind {
    // --- Document and session headers ---
    /// Title of the document.
    DocumentTitle,
    /// Subtitle attached to the document title.
    DocumentSubtitle,
    /// Sequence marker at the start of a session header.
    SessionMarker,
    /// Session header text, excluding the sequence marker.
    SessionTitleText,

    // --- Definitions ---
    /// Header (subject) of a definition.
    DefinitionSubject,
    /// Plain text inside a definition body.
    DefinitionContent,

    // --- Lists ---
    /// Marker of a list item.
    ListMarker,
    /// Text of a list item.
    ListItemText,

    // --- Annotations ---
    /// Header of an annotation.
    AnnotationLabel,
    /// A single parameter in an annotation header.
    AnnotationParameter,
    /// Plain text inside an annotation.
    AnnotationContent,

    // --- Inline content ---
    /// Content of a strong span.
    InlineStrong,
    /// Content of an emphasis span.
    InlineEmphasis,
    /// Content of an inline code span.
    InlineCode,
    /// Content of an inline math span.
    InlineMath,
    /// Content of a reference that is not a citation, footnote or annotation reference.
    Reference,
    /// Content of a citation reference.
    ReferenceCitation,
    /// Content of a footnote-number reference.
    ReferenceFootnote,
    /// Content of an annotation reference.
    ReferenceAnnotation,

    // --- Verbatim blocks and tables ---
    /// Subject line of a verbatim group or of a table.
    VerbatimSubject,
    /// Label of a verbatim block's closing data line.
    DataLabel,
    /// A single parameter of a verbatim block's closing data line.
    DataParameter,
    /// A raw line inside a verbatim block.
    VerbatimContent,

    // --- Inline delimiters (opening / closing marker of each inline element) ---
    /// Opening marker of a strong span.
    InlineMarkerStrongStart,
    /// Closing marker of a strong span.
    InlineMarkerStrongEnd,
    /// Opening marker of an emphasis span.
    InlineMarkerEmphasisStart,
    /// Closing marker of an emphasis span.
    InlineMarkerEmphasisEnd,
    /// Opening marker of an inline code span.
    InlineMarkerCodeStart,
    /// Closing marker of an inline code span.
    InlineMarkerCodeEnd,
    /// Opening marker of an inline math span.
    InlineMarkerMathStart,
    /// Closing marker of an inline math span.
    InlineMarkerMathEnd,
    /// Opening marker of a reference.
    InlineMarkerRefStart,
    /// Closing marker of a reference.
    InlineMarkerRefEnd,
}
84
85impl LexSemanticTokenKind {
86    /// Returns the semantic token type string for LSP.
87    ///
88    /// These token type names are mapped to standard TextMate scopes in editor configurations
89    /// to ensure compatibility with existing themes (Neovim, VSCode, etc.).
90    ///
91    /// Mapping rationale (based on Lex↔Markdown mapping from lex-babel):
92    /// - Session → Heading → maps to "markup.heading"
93    /// - Definition → Term: Desc → maps to "variable.other.definition"
94    /// - InlineStrong → bold → maps to "markup.bold"
95    /// - InlineEmphasis → *italic* → maps to "markup.italic"
96    /// - InlineCode → `code` → maps to "markup.inline.raw"
97    /// - InlineMath → $math$ → maps to "constant.numeric"
98    /// - Reference → \[citation\] → maps to "markup.underline.link"
99    /// - Verbatim → ```block``` → maps to "markup.raw.block"
100    /// - Annotation → <!-- comment --> → maps to "comment.block"
101    /// - ListMarker → - or 1. → maps to "punctuation.definition.list"
102    pub fn as_str(self) -> &'static str {
103        match self {
104            LexSemanticTokenKind::DocumentTitle => "DocumentTitle",
105            LexSemanticTokenKind::DocumentSubtitle => "DocumentSubtitle",
106            LexSemanticTokenKind::SessionMarker => "SessionMarker",
107            LexSemanticTokenKind::SessionTitleText => "SessionTitleText",
108            LexSemanticTokenKind::DefinitionSubject => "DefinitionSubject",
109            LexSemanticTokenKind::DefinitionContent => "DefinitionContent",
110            LexSemanticTokenKind::ListMarker => "ListMarker",
111            LexSemanticTokenKind::ListItemText => "ListItemText",
112            LexSemanticTokenKind::AnnotationLabel => "AnnotationLabel",
113            LexSemanticTokenKind::AnnotationParameter => "AnnotationParameter",
114            LexSemanticTokenKind::AnnotationContent => "AnnotationContent",
115            LexSemanticTokenKind::InlineStrong => "InlineStrong",
116            LexSemanticTokenKind::InlineEmphasis => "InlineEmphasis",
117            LexSemanticTokenKind::InlineCode => "InlineCode",
118            LexSemanticTokenKind::InlineMath => "InlineMath",
119            LexSemanticTokenKind::Reference => "Reference",
120            LexSemanticTokenKind::ReferenceCitation => "ReferenceCitation",
121            LexSemanticTokenKind::ReferenceFootnote => "ReferenceFootnote",
122            LexSemanticTokenKind::ReferenceAnnotation => "ReferenceAnnotation",
123            LexSemanticTokenKind::VerbatimSubject => "VerbatimSubject",
124            LexSemanticTokenKind::DataLabel => "DataLabel",
125            LexSemanticTokenKind::DataParameter => "DataParameter",
126            LexSemanticTokenKind::VerbatimContent => "VerbatimContent",
127            LexSemanticTokenKind::InlineMarkerStrongStart => "InlineMarker_strong_start",
128            LexSemanticTokenKind::InlineMarkerStrongEnd => "InlineMarker_strong_end",
129            LexSemanticTokenKind::InlineMarkerEmphasisStart => "InlineMarker_emphasis_start",
130            LexSemanticTokenKind::InlineMarkerEmphasisEnd => "InlineMarker_emphasis_end",
131            LexSemanticTokenKind::InlineMarkerCodeStart => "InlineMarker_code_start",
132            LexSemanticTokenKind::InlineMarkerCodeEnd => "InlineMarker_code_end",
133            LexSemanticTokenKind::InlineMarkerMathStart => "InlineMarker_math_start",
134            LexSemanticTokenKind::InlineMarkerMathEnd => "InlineMarker_math_end",
135            LexSemanticTokenKind::InlineMarkerRefStart => "InlineMarker_ref_start",
136            LexSemanticTokenKind::InlineMarkerRefEnd => "InlineMarker_ref_end",
137        }
138    }
139}
140
141pub const SEMANTIC_TOKEN_KINDS: &[LexSemanticTokenKind] = &[
142    LexSemanticTokenKind::DocumentTitle,
143    LexSemanticTokenKind::DocumentSubtitle,
144    LexSemanticTokenKind::SessionMarker,
145    LexSemanticTokenKind::SessionTitleText,
146    LexSemanticTokenKind::DefinitionSubject,
147    LexSemanticTokenKind::DefinitionContent,
148    LexSemanticTokenKind::ListMarker,
149    LexSemanticTokenKind::ListItemText,
150    LexSemanticTokenKind::AnnotationLabel,
151    LexSemanticTokenKind::AnnotationParameter,
152    LexSemanticTokenKind::AnnotationContent,
153    LexSemanticTokenKind::InlineStrong,
154    LexSemanticTokenKind::InlineEmphasis,
155    LexSemanticTokenKind::InlineCode,
156    LexSemanticTokenKind::InlineMath,
157    LexSemanticTokenKind::Reference,
158    LexSemanticTokenKind::ReferenceCitation,
159    LexSemanticTokenKind::ReferenceFootnote,
160    LexSemanticTokenKind::VerbatimSubject,
161    LexSemanticTokenKind::DataLabel,
162    LexSemanticTokenKind::DataParameter,
163    LexSemanticTokenKind::VerbatimContent,
164    LexSemanticTokenKind::InlineMarkerStrongStart,
165    LexSemanticTokenKind::InlineMarkerStrongEnd,
166    LexSemanticTokenKind::InlineMarkerEmphasisStart,
167    LexSemanticTokenKind::InlineMarkerEmphasisEnd,
168    LexSemanticTokenKind::InlineMarkerCodeStart,
169    LexSemanticTokenKind::InlineMarkerCodeEnd,
170    LexSemanticTokenKind::InlineMarkerMathStart,
171    LexSemanticTokenKind::InlineMarkerMathEnd,
172    LexSemanticTokenKind::InlineMarkerRefStart,
173    LexSemanticTokenKind::InlineMarkerRefEnd,
174    LexSemanticTokenKind::ReferenceAnnotation,
175];
176
177#[derive(Debug, Clone, PartialEq)]
178pub struct LexSemanticToken {
179    pub kind: LexSemanticTokenKind,
180    pub range: Range,
181}
182
183pub fn collect_semantic_tokens(document: &Document) -> Vec<LexSemanticToken> {
184    let mut collector = TokenCollector::new();
185    collector.process_document(document);
186    collector.finish()
187}
188
189struct TokenCollector {
190    tokens: Vec<LexSemanticToken>,
191    in_annotation: bool,
192    in_definition: bool,
193}
194
195impl TokenCollector {
196    fn new() -> Self {
197        Self {
198            tokens: Vec::new(),
199            in_annotation: false,
200            in_definition: false,
201        }
202    }
203
204    fn finish(mut self) -> Vec<LexSemanticToken> {
205        self.tokens.sort_by(|a, b| {
206            let a_start = (
207                &a.range.start.line,
208                &a.range.start.column,
209                &a.range.end.line,
210                &a.range.end.column,
211            );
212            let b_start = (
213                &b.range.start.line,
214                &b.range.start.column,
215                &b.range.end.line,
216                &b.range.end.column,
217            );
218            a_start.cmp(&b_start)
219        });
220        self.tokens
221    }
222
223    fn push_range(&mut self, range: &Range, kind: LexSemanticTokenKind) {
224        if range.span.start < range.span.end {
225            self.tokens.push(LexSemanticToken {
226                kind,
227                range: range.clone(),
228            });
229        }
230    }
231
232    fn process_document(&mut self, document: &Document) {
233        self.process_annotations(document.annotations());
234        if let Some(title) = &document.title {
235            if let Some(title_loc) = &title.content.location {
236                self.push_range(title_loc, LexSemanticTokenKind::DocumentTitle);
237            } else {
238                self.push_range(&title.location, LexSemanticTokenKind::DocumentTitle);
239            }
240            self.process_text_content(&title.content);
241            if let Some(subtitle) = &title.subtitle {
242                if let Some(sub_loc) = &subtitle.location {
243                    self.push_range(sub_loc, LexSemanticTokenKind::DocumentSubtitle);
244                }
245                self.process_text_content(subtitle);
246            }
247        }
248        self.process_session(&document.root, LexSemanticTokenKind::SessionTitleText);
249    }
250
251    fn process_session(&mut self, session: &Session, title_kind: LexSemanticTokenKind) {
252        // Emit separate tokens for marker and title text
253        if let Some(marker) = &session.marker {
254            // Emit SessionMarker token for the sequence marker
255            self.push_range(&marker.location, LexSemanticTokenKind::SessionMarker);
256        }
257
258        // Emit SessionTitleText token for the title text (without marker)
259        // Create a range for the title text by using the full title location
260        // and adjusting if there's a marker
261        if let Some(header) = session.header_location() {
262            if let Some(marker) = &session.marker {
263                // Calculate the title text range (after the marker)
264                let marker_text = marker.as_str();
265                let full_title = session.full_title();
266
267                // Find where the marker ends in the title
268                if let Some(pos) = full_title.find(marker_text) {
269                    let marker_end = pos + marker_text.len();
270                    // Skip whitespace after marker
271                    let title_start = full_title[marker_end..]
272                        .chars()
273                        .position(|c| !c.is_whitespace())
274                        .map(|p| marker_end + p)
275                        .unwrap_or(marker_end);
276
277                    if title_start < full_title.len() {
278                        // Create range for title text only
279                        use lex_core::lex::ast::Position;
280                        let title_text_range = Range::new(
281                            header.span.start + title_start..header.span.end,
282                            Position::new(header.start.line, header.start.column + title_start),
283                            header.end,
284                        );
285                        self.push_range(&title_text_range, title_kind);
286                    }
287                }
288            } else {
289                // No marker, the entire header is title text
290                self.push_range(header, title_kind);
291            }
292        }
293
294        self.process_text_content(&session.title);
295
296        self.process_annotations(session.annotations());
297        for child in session.children.iter() {
298            self.process_content_item(child);
299        }
300    }
301
302    fn process_content_item(&mut self, item: &ContentItem) {
303        match item {
304            ContentItem::Paragraph(paragraph) => self.process_paragraph(paragraph),
305            ContentItem::Session(session) => {
306                self.process_session(session, LexSemanticTokenKind::SessionTitleText)
307            }
308            ContentItem::List(list) => self.process_list(list),
309            ContentItem::ListItem(list_item) => self.process_list_item(list_item),
310            ContentItem::Definition(definition) => self.process_definition(definition),
311            ContentItem::Annotation(annotation) => self.process_annotation(annotation),
312            ContentItem::VerbatimBlock(verbatim) => self.process_verbatim(verbatim),
313            ContentItem::Table(table) => self.process_table(table),
314            ContentItem::TextLine(text_line) => self.process_text_content(&text_line.content),
315            ContentItem::VerbatimLine(_) => {}
316            ContentItem::BlankLineGroup(_) => {}
317        }
318    }
319
320    fn process_paragraph(&mut self, paragraph: &Paragraph) {
321        for line in &paragraph.lines {
322            if let ContentItem::TextLine(text_line) = line {
323                // Don't emit full-line tokens for DefinitionContent or AnnotationContent
324                // as they overlap with inline tokens. The context is already clear from
325                // the DefinitionSubject and AnnotationLabel tokens.
326                self.process_text_content(&text_line.content);
327            }
328        }
329        self.process_annotations(paragraph.annotations());
330    }
331
332    fn process_list(&mut self, list: &List) {
333        self.process_annotations(list.annotations());
334        for item in list.items.iter() {
335            if let ContentItem::ListItem(list_item) = item {
336                self.process_list_item(list_item);
337            }
338        }
339    }
340
341    fn process_list_item(&mut self, list_item: &ListItem) {
342        if let Some(marker_range) = &list_item.marker.location {
343            self.push_range(marker_range, LexSemanticTokenKind::ListMarker);
344        }
345        for text in &list_item.text {
346            if let Some(location) = &text.location {
347                self.push_range(location, LexSemanticTokenKind::ListItemText);
348            }
349            self.process_text_content(text);
350        }
351        self.process_annotations(list_item.annotations());
352        for child in list_item.children.iter() {
353            self.process_content_item(child);
354        }
355    }
356
357    fn process_definition(&mut self, definition: &Definition) {
358        if let Some(header) = definition.header_location() {
359            self.push_range(header, LexSemanticTokenKind::DefinitionSubject);
360        }
361        self.process_text_content(&definition.subject);
362        self.process_annotations(definition.annotations());
363        let was_in_definition = self.in_definition;
364        self.in_definition = true;
365        for child in definition.children.iter() {
366            self.process_content_item(child);
367        }
368        self.in_definition = was_in_definition;
369    }
370
371    fn process_verbatim(&mut self, verbatim: &Verbatim) {
372        for group in verbatim.group() {
373            self.process_text_content(group.subject);
374            if let Some(location) = &group.subject.location {
375                self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
376            }
377            for child in group.children {
378                if let ContentItem::VerbatimLine(line) = child {
379                    self.push_range(&line.location, LexSemanticTokenKind::VerbatimContent);
380                }
381            }
382        }
383
384        self.push_range(
385            &verbatim.closing_data.label.location,
386            LexSemanticTokenKind::DataLabel,
387        );
388        for parameter in &verbatim.closing_data.parameters {
389            self.push_range(&parameter.location, LexSemanticTokenKind::DataParameter);
390        }
391
392        self.process_annotations(verbatim.annotations());
393    }
394
395    fn process_table(&mut self, table: &Table) {
396        self.process_text_content(&table.subject);
397        if let Some(location) = &table.subject.location {
398            self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
399        }
400
401        // Process cell content: inline text and block children
402        for row in table.all_rows() {
403            for cell in &row.cells {
404                self.process_text_content(&cell.content);
405                for child in cell.children.iter() {
406                    self.process_content_item(child);
407                }
408            }
409        }
410
411        // Table config annotations are in table.annotations — processed below
412        // by process_annotations()
413
414        self.process_annotations(table.annotations());
415    }
416
417    fn process_annotation(&mut self, annotation: &Annotation) {
418        self.push_range(
419            annotation.header_location(),
420            LexSemanticTokenKind::AnnotationLabel,
421        );
422        for parameter in &annotation.data.parameters {
423            self.push_range(
424                &parameter.location,
425                LexSemanticTokenKind::AnnotationParameter,
426            );
427        }
428        let was_in_annotation = self.in_annotation;
429        self.in_annotation = true;
430        for child in annotation.children.iter() {
431            self.process_content_item(child);
432        }
433        self.in_annotation = was_in_annotation;
434    }
435
436    fn process_annotations(&mut self, annotations: &[Annotation]) {
437        for annotation in annotations {
438            self.process_annotation(annotation);
439        }
440    }
441
442    fn process_text_content(&mut self, text: &TextContent) {
443        let mut emitter = InlineTokenEmitter {
444            tokens: &mut self.tokens,
445            in_annotation: self.in_annotation,
446            in_definition: self.in_definition,
447            in_formatted: 0,
448        };
449        walk_text_content_positions(text, &mut emitter);
450    }
451}
452
453/// Visitor that converts inline-tree positions into [`LexSemanticToken`]s.
454///
455/// Bridges the shared inline-position walker (which only tracks structural
456/// ranges) with the editor-facing semantic-token taxonomy. Marker tokens
457/// (`InlineMarker_*_start`/`_end`) are emitted on every container/literal
458/// boundary; content tokens take the appropriate kind for the surrounding
459/// element, including reference-type discrimination
460/// (`ReferenceCitation`/`ReferenceFootnote`/`ReferenceAnnotation`/
461/// `Reference`). Plain text inside a Strong/Emphasis container is suppressed
462/// (`in_formatted > 0`) because the container's content span already covers
463/// it.
464struct InlineTokenEmitter<'a> {
465    tokens: &'a mut Vec<LexSemanticToken>,
466    in_annotation: bool,
467    in_definition: bool,
468    in_formatted: usize,
469}
470
471impl<'a> InlineTokenEmitter<'a> {
472    fn push(&mut self, range: &Range, kind: LexSemanticTokenKind) {
473        if range.span.start < range.span.end {
474            self.tokens.push(LexSemanticToken {
475                kind,
476                range: range.clone(),
477            });
478        }
479    }
480}
481
482impl<'a> InlinePositionVisitor for InlineTokenEmitter<'a> {
483    fn visit_plain(&mut self, range: &Range, _text: &str) {
484        if self.in_formatted > 0 {
485            // Covered by the enclosing container's content span — see leave_strong/leave_emphasis.
486            return;
487        }
488        let kind = if self.in_annotation {
489            LexSemanticTokenKind::AnnotationContent
490        } else if self.in_definition {
491            LexSemanticTokenKind::DefinitionContent
492        } else {
493            return;
494        };
495        self.push(range, kind);
496    }
497
498    fn enter_strong(&mut self, open_marker: &Range) {
499        self.push(open_marker, LexSemanticTokenKind::InlineMarkerStrongStart);
500        self.in_formatted += 1;
501    }
502
503    fn leave_strong(&mut self, content: &Range, close_marker: &Range) {
504        self.in_formatted -= 1;
505        self.push(content, LexSemanticTokenKind::InlineStrong);
506        self.push(close_marker, LexSemanticTokenKind::InlineMarkerStrongEnd);
507    }
508
509    fn enter_emphasis(&mut self, open_marker: &Range) {
510        self.push(open_marker, LexSemanticTokenKind::InlineMarkerEmphasisStart);
511        self.in_formatted += 1;
512    }
513
514    fn leave_emphasis(&mut self, content: &Range, close_marker: &Range) {
515        self.in_formatted -= 1;
516        self.push(content, LexSemanticTokenKind::InlineEmphasis);
517        self.push(close_marker, LexSemanticTokenKind::InlineMarkerEmphasisEnd);
518    }
519
520    fn visit_code(
521        &mut self,
522        open_marker: &Range,
523        content: &Range,
524        close_marker: &Range,
525        _text: &str,
526    ) {
527        self.push(open_marker, LexSemanticTokenKind::InlineMarkerCodeStart);
528        self.push(content, LexSemanticTokenKind::InlineCode);
529        self.push(close_marker, LexSemanticTokenKind::InlineMarkerCodeEnd);
530    }
531
532    fn visit_math(
533        &mut self,
534        open_marker: &Range,
535        content: &Range,
536        close_marker: &Range,
537        _text: &str,
538    ) {
539        self.push(open_marker, LexSemanticTokenKind::InlineMarkerMathStart);
540        self.push(content, LexSemanticTokenKind::InlineMath);
541        self.push(close_marker, LexSemanticTokenKind::InlineMarkerMathEnd);
542    }
543
544    fn visit_reference(
545        &mut self,
546        open_marker: &Range,
547        content: &Range,
548        close_marker: &Range,
549        data: &ReferenceInline,
550    ) {
551        self.push(open_marker, LexSemanticTokenKind::InlineMarkerRefStart);
552        let ref_kind = match &data.reference_type {
553            ReferenceType::Citation(_) => LexSemanticTokenKind::ReferenceCitation,
554            ReferenceType::FootnoteNumber { .. } => LexSemanticTokenKind::ReferenceFootnote,
555            ReferenceType::AnnotationReference { .. } => LexSemanticTokenKind::ReferenceAnnotation,
556            _ => LexSemanticTokenKind::Reference,
557        };
558        self.push(content, ref_kind);
559        self.push(close_marker, LexSemanticTokenKind::InlineMarkerRefEnd);
560    }
561}
562
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_support::{sample_document, sample_source};
    use lex_core::lex::testing::lexplore::Lexplore;

    /// Returns the source text covered by each token of `kind`, in token order.
    fn snippets(
        tokens: &[LexSemanticToken],
        kind: LexSemanticTokenKind,
        source: &str,
    ) -> Vec<String> {
        tokens
            .iter()
            .filter(|token| token.kind == kind)
            .map(|token| source[token.range.span.clone()].to_string())
            .collect()
    }

    #[test]
    fn collects_structural_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        // Session titles are now split into SessionMarker and SessionTitleText
        assert!(
            snippets(&tokens, LexSemanticTokenKind::SessionMarker, source)
                .iter()
                .any(|snippet| snippet.trim() == "1.")
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::SessionTitleText, source)
                .iter()
                .any(|snippet| snippet.trim() == "Intro")
        );
        // Cache is parsed as VerbatimSubject
        let verbatim_subjects = snippets(&tokens, LexSemanticTokenKind::VerbatimSubject, source);
        assert!(verbatim_subjects
            .iter()
            .any(|snippet| snippet.trim_end() == "Cache"));
        let markers = snippets(&tokens, LexSemanticTokenKind::ListMarker, source);
        assert_eq!(markers.len(), 4);
        // A char predicate makes a whitespace-only snippet fail the assertion rather than
        // panic on an `unwrap` of a missing first character.
        assert!(markers.iter().all(|snippet| snippet
            .trim_start()
            .starts_with(|c: char| c == '-' || c.is_numeric())));
        let annotation_labels = snippets(&tokens, LexSemanticTokenKind::AnnotationLabel, source);
        assert!(annotation_labels
            .iter()
            .any(|snippet| snippet.contains("test.note")));
        let parameters = snippets(&tokens, LexSemanticTokenKind::AnnotationParameter, source);
        assert!(parameters
            .iter()
            .any(|snippet| snippet.contains("severity=info")));
        assert!(verbatim_subjects
            .iter()
            .any(|snippet| snippet.contains("CLI Example")));
        assert!(snippets(&tokens, LexSemanticTokenKind::DataLabel, source)
            .iter()
            .any(|snippet| snippet.contains("shell")));
    }

    #[test]
    fn collects_inline_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        assert!(
            snippets(&tokens, LexSemanticTokenKind::InlineStrong, source)
                .iter()
                .any(|snippet| snippet.contains("Lex"))
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source)
                .iter()
                .any(|snippet| snippet.contains("format"))
        );
        assert!(snippets(&tokens, LexSemanticTokenKind::InlineCode, source)
            .iter()
            .any(|snippet| snippet.contains("code")));
        assert!(snippets(&tokens, LexSemanticTokenKind::InlineMath, source)
            .iter()
            .any(|snippet| snippet.contains("math")));
    }

    #[test]
    fn classifies_references() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        assert!(
            snippets(&tokens, LexSemanticTokenKind::ReferenceCitation, source)
                .iter()
                .any(|snippet| snippet.contains("@spec2025"))
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::ReferenceAnnotation, source)
                .iter()
                .any(|snippet| snippet.contains("::source"))
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::ReferenceFootnote, source)
                .iter()
                .any(|snippet| snippet.contains("1"))
        );
        assert!(snippets(&tokens, LexSemanticTokenKind::Reference, source)
            .iter()
            .any(|snippet| snippet.contains("Cache")));
    }

    #[test]
    fn empty_document_has_no_tokens() {
        let document = Lexplore::benchmark(0)
            .parse()
            .expect("failed to parse empty benchmark fixture");
        let tokens = collect_semantic_tokens(&document);
        assert!(tokens.is_empty());
    }

    #[test]
    fn emits_annotation_content_for_inline_annotation() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        // The fixture starts with `:: test.note severity=info :: Document preface.`
        // "Document preface." is inline annotation content — plain text inside annotation context.
        let annotation_content = snippets(&tokens, LexSemanticTokenKind::AnnotationContent, source);
        assert!(
            annotation_content
                .iter()
                .any(|snippet| snippet.contains("Document preface")),
            "AnnotationContent should be emitted for plain text inside annotations, got: {annotation_content:?}"
        );
    }

    #[test]
    fn annotation_content_excludes_formatted_text() {
        // Inline formatting within annotation context should get its own token type,
        // not AnnotationContent — only Plain nodes emit AnnotationContent.
        let source = ":: test.note :: Some *bold* text.\n";
        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
        let tokens = collect_semantic_tokens(&document);

        let annotation_content = snippets(&tokens, LexSemanticTokenKind::AnnotationContent, source);

        // "Some " and " text." should be AnnotationContent, but "bold" should not
        assert!(
            annotation_content.iter().any(|s| s.contains("Some")),
            "Plain text before formatting should be AnnotationContent"
        );
        assert!(
            annotation_content.iter().any(|s| s.contains("text.")),
            "Plain text after formatting should be AnnotationContent"
        );
        assert!(
            !annotation_content.iter().any(|s| s.contains("bold")),
            "Formatted text should NOT be AnnotationContent"
        );

        // "bold" should be InlineStrong
        let strong = snippets(&tokens, LexSemanticTokenKind::InlineStrong, source);
        assert!(strong.iter().any(|s| s == "bold"));
    }

    #[test]
    fn table_cell_inline_formatting_gets_tokens() {
        let source = "Stats:\n    | *Name* | `code` |\n    | _test_ | #42#   |\n";
        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
        let tokens = collect_semantic_tokens(&document);

        let strong = snippets(&tokens, LexSemanticTokenKind::InlineStrong, source);
        assert!(
            strong.iter().any(|s| s.contains("Name")),
            "Expected InlineStrong for *Name* in table cell, got: {strong:?}"
        );

        let code = snippets(&tokens, LexSemanticTokenKind::InlineCode, source);
        assert!(
            code.iter().any(|s| s.contains("code")),
            "Expected InlineCode for `code` in table cell, got: {code:?}"
        );

        let emphasis = snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source);
        assert!(
            emphasis.iter().any(|s| s.contains("test")),
            "Expected InlineEmphasis for _test_ in table cell, got: {emphasis:?}"
        );

        let math = snippets(&tokens, LexSemanticTokenKind::InlineMath, source);
        assert!(
            math.iter().any(|s| s.contains("42")),
            "Expected InlineMath for #42# in table cell, got: {math:?}"
        );
    }
}