Skip to main content

lex_analysis/
semantic_tokens.rs

1//! This is the semantic token collector, which editors use for syntax highlighting.
2//! It's worth going over the general approach.
3//!
4//! Semantic Tokens and Editor Highlighting Architecture
5//!
6//!     1. LSP emits semantic tokens using our format's native terminology (e.g., `Verbatim`
7//! Annotation, etc). The LSP declares a token legend at initialization and emits tokens as indices
8//! into that legend—it has no knowledge of editor-specific theming.
9//!     2. Editor plugins map our token types to the editor's theme primitives. This lets users
10//! leverage their existing theme choices while our core LSP code remains editor-agnostic.
11//!     
12//! Editor-Specific Mapping
13//!
14//!     VSCode — declarative mapping in `package.json`:
15//!         "semanticTokenScopes": [{
16//!         "language": "ourformat",
17//!         "scopes": {
18//!         "Verbatim": ["markup.inline.raw"],
19//!         "Heading": ["markup.heading"],
20//!         "Emphasis": ["markup.italic"]
21//!         }
22//!         }]
23//!     :: javascript
24//!
25//!     We map to TextMate scopes (`markup.*`) as they have broad theme support and are a natural
26//! fit for markup.
27//!
28//!     Neovim — imperative mapping in the plugin:
29//!         vim.api.nvim_set_hl(0, '@lsp.type.Verbatim', { link = '@markup.raw' })
//!         vim.api.nvim_set_hl(0, '@lsp.type.Heading', { link = '@markup.heading' })
31//!         vim.api.nvim_set_hl(0, '@lsp.type.Emphasis', { link = '@markup.italic' })
32//!     :: lua
33//!     We link to treesitter's `@markup.*` groups for equivalent theme coverage.
34//!     Benefits:
35//!         - LSP speaks our format's semantics—no impedance mismatch
36//!         - Users get syntax highlighting that respects their theme
37//!         - Mapping logic is isolated to editor plugins; adding a new editor doesn't touch the LSP
38//!
//! The file editors/vscode/themes/lex-light.json has the recommended theming for Lex to be used in
//! tests and so forth.
41use lex_core::lex::ast::{
42    Annotation, ContentItem, Definition, Document, List, ListItem, Paragraph, Position, Range,
43    Session, Table, TextContent, Verbatim,
44};
45use lex_core::lex::inlines::{InlineNode, ReferenceType};
46
/// Kinds of semantic tokens produced for editor syntax highlighting.
///
/// The string form of each variant (see [`LexSemanticTokenKind::as_str`]) is the
/// token type name declared in the LSP legend; editor plugins map those names
/// onto their theme primitives (see the module docs).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LexSemanticTokenKind {
    // Document header: title line and optional subtitle.
    DocumentTitle,
    DocumentSubtitle,
    // Session headers: the sequence marker (e.g. "1.") vs. the title text after it.
    SessionMarker,
    SessionTitleText,
    // Definitions: the subject header vs. plain text inside the definition body.
    DefinitionSubject,
    DefinitionContent,
    // Lists: the bullet/number marker vs. the item's text.
    ListMarker,
    ListItemText,
    // Annotations: label, data parameters, and plain text inside the annotation.
    AnnotationLabel,
    AnnotationParameter,
    AnnotationContent,
    // Inline formatting content spans (the text between delimiter markers).
    InlineStrong,
    InlineEmphasis,
    InlineCode,
    InlineMath,
    // References, classified by reference type (see `walk_reference`).
    Reference,
    ReferenceCitation,
    ReferenceFootnote,
    // Verbatim blocks: subject line, closing-data label/parameters, and body lines.
    VerbatimSubject,
    DataLabel,
    DataParameter,
    VerbatimContent,
    // The inline delimiter characters themselves (e.g. the '*' around strong text),
    // emitted separately so editors can dim or hide them.
    InlineMarkerStrongStart,
    InlineMarkerStrongEnd,
    InlineMarkerEmphasisStart,
    InlineMarkerEmphasisEnd,
    InlineMarkerCodeStart,
    InlineMarkerCodeEnd,
    InlineMarkerMathStart,
    InlineMarkerMathEnd,
    InlineMarkerRefStart,
    InlineMarkerRefEnd,
}
82
83impl LexSemanticTokenKind {
84    /// Returns the semantic token type string for LSP.
85    ///
86    /// These token type names are mapped to standard TextMate scopes in editor configurations
87    /// to ensure compatibility with existing themes (Neovim, VSCode, etc.).
88    ///
89    /// Mapping rationale (based on Lex↔Markdown mapping from lex-babel):
90    /// - Session → Heading → maps to "markup.heading"
91    /// - Definition → Term: Desc → maps to "variable.other.definition"
92    /// - InlineStrong → bold → maps to "markup.bold"
93    /// - InlineEmphasis → *italic* → maps to "markup.italic"
94    /// - InlineCode → `code` → maps to "markup.inline.raw"
95    /// - InlineMath → $math$ → maps to "constant.numeric"
96    /// - Reference → \[citation\] → maps to "markup.underline.link"
97    /// - Verbatim → ```block``` → maps to "markup.raw.block"
98    /// - Annotation → <!-- comment --> → maps to "comment.block"
99    /// - ListMarker → - or 1. → maps to "punctuation.definition.list"
100    pub fn as_str(self) -> &'static str {
101        match self {
102            LexSemanticTokenKind::DocumentTitle => "DocumentTitle",
103            LexSemanticTokenKind::DocumentSubtitle => "DocumentSubtitle",
104            LexSemanticTokenKind::SessionMarker => "SessionMarker",
105            LexSemanticTokenKind::SessionTitleText => "SessionTitleText",
106            LexSemanticTokenKind::DefinitionSubject => "DefinitionSubject",
107            LexSemanticTokenKind::DefinitionContent => "DefinitionContent",
108            LexSemanticTokenKind::ListMarker => "ListMarker",
109            LexSemanticTokenKind::ListItemText => "ListItemText",
110            LexSemanticTokenKind::AnnotationLabel => "AnnotationLabel",
111            LexSemanticTokenKind::AnnotationParameter => "AnnotationParameter",
112            LexSemanticTokenKind::AnnotationContent => "AnnotationContent",
113            LexSemanticTokenKind::InlineStrong => "InlineStrong",
114            LexSemanticTokenKind::InlineEmphasis => "InlineEmphasis",
115            LexSemanticTokenKind::InlineCode => "InlineCode",
116            LexSemanticTokenKind::InlineMath => "InlineMath",
117            LexSemanticTokenKind::Reference => "Reference",
118            LexSemanticTokenKind::ReferenceCitation => "ReferenceCitation",
119            LexSemanticTokenKind::ReferenceFootnote => "ReferenceFootnote",
120            LexSemanticTokenKind::VerbatimSubject => "VerbatimSubject",
121            LexSemanticTokenKind::DataLabel => "DataLabel",
122            LexSemanticTokenKind::DataParameter => "DataParameter",
123            LexSemanticTokenKind::VerbatimContent => "VerbatimContent",
124            LexSemanticTokenKind::InlineMarkerStrongStart => "InlineMarker_strong_start",
125            LexSemanticTokenKind::InlineMarkerStrongEnd => "InlineMarker_strong_end",
126            LexSemanticTokenKind::InlineMarkerEmphasisStart => "InlineMarker_emphasis_start",
127            LexSemanticTokenKind::InlineMarkerEmphasisEnd => "InlineMarker_emphasis_end",
128            LexSemanticTokenKind::InlineMarkerCodeStart => "InlineMarker_code_start",
129            LexSemanticTokenKind::InlineMarkerCodeEnd => "InlineMarker_code_end",
130            LexSemanticTokenKind::InlineMarkerMathStart => "InlineMarker_math_start",
131            LexSemanticTokenKind::InlineMarkerMathEnd => "InlineMarker_math_end",
132            LexSemanticTokenKind::InlineMarkerRefStart => "InlineMarker_ref_start",
133            LexSemanticTokenKind::InlineMarkerRefEnd => "InlineMarker_ref_end",
134        }
135    }
136}
137
/// Every token kind, in declaration order.
///
/// This slice is the LSP semantic-token legend: per the module docs, tokens
/// are emitted as indices into the legend, so the order here must stay stable
/// and the list must include every `LexSemanticTokenKind` variant.
pub const SEMANTIC_TOKEN_KINDS: &[LexSemanticTokenKind] = &[
    LexSemanticTokenKind::DocumentTitle,
    LexSemanticTokenKind::DocumentSubtitle,
    LexSemanticTokenKind::SessionMarker,
    LexSemanticTokenKind::SessionTitleText,
    LexSemanticTokenKind::DefinitionSubject,
    LexSemanticTokenKind::DefinitionContent,
    LexSemanticTokenKind::ListMarker,
    LexSemanticTokenKind::ListItemText,
    LexSemanticTokenKind::AnnotationLabel,
    LexSemanticTokenKind::AnnotationParameter,
    LexSemanticTokenKind::AnnotationContent,
    LexSemanticTokenKind::InlineStrong,
    LexSemanticTokenKind::InlineEmphasis,
    LexSemanticTokenKind::InlineCode,
    LexSemanticTokenKind::InlineMath,
    LexSemanticTokenKind::Reference,
    LexSemanticTokenKind::ReferenceCitation,
    LexSemanticTokenKind::ReferenceFootnote,
    LexSemanticTokenKind::VerbatimSubject,
    LexSemanticTokenKind::DataLabel,
    LexSemanticTokenKind::DataParameter,
    LexSemanticTokenKind::VerbatimContent,
    LexSemanticTokenKind::InlineMarkerStrongStart,
    LexSemanticTokenKind::InlineMarkerStrongEnd,
    LexSemanticTokenKind::InlineMarkerEmphasisStart,
    LexSemanticTokenKind::InlineMarkerEmphasisEnd,
    LexSemanticTokenKind::InlineMarkerCodeStart,
    LexSemanticTokenKind::InlineMarkerCodeEnd,
    LexSemanticTokenKind::InlineMarkerMathStart,
    LexSemanticTokenKind::InlineMarkerMathEnd,
    LexSemanticTokenKind::InlineMarkerRefStart,
    LexSemanticTokenKind::InlineMarkerRefEnd,
];
172
/// A single semantic token: a token kind plus the source range it covers.
#[derive(Debug, Clone, PartialEq)]
pub struct LexSemanticToken {
    /// What the covered text is (heading, marker, inline code, ...).
    pub kind: LexSemanticTokenKind,
    /// Source location of the token in the document.
    pub range: Range,
}
178
179pub fn collect_semantic_tokens(document: &Document) -> Vec<LexSemanticToken> {
180    let mut collector = TokenCollector::new();
181    collector.process_document(document);
182    collector.finish()
183}
184
/// Accumulates tokens while walking the AST.
///
/// The two flags track whether the walk is currently inside an annotation or a
/// definition; plain inline text in those contexts is emitted as
/// `AnnotationContent` / `DefinitionContent` (see `InlineWalker::walk_plain`).
struct TokenCollector {
    tokens: Vec<LexSemanticToken>,
    // True while processing children of an Annotation.
    in_annotation: bool,
    // True while processing children of a Definition.
    in_definition: bool,
}
190
191impl TokenCollector {
192    fn new() -> Self {
193        Self {
194            tokens: Vec::new(),
195            in_annotation: false,
196            in_definition: false,
197        }
198    }
199
200    fn finish(mut self) -> Vec<LexSemanticToken> {
201        self.tokens.sort_by(|a, b| {
202            let a_start = (
203                &a.range.start.line,
204                &a.range.start.column,
205                &a.range.end.line,
206                &a.range.end.column,
207            );
208            let b_start = (
209                &b.range.start.line,
210                &b.range.start.column,
211                &b.range.end.line,
212                &b.range.end.column,
213            );
214            a_start.cmp(&b_start)
215        });
216        self.tokens
217    }
218
219    fn push_range(&mut self, range: &Range, kind: LexSemanticTokenKind) {
220        if range.span.start < range.span.end {
221            self.tokens.push(LexSemanticToken {
222                kind,
223                range: range.clone(),
224            });
225        }
226    }
227
228    fn process_document(&mut self, document: &Document) {
229        self.process_annotations(document.annotations());
230        if let Some(title) = &document.title {
231            if let Some(title_loc) = &title.content.location {
232                self.push_range(title_loc, LexSemanticTokenKind::DocumentTitle);
233            } else {
234                self.push_range(&title.location, LexSemanticTokenKind::DocumentTitle);
235            }
236            self.process_text_content(&title.content);
237            if let Some(subtitle) = &title.subtitle {
238                if let Some(sub_loc) = &subtitle.location {
239                    self.push_range(sub_loc, LexSemanticTokenKind::DocumentSubtitle);
240                }
241                self.process_text_content(subtitle);
242            }
243        }
244        self.process_session(&document.root, LexSemanticTokenKind::SessionTitleText);
245    }
246
247    fn process_session(&mut self, session: &Session, title_kind: LexSemanticTokenKind) {
248        // Emit separate tokens for marker and title text
249        if let Some(marker) = &session.marker {
250            // Emit SessionMarker token for the sequence marker
251            self.push_range(&marker.location, LexSemanticTokenKind::SessionMarker);
252        }
253
254        // Emit SessionTitleText token for the title text (without marker)
255        // Create a range for the title text by using the full title location
256        // and adjusting if there's a marker
257        if let Some(header) = session.header_location() {
258            if let Some(marker) = &session.marker {
259                // Calculate the title text range (after the marker)
260                let marker_text = marker.as_str();
261                let full_title = session.full_title();
262
263                // Find where the marker ends in the title
264                if let Some(pos) = full_title.find(marker_text) {
265                    let marker_end = pos + marker_text.len();
266                    // Skip whitespace after marker
267                    let title_start = full_title[marker_end..]
268                        .chars()
269                        .position(|c| !c.is_whitespace())
270                        .map(|p| marker_end + p)
271                        .unwrap_or(marker_end);
272
273                    if title_start < full_title.len() {
274                        // Create range for title text only
275                        use lex_core::lex::ast::Position;
276                        let title_text_range = Range::new(
277                            header.span.start + title_start..header.span.end,
278                            Position::new(header.start.line, header.start.column + title_start),
279                            header.end,
280                        );
281                        self.push_range(&title_text_range, title_kind);
282                    }
283                }
284            } else {
285                // No marker, the entire header is title text
286                self.push_range(header, title_kind);
287            }
288        }
289
290        self.process_text_content(&session.title);
291
292        self.process_annotations(session.annotations());
293        for child in session.children.iter() {
294            self.process_content_item(child);
295        }
296    }
297
298    fn process_content_item(&mut self, item: &ContentItem) {
299        match item {
300            ContentItem::Paragraph(paragraph) => self.process_paragraph(paragraph),
301            ContentItem::Session(session) => {
302                self.process_session(session, LexSemanticTokenKind::SessionTitleText)
303            }
304            ContentItem::List(list) => self.process_list(list),
305            ContentItem::ListItem(list_item) => self.process_list_item(list_item),
306            ContentItem::Definition(definition) => self.process_definition(definition),
307            ContentItem::Annotation(annotation) => self.process_annotation(annotation),
308            ContentItem::VerbatimBlock(verbatim) => self.process_verbatim(verbatim),
309            ContentItem::Table(table) => self.process_table(table),
310            ContentItem::TextLine(text_line) => self.process_text_content(&text_line.content),
311            ContentItem::VerbatimLine(_) => {}
312            ContentItem::BlankLineGroup(_) => {}
313        }
314    }
315
316    fn process_paragraph(&mut self, paragraph: &Paragraph) {
317        for line in &paragraph.lines {
318            if let ContentItem::TextLine(text_line) = line {
319                // Don't emit full-line tokens for DefinitionContent or AnnotationContent
320                // as they overlap with inline tokens. The context is already clear from
321                // the DefinitionSubject and AnnotationLabel tokens.
322                self.process_text_content(&text_line.content);
323            }
324        }
325        self.process_annotations(paragraph.annotations());
326    }
327
328    fn process_list(&mut self, list: &List) {
329        self.process_annotations(list.annotations());
330        for item in list.items.iter() {
331            if let ContentItem::ListItem(list_item) = item {
332                self.process_list_item(list_item);
333            }
334        }
335    }
336
337    fn process_list_item(&mut self, list_item: &ListItem) {
338        if let Some(marker_range) = &list_item.marker.location {
339            self.push_range(marker_range, LexSemanticTokenKind::ListMarker);
340        }
341        for text in &list_item.text {
342            if let Some(location) = &text.location {
343                self.push_range(location, LexSemanticTokenKind::ListItemText);
344            }
345            self.process_text_content(text);
346        }
347        self.process_annotations(list_item.annotations());
348        for child in list_item.children.iter() {
349            self.process_content_item(child);
350        }
351    }
352
353    fn process_definition(&mut self, definition: &Definition) {
354        if let Some(header) = definition.header_location() {
355            self.push_range(header, LexSemanticTokenKind::DefinitionSubject);
356        }
357        self.process_text_content(&definition.subject);
358        self.process_annotations(definition.annotations());
359        let was_in_definition = self.in_definition;
360        self.in_definition = true;
361        for child in definition.children.iter() {
362            self.process_content_item(child);
363        }
364        self.in_definition = was_in_definition;
365    }
366
367    fn process_verbatim(&mut self, verbatim: &Verbatim) {
368        for group in verbatim.group() {
369            self.process_text_content(group.subject);
370            if let Some(location) = &group.subject.location {
371                self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
372            }
373            for child in group.children {
374                if let ContentItem::VerbatimLine(line) = child {
375                    self.push_range(&line.location, LexSemanticTokenKind::VerbatimContent);
376                }
377            }
378        }
379
380        self.push_range(
381            &verbatim.closing_data.label.location,
382            LexSemanticTokenKind::DataLabel,
383        );
384        for parameter in &verbatim.closing_data.parameters {
385            self.push_range(&parameter.location, LexSemanticTokenKind::DataParameter);
386        }
387
388        self.process_annotations(verbatim.annotations());
389    }
390
391    fn process_table(&mut self, table: &Table) {
392        self.process_text_content(&table.subject);
393        if let Some(location) = &table.subject.location {
394            self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
395        }
396
397        // Process cell content: inline text and block children
398        for row in table.all_rows() {
399            for cell in &row.cells {
400                self.process_text_content(&cell.content);
401                for child in cell.children.iter() {
402                    self.process_content_item(child);
403                }
404            }
405        }
406
407        // Table config annotations are in table.annotations — processed below
408        // by process_annotations()
409
410        self.process_annotations(table.annotations());
411    }
412
413    fn process_annotation(&mut self, annotation: &Annotation) {
414        self.push_range(
415            annotation.header_location(),
416            LexSemanticTokenKind::AnnotationLabel,
417        );
418        for parameter in &annotation.data.parameters {
419            self.push_range(
420                &parameter.location,
421                LexSemanticTokenKind::AnnotationParameter,
422            );
423        }
424        let was_in_annotation = self.in_annotation;
425        self.in_annotation = true;
426        for child in annotation.children.iter() {
427            self.process_content_item(child);
428        }
429        self.in_annotation = was_in_annotation;
430    }
431
432    fn process_annotations(&mut self, annotations: &[Annotation]) {
433        for annotation in annotations {
434            self.process_annotation(annotation);
435        }
436    }
437
438    fn process_text_content(&mut self, text: &TextContent) {
439        let Some(base_range) = text.location.as_ref() else {
440            return;
441        };
442        let raw = text.as_string();
443        if raw.is_empty() {
444            return;
445        }
446        let nodes = text.inline_items();
447        let mut walker = InlineWalker {
448            raw,
449            base_range,
450            cursor: 0,
451            tokens: &mut self.tokens,
452            in_annotation: self.in_annotation,
453            in_definition: self.in_definition,
454            in_formatted: false,
455        };
456        walker.walk_nodes(&nodes);
457    }
458}
459
/// Walks the InlineNode tree and raw text in parallel to produce positioned semantic tokens.
///
/// The inline parser consumes escape sequences and delimiters, so InlineNode text doesn't
/// directly correspond to byte offsets in the raw source. This walker maintains a cursor
/// into the raw text and advances it according to the same rules the inline parser uses,
/// producing correctly positioned Range values for each token.
struct InlineWalker<'a> {
    /// The raw (still-escaped) source text of the inline run being walked.
    raw: &'a str,
    /// Location of `raw` within the document; emitted ranges are offset from it.
    base_range: &'a Range,
    /// Current byte offset into `raw`.
    cursor: usize,
    /// Output sink shared with the owning `TokenCollector`.
    tokens: &'a mut Vec<LexSemanticToken>,
    /// True when the enclosing block is an annotation (plain text → AnnotationContent).
    in_annotation: bool,
    /// True when the enclosing block is a definition (plain text → DefinitionContent).
    in_definition: bool,
    /// True when inside a formatting container (Strong/Emphasis). Plain text inside
    /// containers is covered by the container's content span, so context-dependent
    /// tokens (AnnotationContent, DefinitionContent) are suppressed.
    in_formatted: bool,
}
478
impl<'a> InlineWalker<'a> {
    /// Walk each node in order, advancing the cursor through `raw`.
    fn walk_nodes(&mut self, nodes: &[InlineNode]) {
        for node in nodes {
            self.walk_node(node);
        }
    }

    /// Dispatch one node by shape: containers (Strong/Emphasis) may nest and
    /// take escape processing; Code/Math/Reference content is literal.
    fn walk_node(&mut self, node: &InlineNode) {
        match node {
            InlineNode::Plain { text, .. } => self.walk_plain(text),
            InlineNode::Strong { content, .. } => self.walk_container(
                content,
                '*',
                LexSemanticTokenKind::InlineStrong,
                LexSemanticTokenKind::InlineMarkerStrongStart,
                LexSemanticTokenKind::InlineMarkerStrongEnd,
            ),
            InlineNode::Emphasis { content, .. } => self.walk_container(
                content,
                '_',
                LexSemanticTokenKind::InlineEmphasis,
                LexSemanticTokenKind::InlineMarkerEmphasisStart,
                LexSemanticTokenKind::InlineMarkerEmphasisEnd,
            ),
            InlineNode::Code { text, .. } => self.walk_literal(
                text,
                '`',
                LexSemanticTokenKind::InlineCode,
                LexSemanticTokenKind::InlineMarkerCodeStart,
                LexSemanticTokenKind::InlineMarkerCodeEnd,
            ),
            InlineNode::Math { text, .. } => self.walk_literal(
                text,
                '#',
                LexSemanticTokenKind::InlineMath,
                LexSemanticTokenKind::InlineMarkerMathStart,
                LexSemanticTokenKind::InlineMarkerMathEnd,
            ),
            InlineNode::Reference { data, .. } => self.walk_reference(data),
        }
    }

    /// Walk a Plain text node, advancing cursor through escape sequences in raw text.
    /// Emits AnnotationContent or DefinitionContent when inside those contexts.
    fn walk_plain(&mut self, text: &str) {
        let start = self.cursor;
        self.advance_unescaped(text);
        let end = self.cursor;

        if start < end {
            let kind = if self.in_formatted {
                None // Covered by the container's content span
            } else if self.in_annotation {
                Some(LexSemanticTokenKind::AnnotationContent)
            } else if self.in_definition {
                Some(LexSemanticTokenKind::DefinitionContent)
            } else {
                // Plain text outside any special context gets no token.
                None
            };
            if let Some(kind) = kind {
                self.push(self.make_range(start, end), kind);
            }
        }
    }

    /// Walk a container node (Strong/Emphasis) which has an opening marker, children, and closing marker.
    fn walk_container(
        &mut self,
        content: &[InlineNode],
        marker: char,
        content_kind: LexSemanticTokenKind,
        start_marker_kind: LexSemanticTokenKind,
        end_marker_kind: LexSemanticTokenKind,
    ) {
        // NOTE(review): assumes the raw text at the cursor really is the
        // delimiter — relies on the walker mirroring the parser exactly.
        let marker_len = marker.len_utf8();

        // Opening marker
        let marker_start = self.cursor;
        self.cursor += marker_len;
        self.push(
            self.make_range(marker_start, self.cursor),
            start_marker_kind,
        );

        // Recurse into children — record span boundaries for the content token
        let content_start = self.cursor;
        let was_in_formatted = self.in_formatted;
        self.in_formatted = true;
        self.walk_nodes(content);
        self.in_formatted = was_in_formatted;
        let content_end = self.cursor;

        // Emit a single content span covering all children
        if content_start < content_end {
            self.push(self.make_range(content_start, content_end), content_kind);
        }

        // Closing marker
        let close_start = self.cursor;
        self.cursor += marker_len;
        self.push(self.make_range(close_start, self.cursor), end_marker_kind);
    }

    /// Walk a literal node (Code/Math) — no escape processing inside.
    fn walk_literal(
        &mut self,
        text: &str,
        marker: char,
        content_kind: LexSemanticTokenKind,
        start_marker_kind: LexSemanticTokenKind,
        end_marker_kind: LexSemanticTokenKind,
    ) {
        let marker_len = marker.len_utf8();

        // Opening marker
        let marker_start = self.cursor;
        self.cursor += marker_len;
        self.push(
            self.make_range(marker_start, self.cursor),
            start_marker_kind,
        );

        // Literal content (verbatim, no escape processing)
        let content_start = self.cursor;
        self.cursor += text.len();
        if content_start < self.cursor {
            self.push(self.make_range(content_start, self.cursor), content_kind);
        }

        // Closing marker
        let close_start = self.cursor;
        self.cursor += marker_len;
        self.push(self.make_range(close_start, self.cursor), end_marker_kind);
    }

    /// Walk a Reference node — literal content wrapped in `[` `]`.
    fn walk_reference(&mut self, data: &lex_core::lex::inlines::ReferenceInline) {
        let ref_kind = match &data.reference_type {
            ReferenceType::Citation(_) => LexSemanticTokenKind::ReferenceCitation,
            ReferenceType::FootnoteNumber { .. } | ReferenceType::FootnoteLabeled { .. } => {
                LexSemanticTokenKind::ReferenceFootnote
            }
            _ => LexSemanticTokenKind::Reference,
        };

        // Opening bracket ('[' is one byte)
        let open_start = self.cursor;
        self.cursor += 1;
        self.push(
            self.make_range(open_start, self.cursor),
            LexSemanticTokenKind::InlineMarkerRefStart,
        );

        // Reference content (literal — matches raw verbatim)
        let content_start = self.cursor;
        self.cursor += data.raw.len();
        if content_start < self.cursor {
            self.push(self.make_range(content_start, self.cursor), ref_kind);
        }

        // Closing bracket
        let close_start = self.cursor;
        self.cursor += 1;
        self.push(
            self.make_range(close_start, self.cursor),
            LexSemanticTokenKind::InlineMarkerRefEnd,
        );
    }

    /// Advance the raw-text cursor to match unescaped `text` from an InlineNode::Plain.
    ///
    /// The inline parser applies escape rules: `\*` → `*`, `\\` → `\`, but `\n` stays `\n`.
    /// This function mirrors that logic to track how many raw bytes correspond to each
    /// unescaped character.
    fn advance_unescaped(&mut self, text: &str) {
        // `expected` only drives the iteration count: one unescaped character
        // consumed per loop; the raw cursor advance below does the real work.
        for expected in text.chars() {
            if self.cursor >= self.raw.len() {
                break;
            }
            let raw_ch = self.raw[self.cursor..].chars().next().unwrap();
            if raw_ch == '\\' {
                if self.cursor + 1 >= self.raw.len() {
                    // Trailing backslash: treat as literal to mirror parser behavior and
                    // avoid out-of-bounds slicing on `self.raw[self.cursor + 1..]`.
                    self.cursor += 1;
                } else {
                    let next_ch = self.raw[self.cursor + 1..].chars().next();
                    match next_ch {
                        Some(nc) if !nc.is_alphanumeric() => {
                            // Escaped: raw `\X` maps to unescaped `X`
                            self.cursor += 1 + nc.len_utf8();
                        }
                        _ => {
                            // Literal backslash: raw `\` stays as `\` in the node
                            self.cursor += 1;
                        }
                    }
                }
            } else {
                self.cursor += raw_ch.len_utf8();
            }
            let _ = expected; // cursor already advanced
        }
    }

    /// Build a Range for raw byte offsets [start, end), shifted into document
    /// coordinates via `base_range`.
    fn make_range(&self, start: usize, end: usize) -> Range {
        let start_pos = self.position_at(start);
        let end_pos = self.position_at(end);
        Range::new(
            (self.base_range.span.start + start)..(self.base_range.span.start + end),
            start_pos,
            end_pos,
        )
    }

    /// Compute the line/column of a byte offset into `raw`, relative to
    /// `base_range.start`. Columns are byte-based (advanced by `len_utf8`).
    /// NOTE(review): rescans the prefix on each call — O(len) per token, fine
    /// for short inline runs.
    fn position_at(&self, offset: usize) -> Position {
        let mut line = self.base_range.start.line;
        let mut column = self.base_range.start.column;
        for ch in self.raw[..offset].chars() {
            if ch == '\n' {
                line += 1;
                column = 0;
            } else {
                column += ch.len_utf8();
            }
        }
        Position::new(line, column)
    }

    /// Append a token, skipping empty spans.
    fn push(&mut self, range: Range, kind: LexSemanticTokenKind) {
        if range.span.start < range.span.end {
            self.tokens.push(LexSemanticToken { kind, range });
        }
    }
}
714
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_support::{sample_document, sample_source};
    use lex_core::lex::testing::lexplore::Lexplore;

    /// Returns the source text covered by every collected token of `kind`.
    ///
    /// Uses direct slice indexing on purpose: an out-of-bounds span means the
    /// collector emitted a bad range, and the resulting panic is the clearest
    /// possible test failure for that bug.
    fn snippets(
        tokens: &[LexSemanticToken],
        kind: LexSemanticTokenKind,
        source: &str,
    ) -> Vec<String> {
        tokens
            .iter()
            .filter(|token| token.kind == kind)
            .map(|token| source[token.range.span.clone()].to_string())
            .collect()
    }

    #[test]
    fn collects_structural_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        // Session titles are now split into SessionMarker and SessionTitleText
        let session_markers = snippets(&tokens, LexSemanticTokenKind::SessionMarker, source);
        assert!(
            session_markers.iter().any(|snippet| snippet.trim() == "1."),
            "expected a SessionMarker for `1.`, got: {session_markers:?}"
        );
        let session_titles = snippets(&tokens, LexSemanticTokenKind::SessionTitleText, source);
        assert!(
            session_titles.iter().any(|snippet| snippet.trim() == "Intro"),
            "expected a SessionTitleText for `Intro`, got: {session_titles:?}"
        );
        // Both `Cache` and `CLI Example` are parsed as VerbatimSubject;
        // collect the kind once and check both.
        let verbatim_subjects = snippets(&tokens, LexSemanticTokenKind::VerbatimSubject, source);
        assert!(
            verbatim_subjects
                .iter()
                .any(|snippet| snippet.trim_end() == "Cache"),
            "expected a VerbatimSubject for `Cache`, got: {verbatim_subjects:?}"
        );
        let markers = snippets(&tokens, LexSemanticTokenKind::ListMarker, source);
        assert_eq!(markers.len(), 4, "expected exactly 4 ListMarker tokens");
        // Each marker is either a dash bullet or starts with an ordinal digit.
        // `map_or` avoids an unwrap-panic on an (unexpected) all-whitespace
        // snippet, so a regression fails the assertion instead of panicking.
        assert!(
            markers.iter().all(|snippet| {
                let body = snippet.trim_start();
                body.starts_with('-')
                    || body.chars().next().map_or(false, |c| c.is_numeric())
            }),
            "every ListMarker should be `-` or numeric, got: {markers:?}"
        );
        let annotation_labels = snippets(&tokens, LexSemanticTokenKind::AnnotationLabel, source);
        assert!(
            annotation_labels
                .iter()
                .any(|snippet| snippet.contains("doc.note")),
            "expected an AnnotationLabel containing `doc.note`, got: {annotation_labels:?}"
        );
        let parameters = snippets(&tokens, LexSemanticTokenKind::AnnotationParameter, source);
        assert!(
            parameters
                .iter()
                .any(|snippet| snippet.contains("severity=info")),
            "expected an AnnotationParameter containing `severity=info`, got: {parameters:?}"
        );
        assert!(
            verbatim_subjects
                .iter()
                .any(|snippet| snippet.contains("CLI Example")),
            "expected a VerbatimSubject containing `CLI Example`, got: {verbatim_subjects:?}"
        );
        let data_labels = snippets(&tokens, LexSemanticTokenKind::DataLabel, source);
        assert!(
            data_labels.iter().any(|snippet| snippet.contains("shell")),
            "expected a DataLabel containing `shell`, got: {data_labels:?}"
        );
    }

    #[test]
    fn collects_inline_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        let strong = snippets(&tokens, LexSemanticTokenKind::InlineStrong, source);
        assert!(
            strong.iter().any(|snippet| snippet.contains("Lex")),
            "expected InlineStrong containing `Lex`, got: {strong:?}"
        );
        let emphasis = snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source);
        assert!(
            emphasis.iter().any(|snippet| snippet.contains("format")),
            "expected InlineEmphasis containing `format`, got: {emphasis:?}"
        );
        let code = snippets(&tokens, LexSemanticTokenKind::InlineCode, source);
        assert!(
            code.iter().any(|snippet| snippet.contains("code")),
            "expected InlineCode containing `code`, got: {code:?}"
        );
        let math = snippets(&tokens, LexSemanticTokenKind::InlineMath, source);
        assert!(
            math.iter().any(|snippet| snippet.contains("math")),
            "expected InlineMath containing `math`, got: {math:?}"
        );
    }

    #[test]
    fn classifies_references() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        let citations = snippets(&tokens, LexSemanticTokenKind::ReferenceCitation, source);
        assert!(
            citations.iter().any(|snippet| snippet.contains("@spec2025")),
            "expected ReferenceCitation containing `@spec2025`, got: {citations:?}"
        );
        // Footnote references come in both named (`^source`) and numeric forms;
        // collect the kind once and check both.
        let footnotes = snippets(&tokens, LexSemanticTokenKind::ReferenceFootnote, source);
        assert!(
            footnotes.iter().any(|snippet| snippet.contains("^source")),
            "expected ReferenceFootnote containing `^source`, got: {footnotes:?}"
        );
        assert!(
            footnotes.iter().any(|snippet| snippet.contains("1")),
            "expected a numeric ReferenceFootnote, got: {footnotes:?}"
        );
        let references = snippets(&tokens, LexSemanticTokenKind::Reference, source);
        assert!(
            references.iter().any(|snippet| snippet.contains("Cache")),
            "expected Reference containing `Cache`, got: {references:?}"
        );
    }

    #[test]
    fn empty_document_has_no_tokens() {
        let document = Lexplore::benchmark(0)
            .parse()
            .expect("failed to parse empty benchmark fixture");
        let tokens = collect_semantic_tokens(&document);
        assert!(tokens.is_empty());
    }

    #[test]
    fn emits_annotation_content_for_inline_annotation() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        // The fixture starts with `:: doc.note severity=info :: Document preface.`
        // "Document preface." is inline annotation content — plain text inside annotation context.
        let annotation_content = snippets(&tokens, LexSemanticTokenKind::AnnotationContent, source);
        assert!(
            annotation_content
                .iter()
                .any(|snippet| snippet.contains("Document preface")),
            "AnnotationContent should be emitted for plain text inside annotations, got: {annotation_content:?}"
        );
    }

    #[test]
    fn annotation_content_excludes_formatted_text() {
        // Inline formatting within annotation context should get its own token type,
        // not AnnotationContent — only Plain nodes emit AnnotationContent.
        let source = ":: note :: Some *bold* text.\n";
        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
        let tokens = collect_semantic_tokens(&document);

        let annotation_content: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == LexSemanticTokenKind::AnnotationContent)
            .map(|t| &source[t.range.span.clone()])
            .collect();

        // "Some " and " text." should be AnnotationContent, but "bold" should not
        assert!(
            annotation_content.iter().any(|s| s.contains("Some")),
            "Plain text before formatting should be AnnotationContent"
        );
        assert!(
            annotation_content.iter().any(|s| s.contains("text.")),
            "Plain text after formatting should be AnnotationContent"
        );
        assert!(
            !annotation_content.iter().any(|s| s.contains("bold")),
            "Formatted text should NOT be AnnotationContent"
        );

        // "bold" should be InlineStrong
        let strong: Vec<_> = tokens
            .iter()
            .filter(|t| t.kind == LexSemanticTokenKind::InlineStrong)
            .map(|t| &source[t.range.span.clone()])
            .collect();
        assert!(strong.contains(&"bold"));
    }

    #[test]
    fn table_cell_inline_formatting_gets_tokens() {
        let source = "Stats:\n    | *Name* | `code` |\n    | _test_ | #42#   |\n:: table ::\n";
        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
        let tokens = collect_semantic_tokens(&document);

        let strong = snippets(&tokens, LexSemanticTokenKind::InlineStrong, source);
        assert!(
            strong.iter().any(|s| s.contains("Name")),
            "Expected InlineStrong for *Name* in table cell, got: {strong:?}"
        );

        let code = snippets(&tokens, LexSemanticTokenKind::InlineCode, source);
        assert!(
            code.iter().any(|s| s.contains("code")),
            "Expected InlineCode for `code` in table cell, got: {code:?}"
        );

        let emphasis = snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source);
        assert!(
            emphasis.iter().any(|s| s.contains("test")),
            "Expected InlineEmphasis for _test_ in table cell, got: {emphasis:?}"
        );

        let math = snippets(&tokens, LexSemanticTokenKind::InlineMath, source);
        assert!(
            math.iter().any(|s| s.contains("42")),
            "Expected InlineMath for #42# in table cell, got: {math:?}"
        );
    }
}