lex_analysis/
semantic_tokens.rs

1//! This is the semantic token collector, which editors use for syntax highlighting.
2//! It's worth going over the general approach.
3//!
4//! Semantic Tokens and Editor Highlighting Architecture
5//!
//!     1. LSP emits semantic tokens using our format's native terminology (e.g., `Verbatim`,
//! `Annotation`, etc.). The LSP declares a token legend at initialization and emits tokens as indices
8//! into that legend—it has no knowledge of editor-specific theming.
9//!     2. Editor plugins map our token types to the editor's theme primitives. This lets users
10//! leverage their existing theme choices while our core LSP code remains editor-agnostic.
11//!     
12//! Editor-Specific Mapping
13//!
14//!     VSCode — declarative mapping in `package.json`:
15//!         "semanticTokenScopes": [{
16//!         "language": "ourformat",
17//!         "scopes": {
18//!         "Verbatim": ["markup.inline.raw"],
19//!         "Heading": ["markup.heading"],
20//!         "Emphasis": ["markup.italic"]
21//!         }
22//!         }]
23//!     :: javascript
24//!
25//!     We map to TextMate scopes (`markup.*`) as they have broad theme support and are a natural
26//! fit for markup.
27//!
28//!     Neovim — imperative mapping in the plugin:
29//!         vim.api.nvim_set_hl(0, '@lsp.type.Verbatim', { link = '@markup.raw' })
//!         vim.api.nvim_set_hl(0, '@lsp.type.Heading', { link = '@markup.heading' })
31//!         vim.api.nvim_set_hl(0, '@lsp.type.Emphasis', { link = '@markup.italic' })
32//!     :: lua
33//!     We link to treesitter's `@markup.*` groups for equivalent theme coverage.
34//!     Benefits:
35//!         - LSP speaks our format's semantics—no impedance mismatch
36//!         - Users get syntax highlighting that respects their theme
37//!         - Mapping logic is isolated to editor plugins; adding a new editor doesn't touch the LSP
38//!
//! The file editors/vscode/themes/lex-light.json has the recommended theming for Lex to be used in
//! tests and so forth.
41use lex_core::lex::ast::{
42    Annotation, ContentItem, Definition, Document, List, ListItem, Paragraph, Position, Range,
43    Session, Table, TextContent, Verbatim,
44};
45use lex_core::lex::inlines::{InlineNode, ReferenceType};
46
/// The categories of semantic token produced by [`collect_semantic_tokens`].
///
/// Names follow the document format's own vocabulary; use
/// [`LexSemanticTokenKind::as_str`] to obtain the string form of a kind.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum LexSemanticTokenKind {
    // --- Document header ---
    /// The document title.
    DocumentTitle,
    /// The document subtitle.
    DocumentSubtitle,

    // --- Sessions ---
    /// Sequence marker at the start of a session header.
    SessionMarker,
    /// Session title text, excluding any marker.
    SessionTitleText,

    // --- Definitions ---
    /// Header of a definition.
    DefinitionSubject,
    /// Plain inline text found while walking a definition's children.
    DefinitionContent,

    // --- Lists ---
    /// Marker of a list item.
    ListMarker,
    /// Text of a list item.
    ListItemText,

    // --- Annotations ---
    /// Header of an annotation.
    AnnotationLabel,
    /// A parameter attached to an annotation.
    AnnotationParameter,
    /// Plain inline text found while walking an annotation's children.
    AnnotationContent,

    // --- Inline content ---
    /// Content between `*` markers.
    InlineStrong,
    /// Content between `_` markers.
    InlineEmphasis,
    /// Content between `` ` `` markers.
    InlineCode,
    /// Content between `#` markers.
    InlineMath,

    // --- References (content between `[` and `]`) ---
    /// A reference with no more specific classification.
    Reference,
    /// A citation reference.
    ReferenceCitation,
    /// A footnote-number reference.
    ReferenceFootnote,
    /// A reference to an annotation.
    ReferenceAnnotation,

    // --- Verbatim blocks and tables ---
    /// Subject of a verbatim group or of a table.
    VerbatimSubject,
    /// Label of a verbatim block's closing data.
    DataLabel,
    /// A parameter of a verbatim block's closing data.
    DataParameter,
    /// One line of verbatim content.
    VerbatimContent,

    // --- Inline delimiters ---
    /// Opening `*` of strong text.
    InlineMarkerStrongStart,
    /// Closing `*` of strong text.
    InlineMarkerStrongEnd,
    /// Opening `_` of emphasised text.
    InlineMarkerEmphasisStart,
    /// Closing `_` of emphasised text.
    InlineMarkerEmphasisEnd,
    /// Opening `` ` `` of inline code.
    InlineMarkerCodeStart,
    /// Closing `` ` `` of inline code.
    InlineMarkerCodeEnd,
    /// Opening `#` of inline math.
    InlineMarkerMathStart,
    /// Closing `#` of inline math.
    InlineMarkerMathEnd,
    /// Opening `[` of a reference.
    InlineMarkerRefStart,
    /// Closing `]` of a reference.
    InlineMarkerRefEnd,
}
83
84impl LexSemanticTokenKind {
85    /// Returns the semantic token type string for LSP.
86    ///
87    /// These token type names are mapped to standard TextMate scopes in editor configurations
88    /// to ensure compatibility with existing themes (Neovim, VSCode, etc.).
89    ///
90    /// Mapping rationale (based on Lex↔Markdown mapping from lex-babel):
91    /// - Session → Heading → maps to "markup.heading"
92    /// - Definition → Term: Desc → maps to "variable.other.definition"
93    /// - InlineStrong → bold → maps to "markup.bold"
94    /// - InlineEmphasis → *italic* → maps to "markup.italic"
95    /// - InlineCode → `code` → maps to "markup.inline.raw"
96    /// - InlineMath → $math$ → maps to "constant.numeric"
97    /// - Reference → \[citation\] → maps to "markup.underline.link"
98    /// - Verbatim → ```block``` → maps to "markup.raw.block"
99    /// - Annotation → <!-- comment --> → maps to "comment.block"
100    /// - ListMarker → - or 1. → maps to "punctuation.definition.list"
101    pub fn as_str(self) -> &'static str {
102        match self {
103            LexSemanticTokenKind::DocumentTitle => "DocumentTitle",
104            LexSemanticTokenKind::DocumentSubtitle => "DocumentSubtitle",
105            LexSemanticTokenKind::SessionMarker => "SessionMarker",
106            LexSemanticTokenKind::SessionTitleText => "SessionTitleText",
107            LexSemanticTokenKind::DefinitionSubject => "DefinitionSubject",
108            LexSemanticTokenKind::DefinitionContent => "DefinitionContent",
109            LexSemanticTokenKind::ListMarker => "ListMarker",
110            LexSemanticTokenKind::ListItemText => "ListItemText",
111            LexSemanticTokenKind::AnnotationLabel => "AnnotationLabel",
112            LexSemanticTokenKind::AnnotationParameter => "AnnotationParameter",
113            LexSemanticTokenKind::AnnotationContent => "AnnotationContent",
114            LexSemanticTokenKind::InlineStrong => "InlineStrong",
115            LexSemanticTokenKind::InlineEmphasis => "InlineEmphasis",
116            LexSemanticTokenKind::InlineCode => "InlineCode",
117            LexSemanticTokenKind::InlineMath => "InlineMath",
118            LexSemanticTokenKind::Reference => "Reference",
119            LexSemanticTokenKind::ReferenceCitation => "ReferenceCitation",
120            LexSemanticTokenKind::ReferenceFootnote => "ReferenceFootnote",
121            LexSemanticTokenKind::ReferenceAnnotation => "ReferenceAnnotation",
122            LexSemanticTokenKind::VerbatimSubject => "VerbatimSubject",
123            LexSemanticTokenKind::DataLabel => "DataLabel",
124            LexSemanticTokenKind::DataParameter => "DataParameter",
125            LexSemanticTokenKind::VerbatimContent => "VerbatimContent",
126            LexSemanticTokenKind::InlineMarkerStrongStart => "InlineMarker_strong_start",
127            LexSemanticTokenKind::InlineMarkerStrongEnd => "InlineMarker_strong_end",
128            LexSemanticTokenKind::InlineMarkerEmphasisStart => "InlineMarker_emphasis_start",
129            LexSemanticTokenKind::InlineMarkerEmphasisEnd => "InlineMarker_emphasis_end",
130            LexSemanticTokenKind::InlineMarkerCodeStart => "InlineMarker_code_start",
131            LexSemanticTokenKind::InlineMarkerCodeEnd => "InlineMarker_code_end",
132            LexSemanticTokenKind::InlineMarkerMathStart => "InlineMarker_math_start",
133            LexSemanticTokenKind::InlineMarkerMathEnd => "InlineMarker_math_end",
134            LexSemanticTokenKind::InlineMarkerRefStart => "InlineMarker_ref_start",
135            LexSemanticTokenKind::InlineMarkerRefEnd => "InlineMarker_ref_end",
136        }
137    }
138}
139
140pub const SEMANTIC_TOKEN_KINDS: &[LexSemanticTokenKind] = &[
141    LexSemanticTokenKind::DocumentTitle,
142    LexSemanticTokenKind::DocumentSubtitle,
143    LexSemanticTokenKind::SessionMarker,
144    LexSemanticTokenKind::SessionTitleText,
145    LexSemanticTokenKind::DefinitionSubject,
146    LexSemanticTokenKind::DefinitionContent,
147    LexSemanticTokenKind::ListMarker,
148    LexSemanticTokenKind::ListItemText,
149    LexSemanticTokenKind::AnnotationLabel,
150    LexSemanticTokenKind::AnnotationParameter,
151    LexSemanticTokenKind::AnnotationContent,
152    LexSemanticTokenKind::InlineStrong,
153    LexSemanticTokenKind::InlineEmphasis,
154    LexSemanticTokenKind::InlineCode,
155    LexSemanticTokenKind::InlineMath,
156    LexSemanticTokenKind::Reference,
157    LexSemanticTokenKind::ReferenceCitation,
158    LexSemanticTokenKind::ReferenceFootnote,
159    LexSemanticTokenKind::VerbatimSubject,
160    LexSemanticTokenKind::DataLabel,
161    LexSemanticTokenKind::DataParameter,
162    LexSemanticTokenKind::VerbatimContent,
163    LexSemanticTokenKind::InlineMarkerStrongStart,
164    LexSemanticTokenKind::InlineMarkerStrongEnd,
165    LexSemanticTokenKind::InlineMarkerEmphasisStart,
166    LexSemanticTokenKind::InlineMarkerEmphasisEnd,
167    LexSemanticTokenKind::InlineMarkerCodeStart,
168    LexSemanticTokenKind::InlineMarkerCodeEnd,
169    LexSemanticTokenKind::InlineMarkerMathStart,
170    LexSemanticTokenKind::InlineMarkerMathEnd,
171    LexSemanticTokenKind::InlineMarkerRefStart,
172    LexSemanticTokenKind::InlineMarkerRefEnd,
173    LexSemanticTokenKind::ReferenceAnnotation,
174];
175
176#[derive(Debug, Clone, PartialEq)]
177pub struct LexSemanticToken {
178    pub kind: LexSemanticTokenKind,
179    pub range: Range,
180}
181
182pub fn collect_semantic_tokens(document: &Document) -> Vec<LexSemanticToken> {
183    let mut collector = TokenCollector::new();
184    collector.process_document(document);
185    collector.finish()
186}
187
188struct TokenCollector {
189    tokens: Vec<LexSemanticToken>,
190    in_annotation: bool,
191    in_definition: bool,
192}
193
194impl TokenCollector {
195    fn new() -> Self {
196        Self {
197            tokens: Vec::new(),
198            in_annotation: false,
199            in_definition: false,
200        }
201    }
202
203    fn finish(mut self) -> Vec<LexSemanticToken> {
204        self.tokens.sort_by(|a, b| {
205            let a_start = (
206                &a.range.start.line,
207                &a.range.start.column,
208                &a.range.end.line,
209                &a.range.end.column,
210            );
211            let b_start = (
212                &b.range.start.line,
213                &b.range.start.column,
214                &b.range.end.line,
215                &b.range.end.column,
216            );
217            a_start.cmp(&b_start)
218        });
219        self.tokens
220    }
221
222    fn push_range(&mut self, range: &Range, kind: LexSemanticTokenKind) {
223        if range.span.start < range.span.end {
224            self.tokens.push(LexSemanticToken {
225                kind,
226                range: range.clone(),
227            });
228        }
229    }
230
231    fn process_document(&mut self, document: &Document) {
232        self.process_annotations(document.annotations());
233        if let Some(title) = &document.title {
234            if let Some(title_loc) = &title.content.location {
235                self.push_range(title_loc, LexSemanticTokenKind::DocumentTitle);
236            } else {
237                self.push_range(&title.location, LexSemanticTokenKind::DocumentTitle);
238            }
239            self.process_text_content(&title.content);
240            if let Some(subtitle) = &title.subtitle {
241                if let Some(sub_loc) = &subtitle.location {
242                    self.push_range(sub_loc, LexSemanticTokenKind::DocumentSubtitle);
243                }
244                self.process_text_content(subtitle);
245            }
246        }
247        self.process_session(&document.root, LexSemanticTokenKind::SessionTitleText);
248    }
249
250    fn process_session(&mut self, session: &Session, title_kind: LexSemanticTokenKind) {
251        // Emit separate tokens for marker and title text
252        if let Some(marker) = &session.marker {
253            // Emit SessionMarker token for the sequence marker
254            self.push_range(&marker.location, LexSemanticTokenKind::SessionMarker);
255        }
256
257        // Emit SessionTitleText token for the title text (without marker)
258        // Create a range for the title text by using the full title location
259        // and adjusting if there's a marker
260        if let Some(header) = session.header_location() {
261            if let Some(marker) = &session.marker {
262                // Calculate the title text range (after the marker)
263                let marker_text = marker.as_str();
264                let full_title = session.full_title();
265
266                // Find where the marker ends in the title
267                if let Some(pos) = full_title.find(marker_text) {
268                    let marker_end = pos + marker_text.len();
269                    // Skip whitespace after marker
270                    let title_start = full_title[marker_end..]
271                        .chars()
272                        .position(|c| !c.is_whitespace())
273                        .map(|p| marker_end + p)
274                        .unwrap_or(marker_end);
275
276                    if title_start < full_title.len() {
277                        // Create range for title text only
278                        use lex_core::lex::ast::Position;
279                        let title_text_range = Range::new(
280                            header.span.start + title_start..header.span.end,
281                            Position::new(header.start.line, header.start.column + title_start),
282                            header.end,
283                        );
284                        self.push_range(&title_text_range, title_kind);
285                    }
286                }
287            } else {
288                // No marker, the entire header is title text
289                self.push_range(header, title_kind);
290            }
291        }
292
293        self.process_text_content(&session.title);
294
295        self.process_annotations(session.annotations());
296        for child in session.children.iter() {
297            self.process_content_item(child);
298        }
299    }
300
301    fn process_content_item(&mut self, item: &ContentItem) {
302        match item {
303            ContentItem::Paragraph(paragraph) => self.process_paragraph(paragraph),
304            ContentItem::Session(session) => {
305                self.process_session(session, LexSemanticTokenKind::SessionTitleText)
306            }
307            ContentItem::List(list) => self.process_list(list),
308            ContentItem::ListItem(list_item) => self.process_list_item(list_item),
309            ContentItem::Definition(definition) => self.process_definition(definition),
310            ContentItem::Annotation(annotation) => self.process_annotation(annotation),
311            ContentItem::VerbatimBlock(verbatim) => self.process_verbatim(verbatim),
312            ContentItem::Table(table) => self.process_table(table),
313            ContentItem::TextLine(text_line) => self.process_text_content(&text_line.content),
314            ContentItem::VerbatimLine(_) => {}
315            ContentItem::BlankLineGroup(_) => {}
316        }
317    }
318
319    fn process_paragraph(&mut self, paragraph: &Paragraph) {
320        for line in &paragraph.lines {
321            if let ContentItem::TextLine(text_line) = line {
322                // Don't emit full-line tokens for DefinitionContent or AnnotationContent
323                // as they overlap with inline tokens. The context is already clear from
324                // the DefinitionSubject and AnnotationLabel tokens.
325                self.process_text_content(&text_line.content);
326            }
327        }
328        self.process_annotations(paragraph.annotations());
329    }
330
331    fn process_list(&mut self, list: &List) {
332        self.process_annotations(list.annotations());
333        for item in list.items.iter() {
334            if let ContentItem::ListItem(list_item) = item {
335                self.process_list_item(list_item);
336            }
337        }
338    }
339
340    fn process_list_item(&mut self, list_item: &ListItem) {
341        if let Some(marker_range) = &list_item.marker.location {
342            self.push_range(marker_range, LexSemanticTokenKind::ListMarker);
343        }
344        for text in &list_item.text {
345            if let Some(location) = &text.location {
346                self.push_range(location, LexSemanticTokenKind::ListItemText);
347            }
348            self.process_text_content(text);
349        }
350        self.process_annotations(list_item.annotations());
351        for child in list_item.children.iter() {
352            self.process_content_item(child);
353        }
354    }
355
356    fn process_definition(&mut self, definition: &Definition) {
357        if let Some(header) = definition.header_location() {
358            self.push_range(header, LexSemanticTokenKind::DefinitionSubject);
359        }
360        self.process_text_content(&definition.subject);
361        self.process_annotations(definition.annotations());
362        let was_in_definition = self.in_definition;
363        self.in_definition = true;
364        for child in definition.children.iter() {
365            self.process_content_item(child);
366        }
367        self.in_definition = was_in_definition;
368    }
369
370    fn process_verbatim(&mut self, verbatim: &Verbatim) {
371        for group in verbatim.group() {
372            self.process_text_content(group.subject);
373            if let Some(location) = &group.subject.location {
374                self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
375            }
376            for child in group.children {
377                if let ContentItem::VerbatimLine(line) = child {
378                    self.push_range(&line.location, LexSemanticTokenKind::VerbatimContent);
379                }
380            }
381        }
382
383        self.push_range(
384            &verbatim.closing_data.label.location,
385            LexSemanticTokenKind::DataLabel,
386        );
387        for parameter in &verbatim.closing_data.parameters {
388            self.push_range(&parameter.location, LexSemanticTokenKind::DataParameter);
389        }
390
391        self.process_annotations(verbatim.annotations());
392    }
393
394    fn process_table(&mut self, table: &Table) {
395        self.process_text_content(&table.subject);
396        if let Some(location) = &table.subject.location {
397            self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
398        }
399
400        // Process cell content: inline text and block children
401        for row in table.all_rows() {
402            for cell in &row.cells {
403                self.process_text_content(&cell.content);
404                for child in cell.children.iter() {
405                    self.process_content_item(child);
406                }
407            }
408        }
409
410        // Table config annotations are in table.annotations — processed below
411        // by process_annotations()
412
413        self.process_annotations(table.annotations());
414    }
415
416    fn process_annotation(&mut self, annotation: &Annotation) {
417        self.push_range(
418            annotation.header_location(),
419            LexSemanticTokenKind::AnnotationLabel,
420        );
421        for parameter in &annotation.data.parameters {
422            self.push_range(
423                &parameter.location,
424                LexSemanticTokenKind::AnnotationParameter,
425            );
426        }
427        let was_in_annotation = self.in_annotation;
428        self.in_annotation = true;
429        for child in annotation.children.iter() {
430            self.process_content_item(child);
431        }
432        self.in_annotation = was_in_annotation;
433    }
434
435    fn process_annotations(&mut self, annotations: &[Annotation]) {
436        for annotation in annotations {
437            self.process_annotation(annotation);
438        }
439    }
440
441    fn process_text_content(&mut self, text: &TextContent) {
442        let Some(base_range) = text.location.as_ref() else {
443            return;
444        };
445        let raw = text.as_string();
446        if raw.is_empty() {
447            return;
448        }
449        let nodes = text.inline_items();
450        let mut walker = InlineWalker {
451            raw,
452            base_range,
453            cursor: 0,
454            tokens: &mut self.tokens,
455            in_annotation: self.in_annotation,
456            in_definition: self.in_definition,
457            in_formatted: false,
458        };
459        walker.walk_nodes(&nodes);
460    }
461}
462
463/// Walks the InlineNode tree and raw text in parallel to produce positioned semantic tokens.
464///
465/// The inline parser consumes escape sequences and delimiters, so InlineNode text doesn't
466/// directly correspond to byte offsets in the raw source. This walker maintains a cursor
467/// into the raw text and advances it according to the same rules the inline parser uses,
468/// producing correctly positioned Range values for each token.
469struct InlineWalker<'a> {
470    raw: &'a str,
471    base_range: &'a Range,
472    cursor: usize,
473    tokens: &'a mut Vec<LexSemanticToken>,
474    in_annotation: bool,
475    in_definition: bool,
476    /// True when inside a formatting container (Strong/Emphasis). Plain text inside
477    /// containers is covered by the container's content span, so context-dependent
478    /// tokens (AnnotationContent, DefinitionContent) are suppressed.
479    in_formatted: bool,
480}
481
482impl<'a> InlineWalker<'a> {
483    fn walk_nodes(&mut self, nodes: &[InlineNode]) {
484        for node in nodes {
485            self.walk_node(node);
486        }
487    }
488
489    fn walk_node(&mut self, node: &InlineNode) {
490        match node {
491            InlineNode::Plain { text, .. } => self.walk_plain(text),
492            InlineNode::Strong { content, .. } => self.walk_container(
493                content,
494                '*',
495                LexSemanticTokenKind::InlineStrong,
496                LexSemanticTokenKind::InlineMarkerStrongStart,
497                LexSemanticTokenKind::InlineMarkerStrongEnd,
498            ),
499            InlineNode::Emphasis { content, .. } => self.walk_container(
500                content,
501                '_',
502                LexSemanticTokenKind::InlineEmphasis,
503                LexSemanticTokenKind::InlineMarkerEmphasisStart,
504                LexSemanticTokenKind::InlineMarkerEmphasisEnd,
505            ),
506            InlineNode::Code { text, .. } => self.walk_literal(
507                text,
508                '`',
509                LexSemanticTokenKind::InlineCode,
510                LexSemanticTokenKind::InlineMarkerCodeStart,
511                LexSemanticTokenKind::InlineMarkerCodeEnd,
512            ),
513            InlineNode::Math { text, .. } => self.walk_literal(
514                text,
515                '#',
516                LexSemanticTokenKind::InlineMath,
517                LexSemanticTokenKind::InlineMarkerMathStart,
518                LexSemanticTokenKind::InlineMarkerMathEnd,
519            ),
520            InlineNode::Reference { data, .. } => self.walk_reference(data),
521        }
522    }
523
524    /// Walk a Plain text node, advancing cursor through escape sequences in raw text.
525    /// Emits AnnotationContent or DefinitionContent when inside those contexts.
526    fn walk_plain(&mut self, text: &str) {
527        let start = self.cursor;
528        self.advance_unescaped(text);
529        let end = self.cursor;
530
531        if start < end {
532            let kind = if self.in_formatted {
533                None // Covered by the container's content span
534            } else if self.in_annotation {
535                Some(LexSemanticTokenKind::AnnotationContent)
536            } else if self.in_definition {
537                Some(LexSemanticTokenKind::DefinitionContent)
538            } else {
539                None
540            };
541            if let Some(kind) = kind {
542                self.push(self.make_range(start, end), kind);
543            }
544        }
545    }
546
547    /// Walk a container node (Strong/Emphasis) which has an opening marker, children, and closing marker.
548    fn walk_container(
549        &mut self,
550        content: &[InlineNode],
551        marker: char,
552        content_kind: LexSemanticTokenKind,
553        start_marker_kind: LexSemanticTokenKind,
554        end_marker_kind: LexSemanticTokenKind,
555    ) {
556        let marker_len = marker.len_utf8();
557
558        // Opening marker
559        let marker_start = self.cursor;
560        self.cursor += marker_len;
561        self.push(
562            self.make_range(marker_start, self.cursor),
563            start_marker_kind,
564        );
565
566        // Recurse into children — record span boundaries for the content token
567        let content_start = self.cursor;
568        let was_in_formatted = self.in_formatted;
569        self.in_formatted = true;
570        self.walk_nodes(content);
571        self.in_formatted = was_in_formatted;
572        let content_end = self.cursor;
573
574        // Emit a single content span covering all children
575        if content_start < content_end {
576            self.push(self.make_range(content_start, content_end), content_kind);
577        }
578
579        // Closing marker
580        let close_start = self.cursor;
581        self.cursor += marker_len;
582        self.push(self.make_range(close_start, self.cursor), end_marker_kind);
583    }
584
585    /// Walk a literal node (Code/Math) — no escape processing inside.
586    fn walk_literal(
587        &mut self,
588        text: &str,
589        marker: char,
590        content_kind: LexSemanticTokenKind,
591        start_marker_kind: LexSemanticTokenKind,
592        end_marker_kind: LexSemanticTokenKind,
593    ) {
594        let marker_len = marker.len_utf8();
595
596        // Opening marker
597        let marker_start = self.cursor;
598        self.cursor += marker_len;
599        self.push(
600            self.make_range(marker_start, self.cursor),
601            start_marker_kind,
602        );
603
604        // Literal content (verbatim, no escape processing)
605        let content_start = self.cursor;
606        self.cursor += text.len();
607        if content_start < self.cursor {
608            self.push(self.make_range(content_start, self.cursor), content_kind);
609        }
610
611        // Closing marker
612        let close_start = self.cursor;
613        self.cursor += marker_len;
614        self.push(self.make_range(close_start, self.cursor), end_marker_kind);
615    }
616
617    /// Walk a Reference node — literal content wrapped in `[` `]`.
618    fn walk_reference(&mut self, data: &lex_core::lex::inlines::ReferenceInline) {
619        let ref_kind = match &data.reference_type {
620            ReferenceType::Citation(_) => LexSemanticTokenKind::ReferenceCitation,
621            ReferenceType::FootnoteNumber { .. } => LexSemanticTokenKind::ReferenceFootnote,
622            ReferenceType::AnnotationReference { .. } => LexSemanticTokenKind::ReferenceAnnotation,
623            _ => LexSemanticTokenKind::Reference,
624        };
625
626        // Opening bracket
627        let open_start = self.cursor;
628        self.cursor += 1;
629        self.push(
630            self.make_range(open_start, self.cursor),
631            LexSemanticTokenKind::InlineMarkerRefStart,
632        );
633
634        // Reference content (literal — matches raw verbatim)
635        let content_start = self.cursor;
636        self.cursor += data.raw.len();
637        if content_start < self.cursor {
638            self.push(self.make_range(content_start, self.cursor), ref_kind);
639        }
640
641        // Closing bracket
642        let close_start = self.cursor;
643        self.cursor += 1;
644        self.push(
645            self.make_range(close_start, self.cursor),
646            LexSemanticTokenKind::InlineMarkerRefEnd,
647        );
648    }
649
650    /// Advance the raw-text cursor to match unescaped `text` from an InlineNode::Plain.
651    ///
652    /// The inline parser applies escape rules: `\*` → `*`, `\\` → `\`, but `\n` stays `\n`.
653    /// This function mirrors that logic to track how many raw bytes correspond to each
654    /// unescaped character.
655    fn advance_unescaped(&mut self, text: &str) {
656        for expected in text.chars() {
657            if self.cursor >= self.raw.len() {
658                break;
659            }
660            let raw_ch = self.raw[self.cursor..].chars().next().unwrap();
661            if raw_ch == '\\' {
662                if self.cursor + 1 >= self.raw.len() {
663                    // Trailing backslash: treat as literal to mirror parser behavior and
664                    // avoid out-of-bounds slicing on `self.raw[self.cursor + 1..]`.
665                    self.cursor += 1;
666                } else {
667                    let next_ch = self.raw[self.cursor + 1..].chars().next();
668                    match next_ch {
669                        Some(nc) if !nc.is_alphanumeric() => {
670                            // Escaped: raw `\X` maps to unescaped `X`
671                            self.cursor += 1 + nc.len_utf8();
672                        }
673                        _ => {
674                            // Literal backslash: raw `\` stays as `\` in the node
675                            self.cursor += 1;
676                        }
677                    }
678                }
679            } else {
680                self.cursor += raw_ch.len_utf8();
681            }
682            let _ = expected; // cursor already advanced
683        }
684    }
685
686    fn make_range(&self, start: usize, end: usize) -> Range {
687        let start_pos = self.position_at(start);
688        let end_pos = self.position_at(end);
689        Range::new(
690            (self.base_range.span.start + start)..(self.base_range.span.start + end),
691            start_pos,
692            end_pos,
693        )
694    }
695
696    fn position_at(&self, offset: usize) -> Position {
697        let mut line = self.base_range.start.line;
698        let mut column = self.base_range.start.column;
699        for ch in self.raw[..offset].chars() {
700            if ch == '\n' {
701                line += 1;
702                column = 0;
703            } else {
704                column += ch.len_utf8();
705            }
706        }
707        Position::new(line, column)
708    }
709
710    fn push(&mut self, range: Range, kind: LexSemanticTokenKind) {
711        if range.span.start < range.span.end {
712            self.tokens.push(LexSemanticToken { kind, range });
713        }
714    }
715}
716
717#[cfg(test)]
718mod tests {
719    use super::*;
720    use crate::test_support::{sample_document, sample_source};
721    use lex_core::lex::testing::lexplore::Lexplore;
722
723    fn snippets(
724        tokens: &[LexSemanticToken],
725        kind: LexSemanticTokenKind,
726        source: &str,
727    ) -> Vec<String> {
728        tokens
729            .iter()
730            .filter(|token| token.kind == kind)
731            .map(|token| source[token.range.span.clone()].to_string())
732            .collect()
733    }
734
735    #[test]
736    fn collects_structural_tokens() {
737        let document = sample_document();
738        let tokens = collect_semantic_tokens(&document);
739        let source = sample_source();
740
741        // Session titles are now split into SessionMarker and SessionTitleText
742        assert!(
743            snippets(&tokens, LexSemanticTokenKind::SessionMarker, source)
744                .iter()
745                .any(|snippet| snippet.trim() == "1.")
746        );
747        assert!(
748            snippets(&tokens, LexSemanticTokenKind::SessionTitleText, source)
749                .iter()
750                .any(|snippet| snippet.trim() == "Intro")
751        );
752        // Cache is parsed as VerbatimSubject
753        assert!(
754            snippets(&tokens, LexSemanticTokenKind::VerbatimSubject, source)
755                .iter()
756                .any(|snippet| snippet.trim_end() == "Cache")
757        );
758        let markers = snippets(&tokens, LexSemanticTokenKind::ListMarker, source);
759        assert_eq!(markers.len(), 4);
760        assert!(markers
761            .iter()
762            .all(|snippet| snippet.trim_start().starts_with('-')
763                || snippet.trim_start().chars().next().unwrap().is_numeric()));
764        let annotation_labels = snippets(&tokens, LexSemanticTokenKind::AnnotationLabel, source);
765        assert!(annotation_labels
766            .iter()
767            .any(|snippet| snippet.contains("doc.note")));
768        let parameters = snippets(&tokens, LexSemanticTokenKind::AnnotationParameter, source);
769        assert!(parameters
770            .iter()
771            .any(|snippet| snippet.contains("severity=info")));
772        let verbatim_subjects = snippets(&tokens, LexSemanticTokenKind::VerbatimSubject, source);
773        assert!(verbatim_subjects
774            .iter()
775            .any(|snippet| snippet.contains("CLI Example")));
776        assert!(snippets(&tokens, LexSemanticTokenKind::DataLabel, source)
777            .iter()
778            .any(|snippet| snippet.contains("shell")));
779    }
780
781    #[test]
782    fn collects_inline_tokens() {
783        let document = sample_document();
784        let tokens = collect_semantic_tokens(&document);
785        let source = sample_source();
786        assert!(
787            snippets(&tokens, LexSemanticTokenKind::InlineStrong, source)
788                .iter()
789                .any(|snippet| snippet.contains("Lex"))
790        );
791        assert!(
792            snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source)
793                .iter()
794                .any(|snippet| snippet.contains("format"))
795        );
796        assert!(snippets(&tokens, LexSemanticTokenKind::InlineCode, source)
797            .iter()
798            .any(|snippet| snippet.contains("code")));
799        assert!(snippets(&tokens, LexSemanticTokenKind::InlineMath, source)
800            .iter()
801            .any(|snippet| snippet.contains("math")));
802    }
803
804    #[test]
805    fn classifies_references() {
806        let document = sample_document();
807        let tokens = collect_semantic_tokens(&document);
808        let source = sample_source();
809        assert!(
810            snippets(&tokens, LexSemanticTokenKind::ReferenceCitation, source)
811                .iter()
812                .any(|snippet| snippet.contains("@spec2025"))
813        );
814        assert!(
815            snippets(&tokens, LexSemanticTokenKind::ReferenceAnnotation, source)
816                .iter()
817                .any(|snippet| snippet.contains("::source"))
818        );
819        assert!(
820            snippets(&tokens, LexSemanticTokenKind::ReferenceFootnote, source)
821                .iter()
822                .any(|snippet| snippet.contains("1"))
823        );
824        assert!(snippets(&tokens, LexSemanticTokenKind::Reference, source)
825            .iter()
826            .any(|snippet| snippet.contains("Cache")));
827    }
828
829    #[test]
830    fn empty_document_has_no_tokens() {
831        let document = Lexplore::benchmark(0)
832            .parse()
833            .expect("failed to parse empty benchmark fixture");
834        let tokens = collect_semantic_tokens(&document);
835        assert!(tokens.is_empty());
836    }
837
838    #[test]
839    fn emits_annotation_content_for_inline_annotation() {
840        let document = sample_document();
841        let tokens = collect_semantic_tokens(&document);
842        let source = sample_source();
843
844        // The fixture starts with `:: doc.note severity=info :: Document preface.`
845        // "Document preface." is inline annotation content — plain text inside annotation context.
846        let annotation_content = snippets(&tokens, LexSemanticTokenKind::AnnotationContent, source);
847        assert!(
848            annotation_content
849                .iter()
850                .any(|snippet| snippet.contains("Document preface")),
851            "AnnotationContent should be emitted for plain text inside annotations, got: {annotation_content:?}"
852        );
853    }
854
855    #[test]
856    fn annotation_content_excludes_formatted_text() {
857        // Inline formatting within annotation context should get its own token type,
858        // not AnnotationContent — only Plain nodes emit AnnotationContent.
859        let source = ":: note :: Some *bold* text.\n";
860        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
861        let tokens = collect_semantic_tokens(&document);
862
863        let annotation_content: Vec<_> = tokens
864            .iter()
865            .filter(|t| t.kind == LexSemanticTokenKind::AnnotationContent)
866            .map(|t| &source[t.range.span.clone()])
867            .collect();
868
869        // "Some " and " text." should be AnnotationContent, but "bold" should not
870        assert!(
871            annotation_content.iter().any(|s| s.contains("Some")),
872            "Plain text before formatting should be AnnotationContent"
873        );
874        assert!(
875            annotation_content.iter().any(|s| s.contains("text.")),
876            "Plain text after formatting should be AnnotationContent"
877        );
878        assert!(
879            !annotation_content.iter().any(|s| s.contains("bold")),
880            "Formatted text should NOT be AnnotationContent"
881        );
882
883        // "bold" should be InlineStrong
884        let strong: Vec<_> = tokens
885            .iter()
886            .filter(|t| t.kind == LexSemanticTokenKind::InlineStrong)
887            .map(|t| &source[t.range.span.clone()])
888            .collect();
889        assert!(strong.contains(&"bold"));
890    }
891
892    #[test]
893    fn table_cell_inline_formatting_gets_tokens() {
894        let source = "Stats:\n    | *Name* | `code` |\n    | _test_ | #42#   |\n";
895        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
896        let tokens = collect_semantic_tokens(&document);
897
898        let strong = snippets(&tokens, LexSemanticTokenKind::InlineStrong, source);
899        assert!(
900            strong.iter().any(|s| s.contains("Name")),
901            "Expected InlineStrong for *Name* in table cell, got: {strong:?}"
902        );
903
904        let code = snippets(&tokens, LexSemanticTokenKind::InlineCode, source);
905        assert!(
906            code.iter().any(|s| s.contains("code")),
907            "Expected InlineCode for `code` in table cell, got: {code:?}"
908        );
909
910        let emphasis = snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source);
911        assert!(
912            emphasis.iter().any(|s| s.contains("test")),
913            "Expected InlineEmphasis for _test_ in table cell, got: {emphasis:?}"
914        );
915
916        let math = snippets(&tokens, LexSemanticTokenKind::InlineMath, source);
917        assert!(
918            math.iter().any(|s| s.contains("42")),
919            "Expected InlineMath for #42# in table cell, got: {math:?}"
920        );
921    }
922}
lex_analysis/semantic_tokens.rs

lex_analysis/
semantic_tokens.rs