// lex_analysis/semantic_tokens.rs

//! This is the semantic token collector, which editors use for syntax highlighting.
//! It's worth going over the general approach.
//!
//! Semantic Tokens and Editor Highlighting Architecture
//!
//!     1. The LSP emits semantic tokens using our format's native terminology (e.g., `Verbatim`,
//! `Annotation`, etc.). The LSP declares a token legend at initialization and emits tokens as
//! indices into that legend—it has no knowledge of editor-specific theming.
//!     2. Editor plugins map our token types to the editor's theme primitives. This lets users
//! leverage their existing theme choices while our core LSP code remains editor-agnostic.
//!
//! Editor-Specific Mapping
//!
//!     VSCode — declarative mapping in `package.json`:
//!         "semanticTokenScopes": [{
//!         "language": "ourformat",
//!         "scopes": {
//!         "Verbatim": ["markup.inline.raw"],
//!         "Heading": ["markup.heading"],
//!         "Emphasis": ["markup.italic"]
//!         }
//!         }]
//!     :: javascript
//!
//!     We map to TextMate scopes (`markup.*`) as they have broad theme support and are a natural
//! fit for markup.
//!
//!     Neovim — imperative mapping in the plugin:
//!         vim.api.nvim_set_hl(0, '@lsp.type.Verbatim', { link = '@markup.raw' })
//!         vim.api.nvim_set_hl(0, '@lsp.type.Heading', { link = '@markup.heading' })
//!         vim.api.nvim_set_hl(0, '@lsp.type.Emphasis', { link = '@markup.italic' })
//!     :: lua
//!     We link to treesitter's `@markup.*` groups for equivalent theme coverage.
//!     Benefits:
//!         - LSP speaks our format's semantics—no impedance mismatch
//!         - Users get syntax highlighting that respects their theme
//!         - Mapping logic is isolated to editor plugins; adding a new editor doesn't touch the LSP
//!
//! The file editors/vscode/themes/lex-light.json has the recommended theming for Lex to be used in
//! tests and so forth.
41use crate::inline::{extract_inline_spans, InlineSpanKind};
42use lex_core::lex::ast::{
43    Annotation, ContentItem, Definition, Document, List, ListItem, Paragraph, Range, Session,
44    TextContent, Verbatim,
45};
46use lex_core::lex::inlines::ReferenceType;
47
/// Semantic token categories for Lex documents.
///
/// Each variant names one token type in the LSP legend (see
/// [`SEMANTIC_TOKEN_KINDS`]); editors map these names onto their own theme
/// scopes as described in the module docs. `as_str` yields the wire name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LexSemanticTokenKind {
    /// Title text of the document's root session.
    DocumentTitle,
    /// Sequence marker in a session header (e.g. `1.`).
    SessionMarker,
    /// Session title text, excluding the marker.
    SessionTitleText,
    /// Header of a definition (the term being defined).
    DefinitionSubject,
    /// Body of a definition. Declared in the legend but intentionally not
    /// emitted by the collector (see the comment in `process_paragraph`).
    DefinitionContent,
    /// List bullet or number marker.
    ListMarker,
    /// Text of a list item line.
    ListItemText,
    /// Label of an annotation header.
    AnnotationLabel,
    /// A parameter on an annotation (e.g. `severity=info`).
    AnnotationParameter,
    /// Body of an annotation. Declared in the legend but intentionally not
    /// emitted by the collector (see the comment in `process_paragraph`).
    AnnotationContent,
    /// Content of a strong (bold) inline span.
    InlineStrong,
    /// Content of an emphasis (italic) inline span.
    InlineEmphasis,
    /// Content of an inline code span.
    InlineCode,
    /// Content of an inline math span.
    InlineMath,
    /// Generic reference (fallback when neither citation nor footnote).
    Reference,
    /// Citation reference (e.g. `@spec2025`).
    ReferenceCitation,
    /// Footnote reference, numbered or labeled.
    ReferenceFootnote,
    /// Subject line of a verbatim block.
    VerbatimSubject,
    /// Language label on a verbatim block's closing line.
    VerbatimLanguage,
    /// Attribute/parameter on a verbatim block's closing line.
    VerbatimAttribute,
    /// A raw content line inside a verbatim block.
    VerbatimContent,
    /// Opening marker of a strong span.
    InlineMarkerStrongStart,
    /// Closing marker of a strong span.
    InlineMarkerStrongEnd,
    /// Opening marker of an emphasis span.
    InlineMarkerEmphasisStart,
    /// Closing marker of an emphasis span.
    InlineMarkerEmphasisEnd,
    /// Opening marker of an inline code span.
    InlineMarkerCodeStart,
    /// Closing marker of an inline code span.
    InlineMarkerCodeEnd,
    /// Opening marker of an inline math span.
    InlineMarkerMathStart,
    /// Closing marker of an inline math span.
    InlineMarkerMathEnd,
    /// Opening marker of a reference span.
    InlineMarkerRefStart,
    /// Closing marker of a reference span.
    InlineMarkerRefEnd,
}
82
83impl LexSemanticTokenKind {
84    /// Returns the semantic token type string for LSP.
85    ///
86    /// These token type names are mapped to standard TextMate scopes in editor configurations
87    /// to ensure compatibility with existing themes (Neovim, VSCode, etc.).
88    ///
89    /// Mapping rationale (based on Lex↔Markdown mapping from lex-babel):
90    /// - Session → Heading → maps to "markup.heading"
91    /// - Definition → Term: Desc → maps to "variable.other.definition"
92    /// - InlineStrong → bold → maps to "markup.bold"
93    /// - InlineEmphasis → *italic* → maps to "markup.italic"
94    /// - InlineCode → `code` → maps to "markup.inline.raw"
95    /// - InlineMath → $math$ → maps to "constant.numeric"
96    /// - Reference → \[citation\] → maps to "markup.underline.link"
97    /// - Verbatim → ```block``` → maps to "markup.raw.block"
98    /// - Annotation → <!-- comment --> → maps to "comment.block"
99    /// - ListMarker → - or 1. → maps to "punctuation.definition.list"
100    pub fn as_str(self) -> &'static str {
101        match self {
102            LexSemanticTokenKind::DocumentTitle => "DocumentTitle",
103            LexSemanticTokenKind::SessionMarker => "SessionMarker",
104            LexSemanticTokenKind::SessionTitleText => "SessionTitleText",
105            LexSemanticTokenKind::DefinitionSubject => "DefinitionSubject",
106            LexSemanticTokenKind::DefinitionContent => "DefinitionContent",
107            LexSemanticTokenKind::ListMarker => "ListMarker",
108            LexSemanticTokenKind::ListItemText => "ListItemText",
109            LexSemanticTokenKind::AnnotationLabel => "AnnotationLabel",
110            LexSemanticTokenKind::AnnotationParameter => "AnnotationParameter",
111            LexSemanticTokenKind::AnnotationContent => "AnnotationContent",
112            LexSemanticTokenKind::InlineStrong => "InlineStrong",
113            LexSemanticTokenKind::InlineEmphasis => "InlineEmphasis",
114            LexSemanticTokenKind::InlineCode => "InlineCode",
115            LexSemanticTokenKind::InlineMath => "InlineMath",
116            LexSemanticTokenKind::Reference => "Reference",
117            LexSemanticTokenKind::ReferenceCitation => "ReferenceCitation",
118            LexSemanticTokenKind::ReferenceFootnote => "ReferenceFootnote",
119            LexSemanticTokenKind::VerbatimSubject => "VerbatimSubject",
120            LexSemanticTokenKind::VerbatimLanguage => "VerbatimLanguage",
121            LexSemanticTokenKind::VerbatimAttribute => "VerbatimAttribute",
122            LexSemanticTokenKind::VerbatimContent => "VerbatimContent",
123            LexSemanticTokenKind::InlineMarkerStrongStart => "InlineMarker_strong_start",
124            LexSemanticTokenKind::InlineMarkerStrongEnd => "InlineMarker_strong_end",
125            LexSemanticTokenKind::InlineMarkerEmphasisStart => "InlineMarker_emphasis_start",
126            LexSemanticTokenKind::InlineMarkerEmphasisEnd => "InlineMarker_emphasis_end",
127            LexSemanticTokenKind::InlineMarkerCodeStart => "InlineMarker_code_start",
128            LexSemanticTokenKind::InlineMarkerCodeEnd => "InlineMarker_code_end",
129            LexSemanticTokenKind::InlineMarkerMathStart => "InlineMarker_math_start",
130            LexSemanticTokenKind::InlineMarkerMathEnd => "InlineMarker_math_end",
131            LexSemanticTokenKind::InlineMarkerRefStart => "InlineMarker_ref_start",
132            LexSemanticTokenKind::InlineMarkerRefEnd => "InlineMarker_ref_end",
133        }
134    }
135}
136
/// Every token kind, in legend order.
///
/// The position of a kind in this slice is the integer index the LSP emits
/// for that token type, so this order must match the legend declared at
/// initialization. Keep it in sync with `LexSemanticTokenKind`: every
/// variant appears exactly once.
pub const SEMANTIC_TOKEN_KINDS: &[LexSemanticTokenKind] = &[
    LexSemanticTokenKind::DocumentTitle,
    LexSemanticTokenKind::SessionMarker,
    LexSemanticTokenKind::SessionTitleText,
    LexSemanticTokenKind::DefinitionSubject,
    LexSemanticTokenKind::DefinitionContent,
    LexSemanticTokenKind::ListMarker,
    LexSemanticTokenKind::ListItemText,
    LexSemanticTokenKind::AnnotationLabel,
    LexSemanticTokenKind::AnnotationParameter,
    LexSemanticTokenKind::AnnotationContent,
    LexSemanticTokenKind::InlineStrong,
    LexSemanticTokenKind::InlineEmphasis,
    LexSemanticTokenKind::InlineCode,
    LexSemanticTokenKind::InlineMath,
    LexSemanticTokenKind::Reference,
    LexSemanticTokenKind::ReferenceCitation,
    LexSemanticTokenKind::ReferenceFootnote,
    LexSemanticTokenKind::VerbatimSubject,
    LexSemanticTokenKind::VerbatimLanguage,
    LexSemanticTokenKind::VerbatimAttribute,
    LexSemanticTokenKind::VerbatimContent,
    LexSemanticTokenKind::InlineMarkerStrongStart,
    LexSemanticTokenKind::InlineMarkerStrongEnd,
    LexSemanticTokenKind::InlineMarkerEmphasisStart,
    LexSemanticTokenKind::InlineMarkerEmphasisEnd,
    LexSemanticTokenKind::InlineMarkerCodeStart,
    LexSemanticTokenKind::InlineMarkerCodeEnd,
    LexSemanticTokenKind::InlineMarkerMathStart,
    LexSemanticTokenKind::InlineMarkerMathEnd,
    LexSemanticTokenKind::InlineMarkerRefStart,
    LexSemanticTokenKind::InlineMarkerRefEnd,
];
170
/// One semantic token: a kind plus the source range it covers.
#[derive(Debug, Clone, PartialEq)]
pub struct LexSemanticToken {
    /// What the covered text is (heading text, marker, inline code, ...).
    pub kind: LexSemanticTokenKind,
    /// Source location covered by the token.
    pub range: Range,
}
176
177pub fn collect_semantic_tokens(document: &Document) -> Vec<LexSemanticToken> {
178    let mut collector = TokenCollector::new();
179    collector.process_document(document);
180    collector.finish()
181}
182
/// AST walker that accumulates semantic tokens.
struct TokenCollector {
    // Tokens gathered so far; sorted only in `finish`.
    tokens: Vec<LexSemanticToken>,
    // True while visiting an annotation's children.
    // NOTE(review): written (saved/restored) but never read in this file —
    // confirm whether it is still needed or reserved for future filtering.
    in_annotation: bool,
    // True while visiting a definition's children.
    // NOTE(review): written (saved/restored) but never read in this file —
    // confirm whether it is still needed or reserved for future filtering.
    in_definition: bool,
}
188
189impl TokenCollector {
190    fn new() -> Self {
191        Self {
192            tokens: Vec::new(),
193            in_annotation: false,
194            in_definition: false,
195        }
196    }
197
198    fn finish(mut self) -> Vec<LexSemanticToken> {
199        self.tokens.sort_by(|a, b| {
200            let a_start = (
201                &a.range.start.line,
202                &a.range.start.column,
203                &a.range.end.line,
204                &a.range.end.column,
205            );
206            let b_start = (
207                &b.range.start.line,
208                &b.range.start.column,
209                &b.range.end.line,
210                &b.range.end.column,
211            );
212            a_start.cmp(&b_start)
213        });
214        self.tokens
215    }
216
217    fn push_range(&mut self, range: &Range, kind: LexSemanticTokenKind) {
218        if range.span.start < range.span.end {
219            self.tokens.push(LexSemanticToken {
220                kind,
221                range: range.clone(),
222            });
223        }
224    }
225
226    fn process_document(&mut self, document: &Document) {
227        self.process_annotations(document.annotations());
228        self.process_session(&document.root, LexSemanticTokenKind::DocumentTitle);
229    }
230
231    fn process_session(&mut self, session: &Session, title_kind: LexSemanticTokenKind) {
232        // Emit separate tokens for marker and title text
233        if let Some(marker) = &session.marker {
234            // Emit SessionMarker token for the sequence marker
235            self.push_range(&marker.location, LexSemanticTokenKind::SessionMarker);
236        }
237
238        // Emit SessionTitleText token for the title text (without marker)
239        // Create a range for the title text by using the full title location
240        // and adjusting if there's a marker
241        if let Some(header) = session.header_location() {
242            if let Some(marker) = &session.marker {
243                // Calculate the title text range (after the marker)
244                let marker_text = marker.as_str();
245                let full_title = session.full_title();
246
247                // Find where the marker ends in the title
248                if let Some(pos) = full_title.find(marker_text) {
249                    let marker_end = pos + marker_text.len();
250                    // Skip whitespace after marker
251                    let title_start = full_title[marker_end..]
252                        .chars()
253                        .position(|c| !c.is_whitespace())
254                        .map(|p| marker_end + p)
255                        .unwrap_or(marker_end);
256
257                    if title_start < full_title.len() {
258                        // Create range for title text only
259                        use lex_core::lex::ast::Position;
260                        let title_text_range = Range::new(
261                            header.span.start + title_start..header.span.end,
262                            Position::new(header.start.line, header.start.column + title_start),
263                            header.end,
264                        );
265                        self.push_range(&title_text_range, title_kind);
266                    }
267                }
268            } else {
269                // No marker, the entire header is title text
270                self.push_range(header, title_kind);
271            }
272        }
273
274        self.process_text_content(&session.title);
275
276        self.process_annotations(session.annotations());
277        for child in session.children.iter() {
278            self.process_content_item(child);
279        }
280    }
281
282    fn process_content_item(&mut self, item: &ContentItem) {
283        match item {
284            ContentItem::Paragraph(paragraph) => self.process_paragraph(paragraph),
285            ContentItem::Session(session) => {
286                self.process_session(session, LexSemanticTokenKind::SessionTitleText)
287            }
288            ContentItem::List(list) => self.process_list(list),
289            ContentItem::ListItem(list_item) => self.process_list_item(list_item),
290            ContentItem::Definition(definition) => self.process_definition(definition),
291            ContentItem::Annotation(annotation) => self.process_annotation(annotation),
292            ContentItem::VerbatimBlock(verbatim) => self.process_verbatim(verbatim),
293            ContentItem::TextLine(text_line) => self.process_text_content(&text_line.content),
294            ContentItem::VerbatimLine(_) => {}
295            ContentItem::BlankLineGroup(_) => {}
296        }
297    }
298
299    fn process_paragraph(&mut self, paragraph: &Paragraph) {
300        for line in &paragraph.lines {
301            if let ContentItem::TextLine(text_line) = line {
302                // Don't emit full-line tokens for DefinitionContent or AnnotationContent
303                // as they overlap with inline tokens. The context is already clear from
304                // the DefinitionSubject and AnnotationLabel tokens.
305                self.process_text_content(&text_line.content);
306            }
307        }
308        self.process_annotations(paragraph.annotations());
309    }
310
311    fn process_list(&mut self, list: &List) {
312        self.process_annotations(list.annotations());
313        for item in list.items.iter() {
314            if let ContentItem::ListItem(list_item) = item {
315                self.process_list_item(list_item);
316            }
317        }
318    }
319
320    fn process_list_item(&mut self, list_item: &ListItem) {
321        if let Some(marker_range) = &list_item.marker.location {
322            self.push_range(marker_range, LexSemanticTokenKind::ListMarker);
323        }
324        for text in &list_item.text {
325            if let Some(location) = &text.location {
326                self.push_range(location, LexSemanticTokenKind::ListItemText);
327            }
328            self.process_text_content(text);
329        }
330        self.process_annotations(list_item.annotations());
331        for child in list_item.children.iter() {
332            self.process_content_item(child);
333        }
334    }
335
336    fn process_definition(&mut self, definition: &Definition) {
337        if let Some(header) = definition.header_location() {
338            self.push_range(header, LexSemanticTokenKind::DefinitionSubject);
339        }
340        self.process_text_content(&definition.subject);
341        self.process_annotations(definition.annotations());
342        let was_in_definition = self.in_definition;
343        self.in_definition = true;
344        for child in definition.children.iter() {
345            self.process_content_item(child);
346        }
347        self.in_definition = was_in_definition;
348    }
349
350    fn process_verbatim(&mut self, verbatim: &Verbatim) {
351        for group in verbatim.group() {
352            self.process_text_content(group.subject);
353            if let Some(location) = &group.subject.location {
354                self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
355            }
356        }
357
358        self.push_range(
359            &verbatim.closing_data.label.location,
360            LexSemanticTokenKind::VerbatimLanguage,
361        );
362        for parameter in &verbatim.closing_data.parameters {
363            self.push_range(&parameter.location, LexSemanticTokenKind::VerbatimAttribute);
364        }
365
366        // Highlight verbatim content lines
367        for child in &verbatim.children {
368            if let ContentItem::VerbatimLine(line) = child {
369                self.push_range(&line.location, LexSemanticTokenKind::VerbatimContent);
370            }
371        }
372
373        self.process_annotations(verbatim.annotations());
374    }
375
376    fn process_annotation(&mut self, annotation: &Annotation) {
377        self.push_range(
378            annotation.header_location(),
379            LexSemanticTokenKind::AnnotationLabel,
380        );
381        for parameter in &annotation.data.parameters {
382            self.push_range(
383                &parameter.location,
384                LexSemanticTokenKind::AnnotationParameter,
385            );
386        }
387        let was_in_annotation = self.in_annotation;
388        self.in_annotation = true;
389        for child in annotation.children.iter() {
390            self.process_content_item(child);
391        }
392        self.in_annotation = was_in_annotation;
393    }
394
395    fn process_annotations(&mut self, annotations: &[Annotation]) {
396        for annotation in annotations {
397            self.process_annotation(annotation);
398        }
399    }
400
401    fn process_text_content(&mut self, text: &TextContent) {
402        for span in extract_inline_spans(text) {
403            let kind = match span.kind {
404                InlineSpanKind::Strong => Some(LexSemanticTokenKind::InlineStrong),
405                InlineSpanKind::Emphasis => Some(LexSemanticTokenKind::InlineEmphasis),
406                InlineSpanKind::Code => Some(LexSemanticTokenKind::InlineCode),
407                InlineSpanKind::Math => Some(LexSemanticTokenKind::InlineMath),
408                InlineSpanKind::Reference(reference_type) => Some(match reference_type {
409                    ReferenceType::Citation(_) => LexSemanticTokenKind::ReferenceCitation,
410                    ReferenceType::FootnoteNumber { .. }
411                    | ReferenceType::FootnoteLabeled { .. } => {
412                        LexSemanticTokenKind::ReferenceFootnote
413                    }
414                    _ => LexSemanticTokenKind::Reference,
415                }),
416                InlineSpanKind::StrongMarkerStart => {
417                    Some(LexSemanticTokenKind::InlineMarkerStrongStart)
418                }
419                InlineSpanKind::StrongMarkerEnd => {
420                    Some(LexSemanticTokenKind::InlineMarkerStrongEnd)
421                }
422                InlineSpanKind::EmphasisMarkerStart => {
423                    Some(LexSemanticTokenKind::InlineMarkerEmphasisStart)
424                }
425                InlineSpanKind::EmphasisMarkerEnd => {
426                    Some(LexSemanticTokenKind::InlineMarkerEmphasisEnd)
427                }
428                InlineSpanKind::CodeMarkerStart => {
429                    Some(LexSemanticTokenKind::InlineMarkerCodeStart)
430                }
431                InlineSpanKind::CodeMarkerEnd => Some(LexSemanticTokenKind::InlineMarkerCodeEnd),
432                InlineSpanKind::MathMarkerStart => {
433                    Some(LexSemanticTokenKind::InlineMarkerMathStart)
434                }
435                InlineSpanKind::MathMarkerEnd => Some(LexSemanticTokenKind::InlineMarkerMathEnd),
436                InlineSpanKind::RefMarkerStart => Some(LexSemanticTokenKind::InlineMarkerRefStart),
437                InlineSpanKind::RefMarkerEnd => Some(LexSemanticTokenKind::InlineMarkerRefEnd),
438            };
439            if let Some(kind) = kind {
440                self.push_range(&span.range, kind);
441            }
442        }
443    }
444}
445
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_support::{sample_document, sample_source};
    use lex_core::lex::testing::lexplore::Lexplore;

    /// Returns the source text covered by each token of the given `kind`.
    fn snippets(
        tokens: &[LexSemanticToken],
        kind: LexSemanticTokenKind,
        source: &str,
    ) -> Vec<String> {
        let mut texts = Vec::new();
        for token in tokens.iter().filter(|t| t.kind == kind) {
            texts.push(source[token.range.span.clone()].to_string());
        }
        texts
    }

    #[test]
    fn collects_structural_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        // Session titles are split into SessionMarker and SessionTitleText.
        let session_markers = snippets(&tokens, LexSemanticTokenKind::SessionMarker, source);
        assert!(session_markers.iter().any(|s| s.trim() == "1."));

        let session_titles = snippets(&tokens, LexSemanticTokenKind::SessionTitleText, source);
        assert!(session_titles.iter().any(|s| s.trim() == "Intro"));

        // "Cache" is parsed as a VerbatimSubject.
        let verbatim_subjects = snippets(&tokens, LexSemanticTokenKind::VerbatimSubject, source);
        assert!(verbatim_subjects.iter().any(|s| s.trim_end() == "Cache"));
        assert!(verbatim_subjects.iter().any(|s| s.contains("CLI Example")));

        let list_markers = snippets(&tokens, LexSemanticTokenKind::ListMarker, source);
        assert_eq!(list_markers.len(), 4);
        for marker in &list_markers {
            let trimmed = marker.trim_start();
            assert!(trimmed.starts_with('-') || trimmed.chars().next().unwrap().is_numeric());
        }

        let labels = snippets(&tokens, LexSemanticTokenKind::AnnotationLabel, source);
        assert!(labels.iter().any(|s| s.contains("doc.note")));

        let parameters = snippets(&tokens, LexSemanticTokenKind::AnnotationParameter, source);
        assert!(parameters.iter().any(|s| s.contains("severity=info")));

        let languages = snippets(&tokens, LexSemanticTokenKind::VerbatimLanguage, source);
        assert!(languages.iter().any(|s| s.contains("shell")));
    }

    #[test]
    fn collects_inline_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        let strong = snippets(&tokens, LexSemanticTokenKind::InlineStrong, source);
        assert!(strong.iter().any(|s| s.contains("Lex")));

        let emphasis = snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source);
        assert!(emphasis.iter().any(|s| s.contains("format")));

        let code = snippets(&tokens, LexSemanticTokenKind::InlineCode, source);
        assert!(code.iter().any(|s| s.contains("code")));

        let math = snippets(&tokens, LexSemanticTokenKind::InlineMath, source);
        assert!(math.iter().any(|s| s.contains("math")));
    }

    #[test]
    fn classifies_references() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        let citations = snippets(&tokens, LexSemanticTokenKind::ReferenceCitation, source);
        assert!(citations.iter().any(|s| s.contains("@spec2025")));

        let footnotes = snippets(&tokens, LexSemanticTokenKind::ReferenceFootnote, source);
        assert!(footnotes.iter().any(|s| s.contains("^source")));
        assert!(footnotes.iter().any(|s| s.contains("1")));

        let references = snippets(&tokens, LexSemanticTokenKind::Reference, source);
        assert!(references.iter().any(|s| s.contains("Cache")));
    }

    #[test]
    fn empty_document_has_no_tokens() {
        let document = Lexplore::benchmark(0)
            .parse()
            .expect("failed to parse empty benchmark fixture");
        assert!(collect_semantic_tokens(&document).is_empty());
    }
}