1use crate::inline::{extract_inline_spans, InlineSpanKind};
42use lex_core::lex::ast::{
43 Annotation, ContentItem, Definition, Document, List, ListItem, Paragraph, Range, Session,
44 TextContent, Verbatim,
45};
46use lex_core::lex::inlines::ReferenceType;
47
/// Semantic classification for a highlighted region of a Lex document.
///
/// Each variant corresponds 1:1 to a stable string identifier returned by
/// [`LexSemanticTokenKind::as_str`], and every variant must also appear in
/// `SEMANTIC_TOKEN_KINDS`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LexSemanticTokenKind {
    // Structural tokens: document/session headers and markers.
    DocumentTitle,
    SessionMarker,
    SessionTitleText,
    // Definition blocks.
    DefinitionSubject,
    DefinitionContent,
    // Lists.
    ListMarker,
    ListItemText,
    // Annotations (label, key=value parameters, body content).
    AnnotationLabel,
    AnnotationParameter,
    AnnotationContent,
    // Inline formatting spans (content between the markers).
    InlineStrong,
    InlineEmphasis,
    InlineCode,
    InlineMath,
    // References: generic, citation (`@key`), and footnote forms.
    Reference,
    ReferenceCitation,
    ReferenceFootnote,
    // Verbatim blocks: subject line, language label, attributes, and raw body.
    VerbatimSubject,
    VerbatimLanguage,
    VerbatimAttribute,
    VerbatimContent,
    // Delimiter tokens for inline formatting (the markers themselves),
    // emitted separately so editors can dim or hide them.
    InlineMarkerStrongStart,
    InlineMarkerStrongEnd,
    InlineMarkerEmphasisStart,
    InlineMarkerEmphasisEnd,
    InlineMarkerCodeStart,
    InlineMarkerCodeEnd,
    InlineMarkerMathStart,
    InlineMarkerMathEnd,
    InlineMarkerRefStart,
    InlineMarkerRefEnd,
}
82
83impl LexSemanticTokenKind {
84 pub fn as_str(self) -> &'static str {
101 match self {
102 LexSemanticTokenKind::DocumentTitle => "DocumentTitle",
103 LexSemanticTokenKind::SessionMarker => "SessionMarker",
104 LexSemanticTokenKind::SessionTitleText => "SessionTitleText",
105 LexSemanticTokenKind::DefinitionSubject => "DefinitionSubject",
106 LexSemanticTokenKind::DefinitionContent => "DefinitionContent",
107 LexSemanticTokenKind::ListMarker => "ListMarker",
108 LexSemanticTokenKind::ListItemText => "ListItemText",
109 LexSemanticTokenKind::AnnotationLabel => "AnnotationLabel",
110 LexSemanticTokenKind::AnnotationParameter => "AnnotationParameter",
111 LexSemanticTokenKind::AnnotationContent => "AnnotationContent",
112 LexSemanticTokenKind::InlineStrong => "InlineStrong",
113 LexSemanticTokenKind::InlineEmphasis => "InlineEmphasis",
114 LexSemanticTokenKind::InlineCode => "InlineCode",
115 LexSemanticTokenKind::InlineMath => "InlineMath",
116 LexSemanticTokenKind::Reference => "Reference",
117 LexSemanticTokenKind::ReferenceCitation => "ReferenceCitation",
118 LexSemanticTokenKind::ReferenceFootnote => "ReferenceFootnote",
119 LexSemanticTokenKind::VerbatimSubject => "VerbatimSubject",
120 LexSemanticTokenKind::VerbatimLanguage => "VerbatimLanguage",
121 LexSemanticTokenKind::VerbatimAttribute => "VerbatimAttribute",
122 LexSemanticTokenKind::VerbatimContent => "VerbatimContent",
123 LexSemanticTokenKind::InlineMarkerStrongStart => "InlineMarker_strong_start",
124 LexSemanticTokenKind::InlineMarkerStrongEnd => "InlineMarker_strong_end",
125 LexSemanticTokenKind::InlineMarkerEmphasisStart => "InlineMarker_emphasis_start",
126 LexSemanticTokenKind::InlineMarkerEmphasisEnd => "InlineMarker_emphasis_end",
127 LexSemanticTokenKind::InlineMarkerCodeStart => "InlineMarker_code_start",
128 LexSemanticTokenKind::InlineMarkerCodeEnd => "InlineMarker_code_end",
129 LexSemanticTokenKind::InlineMarkerMathStart => "InlineMarker_math_start",
130 LexSemanticTokenKind::InlineMarkerMathEnd => "InlineMarker_math_end",
131 LexSemanticTokenKind::InlineMarkerRefStart => "InlineMarker_ref_start",
132 LexSemanticTokenKind::InlineMarkerRefEnd => "InlineMarker_ref_end",
133 }
134 }
135}
136
/// Every token kind, in registration order.
///
/// NOTE(review): this list is maintained by hand and must stay in sync with
/// the `LexSemanticTokenKind` enum — adding a variant without listing it here
/// will compile but silently omit the kind from consumers of this slice.
/// Keep the order identical to the enum declaration.
pub const SEMANTIC_TOKEN_KINDS: &[LexSemanticTokenKind] = &[
    LexSemanticTokenKind::DocumentTitle,
    LexSemanticTokenKind::SessionMarker,
    LexSemanticTokenKind::SessionTitleText,
    LexSemanticTokenKind::DefinitionSubject,
    LexSemanticTokenKind::DefinitionContent,
    LexSemanticTokenKind::ListMarker,
    LexSemanticTokenKind::ListItemText,
    LexSemanticTokenKind::AnnotationLabel,
    LexSemanticTokenKind::AnnotationParameter,
    LexSemanticTokenKind::AnnotationContent,
    LexSemanticTokenKind::InlineStrong,
    LexSemanticTokenKind::InlineEmphasis,
    LexSemanticTokenKind::InlineCode,
    LexSemanticTokenKind::InlineMath,
    LexSemanticTokenKind::Reference,
    LexSemanticTokenKind::ReferenceCitation,
    LexSemanticTokenKind::ReferenceFootnote,
    LexSemanticTokenKind::VerbatimSubject,
    LexSemanticTokenKind::VerbatimLanguage,
    LexSemanticTokenKind::VerbatimAttribute,
    LexSemanticTokenKind::VerbatimContent,
    LexSemanticTokenKind::InlineMarkerStrongStart,
    LexSemanticTokenKind::InlineMarkerStrongEnd,
    LexSemanticTokenKind::InlineMarkerEmphasisStart,
    LexSemanticTokenKind::InlineMarkerEmphasisEnd,
    LexSemanticTokenKind::InlineMarkerCodeStart,
    LexSemanticTokenKind::InlineMarkerCodeEnd,
    LexSemanticTokenKind::InlineMarkerMathStart,
    LexSemanticTokenKind::InlineMarkerMathEnd,
    LexSemanticTokenKind::InlineMarkerRefStart,
    LexSemanticTokenKind::InlineMarkerRefEnd,
];
170
/// A single classified span of source: a token kind plus its source range.
#[derive(Debug, Clone, PartialEq)]
pub struct LexSemanticToken {
    // What kind of construct this span represents.
    pub kind: LexSemanticTokenKind,
    // Where the span lives in the source (byte span + line/column positions).
    pub range: Range,
}
176
177pub fn collect_semantic_tokens(document: &Document) -> Vec<LexSemanticToken> {
178 let mut collector = TokenCollector::new();
179 collector.process_document(document);
180 collector.finish()
181}
182
/// Tree-walking accumulator for semantic tokens.
struct TokenCollector {
    // Tokens gathered so far; sorted once in `finish`.
    tokens: Vec<LexSemanticToken>,
    // NOTE(review): `in_annotation` and `in_definition` are set while
    // descending into annotation/definition children but are never read
    // anywhere in this file — either dead state or reserved for a future
    // kind remapping (e.g. AnnotationContent/DefinitionContent). Confirm
    // before removing.
    in_annotation: bool,
    in_definition: bool,
}
188
189impl TokenCollector {
190 fn new() -> Self {
191 Self {
192 tokens: Vec::new(),
193 in_annotation: false,
194 in_definition: false,
195 }
196 }
197
198 fn finish(mut self) -> Vec<LexSemanticToken> {
199 self.tokens.sort_by(|a, b| {
200 let a_start = (
201 &a.range.start.line,
202 &a.range.start.column,
203 &a.range.end.line,
204 &a.range.end.column,
205 );
206 let b_start = (
207 &b.range.start.line,
208 &b.range.start.column,
209 &b.range.end.line,
210 &b.range.end.column,
211 );
212 a_start.cmp(&b_start)
213 });
214 self.tokens
215 }
216
217 fn push_range(&mut self, range: &Range, kind: LexSemanticTokenKind) {
218 if range.span.start < range.span.end {
219 self.tokens.push(LexSemanticToken {
220 kind,
221 range: range.clone(),
222 });
223 }
224 }
225
226 fn process_document(&mut self, document: &Document) {
227 self.process_annotations(document.annotations());
228 self.process_session(&document.root, LexSemanticTokenKind::DocumentTitle);
229 }
230
231 fn process_session(&mut self, session: &Session, title_kind: LexSemanticTokenKind) {
232 if let Some(marker) = &session.marker {
234 self.push_range(&marker.location, LexSemanticTokenKind::SessionMarker);
236 }
237
238 if let Some(header) = session.header_location() {
242 if let Some(marker) = &session.marker {
243 let marker_text = marker.as_str();
245 let full_title = session.full_title();
246
247 if let Some(pos) = full_title.find(marker_text) {
249 let marker_end = pos + marker_text.len();
250 let title_start = full_title[marker_end..]
252 .chars()
253 .position(|c| !c.is_whitespace())
254 .map(|p| marker_end + p)
255 .unwrap_or(marker_end);
256
257 if title_start < full_title.len() {
258 use lex_core::lex::ast::Position;
260 let title_text_range = Range::new(
261 header.span.start + title_start..header.span.end,
262 Position::new(header.start.line, header.start.column + title_start),
263 header.end,
264 );
265 self.push_range(&title_text_range, title_kind);
266 }
267 }
268 } else {
269 self.push_range(header, title_kind);
271 }
272 }
273
274 self.process_text_content(&session.title);
275
276 self.process_annotations(session.annotations());
277 for child in session.children.iter() {
278 self.process_content_item(child);
279 }
280 }
281
282 fn process_content_item(&mut self, item: &ContentItem) {
283 match item {
284 ContentItem::Paragraph(paragraph) => self.process_paragraph(paragraph),
285 ContentItem::Session(session) => {
286 self.process_session(session, LexSemanticTokenKind::SessionTitleText)
287 }
288 ContentItem::List(list) => self.process_list(list),
289 ContentItem::ListItem(list_item) => self.process_list_item(list_item),
290 ContentItem::Definition(definition) => self.process_definition(definition),
291 ContentItem::Annotation(annotation) => self.process_annotation(annotation),
292 ContentItem::VerbatimBlock(verbatim) => self.process_verbatim(verbatim),
293 ContentItem::TextLine(text_line) => self.process_text_content(&text_line.content),
294 ContentItem::VerbatimLine(_) => {}
295 ContentItem::BlankLineGroup(_) => {}
296 }
297 }
298
299 fn process_paragraph(&mut self, paragraph: &Paragraph) {
300 for line in ¶graph.lines {
301 if let ContentItem::TextLine(text_line) = line {
302 self.process_text_content(&text_line.content);
306 }
307 }
308 self.process_annotations(paragraph.annotations());
309 }
310
311 fn process_list(&mut self, list: &List) {
312 self.process_annotations(list.annotations());
313 for item in list.items.iter() {
314 if let ContentItem::ListItem(list_item) = item {
315 self.process_list_item(list_item);
316 }
317 }
318 }
319
320 fn process_list_item(&mut self, list_item: &ListItem) {
321 if let Some(marker_range) = &list_item.marker.location {
322 self.push_range(marker_range, LexSemanticTokenKind::ListMarker);
323 }
324 for text in &list_item.text {
325 if let Some(location) = &text.location {
326 self.push_range(location, LexSemanticTokenKind::ListItemText);
327 }
328 self.process_text_content(text);
329 }
330 self.process_annotations(list_item.annotations());
331 for child in list_item.children.iter() {
332 self.process_content_item(child);
333 }
334 }
335
336 fn process_definition(&mut self, definition: &Definition) {
337 if let Some(header) = definition.header_location() {
338 self.push_range(header, LexSemanticTokenKind::DefinitionSubject);
339 }
340 self.process_text_content(&definition.subject);
341 self.process_annotations(definition.annotations());
342 let was_in_definition = self.in_definition;
343 self.in_definition = true;
344 for child in definition.children.iter() {
345 self.process_content_item(child);
346 }
347 self.in_definition = was_in_definition;
348 }
349
350 fn process_verbatim(&mut self, verbatim: &Verbatim) {
351 for group in verbatim.group() {
352 self.process_text_content(group.subject);
353 if let Some(location) = &group.subject.location {
354 self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
355 }
356 }
357
358 self.push_range(
359 &verbatim.closing_data.label.location,
360 LexSemanticTokenKind::VerbatimLanguage,
361 );
362 for parameter in &verbatim.closing_data.parameters {
363 self.push_range(¶meter.location, LexSemanticTokenKind::VerbatimAttribute);
364 }
365
366 for child in &verbatim.children {
368 if let ContentItem::VerbatimLine(line) = child {
369 self.push_range(&line.location, LexSemanticTokenKind::VerbatimContent);
370 }
371 }
372
373 self.process_annotations(verbatim.annotations());
374 }
375
376 fn process_annotation(&mut self, annotation: &Annotation) {
377 self.push_range(
378 annotation.header_location(),
379 LexSemanticTokenKind::AnnotationLabel,
380 );
381 for parameter in &annotation.data.parameters {
382 self.push_range(
383 ¶meter.location,
384 LexSemanticTokenKind::AnnotationParameter,
385 );
386 }
387 let was_in_annotation = self.in_annotation;
388 self.in_annotation = true;
389 for child in annotation.children.iter() {
390 self.process_content_item(child);
391 }
392 self.in_annotation = was_in_annotation;
393 }
394
395 fn process_annotations(&mut self, annotations: &[Annotation]) {
396 for annotation in annotations {
397 self.process_annotation(annotation);
398 }
399 }
400
401 fn process_text_content(&mut self, text: &TextContent) {
402 for span in extract_inline_spans(text) {
403 let kind = match span.kind {
404 InlineSpanKind::Strong => Some(LexSemanticTokenKind::InlineStrong),
405 InlineSpanKind::Emphasis => Some(LexSemanticTokenKind::InlineEmphasis),
406 InlineSpanKind::Code => Some(LexSemanticTokenKind::InlineCode),
407 InlineSpanKind::Math => Some(LexSemanticTokenKind::InlineMath),
408 InlineSpanKind::Reference(reference_type) => Some(match reference_type {
409 ReferenceType::Citation(_) => LexSemanticTokenKind::ReferenceCitation,
410 ReferenceType::FootnoteNumber { .. }
411 | ReferenceType::FootnoteLabeled { .. } => {
412 LexSemanticTokenKind::ReferenceFootnote
413 }
414 _ => LexSemanticTokenKind::Reference,
415 }),
416 InlineSpanKind::StrongMarkerStart => {
417 Some(LexSemanticTokenKind::InlineMarkerStrongStart)
418 }
419 InlineSpanKind::StrongMarkerEnd => {
420 Some(LexSemanticTokenKind::InlineMarkerStrongEnd)
421 }
422 InlineSpanKind::EmphasisMarkerStart => {
423 Some(LexSemanticTokenKind::InlineMarkerEmphasisStart)
424 }
425 InlineSpanKind::EmphasisMarkerEnd => {
426 Some(LexSemanticTokenKind::InlineMarkerEmphasisEnd)
427 }
428 InlineSpanKind::CodeMarkerStart => {
429 Some(LexSemanticTokenKind::InlineMarkerCodeStart)
430 }
431 InlineSpanKind::CodeMarkerEnd => Some(LexSemanticTokenKind::InlineMarkerCodeEnd),
432 InlineSpanKind::MathMarkerStart => {
433 Some(LexSemanticTokenKind::InlineMarkerMathStart)
434 }
435 InlineSpanKind::MathMarkerEnd => Some(LexSemanticTokenKind::InlineMarkerMathEnd),
436 InlineSpanKind::RefMarkerStart => Some(LexSemanticTokenKind::InlineMarkerRefStart),
437 InlineSpanKind::RefMarkerEnd => Some(LexSemanticTokenKind::InlineMarkerRefEnd),
438 };
439 if let Some(kind) = kind {
440 self.push_range(&span.range, kind);
441 }
442 }
443 }
444}
445
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_support::{sample_document, sample_source};
    use lex_core::lex::testing::lexplore::Lexplore;

    // Extracts the raw source text for every collected token of `kind`.
    // Panics (slice out of bounds) if a token's span doesn't fit `source`,
    // which doubles as a range-sanity check.
    fn snippets(
        tokens: &[LexSemanticToken],
        kind: LexSemanticTokenKind,
        source: &str,
    ) -> Vec<String> {
        tokens
            .iter()
            .filter(|token| token.kind == kind)
            .map(|token| source[token.range.span.clone()].to_string())
            .collect()
    }

    // Structural tokens (markers, titles, verbatim metadata, annotations)
    // against the shared sample fixture. Expected snippets ("1.", "Intro",
    // "Cache", ...) are fixed by sample_source()'s content.
    #[test]
    fn collects_structural_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        assert!(
            snippets(&tokens, LexSemanticTokenKind::SessionMarker, source)
                .iter()
                .any(|snippet| snippet.trim() == "1.")
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::SessionTitleText, source)
                .iter()
                .any(|snippet| snippet.trim() == "Intro")
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::VerbatimSubject, source)
                .iter()
                .any(|snippet| snippet.trim_end() == "Cache")
        );
        // The fixture contains exactly 4 list items, mixing "-" bullets and
        // numbered markers.
        let markers = snippets(&tokens, LexSemanticTokenKind::ListMarker, source);
        assert_eq!(markers.len(), 4);
        assert!(markers
            .iter()
            .all(|snippet| snippet.trim_start().starts_with('-')
                || snippet.trim_start().chars().next().unwrap().is_numeric()));
        let annotation_labels = snippets(&tokens, LexSemanticTokenKind::AnnotationLabel, source);
        assert!(annotation_labels
            .iter()
            .any(|snippet| snippet.contains("doc.note")));
        let parameters = snippets(&tokens, LexSemanticTokenKind::AnnotationParameter, source);
        assert!(parameters
            .iter()
            .any(|snippet| snippet.contains("severity=info")));
        let verbatim_subjects = snippets(&tokens, LexSemanticTokenKind::VerbatimSubject, source);
        assert!(verbatim_subjects
            .iter()
            .any(|snippet| snippet.contains("CLI Example")));
        assert!(
            snippets(&tokens, LexSemanticTokenKind::VerbatimLanguage, source)
                .iter()
                .any(|snippet| snippet.contains("shell"))
        );
    }

    // Inline formatting spans (strong/emphasis/code/math) from the fixture.
    #[test]
    fn collects_inline_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        assert!(
            snippets(&tokens, LexSemanticTokenKind::InlineStrong, source)
                .iter()
                .any(|snippet| snippet.contains("Lex"))
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::InlineEmphasis, source)
                .iter()
                .any(|snippet| snippet.contains("format"))
        );
        assert!(snippets(&tokens, LexSemanticTokenKind::InlineCode, source)
            .iter()
            .any(|snippet| snippet.contains("code")));
        assert!(snippets(&tokens, LexSemanticTokenKind::InlineMath, source)
            .iter()
            .any(|snippet| snippet.contains("math")));
    }

    // Reference sub-kinds: citation (@key), footnotes (labeled + numeric),
    // and the generic fallback.
    #[test]
    fn classifies_references() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        assert!(
            snippets(&tokens, LexSemanticTokenKind::ReferenceCitation, source)
                .iter()
                .any(|snippet| snippet.contains("@spec2025"))
        );
        assert!(
            snippets(&tokens, LexSemanticTokenKind::ReferenceFootnote, source)
                .iter()
                .any(|snippet| snippet.contains("^source"))
        );
        // NOTE(review): `contains("1")` is a very weak check — it matches any
        // snippet with a '1' anywhere. Consider asserting the exact footnote
        // token text instead.
        assert!(
            snippets(&tokens, LexSemanticTokenKind::ReferenceFootnote, source)
                .iter()
                .any(|snippet| snippet.contains("1"))
        );
        assert!(snippets(&tokens, LexSemanticTokenKind::Reference, source)
            .iter()
            .any(|snippet| snippet.contains("Cache")));
    }

    // An empty benchmark fixture must yield no tokens (push_range drops
    // zero-width ranges; an empty document has nothing else).
    #[test]
    fn empty_document_has_no_tokens() {
        let document = Lexplore::benchmark(0)
            .parse()
            .expect("failed to parse empty benchmark fixture");
        let tokens = collect_semantic_tokens(&document);
        assert!(tokens.is_empty());
    }
}