use lex_core::lex::ast::inline_positions::{walk_text_content_positions, InlinePositionVisitor};
use lex_core::lex::ast::{
Annotation, ContentItem, Definition, Document, List, ListItem, Paragraph, Range, Session,
Table, TextContent, Verbatim,
};
use lex_core::lex::inlines::{ReferenceInline, ReferenceType};
/// Classification attached to every semantic token emitted for a Lex document.
///
/// Structural kinds are produced by the tree walker (`TokenCollector`); inline
/// kinds and inline marker kinds are produced by the inline visitor
/// (`InlineTokenEmitter`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LexSemanticTokenKind {
/// Text of the document title.
DocumentTitle,
/// Text of the document subtitle.
DocumentSubtitle,
/// Marker of a session header (the sample fixture contains `1.`), when the session has one.
SessionMarker,
/// Title text of a session header; starts after the marker when a marker is present.
SessionTitleText,
/// Header of a definition.
DefinitionSubject,
/// Plain (unformatted) inline text inside a definition body that is not inside an annotation.
DefinitionContent,
/// Marker of a list item.
ListMarker,
/// Text of a list item.
ListItemText,
/// Header of an annotation.
AnnotationLabel,
/// A parameter in an annotation's data.
AnnotationParameter,
/// Plain (unformatted) inline text inside an annotation body.
AnnotationContent,
/// Content between strong markers.
InlineStrong,
/// Content between emphasis markers.
InlineEmphasis,
/// Content between inline code markers.
InlineCode,
/// Content between inline math markers.
InlineMath,
/// Content of a reference that is not a citation, footnote number, or annotation reference.
Reference,
/// Content of a citation reference.
ReferenceCitation,
/// Content of a footnote-number reference.
ReferenceFootnote,
/// Content of an annotation reference.
ReferenceAnnotation,
/// Subject of a verbatim group; also used for a table's subject.
VerbatimSubject,
/// Label of a verbatim block's closing data.
DataLabel,
/// A parameter of a verbatim block's closing data.
DataParameter,
/// One line of verbatim content.
VerbatimContent,
/// Opening marker of a strong span.
InlineMarkerStrongStart,
/// Closing marker of a strong span.
InlineMarkerStrongEnd,
/// Opening marker of an emphasis span.
InlineMarkerEmphasisStart,
/// Closing marker of an emphasis span.
InlineMarkerEmphasisEnd,
/// Opening marker of an inline code span.
InlineMarkerCodeStart,
/// Closing marker of an inline code span.
InlineMarkerCodeEnd,
/// Opening marker of an inline math span.
InlineMarkerMathStart,
/// Closing marker of an inline math span.
InlineMarkerMathEnd,
/// Opening marker of a reference.
InlineMarkerRefStart,
/// Closing marker of a reference.
InlineMarkerRefEnd,
}
impl LexSemanticTokenKind {
pub fn as_str(self) -> &'static str {
match self {
LexSemanticTokenKind::DocumentTitle => "DocumentTitle",
LexSemanticTokenKind::DocumentSubtitle => "DocumentSubtitle",
LexSemanticTokenKind::SessionMarker => "SessionMarker",
LexSemanticTokenKind::SessionTitleText => "SessionTitleText",
LexSemanticTokenKind::DefinitionSubject => "DefinitionSubject",
LexSemanticTokenKind::DefinitionContent => "DefinitionContent",
LexSemanticTokenKind::ListMarker => "ListMarker",
LexSemanticTokenKind::ListItemText => "ListItemText",
LexSemanticTokenKind::AnnotationLabel => "AnnotationLabel",
LexSemanticTokenKind::AnnotationParameter => "AnnotationParameter",
LexSemanticTokenKind::AnnotationContent => "AnnotationContent",
LexSemanticTokenKind::InlineStrong => "InlineStrong",
LexSemanticTokenKind::InlineEmphasis => "InlineEmphasis",
LexSemanticTokenKind::InlineCode => "InlineCode",
LexSemanticTokenKind::InlineMath => "InlineMath",
LexSemanticTokenKind::Reference => "Reference",
LexSemanticTokenKind::ReferenceCitation => "ReferenceCitation",
LexSemanticTokenKind::ReferenceFootnote => "ReferenceFootnote",
LexSemanticTokenKind::ReferenceAnnotation => "ReferenceAnnotation",
LexSemanticTokenKind::VerbatimSubject => "VerbatimSubject",
LexSemanticTokenKind::DataLabel => "DataLabel",
LexSemanticTokenKind::DataParameter => "DataParameter",
LexSemanticTokenKind::VerbatimContent => "VerbatimContent",
LexSemanticTokenKind::InlineMarkerStrongStart => "InlineMarker_strong_start",
LexSemanticTokenKind::InlineMarkerStrongEnd => "InlineMarker_strong_end",
LexSemanticTokenKind::InlineMarkerEmphasisStart => "InlineMarker_emphasis_start",
LexSemanticTokenKind::InlineMarkerEmphasisEnd => "InlineMarker_emphasis_end",
LexSemanticTokenKind::InlineMarkerCodeStart => "InlineMarker_code_start",
LexSemanticTokenKind::InlineMarkerCodeEnd => "InlineMarker_code_end",
LexSemanticTokenKind::InlineMarkerMathStart => "InlineMarker_math_start",
LexSemanticTokenKind::InlineMarkerMathEnd => "InlineMarker_math_end",
LexSemanticTokenKind::InlineMarkerRefStart => "InlineMarker_ref_start",
LexSemanticTokenKind::InlineMarkerRefEnd => "InlineMarker_ref_end",
}
}
}
/// Every [`LexSemanticTokenKind`] variant, listed once each in a fixed order.
///
/// NOTE(review): the order is not the enum declaration order (`ReferenceAnnotation`
/// sits at the end). Presumably consumers depend on positions in this slice, so
/// new kinds should be appended rather than inserted — confirm against callers.
pub const SEMANTIC_TOKEN_KINDS: &[LexSemanticTokenKind] = &[
LexSemanticTokenKind::DocumentTitle,
LexSemanticTokenKind::DocumentSubtitle,
LexSemanticTokenKind::SessionMarker,
LexSemanticTokenKind::SessionTitleText,
LexSemanticTokenKind::DefinitionSubject,
LexSemanticTokenKind::DefinitionContent,
LexSemanticTokenKind::ListMarker,
LexSemanticTokenKind::ListItemText,
LexSemanticTokenKind::AnnotationLabel,
LexSemanticTokenKind::AnnotationParameter,
LexSemanticTokenKind::AnnotationContent,
LexSemanticTokenKind::InlineStrong,
LexSemanticTokenKind::InlineEmphasis,
LexSemanticTokenKind::InlineCode,
LexSemanticTokenKind::InlineMath,
LexSemanticTokenKind::Reference,
LexSemanticTokenKind::ReferenceCitation,
LexSemanticTokenKind::ReferenceFootnote,
LexSemanticTokenKind::VerbatimSubject,
LexSemanticTokenKind::DataLabel,
LexSemanticTokenKind::DataParameter,
LexSemanticTokenKind::VerbatimContent,
LexSemanticTokenKind::InlineMarkerStrongStart,
LexSemanticTokenKind::InlineMarkerStrongEnd,
LexSemanticTokenKind::InlineMarkerEmphasisStart,
LexSemanticTokenKind::InlineMarkerEmphasisEnd,
LexSemanticTokenKind::InlineMarkerCodeStart,
LexSemanticTokenKind::InlineMarkerCodeEnd,
LexSemanticTokenKind::InlineMarkerMathStart,
LexSemanticTokenKind::InlineMarkerMathEnd,
LexSemanticTokenKind::InlineMarkerRefStart,
LexSemanticTokenKind::InlineMarkerRefEnd,
// Listed last, apart from the other reference kinds above.
LexSemanticTokenKind::ReferenceAnnotation,
];
/// A single semantic token: a classification plus the source range it covers.
#[derive(Debug, Clone, PartialEq)]
pub struct LexSemanticToken {
/// What the covered text represents.
pub kind: LexSemanticTokenKind,
/// Location of the token in the source; the collector only emits ranges whose span is non-empty.
pub range: Range,
}
/// Collects the semantic tokens of `document`.
///
/// A fresh `TokenCollector` walks the whole document; the returned tokens
/// are ordered by start position and then by end position.
pub fn collect_semantic_tokens(document: &Document) -> Vec<LexSemanticToken> {
    let mut walker = TokenCollector::new();
    walker.process_document(document);
    walker.finish()
}
/// Accumulates semantic tokens while walking a document tree.
struct TokenCollector {
// Tokens in emission order; `finish` sorts them before returning.
tokens: Vec<LexSemanticToken>,
// True while the children of an annotation are being processed; plain inline
// text is then classified as `AnnotationContent`.
in_annotation: bool,
// True while the children of a definition are being processed; plain inline
// text is then classified as `DefinitionContent` unless `in_annotation` is set.
in_definition: bool,
}
impl TokenCollector {
    /// Creates an empty collector positioned outside any annotation or definition.
    fn new() -> Self {
        Self {
            tokens: Vec::new(),
            in_annotation: false,
            in_definition: false,
        }
    }

    /// Consumes the collector and returns its tokens ordered by
    /// (start line, start column, end line, end column).
    fn finish(mut self) -> Vec<LexSemanticToken> {
        // `sort_by` is stable, so tokens covering the same range keep the
        // order in which they were emitted.
        self.tokens.sort_by(|a, b| {
            let a_key = (
                &a.range.start.line,
                &a.range.start.column,
                &a.range.end.line,
                &a.range.end.column,
            );
            let b_key = (
                &b.range.start.line,
                &b.range.start.column,
                &b.range.end.line,
                &b.range.end.column,
            );
            a_key.cmp(&b_key)
        });
        self.tokens
    }

    /// Records a token of `kind` covering `range`; ranges with an empty span are dropped.
    fn push_range(&mut self, range: &Range, kind: LexSemanticTokenKind) {
        if range.span.start < range.span.end {
            self.tokens.push(LexSemanticToken {
                kind,
                range: range.clone(),
            });
        }
    }

    /// Emits tokens for the document-level annotations, the optional title and
    /// subtitle, and then the root session.
    fn process_document(&mut self, document: &Document) {
        self.process_annotations(document.annotations());
        if let Some(title) = &document.title {
            // Prefer the location of the title's text content; fall back to the
            // location of the title element itself.
            if let Some(title_loc) = &title.content.location {
                self.push_range(title_loc, LexSemanticTokenKind::DocumentTitle);
            } else {
                self.push_range(&title.location, LexSemanticTokenKind::DocumentTitle);
            }
            self.process_text_content(&title.content);
            if let Some(subtitle) = &title.subtitle {
                if let Some(sub_loc) = &subtitle.location {
                    self.push_range(sub_loc, LexSemanticTokenKind::DocumentSubtitle);
                }
                self.process_text_content(subtitle);
            }
        }
        self.process_session(&document.root, LexSemanticTokenKind::SessionTitleText);
    }

    /// Emits tokens for a session: its marker, its title text (tagged with
    /// `title_kind`), inline tokens in the title, its annotations, and its children.
    fn process_session(&mut self, session: &Session, title_kind: LexSemanticTokenKind) {
        use lex_core::lex::ast::Position;

        if let Some(marker) = &session.marker {
            self.push_range(&marker.location, LexSemanticTokenKind::SessionMarker);
        }
        if let Some(header) = session.header_location() {
            if let Some(marker) = &session.marker {
                // The header contains the marker; the title token must start at
                // the first non-whitespace character that follows it.
                let marker_text = marker.as_str();
                let full_title = session.full_title();
                if let Some(pos) = full_title.find(marker_text) {
                    let marker_end = pos + marker_text.len();
                    // `str::find` yields a BYTE offset, which is what the span
                    // arithmetic below needs. Counting characters instead would
                    // misplace the start after multi-byte whitespace.
                    let title_start = full_title[marker_end..]
                        .find(|c: char| !c.is_whitespace())
                        .map_or(marker_end, |offset| marker_end + offset);
                    if title_start < full_title.len() {
                        // NOTE(review): the column is advanced by a byte offset;
                        // this assumes columns are byte-based (or the prefix is
                        // ASCII) — confirm against `Position` semantics.
                        let title_text_range = Range::new(
                            header.span.start + title_start..header.span.end,
                            Position::new(header.start.line, header.start.column + title_start),
                            header.end,
                        );
                        self.push_range(&title_text_range, title_kind);
                    }
                }
            } else {
                // No marker: the whole header is title text.
                self.push_range(header, title_kind);
            }
        }
        self.process_text_content(&session.title);
        self.process_annotations(session.annotations());
        for child in session.children.iter() {
            self.process_content_item(child);
        }
    }

    /// Dispatches a content item to the handler for its variant.
    fn process_content_item(&mut self, item: &ContentItem) {
        match item {
            ContentItem::Paragraph(paragraph) => self.process_paragraph(paragraph),
            ContentItem::Session(session) => {
                self.process_session(session, LexSemanticTokenKind::SessionTitleText)
            }
            ContentItem::List(list) => self.process_list(list),
            ContentItem::ListItem(list_item) => self.process_list_item(list_item),
            ContentItem::Definition(definition) => self.process_definition(definition),
            ContentItem::Annotation(annotation) => self.process_annotation(annotation),
            ContentItem::VerbatimBlock(verbatim) => self.process_verbatim(verbatim),
            ContentItem::Table(table) => self.process_table(table),
            ContentItem::TextLine(text_line) => self.process_text_content(&text_line.content),
            // Verbatim lines are only tokenized through their verbatim block;
            // blank lines carry no tokens.
            ContentItem::VerbatimLine(_) => {}
            ContentItem::BlankLineGroup(_) => {}
        }
    }

    /// Emits inline tokens for each text line of a paragraph, then its annotations.
    fn process_paragraph(&mut self, paragraph: &Paragraph) {
        for line in &paragraph.lines {
            if let ContentItem::TextLine(text_line) = line {
                self.process_text_content(&text_line.content);
            }
        }
        self.process_annotations(paragraph.annotations());
    }

    /// Emits tokens for a list's annotations and each of its list items.
    fn process_list(&mut self, list: &List) {
        self.process_annotations(list.annotations());
        for item in list.items.iter() {
            if let ContentItem::ListItem(list_item) = item {
                self.process_list_item(list_item);
            }
        }
    }

    /// Emits the marker and text tokens of a list item, then its annotations and children.
    fn process_list_item(&mut self, list_item: &ListItem) {
        if let Some(marker_range) = &list_item.marker.location {
            self.push_range(marker_range, LexSemanticTokenKind::ListMarker);
        }
        for text in &list_item.text {
            if let Some(location) = &text.location {
                self.push_range(location, LexSemanticTokenKind::ListItemText);
            }
            self.process_text_content(text);
        }
        self.process_annotations(list_item.annotations());
        for child in list_item.children.iter() {
            self.process_content_item(child);
        }
    }

    /// Emits the subject token of a definition and processes its body with
    /// `in_definition` set, restoring the previous value afterwards.
    fn process_definition(&mut self, definition: &Definition) {
        if let Some(header) = definition.header_location() {
            self.push_range(header, LexSemanticTokenKind::DefinitionSubject);
        }
        self.process_text_content(&definition.subject);
        self.process_annotations(definition.annotations());
        // Save and restore so nested definitions leave the flag as they found it.
        let was_in_definition = self.in_definition;
        self.in_definition = true;
        for child in definition.children.iter() {
            self.process_content_item(child);
        }
        self.in_definition = was_in_definition;
    }

    /// Emits tokens for each group of a verbatim block (subject and content
    /// lines), the closing data label and parameters, and the block's annotations.
    fn process_verbatim(&mut self, verbatim: &Verbatim) {
        for group in verbatim.group() {
            self.process_text_content(group.subject);
            if let Some(location) = &group.subject.location {
                self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
            }
            for child in group.children {
                if let ContentItem::VerbatimLine(line) = child {
                    self.push_range(&line.location, LexSemanticTokenKind::VerbatimContent);
                }
            }
        }
        self.push_range(
            &verbatim.closing_data.label.location,
            LexSemanticTokenKind::DataLabel,
        );
        for parameter in &verbatim.closing_data.parameters {
            self.push_range(&parameter.location, LexSemanticTokenKind::DataParameter);
        }
        self.process_annotations(verbatim.annotations());
    }

    /// Emits tokens for a table: its subject (tagged `VerbatimSubject`), the
    /// inline content and children of every cell, and the table's annotations.
    fn process_table(&mut self, table: &Table) {
        self.process_text_content(&table.subject);
        if let Some(location) = &table.subject.location {
            self.push_range(location, LexSemanticTokenKind::VerbatimSubject);
        }
        for row in table.all_rows() {
            for cell in &row.cells {
                self.process_text_content(&cell.content);
                for child in cell.children.iter() {
                    self.process_content_item(child);
                }
            }
        }
        self.process_annotations(table.annotations());
    }

    /// Emits the label and parameter tokens of an annotation and processes its
    /// children with `in_annotation` set, restoring the previous value afterwards.
    fn process_annotation(&mut self, annotation: &Annotation) {
        self.push_range(
            annotation.header_location(),
            LexSemanticTokenKind::AnnotationLabel,
        );
        for parameter in &annotation.data.parameters {
            self.push_range(
                &parameter.location,
                LexSemanticTokenKind::AnnotationParameter,
            );
        }
        // Save and restore so nested annotations leave the flag as they found it.
        let was_in_annotation = self.in_annotation;
        self.in_annotation = true;
        for child in annotation.children.iter() {
            self.process_content_item(child);
        }
        self.in_annotation = was_in_annotation;
    }

    /// Processes each annotation in `annotations` in order.
    fn process_annotations(&mut self, annotations: &[Annotation]) {
        for annotation in annotations {
            self.process_annotation(annotation);
        }
    }

    /// Walks the inline structure of `text` and emits inline tokens, passing
    /// the current annotation/definition context to the emitter.
    fn process_text_content(&mut self, text: &TextContent) {
        let mut emitter = InlineTokenEmitter {
            tokens: &mut self.tokens,
            in_annotation: self.in_annotation,
            in_definition: self.in_definition,
            in_formatted: 0,
        };
        walk_text_content_positions(text, &mut emitter);
    }
}
/// Inline visitor state used while walking one piece of text content.
struct InlineTokenEmitter<'a> {
// Token list borrowed from the collector; inline tokens are appended to it.
tokens: &'a mut Vec<LexSemanticToken>,
// Copy of the collector's flag: plain text is tagged `AnnotationContent` when set.
in_annotation: bool,
// Copy of the collector's flag: plain text is tagged `DefinitionContent` when set
// and `in_annotation` is not.
in_definition: bool,
// Number of strong/emphasis spans currently open; plain text is skipped while it is non-zero.
in_formatted: usize,
}
impl<'a> InlineTokenEmitter<'a> {
    /// Appends a token of `kind` covering `range`, ignoring ranges whose span is empty.
    fn push(&mut self, range: &Range, kind: LexSemanticTokenKind) {
        // Guard: an empty (or inverted) span produces no token.
        if range.span.start >= range.span.end {
            return;
        }
        let token = LexSemanticToken {
            kind,
            range: range.clone(),
        };
        self.tokens.push(token);
    }
}
impl<'a> InlinePositionVisitor for InlineTokenEmitter<'a> {
fn visit_plain(&mut self, range: &Range, _text: &str) {
if self.in_formatted > 0 {
return;
}
let kind = if self.in_annotation {
LexSemanticTokenKind::AnnotationContent
} else if self.in_definition {
LexSemanticTokenKind::DefinitionContent
} else {
return;
};
self.push(range, kind);
}
fn enter_strong(&mut self, open_marker: &Range) {
self.push(open_marker, LexSemanticTokenKind::InlineMarkerStrongStart);
self.in_formatted += 1;
}
fn leave_strong(&mut self, content: &Range, close_marker: &Range) {
self.in_formatted -= 1;
self.push(content, LexSemanticTokenKind::InlineStrong);
self.push(close_marker, LexSemanticTokenKind::InlineMarkerStrongEnd);
}
fn enter_emphasis(&mut self, open_marker: &Range) {
self.push(open_marker, LexSemanticTokenKind::InlineMarkerEmphasisStart);
self.in_formatted += 1;
}
fn leave_emphasis(&mut self, content: &Range, close_marker: &Range) {
self.in_formatted -= 1;
self.push(content, LexSemanticTokenKind::InlineEmphasis);
self.push(close_marker, LexSemanticTokenKind::InlineMarkerEmphasisEnd);
}
fn visit_code(
&mut self,
open_marker: &Range,
content: &Range,
close_marker: &Range,
_text: &str,
) {
self.push(open_marker, LexSemanticTokenKind::InlineMarkerCodeStart);
self.push(content, LexSemanticTokenKind::InlineCode);
self.push(close_marker, LexSemanticTokenKind::InlineMarkerCodeEnd);
}
fn visit_math(
&mut self,
open_marker: &Range,
content: &Range,
close_marker: &Range,
_text: &str,
) {
self.push(open_marker, LexSemanticTokenKind::InlineMarkerMathStart);
self.push(content, LexSemanticTokenKind::InlineMath);
self.push(close_marker, LexSemanticTokenKind::InlineMarkerMathEnd);
}
fn visit_reference(
&mut self,
open_marker: &Range,
content: &Range,
close_marker: &Range,
data: &ReferenceInline,
) {
self.push(open_marker, LexSemanticTokenKind::InlineMarkerRefStart);
let ref_kind = match &data.reference_type {
ReferenceType::Citation(_) => LexSemanticTokenKind::ReferenceCitation,
ReferenceType::FootnoteNumber { .. } => LexSemanticTokenKind::ReferenceFootnote,
ReferenceType::AnnotationReference { .. } => LexSemanticTokenKind::ReferenceAnnotation,
_ => LexSemanticTokenKind::Reference,
};
self.push(content, ref_kind);
self.push(close_marker, LexSemanticTokenKind::InlineMarkerRefEnd);
}
}
#[cfg(test)]
mod tests {
    use super::LexSemanticTokenKind as Kind;
    use super::*;
    use crate::test_support::{sample_document, sample_source};
    use lex_core::lex::testing::lexplore::Lexplore;

    /// Returns the source text covered by each token of `kind`, in token order.
    fn snippets(
        tokens: &[LexSemanticToken],
        kind: LexSemanticTokenKind,
        source: &str,
    ) -> Vec<String> {
        let mut found = Vec::new();
        for token in tokens {
            if token.kind == kind {
                found.push(source[token.range.span.clone()].to_string());
            }
        }
        found
    }

    /// Structural kinds (session, list, annotation, verbatim) appear for the sample document.
    #[test]
    fn collects_structural_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        let texts = |kind: Kind| snippets(&tokens, kind, source);

        let session_markers = texts(Kind::SessionMarker);
        assert!(session_markers.iter().any(|text| text.trim() == "1."));

        let session_titles = texts(Kind::SessionTitleText);
        assert!(session_titles.iter().any(|text| text.trim() == "Intro"));

        let verbatim_subjects = texts(Kind::VerbatimSubject);
        assert!(verbatim_subjects
            .iter()
            .any(|text| text.trim_end() == "Cache"));

        let markers = texts(Kind::ListMarker);
        assert_eq!(markers.len(), 4);
        for marker in &markers {
            let trimmed = marker.trim_start();
            assert!(trimmed.starts_with('-') || trimmed.chars().next().unwrap().is_numeric());
        }

        let annotation_labels = texts(Kind::AnnotationLabel);
        assert!(annotation_labels
            .iter()
            .any(|text| text.contains("doc.note")));

        let parameters = texts(Kind::AnnotationParameter);
        assert!(parameters
            .iter()
            .any(|text| text.contains("severity=info")));

        assert!(verbatim_subjects
            .iter()
            .any(|text| text.contains("CLI Example")));

        let data_labels = texts(Kind::DataLabel);
        assert!(data_labels.iter().any(|text| text.contains("shell")));
    }

    /// Strong, emphasis, code and math content tokens appear for the sample document.
    #[test]
    fn collects_inline_tokens() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        let texts = |kind: Kind| snippets(&tokens, kind, source);

        let strong = texts(Kind::InlineStrong);
        assert!(strong.iter().any(|text| text.contains("Lex")));

        let emphasis = texts(Kind::InlineEmphasis);
        assert!(emphasis.iter().any(|text| text.contains("format")));

        let code = texts(Kind::InlineCode);
        assert!(code.iter().any(|text| text.contains("code")));

        let math = texts(Kind::InlineMath);
        assert!(math.iter().any(|text| text.contains("math")));
    }

    /// Each reference flavour is tagged with its dedicated kind.
    #[test]
    fn classifies_references() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();
        let texts = |kind: Kind| snippets(&tokens, kind, source);

        let citations = texts(Kind::ReferenceCitation);
        assert!(citations.iter().any(|text| text.contains("@spec2025")));

        let annotation_refs = texts(Kind::ReferenceAnnotation);
        assert!(annotation_refs.iter().any(|text| text.contains("::source")));

        let footnotes = texts(Kind::ReferenceFootnote);
        assert!(footnotes.iter().any(|text| text.contains("1")));

        let generic = texts(Kind::Reference);
        assert!(generic.iter().any(|text| text.contains("Cache")));
    }

    /// The empty benchmark fixture yields no tokens at all.
    #[test]
    fn empty_document_has_no_tokens() {
        let parsed = Lexplore::benchmark(0).parse();
        let document = parsed.expect("failed to parse empty benchmark fixture");
        assert!(collect_semantic_tokens(&document).is_empty());
    }

    /// Plain text inside an annotation is tagged as `AnnotationContent`.
    #[test]
    fn emits_annotation_content_for_inline_annotation() {
        let document = sample_document();
        let tokens = collect_semantic_tokens(&document);
        let source = sample_source();

        let annotation_content = snippets(&tokens, Kind::AnnotationContent, source);
        let has_preface = annotation_content
            .iter()
            .any(|text| text.contains("Document preface"));
        assert!(
            has_preface,
            "AnnotationContent should be emitted for plain text inside annotations, got: {annotation_content:?}"
        );
    }

    /// Formatted text inside an annotation is tagged as inline content, not `AnnotationContent`.
    #[test]
    fn annotation_content_excludes_formatted_text() {
        let source = ":: note :: Some *bold* text.\n";
        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
        let tokens = collect_semantic_tokens(&document);

        let annotation_content = snippets(&tokens, Kind::AnnotationContent, source);
        let mentions = |needle: &str| annotation_content.iter().any(|text| text.contains(needle));
        assert!(
            mentions("Some"),
            "Plain text before formatting should be AnnotationContent"
        );
        assert!(
            mentions("text."),
            "Plain text after formatting should be AnnotationContent"
        );
        assert!(
            !mentions("bold"),
            "Formatted text should NOT be AnnotationContent"
        );

        let strong = snippets(&tokens, Kind::InlineStrong, source);
        assert!(strong.iter().any(|text| text == "bold"));
    }

    /// Inline formatting inside table cells produces inline tokens.
    #[test]
    fn table_cell_inline_formatting_gets_tokens() {
        let source = "Stats:\n | *Name* | `code` |\n | _test_ | #42# |\n";
        let document = lex_core::lex::parsing::parse_document(source).expect("failed to parse");
        let tokens = collect_semantic_tokens(&document);

        let strong = snippets(&tokens, Kind::InlineStrong, source);
        let has_strong = strong.iter().any(|text| text.contains("Name"));
        assert!(
            has_strong,
            "Expected InlineStrong for *Name* in table cell, got: {strong:?}"
        );

        let code = snippets(&tokens, Kind::InlineCode, source);
        let has_code = code.iter().any(|text| text.contains("code"));
        assert!(
            has_code,
            "Expected InlineCode for `code` in table cell, got: {code:?}"
        );

        let emphasis = snippets(&tokens, Kind::InlineEmphasis, source);
        let has_emphasis = emphasis.iter().any(|text| text.contains("test"));
        assert!(
            has_emphasis,
            "Expected InlineEmphasis for _test_ in table cell, got: {emphasis:?}"
        );

        let math = snippets(&tokens, Kind::InlineMath, source);
        let has_math = math.iter().any(|text| text.contains("42"));
        assert!(
            has_math,
            "Expected InlineMath for #42# in table cell, got: {math:?}"
        );
    }
}