// tinymist_world/parser/semantic_tokens.rs

use strum::IntoEnumIterator;

use typst::syntax::{ast, LinkedNode, Source, SyntaxKind};

use super::modifier_set::ModifierSet;
use super::typst_tokens::{Modifier, TokenType};
9#[derive(serde::Deserialize, serde::Serialize, Debug, Clone)]
10pub struct SemanticTokensLegend {
11 #[serde(rename = "tokenTypes")]
12 pub token_types: Vec<String>,
13 #[serde(rename = "tokenModifiers")]
14 pub token_modifiers: Vec<String>,
15}
16
17pub fn get_semantic_tokens_legend() -> SemanticTokensLegend {
18 SemanticTokensLegend {
19 token_types: TokenType::iter()
20 .map(|e| {
21 let e: &'static str = e.into();
22
23 e.to_owned()
24 })
25 .collect(),
26 token_modifiers: Modifier::iter()
27 .map(|e| {
28 let e: &'static str = e.into();
29
30 e.to_owned()
31 })
32 .collect(),
33 }
34}
35
/// The offset encoding used when converting byte offsets to LSP positions.
#[derive(Debug, Clone, Copy)]
pub enum OffsetEncoding {
    /// Columns measured via `Source::byte_to_column`.
    Utf8,
    /// Columns measured in UTF-16 code units (the LSP default).
    Utf16,
}
41
/// Produces the full semantic-token stream for `source`, delta-encoded the
/// way LSP expects (each token's line/character is relative to the previous
/// token).
///
/// Implementation note: `tokenize_tree` initially stores each token's
/// *absolute byte offset* packed into `delta_line` (high 32 bits) and
/// `delta_start_character` (low 32 bits) — see [`SemanticToken::new`]. This
/// pass unpacks that offset, converts it to a (line, column) position in the
/// requested `encoding`, and rewrites the fields in place as deltas.
pub fn get_semantic_tokens_full(source: &Source, encoding: OffsetEncoding) -> Vec<SemanticToken> {
    let root = LinkedNode::new(source.root());
    let mut full = tokenize_tree(&root, ModifierSet::empty());

    // (line, character) of the previous token, for delta encoding.
    let mut init = (0, 0);
    for token in full.iter_mut() {
        // Reassemble the packed absolute byte offset.
        let offset = ((token.delta_line as u64) << 32) | token.delta_start_character as u64;
        let position = (match encoding {
            OffsetEncoding::Utf8 => offset_to_position_utf8,
            OffsetEncoding::Utf16 => offset_to_position_utf16,
        })(offset as usize, source);
        token.delta_line = position.0;
        token.delta_start_character = position.1;

        // Delta-encode: the character delta is relative to the previous token
        // only when both tokens are on the same line (per the LSP spec).
        let next = (token.delta_line, token.delta_start_character);
        token.delta_line -= init.0;
        if token.delta_line == 0 {
            token.delta_start_character -= init.1;
        }
        init = next;
    }

    full
}
67
68fn tokenize_single_node(node: &LinkedNode, modifiers: ModifierSet) -> Option<SemanticToken> {
69 let is_leaf = node.children().next().is_none();
70
71 token_from_node(node)
72 .or_else(|| is_leaf.then_some(TokenType::Text))
73 .map(|token_type| SemanticToken::new(token_type, modifiers, node))
74}
75
76fn tokenize_tree(root: &LinkedNode<'_>, parent_modifiers: ModifierSet) -> Vec<SemanticToken> {
78 let modifiers = parent_modifiers | modifiers_from_node(root);
79
80 let token = tokenize_single_node(root, modifiers).into_iter();
81 let children = root
82 .children()
83 .flat_map(move |child| tokenize_tree(&child, modifiers));
84 token.chain(children).collect()
85}
86
/// One LSP semantic token in delta-encoded form.
///
/// Immediately after `tokenize_tree`, `delta_line`/`delta_start_character`
/// temporarily hold a packed absolute byte offset; `get_semantic_tokens_full`
/// rewrites them into genuine deltas before returning.
#[derive(Debug, Clone, Copy)]
pub struct SemanticToken {
    /// Line delta relative to the previous token.
    pub delta_line: u32,
    /// Start-character delta (line-relative when `delta_line` is zero).
    pub delta_start_character: u32,
    /// Token length in UTF-16 code units.
    pub length: u32,
    /// Index into the legend's token types.
    pub token_type: u32,
    /// Bitset over the legend's token modifiers.
    pub token_modifiers: u32,
}
95
impl SemanticToken {
    /// Creates a token for `node`, stashing the node's absolute byte offset
    /// in the `delta_*` fields (high/low 32 bits). The caller is expected to
    /// convert that packed offset into a real position delta afterwards
    /// (see `get_semantic_tokens_full`).
    fn new(token_type: TokenType, modifiers: ModifierSet, node: &LinkedNode) -> Self {
        // Full text of the node including all of its descendants.
        let source = node.get().clone().into_text();

        let raw_position = node.offset() as u64;
        let raw_position = ((raw_position >> 32) as u32, raw_position as u32);

        Self {
            token_type: token_type as u32,
            token_modifiers: modifiers.bitset(),
            delta_line: raw_position.0,
            delta_start_character: raw_position.1,
            // NOTE(review): the length is always measured in UTF-16 code
            // units, even when the caller later requests UTF-8 positions —
            // confirm this matches the client's negotiated encoding.
            length: source.chars().map(char::len_utf16).sum::<usize>() as u32,
        }
    }
}
112
113fn modifiers_from_node(node: &LinkedNode) -> ModifierSet {
118 match node.kind() {
119 SyntaxKind::Emph => ModifierSet::new(&[Modifier::Emph]),
120 SyntaxKind::Strong => ModifierSet::new(&[Modifier::Strong]),
121 SyntaxKind::Math | SyntaxKind::Equation => ModifierSet::new(&[Modifier::Math]),
122 _ => ModifierSet::empty(),
123 }
124}
125
/// Determines the token type for `node` from its syntax kind (and, for a few
/// ambiguous kinds, its parent). Returns `None` when the node has no single
/// type better than plain text.
fn token_from_node(node: &LinkedNode) -> Option<TokenType> {
    use SyntaxKind::*;

    match node.kind() {
        // `*` is markup punctuation inside strong text, but the wildcard
        // operator inside an import list.
        Star if node.parent_kind() == Some(Strong) => Some(TokenType::Punctuation),
        Star if node.parent_kind() == Some(ModuleImport) => Some(TokenType::Operator),

        // `_` is markup punctuation inside emphasis, but the subscript
        // operator inside math attachments.
        Underscore if node.parent_kind() == Some(Emph) => Some(TokenType::Punctuation),
        Underscore if node.parent_kind() == Some(MathAttach) => Some(TokenType::Operator),

        // Identifiers need surrounding context to distinguish function calls
        // from interpolated variables.
        MathIdent | Ident => Some(token_from_ident(node)),
        // `#` inherits the type of the expression it introduces.
        Hash => token_from_hashtag(node),

        // Delimiters and punctuation.
        LeftBrace | RightBrace | LeftBracket | RightBracket | LeftParen | RightParen | Comma
        | Semicolon | Colon => Some(TokenType::Punctuation),
        Linebreak | Escape | Shorthand => Some(TokenType::Escape),
        Link => Some(TokenType::Link),
        Raw => Some(TokenType::Raw),
        Label => Some(TokenType::Label),
        RefMarker => Some(TokenType::Ref),
        Heading | HeadingMarker => Some(TokenType::Heading),
        ListMarker | EnumMarker | TermMarker => Some(TokenType::ListMarker),
        // Unconditional operators.
        MathAlignPoint | Plus | Minus | Slash | Hat | Dot | Eq | EqEq | ExclEq | Lt | LtEq | Gt
        | GtEq | PlusEq | HyphEq | StarEq | SlashEq | Dots | Arrow | Not | And | Or => {
            Some(TokenType::Operator)
        }
        Dollar => Some(TokenType::Delimiter),
        // Keywords.
        None | Auto | Let | Show | If | Else | For | In | While | Break | Continue | Return
        | Import | Include | As | Set => Some(TokenType::Keyword),
        // Literals.
        Bool => Some(TokenType::Bool),
        Int | Float | Numeric => Some(TokenType::Number),
        Str => Some(TokenType::String),
        LineComment | BlockComment => Some(TokenType::Comment),
        Error => Some(TokenType::Error),

        // Everything else has no dedicated token type. (`Option::None` is
        // spelled out because `SyntaxKind::None` shadows `None` here.)
        _ => Option::None,
    }
}
172
173fn is_function_ident(ident: &LinkedNode) -> bool {
175 let Some(next) = ident.next_leaf() else {
176 return false;
177 };
178 let function_call = matches!(next.kind(), SyntaxKind::LeftParen)
179 && matches!(
180 next.parent_kind(),
181 Some(SyntaxKind::Args | SyntaxKind::Params)
182 );
183 let function_content = matches!(next.kind(), SyntaxKind::LeftBracket)
184 && matches!(next.parent_kind(), Some(SyntaxKind::ContentBlock));
185 function_call || function_content
186}
187
188fn token_from_ident(ident: &LinkedNode) -> TokenType {
189 if is_function_ident(ident) {
190 TokenType::Function
191 } else {
192 TokenType::Interpolated
193 }
194}
195
196fn get_expr_following_hashtag<'a>(hashtag: &LinkedNode<'a>) -> Option<LinkedNode<'a>> {
197 hashtag
198 .next_sibling()
199 .filter(|next| next.cast::<ast::Expr>().is_some_and(|expr| expr.hash()))
200 .and_then(|node| node.leftmost_leaf())
201}
202
203fn token_from_hashtag(hashtag: &LinkedNode) -> Option<TokenType> {
204 get_expr_following_hashtag(hashtag)
205 .as_ref()
206 .and_then(token_from_node)
207}
208
209fn offset_to_position_utf8(typst_offset: usize, typst_source: &Source) -> (u32, u32) {
210 let line_index = typst_source.byte_to_line(typst_offset).unwrap();
211 let column_index = typst_source.byte_to_column(typst_offset).unwrap();
212
213 (line_index as u32, column_index as u32)
214}
215
216fn offset_to_position_utf16(typst_offset: usize, typst_source: &Source) -> (u32, u32) {
217 let line_index = typst_source.byte_to_line(typst_offset).unwrap();
218
219 let lsp_line = line_index as u32;
220
221 let utf16_offset = typst_source.byte_to_utf16(typst_offset).unwrap();
229
230 let byte_line_offset = typst_source.line_to_byte(line_index).unwrap();
231 let utf16_line_offset = typst_source.byte_to_utf16(byte_line_offset).unwrap();
232
233 let utf16_column_offset = utf16_offset - utf16_line_offset;
234 let lsp_column = utf16_column_offset;
235
236 (lsp_line, lsp_column as u32)
237}