tinymist_world/parser/semantic_tokens.rs

//! From <https://github.com/nvarner/typst-lsp/blob/cc7bad9bd9764bfea783f2fab415cb3061fd8bff/src/server/semantic_tokens/mod.rs>

use strum::IntoEnumIterator;
use typst::syntax::{ast, LinkedNode, Source, SyntaxKind};

use super::modifier_set::ModifierSet;
use super::typst_tokens::{Modifier, TokenType};

/// The legend advertised to the client, mapping semantic token type and
/// modifier indices to their names.
#[derive(serde::Deserialize, serde::Serialize, Debug, Clone)]
pub struct SemanticTokensLegend {
    #[serde(rename = "tokenTypes")]
    pub token_types: Vec<String>,
    #[serde(rename = "tokenModifiers")]
    pub token_modifiers: Vec<String>,
}

/// Builds the [`SemanticTokensLegend`] from every `TokenType` and `Modifier`
/// variant, in declaration order.
pub fn get_semantic_tokens_legend() -> SemanticTokensLegend {
    SemanticTokensLegend {
        token_types: TokenType::iter()
            .map(|e| {
                let e: &'static str = e.into();

                e.to_owned()
            })
            .collect(),
        token_modifiers: Modifier::iter()
            .map(|e| {
                let e: &'static str = e.into();

                e.to_owned()
            })
            .collect(),
    }
}
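
// When serialized, the legend takes the LSP wire shape, for example:
//   { "tokenTypes": ["comment", "string", ...], "tokenModifiers": ["strong", "emph", ...] }
// (the exact entries are whatever names the `TokenType` and `Modifier` enums
// define, in iteration order).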

/// The encoding used to translate byte offsets into LSP line/character positions.
#[derive(Debug, Clone, Copy)]
pub enum OffsetEncoding {
    Utf8,
    Utf16,
}

/// Tokenizes the whole `source` tree and converts the result into the LSP
/// delta encoding, using `encoding` for column positions.
pub fn get_semantic_tokens_full(source: &Source, encoding: OffsetEncoding) -> Vec<SemanticToken> {
    let root = LinkedNode::new(source.root());
    let mut full = tokenize_tree(&root, ModifierSet::empty());

    let mut init = (0, 0);
    for token in full.iter_mut() {
        // Resolve the absolute byte offset (temporarily packed into the two
        // delta fields by `SemanticToken::new`) into a (line, character) position.
        let offset = ((token.delta_line as u64) << 32) | token.delta_start_character as u64;
        let position = (match encoding {
            OffsetEncoding::Utf8 => offset_to_position_utf8,
            OffsetEncoding::Utf16 => offset_to_position_utf16,
        })(offset as usize, source);
        token.delta_line = position.0;
        token.delta_start_character = position.1;

        // Re-encode the absolute position as a delta relative to the previous token.
        let next = (token.delta_line, token.delta_start_character);
        token.delta_line -= init.0;
        if token.delta_line == 0 {
            token.delta_start_character -= init.1;
        }
        init = next;
    }

    full
}
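
// A minimal usage sketch (the markup text is just an illustration):
//
//     let source = Source::detached("= Heading\nSome *strong* text");
//     let tokens = get_semantic_tokens_full(&source, OffsetEncoding::Utf16);
//     // Each entry's line/character is relative to the previous token, matching
//     // the data layout expected by LSP `textDocument/semanticTokens/full`.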

/// Tokenizes a single node, falling back to `Text` for leaves that have no
/// more specific token type.
fn tokenize_single_node(node: &LinkedNode, modifiers: ModifierSet) -> Option<SemanticToken> {
    let is_leaf = node.children().next().is_none();

    token_from_node(node)
        .or_else(|| is_leaf.then_some(TokenType::Text))
        .map(|token_type| SemanticToken::new(token_type, modifiers, node))
}

/// Tokenize a node and its children
fn tokenize_tree(root: &LinkedNode<'_>, parent_modifiers: ModifierSet) -> Vec<SemanticToken> {
    let modifiers = parent_modifiers | modifiers_from_node(root);

    let token = tokenize_single_node(root, modifiers).into_iter();
    let children = root
        .children()
        .flat_map(move |child| tokenize_tree(&child, modifiers));
    token.chain(children).collect()
}
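
// For example, for the markup `*hi*` this yields three tokens: one Punctuation
// token per `*` and a Text token for `hi`, all carrying the Strong modifier
// inherited from the enclosing `Strong` node.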

/// A single semantic token in the LSP delta encoding. Right after
/// [`SemanticToken::new`], `delta_line` and `delta_start_character` temporarily
/// hold the two halves of the node's absolute byte offset; they are rewritten
/// into real deltas by [`get_semantic_tokens_full`].
#[derive(Debug, Clone, Copy)]
pub struct SemanticToken {
    pub delta_line: u32,
    pub delta_start_character: u32,
    pub length: u32,
    pub token_type: u32,
    pub token_modifiers: u32,
}

impl SemanticToken {
    fn new(token_type: TokenType, modifiers: ModifierSet, node: &LinkedNode) -> Self {
        let source = node.get().clone().into_text();

        // Stash the absolute byte offset across the two delta fields; it is
        // resolved to a position and delta-encoded in `get_semantic_tokens_full`.
        let raw_position = node.offset() as u64;
        let raw_position = ((raw_position >> 32) as u32, raw_position as u32);

        Self {
            token_type: token_type as u32,
            token_modifiers: modifiers.bitset(),
            delta_line: raw_position.0,
            delta_start_character: raw_position.1,
            // The token length is measured in UTF-16 code units.
            length: source.chars().map(char::len_utf16).sum::<usize>() as u32,
        }
    }
}

/// Determines the [`Modifier`]s to be applied to a node and all its children.
///
/// Note that this does not recurse up, so calling it on a child node may not
/// return a modifier that should be applied to it due to a parent.
fn modifiers_from_node(node: &LinkedNode) -> ModifierSet {
    match node.kind() {
        SyntaxKind::Emph => ModifierSet::new(&[Modifier::Emph]),
        SyntaxKind::Strong => ModifierSet::new(&[Modifier::Strong]),
        SyntaxKind::Math | SyntaxKind::Equation => ModifierSet::new(&[Modifier::Math]),
        _ => ModifierSet::empty(),
    }
}

/// Determines the best [`TokenType`] for an entire node and its children, if
/// any. If there is no single `TokenType`, or none better than `Text`, returns
/// `None`.
///
/// In tokenization, returning `Some` stops recursion, while returning `None`
/// continues and attempts to tokenize each of `node`'s children. If there are
/// no children, `Text` is taken as the default.
fn token_from_node(node: &LinkedNode) -> Option<TokenType> {
    use SyntaxKind::*;

    match node.kind() {
        Star if node.parent_kind() == Some(Strong) => Some(TokenType::Punctuation),
        Star if node.parent_kind() == Some(ModuleImport) => Some(TokenType::Operator),

        Underscore if node.parent_kind() == Some(Emph) => Some(TokenType::Punctuation),
        Underscore if node.parent_kind() == Some(MathAttach) => Some(TokenType::Operator),

        MathIdent | Ident => Some(token_from_ident(node)),
        Hash => token_from_hashtag(node),

        LeftBrace | RightBrace | LeftBracket | RightBracket | LeftParen | RightParen | Comma
        | Semicolon | Colon => Some(TokenType::Punctuation),
        Linebreak | Escape | Shorthand => Some(TokenType::Escape),
        Link => Some(TokenType::Link),
        Raw => Some(TokenType::Raw),
        Label => Some(TokenType::Label),
        RefMarker => Some(TokenType::Ref),
        Heading | HeadingMarker => Some(TokenType::Heading),
        ListMarker | EnumMarker | TermMarker => Some(TokenType::ListMarker),
        MathAlignPoint | Plus | Minus | Slash | Hat | Dot | Eq | EqEq | ExclEq | Lt | LtEq | Gt
        | GtEq | PlusEq | HyphEq | StarEq | SlashEq | Dots | Arrow | Not | And | Or => {
            Some(TokenType::Operator)
        }
        Dollar => Some(TokenType::Delimiter),
        None | Auto | Let | Show | If | Else | For | In | While | Break | Continue | Return
        | Import | Include | As | Set => Some(TokenType::Keyword),
        Bool => Some(TokenType::Bool),
        Int | Float | Numeric => Some(TokenType::Number),
        Str => Some(TokenType::String),
        LineComment | BlockComment => Some(TokenType::Comment),
        Error => Some(TokenType::Error),

        // Disambiguate from `SyntaxKind::None`
        _ => Option::None,
    }
}
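
// The guarded arms above need parent context to disambiguate, for example:
//   `*strong*`            -> the `*` is markup Punctuation (parent is `Strong`)
//   `#import "m.typ": *`  -> the `*` is a wildcard Operator (parent is `ModuleImport`)
//   `_emph_`              -> the `_` is markup Punctuation (parent is `Emph`)
//   `$ x_1 $`             -> the `_` is a subscript Operator (parent is `MathAttach`)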

// TODO: differentiate also using tokens in scope, not just context
/// Heuristically decides whether `ident` names a function by looking at the
/// leaf that follows it.
fn is_function_ident(ident: &LinkedNode) -> bool {
    let Some(next) = ident.next_leaf() else {
        return false;
    };
    let function_call = matches!(next.kind(), SyntaxKind::LeftParen)
        && matches!(
            next.parent_kind(),
            Some(SyntaxKind::Args | SyntaxKind::Params)
        );
    let function_content = matches!(next.kind(), SyntaxKind::LeftBracket)
        && matches!(next.parent_kind(), Some(SyntaxKind::ContentBlock));
    function_call || function_content
}

fn token_from_ident(ident: &LinkedNode) -> TokenType {
    if is_function_ident(ident) {
        TokenType::Function
    } else {
        TokenType::Interpolated
    }
}
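
// For example, `foo` in `#foo(1)` or `#foo[body]` is classified as Function
// (the next leaf is a `(` inside `Args`/`Params` or a `[` inside `ContentBlock`),
// while a bare `#foo` falls back to Interpolated.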

/// Finds the leftmost leaf of the hash-prefixed expression following `hashtag`,
/// if any.
fn get_expr_following_hashtag<'a>(hashtag: &LinkedNode<'a>) -> Option<LinkedNode<'a>> {
    hashtag
        .next_sibling()
        .filter(|next| next.cast::<ast::Expr>().is_some_and(|expr| expr.hash()))
        .and_then(|node| node.leftmost_leaf())
}

/// Assigns the `#` the token type of the expression it introduces.
fn token_from_hashtag(hashtag: &LinkedNode) -> Option<TokenType> {
    get_expr_following_hashtag(hashtag)
        .as_ref()
        .and_then(token_from_node)
}
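
// So the `#` inherits the type of what follows it, for example:
//   `#if cond [..]` -> the `#` is a Keyword (leftmost leaf of the expression is `if`)
//   `#foo(1)`       -> the `#` is a Function (leftmost leaf is the ident `foo`)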

fn offset_to_position_utf8(typst_offset: usize, typst_source: &Source) -> (u32, u32) {
    let line_index = typst_source.byte_to_line(typst_offset).unwrap();
    let column_index = typst_source.byte_to_column(typst_offset).unwrap();

    (line_index as u32, column_index as u32)
}

fn offset_to_position_utf16(typst_offset: usize, typst_source: &Source) -> (u32, u32) {
    let line_index = typst_source.byte_to_line(typst_offset).unwrap();

    let lsp_line = line_index as u32;

    // See the implementation of `lsp_to_typst::position_to_offset` for discussion
    // relevant to this function.

    // TODO: Typst's `Source` could easily provide an implementation of the method
    // we need here. Submit a PR to `typst` to add it, then update this if/when
    // merged.

    let utf16_offset = typst_source.byte_to_utf16(typst_offset).unwrap();

    let byte_line_offset = typst_source.line_to_byte(line_index).unwrap();
    let utf16_line_offset = typst_source.byte_to_utf16(byte_line_offset).unwrap();

    let utf16_column_offset = utf16_offset - utf16_line_offset;
    let lsp_column = utf16_column_offset;

    (lsp_line, lsp_column as u32)
}
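
// For example, in the line `α = 1` the `=` sits at byte offset 3 (the `α` takes
// two UTF-8 bytes) but at UTF-16 column 2, since `α` is a single UTF-16 code unit.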