ane/commands/syntax_engine/
tree_sitter_parse.rs1use crate::data::lsp::types::{Language, SemanticToken};
2
// Test-only, per-thread counter. `parse` increments it on every call in test
// builds, so tests can check how many times the parser ran on their thread.
#[cfg(test)]
thread_local! {
    // Starts at zero on each thread.
    pub(crate) static PARSE_COUNT: std::cell::Cell<usize> = const { std::cell::Cell::new(0) };
}
7
8pub fn parse(lang: Language, content: &str) -> Vec<SemanticToken> {
9 #[cfg(test)]
10 PARSE_COUNT.with(|c| c.set(c.get() + 1));
11 let result = match lang {
12 Language::Rust => parse_with(&tree_sitter_rust::LANGUAGE.into(), content, rust_node_type),
13 Language::Go => parse_with(&tree_sitter_go::LANGUAGE.into(), content, go_node_type),
14 Language::TypeScript => parse_with(
15 &tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
16 content,
17 ts_node_type,
18 ),
19 Language::Python => parse_with(
20 &tree_sitter_python::LANGUAGE.into(),
21 content,
22 python_node_type,
23 ),
24 Language::Markdown => Some(parse_markdown(content)),
25 };
26 result.unwrap_or_default()
27}
28
29fn parse_with(
30 language: &tree_sitter::Language,
31 content: &str,
32 map_fn: fn(&str) -> Option<&'static str>,
33) -> Option<Vec<SemanticToken>> {
34 let mut parser = tree_sitter::Parser::new();
35 parser.set_language(language).ok()?;
36 let tree = parser.parse(content, None)?;
37 let root = tree.root_node();
38
39 let mut tokens = Vec::new();
40 let mut cursor = root.walk();
41 walk_tree(&mut cursor, content, map_fn, &mut tokens);
42 tokens.sort_by_key(|t| (t.line, t.start_col));
43 Some(tokens)
44}
45
46fn walk_tree(
47 cursor: &mut tree_sitter::TreeCursor,
48 content: &str,
49 map_fn: fn(&str) -> Option<&'static str>,
50 tokens: &mut Vec<SemanticToken>,
51) {
52 loop {
53 let node = cursor.node();
54 let kind = node.kind();
55
56 if let Some(token_type) = map_fn(kind) {
57 if node.child_count() == 0 || is_leaf_like(kind) {
58 emit_tokens_for_node(&node, content, token_type, tokens);
59 }
60 }
61
62 if !is_leaf_like(kind) && cursor.goto_first_child() {
63 walk_tree(cursor, content, map_fn, tokens);
64 cursor.goto_parent();
65 }
66
67 if !cursor.goto_next_sibling() {
68 break;
69 }
70 }
71}
72
/// Reports whether a node of this kind is treated as a single token.
///
/// These are the string, comment, heading and inline-markup kinds that
/// `walk_tree` emits as a whole and does not descend into.
fn is_leaf_like(kind: &str) -> bool {
    const LEAF_LIKE_KINDS: [&str; 24] = [
        // String and character literals
        "string_literal",
        "raw_string_literal",
        "char_literal",
        "interpreted_string_literal",
        "rune_literal",
        "string",
        "template_string",
        "concatenated_string",
        // Comments
        "line_comment",
        "block_comment",
        "comment",
        // Markdown headings
        "atx_heading",
        "setext_heading",
        // Markdown inline constructs
        "code_span",
        "emphasis",
        "strong_emphasis",
        "inline_link",
        "full_reference_link",
        "collapsed_reference_link",
        "shortcut_link",
        "uri_autolink",
        "email_autolink",
        "image",
        "strikethrough",
    ];

    LEAF_LIKE_KINDS.iter().any(|candidate| *candidate == kind)
}
102
103fn emit_tokens_for_node(
104 node: &tree_sitter::Node,
105 content: &str,
106 token_type: &'static str,
107 tokens: &mut Vec<SemanticToken>,
108) {
109 let start_line = node.start_position().row;
110 let end_line = node.end_position().row;
111
112 if start_line == end_line {
113 let start_col = byte_to_char_col(content, start_line, node.start_position().column);
114 let end_col = byte_to_char_col(content, end_line, node.end_position().column);
115 if end_col > start_col {
116 tokens.push(SemanticToken {
117 line: start_line,
118 start_col,
119 length: end_col - start_col,
120 token_type: token_type.to_string(),
121 });
122 }
123 } else {
124 let lines: Vec<&str> = content.lines().collect();
125 for line_num in start_line..=end_line {
126 if let Some(line_text) = lines.get(line_num) {
127 let char_count = line_text.chars().count();
128 let (start_col, end_col) = if line_num == start_line {
129 let sc = byte_to_char_col(content, line_num, node.start_position().column);
130 (sc, char_count)
131 } else if line_num == end_line {
132 let ec = byte_to_char_col(content, line_num, node.end_position().column);
133 (0, ec)
134 } else {
135 (0, char_count)
136 };
137 if end_col > start_col {
138 tokens.push(SemanticToken {
139 line: line_num,
140 start_col,
141 length: end_col - start_col,
142 token_type: token_type.to_string(),
143 });
144 }
145 }
146 }
147 }
148}
149
/// Converts a byte offset within line `line_num` of `content` into a character
/// column (the number of `char`s that precede the offset).
///
/// * `line_num` - zero-based index into `content.lines()`.
/// * `byte_col` - byte offset from the start of that line.
///
/// Offsets past the end of the line are clamped to the line length. An offset
/// that falls inside a multi-byte character is rounded down to the start of
/// that character rather than panicking on the slice. Returns 0 when the line
/// does not exist.
fn byte_to_char_col(content: &str, line_num: usize, byte_col: usize) -> usize {
    content
        .lines()
        .nth(line_num)
        .map(|line| {
            let mut prefix_end = byte_col.min(line.len());
            // Index 0 is always a char boundary, so this loop terminates.
            while !line.is_char_boundary(prefix_end) {
                prefix_end -= 1;
            }
            line[..prefix_end].chars().count()
        })
        .unwrap_or(0)
}
160
161fn parse_markdown(content: &str) -> Vec<SemanticToken> {
162 let mut tokens = Vec::new();
163
164 if let Some(block_tokens) = parse_with(&tree_sitter_md::LANGUAGE.into(), content, md_node_type)
166 {
167 tokens.extend(block_tokens);
168 }
169
170 if let Some(inline_tokens) = parse_with(
172 &tree_sitter_md::INLINE_LANGUAGE.into(),
173 content,
174 md_inline_node_type,
175 ) {
176 tokens.extend(inline_tokens);
177 }
178
179 tokens.sort_by_key(|t| (t.line, t.start_col));
180 tokens
181}
182
/// Maps a Rust grammar node kind to a token type name, or `None` when the kind
/// is not highlighted.
fn rust_node_type(kind: &str) -> Option<&'static str> {
    const KEYWORDS: &[&str] = &[
        "use", "let", "mut", "const", "static", "fn", "pub", "mod", "struct", "enum", "impl",
        "trait", "type", "where", "for", "in", "loop", "while", "if", "else", "match", "return",
        "break", "continue", "as", "ref", "self", "super", "crate", "async", "await", "move",
        "unsafe", "extern", "dyn", "true", "false",
    ];

    if KEYWORDS.contains(&kind) {
        return Some("keyword");
    }

    // Everything not listed here (identifiers, function items, macro
    // invocations, `!`, ...) is left unhighlighted.
    let label = match kind {
        "type_identifier" | "primitive_type" => "type",
        "string_literal" | "raw_string_literal" | "char_literal" => "string",
        "integer_literal" | "float_literal" => "number",
        "line_comment" | "block_comment" => "comment",
        "attribute_item" | "inner_attribute_item" => "macro",
        _ => return None,
    };
    Some(label)
}
205
/// Maps a Go grammar node kind to a token type name, or `None` when the kind is
/// not highlighted (plain identifiers included).
fn go_node_type(kind: &str) -> Option<&'static str> {
    // Each entry pairs a token type with the node kinds that produce it.
    const CATEGORIES: &[(&str, &[&str])] = &[
        (
            "keyword",
            &[
                "package", "import", "func", "return", "var", "const", "type", "struct",
                "interface", "map", "chan", "go", "defer", "if", "else", "for", "range",
                "switch", "case", "default", "select", "break", "continue", "fallthrough",
                "goto", "nil", "true", "false",
            ],
        ),
        ("type", &["type_identifier"]),
        ("property", &["field_identifier"]),
        (
            "string",
            &["interpreted_string_literal", "raw_string_literal", "rune_literal"],
        ),
        ("number", &["int_literal", "float_literal", "imaginary_literal"]),
        ("comment", &["comment"]),
    ];

    CATEGORIES
        .iter()
        .find(|(_, kinds)| kinds.contains(&kind))
        .map(|(label, _)| *label)
}
221
/// Maps a TypeScript grammar node kind to a token type name, or `None` when the
/// kind is not highlighted (plain identifiers included).
fn ts_node_type(kind: &str) -> Option<&'static str> {
    let is_keyword = matches!(
        kind,
        "import"
            | "export"
            | "from"
            | "const"
            | "let"
            | "var"
            | "function"
            | "return"
            | "if"
            | "else"
            | "for"
            | "while"
            | "do"
            | "switch"
            | "case"
            | "break"
            | "continue"
            | "class"
            | "extends"
            | "implements"
            | "new"
            | "this"
            | "super"
            | "typeof"
            | "instanceof"
            | "in"
            | "of"
            | "async"
            | "await"
            | "yield"
            | "throw"
            | "try"
            | "catch"
            | "finally"
            | "default"
            | "void"
            | "delete"
            | "true"
            | "false"
            | "null"
            | "undefined"
            | "type"
            | "interface"
            | "enum"
            | "namespace"
            | "declare"
            | "as"
            | "readonly"
            | "abstract"
            | "static"
            | "private"
            | "protected"
            | "public"
            | "keyof"
            | "infer"
            | "satisfies"
    );
    if is_keyword {
        return Some("keyword");
    }
    if matches!(kind, "type_identifier" | "predefined_type") {
        return Some("type");
    }
    if kind == "property_identifier" {
        return Some("property");
    }
    if matches!(kind, "string" | "template_string") {
        return Some("string");
    }
    // Regex literals share the "number" token type with numeric literals.
    if matches!(kind, "number" | "regex") {
        return Some("number");
    }
    if kind == "comment" {
        return Some("comment");
    }
    None
}
242
/// Maps a Python grammar node kind to a token type name, or `None` when the
/// kind is not highlighted (plain identifiers included).
fn python_node_type(kind: &str) -> Option<&'static str> {
    const KEYWORDS: &[&str] = &[
        "import", "from", "def", "class", "return", "if", "elif", "else", "for", "while",
        "break", "continue", "pass", "raise", "try", "except", "finally", "with", "as",
        "lambda", "yield", "global", "nonlocal", "assert", "del", "and", "or", "not", "is",
        "in", "True", "False", "None", "async", "await",
    ];

    if KEYWORDS.iter().any(|keyword| *keyword == kind) {
        Some("keyword")
    } else if kind == "type" {
        Some("type")
    } else if kind == "string" || kind == "concatenated_string" {
        Some("string")
    } else if kind == "integer" || kind == "float" {
        Some("number")
    } else if kind == "comment" {
        Some("comment")
    } else if kind == "decorator" {
        Some("macro")
    } else {
        None
    }
}
260
/// Maps a Markdown block-grammar node kind to a token type name, or `None` when
/// the kind is not highlighted.
fn md_node_type(kind: &str) -> Option<&'static str> {
    let is_heading = matches!(
        kind,
        "atx_heading"
            | "setext_heading"
            | "atx_h1_marker"
            | "atx_h2_marker"
            | "atx_h3_marker"
            | "atx_h4_marker"
            | "atx_h5_marker"
            | "atx_h6_marker"
    );
    let is_code = matches!(
        kind,
        "fenced_code_block" | "indented_code_block" | "code_fence_content" | "info_string"
    );
    let is_quote = matches!(kind, "block_quote" | "block_quote_marker");
    let is_list_marker = matches!(
        kind,
        "list_marker_dot"
            | "list_marker_minus"
            | "list_marker_star"
            | "list_marker_plus"
            | "list_marker_parenthesis"
    );

    if is_heading {
        Some("heading")
    } else if is_code {
        Some("code")
    } else if is_quote {
        Some("quote")
    } else if is_list_marker {
        Some("list_marker")
    } else if kind == "thematic_break" {
        Some("punctuation")
    } else {
        None
    }
}
278
/// Maps a Markdown inline-grammar node kind to a token type name, or `None`
/// when the kind is not highlighted.
fn md_inline_node_type(kind: &str) -> Option<&'static str> {
    // All link-like constructs, images included, share the "link" token type.
    const LINK_KINDS: [&str; 7] = [
        "inline_link",
        "full_reference_link",
        "collapsed_reference_link",
        "shortcut_link",
        "uri_autolink",
        "email_autolink",
        "image",
    ];

    if LINK_KINDS.iter().any(|link| *link == kind) {
        return Some("link");
    }

    let label = match kind {
        "code_span" => "code",
        "emphasis" => "emphasis",
        "strong_emphasis" => "strong",
        "strikethrough" => "punctuation",
        _ => return None,
    };
    Some(label)
}