Skip to main content

ane/commands/syntax_engine/
tree_sitter_parse.rs

1use crate::data::lsp::types::{Language, SemanticToken};
2
#[cfg(test)]
thread_local! {
    /// Per-thread tally of `parse` invocations.
    ///
    /// Compiled only into test builds; starts at zero on every thread.
    pub(crate) static PARSE_COUNT: std::cell::Cell<usize> =
        const { std::cell::Cell::new(0) };
}
7
8pub fn parse(lang: Language, content: &str) -> Vec<SemanticToken> {
9    #[cfg(test)]
10    PARSE_COUNT.with(|c| c.set(c.get() + 1));
11    let result = match lang {
12        Language::Rust => parse_with(&tree_sitter_rust::LANGUAGE.into(), content, rust_node_type),
13        Language::Go => parse_with(&tree_sitter_go::LANGUAGE.into(), content, go_node_type),
14        Language::TypeScript => parse_with(
15            &tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
16            content,
17            ts_node_type,
18        ),
19        Language::Python => parse_with(
20            &tree_sitter_python::LANGUAGE.into(),
21            content,
22            python_node_type,
23        ),
24        Language::Markdown => Some(parse_markdown(content)),
25    };
26    result.unwrap_or_default()
27}
28
29fn parse_with(
30    language: &tree_sitter::Language,
31    content: &str,
32    map_fn: fn(&str) -> Option<&'static str>,
33) -> Option<Vec<SemanticToken>> {
34    let mut parser = tree_sitter::Parser::new();
35    parser.set_language(language).ok()?;
36    let tree = parser.parse(content, None)?;
37    let root = tree.root_node();
38
39    let mut tokens = Vec::new();
40    let mut cursor = root.walk();
41    walk_tree(&mut cursor, content, map_fn, &mut tokens);
42    tokens.sort_by_key(|t| (t.line, t.start_col));
43    Some(tokens)
44}
45
46fn walk_tree(
47    cursor: &mut tree_sitter::TreeCursor,
48    content: &str,
49    map_fn: fn(&str) -> Option<&'static str>,
50    tokens: &mut Vec<SemanticToken>,
51) {
52    loop {
53        let node = cursor.node();
54        let kind = node.kind();
55
56        if let Some(token_type) = map_fn(kind) {
57            if node.child_count() == 0 || is_leaf_like(kind) {
58                emit_tokens_for_node(&node, content, token_type, tokens);
59            }
60        }
61
62        if !is_leaf_like(kind) && cursor.goto_first_child() {
63            walk_tree(cursor, content, map_fn, tokens);
64            cursor.goto_parent();
65        }
66
67        if !cursor.goto_next_sibling() {
68            break;
69        }
70    }
71}
72
/// Reports whether nodes of `kind` are treated as a single opaque span.
///
/// The set covers string/char/comment literals of the supported languages and the
/// Markdown heading, emphasis, code-span, link, image and strikethrough kinds.
fn is_leaf_like(kind: &str) -> bool {
    const LEAF_LIKE_KINDS: &[&str] = &[
        // String, character and comment literals.
        "string_literal",
        "raw_string_literal",
        "char_literal",
        "line_comment",
        "block_comment",
        "comment",
        "interpreted_string_literal",
        "rune_literal",
        "string",
        "template_string",
        "concatenated_string",
        // Markdown block and inline constructs.
        "atx_heading",
        "setext_heading",
        "code_span",
        "emphasis",
        "strong_emphasis",
        "inline_link",
        "full_reference_link",
        "collapsed_reference_link",
        "shortcut_link",
        "uri_autolink",
        "email_autolink",
        "image",
        "strikethrough",
    ];

    LEAF_LIKE_KINDS.contains(&kind)
}
102
103fn emit_tokens_for_node(
104    node: &tree_sitter::Node,
105    content: &str,
106    token_type: &'static str,
107    tokens: &mut Vec<SemanticToken>,
108) {
109    let start_line = node.start_position().row;
110    let end_line = node.end_position().row;
111
112    if start_line == end_line {
113        let start_col = byte_to_char_col(content, start_line, node.start_position().column);
114        let end_col = byte_to_char_col(content, end_line, node.end_position().column);
115        if end_col > start_col {
116            tokens.push(SemanticToken {
117                line: start_line,
118                start_col,
119                length: end_col - start_col,
120                token_type: token_type.to_string(),
121            });
122        }
123    } else {
124        let lines: Vec<&str> = content.lines().collect();
125        for line_num in start_line..=end_line {
126            if let Some(line_text) = lines.get(line_num) {
127                let char_count = line_text.chars().count();
128                let (start_col, end_col) = if line_num == start_line {
129                    let sc = byte_to_char_col(content, line_num, node.start_position().column);
130                    (sc, char_count)
131                } else if line_num == end_line {
132                    let ec = byte_to_char_col(content, line_num, node.end_position().column);
133                    (0, ec)
134                } else {
135                    (0, char_count)
136                };
137                if end_col > start_col {
138                    tokens.push(SemanticToken {
139                        line: line_num,
140                        start_col,
141                        length: end_col - start_col,
142                        token_type: token_type.to_string(),
143                    });
144                }
145            }
146        }
147    }
148}
149
/// Converts a byte column on line `line_num` of `content` into a character column.
///
/// Lines are those produced by `str::lines`. The result is the number of characters that
/// lie entirely before `byte_col` on that line:
///
/// * a `byte_col` past the end of the line yields the line's full character count;
/// * a `byte_col` that falls inside a multi-byte character is rounded down to the start
///   of that character instead of panicking on an invalid slice boundary;
/// * a `line_num` beyond the last line yields `0`.
fn byte_to_char_col(content: &str, line_num: usize, byte_col: usize) -> usize {
    match content.lines().nth(line_num) {
        Some(line) => line
            .char_indices()
            // Keep only characters whose last byte ends at or before the requested offset.
            .take_while(|&(start, ch)| start + ch.len_utf8() <= byte_col)
            .count(),
        None => 0,
    }
}
160
161fn parse_markdown(content: &str) -> Vec<SemanticToken> {
162    let mut tokens = Vec::new();
163
164    // Phase 1: Block-level parsing (headings, code blocks, quotes, list markers)
165    if let Some(block_tokens) = parse_with(&tree_sitter_md::LANGUAGE.into(), content, md_node_type)
166    {
167        tokens.extend(block_tokens);
168    }
169
170    // Phase 2: Inline-level parsing (emphasis, strong, code spans, links)
171    if let Some(inline_tokens) = parse_with(
172        &tree_sitter_md::INLINE_LANGUAGE.into(),
173        content,
174        md_inline_node_type,
175    ) {
176        tokens.extend(inline_tokens);
177    }
178
179    tokens.sort_by_key(|t| (t.line, t.start_col));
180    tokens
181}
182
183// --- Language-specific node type mappings ---
184
/// Maps a Rust grammar node kind to a highlight token type.
///
/// Keywords and boolean literals map to `"keyword"`; type names, literals, comments and
/// attributes map to their respective categories. Everything else (including plain
/// identifiers, function items, macro invocations and `!`) is left unmapped.
fn rust_node_type(kind: &str) -> Option<&'static str> {
    const KEYWORDS: &[&str] = &[
        "use", "let", "mut", "const", "static", "fn", "pub", "mod", "struct", "enum", "impl",
        "trait", "type", "where", "for", "in", "loop", "while", "if", "else", "match",
        "return", "break", "continue", "as", "ref", "self", "super", "crate", "async",
        "await", "move", "unsafe", "extern", "dyn", "true", "false",
    ];

    if KEYWORDS.contains(&kind) {
        return Some("keyword");
    }

    let category = match kind {
        "type_identifier" | "primitive_type" => "type",
        "string_literal" | "raw_string_literal" | "char_literal" => "string",
        "integer_literal" | "float_literal" => "number",
        "line_comment" | "block_comment" => "comment",
        "attribute_item" | "inner_attribute_item" => "macro",
        _ => return None,
    };
    Some(category)
}
205
/// Maps a Go grammar node kind to a highlight token type.
///
/// Keywords plus `nil`, `true` and `false` map to `"keyword"`; type and field identifiers,
/// literals and comments map to their categories. Plain identifiers and all other kinds
/// are left unmapped.
fn go_node_type(kind: &str) -> Option<&'static str> {
    let is_keyword = matches!(
        kind,
        "package"
            | "import"
            | "func"
            | "return"
            | "var"
            | "const"
            | "type"
            | "struct"
            | "interface"
            | "map"
            | "chan"
            | "go"
            | "defer"
            | "if"
            | "else"
            | "for"
            | "range"
            | "switch"
            | "case"
            | "default"
            | "select"
            | "break"
            | "continue"
            | "fallthrough"
            | "goto"
            | "nil"
            | "true"
            | "false"
    );
    if is_keyword {
        return Some("keyword");
    }

    if matches!(
        kind,
        "interpreted_string_literal" | "raw_string_literal" | "rune_literal"
    ) {
        return Some("string");
    }
    if matches!(kind, "int_literal" | "float_literal" | "imaginary_literal") {
        return Some("number");
    }

    match kind {
        "type_identifier" => Some("type"),
        "field_identifier" => Some("property"),
        "comment" => Some("comment"),
        _ => None,
    }
}
221
/// Maps a TypeScript grammar node kind to a highlight token type.
///
/// The lookup walks a small category table; the first category listing `kind` wins.
/// Plain identifiers and any kind absent from the table are left unmapped.
fn ts_node_type(kind: &str) -> Option<&'static str> {
    const CATEGORIES: &[(&str, &[&str])] = &[
        (
            "keyword",
            &[
                "import", "export", "from", "const", "let", "var", "function", "return", "if",
                "else", "for", "while", "do", "switch", "case", "break", "continue", "class",
                "extends", "implements", "new", "this", "super", "typeof", "instanceof", "in",
                "of", "async", "await", "yield", "throw", "try", "catch", "finally", "default",
                "void", "delete", "true", "false", "null", "undefined", "type", "interface",
                "enum", "namespace", "declare", "as", "readonly", "abstract", "static",
                "private", "protected", "public", "keyof", "infer", "satisfies",
            ],
        ),
        ("type", &["type_identifier", "predefined_type"]),
        ("property", &["property_identifier"]),
        ("string", &["string", "template_string"]),
        // Regex literals share the "number" category with numeric literals.
        ("number", &["number", "regex"]),
        ("comment", &["comment"]),
    ];

    CATEGORIES
        .iter()
        .find(|(_, kinds)| kinds.contains(&kind))
        .map(|(category, _)| *category)
}
242
/// Maps a Python grammar node kind to a highlight token type.
///
/// Keywords, boolean operators and the `True`/`False`/`None` constants map to `"keyword"`;
/// types, strings, numbers, comments and decorators map to their categories. Plain
/// identifiers and all other kinds are left unmapped.
fn python_node_type(kind: &str) -> Option<&'static str> {
    let category = if matches!(
        kind,
        "import"
            | "from"
            | "def"
            | "class"
            | "return"
            | "if"
            | "elif"
            | "else"
            | "for"
            | "while"
            | "break"
            | "continue"
            | "pass"
            | "raise"
            | "try"
            | "except"
            | "finally"
            | "with"
            | "as"
            | "lambda"
            | "yield"
            | "global"
            | "nonlocal"
            | "assert"
            | "del"
            | "and"
            | "or"
            | "not"
            | "is"
            | "in"
            | "True"
            | "False"
            | "None"
            | "async"
            | "await"
    ) {
        "keyword"
    } else {
        match kind {
            "type" => "type",
            "string" | "concatenated_string" => "string",
            "integer" | "float" => "number",
            "comment" => "comment",
            "decorator" => "macro",
            _ => return None,
        }
    };
    Some(category)
}
260
/// Maps a Markdown block-grammar node kind to a highlight token type.
///
/// Covers headings and their markers, code blocks, block quotes, list markers and
/// thematic breaks; every other kind is left unmapped.
fn md_node_type(kind: &str) -> Option<&'static str> {
    const HEADING_KINDS: &[&str] = &[
        "atx_heading",
        "setext_heading",
        "atx_h1_marker",
        "atx_h2_marker",
        "atx_h3_marker",
        "atx_h4_marker",
        "atx_h5_marker",
        "atx_h6_marker",
    ];
    const CODE_KINDS: &[&str] = &[
        "fenced_code_block",
        "indented_code_block",
        "code_fence_content",
        "info_string",
    ];
    const LIST_MARKER_KINDS: &[&str] = &[
        "list_marker_dot",
        "list_marker_minus",
        "list_marker_star",
        "list_marker_plus",
        "list_marker_parenthesis",
    ];

    if HEADING_KINDS.contains(&kind) {
        Some("heading")
    } else if CODE_KINDS.contains(&kind) {
        Some("code")
    } else if kind == "block_quote" || kind == "block_quote_marker" {
        Some("quote")
    } else if LIST_MARKER_KINDS.contains(&kind) {
        Some("list_marker")
    } else if kind == "thematic_break" {
        Some("punctuation")
    } else {
        None
    }
}
278
/// Maps a Markdown inline-grammar node kind to a highlight token type.
///
/// Covers code spans, emphasis, strong emphasis, every link/autolink/image form and
/// strikethrough; every other kind is left unmapped.
fn md_inline_node_type(kind: &str) -> Option<&'static str> {
    const LINK_KINDS: &[&str] = &[
        "inline_link",
        "full_reference_link",
        "collapsed_reference_link",
        "shortcut_link",
        "uri_autolink",
        "email_autolink",
        "image",
    ];

    if LINK_KINDS.contains(&kind) {
        return Some("link");
    }

    let category = match kind {
        "code_span" => "code",
        "emphasis" => "emphasis",
        "strong_emphasis" => "strong",
        "strikethrough" => "punctuation",
        _ => return None,
    };
    Some(category)
}