Skip to main content

rag_rat_core/index/
parser.rs

1use std::path::Path;
2
3use tree_sitter::Node;
4
5use crate::language::Language;
6
7#[derive(Debug, Clone)]
8pub struct ParsedSymbol {
9    pub name: String,
10    pub qualified_name: String,
11    pub kind: String,
12    pub start_byte: usize,
13    pub end_byte: usize,
14    pub start_line: usize,
15    pub end_line: usize,
16    pub signature: Option<String>,
17    pub docs: Option<String>,
18    pub facts: Vec<ParsedSymbolFact>,
19}
20
/// A `kind`/`value` pair describing an additional property of a symbol,
/// for example `rust_attr` / `uniffi_export`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedSymbolFact {
    /// Category of the fact.
    pub kind: String,
    /// Value of the fact within its category.
    pub value: String,
}
26
/// Syntax-node kinds that are accepted as the name of a symbol when a node
/// has no explicit `name` field.
const NAME_KINDS: &[&str] = &[
    "identifier",
    "type_identifier",
    "property_identifier",
    "field_identifier",
    "simple_identifier",
    "namespace_identifier",
];
35
/// Which parser (grammar) is used for a file.
///
/// TypeScript is split into plain TypeScript and TSX; Markdown is listed but
/// produces no symbols in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParserKind {
    /// Rust grammar.
    Rust,
    /// TypeScript grammar (non-`.tsx` files).
    TypeScript,
    /// TSX grammar (`.tsx` files).
    Tsx,
    /// Kotlin grammar.
    Kotlin,
    /// C grammar.
    C,
    /// C++ grammar.
    Cpp,
    /// Markdown; no structural parsing is performed here.
    Markdown,
}
46
47pub fn parser_kind(path: &Path, language: Language) -> ParserKind {
48    match language {
49        Language::Rust => ParserKind::Rust,
50        Language::TypeScript => {
51            if path.extension().and_then(|ext| ext.to_str()) == Some("tsx") {
52                ParserKind::Tsx
53            } else {
54                ParserKind::TypeScript
55            }
56        },
57        Language::Kotlin => ParserKind::Kotlin,
58        Language::C => ParserKind::C,
59        Language::Cpp => ParserKind::Cpp,
60        Language::Markdown => ParserKind::Markdown,
61    }
62}
63
64pub fn parse_symbols(
65    path: &Path,
66    language: Language,
67    text: &str,
68) -> anyhow::Result<Vec<ParsedSymbol>> {
69    match parser_kind(path, language) {
70        ParserKind::Rust => {
71            parse_tree_sitter(path, language, text, tree_sitter_rust::LANGUAGE.into())
72        },
73        ParserKind::TypeScript => parse_tree_sitter(
74            path,
75            language,
76            text,
77            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
78        ),
79        ParserKind::Tsx => {
80            parse_tree_sitter(path, language, text, tree_sitter_typescript::LANGUAGE_TSX.into())
81        },
82        ParserKind::Kotlin => {
83            parse_tree_sitter(path, language, text, tree_sitter_kotlin::LANGUAGE.into())
84        },
85        ParserKind::C => parse_tree_sitter(path, language, text, tree_sitter_c::LANGUAGE.into()),
86        ParserKind::Cpp => {
87            parse_tree_sitter(path, language, text, tree_sitter_cpp::LANGUAGE.into())
88        },
89        ParserKind::Markdown => Ok(Vec::new()),
90    }
91}
92
93pub fn parse_error(path: &Path, language: Language, text: &str) -> anyhow::Result<Option<String>> {
94    let grammar = match parser_kind(path, language) {
95        ParserKind::Rust => tree_sitter_rust::LANGUAGE.into(),
96        ParserKind::TypeScript => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
97        ParserKind::Tsx => tree_sitter_typescript::LANGUAGE_TSX.into(),
98        ParserKind::Kotlin => tree_sitter_kotlin::LANGUAGE.into(),
99        ParserKind::C => tree_sitter_c::LANGUAGE.into(),
100        ParserKind::Cpp => tree_sitter_cpp::LANGUAGE.into(),
101        ParserKind::Markdown => return Ok(None),
102    };
103    let mut parser = tree_sitter::Parser::new();
104    parser.set_language(&grammar)?;
105    let tree =
106        parser.parse(text, None).ok_or_else(|| anyhow::anyhow!("tree-sitter parse failed"))?;
107    Ok(tree.root_node().has_error().then(|| {
108        "tree-sitter parse produced error nodes; partial structural index was retained".to_string()
109    }))
110}
111
112fn parse_tree_sitter(
113    path: &Path,
114    language: Language,
115    text: &str,
116    grammar: tree_sitter::Language,
117) -> anyhow::Result<Vec<ParsedSymbol>> {
118    let mut parser = tree_sitter::Parser::new();
119    parser.set_language(&grammar)?;
120    let tree =
121        parser.parse(text, None).ok_or_else(|| anyhow::anyhow!("tree-sitter parse failed"))?;
122    let mut out = Vec::new();
123    collect_symbols(path, language, text, tree.root_node(), &mut out);
124    out.sort_by_key(|symbol| (symbol.start_byte, symbol.end_byte));
125    out.dedup_by_key(|symbol| (symbol.start_byte, symbol.end_byte, symbol.name.clone()));
126    Ok(out)
127}
128
129fn collect_symbols(
130    path: &Path,
131    language: Language,
132    text: &str,
133    node: Node<'_>,
134    out: &mut Vec<ParsedSymbol>,
135) {
136    if node.is_error() || node.is_missing() {
137        return;
138    }
139    if let Some((kind, name_node)) = symbol_node(language, node) {
140        let name = node_text(name_node, text).unwrap_or_default();
141        if !name.is_empty() {
142            out.push(make_symbol(path, language, text, node, kind, name));
143        }
144    }
145    let mut cursor = node.walk();
146    for child in node.named_children(&mut cursor) {
147        collect_symbols(path, language, text, child, out);
148    }
149}
150
151fn symbol_node(language: Language, node: Node<'_>) -> Option<(&'static str, Node<'_>)> {
152    let kind = node.kind();
153    match language {
154        Language::Rust => match kind {
155            "function_item" => Some(("function", child_name(node)?)),
156            "struct_item" => Some(("struct", child_name(node)?)),
157            "enum_item" => Some(("enum", child_name(node)?)),
158            "trait_item" => Some(("trait", child_name(node)?)),
159            "impl_item" => Some(("impl", impl_name(node).unwrap_or(node))),
160            "mod_item" => Some(("module", child_name(node)?)),
161            "const_item" => Some(("const", child_name(node)?)),
162            "static_item" => Some(("static", child_name(node)?)),
163            "type_item" => Some(("type", child_name(node)?)),
164            "macro_definition" => Some(("macro", child_name(node)?)),
165            _ => None,
166        },
167        Language::TypeScript => match kind {
168            "function_declaration" | "method_definition" | "generator_function_declaration" => {
169                Some(("function", child_name(node)?))
170            },
171            "class_declaration" => Some(("class", child_name(node)?)),
172            "interface_declaration" => Some(("interface", child_name(node)?)),
173            "type_alias_declaration" => Some(("type", child_name(node)?)),
174            "variable_declarator" | "public_field_definition" => Some(("const", child_name(node)?)),
175            _ => None,
176        },
177        Language::Kotlin => match kind {
178            "class_declaration" => Some(("class", child_name(node)?)),
179            "object_declaration" => Some(("object", child_name(node)?)),
180            "function_declaration" => Some(("function", child_name(node)?)),
181            "property_declaration" => Some(("property", kotlin_property_name(node)?)),
182            "companion_object" | "companion_object_declaration" => {
183                Some(("object", companion_name(node).unwrap_or(node)))
184            },
185            _ => None,
186        },
187        Language::C => match kind {
188            "function_definition" => {
189                Some(("function", function_name(node).or_else(|| child_name(node))?))
190            },
191            "declaration" if has_descendant_kind(node, "function_declarator") => {
192                Some(("function", function_name(node).or_else(|| child_name(node))?))
193            },
194            "struct_specifier" => Some(("struct", child_name(node)?)),
195            "union_specifier" => Some(("union", child_name(node)?)),
196            "enum_specifier" => Some(("enum", child_name(node)?)),
197            "type_definition" => Some(("type", child_name(node)?)),
198            "preproc_function_def" => Some(("macro", child_name(node)?)),
199            _ => None,
200        },
201        Language::Cpp => match kind {
202            "function_definition" => {
203                Some(("function", function_name(node).or_else(|| child_name(node))?))
204            },
205            "declaration" if has_descendant_kind(node, "function_declarator") => {
206                Some(("function", function_name(node).or_else(|| child_name(node))?))
207            },
208            "class_specifier" => Some(("class", child_name(node)?)),
209            "struct_specifier" => Some(("struct", child_name(node)?)),
210            "union_specifier" => Some(("union", child_name(node)?)),
211            "enum_specifier" => Some(("enum", child_name(node)?)),
212            "type_definition" | "alias_declaration" => Some(("type", child_name(node)?)),
213            "namespace_definition" => Some(("namespace", child_name(node)?)),
214            "preproc_function_def" => Some(("macro", child_name(node)?)),
215            _ => None,
216        },
217        Language::Markdown => None,
218    }
219}
220
221fn child_name(node: Node<'_>) -> Option<Node<'_>> {
222    if let Some(name) = node.child_by_field_name("name") {
223        return Some(name);
224    }
225
226    let mut cursor = node.walk();
227    if let Some(name) =
228        node.named_children(&mut cursor).find(|child| NAME_KINDS.contains(&child.kind()))
229    {
230        return Some(name);
231    }
232
233    let mut cursor = node.walk();
234    node.named_children(&mut cursor).find_map(|child| first_descendant_node(child, NAME_KINDS))
235}
236
237fn first_descendant_node<'tree>(node: Node<'tree>, kinds: &[&str]) -> Option<Node<'tree>> {
238    let mut cursor = node.walk();
239    for child in node.named_children(&mut cursor) {
240        if kinds.contains(&child.kind()) {
241            return Some(child);
242        }
243        if let Some(value) = first_descendant_node(child, kinds) {
244            return Some(value);
245        }
246    }
247    None
248}
249
250fn has_descendant_kind(node: Node<'_>, kind: &str) -> bool {
251    let mut cursor = node.walk();
252    node.named_children(&mut cursor)
253        .any(|child| child.kind() == kind || has_descendant_kind(child, kind))
254}
255
256fn companion_name(node: Node<'_>) -> Option<Node<'_>> {
257    for index in 0..node.child_count() {
258        let Some(index) = u32::try_from(index).ok() else {
259            continue;
260        };
261        if let Some(child) = node.child(index)
262            && child.kind() == "companion"
263        {
264            return Some(child);
265        }
266    }
267    let mut cursor = node.walk();
268    node.named_children(&mut cursor)
269        .find(|child| matches!(child.kind(), "simple_identifier" | "type_identifier"))
270}
271
272fn kotlin_property_name(node: Node<'_>) -> Option<Node<'_>> {
273    child_name(kotlin_variable_declaration(node).unwrap_or(node))
274}
275
276fn kotlin_variable_declaration(node: Node<'_>) -> Option<Node<'_>> {
277    let mut cursor = node.walk();
278    node.named_children(&mut cursor).find_map(|child| {
279        if child.kind() == "variable_declaration" {
280            Some(child)
281        } else if matches!(child.kind(), "modifiers" | "type_parameters" | "type_constraints") {
282            None
283        } else {
284            kotlin_variable_declaration(child)
285        }
286    })
287}
288
289fn function_name(node: Node<'_>) -> Option<Node<'_>> {
290    let declarator = first_descendant_node(node, &["function_declarator"]).unwrap_or(node);
291    let name_root = declarator.child_by_field_name("declarator").unwrap_or(declarator);
292    if NAME_KINDS.contains(&name_root.kind()) {
293        return Some(name_root);
294    }
295    last_descendant_node(name_root, NAME_KINDS)
296}
297
298fn last_descendant_node<'tree>(node: Node<'tree>, kinds: &[&str]) -> Option<Node<'tree>> {
299    let mut cursor = node.walk();
300    let mut last = None;
301    for child in node.named_children(&mut cursor) {
302        if kinds.contains(&child.kind()) {
303            last = Some(child);
304        }
305        if let Some(value) = last_descendant_node(child, kinds) {
306            last = Some(value);
307        }
308    }
309    last
310}
311
312fn impl_name(node: Node<'_>) -> Option<Node<'_>> {
313    let mut cursor = node.walk();
314    node.named_children(&mut cursor).find(|child| {
315        matches!(child.kind(), "type_identifier" | "generic_type" | "scoped_type_identifier")
316    })
317}
318
319fn make_symbol(
320    path: &Path,
321    language: Language,
322    text: &str,
323    node: Node<'_>,
324    kind: &str,
325    name: String,
326) -> ParsedSymbol {
327    let start_byte = node.start_byte();
328    let end_byte = node.end_byte();
329    let start_line = byte_to_line(text, start_byte);
330    let end_line = byte_to_line(text, end_byte);
331    ParsedSymbol {
332        qualified_name: format!("{}::{name}", path.to_string_lossy().replace('\\', "/")),
333        name,
334        kind: kind.to_string(),
335        start_byte,
336        end_byte,
337        start_line,
338        end_line,
339        signature: signature_for(text, start_byte, end_byte),
340        docs: docs_before(text, start_byte),
341        facts: symbol_facts(language, text, node),
342    }
343}
344
345fn symbol_facts(language: Language, text: &str, node: Node<'_>) -> Vec<ParsedSymbolFact> {
346    if language != Language::Rust {
347        return Vec::new();
348    }
349    let mut facts = Vec::new();
350    for attribute in rust_attribute_items(text, node) {
351        if rust_attribute_is_uniffi_export(&attribute) {
352            facts.push(ParsedSymbolFact {
353                kind: "rust_attr".to_string(),
354                value: "uniffi_export".to_string(),
355            });
356        }
357    }
358    facts.sort_by(|left, right| (&left.kind, &left.value).cmp(&(&right.kind, &right.value)));
359    facts.dedup();
360    facts
361}
362
363fn rust_attribute_items(text: &str, node: Node<'_>) -> Vec<String> {
364    let mut attributes = Vec::new();
365    let mut cursor = node.walk();
366    for child in node.named_children(&mut cursor) {
367        if child.kind() == "attribute_item" {
368            attributes.push(node_text(child, text).unwrap_or_default());
369        }
370    }
371
372    let mut preceding = Vec::new();
373    let mut sibling = node.prev_named_sibling();
374    while let Some(previous) = sibling {
375        if previous.kind() != "attribute_item" {
376            break;
377        }
378        preceding.push(node_text(previous, text).unwrap_or_default());
379        sibling = previous.prev_named_sibling();
380    }
381    preceding.reverse();
382    preceding.extend(attributes);
383    preceding
384}
385
/// Tells whether the attribute text mentions `uniffi::export`.
///
/// This is a plain substring test, so the absolute form `::uniffi::export`
/// is matched as well.
fn rust_attribute_is_uniffi_export(attribute: &str) -> bool {
    const EXPORT_PATH: &str = "uniffi::export";
    attribute.find(EXPORT_PATH).is_some()
}
389
390fn node_text(node: Node<'_>, text: &str) -> Option<String> {
391    node.utf8_text(text.as_bytes()).ok().map(ToOwned::to_owned)
392}
393
/// Converts a byte offset into a 1-based line number.
///
/// The line number is one plus the count of `\n` bytes before `byte`.
/// Offsets past the end of `text` are clamped to its length. The count runs
/// over raw bytes, so an offset that falls inside a multi-byte character is
/// handled without panicking.
fn byte_to_line(text: &str, byte: usize) -> usize {
    let end = byte.min(text.len());
    text.as_bytes()[..end].iter().filter(|&&b| b == b'\n').count() + 1
}
397
/// Returns the first non-blank line of `text[start_byte..end_byte]`, trimmed.
///
/// Yields `None` when the range is not a valid slice of `text` or contains
/// only blank lines.
fn signature_for(text: &str, start_byte: usize, end_byte: usize) -> Option<String> {
    let span = text.get(start_byte..end_byte)?;
    for line in span.lines() {
        let stripped = line.trim();
        if !stripped.is_empty() {
            return Some(stripped.to_string());
        }
    }
    None
}
404
/// Collects the doc comment located above the symbol that starts at
/// `start_byte`.
///
/// Lines are scanned upwards from the symbol. `///` lines and block-comment
/// lines (`/** ...`, `* ...`) contribute their text; blank lines, bare `/**`
/// and `*/` lines and single-line `#[...]` attribute lines are stepped over;
/// any other line ends the scan. Marker-only doc lines (`///`, `*`) are kept
/// as empty lines so paragraphs stay separated, except at the very start or
/// end of the result.
///
/// Returns `None` when `start_byte` is not a valid slice boundary of `text`
/// or when no doc text was found.
fn docs_before(text: &str, start_byte: usize) -> Option<String> {
    let before = text.get(..start_byte)?;
    let mut lines = before.lines().rev().peekable();

    // When the symbol does not start its line, the first item is whatever
    // precedes it on that same line (indentation, `export `, `const `, ...).
    // That fragment must not end the scan, so drop it unless it is itself
    // part of a doc comment.
    if !before.ends_with('\n') {
        let _ = lines.next_if(|prefix| doc_comment_payload(prefix.trim()).is_none());
    }

    let mut docs: Vec<&str> = Vec::new();
    for line in lines {
        let trimmed = line.trim();
        // Attribute lines sit between a doc comment and its item, so they are
        // skipped rather than treated as code. Attributes that span several
        // lines are not recognised.
        if trimmed.is_empty() || matches!(trimmed, "/**" | "*/") || trimmed.starts_with("#[") {
            continue;
        }
        match doc_comment_payload(trimmed) {
            Some(payload) => docs.push(payload),
            None => break,
        }
    }
    docs.reverse();

    // Trim empty paragraphs separators from both ends; bail out if nothing
    // but empty lines was collected.
    let first = docs.iter().position(|line| !line.is_empty())?;
    let last = docs.iter().rposition(|line| !line.is_empty())?;
    Some(docs[first..=last].join("\n"))
}

/// Strips the comment markers from one doc-comment line.
///
/// Returns `None` both for lines that are not doc-comment lines and for doc
/// lines that carry no text once the markers are removed.
fn clean_doc_comment_line(trimmed: &str) -> Option<String> {
    doc_comment_payload(trimmed).filter(|line| !line.is_empty()).map(str::to_owned)
}

/// Returns the text of a doc-comment line without its markers.
///
/// `None` means the line is not a doc-comment line at all; `Some("")` means
/// it is one but holds no text (for example a bare `///` or `*`).
fn doc_comment_payload(trimmed: &str) -> Option<&str> {
    let body = if trimmed.starts_with("///") {
        trimmed.trim_start_matches('/')
    } else if trimmed.starts_with('*') || trimmed.starts_with("/**") {
        // Remove a closing `*/` as a unit so that single-line `/** text */`
        // leaves no stray `*` and trailing slashes in the text survive.
        let unclosed = trimmed.strip_suffix("*/").unwrap_or(trimmed);
        unclosed.trim_start_matches('/').trim_start_matches('*')
    } else {
        return None;
    };
    Some(body.trim())
}