Skip to main content

gobby_code/index/
parser.rs

//! Tree-sitter AST parsing for symbol, import, and call extraction.
//! Ports logic from src/gobby/code_index/parser.py.
3
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
7use anyhow::Context as _;
8use streaming_iterator::StreamingIterator;
9use tree_sitter::{Parser, Query, QueryCursor};
10
11mod calls;
12
13use crate::index::MAX_FILE_SIZE;
14use crate::index::hasher::symbol_content_hash;
15use crate::index::import_resolution::{self, ExtractedImports};
16use crate::index::languages;
17use crate::index::security;
18use crate::index::semantic::SemanticCallResolver;
19use crate::models::{ParseResult, Symbol};
20use calls::{CallExtractionContext, extract_calls};
21
22pub use crate::index::import_resolution::{
23    ImportResolutionContext, build_import_resolution_context,
24};
25
26#[cfg(test)]
27use calls::{call_qualifier_path, line_terminator_len, split_qualified_callee};
28
29pub(crate) fn parse_file_with_semantic(
30    file_path: &Path,
31    project_id: &str,
32    root_path: &Path,
33    exclude_patterns: &[impl AsRef<str>],
34    import_context: &ImportResolutionContext,
35    semantic_resolver: Option<&mut (dyn SemanticCallResolver + '_)>,
36) -> anyhow::Result<Option<ParseResult>> {
37    // Security checks
38    if !security::validate_path(file_path, root_path) {
39        return Ok(None);
40    }
41    if !security::is_symlink_safe(file_path, root_path) {
42        return Ok(None);
43    }
44    if security::should_exclude_path(root_path, file_path, exclude_patterns) {
45        return Ok(None);
46    }
47    if security::has_secret_extension(file_path) {
48        return Ok(None);
49    }
50
51    let Ok(meta) = file_path.metadata() else {
52        return Ok(None);
53    };
54    if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
55        return Ok(None);
56    }
57
58    if security::is_binary(file_path) {
59        return Ok(None);
60    }
61
62    let file_str = file_path.to_string_lossy();
63    let Some(language) = languages::detect_language(&file_str) else {
64        return Ok(None);
65    };
66    let Some(spec) = languages::get_spec(language) else {
67        return Ok(None);
68    };
69    let Some(ts_lang) = languages::get_ts_language_for_path(language, &file_str) else {
70        return Ok(None);
71    };
72
73    let Ok(source) = std::fs::read(file_path) else {
74        return Ok(None);
75    };
76
77    let mut parser = Parser::new();
78    if parser.set_language(&ts_lang).is_err() {
79        return Ok(None);
80    }
81    let Some(tree) = parser.parse(&source, None) else {
82        return Ok(None);
83    };
84
85    let rel_path = file_path
86        .canonicalize()
87        .ok()
88        .and_then(|abs| {
89            root_path.canonicalize().ok().and_then(|root| {
90                abs.strip_prefix(&root)
91                    .ok()
92                    .map(|p| p.to_string_lossy().to_string())
93            })
94        })
95        .unwrap_or_else(|| file_str.to_string());
96
97    let mut symbols = extract_symbols(
98        &tree, &source, spec, language, &ts_lang, project_id, &rel_path,
99    )?;
100    link_parents(&mut symbols);
101    collapse_rust_impl_symbols(&mut symbols);
102    let extracted_imports = extract_imports(
103        &tree,
104        &source,
105        spec,
106        language,
107        &ts_lang,
108        &rel_path,
109        import_context,
110    )?;
111    let calls = extract_calls(
112        &tree,
113        &source,
114        spec,
115        CallExtractionContext {
116            language,
117            ts_lang: &ts_lang,
118            rel_path: &rel_path,
119            symbols: &symbols,
120            import_context,
121            import_bindings: &extracted_imports.bindings,
122            file_path,
123            root_path,
124        },
125        semantic_resolver,
126    )?;
127
128    Ok(Some(ParseResult {
129        symbols,
130        imports: extracted_imports.imports,
131        calls,
132        source,
133    }))
134}
135
136fn extract_symbols(
137    tree: &tree_sitter::Tree,
138    source: &[u8],
139    spec: &languages::LanguageSpec,
140    language: &str,
141    ts_lang: &tree_sitter::Language,
142    project_id: &str,
143    rel_path: &str,
144) -> anyhow::Result<Vec<Symbol>> {
145    if spec.symbol_query.trim().is_empty() {
146        return Ok(Vec::new());
147    }
148
149    let query = Query::new(ts_lang, spec.symbol_query).with_context(|| {
150        format!("failed to compile symbol query for language `{language}` while parsing {rel_path}")
151    })?;
152
153    let mut cursor = QueryCursor::new();
154    let mut matches = cursor.matches(&query, tree.root_node(), source);
155
156    let mut symbols = Vec::new();
157    let mut seen_ids = HashSet::new();
158    let capture_names = query.capture_names();
159    let name_capture = capture_names.iter().position(|name| *name == "name");
160    let definition_kinds = capture_names
161        .iter()
162        .map(|name| name.strip_prefix("definition."))
163        .collect::<Vec<_>>();
164
165    while let Some(m) = matches.next() {
166        let mut name_text: Option<String> = None;
167        let mut def_node = None;
168        let mut kind = String::from("function");
169
170        for cap in m.captures {
171            let capture_index = cap.index as usize;
172            if name_capture == Some(capture_index) {
173                name_text = Some(
174                    String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
175                        .to_string(),
176                );
177            } else if let Some(Some(k)) = definition_kinds.get(capture_index) {
178                def_node = Some(cap.node);
179                kind = (*k).to_string();
180            }
181        }
182
183        let (name, node) = match (name_text, def_node) {
184            (Some(n), Some(d)) => (n, d),
185            _ => continue,
186        };
187
188        // Signature: first line of definition
189        let sig_end = source[node.start_byte()..]
190            .iter()
191            .position(|&b| b == b'\n')
192            .map(|p| node.start_byte() + p)
193            .unwrap_or(node.end_byte());
194        let mut signature = String::from_utf8_lossy(&source[node.start_byte()..sig_end])
195            .trim()
196            .to_string();
197        if signature.len() > 200 {
198            signature.truncate(200);
199            signature.push_str("...");
200        }
201
202        let docstring = extract_docstring(&node, source, language);
203        let c_hash =
204            symbol_content_hash(source, node.start_byte(), node.end_byte()).unwrap_or_default();
205        let symbol_id = Symbol::make_id(project_id, rel_path, &name, &kind, node.start_byte());
206
207        if seen_ids.contains(&symbol_id) {
208            continue;
209        }
210        seen_ids.insert(symbol_id.clone());
211
212        symbols.push(Symbol {
213            id: symbol_id,
214            project_id: project_id.to_string(),
215            file_path: rel_path.to_string(),
216            name: name.clone(),
217            qualified_name: name,
218            kind,
219            language: language.to_string(),
220            byte_start: node.start_byte(),
221            byte_end: node.end_byte(),
222            line_start: node.start_position().row + 1,
223            line_end: node.end_position().row + 1,
224            signature: Some(signature),
225            docstring,
226            parent_symbol_id: None,
227            content_hash: c_hash,
228            summary: None,
229            created_at: String::new(),
230            updated_at: String::new(),
231        });
232    }
233
234    Ok(symbols)
235}
236
237fn link_parents(symbols: &mut [Symbol]) {
238    let mut indices: Vec<usize> = (0..symbols.len()).collect();
239    indices.sort_by_key(|&i| symbols[i].byte_start);
240
241    for idx in 0..indices.len() {
242        let i = indices[idx];
243        for jdx in (0..idx).rev() {
244            let j = indices[jdx];
245            let parent_kind = symbols[j].kind.as_str();
246            if (parent_kind == "class" || parent_kind == "type")
247                && symbols[j].byte_start <= symbols[i].byte_start
248                && symbols[j].byte_end >= symbols[i].byte_end
249            {
250                let parent_name = symbols[j].name.clone();
251                let parent_id = symbols[j].id.clone();
252                let sym = &mut symbols[i];
253                sym.parent_symbol_id = Some(parent_id);
254                sym.qualified_name = format!("{}.{}", parent_name, sym.name);
255                // Elixir module functions are still function symbols, even
256                // when nested under a defmodule symbol.
257                if sym.kind == "function" && sym.language != "elixir" {
258                    sym.kind = "method".to_string();
259                }
260                break;
261            }
262        }
263    }
264}
265
266fn collapse_rust_impl_symbols(symbols: &mut Vec<Symbol>) {
267    let canonical_types = symbols
268        .iter()
269        .filter(|symbol| {
270            symbol.language == "rust"
271                && (symbol.kind == "class" || symbol.kind == "type")
272                && !is_rust_impl_symbol(symbol)
273        })
274        .map(|symbol| {
275            (
276                (symbol.file_path.clone(), symbol.name.clone()),
277                symbol.id.clone(),
278            )
279        })
280        .collect::<HashMap<_, _>>();
281
282    // impl symbol id -> (implemented type name, canonical type id in the same file).
283    // The type name always comes from the impl block, so methods keep their
284    // `Type::method` qualified name even when the type is defined in another file
285    // (cross-file impl); only the parent link is dropped in that case.
286    let impl_parent_map = symbols
287        .iter()
288        .filter(|symbol| is_rust_impl_symbol(symbol))
289        .map(|symbol| {
290            (
291                symbol.id.clone(),
292                (
293                    symbol.name.clone(),
294                    canonical_types
295                        .get(&(symbol.file_path.clone(), symbol.name.clone()))
296                        .cloned(),
297                ),
298            )
299        })
300        .collect::<HashMap<_, _>>();
301
302    if impl_parent_map.is_empty() {
303        return;
304    }
305
306    for symbol in symbols.iter_mut() {
307        let Some(parent_id) = symbol.parent_symbol_id.as_deref() else {
308            continue;
309        };
310        let Some((type_name, canonical_id)) = impl_parent_map.get(parent_id) else {
311            continue;
312        };
313        symbol.qualified_name = format!("{type_name}::{}", symbol.name);
314        symbol.parent_symbol_id = canonical_id.clone();
315    }
316
317    symbols.retain(|symbol| !is_rust_impl_symbol(symbol));
318}
319
320fn is_rust_impl_symbol(symbol: &Symbol) -> bool {
321    symbol.language == "rust"
322        && symbol.kind == "class"
323        && symbol.signature.as_deref().is_some_and(|signature| {
324            signature.starts_with("impl ") || signature.starts_with("unsafe impl ")
325        })
326}
327
328fn extract_docstring(node: &tree_sitter::Node, source: &[u8], language: &str) -> Option<String> {
329    if !matches!(language, "python" | "javascript" | "typescript") {
330        return None;
331    }
332
333    let mut body = None;
334    let mut walk = node.walk();
335    for child in node.children(&mut walk) {
336        let ty = child.kind();
337        if ty == "block" || ty == "statement_block" {
338            body = Some(child);
339            break;
340        }
341    }
342    let body = body?;
343
344    let mut walk2 = body.walk();
345    for child in body.children(&mut walk2) {
346        let ty = child.kind();
347        if ty == "comment" || ty == "\n" || ty == "newline" {
348            continue;
349        }
350
351        let string_node = if ty == "string" {
352            Some(child)
353        } else if ty == "expression_statement" {
354            let mut w3 = child.walk();
355            child.children(&mut w3).find(|gc| gc.kind() == "string")
356        } else {
357            None
358        };
359
360        let string_node = string_node?;
361
362        // Try string_content child first
363        let mut w4 = string_node.walk();
364        for sc in string_node.children(&mut w4) {
365            if sc.kind() == "string_content" {
366                let raw = String::from_utf8_lossy(&source[sc.start_byte()..sc.end_byte()]);
367                let trimmed = raw.trim();
368                return if trimmed.is_empty() {
369                    None
370                } else {
371                    Some(trimmed.to_string())
372                };
373            }
374        }
375
376        // Fallback: strip quotes
377        let raw =
378            String::from_utf8_lossy(&source[string_node.start_byte()..string_node.end_byte()]);
379        let raw = raw.trim();
380        let stripped = strip_quotes(raw);
381        return if stripped.is_empty() {
382            None
383        } else {
384            Some(stripped.to_string())
385        };
386    }
387
388    None
389}
390
/// Removes one matching pair of quote delimiters from `s` and trims the rest.
///
/// Delimiters are tried in the order `"""`, `'''`, `"`, `'`; the first one
/// that both opens and closes `s` without overlapping wins. When none
/// matches, `s` is returned unchanged.
fn strip_quotes(s: &str) -> &str {
    const DELIMITERS: [&str; 4] = ["\"\"\"", "'''", "\"", "'"];

    DELIMITERS
        .iter()
        .find_map(|&delimiter| {
            // Stripping the suffix from what is left after the prefix means
            // the opening and closing delimiters cannot share characters.
            let inner = s.strip_prefix(delimiter)?.strip_suffix(delimiter)?;
            Some(inner.trim())
        })
        .unwrap_or(s)
}
399
400fn extract_imports(
401    tree: &tree_sitter::Tree,
402    source: &[u8],
403    spec: &languages::LanguageSpec,
404    language: &str,
405    ts_lang: &tree_sitter::Language,
406    rel_path: &str,
407    import_context: &ImportResolutionContext,
408) -> anyhow::Result<ExtractedImports> {
409    if spec.import_query.trim().is_empty() {
410        return Ok(ExtractedImports::default());
411    }
412
413    let query = Query::new(ts_lang, spec.import_query).with_context(|| {
414        format!("failed to compile import query for language `{language}` while parsing {rel_path}")
415    })?;
416
417    let mut cursor = QueryCursor::new();
418    let mut matches = cursor.matches(&query, tree.root_node(), source);
419    let capture_names = query.capture_names();
420    let import_capture = capture_names.iter().position(|name| *name == "import");
421    let mut extracted = ExtractedImports::default();
422
423    while let Some(m) = matches.next() {
424        for cap in m.captures {
425            if import_capture == Some(cap.index as usize) {
426                let text =
427                    String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
428                        .trim()
429                        .to_string();
430                import_resolution::parse_import_statement(
431                    language,
432                    &text,
433                    rel_path,
434                    import_context,
435                    &mut extracted,
436                )?;
437            }
438        }
439    }
440
441    import_resolution::seed_import_bindings(language, import_context, &mut extracted.bindings);
442    Ok(extracted)
443}
444
445#[cfg(test)]
446mod tests;