Skip to main content

gobby_code/index/
parser.rs

1//! Tree-sitter AST parsing for symbol, import, and call extraction.
2//! Ports logic from src/gobby/code_index/parser.py.
3
4use std::collections::HashSet;
5use std::path::Path;
6
7use anyhow::Context as _;
8use streaming_iterator::StreamingIterator;
9use tree_sitter::{Parser, Query, QueryCursor};
10
11mod calls;
12
13use crate::index::hasher::symbol_content_hash;
14use crate::index::import_resolution::{self, ExtractedImports};
15use crate::index::languages;
16use crate::index::security;
17use crate::index::semantic::SemanticCallResolver;
18use crate::models::{ParseResult, Symbol};
19use calls::{CallExtractionContext, extract_calls};
20
21pub use crate::index::import_resolution::{
22    ImportResolutionContext, build_import_resolution_context,
23};
24
25#[cfg(test)]
26use calls::{call_qualifier_path, line_terminator_len, split_qualified_callee};
27
28/// Maximum file size to index (10 MB).
29const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
30
31pub(crate) fn parse_file_with_semantic(
32    file_path: &Path,
33    project_id: &str,
34    root_path: &Path,
35    exclude_patterns: &[String],
36    import_context: &ImportResolutionContext,
37    semantic_resolver: Option<&mut (dyn SemanticCallResolver + '_)>,
38) -> anyhow::Result<Option<ParseResult>> {
39    // Security checks
40    if !security::validate_path(file_path, root_path) {
41        return Ok(None);
42    }
43    if !security::is_symlink_safe(file_path, root_path) {
44        return Ok(None);
45    }
46    if security::should_exclude_path(root_path, file_path, exclude_patterns) {
47        return Ok(None);
48    }
49    if security::has_secret_extension(file_path) {
50        return Ok(None);
51    }
52
53    let Ok(meta) = file_path.metadata() else {
54        return Ok(None);
55    };
56    if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
57        return Ok(None);
58    }
59
60    if security::is_binary(file_path) {
61        return Ok(None);
62    }
63
64    let file_str = file_path.to_string_lossy();
65    let Some(language) = languages::detect_language(&file_str) else {
66        return Ok(None);
67    };
68    let Some(spec) = languages::get_spec(language) else {
69        return Ok(None);
70    };
71    let Some(ts_lang) = languages::get_ts_language(language) else {
72        return Ok(None);
73    };
74
75    let Ok(source) = std::fs::read(file_path) else {
76        return Ok(None);
77    };
78
79    let mut parser = Parser::new();
80    if parser.set_language(&ts_lang).is_err() {
81        return Ok(None);
82    }
83    let Some(tree) = parser.parse(&source, None) else {
84        return Ok(None);
85    };
86
87    let rel_path = file_path
88        .canonicalize()
89        .ok()
90        .and_then(|abs| {
91            root_path.canonicalize().ok().and_then(|root| {
92                abs.strip_prefix(&root)
93                    .ok()
94                    .map(|p| p.to_string_lossy().to_string())
95            })
96        })
97        .unwrap_or_else(|| file_str.to_string());
98
99    let mut symbols = extract_symbols(
100        &tree, &source, spec, language, &ts_lang, project_id, &rel_path,
101    )?;
102    link_parents(&mut symbols);
103    let extracted_imports = extract_imports(
104        &tree,
105        &source,
106        spec,
107        language,
108        &ts_lang,
109        &rel_path,
110        import_context,
111    )?;
112    let calls = extract_calls(
113        &tree,
114        &source,
115        spec,
116        CallExtractionContext {
117            language,
118            ts_lang: &ts_lang,
119            rel_path: &rel_path,
120            symbols: &symbols,
121            import_context,
122            import_bindings: &extracted_imports.bindings,
123            file_path,
124            root_path,
125        },
126        semantic_resolver,
127    )?;
128
129    Ok(Some(ParseResult {
130        symbols,
131        imports: extracted_imports.imports,
132        calls,
133        source,
134    }))
135}
136
137fn extract_symbols(
138    tree: &tree_sitter::Tree,
139    source: &[u8],
140    spec: &languages::LanguageSpec,
141    language: &str,
142    ts_lang: &tree_sitter::Language,
143    project_id: &str,
144    rel_path: &str,
145) -> anyhow::Result<Vec<Symbol>> {
146    if spec.symbol_query.trim().is_empty() {
147        return Ok(Vec::new());
148    }
149
150    let query = Query::new(ts_lang, spec.symbol_query).with_context(|| {
151        format!("failed to compile symbol query for language `{language}` while parsing {rel_path}")
152    })?;
153
154    let mut cursor = QueryCursor::new();
155    let mut matches = cursor.matches(&query, tree.root_node(), source);
156
157    let mut symbols = Vec::new();
158    let mut seen_ids = HashSet::new();
159    let capture_names = query.capture_names();
160    let name_capture = capture_names.iter().position(|name| *name == "name");
161    let definition_kinds = capture_names
162        .iter()
163        .map(|name| name.strip_prefix("definition."))
164        .collect::<Vec<_>>();
165
166    while let Some(m) = matches.next() {
167        let mut name_text: Option<String> = None;
168        let mut def_node = None;
169        let mut kind = String::from("function");
170
171        for cap in m.captures {
172            let capture_index = cap.index as usize;
173            if name_capture == Some(capture_index) {
174                name_text = Some(
175                    String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
176                        .to_string(),
177                );
178            } else if let Some(Some(k)) = definition_kinds.get(capture_index) {
179                def_node = Some(cap.node);
180                kind = (*k).to_string();
181            }
182        }
183
184        let (name, node) = match (name_text, def_node) {
185            (Some(n), Some(d)) => (n, d),
186            _ => continue,
187        };
188
189        // Signature: first line of definition
190        let sig_end = source[node.start_byte()..]
191            .iter()
192            .position(|&b| b == b'\n')
193            .map(|p| node.start_byte() + p)
194            .unwrap_or(node.end_byte());
195        let mut signature = String::from_utf8_lossy(&source[node.start_byte()..sig_end])
196            .trim()
197            .to_string();
198        if signature.len() > 200 {
199            signature.truncate(200);
200            signature.push_str("...");
201        }
202
203        let docstring = extract_docstring(&node, source, language);
204        let c_hash =
205            symbol_content_hash(source, node.start_byte(), node.end_byte()).unwrap_or_default();
206        let symbol_id = Symbol::make_id(project_id, rel_path, &name, &kind, node.start_byte());
207
208        if seen_ids.contains(&symbol_id) {
209            continue;
210        }
211        seen_ids.insert(symbol_id.clone());
212
213        symbols.push(Symbol {
214            id: symbol_id,
215            project_id: project_id.to_string(),
216            file_path: rel_path.to_string(),
217            name: name.clone(),
218            qualified_name: name,
219            kind,
220            language: language.to_string(),
221            byte_start: node.start_byte(),
222            byte_end: node.end_byte(),
223            line_start: node.start_position().row + 1,
224            line_end: node.end_position().row + 1,
225            signature: Some(signature),
226            docstring,
227            parent_symbol_id: None,
228            content_hash: c_hash,
229            summary: None,
230            created_at: String::new(),
231            updated_at: String::new(),
232        });
233    }
234
235    Ok(symbols)
236}
237
238fn link_parents(symbols: &mut [Symbol]) {
239    let mut indices: Vec<usize> = (0..symbols.len()).collect();
240    indices.sort_by_key(|&i| symbols[i].byte_start);
241
242    for idx in 0..indices.len() {
243        let i = indices[idx];
244        for jdx in (0..idx).rev() {
245            let j = indices[jdx];
246            let parent_kind = symbols[j].kind.as_str();
247            if (parent_kind == "class" || parent_kind == "type")
248                && symbols[j].byte_start <= symbols[i].byte_start
249                && symbols[j].byte_end >= symbols[i].byte_end
250            {
251                let parent_name = symbols[j].name.clone();
252                let parent_id = symbols[j].id.clone();
253                let sym = &mut symbols[i];
254                sym.parent_symbol_id = Some(parent_id);
255                sym.qualified_name = format!("{}.{}", parent_name, sym.name);
256                if sym.kind == "function" {
257                    sym.kind = "method".to_string();
258                }
259                break;
260            }
261        }
262    }
263}
264
265fn extract_docstring(node: &tree_sitter::Node, source: &[u8], language: &str) -> Option<String> {
266    if !matches!(language, "python" | "javascript" | "typescript") {
267        return None;
268    }
269
270    let mut body = None;
271    let mut walk = node.walk();
272    for child in node.children(&mut walk) {
273        let ty = child.kind();
274        if ty == "block" || ty == "statement_block" {
275            body = Some(child);
276            break;
277        }
278    }
279    let body = body?;
280
281    let mut walk2 = body.walk();
282    for child in body.children(&mut walk2) {
283        let ty = child.kind();
284        if ty == "comment" || ty == "\n" || ty == "newline" {
285            continue;
286        }
287
288        let string_node = if ty == "string" {
289            Some(child)
290        } else if ty == "expression_statement" {
291            let mut w3 = child.walk();
292            child.children(&mut w3).find(|gc| gc.kind() == "string")
293        } else {
294            None
295        };
296
297        let string_node = string_node?;
298
299        // Try string_content child first
300        let mut w4 = string_node.walk();
301        for sc in string_node.children(&mut w4) {
302            if sc.kind() == "string_content" {
303                let raw = String::from_utf8_lossy(&source[sc.start_byte()..sc.end_byte()]);
304                let trimmed = raw.trim();
305                return if trimmed.is_empty() {
306                    None
307                } else {
308                    Some(trimmed.to_string())
309                };
310            }
311        }
312
313        // Fallback: strip quotes
314        let raw =
315            String::from_utf8_lossy(&source[string_node.start_byte()..string_node.end_byte()]);
316        let raw = raw.trim();
317        let stripped = strip_quotes(raw);
318        return if stripped.is_empty() {
319            None
320        } else {
321            Some(stripped.to_string())
322        };
323    }
324
325    None
326}
327
/// Removes one pair of matching quote delimiters from `s` and trims the
/// whitespace inside them.
///
/// Delimiters are tried in the order `"""`, `'''`, `"`, `'`; the first one
/// that both opens and closes `s` without the two occurrences overlapping is
/// used. Input with no such pair is returned unchanged.
fn strip_quotes(s: &str) -> &str {
    const DELIMITERS: [&str; 4] = ["\"\"\"", "'''", "\"", "'"];

    DELIMITERS
        .iter()
        .find_map(|delim| s.strip_prefix(*delim)?.strip_suffix(*delim))
        .map_or(s, str::trim)
}
336
337fn extract_imports(
338    tree: &tree_sitter::Tree,
339    source: &[u8],
340    spec: &languages::LanguageSpec,
341    language: &str,
342    ts_lang: &tree_sitter::Language,
343    rel_path: &str,
344    import_context: &ImportResolutionContext,
345) -> anyhow::Result<ExtractedImports> {
346    if spec.import_query.trim().is_empty() {
347        return Ok(ExtractedImports::default());
348    }
349
350    let query = Query::new(ts_lang, spec.import_query).with_context(|| {
351        format!("failed to compile import query for language `{language}` while parsing {rel_path}")
352    })?;
353
354    let mut cursor = QueryCursor::new();
355    let mut matches = cursor.matches(&query, tree.root_node(), source);
356    let capture_names = query.capture_names();
357    let import_capture = capture_names.iter().position(|name| *name == "import");
358    let mut extracted = ExtractedImports::default();
359
360    while let Some(m) = matches.next() {
361        for cap in m.captures {
362            if import_capture == Some(cap.index as usize) {
363                let text =
364                    String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
365                        .trim()
366                        .to_string();
367                import_resolution::parse_import_statement(
368                    language,
369                    &text,
370                    rel_path,
371                    import_context,
372                    &mut extracted,
373                );
374            }
375        }
376    }
377
378    import_resolution::seed_import_bindings(language, import_context, &mut extracted.bindings);
379    Ok(extracted)
380}
381
382#[cfg(test)]
383mod tests;