1use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
7use anyhow::Context as _;
8use streaming_iterator::StreamingIterator;
9use tree_sitter::{Parser, Query, QueryCursor};
10
11mod calls;
12
13use crate::index::MAX_FILE_SIZE;
14use crate::index::hasher::symbol_content_hash;
15use crate::index::import_resolution::{self, ExtractedImports};
16use crate::index::languages;
17use crate::index::security;
18use crate::index::semantic::SemanticCallResolver;
19use crate::models::{ParseResult, Symbol};
20use calls::{CallExtractionContext, extract_calls};
21
22pub use crate::index::import_resolution::{
23 ImportResolutionContext, build_import_resolution_context,
24};
25
26#[cfg(test)]
27use calls::{call_qualifier_path, line_terminator_len, split_qualified_callee};
28
29pub(crate) fn parse_file_with_semantic(
30 file_path: &Path,
31 project_id: &str,
32 root_path: &Path,
33 exclude_patterns: &[impl AsRef<str>],
34 import_context: &ImportResolutionContext,
35 semantic_resolver: Option<&mut (dyn SemanticCallResolver + '_)>,
36) -> anyhow::Result<Option<ParseResult>> {
37 if !security::validate_path(file_path, root_path) {
39 return Ok(None);
40 }
41 if !security::is_symlink_safe(file_path, root_path) {
42 return Ok(None);
43 }
44 if security::should_exclude_path(root_path, file_path, exclude_patterns) {
45 return Ok(None);
46 }
47 if security::has_secret_extension(file_path) {
48 return Ok(None);
49 }
50
51 let Ok(meta) = file_path.metadata() else {
52 return Ok(None);
53 };
54 if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
55 return Ok(None);
56 }
57
58 if security::is_binary(file_path) {
59 return Ok(None);
60 }
61
62 let file_str = file_path.to_string_lossy();
63 let Some(language) = languages::detect_language(&file_str) else {
64 return Ok(None);
65 };
66 let Some(spec) = languages::get_spec(language) else {
67 return Ok(None);
68 };
69 let Some(ts_lang) = languages::get_ts_language_for_path(language, &file_str) else {
70 return Ok(None);
71 };
72
73 let Ok(source) = std::fs::read(file_path) else {
74 return Ok(None);
75 };
76
77 let mut parser = Parser::new();
78 if parser.set_language(&ts_lang).is_err() {
79 return Ok(None);
80 }
81 let Some(tree) = parser.parse(&source, None) else {
82 return Ok(None);
83 };
84
85 let rel_path = file_path
86 .canonicalize()
87 .ok()
88 .and_then(|abs| {
89 root_path.canonicalize().ok().and_then(|root| {
90 abs.strip_prefix(&root)
91 .ok()
92 .map(|p| p.to_string_lossy().to_string())
93 })
94 })
95 .unwrap_or_else(|| file_str.to_string());
96
97 let mut symbols = extract_symbols(
98 &tree, &source, spec, language, &ts_lang, project_id, &rel_path,
99 )?;
100 link_parents(&mut symbols);
101 collapse_rust_impl_symbols(&mut symbols);
102 let extracted_imports = extract_imports(
103 &tree,
104 &source,
105 spec,
106 language,
107 &ts_lang,
108 &rel_path,
109 import_context,
110 )?;
111 let calls = extract_calls(
112 &tree,
113 &source,
114 spec,
115 CallExtractionContext {
116 language,
117 ts_lang: &ts_lang,
118 rel_path: &rel_path,
119 symbols: &symbols,
120 import_context,
121 import_bindings: &extracted_imports.bindings,
122 file_path,
123 root_path,
124 },
125 semantic_resolver,
126 )?;
127
128 Ok(Some(ParseResult {
129 symbols,
130 imports: extracted_imports.imports,
131 calls,
132 source,
133 }))
134}
135
136fn extract_symbols(
137 tree: &tree_sitter::Tree,
138 source: &[u8],
139 spec: &languages::LanguageSpec,
140 language: &str,
141 ts_lang: &tree_sitter::Language,
142 project_id: &str,
143 rel_path: &str,
144) -> anyhow::Result<Vec<Symbol>> {
145 if spec.symbol_query.trim().is_empty() {
146 return Ok(Vec::new());
147 }
148
149 let query = Query::new(ts_lang, spec.symbol_query).with_context(|| {
150 format!("failed to compile symbol query for language `{language}` while parsing {rel_path}")
151 })?;
152
153 let mut cursor = QueryCursor::new();
154 let mut matches = cursor.matches(&query, tree.root_node(), source);
155
156 let mut symbols = Vec::new();
157 let mut seen_ids = HashSet::new();
158 let capture_names = query.capture_names();
159 let name_capture = capture_names.iter().position(|name| *name == "name");
160 let definition_kinds = capture_names
161 .iter()
162 .map(|name| name.strip_prefix("definition."))
163 .collect::<Vec<_>>();
164
165 while let Some(m) = matches.next() {
166 let mut name_text: Option<String> = None;
167 let mut def_node = None;
168 let mut kind = String::from("function");
169
170 for cap in m.captures {
171 let capture_index = cap.index as usize;
172 if name_capture == Some(capture_index) {
173 name_text = Some(
174 String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
175 .to_string(),
176 );
177 } else if let Some(Some(k)) = definition_kinds.get(capture_index) {
178 def_node = Some(cap.node);
179 kind = (*k).to_string();
180 }
181 }
182
183 let (name, node) = match (name_text, def_node) {
184 (Some(n), Some(d)) => (n, d),
185 _ => continue,
186 };
187
188 let sig_end = source[node.start_byte()..]
190 .iter()
191 .position(|&b| b == b'\n')
192 .map(|p| node.start_byte() + p)
193 .unwrap_or(node.end_byte());
194 let mut signature = String::from_utf8_lossy(&source[node.start_byte()..sig_end])
195 .trim()
196 .to_string();
197 if signature.len() > 200 {
198 signature.truncate(200);
199 signature.push_str("...");
200 }
201
202 let docstring = extract_docstring(&node, source, language);
203 let c_hash =
204 symbol_content_hash(source, node.start_byte(), node.end_byte()).unwrap_or_default();
205 let symbol_id = Symbol::make_id(project_id, rel_path, &name, &kind, node.start_byte());
206
207 if seen_ids.contains(&symbol_id) {
208 continue;
209 }
210 seen_ids.insert(symbol_id.clone());
211
212 symbols.push(Symbol {
213 id: symbol_id,
214 project_id: project_id.to_string(),
215 file_path: rel_path.to_string(),
216 name: name.clone(),
217 qualified_name: name,
218 kind,
219 language: language.to_string(),
220 byte_start: node.start_byte(),
221 byte_end: node.end_byte(),
222 line_start: node.start_position().row + 1,
223 line_end: node.end_position().row + 1,
224 signature: Some(signature),
225 docstring,
226 parent_symbol_id: None,
227 content_hash: c_hash,
228 summary: None,
229 created_at: String::new(),
230 updated_at: String::new(),
231 });
232 }
233
234 Ok(symbols)
235}
236
237fn link_parents(symbols: &mut [Symbol]) {
238 let mut indices: Vec<usize> = (0..symbols.len()).collect();
239 indices.sort_by_key(|&i| symbols[i].byte_start);
240
241 for idx in 0..indices.len() {
242 let i = indices[idx];
243 for jdx in (0..idx).rev() {
244 let j = indices[jdx];
245 let parent_kind = symbols[j].kind.as_str();
246 if (parent_kind == "class" || parent_kind == "type")
247 && symbols[j].byte_start <= symbols[i].byte_start
248 && symbols[j].byte_end >= symbols[i].byte_end
249 {
250 let parent_name = symbols[j].name.clone();
251 let parent_id = symbols[j].id.clone();
252 let sym = &mut symbols[i];
253 sym.parent_symbol_id = Some(parent_id);
254 sym.qualified_name = format!("{}.{}", parent_name, sym.name);
255 if sym.kind == "function" && sym.language != "elixir" {
258 sym.kind = "method".to_string();
259 }
260 break;
261 }
262 }
263 }
264}
265
266fn collapse_rust_impl_symbols(symbols: &mut Vec<Symbol>) {
267 let canonical_types = symbols
268 .iter()
269 .filter(|symbol| {
270 symbol.language == "rust"
271 && (symbol.kind == "class" || symbol.kind == "type")
272 && !is_rust_impl_symbol(symbol)
273 })
274 .map(|symbol| {
275 (
276 (symbol.file_path.clone(), symbol.name.clone()),
277 symbol.id.clone(),
278 )
279 })
280 .collect::<HashMap<_, _>>();
281
282 let impl_parent_map = symbols
287 .iter()
288 .filter(|symbol| is_rust_impl_symbol(symbol))
289 .map(|symbol| {
290 (
291 symbol.id.clone(),
292 (
293 symbol.name.clone(),
294 canonical_types
295 .get(&(symbol.file_path.clone(), symbol.name.clone()))
296 .cloned(),
297 ),
298 )
299 })
300 .collect::<HashMap<_, _>>();
301
302 if impl_parent_map.is_empty() {
303 return;
304 }
305
306 for symbol in symbols.iter_mut() {
307 let Some(parent_id) = symbol.parent_symbol_id.as_deref() else {
308 continue;
309 };
310 let Some((type_name, canonical_id)) = impl_parent_map.get(parent_id) else {
311 continue;
312 };
313 symbol.qualified_name = format!("{type_name}::{}", symbol.name);
314 symbol.parent_symbol_id = canonical_id.clone();
315 }
316
317 symbols.retain(|symbol| !is_rust_impl_symbol(symbol));
318}
319
320fn is_rust_impl_symbol(symbol: &Symbol) -> bool {
321 symbol.language == "rust"
322 && symbol.kind == "class"
323 && symbol.signature.as_deref().is_some_and(|signature| {
324 signature.starts_with("impl ") || signature.starts_with("unsafe impl ")
325 })
326}
327
328fn extract_docstring(node: &tree_sitter::Node, source: &[u8], language: &str) -> Option<String> {
329 if !matches!(language, "python" | "javascript" | "typescript") {
330 return None;
331 }
332
333 let mut body = None;
334 let mut walk = node.walk();
335 for child in node.children(&mut walk) {
336 let ty = child.kind();
337 if ty == "block" || ty == "statement_block" {
338 body = Some(child);
339 break;
340 }
341 }
342 let body = body?;
343
344 let mut walk2 = body.walk();
345 for child in body.children(&mut walk2) {
346 let ty = child.kind();
347 if ty == "comment" || ty == "\n" || ty == "newline" {
348 continue;
349 }
350
351 let string_node = if ty == "string" {
352 Some(child)
353 } else if ty == "expression_statement" {
354 let mut w3 = child.walk();
355 child.children(&mut w3).find(|gc| gc.kind() == "string")
356 } else {
357 None
358 };
359
360 let string_node = string_node?;
361
362 let mut w4 = string_node.walk();
364 for sc in string_node.children(&mut w4) {
365 if sc.kind() == "string_content" {
366 let raw = String::from_utf8_lossy(&source[sc.start_byte()..sc.end_byte()]);
367 let trimmed = raw.trim();
368 return if trimmed.is_empty() {
369 None
370 } else {
371 Some(trimmed.to_string())
372 };
373 }
374 }
375
376 let raw =
378 String::from_utf8_lossy(&source[string_node.start_byte()..string_node.end_byte()]);
379 let raw = raw.trim();
380 let stripped = strip_quotes(raw);
381 return if stripped.is_empty() {
382 None
383 } else {
384 Some(stripped.to_string())
385 };
386 }
387
388 None
389}
390
/// Removes one matching pair of surrounding quotes from `s` and trims the remainder.
///
/// Delimiters are tried in the order `"""`, `'''`, `"`, `'`; the first one that both opens
/// and closes `s` without overlapping is removed. When none matches, `s` is returned
/// unchanged (and untrimmed).
fn strip_quotes(s: &str) -> &str {
    const DELIMITERS: [&str; 4] = ["\"\"\"", "'''", "\"", "'"];

    DELIMITERS
        .iter()
        .find_map(|&quote| {
            // Stripping the suffix from what is left after the prefix guarantees the two
            // delimiters do not share any bytes.
            s.strip_prefix(quote)
                .and_then(|rest| rest.strip_suffix(quote))
        })
        .map(str::trim)
        .unwrap_or(s)
}
399
400fn extract_imports(
401 tree: &tree_sitter::Tree,
402 source: &[u8],
403 spec: &languages::LanguageSpec,
404 language: &str,
405 ts_lang: &tree_sitter::Language,
406 rel_path: &str,
407 import_context: &ImportResolutionContext,
408) -> anyhow::Result<ExtractedImports> {
409 if spec.import_query.trim().is_empty() {
410 return Ok(ExtractedImports::default());
411 }
412
413 let query = Query::new(ts_lang, spec.import_query).with_context(|| {
414 format!("failed to compile import query for language `{language}` while parsing {rel_path}")
415 })?;
416
417 let mut cursor = QueryCursor::new();
418 let mut matches = cursor.matches(&query, tree.root_node(), source);
419 let capture_names = query.capture_names();
420 let import_capture = capture_names.iter().position(|name| *name == "import");
421 let mut extracted = ExtractedImports::default();
422
423 while let Some(m) = matches.next() {
424 for cap in m.captures {
425 if import_capture == Some(cap.index as usize) {
426 let text =
427 String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
428 .trim()
429 .to_string();
430 import_resolution::parse_import_statement(
431 language,
432 &text,
433 rel_path,
434 import_context,
435 &mut extracted,
436 )?;
437 }
438 }
439 }
440
441 import_resolution::seed_import_bindings(language, import_context, &mut extracted.bindings);
442 Ok(extracted)
443}
444
445#[cfg(test)]
446mod tests;