1use std::collections::HashSet;
5use std::path::Path;
6
7use anyhow::Context as _;
8use streaming_iterator::StreamingIterator;
9use tree_sitter::{Parser, Query, QueryCursor};
10
11mod calls;
12
13use crate::index::hasher::symbol_content_hash;
14use crate::index::import_resolution::{self, ExtractedImports};
15use crate::index::languages;
16use crate::index::security;
17use crate::index::semantic::SemanticCallResolver;
18use crate::models::{ParseResult, Symbol};
19use calls::{CallExtractionContext, extract_calls};
20
21pub use crate::index::import_resolution::{
22 ImportResolutionContext, build_import_resolution_context,
23};
24
25#[cfg(test)]
26use calls::{call_qualifier_path, line_terminator_len, split_qualified_callee};
27
/// Upper bound, in bytes, on the size of a file the parser will read (10 MiB).
///
/// `parse_file_with_semantic` skips any file whose metadata reports a length
/// above this value.
const MAX_FILE_SIZE: u64 = 10 << 20;
30
31pub(crate) fn parse_file_with_semantic(
32 file_path: &Path,
33 project_id: &str,
34 root_path: &Path,
35 exclude_patterns: &[String],
36 import_context: &ImportResolutionContext,
37 semantic_resolver: Option<&mut (dyn SemanticCallResolver + '_)>,
38) -> anyhow::Result<Option<ParseResult>> {
39 if !security::validate_path(file_path, root_path) {
41 return Ok(None);
42 }
43 if !security::is_symlink_safe(file_path, root_path) {
44 return Ok(None);
45 }
46 if security::should_exclude_path(root_path, file_path, exclude_patterns) {
47 return Ok(None);
48 }
49 if security::has_secret_extension(file_path) {
50 return Ok(None);
51 }
52
53 let Ok(meta) = file_path.metadata() else {
54 return Ok(None);
55 };
56 if meta.len() == 0 || meta.len() > MAX_FILE_SIZE {
57 return Ok(None);
58 }
59
60 if security::is_binary(file_path) {
61 return Ok(None);
62 }
63
64 let file_str = file_path.to_string_lossy();
65 let Some(language) = languages::detect_language(&file_str) else {
66 return Ok(None);
67 };
68 let Some(spec) = languages::get_spec(language) else {
69 return Ok(None);
70 };
71 let Some(ts_lang) = languages::get_ts_language(language) else {
72 return Ok(None);
73 };
74
75 let Ok(source) = std::fs::read(file_path) else {
76 return Ok(None);
77 };
78
79 let mut parser = Parser::new();
80 if parser.set_language(&ts_lang).is_err() {
81 return Ok(None);
82 }
83 let Some(tree) = parser.parse(&source, None) else {
84 return Ok(None);
85 };
86
87 let rel_path = file_path
88 .canonicalize()
89 .ok()
90 .and_then(|abs| {
91 root_path.canonicalize().ok().and_then(|root| {
92 abs.strip_prefix(&root)
93 .ok()
94 .map(|p| p.to_string_lossy().to_string())
95 })
96 })
97 .unwrap_or_else(|| file_str.to_string());
98
99 let mut symbols = extract_symbols(
100 &tree, &source, spec, language, &ts_lang, project_id, &rel_path,
101 )?;
102 link_parents(&mut symbols);
103 let extracted_imports = extract_imports(
104 &tree,
105 &source,
106 spec,
107 language,
108 &ts_lang,
109 &rel_path,
110 import_context,
111 )?;
112 let calls = extract_calls(
113 &tree,
114 &source,
115 spec,
116 CallExtractionContext {
117 language,
118 ts_lang: &ts_lang,
119 rel_path: &rel_path,
120 symbols: &symbols,
121 import_context,
122 import_bindings: &extracted_imports.bindings,
123 file_path,
124 root_path,
125 },
126 semantic_resolver,
127 )?;
128
129 Ok(Some(ParseResult {
130 symbols,
131 imports: extracted_imports.imports,
132 calls,
133 source,
134 }))
135}
136
137fn extract_symbols(
138 tree: &tree_sitter::Tree,
139 source: &[u8],
140 spec: &languages::LanguageSpec,
141 language: &str,
142 ts_lang: &tree_sitter::Language,
143 project_id: &str,
144 rel_path: &str,
145) -> anyhow::Result<Vec<Symbol>> {
146 if spec.symbol_query.trim().is_empty() {
147 return Ok(Vec::new());
148 }
149
150 let query = Query::new(ts_lang, spec.symbol_query).with_context(|| {
151 format!("failed to compile symbol query for language `{language}` while parsing {rel_path}")
152 })?;
153
154 let mut cursor = QueryCursor::new();
155 let mut matches = cursor.matches(&query, tree.root_node(), source);
156
157 let mut symbols = Vec::new();
158 let mut seen_ids = HashSet::new();
159 let capture_names = query.capture_names();
160 let name_capture = capture_names.iter().position(|name| *name == "name");
161 let definition_kinds = capture_names
162 .iter()
163 .map(|name| name.strip_prefix("definition."))
164 .collect::<Vec<_>>();
165
166 while let Some(m) = matches.next() {
167 let mut name_text: Option<String> = None;
168 let mut def_node = None;
169 let mut kind = String::from("function");
170
171 for cap in m.captures {
172 let capture_index = cap.index as usize;
173 if name_capture == Some(capture_index) {
174 name_text = Some(
175 String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
176 .to_string(),
177 );
178 } else if let Some(Some(k)) = definition_kinds.get(capture_index) {
179 def_node = Some(cap.node);
180 kind = (*k).to_string();
181 }
182 }
183
184 let (name, node) = match (name_text, def_node) {
185 (Some(n), Some(d)) => (n, d),
186 _ => continue,
187 };
188
189 let sig_end = source[node.start_byte()..]
191 .iter()
192 .position(|&b| b == b'\n')
193 .map(|p| node.start_byte() + p)
194 .unwrap_or(node.end_byte());
195 let mut signature = String::from_utf8_lossy(&source[node.start_byte()..sig_end])
196 .trim()
197 .to_string();
198 if signature.len() > 200 {
199 signature.truncate(200);
200 signature.push_str("...");
201 }
202
203 let docstring = extract_docstring(&node, source, language);
204 let c_hash =
205 symbol_content_hash(source, node.start_byte(), node.end_byte()).unwrap_or_default();
206 let symbol_id = Symbol::make_id(project_id, rel_path, &name, &kind, node.start_byte());
207
208 if seen_ids.contains(&symbol_id) {
209 continue;
210 }
211 seen_ids.insert(symbol_id.clone());
212
213 symbols.push(Symbol {
214 id: symbol_id,
215 project_id: project_id.to_string(),
216 file_path: rel_path.to_string(),
217 name: name.clone(),
218 qualified_name: name,
219 kind,
220 language: language.to_string(),
221 byte_start: node.start_byte(),
222 byte_end: node.end_byte(),
223 line_start: node.start_position().row + 1,
224 line_end: node.end_position().row + 1,
225 signature: Some(signature),
226 docstring,
227 parent_symbol_id: None,
228 content_hash: c_hash,
229 summary: None,
230 created_at: String::new(),
231 updated_at: String::new(),
232 });
233 }
234
235 Ok(symbols)
236}
237
238fn link_parents(symbols: &mut [Symbol]) {
239 let mut indices: Vec<usize> = (0..symbols.len()).collect();
240 indices.sort_by_key(|&i| symbols[i].byte_start);
241
242 for idx in 0..indices.len() {
243 let i = indices[idx];
244 for jdx in (0..idx).rev() {
245 let j = indices[jdx];
246 let parent_kind = symbols[j].kind.as_str();
247 if (parent_kind == "class" || parent_kind == "type")
248 && symbols[j].byte_start <= symbols[i].byte_start
249 && symbols[j].byte_end >= symbols[i].byte_end
250 {
251 let parent_name = symbols[j].name.clone();
252 let parent_id = symbols[j].id.clone();
253 let sym = &mut symbols[i];
254 sym.parent_symbol_id = Some(parent_id);
255 sym.qualified_name = format!("{}.{}", parent_name, sym.name);
256 if sym.kind == "function" {
257 sym.kind = "method".to_string();
258 }
259 break;
260 }
261 }
262 }
263}
264
265fn extract_docstring(node: &tree_sitter::Node, source: &[u8], language: &str) -> Option<String> {
266 if !matches!(language, "python" | "javascript" | "typescript") {
267 return None;
268 }
269
270 let mut body = None;
271 let mut walk = node.walk();
272 for child in node.children(&mut walk) {
273 let ty = child.kind();
274 if ty == "block" || ty == "statement_block" {
275 body = Some(child);
276 break;
277 }
278 }
279 let body = body?;
280
281 let mut walk2 = body.walk();
282 for child in body.children(&mut walk2) {
283 let ty = child.kind();
284 if ty == "comment" || ty == "\n" || ty == "newline" {
285 continue;
286 }
287
288 let string_node = if ty == "string" {
289 Some(child)
290 } else if ty == "expression_statement" {
291 let mut w3 = child.walk();
292 child.children(&mut w3).find(|gc| gc.kind() == "string")
293 } else {
294 None
295 };
296
297 let string_node = string_node?;
298
299 let mut w4 = string_node.walk();
301 for sc in string_node.children(&mut w4) {
302 if sc.kind() == "string_content" {
303 let raw = String::from_utf8_lossy(&source[sc.start_byte()..sc.end_byte()]);
304 let trimmed = raw.trim();
305 return if trimmed.is_empty() {
306 None
307 } else {
308 Some(trimmed.to_string())
309 };
310 }
311 }
312
313 let raw =
315 String::from_utf8_lossy(&source[string_node.start_byte()..string_node.end_byte()]);
316 let raw = raw.trim();
317 let stripped = strip_quotes(raw);
318 return if stripped.is_empty() {
319 None
320 } else {
321 Some(stripped.to_string())
322 };
323 }
324
325 None
326}
327
/// Removes one matching pair of surrounding quote delimiters from `s`.
///
/// Delimiters are tried in the order `"""`, `'''`, `"`, `'`. The first one that
/// appears at both ends without overlapping wins, and the text between the two
/// delimiters is returned trimmed. If none applies, `s` is returned unchanged.
fn strip_quotes(s: &str) -> &str {
    const DELIMITERS: [&str; 4] = ["\"\"\"", "'''", "\"", "'"];

    DELIMITERS
        .iter()
        .find_map(|delim| {
            // Stripping the prefix first guarantees the suffix match cannot
            // reuse the same characters as the opening delimiter.
            s.strip_prefix(*delim)?.strip_suffix(*delim).map(str::trim)
        })
        .unwrap_or(s)
}
336
337fn extract_imports(
338 tree: &tree_sitter::Tree,
339 source: &[u8],
340 spec: &languages::LanguageSpec,
341 language: &str,
342 ts_lang: &tree_sitter::Language,
343 rel_path: &str,
344 import_context: &ImportResolutionContext,
345) -> anyhow::Result<ExtractedImports> {
346 if spec.import_query.trim().is_empty() {
347 return Ok(ExtractedImports::default());
348 }
349
350 let query = Query::new(ts_lang, spec.import_query).with_context(|| {
351 format!("failed to compile import query for language `{language}` while parsing {rel_path}")
352 })?;
353
354 let mut cursor = QueryCursor::new();
355 let mut matches = cursor.matches(&query, tree.root_node(), source);
356 let capture_names = query.capture_names();
357 let import_capture = capture_names.iter().position(|name| *name == "import");
358 let mut extracted = ExtractedImports::default();
359
360 while let Some(m) = matches.next() {
361 for cap in m.captures {
362 if import_capture == Some(cap.index as usize) {
363 let text =
364 String::from_utf8_lossy(&source[cap.node.start_byte()..cap.node.end_byte()])
365 .trim()
366 .to_string();
367 import_resolution::parse_import_statement(
368 language,
369 &text,
370 rel_path,
371 import_context,
372 &mut extracted,
373 );
374 }
375 }
376 }
377
378 import_resolution::seed_import_bindings(language, import_context, &mut extracted.bindings);
379 Ok(extracted)
380}
381
382#[cfg(test)]
383mod tests;