1use anyhow::{Context, Result};
2use matryoshka_core_ir::{
3 FileFact, ImportFact, MatryoshkaProgressEvent, SnippetFact, SymbolFact, SymbolKind,
4};
5use sha2::{Digest, Sha256};
6use std::fs;
7use std::path::{Path, PathBuf};
8use tree_sitter::{Node, Parser as TreeSitterParser};
9use walkdir::WalkDir;
10
/// Everything extracted from one repository scan.
#[derive(Debug, Clone)]
pub struct ParsedRepository {
    /// Root directory that was passed to the parser.
    pub repo_root: PathBuf,
    /// One fact per parsed source file; `SourceParser::parse_repo_with_progress`
    /// returns them sorted by `path`.
    pub files: Vec<FileFact>,
    /// Symbols found across all files; `SourceParser::parse_repo_with_progress`
    /// returns them sorted by `symbol_id`.
    pub symbols: Vec<SymbolFact>,
}
17
/// Settings that decide which files are parsed and how much is kept per file.
#[derive(Debug, Clone)]
pub struct ParserConfig {
    /// File extensions (without the leading dot) that are parsed; compared
    /// exactly, so the match is case-sensitive.
    pub include_extensions: Vec<String>,
    /// Entry names that are skipped wherever they occur in the tree; compared
    /// against the final component of each path.
    pub ignored_dirs: Vec<String>,
    /// Extra ignore rules relative to the repository root. A rule containing
    /// `/` ignores that path and everything below it; a rule without `/`
    /// ignores any path that has a component with that name.
    pub ignored_paths: Vec<String>,
    /// Upper bound on the number of snippets recorded per file.
    pub max_snippets_per_file: usize,
}
25
26impl Default for ParserConfig {
27 fn default() -> Self {
28 Self {
29 include_extensions: vec!["py".into(), "ts".into(), "tsx".into(), "rs".into()],
30 ignored_dirs: vec![
31 ".git".into(),
32 ".venv".into(),
33 "venv".into(),
34 "node_modules".into(),
35 "dist".into(),
36 "build".into(),
37 "__pycache__".into(),
38 ".pytest_cache".into(),
39 "target".into(),
40 ],
41 ignored_paths: Vec::new(),
42 max_snippets_per_file: 6,
43 }
44 }
45}
46
47impl ParserConfig {
48 pub fn with_ignored_paths(mut self, ignored_paths: impl IntoIterator<Item = String>) -> Self {
49 self.ignored_paths.extend(
50 ignored_paths
51 .into_iter()
52 .map(|path| normalize_ignored_path(&path))
53 .filter(|path| !path.is_empty()),
54 );
55 self
56 }
57
58 pub fn ignores_entry(&self, repo_root: &Path, path: &Path) -> bool {
59 let Some(name) = path.file_name().and_then(|name| name.to_str()) else {
60 return false;
61 };
62 if self.ignored_dirs.iter().any(|ignored| ignored == name) {
63 return true;
64 }
65 let relative = relative_path(repo_root, path);
66 self.ignored_paths
67 .iter()
68 .any(|ignored| path_matches_ignore(&relative, ignored))
69 }
70}
71
/// Walks a repository and extracts file, import, symbol and snippet facts
/// according to its [`ParserConfig`].
pub struct SourceParser {
    /// Settings consulted while discovering and parsing files.
    config: ParserConfig,
}
75
76impl SourceParser {
77 pub fn new(config: ParserConfig) -> Self {
78 Self { config }
79 }
80
81 pub fn parse_repo(&self, repo_root: impl AsRef<Path>) -> Result<ParsedRepository> {
82 self.parse_repo_with_progress(repo_root, |_| {})
83 }
84
85 pub fn parse_repo_with_progress(
86 &self,
87 repo_root: impl AsRef<Path>,
88 mut progress: impl FnMut(MatryoshkaProgressEvent),
89 ) -> Result<ParsedRepository> {
90 let repo_root = repo_root.as_ref().to_path_buf();
91 progress(MatryoshkaProgressEvent::DiscoveringFiles);
92 let candidate_paths = self.discover_paths(&repo_root)?;
93 let total_files = candidate_paths.len();
94 progress(MatryoshkaProgressEvent::FilesDiscovered { total_files });
95 let mut files = Vec::new();
96 let mut symbols = Vec::new();
97
98 for (index, path) in candidate_paths.iter().enumerate() {
99 let relative = relative_path(&repo_root, path);
100 progress(MatryoshkaProgressEvent::ParsingFile {
101 path: relative.clone(),
102 index: index + 1,
103 total_files,
104 });
105 let (file, mut file_symbols) = self.parse_file(&repo_root, path)?;
106 progress(MatryoshkaProgressEvent::ParsedFile {
107 path: relative,
108 index: index + 1,
109 total_files,
110 });
111 files.push(file);
112 symbols.append(&mut file_symbols);
113 }
114
115 files.sort_by(|left, right| left.path.cmp(&right.path));
116 symbols.sort_by(|left, right| left.symbol_id.cmp(&right.symbol_id));
117
118 Ok(ParsedRepository {
119 repo_root,
120 files,
121 symbols,
122 })
123 }
124
125 fn discover_paths(&self, repo_root: &Path) -> Result<Vec<PathBuf>> {
126 let mut paths = Vec::new();
127 for entry in WalkDir::new(repo_root)
128 .into_iter()
129 .filter_entry(|entry| !self.config.ignores_entry(repo_root, entry.path()))
130 {
131 let entry = entry?;
132 if !entry.file_type().is_file() {
133 continue;
134 }
135 let path = entry.into_path();
136 if !self.config.ignores_entry(repo_root, &path) && self.should_parse(&path) {
137 paths.push(path);
138 }
139 }
140 paths.sort();
141 Ok(paths)
142 }
143
144 fn should_parse(&self, path: &Path) -> bool {
145 path.extension()
146 .and_then(|ext| ext.to_str())
147 .map(|ext| {
148 self.config
149 .include_extensions
150 .iter()
151 .any(|allowed| allowed == ext)
152 })
153 .unwrap_or(false)
154 }
155
156 fn parse_file(&self, repo_root: &Path, path: &Path) -> Result<(FileFact, Vec<SymbolFact>)> {
157 let source = fs::read_to_string(path)
158 .with_context(|| format!("failed to read source file {}", path.display()))?;
159 let relative = path
160 .strip_prefix(repo_root)
161 .unwrap_or(path)
162 .to_string_lossy()
163 .replace('\\', "/");
164 let language = language_for(path);
165 let source_hash = hash_text(&source);
166 let lines: Vec<&str> = source.lines().collect();
167 let parent_folder_id = parent_folder_id(&relative);
168 let imports = parse_imports(&relative, &language, &lines);
169 let symbols = parse_symbols(&relative, &language, &source, &lines);
170 let snippets = select_snippets(
171 &relative,
172 &source,
173 &symbols,
174 self.config.max_snippets_per_file,
175 );
176
177 let file = FileFact {
178 file_id: relative.clone(),
179 path: relative.clone(),
180 name: Path::new(&relative)
181 .file_name()
182 .and_then(|name| name.to_str())
183 .unwrap_or(&relative)
184 .to_string(),
185 language,
186 parent_folder_id,
187 source_hash,
188 line_count: lines.len(),
189 imports,
190 snippets,
191 };
192
193 Ok((file, symbols))
194 }
195}
196
/// Normalises a user-supplied ignore rule to `/`-separated components.
///
/// Surrounding whitespace and slashes are removed, backslashes become `/`,
/// and empty or `.` components are dropped. The result may be empty.
fn normalize_ignored_path(path: &str) -> String {
    let unified = path.trim().trim_matches('/').replace('\\', "/");
    let mut normalized = String::with_capacity(unified.len());
    for segment in unified.split('/') {
        if segment.is_empty() || segment == "." {
            continue;
        }
        if !normalized.is_empty() {
            normalized.push('/');
        }
        normalized.push_str(segment);
    }
    normalized
}
206
/// Tests a repository-relative path against one normalised ignore rule.
///
/// A rule containing `/` matches that exact path and anything beneath it; a
/// rule without `/` matches when any component of the path equals it. Empty
/// paths and empty rules never match.
fn path_matches_ignore(relative_path: &str, ignored_path: &str) -> bool {
    if relative_path.is_empty() || ignored_path.is_empty() {
        return false;
    }
    if !ignored_path.contains('/') {
        return relative_path
            .split('/')
            .any(|component| component == ignored_path);
    }
    match relative_path.strip_prefix(ignored_path) {
        // Either the rule is the whole path or it ends on a component boundary.
        Some(remainder) => remainder.is_empty() || remainder.starts_with('/'),
        None => false,
    }
}
219
/// Renders `path` relative to `repo_root` using `/` separators.
///
/// When `path` does not lie under `repo_root` it is rendered unchanged.
fn relative_path(repo_root: &Path, path: &Path) -> String {
    let within_repo = match path.strip_prefix(repo_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    within_repo.to_string_lossy().replace('\\', "/")
}
226
227pub fn hash_text(text: &str) -> String {
228 let mut hasher = Sha256::new();
229 hasher.update(text.as_bytes());
230 format!("{:x}", hasher.finalize())
231}
232
/// Maps a path's extension to a language name.
///
/// `py`, `ts`/`tsx` and `rs` map to `python`, `typescript` and `rust`. Any
/// other extension is returned as-is, and a missing extension gives an empty
/// string.
fn language_for(path: &Path) -> String {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("");
    let language = if extension == "py" {
        "python"
    } else if matches!(extension, "ts" | "tsx") {
        "typescript"
    } else if extension == "rs" {
        "rust"
    } else {
        extension
    };
    language.to_owned()
}
246
/// Returns the id of the folder containing `path`, with `/` separators.
///
/// Paths with no parent folder (top-level files, the empty path) map to the
/// fixed id `"repo"`.
fn parent_folder_id(path: &str) -> String {
    let folder = match Path::new(path).parent().and_then(|parent| parent.to_str()) {
        Some(parent) if !parent.is_empty() => parent,
        _ => "repo",
    };
    folder.replace('\\', "/")
}
255
256fn parse_imports(file_id: &str, language: &str, lines: &[&str]) -> Vec<ImportFact> {
257 let mut imports = Vec::new();
258 for (index, line) in lines.iter().enumerate() {
259 let trimmed = line.trim();
260 let parsed = match language {
261 "python" => parse_python_import(trimmed),
262 "typescript" => parse_typescript_import(trimmed),
263 "rust" => parse_rust_import(trimmed),
264 _ => None,
265 };
266 if let Some((module, names)) = parsed {
267 imports.push(ImportFact {
268 module,
269 names,
270 line: index + 1,
271 resolved_file_id: None,
272 is_internal: false,
273 });
274 }
275 }
276 imports.sort_by(|left, right| (left.line, &left.module).cmp(&(right.line, &right.module)));
277 imports.dedup_by(|left, right| left.module == right.module && left.line == right.line);
278 imports.iter_mut().for_each(|import| {
279 import.is_internal = looks_internal(&import.module, file_id);
280 });
281 imports
282}
283
/// Parses one trimmed Python source line as an import statement.
///
/// Returns `(module, names)`:
/// * `from pkg import a, b as c` gives `("pkg", ["a", "b"])`; aliases are dropped.
/// * `import pkg.mod as m` gives `("pkg.mod", [])`; only the first module of a
///   comma-separated `import` is reported.
///
/// A trailing `# comment` is ignored, and the parentheses of a parenthesised
/// name list are not treated as part of the names. Relative modules keep
/// their leading dots. Returns `None` for any other line.
///
/// Only names on the same line are seen; the continuation lines of a
/// multi-line import are not joined.
fn parse_python_import(line: &str) -> Option<(String, Vec<String>)> {
    // Import statements contain no string literals, so the first `#` starts a
    // comment.
    let statement = line.split('#').next().unwrap_or(line).trim_end();

    if let Some(rest) = statement.strip_prefix("from ") {
        let (module, imported) = rest.split_once(" import ").unwrap_or((rest, ""));
        let module = module.trim();
        let names: Vec<String> = imported
            .split(',')
            .filter_map(|entry| {
                entry
                    .trim_matches(|ch: char| ch.is_whitespace() || ch == '(' || ch == ')')
                    // `name as alias` -> keep the imported name only.
                    .split_whitespace()
                    .next()
                    .map(str::to_string)
            })
            .collect();
        return (!module.is_empty()).then(|| (module.to_string(), names));
    }

    if let Some(rest) = statement.strip_prefix("import ") {
        let module = rest
            .split(',')
            .next()?
            .split_whitespace()
            .next()
            .unwrap_or_default();
        return (!module.is_empty()).then(|| (module.to_string(), Vec::new()));
    }

    None
}
316
#[cfg(test)]
mod tests {
    use super::{
        ParserConfig, SourceParser, parse_python_import, parse_rust_import, parse_rust_symbols,
    };
    use matryoshka_core_ir::SymbolKind;
    use std::fs;

    /// Relative `from` imports keep their leading dots in the module name.
    #[test]
    fn python_relative_imports_preserve_leading_dots() {
        let parsed = parse_python_import("from ..graph import RepositoryGraph").unwrap();
        assert_eq!(parsed.0, "..graph");
        assert_eq!(parsed.1, vec!["RepositoryGraph"]);
    }

    /// A grouped `use` is split into the module path and the listed names.
    #[test]
    fn rust_grouped_imports_extract_module_and_names() {
        let parsed = parse_rust_import("use matryoshka_core_ir::{FileFact, SymbolFact};").unwrap();
        assert_eq!(parsed.0, "matryoshka_core_ir");
        assert_eq!(parsed.1, vec!["FileFact", "SymbolFact"]);
    }

    /// The line-based Rust fallback qualifies functions inside an `impl` block
    /// with the implementing type and reports them as methods.
    #[test]
    fn rust_impl_methods_are_qualified_as_methods() {
        let lines = vec![
            "pub struct MatryoshkaStore {",
            " db_path: PathBuf,",
            "}",
            "",
            "impl MatryoshkaStore {",
            " pub fn open(db_path: impl AsRef<Path>) -> Result<Self> {",
            " Self { db_path: db_path.as_ref().to_path_buf() }",
            " }",
            "}",
        ];
        let symbols = parse_rust_symbols("store.rs", &lines);
        assert!(symbols.iter().any(|symbol| {
            symbol.qualified_name == "MatryoshkaStore::open" && symbol.kind == SymbolKind::Method
        }));
        assert!(symbols.iter().any(|symbol| {
            symbol.qualified_name == "MatryoshkaStore" && symbol.kind == SymbolKind::Struct
        }));
    }

    /// Python methods are qualified with their class and span the whole body.
    #[test]
    fn tree_sitter_parser_extracts_python_methods() {
        let temp = tempfile::tempdir().unwrap();
        // Indentation is significant in Python: the method body must sit
        // deeper than its `def`, otherwise the fixture is not a valid class
        // and the method cannot span lines 2-3.
        fs::write(
            temp.path().join("service.py"),
            "class TokenService:\n    def refresh(self):\n        return True\n",
        )
        .unwrap();
        let parser = SourceParser::new(ParserConfig::default());
        let parsed = parser.parse_repo(temp.path()).unwrap();
        assert!(parsed.symbols.iter().any(|symbol| {
            symbol.qualified_name == "TokenService::refresh"
                && symbol.kind == SymbolKind::Method
                && symbol.start_line == 2
                && symbol.end_line == 3
        }));
    }

    /// TypeScript class methods are qualified with their class name.
    #[test]
    fn tree_sitter_parser_extracts_typescript_class_methods() {
        let temp = tempfile::tempdir().unwrap();
        fs::write(
            temp.path().join("client.ts"),
            "export class ApiClient {\n async fetchToken(): Promise<string> {\n return 'token';\n }\n}\n",
        )
        .unwrap();
        let parser = SourceParser::new(ParserConfig::default());
        let parsed = parser.parse_repo(temp.path()).unwrap();
        assert!(parsed.symbols.iter().any(|symbol| {
            symbol.qualified_name == "ApiClient::fetchToken"
                && symbol.kind == SymbolKind::Method
                && symbol.start_line == 2
                && symbol.end_line == 4
        }));
    }

    /// A single-component rule ignores matching directories anywhere, and a
    /// multi-component rule ignores that subtree.
    #[test]
    fn parser_config_ignores_path_components_and_subtrees() {
        let temp = tempfile::tempdir().unwrap();
        fs::create_dir_all(temp.path().join("src")).unwrap();
        fs::create_dir_all(temp.path().join("tests")).unwrap();
        fs::create_dir_all(temp.path().join("packages/web")).unwrap();
        fs::write(temp.path().join("src/lib.rs"), "pub fn keep() {}\n").unwrap();
        fs::write(
            temp.path().join("tests/test_api.py"),
            "def drop_me(): pass\n",
        )
        .unwrap();
        fs::write(
            temp.path().join("packages/web/app.ts"),
            "export function app() {}\n",
        )
        .unwrap();

        let parser = SourceParser::new(
            ParserConfig::default()
                .with_ignored_paths(["tests".to_string(), "packages/web".to_string()]),
        );
        let parsed = parser.parse_repo(temp.path()).unwrap();
        let paths = parsed
            .files
            .iter()
            .map(|file| file.path.as_str())
            .collect::<Vec<_>>();

        assert_eq!(paths, vec!["src/lib.rs"]);
    }
}
429
/// Parses one trimmed TypeScript line as an import or re-export.
///
/// Returns `(module, names)` where `module` is the first quoted string on the
/// line and `names` are the identifiers inside the first `{ … }` group, with
/// aliases (`a as b`) reduced to the original name.
///
/// Lines starting with `import ` are accepted whenever they contain a quoted
/// string. Lines starting with `export ` are accepted only as re-exports
/// (`export … from "module"`); an ordinary export that merely contains a
/// string literal, such as `export const NAME = "x";`, is not an import.
/// Returns `None` for every other line.
fn parse_typescript_import(line: &str) -> Option<(String, Vec<String>)> {
    const QUOTES: [char; 2] = ['"', '\''];

    if line.starts_with("export ") {
        // A re-export names its module right after `from`, and nothing before
        // `from` is quoted.
        let (head, specifier) = line.split_once(" from ")?;
        if head.contains(QUOTES) || !specifier.trim_start().starts_with(QUOTES) {
            return None;
        }
    } else if !line.starts_with("import ") {
        return None;
    }

    let quote = if line.contains('"') { '"' } else { '\'' };
    // The text between the first pair of quotes is the module specifier.
    let module = line.split(quote).nth(1)?.to_string();

    let names = line
        .split('{')
        .nth(1)
        .and_then(|rest| rest.split('}').next())
        .map(|inside| {
            inside
                .split(',')
                .filter_map(|entry| entry.split_whitespace().next())
                .map(str::to_string)
                .collect()
        })
        .unwrap_or_default();

    Some((module, names))
}
460
/// Parses one trimmed Rust line as a `use` declaration.
///
/// Returns `(module, names)` with `::` replaced by `.` in the module:
/// * `use a::b::{C, D as E};` gives `("a.b", ["C", "D"])`.
/// * `use a::b::C as X;` gives `("a.b.C", [])`.
///
/// A leading `pub` or `pub(…)` is accepted, so re-exports count as imports.
/// Aliases are dropped, matching the Python and TypeScript parsers, and
/// anything after the terminating `;` is ignored. Returns `None` for lines
/// that are not `use` declarations.
///
/// Nested groups (`use a::{b::{c, d}}`) are split naively at commas and are
/// not expanded.
fn parse_rust_import(line: &str) -> Option<(String, Vec<String>)> {
    // Skip an optional visibility qualifier. `pub` only counts when followed
    // by whitespace or a `(…)` restriction, so identifiers such as `publish`
    // are left alone.
    let statement = match line.strip_prefix("pub") {
        Some(after_pub) => match after_pub.strip_prefix('(') {
            Some(scoped) => scoped
                .split_once(')')
                .map_or(line, |(_, tail)| tail.trim_start()),
            None if after_pub.starts_with(char::is_whitespace) => after_pub.trim_start(),
            None => line,
        },
        None => line,
    };

    let rest = statement.strip_prefix("use ")?;
    let rest = rest.split(';').next().unwrap_or(rest).trim();

    if let Some((module, group)) = rest.split_once("::{") {
        let names = group
            .trim_end_matches('}')
            .split(',')
            // `Name as Alias` -> keep the imported name only.
            .filter_map(|entry| entry.split_whitespace().next())
            .map(ToString::to_string)
            .collect::<Vec<_>>();
        let module = module.trim().replace("::", ".");
        return (!module.is_empty()).then_some((module, names));
    }

    let path = rest
        .split_once(" as ")
        .map_or(rest, |(path, _alias)| path)
        .trim();
    let module = path.replace("::", ".");
    (!module.is_empty()).then_some((module, Vec::new()))
}
478
/// Heuristically decides whether `module` refers to code inside the repository.
///
/// A module looks internal when it
/// * is relative (starts with `.`, which covers `./x`, `../x` and Python's
///   `..pkg`),
/// * is rooted at `crate`, `self` or `super` (including the bare keyword that
///   a grouped import such as `use super::{A, B}` produces), or
/// * is rooted at the first path component of `file_id`, e.g. `app.models`
///   or `app/models` imported from `app/views.py`.
///
/// Root matches must end on a `.` or `/` boundary, so a file under `a/` does
/// not make `argparse` internal. An empty root never matches.
fn looks_internal(module: &str, file_id: &str) -> bool {
    /// True when `module` equals `prefix` or continues it after a separator.
    fn is_rooted_at(module: &str, prefix: &str, separators: &[char]) -> bool {
        if prefix.is_empty() {
            return false;
        }
        match module.strip_prefix(prefix) {
            Some(rest) => rest.is_empty() || rest.starts_with(separators),
            None => false,
        }
    }

    if module.starts_with('.') {
        return true;
    }
    if ["crate", "self", "super"]
        .iter()
        .any(|keyword| is_rooted_at(module, keyword, &['.']))
    {
        return true;
    }
    file_id
        .split('/')
        .next()
        .is_some_and(|root| is_rooted_at(module, root, &['.', '/']))
}
491
492fn parse_symbols(file_id: &str, language: &str, source: &str, lines: &[&str]) -> Vec<SymbolFact> {
493 if let Some(symbols) = parse_tree_sitter_symbols(file_id, language, source) {
494 return symbols;
495 }
496 if language == "rust" {
497 return parse_rust_symbols(file_id, lines);
498 }
499 let mut symbols = Vec::new();
500 for (index, line) in lines.iter().enumerate() {
501 let trimmed = line.trim_start();
502 let parsed = match language {
503 "python" => parse_python_symbol(trimmed),
504 "typescript" => parse_typescript_symbol(trimmed),
505 "rust" => None,
506 _ => None,
507 };
508 if let Some((kind, name, signature)) = parsed {
509 let start_line = index + 1;
510 let end_line = find_block_end(lines, index);
511 let symbol_id = format!("{file_id}::{name}:{start_line}");
512 symbols.push(SymbolFact {
513 symbol_id,
514 file_id: file_id.to_string(),
515 path: file_id.to_string(),
516 name: name.clone(),
517 qualified_name: name,
518 kind,
519 signature,
520 start_line,
521 end_line,
522 });
523 }
524 }
525 symbols
526}
527
528fn parse_tree_sitter_symbols(
529 file_id: &str,
530 language: &str,
531 source: &str,
532) -> Option<Vec<SymbolFact>> {
533 let mut parser = TreeSitterParser::new();
534 let tree_sitter_language = match language {
535 "rust" => tree_sitter_rust::LANGUAGE.into(),
536 "python" => tree_sitter_python::LANGUAGE.into(),
537 "typescript" if file_id.ends_with(".tsx") => tree_sitter_typescript::LANGUAGE_TSX.into(),
538 "typescript" => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
539 _ => return None,
540 };
541 parser.set_language(&tree_sitter_language).ok()?;
542 let tree = parser.parse(source, None)?;
543 let mut symbols = Vec::new();
544 visit_tree_sitter_symbols(
545 file_id,
546 language,
547 source,
548 tree.root_node(),
549 None,
550 &mut symbols,
551 );
552 (!symbols.is_empty()).then_some(symbols)
553}
554
/// Walks the syntax tree depth-first and appends a `SymbolFact` to `symbols`
/// for every node that `tree_sitter_symbol_kind_and_name` recognises.
///
/// `owner` is the name of the enclosing symbol, if any. It prefixes the
/// qualified name of methods (`Owner::name`) and is replaced for the children
/// of each recognised symbol and of each Rust `impl` block.
fn visit_tree_sitter_symbols(
    file_id: &str,
    language: &str,
    source: &str,
    node: Node<'_>,
    owner: Option<String>,
    symbols: &mut Vec<SymbolFact>,
) {
    // Unless this node changes it, children inherit the current owner.
    let mut next_owner = owner.clone();

    if let Some((kind, name, owner_for_children)) =
        tree_sitter_symbol_kind_and_name(language, source, node, owner.as_deref())
    {
        // Rows are shifted by one to give 1-based line numbers.
        let start_line = node.start_position().row + 1;
        let end_line = node.end_position().row + 1;
        // Only methods are qualified with their owner; every other kind keeps
        // its bare name even when nested.
        let qualified_name = owner
            .as_ref()
            .filter(|_| kind == SymbolKind::Method)
            .map(|owner| format!("{owner}::{name}"))
            .unwrap_or_else(|| name.clone());
        let symbol_id = format!("{file_id}::{qualified_name}:{start_line}");
        symbols.push(SymbolFact {
            symbol_id,
            file_id: file_id.to_string(),
            path: file_id.to_string(),
            name: name.clone(),
            qualified_name,
            kind,
            signature: tree_sitter_signature(source, node),
            start_line,
            end_line,
        });
        // A recognised symbol always becomes the owner of what it contains,
        // either under the name it supplied or under its own name.
        next_owner = owner_for_children.or(Some(name));
    } else if language == "rust" && node.kind() == "impl_item" {
        // An `impl` block is not a symbol itself, but functions inside it
        // belong to the implementing type.
        next_owner = rust_impl_target(source, node).or(owner);
    }

    let mut cursor = node.walk();
    for child in node.named_children(&mut cursor) {
        visit_tree_sitter_symbols(
            file_id,
            language,
            source,
            child,
            next_owner.clone(),
            symbols,
        );
    }
}
604
605fn tree_sitter_symbol_kind_and_name(
606 language: &str,
607 source: &str,
608 node: Node<'_>,
609 owner: Option<&str>,
610) -> Option<(SymbolKind, String, Option<String>)> {
611 let kind = node.kind();
612 let name = tree_sitter_node_name(source, node)?;
613 match language {
614 "rust" => match kind {
615 "function_item" => {
616 let symbol_kind = if owner.is_some() {
617 SymbolKind::Method
618 } else {
619 SymbolKind::Function
620 };
621 Some((symbol_kind, name, None))
622 }
623 "struct_item" => Some((SymbolKind::Struct, name.clone(), Some(name))),
624 "enum_item" => Some((SymbolKind::Enum, name.clone(), Some(name))),
625 "trait_item" => Some((SymbolKind::Interface, name.clone(), Some(name))),
626 "type_item" => Some((SymbolKind::TypeAlias, name, None)),
627 _ => None,
628 },
629 "python" => match kind {
630 "function_definition" => {
631 let symbol_kind = if owner.is_some() {
632 SymbolKind::Method
633 } else {
634 SymbolKind::Function
635 };
636 Some((symbol_kind, name, None))
637 }
638 "class_definition" => Some((SymbolKind::Class, name.clone(), Some(name))),
639 _ => None,
640 },
641 "typescript" => match kind {
642 "function_declaration" | "generator_function_declaration" => {
643 Some((SymbolKind::Function, name, None))
644 }
645 "class_declaration" => Some((SymbolKind::Class, name.clone(), Some(name))),
646 "method_definition" | "public_field_definition" => {
647 Some((SymbolKind::Method, name, None))
648 }
649 "interface_declaration" => Some((SymbolKind::Interface, name.clone(), Some(name))),
650 "type_alias_declaration" => Some((SymbolKind::TypeAlias, name, None)),
651 "lexical_declaration" | "variable_declaration" => {
652 if node_text(source, node).contains("=>")
653 || node_text(source, node).contains("function")
654 {
655 Some((SymbolKind::Function, name, None))
656 } else {
657 Some((SymbolKind::Constant, name, None))
658 }
659 }
660 _ => None,
661 },
662 _ => None,
663 }
664}
665
666fn tree_sitter_node_name(source: &str, node: Node<'_>) -> Option<String> {
667 for field in ["name", "property", "identifier"] {
668 if let Some(child) = node.child_by_field_name(field) {
669 let text = node_text(source, child).trim().to_string();
670 if !text.is_empty() {
671 return Some(text);
672 }
673 }
674 }
675
676 let mut cursor = node.walk();
677 for child in node.named_children(&mut cursor) {
678 if matches!(
679 child.kind(),
680 "identifier" | "type_identifier" | "property_identifier" | "field_identifier"
681 ) {
682 let text = node_text(source, child).trim().to_string();
683 if !text.is_empty() {
684 return Some(text);
685 }
686 }
687 if child.kind() == "variable_declarator" {
688 if let Some(name) = tree_sitter_node_name(source, child) {
689 return Some(name);
690 }
691 }
692 }
693 None
694}
695
696fn rust_impl_target(source: &str, node: Node<'_>) -> Option<String> {
697 if let Some(type_node) = node.child_by_field_name("type") {
698 return Some(clean_type_name(node_text(source, type_node)));
699 }
700 let text = node_text(source, node);
701 let header = text.split('{').next()?.trim();
702 let rest = header.strip_prefix("impl")?.trim();
703 let target = rest
704 .split(" for ")
705 .last()
706 .unwrap_or(rest)
707 .split_whitespace()
708 .last()
709 .unwrap_or(rest);
710 let target = clean_type_name(target);
711 (!target.is_empty()).then_some(target)
712}
713
/// Reduces a type expression to its bare name: surrounding whitespace and
/// `{` characters are removed and everything from the first `<` on is cut.
fn clean_type_name(text: &str) -> String {
    let stripped = text.trim().trim_matches('{');
    let base = match stripped.find('<') {
        Some(generics_start) => &stripped[..generics_start],
        None => stripped,
    };
    base.trim().to_string()
}
723
724fn tree_sitter_signature(source: &str, node: Node<'_>) -> String {
725 node_text(source, node)
726 .lines()
727 .next()
728 .unwrap_or_default()
729 .trim_end_matches('{')
730 .trim_end_matches(':')
731 .trim()
732 .to_string()
733}
734
735fn node_text<'a>(source: &'a str, node: Node<'a>) -> &'a str {
736 node.utf8_text(source.as_bytes()).unwrap_or_default()
737}
738
/// Line-based fallback that extracts Rust symbols without a syntax tree.
///
/// Each line is matched on its own by `parse_rust_symbol`. Functions found
/// while an `impl` block is open are reported as methods qualified with the
/// implementing type (`Type::name`). Block ends come from
/// `find_rust_block_end`.
///
/// Nesting is tracked by counting `{` and `}` per line, so braces inside
/// strings or comments are counted as well.
fn parse_rust_symbols(file_id: &str, lines: &[&str]) -> Vec<SymbolFact> {
    let mut symbols = Vec::new();
    // Brace depth before the current line is processed.
    let mut brace_depth = 0usize;
    // Open impl blocks as (type name, brace depth of the block's body).
    let mut impl_stack: Vec<(String, usize)> = Vec::new();

    for (index, line) in lines.iter().enumerate() {
        let trimmed = line.trim_start();
        let depth_before = brace_depth;

        // Drop impl blocks whose body is deeper than the current depth: they
        // have been closed by an earlier line.
        while let Some((_, close_depth)) = impl_stack.last() {
            if *close_depth > depth_before {
                impl_stack.pop();
            } else {
                break;
            }
        }

        // The body of an impl opened on this line sits one level deeper. The
        // push comes after the pop so the new entry survives this line.
        if let Some(type_name) = parse_rust_impl_target(trimmed) {
            impl_stack.push((type_name, depth_before + 1));
        }

        if let Some((kind, name, signature)) = parse_rust_symbol(trimmed) {
            let start_line = index + 1;
            let end_line = find_rust_block_end(lines, index);
            // Only functions are re-classified; structs and enums keep their
            // bare name even inside an impl block.
            let (name, qualified_name, kind) = if kind == SymbolKind::Function {
                if let Some((owner, _)) = impl_stack.last() {
                    (name.clone(), format!("{owner}::{name}"), SymbolKind::Method)
                } else {
                    (name.clone(), name, SymbolKind::Function)
                }
            } else {
                (name.clone(), name, kind)
            };
            let symbol_id = format!("{file_id}::{qualified_name}:{start_line}");
            symbols.push(SymbolFact {
                symbol_id,
                file_id: file_id.to_string(),
                path: file_id.to_string(),
                name,
                qualified_name,
                kind,
                signature,
                start_line,
                end_line,
            });
        }

        // Braces on this line only affect the lines that follow.
        brace_depth = update_brace_depth(brace_depth, line);
    }

    symbols
}
791
792fn parse_python_symbol(line: &str) -> Option<(SymbolKind, String, String)> {
793 if line.starts_with("def ") || line.starts_with("async def ") {
794 let signature = line.trim_end_matches(':').to_string();
795 let name = signature
796 .split("def ")
797 .nth(1)?
798 .split('(')
799 .next()?
800 .trim()
801 .to_string();
802 return Some((SymbolKind::Function, name, signature));
803 }
804 if let Some(rest) = line.strip_prefix("class ") {
805 let signature = line.trim_end_matches(':').to_string();
806 let name = rest.split(['(', ':']).next()?.trim().to_string();
807 return Some((SymbolKind::Class, name, signature));
808 }
809 None
810}
811
812fn parse_typescript_symbol(line: &str) -> Option<(SymbolKind, String, String)> {
813 let cleaned = line
814 .strip_prefix("export ")
815 .unwrap_or(line)
816 .strip_prefix("default ")
817 .unwrap_or(line)
818 .trim();
819 for prefix in ["async function ", "function "] {
820 if let Some(rest) = cleaned.strip_prefix(prefix) {
821 let name = rest.split('(').next()?.trim().to_string();
822 return Some((SymbolKind::Function, name, cleaned.to_string()));
823 }
824 }
825 if let Some(rest) = cleaned.strip_prefix("class ") {
826 let name = rest.split([' ', '{', '<']).next()?.trim().to_string();
827 return Some((SymbolKind::Class, name, cleaned.to_string()));
828 }
829 if let Some(rest) = cleaned.strip_prefix("interface ") {
830 let name = rest.split([' ', '{', '<']).next()?.trim().to_string();
831 return Some((SymbolKind::Interface, name, cleaned.to_string()));
832 }
833 for prefix in ["const ", "let ", "var "] {
834 if let Some(rest) = cleaned.strip_prefix(prefix) {
835 if rest.contains("=>") || rest.contains("function") {
836 let name = rest.split([':', '=', ' ']).next()?.trim().to_string();
837 return Some((SymbolKind::Function, name, cleaned.to_string()));
838 }
839 }
840 }
841 None
842}
843
844fn parse_rust_symbol(line: &str) -> Option<(SymbolKind, String, String)> {
845 let cleaned = line.strip_prefix("pub ").unwrap_or(line).trim();
846 if let Some(rest) = cleaned.strip_prefix("fn ") {
847 let name = rest.split('(').next()?.trim().to_string();
848 return Some((SymbolKind::Function, name, cleaned.to_string()));
849 }
850 if let Some(rest) = cleaned.strip_prefix("struct ") {
851 let name = rest.split([' ', '{', '<', ';']).next()?.trim().to_string();
852 return Some((SymbolKind::Struct, name, cleaned.to_string()));
853 }
854 if let Some(rest) = cleaned.strip_prefix("enum ") {
855 let name = rest.split([' ', '{', '<', ';']).next()?.trim().to_string();
856 return Some((SymbolKind::Enum, name, cleaned.to_string()));
857 }
858 None
859}
860
/// Extracts the implementing type from a left-trimmed `impl` header line.
///
/// `impl Foo {`, `impl<T> Foo<T> {` and `impl Trait for Foo<T> {` all give
/// `"Foo"`: the generic parameter list after `impl` is skipped, the part
/// after ` for ` is preferred, and generic arguments are cut off. A leading
/// `pub ` or `unsafe ` is tolerated.
///
/// Returns `None` when the line is not an `impl` header. The keyword must be
/// followed by whitespace or `<`, so identifiers such as `impl_stack` do not
/// count. A header whose generic parameter list is not closed on the same
/// line also gives `None`.
fn parse_rust_impl_target(line: &str) -> Option<String> {
    /// Skips a leading, balanced `<…>` list. Returns an empty string when the
    /// list is not closed within `text`.
    fn skip_generic_parameters(text: &str) -> &str {
        if !text.starts_with('<') {
            return text;
        }
        let mut depth = 0usize;
        let mut previous = '\0';
        for (offset, ch) in text.char_indices() {
            match ch {
                '<' => depth += 1,
                // The `>` of a `->` inside a bound such as `F: Fn() -> u8`
                // does not close a bracket.
                '>' if previous != '-' => {
                    depth = depth.saturating_sub(1);
                    if depth == 0 {
                        return &text[offset + 1..];
                    }
                }
                _ => {}
            }
            previous = ch;
        }
        ""
    }

    let cleaned = line.strip_prefix("pub ").unwrap_or(line).trim();
    let cleaned = cleaned
        .strip_prefix("unsafe ")
        .map_or(cleaned, str::trim_start);
    let after_keyword = cleaned.strip_prefix("impl")?;
    if !after_keyword.starts_with(|ch: char| ch.is_whitespace() || ch == '<') {
        return None;
    }

    let rest = skip_generic_parameters(after_keyword.trim_start()).trim();
    let target = match rest.split_once(" for ") {
        Some((_, implementor)) => implementor,
        None => rest,
    };
    let target = target.trim_end_matches('{').trim();
    let target = target
        .split('<')
        .next()
        .unwrap_or(target)
        .split_whitespace()
        .next()
        .unwrap_or(target)
        .trim();
    (!target.is_empty()).then(|| target.to_string())
}
880
/// Returns the 1-based line on which the item starting at `start_index` ends.
///
/// For items with a body this is the line holding the `}` that balances the
/// first `{`. Items without a body (`struct Marker;`, trait method
/// declarations) end on the line of their terminating `;`. When neither is
/// found, the number of lines is returned.
///
/// Characters are counted textually, so braces or semicolons inside strings
/// and comments are counted as well.
fn find_rust_block_end(lines: &[&str], start_index: usize) -> usize {
    let mut brace_depth = 0usize;
    // Nesting of `(…)` and `[…]` in the item header. A `;` inside them, as in
    // `[u8; 4]`, does not terminate the item.
    let mut group_depth = 0usize;
    let mut seen_open = false;

    for (index, line) in lines.iter().enumerate().skip(start_index) {
        for ch in line.chars() {
            match ch {
                '{' => {
                    brace_depth += 1;
                    seen_open = true;
                }
                '}' => {
                    brace_depth = brace_depth.saturating_sub(1);
                    if seen_open && brace_depth == 0 {
                        return index + 1;
                    }
                }
                '(' | '[' if !seen_open => group_depth += 1,
                ')' | ']' if !seen_open => group_depth = group_depth.saturating_sub(1),
                // A top-level `;` before any body: the item has no block, so
                // it must not run on into the next item.
                ';' if !seen_open && group_depth == 0 => return index + 1,
                _ => {}
            }
        }
    }
    lines.len()
}
905
/// Returns the brace depth after `line`: `current` plus the number of `{`
/// minus the number of `}` on the line, never going below zero.
///
/// The two counts are applied as totals, so the order of braces within the
/// line does not matter.
fn update_brace_depth(current: usize, line: &str) -> usize {
    let (opened, closed) = line
        .chars()
        .fold((0usize, 0usize), |(opened, closed), ch| match ch {
            '{' => (opened + 1, closed),
            '}' => (opened, closed + 1),
            _ => (opened, closed),
        });
    current.saturating_add(opened).saturating_sub(closed)
}
911
/// Estimates where the indentation-delimited block starting at `start_index`
/// ends.
///
/// Scans forward for the first non-blank line that is indented no deeper
/// than the starting line and begins with `def `, `class `, `function ` or
/// `pub fn `, and returns that line's zero-based index. Used as a 1-based
/// line number, that is the line just before the next definition. Returns
/// the number of lines when no such line exists.
///
/// # Panics
/// Panics when `start_index` is out of bounds for `lines`.
fn find_block_end(lines: &[&str], start_index: usize) -> usize {
    const BLOCK_OPENERS: [&str; 4] = ["def ", "class ", "function ", "pub fn "];

    /// Number of leading whitespace characters.
    fn indent_of(line: &str) -> usize {
        line.chars().take_while(|ch| ch.is_whitespace()).count()
    }

    let base_indent = indent_of(lines[start_index]);
    let next_definition = lines
        .iter()
        .enumerate()
        .skip(start_index + 1)
        .find(|(_, line)| {
            !line.trim().is_empty()
                && indent_of(line) <= base_indent
                && BLOCK_OPENERS
                    .iter()
                    .any(|opener| line.trim_start().starts_with(*opener))
        });

    match next_definition {
        Some((index, _)) => index,
        None => lines.len(),
    }
}
933
934fn select_snippets(
935 file_id: &str,
936 source: &str,
937 symbols: &[SymbolFact],
938 limit: usize,
939) -> Vec<SnippetFact> {
940 let lines: Vec<&str> = source.lines().collect();
941 symbols
942 .iter()
943 .take(limit)
944 .map(|symbol| {
945 let start = symbol.start_line.saturating_sub(1);
946 let end = symbol.end_line.min(symbol.start_line + 20).min(lines.len());
947 SnippetFact {
948 snippet_id: format!("{}#{}-{}", file_id, symbol.start_line, end),
949 file_id: file_id.to_string(),
950 title: symbol.qualified_name.clone(),
951 start_line: symbol.start_line,
952 end_line: end,
953 text: lines[start..end].join("\n"),
954 }
955 })
956 .collect()
957}