Skip to main content

matryoshka_parser/
source_parser.rs

1use anyhow::{Context, Result};
2use matryoshka_core_ir::{
3    FileFact, ImportFact, MatryoshkaProgressEvent, SnippetFact, SymbolFact, SymbolKind,
4};
5use sha2::{Digest, Sha256};
6use std::fs;
7use std::path::{Path, PathBuf};
8use tree_sitter::{Node, Parser as TreeSitterParser};
9use walkdir::WalkDir;
10
11#[derive(Debug, Clone)]
12pub struct ParsedRepository {
13    pub repo_root: PathBuf,
14    pub files: Vec<FileFact>,
15    pub symbols: Vec<SymbolFact>,
16}
17
18#[derive(Debug, Clone)]
19pub struct ParserConfig {
20    pub include_extensions: Vec<String>,
21    pub ignored_dirs: Vec<String>,
22    pub ignored_paths: Vec<String>,
23    pub max_snippets_per_file: usize,
24}
25
26impl Default for ParserConfig {
27    fn default() -> Self {
28        Self {
29            include_extensions: vec!["py".into(), "ts".into(), "tsx".into(), "rs".into()],
30            ignored_dirs: vec![
31                ".git".into(),
32                ".venv".into(),
33                "venv".into(),
34                "node_modules".into(),
35                "dist".into(),
36                "build".into(),
37                "__pycache__".into(),
38                ".pytest_cache".into(),
39                "target".into(),
40            ],
41            ignored_paths: Vec::new(),
42            max_snippets_per_file: 6,
43        }
44    }
45}
46
47impl ParserConfig {
48    pub fn with_ignored_paths(mut self, ignored_paths: impl IntoIterator<Item = String>) -> Self {
49        self.ignored_paths.extend(
50            ignored_paths
51                .into_iter()
52                .map(|path| normalize_ignored_path(&path))
53                .filter(|path| !path.is_empty()),
54        );
55        self
56    }
57
58    pub fn ignores_entry(&self, repo_root: &Path, path: &Path) -> bool {
59        let Some(name) = path.file_name().and_then(|name| name.to_str()) else {
60            return false;
61        };
62        if self.ignored_dirs.iter().any(|ignored| ignored == name) {
63            return true;
64        }
65        let relative = relative_path(repo_root, path);
66        self.ignored_paths
67            .iter()
68            .any(|ignored| path_matches_ignore(&relative, ignored))
69    }
70}
71
72pub struct SourceParser {
73    config: ParserConfig,
74}
75
76impl SourceParser {
77    pub fn new(config: ParserConfig) -> Self {
78        Self { config }
79    }
80
81    pub fn parse_repo(&self, repo_root: impl AsRef<Path>) -> Result<ParsedRepository> {
82        self.parse_repo_with_progress(repo_root, |_| {})
83    }
84
85    pub fn parse_repo_with_progress(
86        &self,
87        repo_root: impl AsRef<Path>,
88        mut progress: impl FnMut(MatryoshkaProgressEvent),
89    ) -> Result<ParsedRepository> {
90        let repo_root = repo_root.as_ref().to_path_buf();
91        progress(MatryoshkaProgressEvent::DiscoveringFiles);
92        let candidate_paths = self.discover_paths(&repo_root)?;
93        let total_files = candidate_paths.len();
94        progress(MatryoshkaProgressEvent::FilesDiscovered { total_files });
95        let mut files = Vec::new();
96        let mut symbols = Vec::new();
97
98        for (index, path) in candidate_paths.iter().enumerate() {
99            let relative = relative_path(&repo_root, path);
100            progress(MatryoshkaProgressEvent::ParsingFile {
101                path: relative.clone(),
102                index: index + 1,
103                total_files,
104            });
105            let (file, mut file_symbols) = self.parse_file(&repo_root, path)?;
106            progress(MatryoshkaProgressEvent::ParsedFile {
107                path: relative,
108                index: index + 1,
109                total_files,
110            });
111            files.push(file);
112            symbols.append(&mut file_symbols);
113        }
114
115        files.sort_by(|left, right| left.path.cmp(&right.path));
116        symbols.sort_by(|left, right| left.symbol_id.cmp(&right.symbol_id));
117
118        Ok(ParsedRepository {
119            repo_root,
120            files,
121            symbols,
122        })
123    }
124
125    fn discover_paths(&self, repo_root: &Path) -> Result<Vec<PathBuf>> {
126        let mut paths = Vec::new();
127        for entry in WalkDir::new(repo_root)
128            .into_iter()
129            .filter_entry(|entry| !self.config.ignores_entry(repo_root, entry.path()))
130        {
131            let entry = entry?;
132            if !entry.file_type().is_file() {
133                continue;
134            }
135            let path = entry.into_path();
136            if !self.config.ignores_entry(repo_root, &path) && self.should_parse(&path) {
137                paths.push(path);
138            }
139        }
140        paths.sort();
141        Ok(paths)
142    }
143
144    fn should_parse(&self, path: &Path) -> bool {
145        path.extension()
146            .and_then(|ext| ext.to_str())
147            .map(|ext| {
148                self.config
149                    .include_extensions
150                    .iter()
151                    .any(|allowed| allowed == ext)
152            })
153            .unwrap_or(false)
154    }
155
156    fn parse_file(&self, repo_root: &Path, path: &Path) -> Result<(FileFact, Vec<SymbolFact>)> {
157        let source = fs::read_to_string(path)
158            .with_context(|| format!("failed to read source file {}", path.display()))?;
159        let relative = path
160            .strip_prefix(repo_root)
161            .unwrap_or(path)
162            .to_string_lossy()
163            .replace('\\', "/");
164        let language = language_for(path);
165        let source_hash = hash_text(&source);
166        let lines: Vec<&str> = source.lines().collect();
167        let parent_folder_id = parent_folder_id(&relative);
168        let imports = parse_imports(&relative, &language, &lines);
169        let symbols = parse_symbols(&relative, &language, &source, &lines);
170        let snippets = select_snippets(
171            &relative,
172            &source,
173            &symbols,
174            self.config.max_snippets_per_file,
175        );
176
177        let file = FileFact {
178            file_id: relative.clone(),
179            path: relative.clone(),
180            name: Path::new(&relative)
181                .file_name()
182                .and_then(|name| name.to_str())
183                .unwrap_or(&relative)
184                .to_string(),
185            language,
186            parent_folder_id,
187            source_hash,
188            line_count: lines.len(),
189            imports,
190            snippets,
191        };
192
193        Ok((file, symbols))
194    }
195}
196
/// Canonicalizes a user-supplied ignore pattern: trims surrounding whitespace and
/// slashes, converts backslashes to `/`, and drops empty and `.` segments.
fn normalize_ignored_path(path: &str) -> String {
    let unified = path.trim().trim_matches('/').replace('\\', "/");
    let mut normalized = String::with_capacity(unified.len());
    for segment in unified.split('/') {
        if segment.is_empty() || segment == "." {
            continue;
        }
        if !normalized.is_empty() {
            normalized.push('/');
        }
        normalized.push_str(segment);
    }
    normalized
}
206
/// Tests a repository-relative path against one normalized ignore pattern.
///
/// A pattern containing `/` matches that exact path and everything beneath it; a
/// pattern without `/` matches when any single path component equals it. Empty
/// inputs never match.
fn path_matches_ignore(relative_path: &str, ignored_path: &str) -> bool {
    if relative_path.is_empty() || ignored_path.is_empty() {
        return false;
    }
    if !ignored_path.contains('/') {
        return relative_path
            .split('/')
            .any(|segment| segment == ignored_path);
    }
    match relative_path.strip_prefix(ignored_path) {
        // Either the path is the pattern itself or it continues into the subtree.
        Some(tail) => tail.is_empty() || tail.starts_with('/'),
        None => false,
    }
}
219
/// Renders `path` relative to `repo_root` with `/` separators; a path outside the
/// root is rendered unchanged.
fn relative_path(repo_root: &Path, path: &Path) -> String {
    let within_repo = match path.strip_prefix(repo_root) {
        Ok(stripped) => stripped,
        Err(_) => path,
    };
    within_repo.to_string_lossy().replace('\\', "/")
}
226
227pub fn hash_text(text: &str) -> String {
228    let mut hasher = Sha256::new();
229    hasher.update(text.as_bytes());
230    format!("{:x}", hasher.finalize())
231}
232
/// Maps a file extension to a language name; an unrecognised extension is returned
/// as-is and a missing one yields an empty string.
fn language_for(path: &Path) -> String {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("");
    let language = match extension {
        "py" => "python",
        "ts" | "tsx" => "typescript",
        "rs" => "rust",
        unknown => unknown,
    };
    language.to_owned()
}
246
/// Returns the `/`-separated folder containing `path`, or `"repo"` when the path has
/// no parent folder.
fn parent_folder_id(path: &str) -> String {
    match Path::new(path).parent().and_then(|folder| folder.to_str()) {
        Some(folder) if !folder.is_empty() => folder.replace('\\', "/"),
        _ => String::from("repo"),
    }
}
255
256fn parse_imports(file_id: &str, language: &str, lines: &[&str]) -> Vec<ImportFact> {
257    let mut imports = Vec::new();
258    for (index, line) in lines.iter().enumerate() {
259        let trimmed = line.trim();
260        let parsed = match language {
261            "python" => parse_python_import(trimmed),
262            "typescript" => parse_typescript_import(trimmed),
263            "rust" => parse_rust_import(trimmed),
264            _ => None,
265        };
266        if let Some((module, names)) = parsed {
267            imports.push(ImportFact {
268                module,
269                names,
270                line: index + 1,
271                resolved_file_id: None,
272                is_internal: false,
273            });
274        }
275    }
276    imports.sort_by(|left, right| (left.line, &left.module).cmp(&(right.line, &right.module)));
277    imports.dedup_by(|left, right| left.module == right.module && left.line == right.line);
278    imports.iter_mut().for_each(|import| {
279        import.is_internal = looks_internal(&import.module, file_id);
280    });
281    imports
282}
283
/// Parses a single trimmed Python line as an import statement.
///
/// Returns `(module, names)`:
/// * `from pkg import a, b as c` yields `("pkg", ["a", "b"])`; aliases are dropped,
///   as are wrapping parentheses, line-continuation backslashes and anything after `#`.
/// * `import pkg.mod as m, other` yields `("pkg.mod", [])`; only the first module of a
///   comma-separated list is reported.
///
/// Returns `None` for any other line, and for `from …` text whose module part
/// contains whitespace, which a module path never does (this filters prose such as
/// `from here on …`).
fn parse_python_import(line: &str) -> Option<(String, Vec<String>)> {
    if let Some(rest) = line.strip_prefix("from ") {
        // A bare trailing `import` happens when the name list starts on the next line.
        let (module, imported) = rest
            .split_once(" import ")
            .or_else(|| rest.strip_suffix(" import").map(|module| (module, "")))
            .unwrap_or((rest, ""));
        let module = module.trim();
        if module.is_empty() || module.contains(char::is_whitespace) {
            return None;
        }

        let imported = imported.split('#').next().unwrap_or_default();
        let names = imported
            .split(',')
            .filter_map(|entry| {
                entry
                    .trim_matches(|ch: char| {
                        ch.is_whitespace() || matches!(ch, '(' | ')' | '\\')
                    })
                    .split_whitespace()
                    .next()
                    .map(str::to_string)
            })
            .collect();
        return Some((module.to_string(), names));
    }

    if let Some(rest) = line.strip_prefix("import ") {
        let module = rest
            .split(',')
            .next()?
            .split_whitespace()
            .next()
            .unwrap_or_default();
        return (!module.is_empty()).then(|| (module.to_string(), Vec::new()));
    }

    None
}
316
#[cfg(test)]
mod tests {
    use super::{
        ParserConfig, SourceParser, parse_python_import, parse_rust_import, parse_rust_symbols,
    };
    use matryoshka_core_ir::SymbolKind;
    use std::fs;

    /// Relative Python imports keep their leading dots in the module name.
    #[test]
    fn python_relative_imports_preserve_leading_dots() {
        let (module, names) = parse_python_import("from ..graph import RepositoryGraph").unwrap();
        assert_eq!(module, "..graph");
        assert_eq!(names, vec!["RepositoryGraph"]);
    }

    /// A grouped `use` splits into the module path and the listed names.
    #[test]
    fn rust_grouped_imports_extract_module_and_names() {
        let (module, names) =
            parse_rust_import("use matryoshka_core_ir::{FileFact, SymbolFact};").unwrap();
        assert_eq!(module, "matryoshka_core_ir");
        assert_eq!(names, vec!["FileFact", "SymbolFact"]);
    }

    /// The line-based Rust fallback qualifies functions inside an `impl` as methods.
    #[test]
    fn rust_impl_methods_are_qualified_as_methods() {
        let lines = [
            "pub struct MatryoshkaStore {",
            "    db_path: PathBuf,",
            "}",
            "",
            "impl MatryoshkaStore {",
            "    pub fn open(db_path: impl AsRef<Path>) -> Result<Self> {",
            "        Self { db_path: db_path.as_ref().to_path_buf() }",
            "    }",
            "}",
        ];
        let symbols = parse_rust_symbols("store.rs", &lines);

        let has_method = symbols.iter().any(|symbol| {
            symbol.qualified_name == "MatryoshkaStore::open" && symbol.kind == SymbolKind::Method
        });
        let has_struct = symbols.iter().any(|symbol| {
            symbol.qualified_name == "MatryoshkaStore" && symbol.kind == SymbolKind::Struct
        });
        assert!(has_method);
        assert!(has_struct);
    }

    /// Python methods are found with their owning class and exact line span.
    #[test]
    fn tree_sitter_parser_extracts_python_methods() {
        let temp = tempfile::tempdir().unwrap();
        let source = "class TokenService:\n    def refresh(self):\n        return True\n";
        fs::write(temp.path().join("service.py"), source).unwrap();

        let parsed = SourceParser::new(ParserConfig::default())
            .parse_repo(temp.path())
            .unwrap();

        assert!(parsed.symbols.iter().any(|symbol| {
            symbol.qualified_name == "TokenService::refresh"
                && symbol.kind == SymbolKind::Method
                && symbol.start_line == 2
                && symbol.end_line == 3
        }));
    }

    /// TypeScript class methods are found with their owning class and exact line span.
    #[test]
    fn tree_sitter_parser_extracts_typescript_class_methods() {
        let temp = tempfile::tempdir().unwrap();
        let source = "export class ApiClient {\n  async fetchToken(): Promise<string> {\n    return 'token';\n  }\n}\n";
        fs::write(temp.path().join("client.ts"), source).unwrap();

        let parsed = SourceParser::new(ParserConfig::default())
            .parse_repo(temp.path())
            .unwrap();

        assert!(parsed.symbols.iter().any(|symbol| {
            symbol.qualified_name == "ApiClient::fetchToken"
                && symbol.kind == SymbolKind::Method
                && symbol.start_line == 2
                && symbol.end_line == 4
        }));
    }

    /// A bare name ignores matching components; a slashed pattern ignores a subtree.
    #[test]
    fn parser_config_ignores_path_components_and_subtrees() {
        let temp = tempfile::tempdir().unwrap();
        let fixtures = [
            ("src/lib.rs", "pub fn keep() {}\n"),
            ("tests/test_api.py", "def drop_me(): pass\n"),
            ("packages/web/app.ts", "export function app() {}\n"),
        ];
        for (relative, contents) in fixtures {
            let target = temp.path().join(relative);
            fs::create_dir_all(target.parent().unwrap()).unwrap();
            fs::write(target, contents).unwrap();
        }

        let config = ParserConfig::default()
            .with_ignored_paths(["tests".to_string(), "packages/web".to_string()]);
        let parsed = SourceParser::new(config).parse_repo(temp.path()).unwrap();
        let kept: Vec<&str> = parsed.files.iter().map(|file| file.path.as_str()).collect();

        assert_eq!(kept, vec!["src/lib.rs"]);
    }
}
429
/// Parses a single trimmed TypeScript line as an import or re-export.
///
/// Returns `(module, names)` where `module` is the quoted module specifier and
/// `names` are the identifiers inside the first `{ … }` group, with aliases and
/// inline `type` modifiers dropped.
///
/// * `import …` lines use the first quoted string on the line.
/// * `export …` lines only count when they re-export from another module
///   (`export … from '…'`); plain exports such as `export const x = 'y'` are not
///   imports and yield `None`.
///
/// Returns `None` when the line is neither form or the specifier is empty.
fn parse_typescript_import(line: &str) -> Option<(String, Vec<String>)> {
    const QUOTES: [char; 2] = ['"', '\''];

    let specifier_region = if line.starts_with("import ") {
        line
    } else if line.starts_with("export ") {
        let (_, after_from) = line.rsplit_once(" from ")?;
        let after_from = after_from.trim_start();
        if !after_from.starts_with(QUOTES) {
            return None;
        }
        after_from
    } else {
        return None;
    };

    // Use whichever quote opens the specifier, so a later string literal or comment
    // that uses the other quote style cannot be mistaken for the module.
    let opening = specifier_region.find(QUOTES)?;
    let quote = specifier_region[opening..].chars().next()?;
    let module = specifier_region[opening + 1..]
        .split(quote)
        .next()
        .unwrap_or_default();
    if module.is_empty() {
        return None;
    }

    let names = line
        .split('{')
        .nth(1)
        .and_then(|rest| rest.split('}').next())
        .map(|inside| {
            inside
                .split(',')
                .filter_map(|entry| {
                    let mut tokens = entry.split_whitespace();
                    let first = tokens.next()?;
                    match (first, tokens.next()) {
                        // `type Foo` imports `Foo`; `type as alias` imports a binding named `type`.
                        ("type", Some(actual)) if actual != "as" => Some(actual.to_string()),
                        _ => Some(first.to_string()),
                    }
                })
                .collect()
        })
        .unwrap_or_default();

    Some((module.to_string(), names))
}
460
/// Parses a single trimmed Rust `use` line.
///
/// Returns the module path with `::` rewritten to `.` plus the names listed in a
/// trailing `::{…}` group (empty when there is no group). Returns `None` when the
/// line does not start with `use ` or the module path is empty.
fn parse_rust_import(line: &str) -> Option<(String, Vec<String>)> {
    let body = line.strip_prefix("use ")?.trim_end_matches(';').trim();

    let (path, names) = match body.split_once("::{") {
        Some((prefix, group)) => {
            let names: Vec<String> = group
                .trim_end_matches('}')
                .split(',')
                .map(str::trim)
                .filter(|name| !name.is_empty())
                .map(String::from)
                .collect();
            (prefix.trim(), names)
        }
        None => (body, Vec::new()),
    };

    let module = path.replace("::", ".");
    if module.is_empty() {
        None
    } else {
        Some((module, names))
    }
}
478
/// Heuristically decides whether `module` refers to code inside the repository.
///
/// A module is treated as internal when it
/// * is relative (starts with `.`, which covers `./x`, `../x` and Python `..pkg`),
/// * is, or is rooted at, one of Rust's `crate`, `self` or `super` (a grouped import
///   such as `use super::{A, B}` produces the bare keyword as its module), or
/// * is, or is rooted at, the first path component of `file_id`.
///
/// Prefixes are matched on whole segments: the remainder must be empty or start with
/// `.` or `/`, so a top-level folder `re` does not claim `react`. An empty first
/// component never matches.
fn looks_internal(module: &str, file_id: &str) -> bool {
    /// Returns `true` when `module` equals `prefix` or continues it with a separator.
    fn is_rooted_at(module: &str, prefix: &str) -> bool {
        module
            .strip_prefix(prefix)
            .is_some_and(|tail| tail.is_empty() || tail.starts_with(['.', '/']))
    }

    if module.starts_with('.') {
        return true;
    }
    if ["crate", "self", "super"]
        .iter()
        .any(|keyword| is_rooted_at(module, keyword))
    {
        return true;
    }
    file_id
        .split('/')
        .next()
        .filter(|root| !root.is_empty())
        .is_some_and(|root| is_rooted_at(module, root))
}
491
492fn parse_symbols(file_id: &str, language: &str, source: &str, lines: &[&str]) -> Vec<SymbolFact> {
493    if let Some(symbols) = parse_tree_sitter_symbols(file_id, language, source) {
494        return symbols;
495    }
496    if language == "rust" {
497        return parse_rust_symbols(file_id, lines);
498    }
499    let mut symbols = Vec::new();
500    for (index, line) in lines.iter().enumerate() {
501        let trimmed = line.trim_start();
502        let parsed = match language {
503            "python" => parse_python_symbol(trimmed),
504            "typescript" => parse_typescript_symbol(trimmed),
505            "rust" => None,
506            _ => None,
507        };
508        if let Some((kind, name, signature)) = parsed {
509            let start_line = index + 1;
510            let end_line = find_block_end(lines, index);
511            let symbol_id = format!("{file_id}::{name}:{start_line}");
512            symbols.push(SymbolFact {
513                symbol_id,
514                file_id: file_id.to_string(),
515                path: file_id.to_string(),
516                name: name.clone(),
517                qualified_name: name,
518                kind,
519                signature,
520                start_line,
521                end_line,
522            });
523        }
524    }
525    symbols
526}
527
528fn parse_tree_sitter_symbols(
529    file_id: &str,
530    language: &str,
531    source: &str,
532) -> Option<Vec<SymbolFact>> {
533    let mut parser = TreeSitterParser::new();
534    let tree_sitter_language = match language {
535        "rust" => tree_sitter_rust::LANGUAGE.into(),
536        "python" => tree_sitter_python::LANGUAGE.into(),
537        "typescript" if file_id.ends_with(".tsx") => tree_sitter_typescript::LANGUAGE_TSX.into(),
538        "typescript" => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
539        _ => return None,
540    };
541    parser.set_language(&tree_sitter_language).ok()?;
542    let tree = parser.parse(source, None)?;
543    let mut symbols = Vec::new();
544    visit_tree_sitter_symbols(
545        file_id,
546        language,
547        source,
548        tree.root_node(),
549        None,
550        &mut symbols,
551    );
552    (!symbols.is_empty()).then_some(symbols)
553}
554
555fn visit_tree_sitter_symbols(
556    file_id: &str,
557    language: &str,
558    source: &str,
559    node: Node<'_>,
560    owner: Option<String>,
561    symbols: &mut Vec<SymbolFact>,
562) {
563    let mut next_owner = owner.clone();
564
565    if let Some((kind, name, owner_for_children)) =
566        tree_sitter_symbol_kind_and_name(language, source, node, owner.as_deref())
567    {
568        let start_line = node.start_position().row + 1;
569        let end_line = node.end_position().row + 1;
570        let qualified_name = owner
571            .as_ref()
572            .filter(|_| kind == SymbolKind::Method)
573            .map(|owner| format!("{owner}::{name}"))
574            .unwrap_or_else(|| name.clone());
575        let symbol_id = format!("{file_id}::{qualified_name}:{start_line}");
576        symbols.push(SymbolFact {
577            symbol_id,
578            file_id: file_id.to_string(),
579            path: file_id.to_string(),
580            name: name.clone(),
581            qualified_name,
582            kind,
583            signature: tree_sitter_signature(source, node),
584            start_line,
585            end_line,
586        });
587        next_owner = owner_for_children.or(Some(name));
588    } else if language == "rust" && node.kind() == "impl_item" {
589        next_owner = rust_impl_target(source, node).or(owner);
590    }
591
592    let mut cursor = node.walk();
593    for child in node.named_children(&mut cursor) {
594        visit_tree_sitter_symbols(
595            file_id,
596            language,
597            source,
598            child,
599            next_owner.clone(),
600            symbols,
601        );
602    }
603}
604
605fn tree_sitter_symbol_kind_and_name(
606    language: &str,
607    source: &str,
608    node: Node<'_>,
609    owner: Option<&str>,
610) -> Option<(SymbolKind, String, Option<String>)> {
611    let kind = node.kind();
612    let name = tree_sitter_node_name(source, node)?;
613    match language {
614        "rust" => match kind {
615            "function_item" => {
616                let symbol_kind = if owner.is_some() {
617                    SymbolKind::Method
618                } else {
619                    SymbolKind::Function
620                };
621                Some((symbol_kind, name, None))
622            }
623            "struct_item" => Some((SymbolKind::Struct, name.clone(), Some(name))),
624            "enum_item" => Some((SymbolKind::Enum, name.clone(), Some(name))),
625            "trait_item" => Some((SymbolKind::Interface, name.clone(), Some(name))),
626            "type_item" => Some((SymbolKind::TypeAlias, name, None)),
627            _ => None,
628        },
629        "python" => match kind {
630            "function_definition" => {
631                let symbol_kind = if owner.is_some() {
632                    SymbolKind::Method
633                } else {
634                    SymbolKind::Function
635                };
636                Some((symbol_kind, name, None))
637            }
638            "class_definition" => Some((SymbolKind::Class, name.clone(), Some(name))),
639            _ => None,
640        },
641        "typescript" => match kind {
642            "function_declaration" | "generator_function_declaration" => {
643                Some((SymbolKind::Function, name, None))
644            }
645            "class_declaration" => Some((SymbolKind::Class, name.clone(), Some(name))),
646            "method_definition" | "public_field_definition" => {
647                Some((SymbolKind::Method, name, None))
648            }
649            "interface_declaration" => Some((SymbolKind::Interface, name.clone(), Some(name))),
650            "type_alias_declaration" => Some((SymbolKind::TypeAlias, name, None)),
651            "lexical_declaration" | "variable_declaration" => {
652                if node_text(source, node).contains("=>")
653                    || node_text(source, node).contains("function")
654                {
655                    Some((SymbolKind::Function, name, None))
656                } else {
657                    Some((SymbolKind::Constant, name, None))
658                }
659            }
660            _ => None,
661        },
662        _ => None,
663    }
664}
665
666fn tree_sitter_node_name(source: &str, node: Node<'_>) -> Option<String> {
667    for field in ["name", "property", "identifier"] {
668        if let Some(child) = node.child_by_field_name(field) {
669            let text = node_text(source, child).trim().to_string();
670            if !text.is_empty() {
671                return Some(text);
672            }
673        }
674    }
675
676    let mut cursor = node.walk();
677    for child in node.named_children(&mut cursor) {
678        if matches!(
679            child.kind(),
680            "identifier" | "type_identifier" | "property_identifier" | "field_identifier"
681        ) {
682            let text = node_text(source, child).trim().to_string();
683            if !text.is_empty() {
684                return Some(text);
685            }
686        }
687        if child.kind() == "variable_declarator" {
688            if let Some(name) = tree_sitter_node_name(source, child) {
689                return Some(name);
690            }
691        }
692    }
693    None
694}
695
696fn rust_impl_target(source: &str, node: Node<'_>) -> Option<String> {
697    if let Some(type_node) = node.child_by_field_name("type") {
698        return Some(clean_type_name(node_text(source, type_node)));
699    }
700    let text = node_text(source, node);
701    let header = text.split('{').next()?.trim();
702    let rest = header.strip_prefix("impl")?.trim();
703    let target = rest
704        .split(" for ")
705        .last()
706        .unwrap_or(rest)
707        .split_whitespace()
708        .last()
709        .unwrap_or(rest);
710    let target = clean_type_name(target);
711    (!target.is_empty()).then_some(target)
712}
713
/// Reduces a type expression to its base name: surrounding whitespace and `{` are
/// removed and everything from the first `<` onwards is cut.
fn clean_type_name(text: &str) -> String {
    let unbraced = text.trim().trim_matches('{');
    let base = match unbraced.split_once('<') {
        Some((before_generics, _)) => before_generics,
        None => unbraced,
    };
    base.trim().to_owned()
}
723
724fn tree_sitter_signature(source: &str, node: Node<'_>) -> String {
725    node_text(source, node)
726        .lines()
727        .next()
728        .unwrap_or_default()
729        .trim_end_matches('{')
730        .trim_end_matches(':')
731        .trim()
732        .to_string()
733}
734
735fn node_text<'a>(source: &'a str, node: Node<'a>) -> &'a str {
736    node.utf8_text(source.as_bytes()).unwrap_or_default()
737}
738
739fn parse_rust_symbols(file_id: &str, lines: &[&str]) -> Vec<SymbolFact> {
740    let mut symbols = Vec::new();
741    let mut brace_depth = 0usize;
742    let mut impl_stack: Vec<(String, usize)> = Vec::new();
743
744    for (index, line) in lines.iter().enumerate() {
745        let trimmed = line.trim_start();
746        let depth_before = brace_depth;
747
748        while let Some((_, close_depth)) = impl_stack.last() {
749            if *close_depth > depth_before {
750                impl_stack.pop();
751            } else {
752                break;
753            }
754        }
755
756        if let Some(type_name) = parse_rust_impl_target(trimmed) {
757            impl_stack.push((type_name, depth_before + 1));
758        }
759
760        if let Some((kind, name, signature)) = parse_rust_symbol(trimmed) {
761            let start_line = index + 1;
762            let end_line = find_rust_block_end(lines, index);
763            let (name, qualified_name, kind) = if kind == SymbolKind::Function {
764                if let Some((owner, _)) = impl_stack.last() {
765                    (name.clone(), format!("{owner}::{name}"), SymbolKind::Method)
766                } else {
767                    (name.clone(), name, SymbolKind::Function)
768                }
769            } else {
770                (name.clone(), name, kind)
771            };
772            let symbol_id = format!("{file_id}::{qualified_name}:{start_line}");
773            symbols.push(SymbolFact {
774                symbol_id,
775                file_id: file_id.to_string(),
776                path: file_id.to_string(),
777                name,
778                qualified_name,
779                kind,
780                signature,
781                start_line,
782                end_line,
783            });
784        }
785
786        brace_depth = update_brace_depth(brace_depth, line);
787    }
788
789    symbols
790}
791
792fn parse_python_symbol(line: &str) -> Option<(SymbolKind, String, String)> {
793    if line.starts_with("def ") || line.starts_with("async def ") {
794        let signature = line.trim_end_matches(':').to_string();
795        let name = signature
796            .split("def ")
797            .nth(1)?
798            .split('(')
799            .next()?
800            .trim()
801            .to_string();
802        return Some((SymbolKind::Function, name, signature));
803    }
804    if let Some(rest) = line.strip_prefix("class ") {
805        let signature = line.trim_end_matches(':').to_string();
806        let name = rest.split(['(', ':']).next()?.trim().to_string();
807        return Some((SymbolKind::Class, name, signature));
808    }
809    None
810}
811
812fn parse_typescript_symbol(line: &str) -> Option<(SymbolKind, String, String)> {
813    let cleaned = line
814        .strip_prefix("export ")
815        .unwrap_or(line)
816        .strip_prefix("default ")
817        .unwrap_or(line)
818        .trim();
819    for prefix in ["async function ", "function "] {
820        if let Some(rest) = cleaned.strip_prefix(prefix) {
821            let name = rest.split('(').next()?.trim().to_string();
822            return Some((SymbolKind::Function, name, cleaned.to_string()));
823        }
824    }
825    if let Some(rest) = cleaned.strip_prefix("class ") {
826        let name = rest.split([' ', '{', '<']).next()?.trim().to_string();
827        return Some((SymbolKind::Class, name, cleaned.to_string()));
828    }
829    if let Some(rest) = cleaned.strip_prefix("interface ") {
830        let name = rest.split([' ', '{', '<']).next()?.trim().to_string();
831        return Some((SymbolKind::Interface, name, cleaned.to_string()));
832    }
833    for prefix in ["const ", "let ", "var "] {
834        if let Some(rest) = cleaned.strip_prefix(prefix) {
835            if rest.contains("=>") || rest.contains("function") {
836                let name = rest.split([':', '=', ' ']).next()?.trim().to_string();
837                return Some((SymbolKind::Function, name, cleaned.to_string()));
838            }
839        }
840    }
841    None
842}
843
844fn parse_rust_symbol(line: &str) -> Option<(SymbolKind, String, String)> {
845    let cleaned = line.strip_prefix("pub ").unwrap_or(line).trim();
846    if let Some(rest) = cleaned.strip_prefix("fn ") {
847        let name = rest.split('(').next()?.trim().to_string();
848        return Some((SymbolKind::Function, name, cleaned.to_string()));
849    }
850    if let Some(rest) = cleaned.strip_prefix("struct ") {
851        let name = rest.split([' ', '{', '<', ';']).next()?.trim().to_string();
852        return Some((SymbolKind::Struct, name, cleaned.to_string()));
853    }
854    if let Some(rest) = cleaned.strip_prefix("enum ") {
855        let name = rest.split([' ', '{', '<', ';']).next()?.trim().to_string();
856        return Some((SymbolKind::Enum, name, cleaned.to_string()));
857    }
858    None
859}
860
/// Extracts the implementing type from a Rust `impl` header line (already
/// left-trimmed).
///
/// Handles `impl Type`, `impl Trait for Type`, a leading generic parameter list
/// (`impl<T> …`), and an `unsafe` prefix; generic arguments on the type are dropped.
/// Returns `None` when the line is not an `impl` header — including identifiers that
/// merely start with `impl`, such as `impl_stack.push(..)` — or when no type name can
/// be found on the line.
fn parse_rust_impl_target(line: &str) -> Option<String> {
    /// Skips a leading `<…>` parameter list, tolerating `->` inside bounds.
    fn skip_generic_params(text: &str) -> &str {
        if !text.starts_with('<') {
            return text;
        }
        let mut depth = 0usize;
        let mut previous = '\0';
        for (offset, ch) in text.char_indices() {
            match ch {
                '<' => depth += 1,
                // The `>` of `->` in a closure bound does not close the list.
                '>' if previous != '-' => {
                    depth -= 1;
                    if depth == 0 {
                        return &text[offset + 1..];
                    }
                }
                _ => {}
            }
            previous = ch;
        }
        // The parameter list continues on a later line; nothing usable remains here.
        ""
    }

    let cleaned = line.strip_prefix("pub ").unwrap_or(line).trim();
    let cleaned = cleaned
        .strip_prefix("unsafe ")
        .unwrap_or(cleaned)
        .trim_start();
    let after_keyword = cleaned.strip_prefix("impl")?;
    // `impl` must be a whole keyword: followed by whitespace or a generic list.
    if !after_keyword.starts_with(|ch: char| ch.is_whitespace() || ch == '<') {
        return None;
    }

    let header = skip_generic_params(after_keyword.trim_start()).trim();
    let implementor = header
        .split_once(" for ")
        .map_or(header, |(_, after_for)| after_for);
    let target = implementor.trim_end_matches('{').trim();
    let target = target
        .split('<')
        .next()
        .unwrap_or(target)
        .split_whitespace()
        .next()
        .unwrap_or_default();

    (!target.is_empty()).then(|| target.to_string())
}
880
/// Returns the one-based line on which the first brace-delimited block at or after
/// `start_index` closes, or `lines.len()` when no such block closes.
fn find_rust_block_end(lines: &[&str], start_index: usize) -> usize {
    let mut open_braces = 0usize;
    let mut entered_block = false;

    for (row, text) in lines.iter().enumerate().skip(start_index) {
        // Braces are ASCII, so scanning bytes gives the same result as scanning chars.
        for byte in text.bytes() {
            if byte == b'{' {
                open_braces += 1;
                entered_block = true;
            } else if byte == b'}' {
                open_braces = open_braces.saturating_sub(1);
                if entered_block && open_braces == 0 {
                    return row + 1;
                }
            }
        }
    }

    lines.len()
}
905
/// Applies the braces on `line` to a running depth: all `{` are added, then all `}`
/// are subtracted, saturating at zero.
fn update_brace_depth(current: usize, line: &str) -> usize {
    let (opens, closes) = line
        .bytes()
        .fold((0usize, 0usize), |(opens, closes), byte| match byte {
            b'{' => (opens + 1, closes),
            b'}' => (opens, closes + 1),
            _ => (opens, closes),
        });
    current.saturating_add(opens).saturating_sub(closes)
}
911
/// Estimates where the definition starting at `start_index` ends.
///
/// Returns the zero-based index of the next non-blank line that is indented no
/// deeper than the start line and begins another definition, or `lines.len()` when
/// there is none. Read as a one-based line number, that index is the line just
/// before the next definition.
///
/// # Panics
/// Panics when `start_index` is out of bounds for `lines`.
fn find_block_end(lines: &[&str], start_index: usize) -> usize {
    const DEFINITION_PREFIXES: [&str; 4] = ["def ", "class ", "function ", "pub fn "];

    /// Number of leading whitespace characters.
    fn indent_width(line: &str) -> usize {
        line.chars().take_while(|ch| ch.is_whitespace()).count()
    }

    let base_indent = indent_width(lines[start_index]);
    lines
        .iter()
        .enumerate()
        .skip(start_index + 1)
        .find(|(_, line)| {
            let code = line.trim_start();
            !line.trim().is_empty()
                && indent_width(line) <= base_indent
                && DEFINITION_PREFIXES
                    .iter()
                    .any(|prefix| code.starts_with(*prefix))
        })
        .map_or(lines.len(), |(row, _)| row)
}
933
934fn select_snippets(
935    file_id: &str,
936    source: &str,
937    symbols: &[SymbolFact],
938    limit: usize,
939) -> Vec<SnippetFact> {
940    let lines: Vec<&str> = source.lines().collect();
941    symbols
942        .iter()
943        .take(limit)
944        .map(|symbol| {
945            let start = symbol.start_line.saturating_sub(1);
946            let end = symbol.end_line.min(symbol.start_line + 20).min(lines.len());
947            SnippetFact {
948                snippet_id: format!("{}#{}-{}", file_id, symbol.start_line, end),
949                file_id: file_id.to_string(),
950                title: symbol.qualified_name.clone(),
951                start_line: symbol.start_line,
952                end_line: end,
953                text: lines[start..end].join("\n"),
954            }
955        })
956        .collect()
957}