Skip to main content

codemem_engine/index/
parser.rs

1//! ast-grep parsing coordinator.
2//!
3//! Detects language from file extension, selects the appropriate rules,
4//! and runs symbol + reference extraction via the unified engine.
5
6use crate::index::chunker::{chunk_file, ChunkConfig, CodeChunk};
7use crate::index::engine::AstGrepEngine;
8use crate::index::symbol::{Reference, Symbol};
9use ast_grep_core::tree_sitter::LanguageExt;
10use std::path::Path;
11
12/// Result of parsing a single file.
13#[derive(Debug, Clone)]
14pub struct ParseResult {
15    /// Path to the parsed file.
16    pub file_path: String,
17    /// Language that was detected and used.
18    pub language: String,
19    /// All symbols extracted from the file.
20    pub symbols: Vec<Symbol>,
21    /// All references extracted from the file.
22    pub references: Vec<Reference>,
23    /// CST-aware code chunks extracted from the file.
24    pub chunks: Vec<CodeChunk>,
25}
26
27/// Coordinates ast-grep parsing across multiple languages.
28pub struct CodeParser {
29    engine: AstGrepEngine,
30    chunk_config: ChunkConfig,
31}
32
33impl CodeParser {
34    /// Create a new CodeParser with all registered language rules.
35    pub fn new() -> Self {
36        Self {
37            engine: AstGrepEngine::new(),
38            chunk_config: ChunkConfig::default(),
39        }
40    }
41
42    /// Parse a single file and extract symbols, references, and chunks.
43    ///
44    /// Returns `None` if the file extension is not supported or parsing fails.
45    pub fn parse_file(&self, path: &str, content: &[u8]) -> Option<ParseResult> {
46        let extension = Path::new(path).extension().and_then(|ext| ext.to_str())?;
47
48        let lang = self.engine.find_language(extension)?;
49        let source = std::str::from_utf8(content).ok()?;
50
51        // C1: Parse source once and share the tree across all three passes.
52        let root = lang.lang.ast_grep(source);
53        let symbols = self
54            .engine
55            .extract_symbols_from_tree(lang, &root, source, path);
56        let references = self
57            .engine
58            .extract_references_from_tree(lang, &root, source, path);
59        let chunks = chunk_file(&root, source, path, &symbols, &self.chunk_config);
60
61        // Map internal language names to the canonical names used by consumers.
62        // tsx/javascript share TypeScript extraction rules (same grammar family).
63        // Consumers (graph nodes, MCP tools) treat them as "typescript" for uniformity,
64        // since JS/TS/TSX/JSX all use the same symbol/reference extraction logic.
65        let language_name = match lang.name {
66            "tsx" | "javascript" => "typescript",
67            other => other,
68        };
69
70        Some(ParseResult {
71            file_path: path.to_string(),
72            language: language_name.to_string(),
73            symbols,
74            references,
75            chunks,
76        })
77    }
78
79    /// Check if a given file extension is supported.
80    pub fn supports_extension(&self, ext: &str) -> bool {
81        self.engine.supports_extension(ext)
82    }
83}
84
85impl Default for CodeParser {
86    fn default() -> Self {
87        Self::new()
88    }
89}
90
91#[cfg(test)]
92#[path = "tests/parser_tests.rs"]
93mod tests;