Skip to main content

ast_doc_core/parser/
mod.rs

//! Phase 2: AST parsing and strategy extraction.
//!
//! Uses tree-sitter to parse source files and pre-compute
//! Full/NoTests/Summary strategy variants with token counts.
5
6pub mod lang;
7pub mod strategy;
8
9use std::{
10    collections::HashMap,
11    path::{Path, PathBuf},
12};
13
14use crate::{config::OutputStrategy, error::AstDocError, ingestion::DiscoveredFile};
15
/// Source languages this crate can recognise.
///
/// The five named variants are the core languages (see [`Language::is_core`]);
/// every other language is carried by name in [`Language::Generic`].
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum Language {
    /// Rust (`.rs`).
    Rust,
    /// Python (`.py`).
    Python,
    /// TypeScript and JavaScript (`.ts`, `.tsx`, `.js`, `.jsx`).
    TypeScript,
    /// Go (`.go`).
    Go,
    /// C sources and headers (`.c`, `.h`).
    C,
    /// A language outside the core set, supported through
    /// `tree-sitter-language-pack`.
    ///
    /// The payload is the language name (for example `"java"`, `"kotlin"`
    /// or `"ruby"`).
    Generic(String),
}
33
34impl std::fmt::Display for Language {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        match self {
37            Self::Rust => write!(f, "Rust"),
38            Self::Python => write!(f, "Python"),
39            Self::TypeScript => write!(f, "TypeScript"),
40            Self::Go => write!(f, "Go"),
41            Self::C => write!(f, "C"),
42            Self::Generic(name) => write!(f, "{name}"),
43        }
44    }
45}
46
47impl Language {
48    /// Return the tree-sitter language name used by `tree-sitter-language-pack`.
49    #[must_use]
50    #[expect(clippy::missing_const_for_fn)]
51    pub fn ts_pack_name(&self) -> &str {
52        match self {
53            Self::Rust => "rust",
54            Self::Python => "python",
55            Self::TypeScript => "typescript",
56            Self::Go => "go",
57            Self::C => "c",
58            Self::Generic(name) => name.as_str(),
59        }
60    }
61
62    /// Return `true` if this is one of the core 5 languages with deep analysis.
63    #[must_use]
64    pub const fn is_core(&self) -> bool {
65        matches!(self, Self::Rust | Self::Python | Self::TypeScript | Self::Go | Self::C)
66    }
67}
68
/// Rendered output for one strategy, paired with its pre-computed token count.
#[derive(Clone, Debug)]
pub struct StrategyData {
    /// Source text as rendered under this strategy.
    pub content: String,
    /// Token count of `content`, computed once (via tiktoken-rs) during
    /// parsing.
    pub token_count: usize,
}
77
78/// A parsed file with pre-computed strategy data for all output modes.
79#[derive(Debug, Clone)]
80pub struct ParsedFile {
81    /// Relative path from the project root.
82    pub path: PathBuf,
83    /// Detected language.
84    pub language: Language,
85    /// Original source content.
86    pub source: String,
87    /// Pre-computed strategy data for each output mode.
88    pub strategies_data: HashMap<OutputStrategy, StrategyData>,
89}
90
91/// Trait for language-specific parsers.
92pub trait LanguageParser {
93    /// Parse the source code and produce a `ParsedFile`.
94    ///
95    /// # Errors
96    ///
97    /// Returns an error if tree-sitter parsing fails.
98    fn parse(&self, source: &str, path: &Path) -> Result<ParsedFile, AstDocError>;
99}
100
101/// Detect the language from a file extension.
102///
103/// When `lang-pack` feature is enabled, falls back to
104/// `tree-sitter-language-pack` for extension resolution.
105#[must_use]
106pub fn detect_language(path: &Path) -> Option<Language> {
107    // Core languages (always available when their feature is enabled)
108    match path.extension().and_then(|e| e.to_str()) {
109        Some("rs") => return Some(Language::Rust),
110        Some("py") => return Some(Language::Python),
111        Some("ts" | "tsx" | "js" | "jsx") => return Some(Language::TypeScript),
112        Some("go") => return Some(Language::Go),
113        Some("c" | "h") => return Some(Language::C),
114        _ => {}
115    }
116
117    // Fall back to tree-sitter-language-pack for other languages
118    #[cfg(feature = "lang-pack")]
119    {
120        detect_language_via_pack(path)
121    }
122
123    #[cfg(not(feature = "lang-pack"))]
124    {
125        None
126    }
127}
128
/// Resolve a non-core language through `tree-sitter-language-pack`'s
/// extension mapping.
///
/// Returns `None` when the path has no UTF-8 extension, when the pack does
/// not recognise the extension, when the pack reports one of the core
/// language names, or when `has_language` is false for the reported name.
#[cfg(feature = "lang-pack")]
fn detect_language_via_pack(path: &Path) -> Option<Language> {
    let extension = path.extension()?.to_str()?;
    let pack_name = tree_sitter_language_pack::detect_language_from_extension(extension)?;
    #[expect(clippy::useless_asref)]
    match pack_name.as_ref() {
        // Core language names are never wrapped in `Generic`.
        "rust" | "python" | "typescript" | "tsx" | "javascript" | "go" | "c" => None,
        generic if tree_sitter_language_pack::has_language(generic) => {
            Some(Language::Generic(generic.to_string()))
        }
        _ => None,
    }
}
142
143/// Parse a discovered file into a `ParsedFile`.
144///
145/// Dispatches to the appropriate language parser based on the detected language.
146///
147/// # Errors
148///
149/// Returns an error if the language feature is not enabled or parsing fails.
150pub fn parse_file(file: &DiscoveredFile, lang: &Language) -> Result<ParsedFile, AstDocError> {
151    match lang {
152        #[cfg(feature = "lang-rust")]
153        Language::Rust => lang::rust_parser::RustParser::new().parse(&file.content, &file.path),
154        #[cfg(feature = "lang-python")]
155        Language::Python => {
156            lang::python_parser::PythonParser::new().parse(&file.content, &file.path)
157        }
158        #[cfg(feature = "lang-typescript")]
159        Language::TypeScript => {
160            lang::typescript_parser::TypeScriptParser::new().parse(&file.content, &file.path)
161        }
162        #[cfg(feature = "lang-go")]
163        Language::Go => lang::go_parser::GoParser::new().parse(&file.content, &file.path),
164        #[cfg(feature = "lang-c")]
165        Language::C => lang::c_parser::CParser::new().parse(&file.content, &file.path),
166        #[cfg(feature = "lang-pack")]
167        Language::Generic(name) => {
168            lang::generic_parser::GenericParser::new(name).parse(&file.content, &file.path)
169        }
170        #[expect(unreachable_patterns)]
171        _ => Err(AstDocError::UnsupportedLanguage { language: lang.to_string() }),
172    }
173}
174
/// Unit tests for language detection, the `Language` helpers and `parse_file`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_detect_language_rust() {
        assert_eq!(detect_language(Path::new("main.rs")), Some(Language::Rust));
    }

    #[test]
    fn test_detect_language_python() {
        assert_eq!(detect_language(Path::new("app.py")), Some(Language::Python));
    }

    #[test]
    fn test_detect_language_typescript() {
        assert_eq!(detect_language(Path::new("index.ts")), Some(Language::TypeScript));
        assert_eq!(detect_language(Path::new("app.tsx")), Some(Language::TypeScript));
        assert_eq!(detect_language(Path::new("script.js")), Some(Language::TypeScript));
        assert_eq!(detect_language(Path::new("widget.jsx")), Some(Language::TypeScript));
    }

    #[test]
    fn test_detect_language_go() {
        assert_eq!(detect_language(Path::new("main.go")), Some(Language::Go));
    }

    #[test]
    fn test_detect_language_c() {
        assert_eq!(detect_language(Path::new("main.c")), Some(Language::C));
        assert_eq!(detect_language(Path::new("header.h")), Some(Language::C));
    }

    // NOTE(review): with the `lang-pack` feature enabled these extensions are
    // passed to tree-sitter-language-pack, which may resolve them to a
    // `Generic` language — confirm this test holds in that configuration.
    #[test]
    fn test_detect_language_unknown() {
        assert_eq!(detect_language(Path::new("readme.md")), None);
        assert_eq!(detect_language(Path::new("data.json")), None);
    }

    // A path without an extension is `None` in every feature configuration:
    // neither the core match nor the pack fallback has anything to look up.
    #[test]
    fn test_detect_language_no_extension() {
        assert_eq!(detect_language(Path::new("Makefile")), None);
    }

    #[test]
    fn test_language_display_core() {
        assert_eq!(Language::Rust.to_string(), "Rust");
        assert_eq!(Language::Python.to_string(), "Python");
        assert_eq!(Language::TypeScript.to_string(), "TypeScript");
        assert_eq!(Language::Go.to_string(), "Go");
        assert_eq!(Language::C.to_string(), "C");
    }

    #[test]
    fn test_language_display_generic() {
        assert_eq!(Language::Generic("java".to_string()).to_string(), "java");
    }

    #[test]
    fn test_language_is_core() {
        assert!(Language::Rust.is_core());
        assert!(Language::Python.is_core());
        assert!(Language::TypeScript.is_core());
        assert!(Language::Go.is_core());
        assert!(Language::C.is_core());
        assert!(!Language::Generic("java".to_string()).is_core());
    }

    #[test]
    fn test_language_ts_pack_name() {
        assert_eq!(Language::Rust.ts_pack_name(), "rust");
        assert_eq!(Language::Python.ts_pack_name(), "python");
        assert_eq!(Language::TypeScript.ts_pack_name(), "typescript");
        assert_eq!(Language::Go.ts_pack_name(), "go");
        assert_eq!(Language::C.ts_pack_name(), "c");
        assert_eq!(Language::Generic("java".to_string()).ts_pack_name(), "java");
    }

    // The `unwrap_used` expectation sits on this test, not on the module:
    // this is the only `unwrap()` call, and the test is compiled out without
    // `lang-rust`, which would leave a module-level expectation unfulfilled.
    #[cfg(feature = "lang-rust")]
    #[test]
    #[expect(clippy::unwrap_used)]
    fn test_parse_file_rust() {
        let file = DiscoveredFile {
            path: PathBuf::from("src/main.rs"),
            content: "fn main() {\n    println!(\"hello\");\n}\n".to_string(),
            language: Some(Language::Rust),
            raw_token_count: 10,
        };
        let result = parse_file(&file, &Language::Rust).unwrap();
        assert_eq!(result.language, Language::Rust);
        assert!(result.strategies_data.contains_key(&OutputStrategy::Full));
        assert!(result.strategies_data.contains_key(&OutputStrategy::NoTests));
        assert!(result.strategies_data.contains_key(&OutputStrategy::Summary));
    }
}