Skip to main content

ast_doc_core/parser/lang/
generic_parser.rs

1//! Generic parser for any language supported by `tree-sitter-language-pack`.
2//!
3//! Uses the `process()` API to extract structured code intelligence
4//! (functions, classes, imports, etc.) and generates Full/NoTests/Summary
5//! strategy variants without requiring language-specific tree-sitter queries.
6
7use std::path::Path;
8
9use tree_sitter_language_pack::{ProcessConfig, StructureKind};
10
11use crate::{
12    error::AstDocError,
13    parser::{
14        Language, LanguageParser, ParsedFile,
15        strategy::{self, RemovalRange, RemovalReason},
16    },
17};
18
/// Fallback parser covering every language beyond the core 5.
///
/// Structure extraction is delegated to the `process()` API of
/// `tree-sitter-language-pack`, so no language-specific queries are needed.
#[derive(Debug)]
pub struct GenericParser {
    /// Name under which `tree-sitter-language-pack` knows the language.
    language_name: String,
}
27
28impl GenericParser {
29    /// Create a new generic parser for the given language name.
30    #[must_use]
31    pub fn new(language_name: &str) -> Self {
32        Self { language_name: language_name.to_string() }
33    }
34}
35
36impl LanguageParser for GenericParser {
37    fn parse(&self, source: &str, path: &Path) -> Result<ParsedFile, AstDocError> {
38        let config = ProcessConfig::new(&self.language_name).all();
39
40        let result = tree_sitter_language_pack::process(source, &config).map_err(|e| {
41            AstDocError::Parse {
42                path: path.to_path_buf(),
43                message: format!("Failed to process {} source: {e}", self.language_name),
44            }
45        })?;
46
47        let test_ranges = collect_test_ranges_from_structure(&result, source);
48        let summary_ranges = collect_summary_ranges_from_structure(&result, source);
49
50        let strategies_data = strategy::build_strategies(source, &test_ranges, &summary_ranges);
51
52        Ok(ParsedFile {
53            path: path.to_path_buf(),
54            language: Language::Generic(self.language_name.clone()),
55            source: source.to_string(),
56            strategies_data,
57        })
58    }
59}
60
/// Heuristic: decide from the name alone whether a function/method is a test
/// or a benchmark.
///
/// Matching is purely prefix-based, so a name that merely contains "test"
/// somewhere in the middle is not treated as a test.
fn is_test_name(name: &str) -> bool {
    // Prefixes that mark a test or benchmark on their own.
    const PREFIXES: [&str; 6] = [
        "test_",     // Python, Rust
        "Test",      // Go
        "it_",
        "should_",   // BDD style
        "bench_",    // Rust benchmarks
        "Benchmark", // Go benchmarks
    ];

    // Java/Kotlin `testXxx`: "test" immediately followed by an ASCII uppercase letter.
    let camel_case_test = name
        .strip_prefix("test")
        .and_then(|rest| rest.bytes().next())
        .is_some_and(|next| next.is_ascii_uppercase());

    camel_case_test || PREFIXES.iter().any(|prefix| name.starts_with(*prefix))
}
72
73/// Heuristic: check if a structure item looks like a test-related node.
74fn is_test_structure_item(item: &tree_sitter_language_pack::StructureItem) -> bool {
75    match &item.name {
76        Some(name) => is_test_name(name),
77        None => false,
78    }
79}
80
81/// Collect byte ranges for test functions/classes from the `process()` result.
82fn collect_test_ranges_from_structure(
83    result: &tree_sitter_language_pack::ProcessResult,
84    _source: &str,
85) -> Vec<RemovalRange> {
86    let mut ranges = Vec::new();
87
88    for item in &result.structure {
89        if is_test_structure_item(item) {
90            ranges.push(RemovalRange {
91                start: item.span.start_byte,
92                end: item.span.end_byte,
93                reason: match item.kind {
94                    StructureKind::Function | StructureKind::Method => RemovalReason::TestFunction,
95                    StructureKind::Class | StructureKind::Struct => RemovalReason::TestModule,
96                    _ => RemovalReason::TestFunction,
97                },
98            });
99        }
100    }
101
102    ranges
103}
104
105/// Collect byte ranges for Summary mode: implementation bodies of non-test functions.
106fn collect_summary_ranges_from_structure(
107    result: &tree_sitter_language_pack::ProcessResult,
108    source: &str,
109) -> Vec<RemovalRange> {
110    let mut ranges = Vec::new();
111
112    for item in &result.structure {
113        if is_test_structure_item(item) {
114            continue;
115        }
116
117        match item.kind {
118            StructureKind::Function | StructureKind::Method => {
119                // For functions/methods, try to find the body (the `{...}` block).
120                // The structure item span covers the entire function.
121                // We want to keep the signature and replace the body.
122                if let Some(range) = extract_body_range(source, &item.span) {
123                    ranges.push(range);
124                }
125            }
126            StructureKind::Class |
127            StructureKind::Struct |
128            StructureKind::Interface |
129            StructureKind::Enum |
130            StructureKind::Module |
131            StructureKind::Trait |
132            StructureKind::Impl |
133            StructureKind::Namespace |
134            StructureKind::Other(_) => {
135                // Keep these as-is in summary mode (just the declaration)
136            }
137        }
138    }
139
140    ranges
141}
142
143/// Try to find the implementation body range within a structure item.
144///
145/// Looks for the first `{` after the declaration keyword and matches it
146/// to the closing `}` to identify the body range.
147fn extract_body_range(
148    source: &str,
149    span: &tree_sitter_language_pack::Span,
150) -> Option<RemovalRange> {
151    let start = span.start_byte;
152    let end = span.end_byte;
153    if end > source.len() || start >= end {
154        return None;
155    }
156
157    let item_text = &source[start..end];
158
159    // Find the first `{` that likely starts the body
160    // Skip the first line (signature) and find the body
161    let body_start_in_item = find_body_open_brace(item_text)?;
162
163    let abs_body_start = start + body_start_in_item;
164    let abs_body_end = start + find_matching_brace(item_text, body_start_in_item)?;
165
166    if abs_body_end <= abs_body_start {
167        return None;
168    }
169
170    Some(RemovalRange {
171        start: abs_body_start,
172        end: abs_body_end + 1, // include the closing brace
173        reason: RemovalReason::Implementation,
174    })
175}
176
/// Find the position of the opening `{` that starts a function body.
///
/// Scans `text` byte by byte while tracking `(`/`)` and `<`/`>` nesting and
/// returns the offset of the first `{` that lies outside both, so braces inside
/// argument lists or generic arguments (e.g. `Foo<{ N }>`) are skipped.
///
/// A `>` that belongs to an arrow (`->`, `=>`) is not a closing angle bracket
/// and is ignored, and the angle depth never drops below zero. Otherwise the
/// arrow of a return type would cancel out the `<` of the generics that follow
/// it and a brace inside those generics would be mistaken for the body.
///
/// Returns `None` when no such brace exists (for example in brace-less
/// languages).
fn find_body_open_brace(text: &str) -> Option<usize> {
    let mut depth_paren = 0i32;
    let mut depth_angle = 0i32;
    let mut prev = 0u8;

    for (idx, &byte) in text.as_bytes().iter().enumerate() {
        match byte {
            b'(' => depth_paren += 1,
            b')' => depth_paren -= 1,
            b'<' => depth_angle += 1,
            // Only a `>` that closes a previously opened `<` counts.
            b'>' if depth_angle > 0 && prev != b'-' && prev != b'=' => depth_angle -= 1,
            b'{' if depth_paren == 0 && depth_angle == 0 => return Some(idx),
            _ => {}
        }
        prev = byte;
    }
    None
}
207
/// Find the closing `}` that balances the `{` at `open_pos`.
///
/// Counts `{` and `}` from `open_pos` onward and returns the offset (relative
/// to the start of `text`) of the `}` that brings the nesting back to zero.
/// Returns `None` if the braces never balance or `open_pos` is past the end.
///
/// NOTE(review): every brace byte is counted, including those inside string
/// literals or comments.
fn find_matching_brace(text: &str, open_pos: usize) -> Option<usize> {
    let tail = text.as_bytes().get(open_pos..)?;
    let mut nesting = 0i32;

    tail.iter()
        .position(|&byte| {
            if byte == b'{' {
                nesting += 1;
                false
            } else if byte == b'}' {
                nesting -= 1;
                nesting == 0
            } else {
                false
            }
        })
        .map(|offset| open_pos + offset)
}
227
#[cfg(test)]
#[expect(clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::config::OutputStrategy;

    /// Content generated for `strategy`, or `""` when that strategy is absent.
    fn get_strategy_content<'a>(parsed: &'a ParsedFile, strategy: &OutputStrategy) -> &'a str {
        parsed.strategies_data.get(strategy).map_or("", |s| s.content.as_str())
    }

    // Parser tests return early when the required grammar is not available.

    #[test]
    fn test_generic_parser_full_is_verbatim() {
        if !tree_sitter_language_pack::has_language("rust") {
            return;
        }
        let source = "fn main() {\n    println!(\"hello\");\n}\n";
        let parser = GenericParser::new("rust");
        let parsed = parser.parse(source, Path::new("test.rs")).unwrap();
        assert_eq!(get_strategy_content(&parsed, &OutputStrategy::Full), source);
    }

    #[test]
    fn test_generic_parser_creates_three_strategies() {
        if !tree_sitter_language_pack::has_language("rust") {
            return;
        }
        let source = "fn main() {\n    println!(\"hello\");\n}\n";
        let parser = GenericParser::new("rust");
        let parsed = parser.parse(source, Path::new("test.rs")).unwrap();
        assert!(parsed.strategies_data.contains_key(&OutputStrategy::Full));
        assert!(parsed.strategies_data.contains_key(&OutputStrategy::NoTests));
        assert!(parsed.strategies_data.contains_key(&OutputStrategy::Summary));
    }

    #[test]
    fn test_generic_parser_language_stored() {
        if !tree_sitter_language_pack::has_language("java") {
            return;
        }
        // Java source, so the input matches the parser under test.
        let source = "class Main {}\n";
        let parser = GenericParser::new("java");
        let parsed = parser.parse(source, Path::new("Main.java")).unwrap();
        assert_eq!(parsed.language, Language::Generic("java".to_string()));
    }

    #[test]
    fn test_is_test_name() {
        assert!(is_test_name("test_add"));
        assert!(is_test_name("TestAdd"));
        assert!(is_test_name("testAdd")); // Java/Kotlin testXxx
        assert!(is_test_name("it_works"));
        assert!(is_test_name("should_work"));
        assert!(is_test_name("bench_sort"));
        assert!(is_test_name("BenchmarkSort"));
        assert!(!is_test_name("add"));
        assert!(!is_test_name("main"));
        assert!(!is_test_name("test")); // bare prefix, nothing after it
        assert!(!is_test_name("testing")); // lowercase after "test"
        assert!(!is_test_name("get_test_value")); // contains "test" but doesn't match pattern
    }

    #[test]
    fn test_find_body_open_brace() {
        assert_eq!(find_body_open_brace("fn main() {\n}"), Some(10));
        assert_eq!(find_body_open_brace("fn f(x: Vec<u8>) -> u8 { x[0] }"), Some(23));
        // Brace-less languages have no body brace to find.
        assert_eq!(find_body_open_brace("def hello():\n    pass\n"), None);
    }

    #[test]
    fn test_find_matching_brace() {
        let text = "{ body }";
        assert_eq!(find_matching_brace(text, 0), Some(7));

        let text = "{ { nested } }";
        assert_eq!(find_matching_brace(text, 0), Some(13));

        let text = "{ unclosed";
        assert_eq!(find_matching_brace(text, 0), None);
    }

    #[test]
    fn test_generic_parser_with_python() {
        if !tree_sitter_language_pack::has_language("python") {
            return;
        }
        let source = "def hello():\n    pass\n";
        let parser = GenericParser::new("python");
        let parsed = parser.parse(source, Path::new("test.py")).unwrap();
        assert_eq!(get_strategy_content(&parsed, &OutputStrategy::Full), source);
    }

    #[test]
    fn test_generic_parser_empty_source() {
        if !tree_sitter_language_pack::has_language("java") {
            return;
        }
        let source = "";
        let parser = GenericParser::new("java");
        let parsed = parser.parse(source, Path::new("Empty.java")).unwrap();
        assert_eq!(get_strategy_content(&parsed, &OutputStrategy::Full), "");
    }

    proptest::proptest! {
        // Full output must reproduce arbitrary (even malformed) input verbatim.
        #[test]
        fn test_generic_parser_full_matches_source(source in "[a-zA-Z0-9 {}();\n\t]{0,200}") {
            if tree_sitter_language_pack::has_language("c") {
                let parser = GenericParser::new("c");
                let parsed = parser.parse(&source, Path::new("test.c")).unwrap();
                let full_data = parsed.strategies_data.get(&OutputStrategy::Full);
                proptest::prop_assert!(full_data.is_some());
                proptest::prop_assert_eq!(&full_data.unwrap().content, &source);
            }
        }
    }
}