Skip to main content

sqz_engine/
ast_parser.rs

//! Code signature extraction using tree-sitter, with line-pattern fallbacks.
//!
//! Supports 18 language identifiers. Languages with bundled tree-sitter grammars
//! (Rust, Python, JavaScript, Bash) use full AST parsing. All other languages
//! use a line-based prefix/substring matcher, which is fast but purely heuristic.
6
7use crate::error::{Result, SqzError};
8use std::collections::HashMap;
9use tree_sitter::{Language, Parser};
10
/// One import-like declaration captured from a source file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ImportDecl {
    /// Trimmed text of the source line on which the import starts.
    pub text: String,
}
16
/// The name and signature line of a function or method.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FunctionSignature {
    /// Identifier of the function; may be empty when no name could be determined.
    pub name: String,
    /// Trimmed source line on which the definition starts.
    pub signature: String,
}
23
/// The name and header line of a class-like item (class, struct, interface, ...).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ClassDefinition {
    /// Identifier of the item; may be empty when no name could be determined.
    pub name: String,
    /// Trimmed source line on which the definition starts.
    pub signature: String,
}
30
/// The name and declaration line of a type alias or other type declaration.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct TypeDeclaration {
    /// Identifier introduced by the declaration; may be empty when unknown.
    pub name: String,
    /// Trimmed source line on which the declaration starts.
    pub signature: String,
}
37
38/// Summary of extracted code structure.
39#[derive(Debug, Clone)]
40pub struct CodeSummary {
41    pub imports: Vec<ImportDecl>,
42    pub functions: Vec<FunctionSignature>,
43    pub classes: Vec<ClassDefinition>,
44    pub types: Vec<TypeDeclaration>,
45    pub tokens_original: u32,
46    pub tokens_summary: u32,
47}
48
49impl CodeSummary {
50    /// Render the summary as a compact text representation.
51    pub fn to_text(&self) -> String {
52        let mut parts = Vec::new();
53        for imp in &self.imports {
54            parts.push(imp.text.clone());
55        }
56        for cls in &self.classes {
57            parts.push(cls.signature.clone());
58        }
59        for ty in &self.types {
60            parts.push(ty.signature.clone());
61        }
62        for func in &self.functions {
63            parts.push(func.signature.clone());
64        }
65        parts.join("\n")
66    }
67}
68
/// Approximates a token count as the UTF-8 byte length divided by four, rounded up.
///
/// Note that this counts bytes (`str::len`), not characters, so non-ASCII
/// text is weighted by its encoded size. The result saturates at `u32::MAX`.
fn approx_tokens(s: &str) -> u32 {
    let bytes = s.len();
    // Ceiling division without risking overflow on `bytes + 3`.
    let quarters = bytes / 4 + usize::from(bytes % 4 != 0);
    quarters.min(u32::MAX as usize) as u32
}
73
74// ---------------------------------------------------------------------------
75// Tree-sitter based extractors
76// ---------------------------------------------------------------------------
77
78fn extract_line(source: &str, node: &tree_sitter::Node) -> String {
79    let start = node.start_position().row;
80    let end = node.end_position().row;
81    let lines: Vec<&str> = source.lines().collect();
82    if start < lines.len() {
83        if start == end {
84            lines[start].trim().to_string()
85        } else {
86            // Multi-line: take first line only (signature line)
87            lines[start].trim().to_string()
88        }
89    } else {
90        String::new()
91    }
92}
93
94fn node_text<'a>(source: &'a str, node: &tree_sitter::Node) -> &'a str {
95    &source[node.byte_range()]
96}
97
98/// Extract signatures from Rust source using tree-sitter.
99fn extract_rust(source: &str, language: Language) -> Result<CodeSummary> {
100    let mut parser = Parser::new();
101    parser
102        .set_language(&language)
103        .map_err(|e| SqzError::Other(format!("tree-sitter language error: {e}")))?;
104
105    let tree = parser
106        .parse(source, None)
107        .ok_or_else(|| SqzError::Other("tree-sitter parse failed".into()))?;
108
109    let root = tree.root_node();
110    let mut imports = Vec::new();
111    let mut functions = Vec::new();
112    let mut classes = Vec::new();
113    let mut types = Vec::new();
114
115    let mut cursor = root.walk();
116    // Walk top-level items
117    for child in root.children(&mut cursor) {
118        match child.kind() {
119            "use_declaration" => {
120                imports.push(ImportDecl {
121                    text: extract_line(source, &child),
122                });
123            }
124            "function_item" => {
125                let name = child
126                    .child_by_field_name("name")
127                    .map(|n| node_text(source, &n).to_string())
128                    .unwrap_or_default();
129                functions.push(FunctionSignature {
130                    name,
131                    signature: extract_line(source, &child),
132                });
133            }
134            "struct_item" | "enum_item" | "impl_item" | "trait_item" => {
135                let name = child
136                    .child_by_field_name("name")
137                    .map(|n| node_text(source, &n).to_string())
138                    .unwrap_or_default();
139                classes.push(ClassDefinition {
140                    name,
141                    signature: extract_line(source, &child),
142                });
143            }
144            "type_item" => {
145                let name = child
146                    .child_by_field_name("name")
147                    .map(|n| node_text(source, &n).to_string())
148                    .unwrap_or_default();
149                types.push(TypeDeclaration {
150                    name,
151                    signature: extract_line(source, &child),
152                });
153            }
154            _ => {}
155        }
156    }
157
158    let summary_text = {
159        let mut parts = Vec::new();
160        for i in &imports {
161            parts.push(i.text.clone());
162        }
163        for c in &classes {
164            parts.push(c.signature.clone());
165        }
166        for t in &types {
167            parts.push(t.signature.clone());
168        }
169        for f in &functions {
170            parts.push(f.signature.clone());
171        }
172        parts.join("\n")
173    };
174
175    Ok(CodeSummary {
176        imports,
177        functions,
178        classes,
179        types,
180        tokens_original: approx_tokens(source),
181        tokens_summary: approx_tokens(&summary_text),
182    })
183}
184
185/// Extract signatures from Python source using tree-sitter.
186fn extract_python(source: &str, language: Language) -> Result<CodeSummary> {
187    let mut parser = Parser::new();
188    parser
189        .set_language(&language)
190        .map_err(|e| SqzError::Other(format!("tree-sitter language error: {e}")))?;
191
192    let tree = parser
193        .parse(source, None)
194        .ok_or_else(|| SqzError::Other("tree-sitter parse failed".into()))?;
195
196    let root = tree.root_node();
197    let mut imports = Vec::new();
198    let mut functions = Vec::new();
199    let mut classes = Vec::new();
200    let types = Vec::new();
201
202    let mut cursor = root.walk();
203    for child in root.children(&mut cursor) {
204        match child.kind() {
205            "import_statement" | "import_from_statement" => {
206                imports.push(ImportDecl {
207                    text: extract_line(source, &child),
208                });
209            }
210            "function_definition" => {
211                let name = child
212                    .child_by_field_name("name")
213                    .map(|n| node_text(source, &n).to_string())
214                    .unwrap_or_default();
215                functions.push(FunctionSignature {
216                    name,
217                    signature: extract_line(source, &child),
218                });
219            }
220            "class_definition" => {
221                let name = child
222                    .child_by_field_name("name")
223                    .map(|n| node_text(source, &n).to_string())
224                    .unwrap_or_default();
225                classes.push(ClassDefinition {
226                    name,
227                    signature: extract_line(source, &child),
228                });
229            }
230            _ => {}
231        }
232    }
233
234    let summary_text = build_summary_text(&imports, &functions, &classes, &types);
235    Ok(CodeSummary {
236        imports,
237        functions,
238        classes,
239        types,
240        tokens_original: approx_tokens(source),
241        tokens_summary: approx_tokens(&summary_text),
242    })
243}
244
245/// Extract signatures from JavaScript source using tree-sitter.
246fn extract_javascript(source: &str, language: Language) -> Result<CodeSummary> {
247    let mut parser = Parser::new();
248    parser
249        .set_language(&language)
250        .map_err(|e| SqzError::Other(format!("tree-sitter language error: {e}")))?;
251
252    let tree = parser
253        .parse(source, None)
254        .ok_or_else(|| SqzError::Other("tree-sitter parse failed".into()))?;
255
256    let root = tree.root_node();
257    let mut imports = Vec::new();
258    let mut functions = Vec::new();
259    let mut classes = Vec::new();
260    let types = Vec::new();
261
262    let mut cursor = root.walk();
263    for child in root.children(&mut cursor) {
264        match child.kind() {
265            "import_statement" => {
266                imports.push(ImportDecl {
267                    text: extract_line(source, &child),
268                });
269            }
270            "function_declaration" => {
271                let name = child
272                    .child_by_field_name("name")
273                    .map(|n| node_text(source, &n).to_string())
274                    .unwrap_or_default();
275                functions.push(FunctionSignature {
276                    name,
277                    signature: extract_line(source, &child),
278                });
279            }
280            "class_declaration" => {
281                let name = child
282                    .child_by_field_name("name")
283                    .map(|n| node_text(source, &n).to_string())
284                    .unwrap_or_default();
285                classes.push(ClassDefinition {
286                    name,
287                    signature: extract_line(source, &child),
288                });
289            }
290            "lexical_declaration" | "variable_declaration" => {
291                // Capture exported const fn = () => {} style
292                let line = extract_line(source, &child);
293                if line.contains("function") || line.contains("=>") {
294                    let name = child
295                        .named_child(0)
296                        .and_then(|d| d.child_by_field_name("name"))
297                        .map(|n| node_text(source, &n).to_string())
298                        .unwrap_or_default();
299                    if !name.is_empty() {
300                        functions.push(FunctionSignature {
301                            name,
302                            signature: line,
303                        });
304                    }
305                }
306            }
307            _ => {}
308        }
309    }
310
311    let summary_text = build_summary_text(&imports, &functions, &classes, &types);
312    Ok(CodeSummary {
313        imports,
314        functions,
315        classes,
316        types,
317        tokens_original: approx_tokens(source),
318        tokens_summary: approx_tokens(&summary_text),
319    })
320}
321
322/// Extract signatures from Bash source using tree-sitter.
323fn extract_bash(source: &str, language: Language) -> Result<CodeSummary> {
324    let mut parser = Parser::new();
325    parser
326        .set_language(&language)
327        .map_err(|e| SqzError::Other(format!("tree-sitter language error: {e}")))?;
328
329    let tree = parser
330        .parse(source, None)
331        .ok_or_else(|| SqzError::Other("tree-sitter parse failed".into()))?;
332
333    let root = tree.root_node();
334    let mut functions = Vec::new();
335    let imports = Vec::new();
336    let classes = Vec::new();
337    let types = Vec::new();
338
339    let mut cursor = root.walk();
340    for child in root.children(&mut cursor) {
341        if child.kind() == "function_definition" {
342            let name = child
343                .child_by_field_name("name")
344                .map(|n| node_text(source, &n).to_string())
345                .unwrap_or_default();
346            functions.push(FunctionSignature {
347                name,
348                signature: extract_line(source, &child),
349            });
350        }
351    }
352
353    let summary_text = build_summary_text(&imports, &functions, &classes, &types);
354    Ok(CodeSummary {
355        imports,
356        functions,
357        classes,
358        types,
359        tokens_original: approx_tokens(source),
360        tokens_summary: approx_tokens(&summary_text),
361    })
362}
363
364fn build_summary_text(
365    imports: &[ImportDecl],
366    functions: &[FunctionSignature],
367    classes: &[ClassDefinition],
368    types: &[TypeDeclaration],
369) -> String {
370    let mut parts = Vec::new();
371    for i in imports {
372        parts.push(i.text.clone());
373    }
374    for c in classes {
375        parts.push(c.signature.clone());
376    }
377    for t in types {
378        parts.push(t.signature.clone());
379    }
380    for f in functions {
381        parts.push(f.signature.clone());
382    }
383    parts.join("\n")
384}
385
386// ---------------------------------------------------------------------------
387// Regex-based extractors for languages without 0.21 grammar crates
388// ---------------------------------------------------------------------------
389
/// Generic line-pattern extractor. Each language provides its own patterns.
///
/// Despite the name, the patterns are plain string literals compared against
/// each trimmed source line, not regular expressions.
struct RegexExtractor {
    /// Literals identifying import lines.
    import_patterns: Vec<&'static str>,
    /// Literals identifying function or method lines.
    function_patterns: Vec<&'static str>,
    /// Literals identifying class-like definition lines.
    class_patterns: Vec<&'static str>,
    /// Literals identifying type declaration lines.
    type_patterns: Vec<&'static str>,
}
397
398impl RegexExtractor {
399    fn extract(&self, source: &str) -> CodeSummary {
400        use std::collections::HashSet;
401
402        let mut imports = Vec::new();
403        let mut functions = Vec::new();
404        let mut classes = Vec::new();
405        let mut types = Vec::new();
406        let mut seen: HashSet<String> = HashSet::new();
407
408        for line in source.lines() {
409            let trimmed = line.trim();
410            if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with('#') {
411                continue;
412            }
413
414            for pat in &self.import_patterns {
415                if trimmed.starts_with(pat) && seen.insert(trimmed.to_string()) {
416                    imports.push(ImportDecl {
417                        text: trimmed.to_string(),
418                    });
419                    break;
420                }
421            }
422
423            for pat in &self.function_patterns {
424                if trimmed.contains(pat) && seen.insert(trimmed.to_string()) {
425                    // Extract name heuristically: word after the keyword
426                    let name = extract_name_after(trimmed, pat);
427                    functions.push(FunctionSignature {
428                        name,
429                        signature: trimmed.to_string(),
430                    });
431                    break;
432                }
433            }
434
435            for pat in &self.class_patterns {
436                if trimmed.starts_with(pat) && seen.insert(trimmed.to_string()) {
437                    let name = extract_name_after(trimmed, pat);
438                    classes.push(ClassDefinition {
439                        name,
440                        signature: trimmed.to_string(),
441                    });
442                    break;
443                }
444            }
445
446            for pat in &self.type_patterns {
447                if trimmed.starts_with(pat) && seen.insert(trimmed.to_string()) {
448                    let name = extract_name_after(trimmed, pat);
449                    types.push(TypeDeclaration {
450                        name,
451                        signature: trimmed.to_string(),
452                    });
453                    break;
454                }
455            }
456        }
457
458        let summary_text = build_summary_text(&imports, &functions, &classes, &types);
459        CodeSummary {
460            imports,
461            functions,
462            classes,
463            types,
464            tokens_original: approx_tokens(source),
465            tokens_summary: approx_tokens(&summary_text),
466        }
467    }
468}
469
/// Heuristically extracts the identifier that follows `keyword` in `line`.
///
/// The text after the first occurrence of `keyword` is scanned for a leading
/// run of alphanumeric or `_` characters. One leading parenthesised group
/// (e.g. a Go method receiver, `func (s *Server) Start()`) and one leading
/// angle-bracket group (e.g. a generic parameter list, `fun <T> id(x: T)`)
/// are skipped first so that the name after them is found.
///
/// Returns an empty string when `keyword` does not occur in `line` or when
/// no identifier follows it.
fn extract_name_after(line: &str, keyword: &str) -> String {
    /// If `text` opens with `open`, returns what follows the matching `close`.
    fn after_group(text: &str, open: char, close: char) -> Option<&str> {
        if !text.starts_with(open) {
            return None;
        }
        let mut depth = 0usize;
        for (idx, ch) in text.char_indices() {
            if ch == open {
                depth += 1;
            } else if ch == close {
                // `text` starts with `open`, so `depth` is at least 1 here.
                depth -= 1;
                if depth == 0 {
                    return Some(&text[idx + ch.len_utf8()..]);
                }
            }
        }
        // Unbalanced group: leave the text untouched.
        None
    }

    // `split_once` avoids slicing at a guessed offset, which could panic
    // when the keyword is not actually present in the line.
    let mut rest = match line.split_once(keyword) {
        Some((_, tail)) => tail.trim_start(),
        None => return String::new(),
    };

    for (open, close) in [('(', ')'), ('<', '>')] {
        if let Some(tail) = after_group(rest, open, close) {
            rest = tail.trim_start();
        }
    }

    rest.split(|c: char| !c.is_alphanumeric() && c != '_')
        .next()
        .unwrap_or("")
        .to_string()
}
477
478fn go_extractor() -> RegexExtractor {
479    RegexExtractor {
480        import_patterns: vec!["import "],
481        function_patterns: vec!["func "],
482        class_patterns: vec!["type "],
483        type_patterns: vec![],
484    }
485}
486
487fn java_extractor() -> RegexExtractor {
488    RegexExtractor {
489        import_patterns: vec!["import "],
490        function_patterns: vec![
491            "public ",
492            "private ",
493            "protected ",
494            "static ",
495            "void ",
496            "int ",
497            "String ",
498        ],
499        class_patterns: vec!["class ", "interface ", "enum ", "record "],
500        type_patterns: vec![],
501    }
502}
503
504fn c_extractor() -> RegexExtractor {
505    RegexExtractor {
506        import_patterns: vec!["#include"],
507        function_patterns: vec![],
508        class_patterns: vec!["struct ", "union ", "enum "],
509        type_patterns: vec!["typedef "],
510    }
511}
512
513fn cpp_extractor() -> RegexExtractor {
514    RegexExtractor {
515        import_patterns: vec!["#include"],
516        function_patterns: vec![],
517        class_patterns: vec!["class ", "struct ", "union ", "enum "],
518        type_patterns: vec!["typedef ", "using "],
519    }
520}
521
522fn ruby_extractor() -> RegexExtractor {
523    RegexExtractor {
524        import_patterns: vec!["require ", "require_relative "],
525        function_patterns: vec!["def "],
526        class_patterns: vec!["class ", "module "],
527        type_patterns: vec![],
528    }
529}
530
531fn json_extractor() -> RegexExtractor {
532    // JSON has no functions/classes; just return top-level keys
533    RegexExtractor {
534        import_patterns: vec![],
535        function_patterns: vec![],
536        class_patterns: vec![],
537        type_patterns: vec![],
538    }
539}
540
541fn html_extractor() -> RegexExtractor {
542    RegexExtractor {
543        import_patterns: vec!["<link", "<script"],
544        function_patterns: vec![],
545        class_patterns: vec![],
546        type_patterns: vec![],
547    }
548}
549
550fn css_extractor() -> RegexExtractor {
551    RegexExtractor {
552        import_patterns: vec!["@import"],
553        function_patterns: vec![],
554        class_patterns: vec![],
555        type_patterns: vec!["@keyframes", "@media", "@mixin"],
556    }
557}
558
559fn typescript_extractor() -> RegexExtractor {
560    RegexExtractor {
561        import_patterns: vec!["import "],
562        function_patterns: vec!["function ", "async function ", "export function ", "export async function "],
563        class_patterns: vec!["class ", "interface ", "abstract class "],
564        type_patterns: vec!["type ", "enum "],
565    }
566}
567
568fn csharp_extractor() -> RegexExtractor {
569    RegexExtractor {
570        import_patterns: vec!["using "],
571        function_patterns: vec![
572            "public ",
573            "private ",
574            "protected ",
575            "internal ",
576            "static ",
577            "override ",
578            "virtual ",
579            "abstract ",
580        ],
581        class_patterns: vec!["class ", "interface ", "struct ", "enum ", "record "],
582        type_patterns: vec![],
583    }
584}
585
586fn kotlin_extractor() -> RegexExtractor {
587    RegexExtractor {
588        import_patterns: vec!["import "],
589        function_patterns: vec!["fun "],
590        class_patterns: vec!["class ", "interface ", "object ", "data class ", "sealed class "],
591        type_patterns: vec!["typealias "],
592    }
593}
594
595fn swift_extractor() -> RegexExtractor {
596    RegexExtractor {
597        import_patterns: vec!["import "],
598        function_patterns: vec!["func "],
599        class_patterns: vec!["class ", "struct ", "enum ", "protocol ", "extension "],
600        type_patterns: vec!["typealias "],
601    }
602}
603
604fn toml_extractor() -> RegexExtractor {
605    RegexExtractor {
606        import_patterns: vec![],
607        function_patterns: vec![],
608        class_patterns: vec!["["],
609        type_patterns: vec![],
610    }
611}
612
613fn yaml_extractor() -> RegexExtractor {
614    RegexExtractor {
615        import_patterns: vec![],
616        function_patterns: vec![],
617        class_patterns: vec![],
618        type_patterns: vec![],
619    }
620}
621
622// ---------------------------------------------------------------------------
623// AstParser
624// ---------------------------------------------------------------------------
625
626/// Tree-sitter based code structure extractor supporting 18+ languages.
627pub struct AstParser {
628    grammars: HashMap<String, Language>,
629}
630
631impl AstParser {
632    /// Create a new `AstParser` loading all bundled grammars.
633    pub fn new() -> Self {
634        let mut grammars = HashMap::new();
635        grammars.insert("rust".to_string(), tree_sitter_rust::language());
636        grammars.insert("python".to_string(), tree_sitter_python::language());
637        grammars.insert("javascript".to_string(), tree_sitter_javascript::language());
638        grammars.insert("bash".to_string(), tree_sitter_bash::language());
639        AstParser { grammars }
640    }
641
642    /// Returns the list of supported language identifiers.
643    pub fn supported_languages(&self) -> &[&'static str] {
644        &[
645            "rust",
646            "python",
647            "javascript",
648            "typescript",
649            "go",
650            "java",
651            "c",
652            "cpp",
653            "ruby",
654            "bash",
655            "json",
656            "html",
657            "css",
658            "csharp",
659            "kotlin",
660            "swift",
661            "toml",
662            "yaml",
663        ]
664    }
665
666    /// Returns true if the given language identifier is supported.
667    pub fn is_supported(&self, language: &str) -> bool {
668        self.supported_languages().contains(&language)
669    }
670
671    /// Extract code signatures from `source` written in `language`.
672    ///
673    /// Returns the file unchanged (as a single-import entry) for unsupported
674    /// languages, per Requirement 19.4.
675    pub fn extract_signatures(&self, source: &str, language: &str) -> Result<CodeSummary> {
676        if !self.is_supported(language) {
677            eprintln!("AstParser: unsupported language '{language}', returning source unchanged");
678            return Err(SqzError::UnsupportedLanguage(language.to_string()));
679        }
680
681        match language {
682            "rust" => {
683                let lang = self.grammars["rust"].clone();
684                extract_rust(source, lang)
685            }
686            "python" => {
687                let lang = self.grammars["python"].clone();
688                extract_python(source, lang)
689            }
690            "javascript" => {
691                let lang = self.grammars["javascript"].clone();
692                extract_javascript(source, lang)
693            }
694            "bash" => {
695                let lang = self.grammars["bash"].clone();
696                extract_bash(source, lang)
697            }
698            "typescript" => Ok(typescript_extractor().extract(source)),
699            "go" => Ok(go_extractor().extract(source)),
700            "java" => Ok(java_extractor().extract(source)),
701            "c" => Ok(c_extractor().extract(source)),
702            "cpp" => Ok(cpp_extractor().extract(source)),
703            "ruby" => Ok(ruby_extractor().extract(source)),
704            "json" => Ok(json_extractor().extract(source)),
705            "html" => Ok(html_extractor().extract(source)),
706            "css" => Ok(css_extractor().extract(source)),
707            "csharp" => Ok(csharp_extractor().extract(source)),
708            "kotlin" => Ok(kotlin_extractor().extract(source)),
709            "swift" => Ok(swift_extractor().extract(source)),
710            "toml" => Ok(toml_extractor().extract(source)),
711            "yaml" => Ok(yaml_extractor().extract(source)),
712            _ => unreachable!("is_supported check above covers all cases"),
713        }
714    }
715}
716
717impl Default for AstParser {
718    fn default() -> Self {
719        Self::new()
720    }
721}
722
723// ---------------------------------------------------------------------------
724// Tests
725// ---------------------------------------------------------------------------
726
727#[cfg(test)]
728mod tests {
729    use super::*;
730
731    #[test]
732    fn test_supported_languages_count() {
733        let parser = AstParser::new();
734        assert!(
735            parser.supported_languages().len() >= 18,
736            "must support 18+ languages"
737        );
738    }
739
740    #[test]
741    fn test_is_supported() {
742        let parser = AstParser::new();
743        assert!(parser.is_supported("rust"));
744        assert!(parser.is_supported("python"));
745        assert!(parser.is_supported("go"));
746        assert!(!parser.is_supported("cobol"));
747        assert!(!parser.is_supported(""));
748    }
749
750    #[test]
751    fn test_unsupported_language_returns_error() {
752        let parser = AstParser::new();
753        let result = parser.extract_signatures("fn main() {}", "cobol");
754        assert!(matches!(result, Err(SqzError::UnsupportedLanguage(_))));
755    }
756
757    #[test]
758    fn test_rust_extraction() {
759        let parser = AstParser::new();
760        let source = r#"
761use std::collections::HashMap;
762
763pub struct Foo {
764    x: i32,
765}
766
767pub fn bar(x: i32) -> i32 {
768    x + 1
769}
770
771pub type MyType = Vec<i32>;
772"#;
773        let summary = parser.extract_signatures(source, "rust").unwrap();
774        assert!(!summary.functions.is_empty());
775        assert!(!summary.classes.is_empty());
776        assert!(!summary.imports.is_empty());
777        assert!(summary.tokens_summary < summary.tokens_original);
778    }
779
780    #[test]
781    fn test_python_extraction() {
782        let parser = AstParser::new();
783        let source = r#"
784import os
785from typing import List
786
787class MyClass:
788    def __init__(self):
789        pass
790
791def my_function(x: int) -> int:
792    return x + 1
793"#;
794        let summary = parser.extract_signatures(source, "python").unwrap();
795        assert!(!summary.functions.is_empty());
796        assert!(!summary.classes.is_empty());
797        assert!(!summary.imports.is_empty());
798    }
799
800    #[test]
801    fn test_go_extraction() {
802        let parser = AstParser::new();
803        let source = r#"
804package main
805
806import "fmt"
807
808type Server struct {
809    port int
810}
811
812func NewServer(port int) *Server {
813    return &Server{port: port}
814}
815
816func (s *Server) Start() error {
817    fmt.Println("starting")
818    return nil
819}
820"#;
821        let summary = parser.extract_signatures(source, "go").unwrap();
822        assert!(!summary.functions.is_empty());
823        assert!(!summary.imports.is_empty());
824    }
825
826    #[test]
827    fn test_compression_ratio() {
828        let parser = AstParser::new();
829        // A large Rust file with lots of implementation details
830        let source = r#"
831use std::collections::HashMap;
832use std::sync::Arc;
833
834/// A complex data structure with lots of implementation
835pub struct ComplexStruct {
836    field1: i32,
837    field2: String,
838    field3: Vec<u8>,
839    field4: HashMap<String, i32>,
840}
841
842impl ComplexStruct {
843    pub fn new() -> Self {
844        Self {
845            field1: 0,
846            field2: String::new(),
847            field3: Vec::new(),
848            field4: HashMap::new(),
849        }
850    }
851
852    pub fn process(&self, input: &str) -> Result<String, Box<dyn std::error::Error>> {
853        // lots of implementation
854        let mut result = String::new();
855        for c in input.chars() {
856            result.push(c);
857            result.push(' ');
858        }
859        Ok(result)
860    }
861
862    fn internal_helper(&self) -> i32 {
863        self.field1 * 2
864    }
865}
866
867pub fn standalone_function(x: i32, y: i32) -> i32 {
868    // implementation
869    let temp = x + y;
870    let temp2 = temp * 2;
871    temp2 - x
872}
873
874pub type MyAlias = Arc<ComplexStruct>;
875"#;
876        let summary = parser.extract_signatures(source, "rust").unwrap();
877        assert!(
878            summary.tokens_summary < summary.tokens_original,
879            "summary ({}) should be smaller than original ({})",
880            summary.tokens_summary,
881            summary.tokens_original
882        );
883    }
884
885    // -----------------------------------------------------------------------
886    // Property 25: AST extraction preserves public API
887    // -----------------------------------------------------------------------
888
889    /// **Property 25: AST extraction preserves public API**
890    ///
891    /// **Validates: Requirements 19.2, 19.3**
892    ///
893    /// For any source code in a supported language containing at least one
894    /// public function/class definition, `extract_signatures` SHALL produce a
895    /// `CodeSummary` where `tokens_summary < tokens_original` (compression
896    /// occurred) and the function/class names appear in the summary output.
897    #[cfg(test)]
898    mod prop25 {
899        use super::*;
900        use proptest::prelude::*;
901
902        /// Generate a Rust source file with N functions and M structs,
903        /// each with a body of `body_lines` lines of filler code.
904        /// This ensures tokens_original >> tokens_summary.
905        fn arb_rust_source() -> impl Strategy<Value = String> {
906            (
907                1usize..=5,   // number of functions
908                1usize..=3,   // number of structs
909                5usize..=20,  // body lines per function (filler)
910            )
911                .prop_map(|(n_fns, n_structs, body_lines)| {
912                    let mut src = String::new();
913                    src.push_str("use std::collections::HashMap;\n\n");
914
915                    for i in 0..n_structs {
916                        src.push_str(&format!("pub struct MyStruct{i} {{\n"));
917                        src.push_str("    field_a: i32,\n");
918                        src.push_str("    field_b: String,\n");
919                        src.push_str("    field_c: Vec<u8>,\n");
920                        src.push_str("}\n\n");
921                    }
922
923                    for i in 0..n_fns {
924                        src.push_str(&format!(
925                            "pub fn my_function_{i}(x: i32, y: i32) -> i32 {{\n"
926                        ));
927                        for j in 0..body_lines {
928                            src.push_str(&format!(
929                                "    let _var_{j} = x + y + {j};\n"
930                            ));
931                        }
932                        src.push_str("    x + y\n");
933                        src.push_str("}\n\n");
934                    }
935                    src
936                })
937        }
938
939        /// Generate a Python source file with N functions and M classes.
940        fn arb_python_source() -> impl Strategy<Value = String> {
941            (
942                1usize..=5,
943                1usize..=3,
944                5usize..=20,
945            )
946                .prop_map(|(n_fns, n_classes, body_lines)| {
947                    let mut src = String::new();
948                    src.push_str("import os\nimport sys\nfrom typing import List, Dict\n\n");
949
950                    for i in 0..n_classes {
951                        src.push_str(&format!("class MyClass{i}:\n"));
952                        src.push_str("    def __init__(self):\n");
953                        for j in 0..body_lines {
954                            src.push_str(&format!("        self.field_{j} = {j}\n"));
955                        }
956                        src.push('\n');
957                    }
958
959                    for i in 0..n_fns {
960                        src.push_str(&format!("def my_function_{i}(x, y):\n"));
961                        for j in 0..body_lines {
962                            src.push_str(&format!("    var_{j} = x + y + {j}\n"));
963                        }
964                        src.push_str("    return x + y\n\n");
965                    }
966                    src
967                })
968        }
969
970        proptest! {
971            /// **Property 25: AST extraction preserves public API (Rust)**
972            ///
973            /// **Validates: Requirements 19.2, 19.3**
974            #[test]
975            fn prop25_ast_preserves_public_api_rust(source in arb_rust_source()) {
976                let parser = AstParser::new();
977                let summary = parser.extract_signatures(&source, "rust")
978                    .expect("rust extraction should succeed");
979
980                // Compression occurred
981                prop_assert!(
982                    summary.tokens_summary < summary.tokens_original,
983                    "tokens_summary ({}) must be < tokens_original ({})",
984                    summary.tokens_summary,
985                    summary.tokens_original
986                );
987
988                // Function names appear in summary
989                let summary_text = summary.to_text();
990                for func in &summary.functions {
991                    prop_assert!(
992                        summary_text.contains(&func.name),
993                        "function name '{}' must appear in summary",
994                        func.name
995                    );
996                }
997
998                // Class/struct names appear in summary
999                for cls in &summary.classes {
1000                    prop_assert!(
1001                        summary_text.contains(&cls.name),
1002                        "class name '{}' must appear in summary",
1003                        cls.name
1004                    );
1005                }
1006
1007                // At least one function or class was extracted
1008                prop_assert!(
1009                    !summary.functions.is_empty() || !summary.classes.is_empty(),
1010                    "must extract at least one function or class"
1011                );
1012            }
1013
1014            /// **Property 25: AST extraction preserves public API (Python)**
1015            ///
1016            /// **Validates: Requirements 19.2, 19.3**
1017            #[test]
1018            fn prop25_ast_preserves_public_api_python(source in arb_python_source()) {
1019                let parser = AstParser::new();
1020                let summary = parser.extract_signatures(&source, "python")
1021                    .expect("python extraction should succeed");
1022
1023                // Compression occurred
1024                prop_assert!(
1025                    summary.tokens_summary < summary.tokens_original,
1026                    "tokens_summary ({}) must be < tokens_original ({})",
1027                    summary.tokens_summary,
1028                    summary.tokens_original
1029                );
1030
1031                // Function names appear in summary
1032                let summary_text = summary.to_text();
1033                for func in &summary.functions {
1034                    prop_assert!(
1035                        summary_text.contains(&func.name),
1036                        "function name '{}' must appear in summary",
1037                        func.name
1038                    );
1039                }
1040
1041                // Class names appear in summary
1042                for cls in &summary.classes {
1043                    prop_assert!(
1044                        summary_text.contains(&cls.name),
1045                        "class name '{}' must appear in summary",
1046                        cls.name
1047                    );
1048                }
1049
1050                // At least one function or class was extracted
1051                prop_assert!(
1052                    !summary.functions.is_empty() || !summary.classes.is_empty(),
1053                    "must extract at least one function or class"
1054                );
1055            }
1056        }
1057    }
1058}