bcore_mutation/
ast_analysis.rs

1use crate::error::Result;
2use regex::Regex;
3use std::collections::HashMap;
4
5/// Represents different types of AST nodes
6#[derive(Debug, Clone, PartialEq)]
7pub enum AstNodeType {
8    // Simple nodes (no body)
9    FunctionCall,
10    VariableDeclaration,
11    Assignment,
12    Literal,
13    Identifier,
14    BinaryOperator,
15    UnaryOperator,
16
17    // Compound nodes (have body/children)
18    IfStatement,
19    ForLoop,
20    WhileLoop,
21    Block,
22    Function,
23    Class,
24    Namespace,
25}
26
27/// Represents a node in the AST
28#[derive(Debug, Clone)]
29pub struct AstNode {
30    pub node_type: AstNodeType,
31    pub content: String,
32    pub line_number: usize,
33    pub column_start: usize,
34    #[allow(dead_code)]
35    pub column_end: usize,
36    pub children: Vec<AstNode>,
37}
38
39impl AstNode {
40    pub fn new(
41        node_type: AstNodeType,
42        content: String,
43        line_number: usize,
44        column_start: usize,
45        column_end: usize,
46    ) -> Self {
47        Self {
48            node_type,
49            content,
50            line_number,
51            column_start,
52            column_end,
53            children: Vec::new(),
54        }
55    }
56
57    #[allow(dead_code)]
58    pub fn add_child(&mut self, child: AstNode) {
59        self.children.push(child);
60    }
61
62    pub fn is_simple(&self) -> bool {
63        matches!(
64            self.node_type,
65            AstNodeType::FunctionCall
66                | AstNodeType::VariableDeclaration
67                | AstNodeType::Assignment
68                | AstNodeType::Literal
69                | AstNodeType::Identifier
70                | AstNodeType::BinaryOperator
71                | AstNodeType::UnaryOperator
72        )
73    }
74
75    #[allow(dead_code)]
76    pub fn is_compound(&self) -> bool {
77        !self.is_simple()
78    }
79}
80
81/// Expert knowledge for detecting arid nodes
82pub struct ExpertKnowledge {
83    arid_function_patterns: Vec<Regex>,
84    arid_variable_patterns: Vec<Regex>,
85    arid_statement_patterns: Vec<Regex>,
86    arid_namespace_patterns: Vec<Regex>,
87}
88
89impl ExpertKnowledge {
90    pub fn new() -> Result<Self> {
91        let arid_function_patterns = vec![
92            // Memory management functions
93            Regex::new(r"std::vector<.*>::reserve")?,
94            Regex::new(r"std::vector<.*>::resize")?,
95            Regex::new(r"std::.*::reserve")?,
96            Regex::new(r"\.reserve\s*\(")?,
97            Regex::new(r"\.resize\s*\(")?,
98            // I/O operations (typically not unit tested)
99            Regex::new(r"std::cout\s*<<")?,
100            Regex::new(r"std::cerr\s*<<")?,
101            Regex::new(r"printf\s*\(")?,
102            Regex::new(r"fprintf\s*\(")?,
103            Regex::new(r"std::endl")?,
104            // Logging functions
105            Regex::new(r"LogPrintf\s*\(")?,
106            Regex::new(r"LogPrint\s*\(")?,
107            Regex::new(r"log\.|logger\.|logging\.")?,
108            // Debug/trace functions
109            Regex::new(r"assert\s*\(")?,
110            Regex::new(r"DEBUG_")?,
111            Regex::new(r"TRACE_")?,
112            // Bitcoin Core specific patterns
113            Regex::new(r"G_FUZZING")?,
114            Regex::new(r"fPrintToConsole")?,
115            Regex::new(r"strprintf\s*\(")?,
116            // Memory allocation that's usually not tested
117            Regex::new(r"malloc\s*\(")?,
118            Regex::new(r"calloc\s*\(")?,
119            Regex::new(r"realloc\s*\(")?,
120            Regex::new(r"free\s*\(")?,
121            // Thread/concurrency primitives often not unit tested
122            Regex::new(r"std::thread")?,
123            Regex::new(r"std::mutex")?,
124            Regex::new(r"std::lock_guard")?,
125            // Performance monitoring (usually not tested)
126            Regex::new(r"\.now\(\)")?,
127            Regex::new(r"steady_clock")?,
128            Regex::new(r"high_resolution_clock")?,
129        ];
130
131        let arid_variable_patterns = vec![
132            // Timing/performance variables
133            Regex::new(r".*_time$")?,
134            Regex::new(r".*_duration$")?,
135            Regex::new(r".*_start$")?,
136            Regex::new(r".*_end$")?,
137            // Debug/logging variables
138            Regex::new(r".*_debug$")?,
139            Regex::new(r".*_log$")?,
140            Regex::new(r".*_trace$")?,
141            // Temporary/scratch variables
142            Regex::new(r"temp_.*")?,
143            Regex::new(r"tmp_.*")?,
144            Regex::new(r"scratch_.*")?,
145        ];
146
147        let arid_statement_patterns = vec![
148            // Comments
149            Regex::new(r"^\s*//")?,
150            Regex::new(r"^\s*/\*")?,
151            // Preprocessor directives
152            Regex::new(r"^\s*#")?,
153            // Empty statements
154            Regex::new(r"^\s*;")?,
155            // Namespace declarations
156            Regex::new(r"^\s*namespace\s+")?,
157            Regex::new(r"^\s*using\s+namespace\s+")?,
158            // Forward declarations
159            Regex::new(r"^\s*class\s+\w+\s*;")?,
160            Regex::new(r"^\s*struct\s+\w+\s*;")?,
161        ];
162
163        let arid_namespace_patterns = vec![
164            // Standard library
165            Regex::new(r"std::")?,
166            // Boost library (often infrastructure)
167            Regex::new(r"boost::")?,
168            // Testing frameworks
169            Regex::new(r"testing::")?,
170            Regex::new(r"gtest::")?,
171        ];
172
173        Ok(Self {
174            arid_function_patterns,
175            arid_variable_patterns,
176            arid_statement_patterns,
177            arid_namespace_patterns,
178        })
179    }
180
181    /// Expert function that determines if a simple node is arid
182    pub fn is_arid_simple_node(&self, node: &AstNode) -> bool {
183        if !node.is_simple() {
184            return false;
185        }
186
187        let content = &node.content;
188
189        // Check function call patterns
190        if matches!(node.node_type, AstNodeType::FunctionCall) {
191            for pattern in &self.arid_function_patterns {
192                if pattern.is_match(content) {
193                    return true;
194                }
195            }
196        }
197
198        // Check variable patterns
199        if matches!(
200            node.node_type,
201            AstNodeType::VariableDeclaration | AstNodeType::Assignment
202        ) {
203            for pattern in &self.arid_variable_patterns {
204                if pattern.is_match(content) {
205                    return true;
206                }
207            }
208        }
209
210        // Check general statement patterns
211        for pattern in &self.arid_statement_patterns {
212            if pattern.is_match(content) {
213                return true;
214            }
215        }
216
217        // Check namespace patterns
218        for pattern in &self.arid_namespace_patterns {
219            if pattern.is_match(content) {
220                return true;
221            }
222        }
223
224        false
225    }
226}
227
228/// Arid node detector implementing Google's algorithm
229pub struct AridNodeDetector {
230    expert: ExpertKnowledge,
231    cache: HashMap<String, bool>,
232}
233
234impl AridNodeDetector {
235    pub fn new() -> Result<Self> {
236        Ok(Self {
237            expert: ExpertKnowledge::new()?,
238            cache: HashMap::new(),
239        })
240    }
241
242    /// Implementation of Google's arid node detection algorithm
243    /// arid(N) = expert(N) if simple(N)
244    ///         = 1 if ∀(arid(c)) = 1, ∀c ∈ N otherwise
245    pub fn is_arid(&mut self, node: &AstNode) -> bool {
246        // Create cache key
247        let cache_key = format!(
248            "{}:{}:{}",
249            node.line_number, node.column_start, node.content
250        );
251
252        if let Some(&cached_result) = self.cache.get(&cache_key) {
253            return cached_result;
254        }
255
256        let result = if node.is_simple() {
257            // For simple nodes, use expert knowledge
258            self.expert.is_arid_simple_node(node)
259        } else {
260            // For compound nodes, check if ALL children are arid
261            if node.children.is_empty() {
262                // Empty compound node is not arid
263                false
264            } else {
265                // All children must be arid for compound node to be arid
266                node.children.iter().all(|child| self.is_arid(child))
267            }
268        };
269
270        // Cache the result
271        self.cache.insert(cache_key, result);
272        result
273    }
274
275    /// Check if a line should be mutated based on AST analysis
276    pub fn should_mutate_line(&mut self, line_content: &str, line_number: usize) -> bool {
277        // Create a simple AST node for the line
278        let node = self.parse_line_to_simple_ast(line_content, line_number);
279        !self.is_arid(&node)
280    }
281
282    /// Simple heuristic-based parsing to create AST nodes from single lines
283    fn parse_line_to_simple_ast(&self, line_content: &str, line_number: usize) -> AstNode {
284        let trimmed = line_content.trim();
285
286        // Skip empty lines and comments
287        if trimmed.is_empty() || trimmed.starts_with("//") || trimmed.starts_with("/*") {
288            return AstNode::new(
289                AstNodeType::Identifier,
290                trimmed.to_string(),
291                line_number,
292                0,
293                line_content.len(),
294            );
295        }
296
297        // Determine node type based on content patterns
298        let node_type = self.classify_line(trimmed);
299
300        AstNode::new(
301            node_type,
302            trimmed.to_string(),
303            line_number,
304            0,
305            line_content.len(),
306        )
307    }
308
309    /// Classify a line of code into the appropriate AST node type
310    fn classify_line(&self, line: &str) -> AstNodeType {
311        // Namespace declarations
312        if line.starts_with("namespace ") || line.contains("using namespace") {
313            return AstNodeType::Namespace;
314        }
315
316        // Class declarations
317        if line.starts_with("class ") || line.starts_with("struct ") {
318            return AstNodeType::Class;
319        }
320
321        // Function declarations/definitions
322        if self.is_function_declaration(line) {
323            return AstNodeType::Function;
324        }
325
326        // Control flow statements (compound nodes)
327        if line.starts_with("if ") || line.starts_with("if(") || line.contains("} else ") {
328            return AstNodeType::IfStatement;
329        }
330        if line.starts_with("for ") || line.starts_with("for(") {
331            return AstNodeType::ForLoop;
332        }
333        if line.starts_with("while ") || line.starts_with("while(") {
334            return AstNodeType::WhileLoop;
335        }
336
337        // Block statements
338        if line == "{" || line == "}" || line.ends_with(" {") {
339            return AstNodeType::Block;
340        }
341
342        // Variable declarations
343        if self.is_variable_declaration(line) {
344            return AstNodeType::VariableDeclaration;
345        }
346
347        // Assignment operations
348        if self.is_assignment(line) {
349            return AstNodeType::Assignment;
350        }
351
352        // Function calls
353        if self.is_function_call(line) {
354            return AstNodeType::FunctionCall;
355        }
356
357        // Binary operators
358        if self.is_binary_operation(line) {
359            return AstNodeType::BinaryOperator;
360        }
361
362        // Unary operators
363        if self.is_unary_operation(line) {
364            return AstNodeType::UnaryOperator;
365        }
366
367        // Literals
368        if self.is_literal(line) {
369            return AstNodeType::Literal;
370        }
371
372        // Default to identifier
373        AstNodeType::Identifier
374    }
375
376    /// Check if line is a function declaration or definition
377    fn is_function_declaration(&self, line: &str) -> bool {
378        let function_patterns = [
379            Regex::new(r"^\s*\w+\s+\w+\s*\([^)]*\)\s*[{;]?").unwrap(),
380            Regex::new(r"^\s*~?\w+\s*\([^)]*\)\s*[{;:]").unwrap(),
381            Regex::new(r"^\s*(?:template\s*<[^>]*>\s*)?(?:virtual\s+|static\s+|inline\s+)*[\w:<>*&\s]+\s+\w+\s*\([^)]*\)").unwrap(),
382        ];
383
384        function_patterns
385            .iter()
386            .any(|pattern| pattern.is_match(line))
387            && !line.contains('=')
388    }
389
390    /// Check if line is a variable declaration
391    fn is_variable_declaration(&self, line: &str) -> bool {
392        let var_patterns = [
393            Regex::new(r"^\s*(int|bool|char|float|double|long|short|unsigned|signed)\s+\w+")
394                .unwrap(),
395            Regex::new(r"^\s*std::\w+\s*<?[^>]*>?\s+\w+").unwrap(),
396            Regex::new(r"^\s*[A-Z]\w*\s+\w+").unwrap(),
397            Regex::new(r"^\s*\w+\s*[*&]+\s*\w+").unwrap(),
398            Regex::new(r"^\s*const\s+\w+").unwrap(),
399            Regex::new(r"^\s*auto\s+\w+").unwrap(),
400        ];
401
402        var_patterns.iter().any(|pattern| pattern.is_match(line))
403            && !line.contains('(')
404            && (line.contains('=') || line.ends_with(';'))
405    }
406
407    /// Check if line is an assignment
408    fn is_assignment(&self, line: &str) -> bool {
409        line.contains('=')
410            && !line.contains("==")
411            && !line.contains("!=")
412            && !line.contains("<=")
413            && !line.contains(">=")
414            && !self.is_variable_declaration(line)
415    }
416
417    /// Check if line is a function call
418    fn is_function_call(&self, line: &str) -> bool {
419        line.contains('(')
420            && line.contains(')')
421            && !self.is_function_declaration(line)
422            && !self.is_variable_declaration(line)
423            && !line.starts_with("if ")
424            && !line.starts_with("while ")
425            && !line.starts_with("for ")
426    }
427
428    /// Check if line contains binary operations
429    fn is_binary_operation(&self, line: &str) -> bool {
430        let binary_ops = [
431            "+", "-", "*", "/", "%", "&&", "||", "&", "|", "^", "<<", ">>",
432        ];
433        binary_ops.iter().any(|op| line.contains(op)) && !line.contains('=') && !line.contains('(')
434    }
435
436    /// Check if line contains unary operations
437    fn is_unary_operation(&self, line: &str) -> bool {
438        let unary_patterns = [
439            Regex::new(r"\+\+\w+").unwrap(),
440            Regex::new(r"\w\+\+").unwrap(),
441            Regex::new(r"--\w+").unwrap(),
442            Regex::new(r"\w--").unwrap(),
443            Regex::new(r"!\w+").unwrap(),
444            Regex::new(r"~\w+").unwrap(),
445        ];
446
447        unary_patterns.iter().any(|pattern| pattern.is_match(line))
448    }
449
450    /// Check if line is a literal value
451    fn is_literal(&self, line: &str) -> bool {
452        let literal_patterns = [
453            Regex::new(r"^\s*\d+\s*;?\s*$").unwrap(),
454            Regex::new(r"^\s*\d+\.\d+\s*;?\s*$").unwrap(),
455            Regex::new(r#"^\s*"[^"]*"\s*;?\s*$"#).unwrap(),
456            Regex::new(r"^\s*'[^']*'\s*;?\s*$").unwrap(),
457            Regex::new(r"^\s*(true|false)\s*;?\s*$").unwrap(),
458            Regex::new(r"^\s*(nullptr|NULL)\s*;?\s*$").unwrap(),
459        ];
460
461        literal_patterns
462            .iter()
463            .any(|pattern| pattern.is_match(line))
464    }
465
466    /// Add a new expert rule at runtime
467    pub fn add_expert_rule(&mut self, pattern: &str, description: &str) -> Result<()> {
468        let regex = Regex::new(pattern)?;
469        self.expert.arid_function_patterns.push(regex);
470        println!("Added expert rule: {} ({})", pattern, description);
471        Ok(())
472    }
473
474    /// Get statistics about arid node detection
475    pub fn get_stats(&self) -> HashMap<String, usize> {
476        let mut stats = HashMap::new();
477        stats.insert(
478            "total_expert_rules".to_string(),
479            self.expert.arid_function_patterns.len()
480                + self.expert.arid_variable_patterns.len()
481                + self.expert.arid_statement_patterns.len(),
482        );
483        stats.insert("cache_size".to_string(), self.cache.len());
484        stats.insert(
485            "function_patterns".to_string(),
486            self.expert.arid_function_patterns.len(),
487        );
488        stats.insert(
489            "variable_patterns".to_string(),
490            self.expert.arid_variable_patterns.len(),
491        );
492        stats.insert(
493            "statement_patterns".to_string(),
494            self.expert.arid_statement_patterns.len(),
495        );
496        stats
497    }
498
499    /// Export detailed analysis of which lines were filtered and why
500    #[allow(dead_code)]
501    pub fn analyze_file_detailed(&mut self, file_content: &str) -> DetailedAnalysis {
502        let lines: Vec<&str> = file_content.lines().collect();
503        let mut analysis = DetailedAnalysis::new();
504
505        for (idx, line) in lines.iter().enumerate() {
506            let line_number = idx + 1;
507            let node = self.parse_line_to_simple_ast(line, line_number);
508            let is_arid = self.is_arid(&node);
509            let reason = if is_arid {
510                self.get_arid_reason(&node)
511            } else {
512                "Not arid - will be mutated".to_string()
513            };
514
515            analysis.add_line_analysis(LineAnalysis {
516                line_number,
517                content: line.to_string(),
518                node_type: node.node_type,
519                is_arid,
520                reason,
521            });
522        }
523
524        analysis
525    }
526
527    /// Get the reason why a node is considered arid
528    #[allow(dead_code)]
529    fn get_arid_reason(&self, node: &AstNode) -> String {
530        if !node.is_simple() {
531            return "Compound node - arid if all children are arid".to_string();
532        }
533
534        let content = &node.content;
535
536        // Check function call patterns
537        if matches!(node.node_type, AstNodeType::FunctionCall) {
538            for (idx, pattern) in self.expert.arid_function_patterns.iter().enumerate() {
539                if pattern.is_match(content) {
540                    return format!(
541                        "Matches arid function pattern #{}: {}",
542                        idx + 1,
543                        pattern.as_str()
544                    );
545                }
546            }
547        }
548
549        // Check variable patterns
550        if matches!(
551            node.node_type,
552            AstNodeType::VariableDeclaration | AstNodeType::Assignment
553        ) {
554            for (idx, pattern) in self.expert.arid_variable_patterns.iter().enumerate() {
555                if pattern.is_match(content) {
556                    return format!(
557                        "Matches arid variable pattern #{}: {}",
558                        idx + 1,
559                        pattern.as_str()
560                    );
561                }
562            }
563        }
564
565        // Check statement patterns
566        for (idx, pattern) in self.expert.arid_statement_patterns.iter().enumerate() {
567            if pattern.is_match(content) {
568                return format!(
569                    "Matches arid statement pattern #{}: {}",
570                    idx + 1,
571                    pattern.as_str()
572                );
573            }
574        }
575
576        "Not arid".to_string()
577    }
578
579    /// Clear the cache (useful for testing or when rules change)
580    #[allow(dead_code)]
581    pub fn clear_cache(&mut self) {
582        self.cache.clear();
583    }
584}
585
586/// Detailed analysis results for a file
587#[allow(dead_code)]
588#[derive(Debug)]
589pub struct DetailedAnalysis {
590    pub lines: Vec<LineAnalysis>,
591    pub summary: AnalysisSummary,
592}
593
594#[allow(dead_code)]
595impl DetailedAnalysis {
596    pub fn new() -> Self {
597        Self {
598            lines: Vec::new(),
599            summary: AnalysisSummary::default(),
600        }
601    }
602
603    pub fn add_line_analysis(&mut self, analysis: LineAnalysis) {
604        if analysis.is_arid {
605            self.summary.arid_lines += 1;
606        } else {
607            self.summary.mutatable_lines += 1;
608        }
609        self.summary.total_lines += 1;
610        self.lines.push(analysis);
611    }
612
613    pub fn print_summary(&self) {
614        println!("\n=== AST Analysis Summary ===");
615        println!("Total lines: {}", self.summary.total_lines);
616        println!("Mutatable lines: {}", self.summary.mutatable_lines);
617        println!("Arid lines: {}", self.summary.arid_lines);
618        println!(
619            "Filtering efficiency: {:.1}% reduction",
620            (self.summary.arid_lines as f64 / self.summary.total_lines as f64) * 100.0
621        );
622    }
623
624    pub fn print_arid_lines(&self) {
625        println!("\n=== Filtered Out (Arid) Lines ===");
626        for line in &self.lines {
627            if line.is_arid {
628                println!(
629                    "Line {}: {} | Reason: {}",
630                    line.line_number,
631                    line.content.trim(),
632                    line.reason
633                );
634            }
635        }
636    }
637}
638
639/// Analysis of a single line
640#[allow(dead_code)]
641#[derive(Debug)]
642pub struct LineAnalysis {
643    pub line_number: usize,
644    pub content: String,
645    pub node_type: AstNodeType,
646    pub is_arid: bool,
647    pub reason: String,
648}
649
650/// Summary statistics for analysis
651#[allow(dead_code)]
652#[derive(Debug, Default)]
653pub struct AnalysisSummary {
654    pub total_lines: usize,
655    pub mutatable_lines: usize,
656    pub arid_lines: usize,
657}
658
659/// Integration with existing mutation system
660pub fn filter_mutatable_lines(lines: &[String], detector: &mut AridNodeDetector) -> Vec<usize> {
661    lines
662        .iter()
663        .enumerate()
664        .filter_map(|(idx, line)| {
665            let line_number = idx + 1;
666            if detector.should_mutate_line(line, line_number) {
667                Some(line_number)
668            } else {
669                None
670            }
671        })
672        .collect()
673}
674
675#[cfg(test)]
676mod tests {
677    use super::*;
678
679    #[test]
680    fn test_expert_knowledge() {
681        let expert = ExpertKnowledge::new().unwrap();
682
683        // Test arid function calls
684        let reserve_node = AstNode::new(
685            AstNodeType::FunctionCall,
686            "vec.reserve(100)".to_string(),
687            1,
688            0,
689            15,
690        );
691        assert!(expert.is_arid_simple_node(&reserve_node));
692
693        // Test non-arid function calls
694        let normal_node = AstNode::new(
695            AstNodeType::FunctionCall,
696            "calculate_sum(a, b)".to_string(),
697            1,
698            0,
699            18,
700        );
701        assert!(!expert.is_arid_simple_node(&normal_node));
702    }
703
704    #[test]
705    fn test_arid_detection_algorithm() {
706        let mut detector = AridNodeDetector::new().unwrap();
707
708        // Test simple arid node
709        let arid_simple = AstNode::new(
710            AstNodeType::FunctionCall,
711            "std::cout << \"debug\"".to_string(),
712            1,
713            0,
714            20,
715        );
716        assert!(detector.is_arid(&arid_simple));
717
718        // Test compound node with all arid children
719        let mut compound_arid =
720            AstNode::new(AstNodeType::Block, "{ debug block }".to_string(), 1, 0, 15);
721        compound_arid.add_child(arid_simple.clone());
722        assert!(detector.is_arid(&compound_arid));
723
724        // Test compound node with non-arid child
725        let non_arid_simple = AstNode::new(
726            AstNodeType::FunctionCall,
727            "important_function()".to_string(),
728            2,
729            0,
730            20,
731        );
732        let mut compound_mixed =
733            AstNode::new(AstNodeType::Block, "{ mixed block }".to_string(), 1, 0, 15);
734        compound_mixed.add_child(arid_simple);
735        compound_mixed.add_child(non_arid_simple);
736        assert!(!detector.is_arid(&compound_mixed));
737    }
738
739    #[test]
740    fn test_line_mutation_filtering() {
741        let mut detector = AridNodeDetector::new().unwrap();
742
743        let lines = vec![
744            "int x = 5;".to_string(),              // Should mutate
745            "std::cout << \"debug\";".to_string(), // Should NOT mutate (arid)
746            "vec.reserve(100);".to_string(),       // Should NOT mutate (arid)
747            "return x + y;".to_string(),           // Should mutate
748        ];
749
750        let mutatable_lines = filter_mutatable_lines(&lines, &mut detector);
751
752        // Should only include lines 1 and 4 (0-indexed: 0 and 3)
753        assert_eq!(mutatable_lines, vec![1, 4]);
754    }
755}