organizational_intelligence_plugin/tarantula.rs

// Tarantula-style Spectrum-Based Fault Localization (SBFL)
// Toyota Way: Start with the simplest formula, evolve based on evidence
// Phase 1: Classic Tarantula + Ochiai + DStar formulas
// Muda: Avoid waste by using lightweight SBFL before expensive MBFL
// Muri: Prevent overburden by presenting only the top N suspicious statements

use anyhow::{anyhow, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use tracing::{debug, info};

/// Represents a code location for fault localization
#[derive(Debug, Clone, Hash, Eq, PartialEq, Serialize, Deserialize)]
pub struct StatementId {
    pub file: PathBuf,
    pub line: usize,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub column: Option<usize>,
}

impl StatementId {
    pub fn new(file: impl Into<PathBuf>, line: usize) -> Self {
        Self {
            file: file.into(),
            line,
            column: None,
        }
    }

    pub fn with_column(mut self, column: usize) -> Self {
        self.column = Some(column);
        self
    }
}

/// Coverage information for a single statement
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatementCoverage {
    pub id: StatementId,
    pub executed_by_passed: usize,
    pub executed_by_failed: usize,
}

impl StatementCoverage {
    pub fn new(id: StatementId, passed: usize, failed: usize) -> Self {
        Self {
            id,
            executed_by_passed: passed,
            executed_by_failed: failed,
        }
    }
}

/// Available fault localization formulas
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum SbflFormula {
    /// Original Tarantula formula (Jones & Harrold, 2005)
    #[default]
    Tarantula,
    /// Ochiai formula - often outperforms Tarantula (Abreu et al., 2009)
    Ochiai,
    /// DStar with configurable exponent (Wong et al., 2014)
    DStar { exponent: u32 },
}

/// Individual suspiciousness ranking entry
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SuspiciousnessRanking {
    pub rank: usize,
    pub statement: StatementId,
    pub suspiciousness: f32,
    pub scores: HashMap<String, f32>,
    pub explanation: String,
    pub failed_coverage: usize,
    pub passed_coverage: usize,
}

/// Result of fault localization analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FaultLocalizationResult {
    pub rankings: Vec<SuspiciousnessRanking>,
    pub formula_used: SbflFormula,
    pub confidence: f32,
    pub total_passed_tests: usize,
    pub total_failed_tests: usize,
}

/// Classic Tarantula suspiciousness formula
///
/// Formula: (failed/totalFailed) / ((passed/totalPassed) + (failed/totalFailed))
///
/// Reference: Jones, J.A., Harrold, M.J. (2005). ASE '05
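///
/// # Example
///
/// A worked sketch; the `organizational_intelligence_plugin::tarantula` path is
/// assumed from this file's location, so the doctest is marked `ignore`:
///
/// ```ignore
/// use organizational_intelligence_plugin::tarantula::tarantula;
///
/// // Covered by 5/10 failing and 50/100 passing tests:
/// // 0.5 / (0.5 + 0.5) = 0.5
/// assert!((tarantula(5, 50, 10, 100) - 0.5).abs() < 1e-3);
/// ```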
pub fn tarantula(failed: usize, passed: usize, total_failed: usize, total_passed: usize) -> f32 {
    let failed_ratio = if total_failed > 0 {
        failed as f32 / total_failed as f32
    } else {
        0.0
    };

    let passed_ratio = if total_passed > 0 {
        passed as f32 / total_passed as f32
    } else {
        0.0
    };

    let denominator = passed_ratio + failed_ratio;
    if denominator == 0.0 {
        0.0
    } else {
        failed_ratio / denominator
    }
}

/// Ochiai suspiciousness formula (from molecular biology)
///
/// Formula: failed / sqrt(totalFailed * (failed + passed))
///
/// Reference: Abreu et al. (2009). JSS 82(11)
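///
/// # Example
///
/// A worked sketch (import path assumed from this file's location, hence `ignore`):
///
/// ```ignore
/// use organizational_intelligence_plugin::tarantula::ochiai;
///
/// // 5 / sqrt(10 * (5 + 50)) = 5 / sqrt(550) ≈ 0.213
/// assert!((ochiai(5, 50, 10) - 0.213).abs() < 1e-2);
/// ```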
pub fn ochiai(failed: usize, passed: usize, total_failed: usize) -> f32 {
    let denominator = ((total_failed * (failed + passed)) as f32).sqrt();
    if denominator == 0.0 {
        0.0
    } else {
        failed as f32 / denominator
    }
}

/// DStar suspiciousness formula with configurable exponent
///
/// Formula: failed^star / (passed + (totalFailed - failed)), where `star` is the exponent
///
/// Reference: Wong et al. (2014). IEEE TSE 40(1)
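///
/// # Example
///
/// A worked sketch (import path assumed from this file's location, hence `ignore`):
///
/// ```ignore
/// use organizational_intelligence_plugin::tarantula::dstar;
///
/// // With exponent 2: 5^2 / (50 + (10 - 5)) = 25 / 55 ≈ 0.4545
/// assert!((dstar(5, 50, 10, 2) - 0.4545).abs() < 1e-3);
/// ```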
pub fn dstar(failed: usize, passed: usize, total_failed: usize, star: u32) -> f32 {
    let numerator = (failed as f32).powi(star as i32);
    let not_failed = total_failed.saturating_sub(failed);
    let denominator = passed as f32 + not_failed as f32;

    if denominator == 0.0 {
        if numerator > 0.0 {
            f32::MAX // Avoid infinity, use max finite value
        } else {
            0.0
        }
    } else {
        numerator / denominator
    }
}

/// Spectrum-Based Fault Localizer
///
/// Implements the core SBFL algorithms following Toyota Way principles:
/// - Start simple (Tarantula baseline)
/// - Measure and evolve (compare formulas)
/// - Eliminate waste (skip expensive analysis when simple works)
pub struct SbflLocalizer {
    formula: SbflFormula,
    top_n: usize,
    include_explanations: bool,
    min_confidence_threshold: f32,
}

impl Default for SbflLocalizer {
    fn default() -> Self {
        Self::new()
    }
}

impl SbflLocalizer {
    pub fn new() -> Self {
        Self {
            formula: SbflFormula::Tarantula,
            top_n: 10,
            include_explanations: true,
            min_confidence_threshold: 0.0,
        }
    }

    pub fn with_formula(mut self, formula: SbflFormula) -> Self {
        self.formula = formula;
        self
    }

    pub fn with_top_n(mut self, n: usize) -> Self {
        self.top_n = n;
        self
    }

    pub fn with_explanations(mut self, include: bool) -> Self {
        self.include_explanations = include;
        self
    }

    pub fn with_min_confidence(mut self, threshold: f32) -> Self {
        self.min_confidence_threshold = threshold;
        self
    }

    /// Localize faults using the configured SBFL formula
    ///
    /// # Arguments
    /// * `coverage` - Statement coverage data
    /// * `total_passed` - Total number of passing tests
    /// * `total_failed` - Total number of failing tests
    ///
    /// # Returns
    /// Ranked list of suspicious statements
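    ///
    /// # Example
    ///
    /// A usage sketch (types from this module; marked `ignore` since the crate
    /// path is assumed):
    ///
    /// ```ignore
    /// let coverage = vec![
    ///     // StatementCoverage::new(id, executed_by_passed, executed_by_failed)
    ///     StatementCoverage::new(StatementId::new("src/lib.rs", 42), 0, 10),
    ///     StatementCoverage::new(StatementId::new("src/lib.rs", 43), 90, 1),
    /// ];
    /// let result = SbflLocalizer::new().localize(&coverage, 100, 10);
    /// assert_eq!(result.rankings[0].statement.line, 42); // most suspicious first
    /// ```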
    pub fn localize(
        &self,
        coverage: &[StatementCoverage],
        total_passed: usize,
        total_failed: usize,
    ) -> FaultLocalizationResult {
        info!(
            "Running {:?} fault localization on {} statements",
            self.formula,
            coverage.len()
        );

        // Calculate suspiciousness for each statement
        let mut scored: Vec<(StatementId, f32, usize, usize)> = coverage
            .iter()
            .map(|cov| {
                let score = self.calculate_score(
                    cov.executed_by_failed,
                    cov.executed_by_passed,
                    total_failed,
                    total_passed,
                );
                (
                    cov.id.clone(),
                    score,
                    cov.executed_by_failed,
                    cov.executed_by_passed,
                )
            })
            .collect();

        // Sort by suspiciousness (descending)
        scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        // Filter by threshold first, then take the top N, so ranks stay
        // contiguous and the threshold never silently shrinks the top-N list
        let rankings: Vec<SuspiciousnessRanking> = scored
            .into_iter()
            .filter(|(_, score, _, _)| *score >= self.min_confidence_threshold)
            .take(self.top_n)
            .enumerate()
            .map(|(rank, (stmt, score, failed, passed))| {
                let explanation = if self.include_explanations {
                    self.generate_explanation(failed, passed, total_failed, total_passed, score)
                } else {
                    String::new()
                };

                // Calculate all formula scores for comparison
                let mut scores = HashMap::new();
                scores.insert(
                    "tarantula".to_string(),
                    tarantula(failed, passed, total_failed, total_passed),
                );
                scores.insert("ochiai".to_string(), ochiai(failed, passed, total_failed));
                scores.insert("dstar2".to_string(), dstar(failed, passed, total_failed, 2));
                scores.insert("dstar3".to_string(), dstar(failed, passed, total_failed, 3));

                SuspiciousnessRanking {
                    rank: rank + 1,
                    statement: stmt,
                    suspiciousness: score,
                    scores,
                    explanation,
                    failed_coverage: failed,
                    passed_coverage: passed,
                }
            })
            .collect();

        // Calculate confidence based on test coverage density
        let confidence = self.calculate_confidence(coverage.len(), total_passed, total_failed);

        debug!(
            "Localized {} suspicious statements with confidence {}",
            rankings.len(),
            confidence
        );

        FaultLocalizationResult {
            rankings,
            formula_used: self.formula,
            confidence,
            total_passed_tests: total_passed,
            total_failed_tests: total_failed,
        }
    }

    fn calculate_score(
        &self,
        failed: usize,
        passed: usize,
        total_failed: usize,
        total_passed: usize,
    ) -> f32 {
        match self.formula {
            SbflFormula::Tarantula => tarantula(failed, passed, total_failed, total_passed),
            SbflFormula::Ochiai => ochiai(failed, passed, total_failed),
            SbflFormula::DStar { exponent } => dstar(failed, passed, total_failed, exponent),
        }
    }

    fn generate_explanation(
        &self,
        failed: usize,
        passed: usize,
        total_failed: usize,
        total_passed: usize,
        score: f32,
    ) -> String {
        let failed_pct = if total_failed > 0 {
            (failed as f32 / total_failed as f32 * 100.0) as u32
        } else {
            0
        };

        let passed_pct = if total_passed > 0 {
            (passed as f32 / total_passed as f32 * 100.0) as u32
        } else {
            0
        };

        format!(
            "Executed by {}% of failing tests ({}/{}) and {}% of passing tests ({}/{}). \
             Suspiciousness score: {:.3}",
            failed_pct, failed, total_failed, passed_pct, passed, total_passed, score
        )
    }

    fn calculate_confidence(
        &self,
        statement_count: usize,
        total_passed: usize,
        total_failed: usize,
    ) -> f32 {
        // Confidence based on:
        // 1. Number of failing tests (more = more signal)
        // 2. Ratio of failing to total tests (too few failing = noisy)
        // 3. Coverage density (more statements covered = more context)
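        //
        // Worked example (illustrative): 10 failing out of 100 tests over 100
        // covered statements gives fail_factor = ln(10)/3 ≈ 0.77, ratio_factor
        // = 1.0 (10% failing), coverage_factor = ln(100)/7 ≈ 0.66, so the
        // overall confidence is ≈ 0.77 * 1.0 * 0.66 ≈ 0.51.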

        let total_tests = total_passed + total_failed;
        if total_tests == 0 || total_failed == 0 {
            return 0.0;
        }

        // Factor 1: Log scale for failing test count (diminishing returns)
        let fail_factor = (total_failed as f32).ln().min(3.0) / 3.0;

        // Factor 2: Failing ratio (sweet spot around 5-20%)
        let fail_ratio = total_failed as f32 / total_tests as f32;
        let ratio_factor = if fail_ratio < 0.01 {
            fail_ratio * 10.0 // Very few failures = low confidence
        } else if fail_ratio > 0.5 {
            1.0 - (fail_ratio - 0.5) // Too many failures = less localizing
        } else {
            1.0
        };

        // Factor 3: Statement coverage (more covered = more context).
        // Clamped at 0.0 so an empty spectrum (ln 0 = -inf) yields 0, not -inf.
        let coverage_factor = (statement_count as f32).ln().max(0.0).min(7.0) / 7.0;

        (fail_factor * ratio_factor * coverage_factor).min(1.0)
    }
}

/// LCOV coverage data parser for cargo-llvm-cov integration
#[derive(Debug, Default)]
pub struct LcovParser;

impl LcovParser {
    /// Parse LCOV format coverage file
    ///
    /// LCOV format:
    /// ```text
    /// SF:path/to/file.rs
    /// DA:line_number,execution_count
    /// ...
    /// end_of_record
    /// ```
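    ///
    /// # Example
    ///
    /// A sketch against an in-memory string via `Self::parse`, which
    /// `parse_file` delegates to after reading the file (marked `ignore`;
    /// paths assumed):
    ///
    /// ```ignore
    /// let lcov = "SF:src/main.rs\nDA:10,5\nend_of_record";
    /// let entries = LcovParser::parse(lcov).unwrap();
    /// assert_eq!(entries[0].0.line, 10); // source line number
    /// assert_eq!(entries[0].1, 5);       // execution count
    /// ```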
    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<Vec<(StatementId, usize)>> {
        let content = std::fs::read_to_string(path.as_ref())
            .map_err(|e| anyhow!("Failed to read LCOV file: {}", e))?;
        Self::parse(&content)
    }

    pub fn parse(content: &str) -> Result<Vec<(StatementId, usize)>> {
        let mut results = Vec::new();
        let mut current_file: Option<PathBuf> = None;

        for line in content.lines() {
            let line = line.trim();

            if let Some(path) = line.strip_prefix("SF:") {
                current_file = Some(PathBuf::from(path));
            } else if let Some(da) = line.strip_prefix("DA:") {
                if let Some(ref file) = current_file {
                    let parts: Vec<&str> = da.split(',').collect();
                    if parts.len() >= 2 {
                        if let (Ok(line_num), Ok(count)) =
                            (parts[0].parse::<usize>(), parts[1].parse::<usize>())
                        {
                            results.push((StatementId::new(file.clone(), line_num), count));
                        }
                    }
                }
            } else if line == "end_of_record" {
                current_file = None;
            }
        }

        Ok(results)
    }

    /// Combine coverage from multiple test runs (passed and failed)
    ///
    /// # Arguments
    /// * `passed_coverage` - Coverage from passing tests
    /// * `failed_coverage` - Coverage from failing tests
    ///
    /// # Returns
    /// Combined statement coverage suitable for SBFL
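    ///
    /// # Example
    ///
    /// A sketch: each run in which a statement's execution count is non-zero
    /// adds one to the corresponding tally (marked `ignore`; paths assumed):
    ///
    /// ```ignore
    /// let passed = vec![(StatementId::new("a.rs", 1), 7)];
    /// let failed = vec![(StatementId::new("a.rs", 1), 2)];
    /// let combined = LcovParser::combine_coverage(&passed, &failed);
    /// assert_eq!(combined[0].executed_by_passed, 1);
    /// assert_eq!(combined[0].executed_by_failed, 1);
    /// ```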
    pub fn combine_coverage(
        passed_coverage: &[(StatementId, usize)],
        failed_coverage: &[(StatementId, usize)],
    ) -> Vec<StatementCoverage> {
        let mut coverage_map: HashMap<StatementId, (usize, usize)> = HashMap::new();

        // Count passed test coverage
        for (stmt, count) in passed_coverage {
            if *count > 0 {
                coverage_map.entry(stmt.clone()).or_insert((0, 0)).0 += 1;
            }
        }

        // Count failed test coverage
        for (stmt, count) in failed_coverage {
            if *count > 0 {
                coverage_map.entry(stmt.clone()).or_insert((0, 0)).1 += 1;
            }
        }

        coverage_map
            .into_iter()
            .map(|(id, (passed, failed))| StatementCoverage::new(id, passed, failed))
            .collect()
    }
}

// ============================================================================
// TarantulaIntegration - pmat-style integration for fault localization
// Toyota Way: Integrate with existing tools (cargo-llvm-cov, pmat TDG)
// ============================================================================

/// Report output format for fault localization results
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ReportFormat {
    #[default]
    Yaml,
    Json,
    Terminal,
}

/// Configuration for fault localization runs
#[derive(Debug, Clone)]
pub struct LocalizationConfig {
    pub formula: SbflFormula,
    pub top_n: usize,
    pub include_explanations: bool,
    pub min_confidence: f32,
}

impl Default for LocalizationConfig {
    fn default() -> Self {
        Self {
            formula: SbflFormula::Tarantula,
            top_n: 10,
            include_explanations: true,
            min_confidence: 0.0,
        }
    }
}

impl LocalizationConfig {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn with_formula(mut self, formula: SbflFormula) -> Self {
        self.formula = formula;
        self
    }

    pub fn with_top_n(mut self, n: usize) -> Self {
        self.top_n = n;
        self
    }

    pub fn with_explanations(mut self, include: bool) -> Self {
        self.include_explanations = include;
        self
    }

    pub fn with_min_confidence(mut self, threshold: f32) -> Self {
        self.min_confidence = threshold;
        self
    }
}

/// Tarantula integration wrapper (pmat-style)
///
/// Provides a high-level interface for fault localization that integrates
/// with cargo-llvm-cov for coverage and pmat for TDG enrichment.
///
/// # Toyota Way Principles
/// - **Genchi Genbutsu**: Uses actual coverage data, not estimates
/// - **Muda**: Avoids waste by reusing existing coverage tools
/// - **Jidoka**: Provides human-readable explanations
pub struct TarantulaIntegration;

impl TarantulaIntegration {
    /// Check if cargo-llvm-cov is available
    pub fn is_coverage_tool_available() -> bool {
        std::process::Command::new("cargo")
            .args(["llvm-cov", "--version"])
            .output()
            .map(|o| o.status.success())
            .unwrap_or(false)
    }

    /// Parse LCOV format output from cargo-llvm-cov
    pub fn parse_lcov_output(content: &str) -> Result<Vec<(StatementId, usize)>> {
        LcovParser::parse(content)
    }

    /// Run fault localization on coverage data
    ///
    /// # Arguments
    /// * `passed_coverage` - Coverage from passing tests
    /// * `failed_coverage` - Coverage from failing tests
    /// * `total_passed` - Number of passing tests
    /// * `total_failed` - Number of failing tests
    /// * `config` - Localization configuration
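    ///
    /// # Example
    ///
    /// A usage sketch (marked `ignore`; the crate path is assumed):
    ///
    /// ```ignore
    /// let passed = vec![(StatementId::new("src/lib.rs", 10), 5_usize)];
    /// let failed = vec![(StatementId::new("src/lib.rs", 20), 3_usize)];
    /// let config = LocalizationConfig::new().with_formula(SbflFormula::Ochiai);
    /// let result = TarantulaIntegration::run_localization(&passed, &failed, 1, 1, &config);
    /// assert_eq!(result.rankings[0].statement.line, 20); // hit only by the failing test
    /// ```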
    pub fn run_localization(
        passed_coverage: &[(StatementId, usize)],
        failed_coverage: &[(StatementId, usize)],
        total_passed: usize,
        total_failed: usize,
        config: &LocalizationConfig,
    ) -> FaultLocalizationResult {
        info!(
            "Running fault localization: {} passed, {} failed tests",
            total_passed, total_failed
        );

        // Combine coverage data
        let combined = LcovParser::combine_coverage(passed_coverage, failed_coverage);

        // Run SBFL localization
        let localizer = SbflLocalizer::new()
            .with_formula(config.formula)
            .with_top_n(config.top_n)
            .with_explanations(config.include_explanations)
            .with_min_confidence(config.min_confidence);

        localizer.localize(&combined, total_passed, total_failed)
    }

    /// Generate report in specified format
    pub fn generate_report(
        result: &FaultLocalizationResult,
        format: ReportFormat,
    ) -> Result<String> {
        match format {
            ReportFormat::Yaml => {
                serde_yaml::to_string(result).map_err(|e| anyhow!("Failed to generate YAML: {}", e))
            }
            ReportFormat::Json => serde_json::to_string_pretty(result)
                .map_err(|e| anyhow!("Failed to generate JSON: {}", e)),
            ReportFormat::Terminal => Ok(Self::format_terminal_report(result)),
        }
    }

    /// Format report for terminal output
    fn format_terminal_report(result: &FaultLocalizationResult) -> String {
        let mut output = String::new();

        output.push_str("╔══════════════════════════════════════════════════════════════╗\n");
        output.push_str(&format!(
            "║           FAULT LOCALIZATION REPORT - {:?}              ║\n",
            result.formula_used
        ));
        output.push_str("╠══════════════════════════════════════════════════════════════╣\n");
        output.push_str(&format!(
            "║ Tests: {} passed, {} failed                              ║\n",
            result.total_passed_tests, result.total_failed_tests
        ));
        output.push_str(&format!(
            "║ Confidence: {:.2}                                          ║\n",
            result.confidence
        ));
        output.push_str("╠══════════════════════════════════════════════════════════════╣\n");
        output.push_str("║  TOP SUSPICIOUS STATEMENTS                                   ║\n");
        output.push_str("╠══════════════════════════════════════════════════════════════╣\n");

        for ranking in &result.rankings {
            // Clamp to the bar width: DStar scores can exceed 1.0, and an
            // unclamped length would underflow the `20 - bar_len` subtraction
            let bar_len = ((ranking.suspiciousness * 20.0) as usize).min(20);
            let bar: String = "█".repeat(bar_len) + &"░".repeat(20 - bar_len);

            output.push_str(&format!(
                "║  #{:<2} {}:{:<6}  {} {:.2}   ║\n",
                ranking.rank,
                ranking.statement.file.display(),
                ranking.statement.line,
                bar,
                ranking.suspiciousness
            ));
        }

        output.push_str("╚══════════════════════════════════════════════════════════════╝\n");

        output
    }

    /// Enrich fault localization results with TDG scores from pmat
    ///
    /// # Arguments
    /// * `result` - Mutable reference to localization result
    /// * `tdg_scores` - Map of file path to TDG score
    pub fn enrich_with_tdg(
        result: &mut FaultLocalizationResult,
        tdg_scores: &HashMap<String, f32>,
    ) {
        for ranking in &mut result.rankings {
            let file_path = ranking.statement.file.to_string_lossy().to_string();
            if let Some(&tdg) = tdg_scores.get(&file_path) {
                ranking.scores.insert("tdg".to_string(), tdg);
            }
        }
    }

    /// Run cargo-llvm-cov and collect coverage for a test run
    ///
    /// # Arguments
    /// * `repo_path` - Path to repository
    /// * `test_filter` - Optional test filter pattern
    ///
    /// # Returns
    /// LCOV format coverage data as string
    #[allow(dead_code)]
    pub fn collect_coverage<P: AsRef<Path>>(
        repo_path: P,
        test_filter: Option<&str>,
    ) -> Result<String> {
        let mut cmd = std::process::Command::new("cargo");
        cmd.current_dir(repo_path.as_ref())
            .args(["llvm-cov", "--lcov"]);

        if let Some(filter) = test_filter {
            cmd.args(["--", filter]);
        }

        let output = cmd
            .output()
            .map_err(|e| anyhow!("Failed to run cargo-llvm-cov: {}", e))?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(anyhow!("cargo-llvm-cov failed: {}", stderr));
        }

        String::from_utf8(output.stdout)
            .map_err(|e| anyhow!("Invalid UTF-8 in coverage output: {}", e))
    }
}

// ============================================================================
// SZZ Algorithm - Bug-Introducing Commit Identification
// Toyota Way: Genchi Genbutsu - trace back to root cause using git history
// Reference: Śliwerski et al. (2005). "When do changes induce fixes?" MSR '05
// ============================================================================

/// Confidence level for SZZ tracing
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SzzConfidence {
    /// Direct line trace via git blame
    High,
    /// Refactoring-aware trace (cosmetic changes excluded)
    Medium,
    /// Heuristic fallback (commit message patterns)
    Low,
}

/// Result of SZZ algorithm tracing
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SzzResult {
    /// The commit that fixed the bug
    pub bug_fixing_commit: String,
    /// Commits that likely introduced the bug
    pub bug_introducing_commits: Vec<String>,
    /// Lines identified as faulty (file, line)
    pub faulty_lines: Vec<(String, usize)>,
    /// Confidence in the trace
    pub confidence: SzzConfidence,
    /// Commit message of the fix (for context)
    pub fix_message: String,
}

/// SZZ Algorithm implementation for bug-introducing commit identification
///
/// # Algorithm Steps
/// 1. Identify bug-fixing commits (from commit messages or issue links)
/// 2. Find lines modified in the fix
/// 3. Use git blame to trace back to introducing commits
/// 4. Filter out cosmetic changes (refactoring-aware)
pub struct SzzAnalyzer;

impl SzzAnalyzer {
    /// Identify bug-fixing commits from commit messages
    ///
    /// Looks for patterns like:
    /// - "fix:", "fixes:", "fixed:"
    /// - "bug:", "bugfix:"
    /// - Issue references: "#123", "JIRA-456"
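    ///
    /// # Example
    ///
    /// A sketch (marked `ignore`; the crate path is assumed):
    ///
    /// ```ignore
    /// let commits = vec![
    ///     ("abc123".to_string(), "fix: off-by-one in parser".to_string()),
    ///     ("def456".to_string(), "docs: update readme".to_string()),
    /// ];
    /// let fixes = SzzAnalyzer::identify_bug_fixes(&commits);
    /// assert_eq!(fixes.len(), 1); // only the "fix:" commit matches
    /// ```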
    pub fn identify_bug_fixes(commits: &[(String, String)]) -> Vec<(String, String)> {
        let fix_patterns = [
            "fix:",
            "fixes:",
            "fixed:",
            "fix(",
            "bug:",
            "bugfix:",
            "hotfix:",
            "resolve:",
            "resolves:",
            "resolved:",
            "close:",
            "closes:",
            "closed:",
        ];

        commits
            .iter()
            .filter(|(_, msg)| {
                let lower = msg.to_lowercase();
                fix_patterns.iter().any(|p| lower.contains(p))
                    || (lower.contains('#') && lower.chars().any(|c| c.is_ascii_digit()))
            })
            .cloned()
            .collect()
    }

    /// Trace bug-introducing commits using simplified SZZ
    ///
    /// # Arguments
    /// * `fix_commit` - The bug-fixing commit hash
    /// * `fix_message` - Commit message
    /// * `changed_lines` - Lines changed in the fix (file, line, was_deleted)
    /// * `blame_data` - Git blame output (line -> (commit, author))
    pub fn trace_introducing_commits(
        fix_commit: &str,
        fix_message: &str,
        changed_lines: &[(String, usize, bool)],
        blame_data: &HashMap<(String, usize), (String, String)>,
    ) -> SzzResult {
        let mut introducing_commits: Vec<String> = Vec::new();
        let mut faulty_lines: Vec<(String, usize)> = Vec::new();

        // For each deleted/modified line in the fix, trace back
        for (file, line, was_deleted) in changed_lines {
            if *was_deleted {
                // Deleted lines are the key - they likely contained the bug
                if let Some((commit, _author)) = blame_data.get(&(file.clone(), *line)) {
                    if commit != fix_commit && !introducing_commits.contains(commit) {
                        introducing_commits.push(commit.clone());
                    }
                    faulty_lines.push((file.clone(), *line));
                }
            }
        }

        // Determine confidence
        let confidence = if !introducing_commits.is_empty() {
            SzzConfidence::High
        } else if !faulty_lines.is_empty() {
            SzzConfidence::Medium
        } else {
            SzzConfidence::Low
        };

        SzzResult {
            bug_fixing_commit: fix_commit.to_string(),
            bug_introducing_commits: introducing_commits,
            faulty_lines,
            confidence,
            fix_message: fix_message.to_string(),
        }
    }

    /// Filter out cosmetic changes (refactoring-aware SZZ)
    ///
    /// Excludes:
    /// - Whitespace-only changes
    /// - Comment-only changes
    /// - Import reordering
    pub fn filter_cosmetic_changes(
        changes: &[(String, usize, bool)],
        file_contents: &HashMap<String, Vec<String>>,
    ) -> Vec<(String, usize, bool)> {
        changes
            .iter()
            .filter(|(file, line, _)| {
                if let Some(lines) = file_contents.get(file) {
                    if let Some(content) = lines.get(line.saturating_sub(1)) {
                        let trimmed = content.trim();
                        // Keep if not cosmetic
                        !trimmed.is_empty()
                            && !trimmed.starts_with("//")
                            && !trimmed.starts_with("/*")
                            && !trimmed.starts_with("*")
                            && !trimmed.starts_with("use ")
                            && !trimmed.starts_with("import ")
                    } else {
                        true
                    }
                } else {
                    true
                }
            })
            .cloned()
            .collect()
    }

    /// Calculate suspiciousness for files based on SZZ results
    ///
    /// Combines SZZ bug-introduction data with historical defect frequency
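    ///
    /// # Example
    ///
    /// A worked instance: with 2 fix commits in total, a file implicated by 3
    /// faulty lines scores `3 / 2 = 1.5`, and a file with 1 faulty line scores
    /// `1 / 2 = 0.5`.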
    pub fn calculate_file_suspiciousness(szz_results: &[SzzResult]) -> HashMap<String, f32> {
        let mut file_bug_count: HashMap<String, usize> = HashMap::new();
        let total_bugs = szz_results.len();

        for result in szz_results {
            for (file, _line) in &result.faulty_lines {
                *file_bug_count.entry(file.clone()).or_insert(0) += 1;
            }
        }

        file_bug_count
            .into_iter()
            .map(|(file, count)| {
                let suspiciousness = if total_bugs > 0 {
                    count as f32 / total_bugs as f32
                } else {
                    0.0
                };
                (file, suspiciousness)
            })
            .collect()
    }
}

/// Combines SBFL with git history for enhanced fault localization
pub struct HybridFaultLocalizer;

impl HybridFaultLocalizer {
    /// Combine SBFL suspiciousness with SZZ historical data
    ///
    /// Formula: combined = α * sbfl_score + (1 - α) * historical_score,
    /// where α is the weighting factor (default 0.7 for SBFL)
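    ///
    /// # Example
    ///
    /// A worked instance: with `alpha = 0.7`, an SBFL score of 0.9 and a
    /// historical score of 0.2 combine to `0.7 * 0.9 + 0.3 * 0.2 = 0.69`.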
    pub fn combine_scores(
        sbfl_result: &FaultLocalizationResult,
        historical_suspiciousness: &HashMap<String, f32>,
        alpha: f32,
    ) -> FaultLocalizationResult {
        let mut combined_rankings: Vec<SuspiciousnessRanking> = sbfl_result
            .rankings
            .iter()
            .map(|r| {
                let file_path = r.statement.file.to_string_lossy().to_string();
                let historical = historical_suspiciousness
                    .get(&file_path)
                    .copied()
                    .unwrap_or(0.0);
                let combined = alpha * r.suspiciousness + (1.0 - alpha) * historical;

                let mut scores = r.scores.clone();
                scores.insert("historical".to_string(), historical);
                scores.insert("combined".to_string(), combined);

                SuspiciousnessRanking {
                    rank: 0, // Will be re-ranked below
                    statement: r.statement.clone(),
                    suspiciousness: combined,
                    scores,
                    explanation: format!(
                        "{} Historical suspiciousness: {:.2}",
                        r.explanation, historical
                    ),
                    failed_coverage: r.failed_coverage,
                    passed_coverage: r.passed_coverage,
                }
            })
            .collect();

        // Re-rank by combined score
        combined_rankings.sort_by(|a, b| {
            b.suspiciousness
                .partial_cmp(&a.suspiciousness)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        for (i, ranking) in combined_rankings.iter_mut().enumerate() {
            ranking.rank = i + 1;
        }

        FaultLocalizationResult {
            rankings: combined_rankings,
            formula_used: sbfl_result.formula_used,
            confidence: sbfl_result.confidence,
            total_passed_tests: sbfl_result.total_passed_tests,
            total_failed_tests: sbfl_result.total_failed_tests,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // ============== Formula Unit Tests ==============

    #[test]
    fn test_tarantula_perfect_fault() {
        // Statement executed by all failing tests, no passing tests
        // Should have maximum suspiciousness
        let score = tarantula(10, 0, 10, 100);
        assert!((score - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_tarantula_perfect_clean() {
        // Statement executed by all passing tests, no failing tests
        // Should have minimum suspiciousness
        let score = tarantula(0, 100, 10, 100);
        assert!(score.abs() < 0.001);
    }

    #[test]
    fn test_tarantula_mixed() {
        // Statement executed by 50% of failing and 50% of passing
        let score = tarantula(5, 50, 10, 100);
        assert!(score > 0.0 && score < 1.0);
    }

    #[test]
    fn test_tarantula_no_tests() {
        // Edge case: no tests
        let score = tarantula(0, 0, 0, 0);
        assert!(score.abs() < 0.001);
    }

    #[test]
    fn test_ochiai_perfect_fault() {
        let score = ochiai(10, 0, 10);
        assert!((score - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_ochiai_no_execution() {
        let score = ochiai(0, 0, 10);
        assert!(score.abs() < 0.001);
    }

    #[test]
    fn test_ochiai_mixed() {
        let score = ochiai(5, 50, 10);
        assert!(score > 0.0 && score < 1.0);
    }

    #[test]
    fn test_dstar_perfect_fault() {
        let score = dstar(10, 0, 10, 2);
        // With star=2: 10^2 / (0 + 0) = infinity, but we cap at MAX
        assert!(score > 100.0);
    }

    #[test]
    fn test_dstar_mixed() {
        let score = dstar(5, 50, 10, 2);
        // 25 / (50 + 5) = 0.4545...
        assert!((score - 0.4545).abs() < 0.01);
    }

    #[test]
    fn test_dstar_exponent_effect() {
        let score2 = dstar(5, 10, 10, 2);
        let score3 = dstar(5, 10, 10, 3);
        // Higher exponent amplifies the signal
        assert!(score3 > score2);
    }

    // ============== Localizer Tests ==============

    #[test]
    fn test_localizer_basic() {
        let localizer = SbflLocalizer::new();

        let coverage = vec![
            StatementCoverage::new(StatementId::new("file.rs", 10), 0, 10), // All failing
            StatementCoverage::new(StatementId::new("file.rs", 20), 100, 0), // All passing
            StatementCoverage::new(StatementId::new("file.rs", 30), 50, 5), // Mixed
        ];

        let result = localizer.localize(&coverage, 100, 10);

        assert_eq!(result.rankings.len(), 3);
        assert_eq!(result.rankings[0].statement.line, 10); // Most suspicious first
        assert!(result.rankings[0].suspiciousness > result.rankings[1].suspiciousness);
    }

    #[test]
    fn test_localizer_top_n() {
        let localizer = SbflLocalizer::new().with_top_n(2);

        let coverage = vec![
            StatementCoverage::new(StatementId::new("file.rs", 10), 0, 10),
            StatementCoverage::new(StatementId::new("file.rs", 20), 50, 5),
            StatementCoverage::new(StatementId::new("file.rs", 30), 100, 0),
        ];

        let result = localizer.localize(&coverage, 100, 10);

        assert_eq!(result.rankings.len(), 2);
    }

    #[test]
    fn test_localizer_formula_selection() {
        let coverage = vec![StatementCoverage::new(
            StatementId::new("file.rs", 10),
            50,
            5,
        )];

        let tarantula_result = SbflLocalizer::new()
            .with_formula(SbflFormula::Tarantula)
            .localize(&coverage, 100, 10);

        let ochiai_result = SbflLocalizer::new()
            .with_formula(SbflFormula::Ochiai)
            .localize(&coverage, 100, 10);

        // Scores should differ between formulas
        assert_ne!(
            tarantula_result.rankings[0].suspiciousness,
            ochiai_result.rankings[0].suspiciousness
        );
    }

    #[test]
    fn test_localizer_includes_all_scores() {
        let localizer = SbflLocalizer::new();

        let coverage = vec![StatementCoverage::new(
            StatementId::new("file.rs", 10),
            50,
            5,
        )];

        let result = localizer.localize(&coverage, 100, 10);

        let scores = &result.rankings[0].scores;
        assert!(scores.contains_key("tarantula"));
        assert!(scores.contains_key("ochiai"));
        assert!(scores.contains_key("dstar2"));
        assert!(scores.contains_key("dstar3"));
    }

    #[test]
    fn test_localizer_explanation() {
        let localizer = SbflLocalizer::new().with_explanations(true);

        let coverage = vec![StatementCoverage::new(
            StatementId::new("file.rs", 10),
            10,
            5,
        )];

        let result = localizer.localize(&coverage, 100, 10);

        assert!(!result.rankings[0].explanation.is_empty());
        assert!(result.rankings[0].explanation.contains("50%")); // 5/10 = 50%
    }

    #[test]
    fn test_localizer_no_explanation() {
        let localizer = SbflLocalizer::new().with_explanations(false);

        let coverage = vec![StatementCoverage::new(
            StatementId::new("file.rs", 10),
            10,
            5,
        )];

        let result = localizer.localize(&coverage, 100, 10);

        assert!(result.rankings[0].explanation.is_empty());
    }

    #[test]
    fn test_localizer_confidence() {
        let localizer = SbflLocalizer::new();

        // More failing tests = higher confidence
        // Need multiple statements for coverage_factor to be non-zero
        let coverage: Vec<StatementCoverage> = (1..=100)
            .map(|i| StatementCoverage::new(StatementId::new("file.rs", i), 90, 10))
            .collect();

        let result_many_fail = localizer.localize(&coverage, 90, 10);
        let result_few_fail = localizer.localize(&coverage, 99, 1);

        assert!(result_many_fail.confidence > result_few_fail.confidence);
    }

    // ============== LCOV Parser Tests ==============

    #[test]
    fn test_lcov_parse_basic() {
        let lcov = r#"
SF:src/main.rs
DA:10,5
DA:20,0
DA:30,12
end_of_record
"#;

        let results = LcovParser::parse(lcov).unwrap();

        assert_eq!(results.len(), 3);
        assert_eq!(results[0].0.line, 10);
        assert_eq!(results[0].1, 5);
    }

    #[test]
    fn test_lcov_parse_multiple_files() {
        let lcov = r#"
SF:src/a.rs
DA:10,5
end_of_record
SF:src/b.rs
DA:20,10
end_of_record
"#;

        let results = LcovParser::parse(lcov).unwrap();

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].0.file, PathBuf::from("src/a.rs"));
        assert_eq!(results[1].0.file, PathBuf::from("src/b.rs"));
    }

    #[test]
    fn test_lcov_combine_coverage() {
        let passed = vec![
            (StatementId::new("file.rs", 10), 5),
            (StatementId::new("file.rs", 20), 10),
        ];

        let failed = vec![
            (StatementId::new("file.rs", 10), 3),
            (StatementId::new("file.rs", 30), 1),
        ];

        let combined = LcovParser::combine_coverage(&passed, &failed);

        assert_eq!(combined.len(), 3);

        // Find the statement at line 10 (covered by both)
        let stmt_10 = combined.iter().find(|c| c.id.line == 10).unwrap();
        assert_eq!(stmt_10.executed_by_passed, 1); // At least one passed test
        assert_eq!(stmt_10.executed_by_failed, 1); // At least one failed test
    }

    // ============== Integration Tests ==============

    #[test]
    fn test_end_to_end_localization() {
        // Simulate a scenario where line 50 is the fault
        let coverage = vec![
            // Line 50: Executed by all failing tests, few passing
            StatementCoverage::new(StatementId::new("buggy.rs", 50), 5, 10),
            // Line 60: Common code - executed by all
            StatementCoverage::new(StatementId::new("buggy.rs", 60), 95, 10),
            // Line 70: Only passing tests
            StatementCoverage::new(StatementId::new("buggy.rs", 70), 90, 0),
        ];

        let result = SbflLocalizer::new()
            .with_formula(SbflFormula::Tarantula)
            .localize(&coverage, 100, 10);

        // Line 50 should be ranked first (most suspicious)
        assert_eq!(result.rankings[0].statement.line, 50);
    }

    #[test]
    fn test_statement_id_equality() {
        let id1 = StatementId::new("file.rs", 10);
        let id2 = StatementId::new("file.rs", 10);
        let id3 = StatementId::new("file.rs", 20);

        assert_eq!(id1, id2);
        assert_ne!(id1, id3);
    }

    #[test]
    fn test_statement_id_with_column() {
        let id = StatementId::new("file.rs", 10).with_column(5);

        assert_eq!(id.column, Some(5));
    }

    #[test]
    fn test_formula_default() {
        let formula = SbflFormula::default();
        assert_eq!(formula, SbflFormula::Tarantula);
    }

    #[test]
    fn test_localizer_default() {
        let localizer = SbflLocalizer::default();
        let coverage = vec![StatementCoverage::new(
            StatementId::new("file.rs", 10),
            50,
            5,
        )];

        let result = localizer.localize(&coverage, 100, 10);
        assert_eq!(result.formula_used, SbflFormula::Tarantula);
    }

    #[test]
    fn test_confidence_edge_cases() {
        let localizer = SbflLocalizer::new();

        // No tests
        let result = localizer.localize(&[], 0, 0);
        assert_eq!(result.confidence, 0.0);

        // No failing tests
        let coverage = vec![StatementCoverage::new(
            StatementId::new("file.rs", 10),
            100,
            0,
        )];
        let result = localizer.localize(&coverage, 100, 0);
        assert_eq!(result.confidence, 0.0);
    }

    #[test]
    fn test_min_confidence_threshold() {
        let localizer = SbflLocalizer::new().with_min_confidence(0.5);

        let coverage = vec![
            StatementCoverage::new(StatementId::new("file.rs", 10), 0, 10), // High score
            StatementCoverage::new(StatementId::new("file.rs", 20), 100, 1), // Low score
        ];

        let result = localizer.localize(&coverage, 100, 10);

        // Only the high-score statement should be included
        assert!(result.rankings.iter().all(|r| r.suspiciousness >= 0.5));
    }

    #[test]
    fn test_serialization() {
        let result = FaultLocalizationResult {
            rankings: vec![SuspiciousnessRanking {
                rank: 1,
                statement: StatementId::new("file.rs", 10),
                suspiciousness: 0.95,
                scores: HashMap::new(),
                explanation: "Test".to_string(),
                failed_coverage: 10,
                passed_coverage: 5,
            }],
            formula_used: SbflFormula::Tarantula,
            confidence: 0.8,
            total_passed_tests: 100,
            total_failed_tests: 10,
        };

        let json = serde_json::to_string(&result).unwrap();
        let deserialized: FaultLocalizationResult = serde_json::from_str(&json).unwrap();

        assert_eq!(deserialized.rankings.len(), 1);
        assert_eq!(deserialized.confidence, 0.8);
    }

    // ============== TarantulaIntegration Tests (TDD - Red Phase) ==============

    #[test]
    fn test_integration_is_coverage_tool_available() {
        // Should not panic, returns bool
        let _available = TarantulaIntegration::is_coverage_tool_available();
    }

    #[test]
    fn test_integration_parse_lcov_output() {
        let lcov = r#"SF:src/main.rs
DA:10,5
DA:20,0
DA:30,12
end_of_record
SF:src/lib.rs
DA:100,8
DA:200,0
end_of_record"#;

        let result = TarantulaIntegration::parse_lcov_output(lcov).unwrap();

        assert_eq!(result.len(), 5);
        assert!(result
            .iter()
            .any(|(s, _)| s.file.as_path() == std::path::Path::new("src/main.rs") && s.line == 10));
        assert!(result
            .iter()
            .any(|(s, _)| s.file.as_path() == std::path::Path::new("src/lib.rs") && s.line == 100));
    }

    #[test]
    fn test_integration_run_localization() {
        // Create test coverage data
        let passed_coverage = vec![
            (StatementId::new("src/buggy.rs", 10), 5_usize),
            (StatementId::new("src/buggy.rs", 20), 10_usize),
            (StatementId::new("src/buggy.rs", 30), 8_usize),
        ];

        let failed_coverage = vec![
            (StatementId::new("src/buggy.rs", 10), 3_usize),
            (StatementId::new("src/buggy.rs", 20), 0_usize),
            (StatementId::new("src/buggy.rs", 40), 5_usize), // Only in failing
        ];

        let config = LocalizationConfig::default();
        let result = TarantulaIntegration::run_localization(
            &passed_coverage,
            &failed_coverage,
            1, // 1 passed test
            1, // 1 failed test
            &config,
        );

        assert!(!result.rankings.is_empty());
        // Line 40 should be most suspicious (only executed by failing tests)
        assert_eq!(result.rankings[0].statement.line, 40);
    }

    #[test]
    fn test_localization_config_default() {
        let config = LocalizationConfig::default();

        assert_eq!(config.formula, SbflFormula::Tarantula);
        assert_eq!(config.top_n, 10);
        assert!(config.include_explanations);
    }

    #[test]
    fn test_localization_config_builder() {
        let config = LocalizationConfig::new()
            .with_formula(SbflFormula::Ochiai)
            .with_top_n(5)
            .with_explanations(false);

        assert_eq!(config.formula, SbflFormula::Ochiai);
        assert_eq!(config.top_n, 5);
        assert!(!config.include_explanations);
    }

    #[test]
    fn test_integration_generate_report_yaml() {
        let result = FaultLocalizationResult {
            rankings: vec![SuspiciousnessRanking {
                rank: 1,
                statement: StatementId::new("src/bug.rs", 42),
                suspiciousness: 0.95,
                scores: {
                    let mut m = HashMap::new();
                    m.insert("tarantula".to_string(), 0.95);
                    m.insert("ochiai".to_string(), 0.92);
                    m
                },
                explanation: "High suspicion".to_string(),
                failed_coverage: 10,
                passed_coverage: 2,
            }],
            formula_used: SbflFormula::Tarantula,
            confidence: 0.85,
            total_passed_tests: 100,
            total_failed_tests: 10,
        };

        let yaml = TarantulaIntegration::generate_report(&result, ReportFormat::Yaml).unwrap();

        assert!(yaml.contains("src/bug.rs"));
        assert!(yaml.contains("42"));
        assert!(yaml.contains("0.95") || yaml.contains("0.9")); // Score present
    }

    #[test]
    fn test_integration_generate_report_json() {
        let result = FaultLocalizationResult {
            rankings: vec![],
            formula_used: SbflFormula::Ochiai,
            confidence: 0.5,
            total_passed_tests: 50,
            total_failed_tests: 5,
        };

        let json = TarantulaIntegration::generate_report(&result, ReportFormat::Json).unwrap();

        assert!(json.contains("Ochiai"));
        assert!(json.contains("0.5"));
    }

    #[test]
    fn test_integration_combine_with_tdg() {
        // Create localization result
        let mut result = FaultLocalizationResult {
            rankings: vec![SuspiciousnessRanking {
                rank: 1,
                statement: StatementId::new("src/complex.rs", 100),
                suspiciousness: 0.8,
                scores: HashMap::new(),
                explanation: String::new(),
                failed_coverage: 5,
                passed_coverage: 10,
            }],
            formula_used: SbflFormula::Tarantula,
            confidence: 0.7,
            total_passed_tests: 100,
            total_failed_tests: 10,
        };

        // Create mock TDG scores
        let mut tdg_scores = HashMap::new();
        tdg_scores.insert("src/complex.rs".to_string(), 45.0_f32); // Low TDG = high debt

        TarantulaIntegration::enrich_with_tdg(&mut result, &tdg_scores);

        // Should have TDG score in the scores map
        assert!(result.rankings[0].scores.contains_key("tdg"));
        assert_eq!(result.rankings[0].scores.get("tdg"), Some(&45.0));
    }

    #[test]
    fn test_report_format_enum() {
        assert_eq!(ReportFormat::default(), ReportFormat::Yaml);
    }

    // ============== SZZ Algorithm Tests ==============

    #[test]
    fn test_szz_identify_bug_fixes() {
        let commits = vec![
            (
                "abc123".to_string(),
                "fix: resolve null pointer exception".to_string(),
            ),
            ("def456".to_string(), "feat: add new feature".to_string()),
            (
                "ghi789".to_string(),
                "bugfix: memory leak in parser".to_string(),
            ),
            ("jkl012".to_string(), "docs: update readme".to_string()),
            (
                "mno345".to_string(),
                "closes #123: fix race condition".to_string(),
            ),
        ];

        let fixes = SzzAnalyzer::identify_bug_fixes(&commits);

        assert_eq!(fixes.len(), 3);
        assert!(fixes.iter().any(|(h, _)| h == "abc123"));
        assert!(fixes.iter().any(|(h, _)| h == "ghi789"));
        assert!(fixes.iter().any(|(h, _)| h == "mno345"));
    }

    #[test]
    fn test_szz_identify_no_fixes() {
        let commits = vec![
            ("abc123".to_string(), "feat: new feature".to_string()),
            ("def456".to_string(), "docs: documentation".to_string()),
            ("ghi789".to_string(), "refactor: clean up code".to_string()),
        ];

        let fixes = SzzAnalyzer::identify_bug_fixes(&commits);

        assert!(fixes.is_empty());
    }

    #[test]
    fn test_szz_trace_introducing_commits() {
        let changed_lines = vec![
            ("src/bug.rs".to_string(), 50, true),  // Deleted (likely bug)
            ("src/bug.rs".to_string(), 51, true),  // Deleted
            ("src/bug.rs".to_string(), 55, false), // Added (fix)
        ];

        let mut blame_data = HashMap::new();
        blame_data.insert(
            ("src/bug.rs".to_string(), 50),
            ("bad_commit_1".to_string(), "author1".to_string()),
        );
        blame_data.insert(
            ("src/bug.rs".to_string(), 51),
            ("bad_commit_1".to_string(), "author1".to_string()),
        );

        let result = SzzAnalyzer::trace_introducing_commits(
            "fix_commit",
            "fix: null pointer exception",
            &changed_lines,
            &blame_data,
        );

        assert_eq!(result.bug_fixing_commit, "fix_commit");
        assert_eq!(result.bug_introducing_commits.len(), 1);
        assert!(result
            .bug_introducing_commits
            .contains(&"bad_commit_1".to_string()));
        assert_eq!(result.faulty_lines.len(), 2);
        assert_eq!(result.confidence, SzzConfidence::High);
    }

    #[test]
    fn test_szz_trace_no_blame_data() {
        let changed_lines = vec![("src/new.rs".to_string(), 10, true)];
        let blame_data = HashMap::new();

        let result = SzzAnalyzer::trace_introducing_commits(
            "fix_commit",
            "fix: issue",
            &changed_lines,
            &blame_data,
        );

        assert!(result.bug_introducing_commits.is_empty());
        assert_eq!(result.confidence, SzzConfidence::Low);
    }

    #[test]
    fn test_szz_filter_cosmetic_changes() {
        let changes = vec![
            ("src/code.rs".to_string(), 1, true), // Empty line
            ("src/code.rs".to_string(), 2, true), // Comment
            ("src/code.rs".to_string(), 3, true), // Import
            ("src/code.rs".to_string(), 4, true), // Real code
        ];

        let mut file_contents = HashMap::new();
        file_contents.insert(
            "src/code.rs".to_string(),
            vec![
                "".to_string(),                               // Line 1: empty
                "// This is a comment".to_string(),           // Line 2: comment
                "use std::collections::HashMap;".to_string(), // Line 3: import
                "let x = compute_value();".to_string(),       // Line 4: real code
            ],
        );

        let filtered = SzzAnalyzer::filter_cosmetic_changes(&changes, &file_contents);

        // Only line 4 (real code) should remain
        assert_eq!(filtered.len(), 1);
        assert_eq!(filtered[0].1, 4);
    }

    #[test]
    fn test_szz_calculate_file_suspiciousness() {
        let szz_results = vec![
            SzzResult {
                bug_fixing_commit: "fix1".to_string(),
                bug_introducing_commits: vec!["bad1".to_string()],
                faulty_lines: vec![
                    ("src/buggy.rs".to_string(), 10),
                    ("src/buggy.rs".to_string(), 20),
                ],
                confidence: SzzConfidence::High,
                fix_message: "fix: bug 1".to_string(),
            },
            SzzResult {
                bug_fixing_commit: "fix2".to_string(),
                bug_introducing_commits: vec!["bad2".to_string()],
                faulty_lines: vec![
                    ("src/buggy.rs".to_string(), 30),
                    ("src/other.rs".to_string(), 10),
                ],
                confidence: SzzConfidence::High,
                fix_message: "fix: bug 2".to_string(),
            },
        ];
1609
1610        let suspiciousness = SzzAnalyzer::calculate_file_suspiciousness(&szz_results);
1611
1612        // src/buggy.rs has 3 faulty lines across 2 bugs = 1.5 (but capped per file)
1613        // src/other.rs has 1 faulty line = 0.5
1614        assert!(suspiciousness.contains_key("src/buggy.rs"));
1615        assert!(suspiciousness.contains_key("src/other.rs"));
1616        assert!(
1617            suspiciousness.get("src/buggy.rs").unwrap()
1618                > suspiciousness.get("src/other.rs").unwrap()
1619        );
1620    }
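
    // A hedged sketch of the aggregation the comments above suggest:
    // score(file) = faulty_lines(file) / total_fixes, capped at 1.0. The cap
    // value of 1.0 is an assumption made for illustration; the real test only
    // asserts the relative ordering, which holds either way.
    #[test]
    fn test_file_suspiciousness_aggregation_sketch() {
        let faulty_lines = vec!["src/buggy.rs", "src/buggy.rs", "src/buggy.rs", "src/other.rs"];
        let total_fixes = 2.0_f32;

        // Count faulty lines per file, then normalize by the number of fixes.
        let mut counts: HashMap<&str, f32> = HashMap::new();
        for file in faulty_lines {
            *counts.entry(file).or_insert(0.0) += 1.0;
        }
        let score = |file: &str| (counts[file] / total_fixes).min(1.0);

        assert!((score("src/buggy.rs") - 1.0).abs() < 1e-6); // 1.5 capped to 1.0
        assert!((score("src/other.rs") - 0.5).abs() < 1e-6);
        assert!(score("src/buggy.rs") > score("src/other.rs"));
    }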

    // ============== Hybrid Fault Localizer Tests ==============

    #[test]
    fn test_hybrid_combine_scores_basic() {
        let sbfl_result = FaultLocalizationResult {
            rankings: vec![
                SuspiciousnessRanking {
                    rank: 1,
                    statement: StatementId::new("src/a.rs", 10),
                    suspiciousness: 0.9,
                    scores: HashMap::new(),
                    explanation: "High SBFL".to_string(),
                    failed_coverage: 10,
                    passed_coverage: 2,
                },
                SuspiciousnessRanking {
                    rank: 2,
                    statement: StatementId::new("src/b.rs", 20),
                    suspiciousness: 0.5,
                    scores: HashMap::new(),
                    explanation: "Medium SBFL".to_string(),
                    failed_coverage: 5,
                    passed_coverage: 5,
                },
            ],
            formula_used: SbflFormula::Tarantula,
            confidence: 0.8,
            total_passed_tests: 100,
            total_failed_tests: 10,
        };

        let mut historical = HashMap::new();
        historical.insert("src/a.rs".to_string(), 0.2_f32); // Low historical
        historical.insert("src/b.rs".to_string(), 0.9_f32); // High historical

        // With alpha=0.7: a = 0.7*0.9 + 0.3*0.2 = 0.69, b = 0.7*0.5 + 0.3*0.9 = 0.62
        let combined = HybridFaultLocalizer::combine_scores(&sbfl_result, &historical, 0.7);

        assert_eq!(combined.rankings.len(), 2);
        // Order is preserved: 0.69 > 0.62, so src/a.rs stays ranked first
        assert_eq!(
            combined.rankings[0].statement.file,
            PathBuf::from("src/a.rs")
        );

        // Check scores include historical and combined components
        assert!(combined.rankings[0].scores.contains_key("historical"));
        assert!(combined.rankings[0].scores.contains_key("combined"));
    }
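
    // A hedged sketch of the blend the expected values above assume:
    // combined = alpha * sbfl + (1 - alpha) * historical. This reproduces the
    // 0.69 / 0.62 arithmetic standalone; it illustrates the assumed formula
    // and does not call into combine_scores itself.
    #[test]
    fn test_hybrid_alpha_blend_arithmetic_sketch() {
        let alpha = 0.7_f32;
        let blend = |sbfl: f32, historical: f32| alpha * sbfl + (1.0 - alpha) * historical;

        assert!((blend(0.9, 0.2) - 0.69).abs() < 1e-4);
        assert!((blend(0.5, 0.9) - 0.62).abs() < 1e-4);
        // With alpha = 0.7 the SBFL signal dominates, so src/a.rs keeps rank 1.
        assert!(blend(0.9, 0.2) > blend(0.5, 0.9));
    }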

    #[test]
    fn test_hybrid_combine_scores_reranking() {
        let sbfl_result = FaultLocalizationResult {
            rankings: vec![
                SuspiciousnessRanking {
                    rank: 1,
                    statement: StatementId::new("src/low_hist.rs", 10),
                    suspiciousness: 0.6,
                    scores: HashMap::new(),
                    explanation: String::new(),
                    failed_coverage: 6,
                    passed_coverage: 4,
                },
                SuspiciousnessRanking {
                    rank: 2,
                    statement: StatementId::new("src/high_hist.rs", 20),
                    suspiciousness: 0.4,
                    scores: HashMap::new(),
                    explanation: String::new(),
                    failed_coverage: 4,
                    passed_coverage: 6,
                },
            ],
            formula_used: SbflFormula::Ochiai,
            confidence: 0.7,
            total_passed_tests: 100,
            total_failed_tests: 10,
        };

        let mut historical = HashMap::new();
        historical.insert("src/low_hist.rs".to_string(), 0.0_f32);
        historical.insert("src/high_hist.rs".to_string(), 1.0_f32);

        // With alpha=0.3 (historical weighted heavily):
        // low_hist = 0.3*0.6 + 0.7*0.0 = 0.18
        // high_hist = 0.3*0.4 + 0.7*1.0 = 0.82
        let combined = HybridFaultLocalizer::combine_scores(&sbfl_result, &historical, 0.3);

        // high_hist should now be ranked #1
        assert_eq!(
            combined.rankings[0].statement.file,
            PathBuf::from("src/high_hist.rs")
        );
        assert_eq!(combined.rankings[0].rank, 1);
        assert_eq!(combined.rankings[1].rank, 2);
    }
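
    // A hedged sketch of the re-ranking step the test above relies on: after
    // blending, entries are assumed to be sorted by combined score in
    // descending order and ranks reassigned from 1. Illustrative only.
    #[test]
    fn test_rerank_by_combined_score_sketch() {
        let mut scored = vec![("src/low_hist.rs", 0.18_f32), ("src/high_hist.rs", 0.82_f32)];
        scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());

        let reranked: Vec<(usize, &str)> = scored
            .iter()
            .enumerate()
            .map(|(i, (file, _score))| (i + 1, *file))
            .collect();

        assert_eq!(reranked[0], (1, "src/high_hist.rs"));
        assert_eq!(reranked[1], (2, "src/low_hist.rs"));
    }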

    #[test]
    fn test_hybrid_no_historical_data() {
        let sbfl_result = FaultLocalizationResult {
            rankings: vec![SuspiciousnessRanking {
                rank: 1,
                statement: StatementId::new("src/new.rs", 10),
                suspiciousness: 0.8,
                scores: HashMap::new(),
                explanation: String::new(),
                failed_coverage: 8,
                passed_coverage: 2,
            }],
            formula_used: SbflFormula::DStar { exponent: 2 },
            confidence: 0.6,
            total_passed_tests: 50,
            total_failed_tests: 5,
        };

        let historical = HashMap::new(); // No historical data for any file

        let combined = HybridFaultLocalizer::combine_scores(&sbfl_result, &historical, 0.7);

        // Missing history defaults to 0.0, so score = 0.7 * 0.8 + 0.3 * 0.0 = 0.56
        assert!((combined.rankings[0].suspiciousness - 0.56).abs() < 0.01);
        assert_eq!(combined.rankings[0].scores.get("historical"), Some(&0.0));
    }

    #[test]
    fn test_szz_confidence_enum() {
        // Sanity checks on the derived PartialEq for SzzConfidence
        assert_eq!(SzzConfidence::High, SzzConfidence::High);
        assert_ne!(SzzConfidence::High, SzzConfidence::Low);
    }

    #[test]
    fn test_szz_result_serialization() {
        let result = SzzResult {
            bug_fixing_commit: "abc123".to_string(),
            bug_introducing_commits: vec!["def456".to_string()],
            faulty_lines: vec![("src/bug.rs".to_string(), 42)],
            confidence: SzzConfidence::High,
            fix_message: "fix: critical bug".to_string(),
        };

        let json = serde_json::to_string(&result).unwrap();
        let deserialized: SzzResult = serde_json::from_str(&json).unwrap();

        assert_eq!(deserialized.bug_fixing_commit, "abc123");
        assert_eq!(deserialized.confidence, SzzConfidence::High);
    }
}