use crate::classifier::{Classification, DefectCategory, HybridClassifier};
use crate::git::{CommitInfo, GitAnalyzer};
use crate::pmat::{PmatIntegration, TdgAnalysis};
use crate::report::{DefectInstance, DefectPattern, QualitySignals};
use anyhow::Result;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use tracing::{debug, info};
/// Analyzes a repository's commit history and aggregates the commits that the
/// classifier recognises as defect fixes into per-category [`DefectPattern`]s.
pub struct OrgAnalyzer {
/// Used to clone repositories and to read their commits.
git_analyzer: GitAnalyzer,
/// Classifies a commit message into a defect category (or none).
classifier: HybridClassifier,
/// Base directory; `cache_dir/<repo_name>` is the path handed to TDG analysis.
cache_dir: PathBuf,
}
impl OrgAnalyzer {
    /// Creates an analyzer that uses the rule-based classifier
    /// ([`HybridClassifier::new_rule_based`]).
    ///
    /// `cache_dir` is passed to [`GitAnalyzer::new`] and is also the base directory
    /// under which [`analyze_repository`](Self::analyze_repository) looks for a
    /// repository (`cache_dir/<repo_name>`) when running TDG analysis.
    pub fn new<P: AsRef<Path>>(cache_dir: P) -> Self {
        let cache_dir = cache_dir.as_ref().to_path_buf();
        Self {
            git_analyzer: GitAnalyzer::new(&cache_dir),
            classifier: HybridClassifier::new_rule_based(),
            cache_dir,
        }
    }

    /// Creates an analyzer whose classifier is built with
    /// [`HybridClassifier::new_hybrid`] from a trained ML model.
    ///
    /// `ml_model` and `confidence_threshold` are forwarded unchanged to the
    /// classifier; their exact semantics are defined there. `cache_dir` is used
    /// exactly as in [`new`](Self::new).
    pub fn with_ml_model<P: AsRef<Path>>(
        cache_dir: P,
        ml_model: crate::ml_trainer::TrainedModel,
        confidence_threshold: f32,
    ) -> Self {
        let cache_dir = cache_dir.as_ref().to_path_buf();
        Self {
            git_analyzer: GitAnalyzer::new(&cache_dir),
            classifier: HybridClassifier::new_hybrid(ml_model, confidence_threshold),
            cache_dir,
        }
    }

    /// Clones `repo_url`, reads up to `max_commits` commits and returns the defect
    /// patterns found in their messages.
    ///
    /// After aggregation, TDG analysis is attempted on `cache_dir/<repo_name>`.
    /// When it succeeds its average and maximum scores are copied onto every
    /// pattern; when it fails the failure is logged at debug level and the
    /// patterns are returned without TDG scores.
    ///
    /// Note that this function contains no `.await` points: the git and TDG
    /// calls run synchronously on the calling task.
    ///
    /// # Errors
    ///
    /// Returns an error if cloning the repository or reading its commits fails.
    pub async fn analyze_repository(
        &self,
        repo_url: &str,
        repo_name: &str,
        max_commits: usize,
    ) -> Result<Vec<DefectPattern>> {
        info!(
            "Analyzing repository {} (up to {} commits)",
            repo_name, max_commits
        );

        self.git_analyzer.clone_repository(repo_url, repo_name)?;
        let commits = self.git_analyzer.analyze_commits(repo_name, max_commits)?;
        debug!("Retrieved {} commits from {}", commits.len(), repo_name);

        let mut patterns = self.aggregate_defect_patterns(&commits);

        // TDG enrichment is best-effort: a failure must not fail the whole analysis,
        // but the reason is logged so it can be diagnosed.
        let repo_path = self.cache_dir.join(repo_name);
        match PmatIntegration::analyze_tdg(&repo_path) {
            Ok(tdg_analysis) => {
                debug!(
                    "TDG analysis: avg={:.1}, max={:.1}",
                    tdg_analysis.average_score, tdg_analysis.max_score
                );
                self.enrich_with_tdg(&mut patterns, &tdg_analysis);
            }
            Err(err) => {
                debug!(
                    "TDG analysis unavailable (pmat not installed or failed): {:#}",
                    err
                );
            }
        }

        info!(
            "Found {} defect categories in {}",
            patterns.len(),
            repo_name
        );
        Ok(patterns)
    }

    /// Classifies every commit message and folds the classified commits into one
    /// [`DefectPattern`] per defect category.
    ///
    /// Commits for which the classifier returns `None` are skipped. The result is
    /// deterministic: patterns are ordered by descending frequency, and categories
    /// with equal frequency keep the order in which they first appear in `commits`.
    fn aggregate_defect_patterns(&self, commits: &[CommitInfo]) -> Vec<DefectPattern> {
        // `slot_by_category` maps a category to its index in `stats_in_order`, so the
        // accumulators stay in first-seen order instead of arbitrary hash order.
        let mut slot_by_category: HashMap<DefectCategory, usize> = HashMap::new();
        let mut stats_in_order: Vec<CategoryStats> = Vec::new();

        for commit in commits {
            if let Some(classification) = self.classifier.classify_from_message(&commit.message) {
                let category = classification.category;
                let slot = *slot_by_category.entry(category).or_insert_with(|| {
                    stats_in_order.push(CategoryStats::new(category));
                    stats_in_order.len() - 1
                });
                stats_in_order[slot].add_instance(commit, &classification);
            }
        }

        let mut patterns: Vec<DefectPattern> = stats_in_order
            .into_iter()
            .map(CategoryStats::into_defect_pattern)
            .collect();
        // `sort_by` is stable, so ties retain first-seen order.
        patterns.sort_by(|a, b| b.frequency.cmp(&a.frequency));
        patterns
    }

    /// Copies the analysis-wide average and maximum TDG scores onto every pattern.
    ///
    /// The same two values are written to each pattern; they are not broken down
    /// per category.
    fn enrich_with_tdg(&self, patterns: &mut [DefectPattern], tdg_analysis: &TdgAnalysis) {
        for pattern in patterns.iter_mut() {
            pattern.quality_signals.avg_tdg_score = Some(tdg_analysis.average_score);
            pattern.quality_signals.max_tdg_score = Some(tdg_analysis.max_score);
        }
    }
}
/// Running totals for all commits classified into a single defect category.
#[derive(Debug)]
struct CategoryStats {
/// The defect category these totals belong to.
category: DefectCategory,
/// Number of commits added so far.
count: usize,
/// Sum of the classification confidences of all added commits.
total_confidence: f32,
/// Example commits; `add_instance` stops storing new ones once three are held.
instances: Vec<DefectInstance>,
/// Sum of `files_changed` over all added commits.
total_files_changed: usize,
/// Sum of `lines_added` over all added commits.
total_lines_added: usize,
/// Sum of `lines_removed` over all added commits.
total_lines_removed: usize,
}
impl CategoryStats {
    /// Maximum number of example commits retained per category.
    const MAX_EXAMPLES: usize = 3;

    /// Maximum length, in bytes, of the abbreviated commit hash stored on an example.
    const SHORT_HASH_LEN: usize = 8;

    /// Creates an empty accumulator for `category`.
    fn new(category: DefectCategory) -> Self {
        Self {
            category,
            count: 0,
            total_confidence: 0.0,
            instances: Vec::new(),
            total_files_changed: 0,
            total_lines_added: 0,
            total_lines_removed: 0,
        }
    }

    /// Adds one classified commit to the running totals.
    ///
    /// Every commit contributes to the counters and sums; only the first
    /// [`MAX_EXAMPLES`](Self::MAX_EXAMPLES) commits are also stored as examples,
    /// with their hash abbreviated via [`short_hash`](Self::short_hash).
    fn add_instance(&mut self, commit: &CommitInfo, classification: &Classification) {
        self.count += 1;
        self.total_confidence += classification.confidence;
        self.total_files_changed += commit.files_changed;
        self.total_lines_added += commit.lines_added;
        self.total_lines_removed += commit.lines_removed;

        if self.instances.len() < Self::MAX_EXAMPLES {
            self.instances.push(DefectInstance {
                commit_hash: Self::short_hash(&commit.hash).to_string(),
                message: commit.message.clone(),
                author: commit.author.clone(),
                timestamp: commit.timestamp,
                files_affected: commit.files_changed,
                lines_added: commit.lines_added,
                lines_removed: commit.lines_removed,
            });
        }
    }

    /// Returns the longest prefix of `hash` that is at most
    /// [`SHORT_HASH_LEN`](Self::SHORT_HASH_LEN) bytes long and ends on a character
    /// boundary.
    ///
    /// For ASCII input (such as hexadecimal hashes) this is simply the first eight
    /// bytes, or the whole string when it is shorter. Backing off to a character
    /// boundary avoids the panic that a plain byte-range slice would raise if the
    /// cut landed inside a multi-byte character.
    fn short_hash(hash: &str) -> &str {
        let mut end = Self::SHORT_HASH_LEN.min(hash.len());
        // Index 0 is always a boundary, so this loop terminates.
        while !hash.is_char_boundary(end) {
            end -= 1;
        }
        &hash[..end]
    }

    /// Converts the accumulated totals into a [`DefectPattern`].
    ///
    /// Confidence, lines changed (added plus removed) and files changed are
    /// averaged over the number of commits. With no commits the confidence is
    /// `0.0` and the quality signals are `QualitySignals::default()`, which avoids
    /// a division by zero.
    fn into_defect_pattern(self) -> DefectPattern {
        let (confidence, quality_signals) = if self.count == 0 {
            (0.0, QualitySignals::default())
        } else {
            let commits = self.count as f32;
            let lines_changed = (self.total_lines_added + self.total_lines_removed) as f32;
            let signals = QualitySignals {
                // TDG scores are not known at this point; they stay unset here.
                avg_tdg_score: None,
                max_tdg_score: None,
                avg_complexity: None,
                avg_test_coverage: None,
                satd_instances: 0,
                avg_lines_changed: lines_changed / commits,
                avg_files_per_commit: self.total_files_changed as f32 / commits,
            };
            (self.total_confidence / commits, signals)
        };

        DefectPattern {
            category: self.category,
            frequency: self.count,
            confidence,
            quality_signals,
            examples: self.instances,
        }
    }
}
/// Unit tests for [`OrgAnalyzer`] and [`CategoryStats`].
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Author used for every fixture commit.
    const AUTHOR: &str = "test@example.com";

    /// Timestamp of the first fixture commit; later commits count up from here.
    const BASE_TIMESTAMP: i64 = 1234567890;

    /// Builds an analyzer backed by a fresh temporary directory.
    ///
    /// The `TempDir` is returned as well so that the directory outlives the test body.
    fn analyzer_in_temp_dir() -> (TempDir, OrgAnalyzer) {
        let temp_dir = TempDir::new().unwrap();
        let analyzer = OrgAnalyzer::new(temp_dir.path());
        (temp_dir, analyzer)
    }

    /// Builds a fixture commit authored by [`AUTHOR`].
    fn commit(
        hash: &str,
        message: &str,
        timestamp: i64,
        files_changed: usize,
        lines_added: usize,
        lines_removed: usize,
    ) -> CommitInfo {
        CommitInfo {
            hash: hash.to_string(),
            message: message.to_string(),
            author: AUTHOR.to_string(),
            timestamp,
            files_changed,
            lines_added,
            lines_removed,
        }
    }

    /// Builds a fixture classification with a single matched pattern.
    fn classification(
        category: DefectCategory,
        confidence: f32,
        matched_pattern: &str,
    ) -> Classification {
        Classification {
            category,
            confidence,
            explanation: "test".to_string(),
            matched_patterns: vec![matched_pattern.to_string()],
        }
    }

    #[test]
    fn test_org_analyzer_can_be_created() {
        let (_temp_dir, _analyzer) = analyzer_in_temp_dir();
    }

    #[test]
    fn test_aggregate_empty_commits() {
        let (_temp_dir, analyzer) = analyzer_in_temp_dir();
        let commits: Vec<CommitInfo> = Vec::new();

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        assert!(patterns.is_empty());
    }

    #[test]
    fn test_aggregate_non_defect_commits() {
        let (_temp_dir, analyzer) = analyzer_in_temp_dir();
        let commits = vec![
            commit("abc123", "docs: update README", BASE_TIMESTAMP, 1, 5, 2),
            commit("def456", "chore: bump version", BASE_TIMESTAMP + 1, 1, 1, 1),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        assert!(patterns.is_empty());
    }

    #[test]
    fn test_aggregate_defect_commits() {
        let (_temp_dir, analyzer) = analyzer_in_temp_dir();
        let commits = vec![
            commit("abc123", "fix: use-after-free in buffer", BASE_TIMESTAMP, 2, 45, 12),
            commit("def456", "fix: another memory leak", BASE_TIMESTAMP + 1, 1, 8, 3),
            commit("ghi789", "security: prevent SQL injection", BASE_TIMESTAMP + 2, 3, 67, 23),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        assert_eq!(patterns.len(), 2);
        let memory_pattern = patterns
            .iter()
            .find(|p| p.category == DefectCategory::MemorySafety)
            .expect("Should find memory safety pattern");
        assert_eq!(memory_pattern.frequency, 2);
        assert!(memory_pattern.confidence > 0.0);
        assert_eq!(memory_pattern.examples.len(), 2);
    }

    #[test]
    fn test_category_stats_aggregation() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);
        let commit1 = commit("abc123", "fix: memory leak", BASE_TIMESTAMP, 2, 15, 5);
        let classification1 = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

        stats.add_instance(&commit1, &classification1);

        assert_eq!(stats.count, 1);
        assert_eq!(stats.total_confidence, 0.8);
        assert_eq!(stats.instances.len(), 1);

        let pattern = stats.into_defect_pattern();
        assert_eq!(pattern.frequency, 1);
        assert_eq!(pattern.confidence, 0.8);
        // 15 added + 5 removed over one commit.
        assert_eq!(pattern.quality_signals.avg_lines_changed, 20.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 2.0);
    }

    #[test]
    fn test_examples_limited_to_three() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);
        let memory_leak = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

        for i in 0..5_i64 {
            let fix = commit(
                &format!("hash{}", i),
                "fix: memory leak",
                BASE_TIMESTAMP + i,
                1,
                10,
                5,
            );
            stats.add_instance(&fix, &memory_leak);
        }

        // All five commits are counted, but only the first three are kept as examples.
        assert_eq!(stats.count, 5);
        assert_eq!(stats.instances.len(), 3);
    }

    #[test]
    fn test_enrich_with_tdg() {
        use crate::pmat::TdgAnalysis;
        use std::collections::HashMap;

        let (_temp_dir, analyzer) = analyzer_in_temp_dir();
        let mut patterns = vec![DefectPattern {
            category: DefectCategory::MemorySafety,
            frequency: 5,
            confidence: 0.85,
            quality_signals: QualitySignals::default(),
            examples: vec![],
        }];
        let tdg_analysis = TdgAnalysis {
            file_scores: HashMap::new(),
            average_score: 92.5,
            max_score: 98.0,
        };

        analyzer.enrich_with_tdg(&mut patterns, &tdg_analysis);

        assert_eq!(patterns[0].quality_signals.avg_tdg_score, Some(92.5));
        assert_eq!(patterns[0].quality_signals.max_tdg_score, Some(98.0));
    }

    #[tokio::test]
    #[ignore = "requires network access to clone a real repository"]
    async fn test_analyze_real_repository() {
        let (_temp_dir, analyzer) = analyzer_in_temp_dir();

        let patterns = analyzer
            .analyze_repository("https://github.com/rust-lang/rustlings", "rustlings", 100)
            .await
            .unwrap();

        // The set of categories depends on the repository's history, so only check
        // invariants that must hold for every reported pattern.
        for pattern in &patterns {
            assert!(pattern.frequency > 0);
            assert!(pattern.examples.len() <= pattern.frequency.min(3));
        }
    }

    #[test]
    fn test_category_stats_new() {
        let stats = CategoryStats::new(DefectCategory::LogicErrors);

        assert_eq!(stats.count, 0);
        assert_eq!(stats.total_confidence, 0.0);
        assert_eq!(stats.instances.len(), 0);
        assert_eq!(stats.total_files_changed, 0);
        assert_eq!(stats.total_lines_added, 0);
        assert_eq!(stats.total_lines_removed, 0);
    }

    #[test]
    fn test_category_stats_averaging() {
        let mut stats = CategoryStats::new(DefectCategory::SecurityVulnerabilities);
        let injection =
            classification(DefectCategory::SecurityVulnerabilities, 0.9, "sql injection");

        for i in 0..3_i64 {
            let fix = commit(
                &format!("hash{}", i),
                "fix: SQL injection",
                BASE_TIMESTAMP + i,
                2,
                10,
                5,
            );
            stats.add_instance(&fix, &injection);
        }

        let pattern = stats.into_defect_pattern();
        assert_eq!(pattern.frequency, 3);
        // Summing 0.9 three times in f32 is not exact, hence the tolerance.
        assert!((pattern.confidence - 0.9).abs() < 0.01);
        assert_eq!(pattern.quality_signals.avg_lines_changed, 15.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 2.0);
    }

    #[test]
    fn test_commit_hash_truncation() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);
        let long_hash = commit("abcdefghijklmnop", "fix: memory leak", BASE_TIMESTAMP, 1, 10, 5);
        let memory_leak = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

        stats.add_instance(&long_hash, &memory_leak);

        assert_eq!(stats.instances[0].commit_hash, "abcdefgh");
        assert_eq!(stats.instances[0].commit_hash.len(), 8);
    }

    #[test]
    fn test_commit_hash_short() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);
        let short_hash = commit("abc", "fix: memory leak", BASE_TIMESTAMP, 1, 10, 5);
        let memory_leak = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

        stats.add_instance(&short_hash, &memory_leak);

        // A hash shorter than eight characters is stored unchanged.
        assert_eq!(stats.instances[0].commit_hash, "abc");
    }

    #[test]
    fn test_category_stats_zero_count_pattern() {
        let stats = CategoryStats::new(DefectCategory::TypeErrors);

        let pattern = stats.into_defect_pattern();

        assert_eq!(pattern.frequency, 0);
        assert_eq!(pattern.confidence, 0.0);
        assert_eq!(pattern.quality_signals.avg_lines_changed, 0.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 0.0);
    }

    #[test]
    fn test_aggregate_mixed_commits() {
        let (_temp_dir, analyzer) = analyzer_in_temp_dir();
        let commits = vec![
            commit("abc123", "fix: null pointer dereference", BASE_TIMESTAMP, 2, 20, 5),
            // Not a defect fix; must not contribute to any pattern.
            commit("def456", "docs: update README", BASE_TIMESTAMP + 1, 1, 5, 2),
            commit("ghi789", "fix: another null pointer issue", BASE_TIMESTAMP + 2, 1, 10, 3),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        assert_eq!(patterns.len(), 1);
        let memory_pattern = &patterns[0];
        assert_eq!(memory_pattern.category, DefectCategory::MemorySafety);
        assert_eq!(memory_pattern.frequency, 2);
        assert_eq!(memory_pattern.examples.len(), 2);
    }

    #[test]
    fn test_quality_signals_calculation() {
        let mut stats = CategoryStats::new(DefectCategory::ConcurrencyBugs);
        let race_fix = commit("abc123", "fix: race condition", BASE_TIMESTAMP, 3, 50, 20);
        let race = classification(DefectCategory::ConcurrencyBugs, 0.82, "race condition");

        stats.add_instance(&race_fix, &race);
        let pattern = stats.into_defect_pattern();

        // 50 added + 20 removed over one commit.
        assert_eq!(pattern.quality_signals.avg_lines_changed, 70.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 3.0);
        assert!(pattern.quality_signals.avg_tdg_score.is_none());
        assert!(pattern.quality_signals.avg_complexity.is_none());
        assert!(pattern.quality_signals.avg_test_coverage.is_none());
        assert_eq!(pattern.quality_signals.satd_instances, 0);
    }

    #[test]
    fn test_enrich_with_tdg_multiple_patterns() {
        use crate::pmat::TdgAnalysis;
        use std::collections::HashMap;

        let (_temp_dir, analyzer) = analyzer_in_temp_dir();
        let mut patterns = vec![
            DefectPattern {
                category: DefectCategory::MemorySafety,
                frequency: 5,
                confidence: 0.85,
                quality_signals: QualitySignals::default(),
                examples: vec![],
            },
            DefectPattern {
                category: DefectCategory::SecurityVulnerabilities,
                frequency: 3,
                confidence: 0.90,
                quality_signals: QualitySignals::default(),
                examples: vec![],
            },
        ];
        let tdg_analysis = TdgAnalysis {
            file_scores: HashMap::new(),
            average_score: 85.5,
            max_score: 95.0,
        };

        analyzer.enrich_with_tdg(&mut patterns, &tdg_analysis);

        // Every pattern receives the same analysis-wide scores.
        for pattern in &patterns {
            assert_eq!(pattern.quality_signals.avg_tdg_score, Some(85.5));
            assert_eq!(pattern.quality_signals.max_tdg_score, Some(95.0));
        }
    }
}