1use crate::classifier::{Classification, DefectCategory, HybridClassifier};
7use crate::git::{CommitInfo, GitAnalyzer};
8use crate::pmat::{PmatIntegration, TdgAnalysis};
9use crate::report::{DefectInstance, DefectPattern, QualitySignals};
10use anyhow::Result;
11use std::collections::HashMap;
12use std::path::{Path, PathBuf};
13use tracing::{debug, info};
14
15pub struct OrgAnalyzer {
20 git_analyzer: GitAnalyzer,
21 classifier: HybridClassifier,
22 cache_dir: PathBuf,
23}
24
25impl OrgAnalyzer {
26 pub fn new<P: AsRef<Path>>(cache_dir: P) -> Self {
39 let cache_dir = cache_dir.as_ref().to_path_buf();
40 Self {
41 git_analyzer: GitAnalyzer::new(&cache_dir),
42 classifier: HybridClassifier::new_rule_based(),
43 cache_dir,
44 }
45 }
46
47 pub fn with_ml_model<P: AsRef<Path>>(
69 cache_dir: P,
70 ml_model: crate::ml_trainer::TrainedModel,
71 confidence_threshold: f32,
72 ) -> Self {
73 let cache_dir = cache_dir.as_ref().to_path_buf();
74 Self {
75 git_analyzer: GitAnalyzer::new(&cache_dir),
76 classifier: HybridClassifier::new_hybrid(ml_model, confidence_threshold),
77 cache_dir,
78 }
79 }
80
81 pub async fn analyze_repository(
106 &self,
107 repo_url: &str,
108 repo_name: &str,
109 max_commits: usize,
110 ) -> Result<Vec<DefectPattern>> {
111 info!(
112 "Analyzing repository {} (up to {} commits)",
113 repo_name, max_commits
114 );
115
116 self.git_analyzer.clone_repository(repo_url, repo_name)?;
118
119 let commits = self.git_analyzer.analyze_commits(repo_name, max_commits)?;
121 debug!("Retrieved {} commits from {}", commits.len(), repo_name);
122
123 let mut patterns = self.aggregate_defect_patterns(&commits);
125
126 let repo_path = self.cache_dir.join(repo_name);
128 if let Ok(tdg_analysis) = PmatIntegration::analyze_tdg(&repo_path) {
129 debug!(
130 "TDG analysis: avg={:.1}, max={:.1}",
131 tdg_analysis.average_score, tdg_analysis.max_score
132 );
133 self.enrich_with_tdg(&mut patterns, &tdg_analysis);
134 } else {
135 debug!("TDG analysis unavailable (pmat not installed or failed)");
136 }
137
138 info!(
139 "Found {} defect categories in {}",
140 patterns.len(),
141 repo_name
142 );
143 Ok(patterns)
144 }
145
146 fn aggregate_defect_patterns(&self, commits: &[CommitInfo]) -> Vec<DefectPattern> {
154 let mut category_map: HashMap<DefectCategory, CategoryStats> = HashMap::new();
155
156 for commit in commits {
158 if let Some(classification) = self.classifier.classify_from_message(&commit.message) {
159 let stats = category_map
160 .entry(classification.category)
161 .or_insert_with(|| CategoryStats::new(classification.category));
162
163 stats.add_instance(commit, &classification);
164 }
165 }
166
167 category_map
169 .into_values()
170 .map(|stats| stats.into_defect_pattern())
171 .collect()
172 }
173
174 fn enrich_with_tdg(&self, patterns: &mut [DefectPattern], tdg_analysis: &TdgAnalysis) {
180 for pattern in patterns.iter_mut() {
181 pattern.quality_signals.avg_tdg_score = Some(tdg_analysis.average_score);
183 pattern.quality_signals.max_tdg_score = Some(tdg_analysis.max_score);
184 }
185 }
186}
187
188#[derive(Debug)]
190struct CategoryStats {
191 category: DefectCategory,
192 count: usize,
193 total_confidence: f32,
194 instances: Vec<DefectInstance>,
195 total_files_changed: usize,
197 total_lines_added: usize,
198 total_lines_removed: usize,
199}
200
201impl CategoryStats {
202 fn new(category: DefectCategory) -> Self {
203 Self {
204 category,
205 count: 0,
206 total_confidence: 0.0,
207 instances: Vec::new(),
208 total_files_changed: 0,
209 total_lines_added: 0,
210 total_lines_removed: 0,
211 }
212 }
213
214 fn add_instance(&mut self, commit: &CommitInfo, classification: &Classification) {
215 self.count += 1;
216 self.total_confidence += classification.confidence;
217
218 self.total_files_changed += commit.files_changed;
220 self.total_lines_added += commit.lines_added;
221 self.total_lines_removed += commit.lines_removed;
222
223 if self.instances.len() < 3 {
225 self.instances.push(DefectInstance {
226 commit_hash: commit.hash[..8.min(commit.hash.len())].to_string(),
227 message: commit.message.clone(),
228 author: commit.author.clone(),
229 timestamp: commit.timestamp,
230 files_affected: commit.files_changed,
231 lines_added: commit.lines_added,
232 lines_removed: commit.lines_removed,
233 });
234 }
235 }
236
237 fn into_defect_pattern(self) -> DefectPattern {
238 let avg_confidence = if self.count > 0 {
239 self.total_confidence / self.count as f32
240 } else {
241 0.0
242 };
243
244 let quality_signals = if self.count > 0 {
246 QualitySignals {
247 avg_tdg_score: None, max_tdg_score: None,
249 avg_complexity: None,
250 avg_test_coverage: None,
251 satd_instances: 0, avg_lines_changed: (self.total_lines_added + self.total_lines_removed) as f32
253 / self.count as f32,
254 avg_files_per_commit: self.total_files_changed as f32 / self.count as f32,
255 }
256 } else {
257 QualitySignals::default()
258 };
259
260 DefectPattern {
261 category: self.category,
262 frequency: self.count,
263 confidence: avg_confidence,
264 quality_signals,
265 examples: self.instances,
266 }
267 }
268}
269
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates an analyzer backed by a fresh temporary cache directory.
    ///
    /// The `TempDir` is returned as well so that the directory outlives the analyzer
    /// for the duration of the test.
    fn analyzer() -> (TempDir, OrgAnalyzer) {
        let temp_dir = TempDir::new().unwrap();
        let analyzer = OrgAnalyzer::new(temp_dir.path());
        (temp_dir, analyzer)
    }

    /// Builds a commit fixture; every test uses the same author.
    fn commit(
        hash: &str,
        message: &str,
        timestamp: i64,
        files_changed: usize,
        lines_added: usize,
        lines_removed: usize,
    ) -> CommitInfo {
        CommitInfo {
            hash: hash.to_string(),
            message: message.to_string(),
            author: "test@example.com".to_string(),
            timestamp,
            files_changed,
            lines_added,
            lines_removed,
        }
    }

    /// Builds a classification fixture with a single matched pattern.
    fn classification(
        category: DefectCategory,
        confidence: f32,
        matched_pattern: &str,
    ) -> Classification {
        Classification {
            category,
            confidence,
            explanation: "test".to_string(),
            matched_patterns: vec![matched_pattern.to_string()],
        }
    }

    /// Builds a pattern with default quality signals and no examples.
    fn bare_pattern(category: DefectCategory, frequency: usize, confidence: f32) -> DefectPattern {
        DefectPattern {
            category,
            frequency,
            confidence,
            quality_signals: QualitySignals::default(),
            examples: vec![],
        }
    }

    #[test]
    fn test_org_analyzer_can_be_created() {
        let (_temp_dir, _analyzer) = analyzer();
    }

    #[test]
    fn test_aggregate_empty_commits() {
        let (_temp_dir, analyzer) = analyzer();

        let patterns = analyzer.aggregate_defect_patterns(&[]);

        assert!(patterns.is_empty());
    }

    #[test]
    fn test_aggregate_non_defect_commits() {
        let (_temp_dir, analyzer) = analyzer();

        let commits = vec![
            commit("abc123", "docs: update README", 1234567890, 1, 5, 2),
            commit("def456", "chore: bump version", 1234567891, 1, 1, 1),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);
        assert!(patterns.is_empty());
    }

    #[test]
    fn test_aggregate_defect_commits() {
        let (_temp_dir, analyzer) = analyzer();

        let commits = vec![
            commit("abc123", "fix: use-after-free in buffer", 1234567890, 2, 45, 12),
            commit("def456", "fix: another memory leak", 1234567891, 1, 8, 3),
            commit("ghi789", "security: prevent SQL injection", 1234567892, 3, 67, 23),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        assert_eq!(patterns.len(), 2);

        let memory_pattern = patterns
            .iter()
            .find(|p| p.category == DefectCategory::MemorySafety)
            .expect("Should find memory safety pattern");

        assert_eq!(memory_pattern.frequency, 2);
        assert!(memory_pattern.confidence > 0.0);
        assert_eq!(memory_pattern.examples.len(), 2);
    }

    #[test]
    fn test_category_stats_aggregation() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        let commit1 = commit("abc123", "fix: memory leak", 1234567890, 2, 15, 5);
        let classification1 = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

        stats.add_instance(&commit1, &classification1);

        assert_eq!(stats.count, 1);
        assert_eq!(stats.total_confidence, 0.8);
        assert_eq!(stats.instances.len(), 1);

        let pattern = stats.into_defect_pattern();
        assert_eq!(pattern.frequency, 1);
        assert_eq!(pattern.confidence, 0.8);
        assert_eq!(pattern.quality_signals.avg_lines_changed, 20.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 2.0);
    }

    #[test]
    fn test_examples_limited_to_three() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        for i in 0..5 {
            let commit = commit(
                &format!("hash{}", i),
                "fix: memory leak",
                1234567890 + i as i64,
                1,
                10,
                5,
            );
            let classification = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

            stats.add_instance(&commit, &classification);
        }

        // Every commit is counted, but only the first three are kept as examples.
        assert_eq!(stats.count, 5);
        assert_eq!(stats.instances.len(), 3);
    }

    #[test]
    fn test_enrich_with_tdg() {
        let (_temp_dir, analyzer) = analyzer();

        let mut patterns = vec![bare_pattern(DefectCategory::MemorySafety, 5, 0.85)];

        let tdg_analysis = TdgAnalysis {
            file_scores: HashMap::new(),
            average_score: 92.5,
            max_score: 98.0,
        };

        analyzer.enrich_with_tdg(&mut patterns, &tdg_analysis);

        assert_eq!(patterns[0].quality_signals.avg_tdg_score, Some(92.5));
        assert_eq!(patterns[0].quality_signals.max_tdg_score, Some(98.0));
    }

    #[tokio::test]
    #[ignore]
    async fn test_analyze_real_repository() {
        let (_temp_dir, analyzer) = analyzer();

        let patterns = analyzer
            .analyze_repository("https://github.com/rust-lang/rustlings", "rustlings", 100)
            .await
            .unwrap();

        // The repository may legitimately contain no defect commits, so the number of
        // patterns is not asserted. Each returned pattern must still be internally
        // consistent: built from at least one commit, with one to three examples.
        for pattern in &patterns {
            assert!(pattern.frequency > 0);
            assert!(!pattern.examples.is_empty());
            assert!(pattern.examples.len() <= 3);
            assert!(pattern.examples.len() <= pattern.frequency);
        }
    }

    #[test]
    fn test_category_stats_new() {
        let stats = CategoryStats::new(DefectCategory::LogicErrors);
        assert_eq!(stats.count, 0);
        assert_eq!(stats.total_confidence, 0.0);
        assert_eq!(stats.instances.len(), 0);
        assert_eq!(stats.total_files_changed, 0);
        assert_eq!(stats.total_lines_added, 0);
        assert_eq!(stats.total_lines_removed, 0);
    }

    #[test]
    fn test_category_stats_averaging() {
        let mut stats = CategoryStats::new(DefectCategory::SecurityVulnerabilities);

        for i in 0..3 {
            let commit = commit(
                &format!("hash{}", i),
                "fix: SQL injection",
                1234567890 + i as i64,
                2,
                10,
                5,
            );
            let classification =
                classification(DefectCategory::SecurityVulnerabilities, 0.9, "sql injection");

            stats.add_instance(&commit, &classification);
        }

        let pattern = stats.into_defect_pattern();
        assert_eq!(pattern.frequency, 3);
        assert!((pattern.confidence - 0.9).abs() < 0.01);
        assert_eq!(pattern.quality_signals.avg_lines_changed, 15.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 2.0);
    }

    #[test]
    fn test_commit_hash_truncation() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        let commit = commit("abcdefghijklmnop", "fix: memory leak", 1234567890, 1, 10, 5);
        let classification = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

        stats.add_instance(&commit, &classification);

        assert_eq!(stats.instances[0].commit_hash, "abcdefgh");
        assert_eq!(stats.instances[0].commit_hash.len(), 8);
    }

    #[test]
    fn test_commit_hash_short() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        let commit = commit("abc", "fix: memory leak", 1234567890, 1, 10, 5);
        let classification = classification(DefectCategory::MemorySafety, 0.8, "memory leak");

        stats.add_instance(&commit, &classification);

        assert_eq!(stats.instances[0].commit_hash, "abc");
    }

    #[test]
    fn test_category_stats_zero_count_pattern() {
        let stats = CategoryStats::new(DefectCategory::TypeErrors);
        let pattern = stats.into_defect_pattern();

        assert_eq!(pattern.frequency, 0);
        assert_eq!(pattern.confidence, 0.0);
        assert_eq!(pattern.quality_signals.avg_lines_changed, 0.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 0.0);
    }

    #[test]
    fn test_aggregate_mixed_commits() {
        let (_temp_dir, analyzer) = analyzer();

        let commits = vec![
            commit("abc123", "fix: null pointer dereference", 1234567890, 2, 20, 5),
            commit("def456", "docs: update README", 1234567891, 1, 5, 2),
            commit("ghi789", "fix: another null pointer issue", 1234567892, 1, 10, 3),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        assert_eq!(patterns.len(), 1);

        let memory_pattern = &patterns[0];
        assert_eq!(memory_pattern.category, DefectCategory::MemorySafety);
        assert_eq!(memory_pattern.frequency, 2);
        assert_eq!(memory_pattern.examples.len(), 2);
    }

    #[test]
    fn test_quality_signals_calculation() {
        let mut stats = CategoryStats::new(DefectCategory::ConcurrencyBugs);

        let commit = commit("abc123", "fix: race condition", 1234567890, 3, 50, 20);
        let classification = classification(DefectCategory::ConcurrencyBugs, 0.82, "race condition");

        stats.add_instance(&commit, &classification);

        let pattern = stats.into_defect_pattern();

        assert_eq!(pattern.quality_signals.avg_lines_changed, 70.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 3.0);
        assert!(pattern.quality_signals.avg_tdg_score.is_none());
        assert!(pattern.quality_signals.avg_complexity.is_none());
        assert!(pattern.quality_signals.avg_test_coverage.is_none());
        assert_eq!(pattern.quality_signals.satd_instances, 0);
    }

    #[test]
    fn test_enrich_with_tdg_multiple_patterns() {
        let (_temp_dir, analyzer) = analyzer();

        let mut patterns = vec![
            bare_pattern(DefectCategory::MemorySafety, 5, 0.85),
            bare_pattern(DefectCategory::SecurityVulnerabilities, 3, 0.90),
        ];

        let tdg_analysis = TdgAnalysis {
            file_scores: HashMap::new(),
            average_score: 85.5,
            max_score: 95.0,
        };

        analyzer.enrich_with_tdg(&mut patterns, &tdg_analysis);

        for pattern in &patterns {
            assert_eq!(pattern.quality_signals.avg_tdg_score, Some(85.5));
            assert_eq!(pattern.quality_signals.max_tdg_score, Some(95.0));
        }
    }
}