organizational_intelligence_plugin/
analyzer.rs

// Integrated analyzer combining git history analysis and defect classification.
// Phase 1: Combines GitAnalyzer + RuleBasedClassifier to generate defect patterns.
// Phase 2: Supports hybrid ML + rule-based classification (NLP-010).
// Toyota Way: Simple integration; measure before optimizing.
5
6use crate::classifier::{Classification, DefectCategory, HybridClassifier};
7use crate::git::{CommitInfo, GitAnalyzer};
8use crate::pmat::{PmatIntegration, TdgAnalysis};
9use crate::report::{DefectInstance, DefectPattern, QualitySignals};
10use anyhow::Result;
11use std::collections::HashMap;
12use std::path::{Path, PathBuf};
13use tracing::{debug, info};
14
15/// Integrated organizational defect analyzer
16/// Combines git history analysis with defect classification
17///
18/// Supports both rule-based and ML-based classification (NLP-010)
19pub struct OrgAnalyzer {
20    git_analyzer: GitAnalyzer,
21    classifier: HybridClassifier,
22    cache_dir: PathBuf,
23}
24
25impl OrgAnalyzer {
26    /// Create a new organizational analyzer
27    ///
28    /// # Arguments
29    /// * `cache_dir` - Directory for storing cloned repositories
30    ///
31    /// # Examples
32    /// ```
33    /// use organizational_intelligence_plugin::analyzer::OrgAnalyzer;
34    /// use std::path::PathBuf;
35    ///
36    /// let analyzer = OrgAnalyzer::new(PathBuf::from("/tmp/repos"));
37    /// ```
38    pub fn new<P: AsRef<Path>>(cache_dir: P) -> Self {
39        let cache_dir = cache_dir.as_ref().to_path_buf();
40        Self {
41            git_analyzer: GitAnalyzer::new(&cache_dir),
42            classifier: HybridClassifier::new_rule_based(),
43            cache_dir,
44        }
45    }
46
47    /// Create a new organizational analyzer with ML model
48    ///
49    /// # Arguments
50    /// * `cache_dir` - Directory for storing cloned repositories
51    /// * `ml_model` - Trained ML model for classification
52    /// * `confidence_threshold` - Minimum confidence for ML predictions
53    ///
54    /// # Examples
55    /// ```no_run
56    /// use organizational_intelligence_plugin::analyzer::OrgAnalyzer;
57    /// use organizational_intelligence_plugin::ml_trainer::MLTrainer;
58    /// use std::path::PathBuf;
59    ///
60    /// # fn example(model: organizational_intelligence_plugin::ml_trainer::TrainedModel) {
61    /// let analyzer = OrgAnalyzer::with_ml_model(
62    ///     PathBuf::from("/tmp/repos"),
63    ///     model,
64    ///     0.65
65    /// );
66    /// # }
67    /// ```
68    pub fn with_ml_model<P: AsRef<Path>>(
69        cache_dir: P,
70        ml_model: crate::ml_trainer::TrainedModel,
71        confidence_threshold: f32,
72    ) -> Self {
73        let cache_dir = cache_dir.as_ref().to_path_buf();
74        Self {
75            git_analyzer: GitAnalyzer::new(&cache_dir),
76            classifier: HybridClassifier::new_hybrid(ml_model, confidence_threshold),
77            cache_dir,
78        }
79    }
80
81    /// Analyze a single repository
82    ///
83    /// # Arguments
84    /// * `repo_url` - Repository URL
85    /// * `repo_name` - Repository name
86    /// * `max_commits` - Maximum commits to analyze
87    ///
88    /// # Returns
89    /// * `Ok(Vec<DefectPattern>)` with detected defect patterns
90    ///
91    /// # Examples
92    /// ```no_run
93    /// # use organizational_intelligence_plugin::analyzer::OrgAnalyzer;
94    /// # use std::path::PathBuf;
95    /// # async fn example() -> Result<(), anyhow::Error> {
96    /// let analyzer = OrgAnalyzer::new(PathBuf::from("/tmp/repos"));
97    /// let patterns = analyzer.analyze_repository(
98    ///     "https://github.com/rust-lang/rust",
99    ///     "rust",
100    ///     1000
101    /// ).await?;
102    /// # Ok(())
103    /// # }
104    /// ```
105    pub async fn analyze_repository(
106        &self,
107        repo_url: &str,
108        repo_name: &str,
109        max_commits: usize,
110    ) -> Result<Vec<DefectPattern>> {
111        info!(
112            "Analyzing repository {} (up to {} commits)",
113            repo_name, max_commits
114        );
115
116        // Clone repository
117        self.git_analyzer.clone_repository(repo_url, repo_name)?;
118
119        // Analyze commits
120        let commits = self.git_analyzer.analyze_commits(repo_name, max_commits)?;
121        debug!("Retrieved {} commits from {}", commits.len(), repo_name);
122
123        // Classify commits and aggregate patterns
124        let mut patterns = self.aggregate_defect_patterns(&commits);
125
126        // Optionally enrich with TDG analysis (if pmat available)
127        let repo_path = self.cache_dir.join(repo_name);
128        if let Ok(tdg_analysis) = PmatIntegration::analyze_tdg(&repo_path) {
129            debug!(
130                "TDG analysis: avg={:.1}, max={:.1}",
131                tdg_analysis.average_score, tdg_analysis.max_score
132            );
133            self.enrich_with_tdg(&mut patterns, &tdg_analysis);
134        } else {
135            debug!("TDG analysis unavailable (pmat not installed or failed)");
136        }
137
138        info!(
139            "Found {} defect categories in {}",
140            patterns.len(),
141            repo_name
142        );
143        Ok(patterns)
144    }
145
146    /// Aggregate defect patterns from classified commits
147    ///
148    /// # Arguments
149    /// * `commits` - List of commits to analyze
150    ///
151    /// # Returns
152    /// * `Vec<DefectPattern>` with aggregated statistics
153    fn aggregate_defect_patterns(&self, commits: &[CommitInfo]) -> Vec<DefectPattern> {
154        let mut category_map: HashMap<DefectCategory, CategoryStats> = HashMap::new();
155
156        // Classify each commit
157        for commit in commits {
158            if let Some(classification) = self.classifier.classify_from_message(&commit.message) {
159                let stats = category_map
160                    .entry(classification.category)
161                    .or_insert_with(|| CategoryStats::new(classification.category));
162
163                stats.add_instance(commit, &classification);
164            }
165        }
166
167        // Convert to DefectPattern
168        category_map
169            .into_values()
170            .map(|stats| stats.into_defect_pattern())
171            .collect()
172    }
173
174    /// Enrich defect patterns with TDG quality signals
175    ///
176    /// # Arguments
177    /// * `patterns` - Defect patterns to enrich
178    /// * `tdg_analysis` - TDG analysis results
179    fn enrich_with_tdg(&self, patterns: &mut [DefectPattern], tdg_analysis: &TdgAnalysis) {
180        for pattern in patterns.iter_mut() {
181            // Update quality signals with TDG data
182            pattern.quality_signals.avg_tdg_score = Some(tdg_analysis.average_score);
183            pattern.quality_signals.max_tdg_score = Some(tdg_analysis.max_score);
184        }
185    }
186}
187
188/// Internal stats tracking for each defect category with quality signals
189#[derive(Debug)]
190struct CategoryStats {
191    category: DefectCategory,
192    count: usize,
193    total_confidence: f32,
194    instances: Vec<DefectInstance>,
195    // Quality signal aggregators
196    total_files_changed: usize,
197    total_lines_added: usize,
198    total_lines_removed: usize,
199}
200
201impl CategoryStats {
202    fn new(category: DefectCategory) -> Self {
203        Self {
204            category,
205            count: 0,
206            total_confidence: 0.0,
207            instances: Vec::new(),
208            total_files_changed: 0,
209            total_lines_added: 0,
210            total_lines_removed: 0,
211        }
212    }
213
214    fn add_instance(&mut self, commit: &CommitInfo, classification: &Classification) {
215        self.count += 1;
216        self.total_confidence += classification.confidence;
217
218        // Aggregate quality signals
219        self.total_files_changed += commit.files_changed;
220        self.total_lines_added += commit.lines_added;
221        self.total_lines_removed += commit.lines_removed;
222
223        // Keep up to 3 examples
224        if self.instances.len() < 3 {
225            self.instances.push(DefectInstance {
226                commit_hash: commit.hash[..8.min(commit.hash.len())].to_string(),
227                message: commit.message.clone(),
228                author: commit.author.clone(),
229                timestamp: commit.timestamp,
230                files_affected: commit.files_changed,
231                lines_added: commit.lines_added,
232                lines_removed: commit.lines_removed,
233            });
234        }
235    }
236
237    fn into_defect_pattern(self) -> DefectPattern {
238        let avg_confidence = if self.count > 0 {
239            self.total_confidence / self.count as f32
240        } else {
241            0.0
242        };
243
244        // Calculate quality signals
245        let quality_signals = if self.count > 0 {
246            QualitySignals {
247                avg_tdg_score: None, // Will be enhanced in Phase 1.5.2
248                max_tdg_score: None,
249                avg_complexity: None,
250                avg_test_coverage: None,
251                satd_instances: 0, // Will be enhanced in Phase 1.5.2
252                avg_lines_changed: (self.total_lines_added + self.total_lines_removed) as f32
253                    / self.count as f32,
254                avg_files_per_commit: self.total_files_changed as f32 / self.count as f32,
255            }
256        } else {
257            QualitySignals::default()
258        };
259
260        DefectPattern {
261            category: self.category,
262            frequency: self.count,
263            confidence: avg_confidence,
264            quality_signals,
265            examples: self.instances,
266        }
267    }
268}
269
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Author recorded on every fixture commit.
    const AUTHOR: &str = "test@example.com";

    /// Timestamp of the first fixture commit; later ones count up from it.
    const BASE_TIMESTAMP: i64 = 1234567890;

    /// Build an analyzer backed by a fresh temporary cache directory.
    ///
    /// The `TempDir` is returned as well so the directory stays alive for as
    /// long as the caller holds on to it.
    fn temp_analyzer() -> (TempDir, OrgAnalyzer) {
        let temp_dir = TempDir::new().unwrap();
        let analyzer = OrgAnalyzer::new(temp_dir.path());
        (temp_dir, analyzer)
    }

    /// Build a fixture commit; `seq` is added to `BASE_TIMESTAMP`.
    fn commit(
        hash: &str,
        message: &str,
        seq: i64,
        files_changed: usize,
        lines_added: usize,
        lines_removed: usize,
    ) -> CommitInfo {
        CommitInfo {
            hash: hash.to_string(),
            message: message.to_string(),
            author: AUTHOR.to_string(),
            timestamp: BASE_TIMESTAMP + seq,
            files_changed,
            lines_added,
            lines_removed,
        }
    }

    /// Build a fixture classification with a single matched pattern.
    fn classification(category: DefectCategory, confidence: f32, matched: &str) -> Classification {
        Classification {
            category,
            confidence,
            explanation: "test".to_string(),
            matched_patterns: vec![matched.to_string()],
        }
    }

    /// Build a defect pattern with default quality signals and no examples.
    fn bare_pattern(category: DefectCategory, frequency: usize, confidence: f32) -> DefectPattern {
        DefectPattern {
            category,
            frequency,
            confidence,
            quality_signals: QualitySignals::default(),
            examples: vec![],
        }
    }

    #[test]
    fn test_org_analyzer_can_be_created() {
        let (_temp_dir, _analyzer) = temp_analyzer();
    }

    #[test]
    fn test_aggregate_empty_commits() {
        let (_temp_dir, analyzer) = temp_analyzer();

        let commits: Vec<CommitInfo> = Vec::new();
        let patterns = analyzer.aggregate_defect_patterns(&commits);

        assert!(patterns.is_empty());
    }

    #[test]
    fn test_aggregate_non_defect_commits() {
        let (_temp_dir, analyzer) = temp_analyzer();

        let commits = vec![
            commit("abc123", "docs: update README", 0, 1, 5, 2),
            commit("def456", "chore: bump version", 1, 1, 1, 1),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);
        assert!(patterns.is_empty());
    }

    #[test]
    fn test_aggregate_defect_commits() {
        let (_temp_dir, analyzer) = temp_analyzer();

        let commits = vec![
            commit("abc123", "fix: use-after-free in buffer", 0, 2, 45, 12),
            commit("def456", "fix: another memory leak", 1, 1, 8, 3),
            commit("ghi789", "security: prevent SQL injection", 2, 3, 67, 23),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        // Should have 2 categories: MemorySafety (2x) and SecurityVulnerabilities (1x)
        assert_eq!(patterns.len(), 2);

        // Check memory safety pattern
        let memory_pattern = patterns
            .iter()
            .find(|p| p.category == DefectCategory::MemorySafety)
            .expect("Should find memory safety pattern");

        assert_eq!(memory_pattern.frequency, 2);
        assert!(memory_pattern.confidence > 0.0);
        assert_eq!(memory_pattern.examples.len(), 2);
    }

    #[test]
    fn test_category_stats_aggregation() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        stats.add_instance(
            &commit("abc123", "fix: memory leak", 0, 2, 15, 5),
            &classification(DefectCategory::MemorySafety, 0.8, "memory leak"),
        );

        assert_eq!(stats.count, 1);
        assert_eq!(stats.total_confidence, 0.8);
        assert_eq!(stats.instances.len(), 1);

        let pattern = stats.into_defect_pattern();
        assert_eq!(pattern.frequency, 1);
        assert_eq!(pattern.confidence, 0.8);
        // Verify quality signals are calculated
        assert_eq!(pattern.quality_signals.avg_lines_changed, 20.0); // 15 + 5
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 2.0);
    }

    #[test]
    fn test_examples_limited_to_three() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        for i in 0..5i64 {
            stats.add_instance(
                &commit(&format!("hash{}", i), "fix: memory leak", i, 1, 10, 5),
                &classification(DefectCategory::MemorySafety, 0.8, "memory leak"),
            );
        }

        assert_eq!(stats.count, 5);
        assert_eq!(stats.instances.len(), 3); // Limited to 3
    }

    #[test]
    fn test_enrich_with_tdg() {
        let (_temp_dir, analyzer) = temp_analyzer();

        // Create mock defect pattern
        let mut patterns = vec![bare_pattern(DefectCategory::MemorySafety, 5, 0.85)];

        // Create mock TDG analysis
        let tdg_analysis = TdgAnalysis {
            file_scores: HashMap::new(),
            average_score: 92.5,
            max_score: 98.0,
        };

        // Enrich patterns
        analyzer.enrich_with_tdg(&mut patterns, &tdg_analysis);

        // Verify TDG scores were populated
        assert_eq!(patterns[0].quality_signals.avg_tdg_score, Some(92.5));
        assert_eq!(patterns[0].quality_signals.max_tdg_score, Some(98.0));
    }

    // Integration test requiring network access
    #[tokio::test]
    #[ignore]
    async fn test_analyze_real_repository() {
        let (_temp_dir, analyzer) = temp_analyzer();

        let patterns = analyzer
            .analyze_repository("https://github.com/rust-lang/rustlings", "rustlings", 100)
            .await
            .unwrap();

        // How many patterns are found depends on the repository's history, so
        // only check invariants that hold for any aggregation result: every
        // reported category has at least one commit and at most three examples.
        for pattern in &patterns {
            assert!(pattern.frequency > 0);
            assert!(!pattern.examples.is_empty());
            assert!(pattern.examples.len() <= 3);
        }
    }

    #[test]
    fn test_category_stats_new() {
        let stats = CategoryStats::new(DefectCategory::LogicErrors);
        assert_eq!(stats.count, 0);
        assert_eq!(stats.total_confidence, 0.0);
        assert_eq!(stats.instances.len(), 0);
        assert_eq!(stats.total_files_changed, 0);
        assert_eq!(stats.total_lines_added, 0);
        assert_eq!(stats.total_lines_removed, 0);
    }

    #[test]
    fn test_category_stats_averaging() {
        let mut stats = CategoryStats::new(DefectCategory::SecurityVulnerabilities);

        // Add multiple commits to test averaging
        for i in 0..3i64 {
            stats.add_instance(
                &commit(&format!("hash{}", i), "fix: SQL injection", i, 2, 10, 5),
                &classification(DefectCategory::SecurityVulnerabilities, 0.9, "sql injection"),
            );
        }

        let pattern = stats.into_defect_pattern();
        assert_eq!(pattern.frequency, 3);
        assert!((pattern.confidence - 0.9).abs() < 0.01); // Floating point tolerance
        assert_eq!(pattern.quality_signals.avg_lines_changed, 15.0); // (10+5) per commit
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 2.0);
    }

    #[test]
    fn test_commit_hash_truncation() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        stats.add_instance(
            &commit("abcdefghijklmnop", "fix: memory leak", 0, 1, 10, 5), // 16 chars
            &classification(DefectCategory::MemorySafety, 0.8, "memory leak"),
        );

        assert_eq!(stats.instances[0].commit_hash, "abcdefgh"); // Truncated to 8
        assert_eq!(stats.instances[0].commit_hash.len(), 8);
    }

    #[test]
    fn test_commit_hash_short() {
        let mut stats = CategoryStats::new(DefectCategory::MemorySafety);

        stats.add_instance(
            &commit("abc", "fix: memory leak", 0, 1, 10, 5), // < 8 chars
            &classification(DefectCategory::MemorySafety, 0.8, "memory leak"),
        );

        // Short hash should remain unchanged
        assert_eq!(stats.instances[0].commit_hash, "abc");
    }

    #[test]
    fn test_category_stats_zero_count_pattern() {
        let stats = CategoryStats::new(DefectCategory::TypeErrors);
        let pattern = stats.into_defect_pattern();

        assert_eq!(pattern.frequency, 0);
        assert_eq!(pattern.confidence, 0.0);
        assert_eq!(pattern.quality_signals.avg_lines_changed, 0.0);
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 0.0);
    }

    #[test]
    fn test_aggregate_mixed_commits() {
        let (_temp_dir, analyzer) = temp_analyzer();

        let commits = vec![
            commit("abc123", "fix: null pointer dereference", 0, 2, 20, 5),
            commit("def456", "docs: update README", 1, 1, 5, 2), // Non-defect
            commit("ghi789", "fix: another null pointer issue", 2, 1, 10, 3),
        ];

        let patterns = analyzer.aggregate_defect_patterns(&commits);

        // Should have 1 category: MemorySafety (2x), ignore non-defect
        assert_eq!(patterns.len(), 1);

        let memory_pattern = &patterns[0];
        assert_eq!(memory_pattern.category, DefectCategory::MemorySafety);
        assert_eq!(memory_pattern.frequency, 2);
        assert_eq!(memory_pattern.examples.len(), 2);
    }

    #[test]
    fn test_quality_signals_calculation() {
        let mut stats = CategoryStats::new(DefectCategory::ConcurrencyBugs);

        stats.add_instance(
            &commit("abc123", "fix: race condition", 0, 3, 50, 20),
            &classification(DefectCategory::ConcurrencyBugs, 0.82, "race condition"),
        );

        let pattern = stats.into_defect_pattern();

        // Verify quality signals
        assert_eq!(pattern.quality_signals.avg_lines_changed, 70.0); // 50 + 20
        assert_eq!(pattern.quality_signals.avg_files_per_commit, 3.0);
        assert!(pattern.quality_signals.avg_tdg_score.is_none()); // Not enriched yet
        assert!(pattern.quality_signals.avg_complexity.is_none());
        assert!(pattern.quality_signals.avg_test_coverage.is_none());
        assert_eq!(pattern.quality_signals.satd_instances, 0);
    }

    #[test]
    fn test_enrich_with_tdg_multiple_patterns() {
        let (_temp_dir, analyzer) = temp_analyzer();

        let mut patterns = vec![
            bare_pattern(DefectCategory::MemorySafety, 5, 0.85),
            bare_pattern(DefectCategory::SecurityVulnerabilities, 3, 0.90),
        ];

        let tdg_analysis = TdgAnalysis {
            file_scores: HashMap::new(),
            average_score: 85.5,
            max_score: 95.0,
        };

        analyzer.enrich_with_tdg(&mut patterns, &tdg_analysis);

        // Both patterns should be enriched
        for pattern in &patterns {
            assert_eq!(pattern.quality_signals.avg_tdg_score, Some(85.5));
            assert_eq!(pattern.quality_signals.max_tdg_score, Some(95.0));
        }
    }
}