Skip to main content

depyler_oracle/
github_corpus.rs

//! GitHub history corpus integration via organizational-intelligence-plugin.
//!
//! Recipe for learning from Git history:
//! 1. Use OIP to extract training data: `oip extract-training-data --repo <path>`
//! 2. Import the resulting JSON into the depyler-oracle training pipeline
//! 3. Map OIP's DefectCategory to depyler's ErrorCategory
//!
//! OIP DefectCategory → depyler ErrorCategory mapping:
//! - OwnershipBorrow, MemorySafety → BorrowChecker
//! - TraitBounds → TraitBound
//! - TypeErrors, TypeAnnotationGaps → TypeMismatch
//! - StdlibMapping, ASTTransform, ConfigurationErrors → MissingImport
//! - ResourceLeaks → LifetimeError
//! - Every remaining category (e.g. IteratorChain, ComprehensionBugs) → Other
14
15use crate::classifier::ErrorCategory;
16use crate::moe_oracle::ExpertDomain;
17use crate::training::{TrainingDataset, TrainingSample};
18use serde::{Deserialize, Serialize};
19use std::collections::HashMap;
20use std::fs;
21use std::path::Path;
22
23/// OIP DefectCategory (18 categories from organizational-intelligence-plugin)
24#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
25pub enum OipDefectCategory {
26    // General defect categories (10)
27    MemorySafety,
28    ConcurrencyBugs,
29    LogicErrors,
30    ApiMisuse,
31    ResourceLeaks,
32    TypeErrors,
33    ConfigurationErrors,
34    SecurityVulnerabilities,
35    PerformanceIssues,
36    IntegrationFailures,
37    // Transpiler-specific categories (8)
38    OperatorPrecedence,
39    TypeAnnotationGaps,
40    StdlibMapping,
41    ASTTransform,
42    ComprehensionBugs,
43    IteratorChain,
44    OwnershipBorrow,
45    TraitBounds,
46}
47
48impl OipDefectCategory {
49    /// Map OIP category to depyler ErrorCategory
50    #[must_use]
51    pub fn to_error_category(self) -> ErrorCategory {
52        match self {
53            // Ownership/borrowing issues
54            Self::OwnershipBorrow | Self::MemorySafety => ErrorCategory::BorrowChecker,
55
56            // Trait bound issues
57            Self::TraitBounds => ErrorCategory::TraitBound,
58
59            // Type errors
60            Self::TypeErrors | Self::TypeAnnotationGaps => ErrorCategory::TypeMismatch,
61
62            // Missing imports / stdlib mapping
63            Self::StdlibMapping | Self::ConfigurationErrors | Self::ASTTransform => {
64                ErrorCategory::MissingImport
65            }
66
67            // Lifetime errors (from memory safety patterns)
68            Self::ResourceLeaks => ErrorCategory::LifetimeError,
69
70            // Other categories
71            Self::ConcurrencyBugs
72            | Self::LogicErrors
73            | Self::ApiMisuse
74            | Self::SecurityVulnerabilities
75            | Self::PerformanceIssues
76            | Self::IntegrationFailures
77            | Self::OperatorPrecedence
78            | Self::ComprehensionBugs
79            | Self::IteratorChain => ErrorCategory::Other,
80        }
81    }
82
83    /// Map OIP category to MoE ExpertDomain
84    #[must_use]
85    pub fn to_expert_domain(self) -> ExpertDomain {
86        match self {
87            // Type system expert
88            Self::TypeErrors | Self::TypeAnnotationGaps | Self::TraitBounds => {
89                ExpertDomain::TypeSystem
90            }
91
92            // Scope resolution expert
93            Self::StdlibMapping
94            | Self::ConfigurationErrors
95            | Self::IntegrationFailures
96            | Self::ASTTransform => ExpertDomain::ScopeResolution,
97
98            // Method/field expert
99            Self::ApiMisuse | Self::IteratorChain | Self::ComprehensionBugs => {
100                ExpertDomain::MethodField
101            }
102
103            // Syntax/borrow expert (default for Rust-specific)
104            Self::OwnershipBorrow
105            | Self::MemorySafety
106            | Self::ResourceLeaks
107            | Self::ConcurrencyBugs
108            | Self::LogicErrors
109            | Self::SecurityVulnerabilities
110            | Self::PerformanceIssues
111            | Self::OperatorPrecedence => ExpertDomain::SyntaxBorrowing,
112        }
113    }
114}
115
116/// OIP TrainingExample format (from organizational-intelligence-plugin)
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct OipTrainingExample {
119    /// Commit message text
120    pub message: String,
121    /// Defect category label
122    pub label: OipDefectCategory,
123    /// Classifier confidence (0.0-1.0)
124    pub confidence: f32,
125    /// Original commit hash
126    pub commit_hash: String,
127    /// Author name/email
128    pub author: String,
129    /// Unix timestamp
130    pub timestamp: i64,
131    /// Lines added in commit
132    pub lines_added: usize,
133    /// Lines removed in commit
134    pub lines_removed: usize,
135    /// Number of files changed
136    pub files_changed: usize,
137}
138
139/// OIP TrainingDataset format
140#[derive(Debug, Clone, Serialize, Deserialize)]
141pub struct OipTrainingDataset {
142    /// Training examples
143    pub train: Vec<OipTrainingExample>,
144    /// Validation examples
145    pub validation: Vec<OipTrainingExample>,
146    /// Test examples
147    pub test: Vec<OipTrainingExample>,
148}
149
150/// Load OIP training data from JSON file
151///
152/// # Errors
153/// Returns error if file cannot be read or parsed
154pub fn load_oip_training_data(path: &Path) -> Result<OipTrainingDataset, std::io::Error> {
155    let content = fs::read_to_string(path)?;
156    serde_json::from_str(&content)
157        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
158}
159
160/// Convert OIP training data to depyler TrainingDataset
161#[must_use]
162pub fn convert_oip_to_depyler(oip_data: &OipTrainingDataset) -> TrainingDataset {
163    let mut dataset = TrainingDataset::new();
164
165    // Process all examples (train + validation + test)
166    let all_examples: Vec<_> = oip_data
167        .train
168        .iter()
169        .chain(oip_data.validation.iter())
170        .chain(oip_data.test.iter())
171        .collect();
172
173    for example in all_examples {
174        let category = example.label.to_error_category();
175
176        // Extract error pattern from commit message
177        // OIP stores commit messages, we need to extract the error pattern
178        let error_pattern = extract_error_pattern(&example.message);
179
180        // Create fix suggestion from commit message
181        let fix = extract_fix_from_commit(&example.message);
182
183        dataset.add(TrainingSample::with_fix(&error_pattern, category, &fix));
184    }
185
186    dataset
187}
188
/// Extract an error pattern from a commit message.
///
/// Strategies are tried in order:
/// 1. A compiler error code such as `error[E0308]`. The rest of that line, if
///    any, is appended as `"<code>: <description>"`; otherwise the bare code
///    is returned.
/// 2. A conventional-commit style `fix:` marker (ASCII case-insensitive). The
///    remainder of that line is returned, trimmed.
/// 3. The first line of the message.
///
/// An empty message yields an empty string.
fn extract_error_pattern(message: &str) -> String {
    // Look for error code patterns like error[E0308].
    if let Some(start) = message.find("error[E") {
        let tail = &message[start..];
        if let Some(close) = tail.find(']') {
            // `]` is a single byte, so `close + 1` is always a char boundary.
            let (error_code, after_code) = tail.split_at(close + 1);

            // The text following the code usually looks like ": description".
            // Drop that separator so the output does not end up with a doubled
            // colon, and keep the description even when the message has no
            // trailing newline.
            let description = after_code
                .lines()
                .next()
                .unwrap_or("")
                .trim()
                .trim_start_matches(':')
                .trim_start();

            if description.is_empty() {
                return error_code.to_string();
            }
            return format!("{}: {}", error_code, description);
        }
    }

    // Conventional commit format: "fix: <description>".
    // ASCII lowercasing never changes byte lengths, so the offset found in the
    // lowered copy is valid for slicing `message`. Full Unicode lowercasing
    // can grow or shrink the string and would make the offset point at the
    // wrong place (or inside a multi-byte character).
    if let Some(fix_start) = message.to_ascii_lowercase().find("fix:") {
        let rest = &message[fix_start + "fix:".len()..];
        return rest.lines().next().unwrap_or("").trim().to_string();
    }

    // Fall back to the first line.
    message.lines().next().unwrap_or(message).to_string()
}
216
/// Extract a fix suggestion from a commit message.
///
/// The message is searched (ASCII case-insensitively) for the markers
/// `solution:`, `fixed by:`, `fix:` and `resolved:`, in that priority order.
/// For the first marker that occurs, the remainder of its line is returned,
/// trimmed. When no marker is present the trimmed commit title (first line) is
/// used, and an empty message yields `"See commit for fix details"`.
fn extract_fix_from_commit(message: &str) -> String {
    // Markers in priority order; more specific ones come before plain "fix:".
    const MARKERS: [&str; 4] = ["solution:", "fixed by:", "fix:", "resolved:"];

    // ASCII lowercasing preserves byte offsets, so an index found in `lower`
    // can be used to slice `message`. Unicode-aware lowercasing may change the
    // byte length of the text before the marker and would yield a wrong (or
    // non-boundary) offset.
    let lower = message.to_ascii_lowercase();

    for marker in MARKERS.iter() {
        if let Some(idx) = lower.find(*marker) {
            let rest = &message[idx + marker.len()..];
            return rest.lines().next().unwrap_or("").trim().to_string();
        }
    }

    // No marker: use the commit title.
    message
        .lines()
        .next()
        .map(|title| title.trim().to_string())
        .unwrap_or_else(|| "See commit for fix details".to_string())
}
239
240/// Build corpus from OIP training data file
241///
242/// # Errors
243/// Returns error if file cannot be loaded
244pub fn build_github_corpus(oip_json_path: &Path) -> Result<TrainingDataset, std::io::Error> {
245    let oip_data = load_oip_training_data(oip_json_path)?;
246    Ok(convert_oip_to_depyler(&oip_data))
247}
248
249/// Get MoE training samples from OIP data
250///
251/// Returns (error_code, context, domain) tuples for MoE training
252#[must_use]
253pub fn get_moe_samples_from_oip(
254    oip_data: &OipTrainingDataset,
255) -> Vec<(String, String, ExpertDomain)> {
256    let mut samples = Vec::new();
257
258    let all_examples: Vec<_> = oip_data
259        .train
260        .iter()
261        .chain(oip_data.validation.iter())
262        .chain(oip_data.test.iter())
263        .collect();
264
265    for example in all_examples {
266        let domain = example.label.to_expert_domain();
267        let error_code = infer_error_code_from_category(example.label);
268        let context = example.message.clone();
269
270        samples.push((error_code, context, domain));
271    }
272
273    samples
274}
275
276/// Infer Rust error code from OIP category
277fn infer_error_code_from_category(category: OipDefectCategory) -> String {
278    match category {
279        OipDefectCategory::TypeErrors | OipDefectCategory::TypeAnnotationGaps => {
280            "E0308".to_string()
281        }
282        OipDefectCategory::TraitBounds => "E0277".to_string(),
283        OipDefectCategory::OwnershipBorrow | OipDefectCategory::MemorySafety => "E0382".to_string(),
284        OipDefectCategory::StdlibMapping | OipDefectCategory::ASTTransform => "E0433".to_string(),
285        OipDefectCategory::ApiMisuse | OipDefectCategory::IteratorChain => "E0599".to_string(),
286        OipDefectCategory::ConfigurationErrors | OipDefectCategory::IntegrationFailures => {
287            "E0425".to_string()
288        }
289        OipDefectCategory::ResourceLeaks => "E0106".to_string(),
290        OipDefectCategory::ComprehensionBugs => "E0609".to_string(),
291        _ => "E0000".to_string(), // Generic error
292    }
293}
294
295/// Statistics about the GitHub corpus
296#[derive(Debug, Default)]
297pub struct CorpusStats {
298    pub total_examples: usize,
299    pub by_category: HashMap<String, usize>,
300    pub by_expert: HashMap<ExpertDomain, usize>,
301    pub avg_confidence: f32,
302}
303
304/// Analyze OIP corpus statistics
305#[must_use]
306pub fn analyze_corpus(oip_data: &OipTrainingDataset) -> CorpusStats {
307    let mut stats = CorpusStats::default();
308
309    let all_examples: Vec<_> = oip_data
310        .train
311        .iter()
312        .chain(oip_data.validation.iter())
313        .chain(oip_data.test.iter())
314        .collect();
315
316    stats.total_examples = all_examples.len();
317
318    let mut total_confidence = 0.0f32;
319
320    for example in &all_examples {
321        // Count by OIP category
322        let cat_name = format!("{:?}", example.label);
323        *stats.by_category.entry(cat_name).or_default() += 1;
324
325        // Count by expert domain
326        let domain = example.label.to_expert_domain();
327        *stats.by_expert.entry(domain).or_default() += 1;
328
329        total_confidence += example.confidence;
330    }
331
332    if !all_examples.is_empty() {
333        stats.avg_confidence = total_confidence / all_examples.len() as f32;
334    }
335
336    stats
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    // Shorthand for a dataset with no examples in any partition.
    fn empty_dataset() -> OipTrainingDataset {
        OipTrainingDataset {
            train: vec![],
            validation: vec![],
            test: vec![],
        }
    }

    #[test]
    fn test_oip_to_error_category_mapping() {
        assert_eq!(
            OipDefectCategory::OwnershipBorrow.to_error_category(),
            ErrorCategory::BorrowChecker
        );
        assert_eq!(
            OipDefectCategory::TypeErrors.to_error_category(),
            ErrorCategory::TypeMismatch
        );
        assert_eq!(
            OipDefectCategory::TraitBounds.to_error_category(),
            ErrorCategory::TraitBound
        );
        assert_eq!(
            OipDefectCategory::StdlibMapping.to_error_category(),
            ErrorCategory::MissingImport
        );
    }

    #[test]
    fn test_oip_to_expert_domain_mapping() {
        assert_eq!(
            OipDefectCategory::TypeErrors.to_expert_domain(),
            ExpertDomain::TypeSystem
        );
        assert_eq!(
            OipDefectCategory::StdlibMapping.to_expert_domain(),
            ExpertDomain::ScopeResolution
        );
        assert_eq!(
            OipDefectCategory::OwnershipBorrow.to_expert_domain(),
            ExpertDomain::SyntaxBorrowing
        );
        assert_eq!(
            OipDefectCategory::ApiMisuse.to_expert_domain(),
            ExpertDomain::MethodField
        );
    }

    #[test]
    fn test_extract_error_pattern() {
        let msg = "fix: error[E0308]: mismatched types\n\ndetails here";
        let pattern = extract_error_pattern(msg);
        assert!(pattern.contains("E0308"));
    }

    #[test]
    fn test_extract_error_pattern_conventional() {
        let msg = "fix: resolve borrow checker issue with lifetime";
        let pattern = extract_error_pattern(msg);
        assert_eq!(pattern, "resolve borrow checker issue with lifetime");
    }

    #[test]
    fn test_extract_fix_from_commit() {
        let msg = "fix: type mismatch\n\nSolution: Use .into() for conversion";
        let fix = extract_fix_from_commit(msg);
        assert_eq!(fix, "Use .into() for conversion");
    }

    #[test]
    fn test_infer_error_code() {
        assert_eq!(
            infer_error_code_from_category(OipDefectCategory::TypeErrors),
            "E0308"
        );
        assert_eq!(
            infer_error_code_from_category(OipDefectCategory::TraitBounds),
            "E0277"
        );
        assert_eq!(
            infer_error_code_from_category(OipDefectCategory::OwnershipBorrow),
            "E0382"
        );
    }

    #[test]
    fn test_convert_empty_dataset() {
        let dataset = convert_oip_to_depyler(&empty_dataset());
        assert!(dataset.samples().is_empty());
    }

    #[test]
    fn test_analyze_corpus_empty() {
        let stats = analyze_corpus(&empty_dataset());
        assert_eq!(stats.total_examples, 0);
    }

    // Smoke test against a real OIP export. It is a no-op when the file is
    // absent, so it never fails on machines without the data.
    //
    // Set the `OIP_TRAINING_DATA` environment variable to point the test at an
    // export on the current machine; when unset, the original developer-machine
    // location is used.
    #[test]
    fn test_load_real_oip_data_if_exists() {
        let oip_path = std::env::var_os("OIP_TRAINING_DATA")
            .map(std::path::PathBuf::from)
            .unwrap_or_else(|| {
                std::path::PathBuf::from(
                    "/home/noah/src/organizational-intelligence-plugin/training-data.json",
                )
            });

        if oip_path.exists() {
            let oip_data = load_oip_training_data(&oip_path).expect("Should load OIP data");
            let stats = analyze_corpus(&oip_data);

            println!("OIP Corpus Statistics:");
            println!("  Total examples: {}", stats.total_examples);
            println!("  Avg confidence: {:.2}", stats.avg_confidence);
            println!("  By category:");
            for (cat, count) in &stats.by_category {
                println!("    {}: {}", cat, count);
            }
            println!("  By expert domain:");
            for (domain, count) in &stats.by_expert {
                println!("    {:?}: {}", domain, count);
            }

            // Convert to depyler format
            let depyler_dataset = convert_oip_to_depyler(&oip_data);
            println!(
                "  Converted to {} depyler samples",
                depyler_dataset.samples().len()
            );

            assert!(stats.total_examples > 0, "Should have training examples");
        } else {
            println!("OIP training data not found at {:?}, skipping", oip_path);
        }
    }

    #[test]
    fn test_convert_with_sample_data() {
        let oip = OipTrainingDataset {
            train: vec![OipTrainingExample {
                message: "fix: error[E0308]: mismatched types\n\nUse .into()".to_string(),
                label: OipDefectCategory::TypeErrors,
                confidence: 0.85,
                commit_hash: "abc123".to_string(),
                author: "test@example.com".to_string(),
                timestamp: 1234567890,
                lines_added: 10,
                lines_removed: 5,
                files_changed: 2,
            }],
            validation: vec![],
            test: vec![],
        };

        let dataset = convert_oip_to_depyler(&oip);
        assert_eq!(dataset.samples().len(), 1);

        let moe_samples = get_moe_samples_from_oip(&oip);
        assert_eq!(moe_samples.len(), 1);
        assert_eq!(moe_samples[0].0, "E0308"); // Error code
        assert_eq!(moe_samples[0].2, ExpertDomain::TypeSystem); // Expert domain
    }
}