1use crate::classifier::ErrorCategory;
16use crate::moe_oracle::ExpertDomain;
17use crate::training::{TrainingDataset, TrainingSample};
18use serde::{Deserialize, Serialize};
19use std::collections::HashMap;
20use std::fs;
21use std::path::Path;
22
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
25pub enum OipDefectCategory {
26 MemorySafety,
28 ConcurrencyBugs,
29 LogicErrors,
30 ApiMisuse,
31 ResourceLeaks,
32 TypeErrors,
33 ConfigurationErrors,
34 SecurityVulnerabilities,
35 PerformanceIssues,
36 IntegrationFailures,
37 OperatorPrecedence,
39 TypeAnnotationGaps,
40 StdlibMapping,
41 ASTTransform,
42 ComprehensionBugs,
43 IteratorChain,
44 OwnershipBorrow,
45 TraitBounds,
46}
47
48impl OipDefectCategory {
49 #[must_use]
51 pub fn to_error_category(self) -> ErrorCategory {
52 match self {
53 Self::OwnershipBorrow | Self::MemorySafety => ErrorCategory::BorrowChecker,
55
56 Self::TraitBounds => ErrorCategory::TraitBound,
58
59 Self::TypeErrors | Self::TypeAnnotationGaps => ErrorCategory::TypeMismatch,
61
62 Self::StdlibMapping | Self::ConfigurationErrors | Self::ASTTransform => {
64 ErrorCategory::MissingImport
65 }
66
67 Self::ResourceLeaks => ErrorCategory::LifetimeError,
69
70 Self::ConcurrencyBugs
72 | Self::LogicErrors
73 | Self::ApiMisuse
74 | Self::SecurityVulnerabilities
75 | Self::PerformanceIssues
76 | Self::IntegrationFailures
77 | Self::OperatorPrecedence
78 | Self::ComprehensionBugs
79 | Self::IteratorChain => ErrorCategory::Other,
80 }
81 }
82
83 #[must_use]
85 pub fn to_expert_domain(self) -> ExpertDomain {
86 match self {
87 Self::TypeErrors | Self::TypeAnnotationGaps | Self::TraitBounds => {
89 ExpertDomain::TypeSystem
90 }
91
92 Self::StdlibMapping
94 | Self::ConfigurationErrors
95 | Self::IntegrationFailures
96 | Self::ASTTransform => ExpertDomain::ScopeResolution,
97
98 Self::ApiMisuse | Self::IteratorChain | Self::ComprehensionBugs => {
100 ExpertDomain::MethodField
101 }
102
103 Self::OwnershipBorrow
105 | Self::MemorySafety
106 | Self::ResourceLeaks
107 | Self::ConcurrencyBugs
108 | Self::LogicErrors
109 | Self::SecurityVulnerabilities
110 | Self::PerformanceIssues
111 | Self::OperatorPrecedence => ExpertDomain::SyntaxBorrowing,
112 }
113 }
114}
115
116#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct OipTrainingExample {
119 pub message: String,
121 pub label: OipDefectCategory,
123 pub confidence: f32,
125 pub commit_hash: String,
127 pub author: String,
129 pub timestamp: i64,
131 pub lines_added: usize,
133 pub lines_removed: usize,
135 pub files_changed: usize,
137}
138
139#[derive(Debug, Clone, Serialize, Deserialize)]
141pub struct OipTrainingDataset {
142 pub train: Vec<OipTrainingExample>,
144 pub validation: Vec<OipTrainingExample>,
146 pub test: Vec<OipTrainingExample>,
148}
149
150pub fn load_oip_training_data(path: &Path) -> Result<OipTrainingDataset, std::io::Error> {
155 let content = fs::read_to_string(path)?;
156 serde_json::from_str(&content)
157 .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
158}
159
160#[must_use]
162pub fn convert_oip_to_depyler(oip_data: &OipTrainingDataset) -> TrainingDataset {
163 let mut dataset = TrainingDataset::new();
164
165 let all_examples: Vec<_> = oip_data
167 .train
168 .iter()
169 .chain(oip_data.validation.iter())
170 .chain(oip_data.test.iter())
171 .collect();
172
173 for example in all_examples {
174 let category = example.label.to_error_category();
175
176 let error_pattern = extract_error_pattern(&example.message);
179
180 let fix = extract_fix_from_commit(&example.message);
182
183 dataset.add(TrainingSample::with_fix(&error_pattern, category, &fix));
184 }
185
186 dataset
187}
188
/// Derives a short error pattern from a commit message.
///
/// Strategies are tried in this order:
/// 1. A rustc-style code (`error[E…]`): returns `"<code>: <description>"`,
///    where the description is the remainder of the line the code sits on, or
///    the bare code when that remainder is empty.
/// 2. A `fix:` prefix (ASCII case-insensitive): returns the rest of that line.
/// 3. Otherwise the first line of the message (the whole message if it has no
///    lines).
fn extract_error_pattern(message: &str) -> String {
    if let Some(start) = message.find("error[E") {
        if let Some(close) = message[start..].find(']') {
            let code_end = start + close + 1;
            let error_code = &message[start..code_end];

            // Description = rest of the current line, whether or not a newline
            // follows. The text normally starts with the ':' that rustc prints
            // after the code, so strip it to avoid emitting ": :".
            let tail = message[code_end..].lines().next().unwrap_or("").trim();
            let description = tail.strip_prefix(':').unwrap_or(tail).trim_start();

            if description.is_empty() {
                return error_code.to_string();
            }
            return format!("{}: {}", error_code, description);
        }
    }

    // ASCII lowercasing keeps every byte offset identical to `message`, so the
    // index found here is safe to slice the original with. Full Unicode
    // lowercasing can change byte lengths and would misalign the slice.
    if let Some(fix_start) = message.to_ascii_lowercase().find("fix:") {
        let rest = &message[fix_start + "fix:".len()..];
        return rest.lines().next().unwrap_or("").trim().to_string();
    }

    message.lines().next().unwrap_or(message).to_string()
}
216
/// Derives a fix description from a commit message.
///
/// The markers `solution:`, `fixed by:`, `fix:` and `resolved:` are searched
/// in that priority order (ASCII case-insensitive). For the first marker that
/// occurs, the trimmed remainder of its line is returned. Without any marker
/// the trimmed first line is used, and an empty message yields
/// `"See commit for fix details"`.
fn extract_fix_from_commit(message: &str) -> String {
    const FIX_MARKERS: [&str; 4] = ["solution:", "fixed by:", "fix:", "resolved:"];

    // ASCII lowercasing preserves byte offsets, so indices found in `lower`
    // are valid in `message`. Full Unicode lowercasing can change byte
    // lengths and would misalign (or panic on) the slice below.
    let lower = message.to_ascii_lowercase();

    let after_marker = FIX_MARKERS.iter().find_map(|marker| {
        lower
            .find(*marker)
            .map(|idx| &message[idx + marker.len()..])
    });

    if let Some(rest) = after_marker {
        return rest.lines().next().unwrap_or("").trim().to_string();
    }

    message
        .lines()
        .next()
        .map(|first_line| first_line.trim().to_string())
        .unwrap_or_else(|| "See commit for fix details".to_string())
}
239
240pub fn build_github_corpus(oip_json_path: &Path) -> Result<TrainingDataset, std::io::Error> {
245 let oip_data = load_oip_training_data(oip_json_path)?;
246 Ok(convert_oip_to_depyler(&oip_data))
247}
248
249#[must_use]
253pub fn get_moe_samples_from_oip(
254 oip_data: &OipTrainingDataset,
255) -> Vec<(String, String, ExpertDomain)> {
256 let mut samples = Vec::new();
257
258 let all_examples: Vec<_> = oip_data
259 .train
260 .iter()
261 .chain(oip_data.validation.iter())
262 .chain(oip_data.test.iter())
263 .collect();
264
265 for example in all_examples {
266 let domain = example.label.to_expert_domain();
267 let error_code = infer_error_code_from_category(example.label);
268 let context = example.message.clone();
269
270 samples.push((error_code, context, domain));
271 }
272
273 samples
274}
275
276fn infer_error_code_from_category(category: OipDefectCategory) -> String {
278 match category {
279 OipDefectCategory::TypeErrors | OipDefectCategory::TypeAnnotationGaps => {
280 "E0308".to_string()
281 }
282 OipDefectCategory::TraitBounds => "E0277".to_string(),
283 OipDefectCategory::OwnershipBorrow | OipDefectCategory::MemorySafety => "E0382".to_string(),
284 OipDefectCategory::StdlibMapping | OipDefectCategory::ASTTransform => "E0433".to_string(),
285 OipDefectCategory::ApiMisuse | OipDefectCategory::IteratorChain => "E0599".to_string(),
286 OipDefectCategory::ConfigurationErrors | OipDefectCategory::IntegrationFailures => {
287 "E0425".to_string()
288 }
289 OipDefectCategory::ResourceLeaks => "E0106".to_string(),
290 OipDefectCategory::ComprehensionBugs => "E0609".to_string(),
291 _ => "E0000".to_string(), }
293}
294
295#[derive(Debug, Default)]
297pub struct CorpusStats {
298 pub total_examples: usize,
299 pub by_category: HashMap<String, usize>,
300 pub by_expert: HashMap<ExpertDomain, usize>,
301 pub avg_confidence: f32,
302}
303
304#[must_use]
306pub fn analyze_corpus(oip_data: &OipTrainingDataset) -> CorpusStats {
307 let mut stats = CorpusStats::default();
308
309 let all_examples: Vec<_> = oip_data
310 .train
311 .iter()
312 .chain(oip_data.validation.iter())
313 .chain(oip_data.test.iter())
314 .collect();
315
316 stats.total_examples = all_examples.len();
317
318 let mut total_confidence = 0.0f32;
319
320 for example in &all_examples {
321 let cat_name = format!("{:?}", example.label);
323 *stats.by_category.entry(cat_name).or_default() += 1;
324
325 let domain = example.label.to_expert_domain();
327 *stats.by_expert.entry(domain).or_default() += 1;
328
329 total_confidence += example.confidence;
330 }
331
332 if !all_examples.is_empty() {
333 stats.avg_confidence = total_confidence / all_examples.len() as f32;
334 }
335
336 stats
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_oip_to_error_category_mapping() {
        assert_eq!(
            OipDefectCategory::OwnershipBorrow.to_error_category(),
            ErrorCategory::BorrowChecker
        );
        assert_eq!(
            OipDefectCategory::TypeErrors.to_error_category(),
            ErrorCategory::TypeMismatch
        );
        assert_eq!(
            OipDefectCategory::TraitBounds.to_error_category(),
            ErrorCategory::TraitBound
        );
        assert_eq!(
            OipDefectCategory::StdlibMapping.to_error_category(),
            ErrorCategory::MissingImport
        );
    }

    #[test]
    fn test_oip_to_expert_domain_mapping() {
        assert_eq!(
            OipDefectCategory::TypeErrors.to_expert_domain(),
            ExpertDomain::TypeSystem
        );
        assert_eq!(
            OipDefectCategory::StdlibMapping.to_expert_domain(),
            ExpertDomain::ScopeResolution
        );
        assert_eq!(
            OipDefectCategory::OwnershipBorrow.to_expert_domain(),
            ExpertDomain::SyntaxBorrowing
        );
        assert_eq!(
            OipDefectCategory::ApiMisuse.to_expert_domain(),
            ExpertDomain::MethodField
        );
    }

    #[test]
    fn test_extract_error_pattern() {
        let msg = "fix: error[E0308]: mismatched types\n\ndetails here";
        let pattern = extract_error_pattern(msg);
        assert!(pattern.contains("E0308"));
    }

    #[test]
    fn test_extract_error_pattern_conventional() {
        let msg = "fix: resolve borrow checker issue with lifetime";
        let pattern = extract_error_pattern(msg);
        assert_eq!(pattern, "resolve borrow checker issue with lifetime");
    }

    #[test]
    fn test_extract_fix_from_commit() {
        let msg = "fix: type mismatch\n\nSolution: Use .into() for conversion";
        let fix = extract_fix_from_commit(msg);
        assert_eq!(fix, "Use .into() for conversion");
    }

    #[test]
    fn test_infer_error_code() {
        assert_eq!(
            infer_error_code_from_category(OipDefectCategory::TypeErrors),
            "E0308"
        );
        assert_eq!(
            infer_error_code_from_category(OipDefectCategory::TraitBounds),
            "E0277"
        );
        assert_eq!(
            infer_error_code_from_category(OipDefectCategory::OwnershipBorrow),
            "E0382"
        );
    }

    #[test]
    fn test_convert_empty_dataset() {
        let oip = OipTrainingDataset {
            train: vec![],
            validation: vec![],
            test: vec![],
        };
        let dataset = convert_oip_to_depyler(&oip);
        assert!(dataset.samples().is_empty());
    }

    #[test]
    fn test_analyze_corpus_empty() {
        let oip = OipTrainingDataset {
            train: vec![],
            validation: vec![],
            test: vec![],
        };
        let stats = analyze_corpus(&oip);
        assert_eq!(stats.total_examples, 0);
    }

    /// Smoke test against a real OIP export; skipped when the file is absent.
    ///
    /// The location can be overridden with the `OIP_TRAINING_DATA` environment
    /// variable so the test is usable outside the original author's machine.
    /// Without the variable the historical default path is used.
    #[test]
    fn test_load_real_oip_data_if_exists() {
        let oip_path = std::env::var_os("OIP_TRAINING_DATA")
            .map(std::path::PathBuf::from)
            .unwrap_or_else(|| {
                std::path::PathBuf::from(
                    "/home/noah/src/organizational-intelligence-plugin/training-data.json",
                )
            });

        if oip_path.exists() {
            let oip_data = load_oip_training_data(&oip_path).expect("Should load OIP data");
            let stats = analyze_corpus(&oip_data);

            println!("OIP Corpus Statistics:");
            println!("  Total examples: {}", stats.total_examples);
            println!("  Avg confidence: {:.2}", stats.avg_confidence);
            println!("  By category:");
            for (cat, count) in &stats.by_category {
                println!("    {}: {}", cat, count);
            }
            println!("  By expert domain:");
            for (domain, count) in &stats.by_expert {
                println!("    {:?}: {}", domain, count);
            }

            let depyler_dataset = convert_oip_to_depyler(&oip_data);
            println!(
                "  Converted to {} depyler samples",
                depyler_dataset.samples().len()
            );

            assert!(stats.total_examples > 0, "Should have training examples");
        } else {
            println!("OIP training data not found at {:?}, skipping", oip_path);
        }
    }

    #[test]
    fn test_convert_with_sample_data() {
        let oip = OipTrainingDataset {
            train: vec![OipTrainingExample {
                message: "fix: error[E0308]: mismatched types\n\nUse .into()".to_string(),
                label: OipDefectCategory::TypeErrors,
                confidence: 0.85,
                commit_hash: "abc123".to_string(),
                author: "test@example.com".to_string(),
                timestamp: 1234567890,
                lines_added: 10,
                lines_removed: 5,
                files_changed: 2,
            }],
            validation: vec![],
            test: vec![],
        };

        let dataset = convert_oip_to_depyler(&oip);
        assert_eq!(dataset.samples().len(), 1);

        let moe_samples = get_moe_samples_from_oip(&oip);
        assert_eq!(moe_samples.len(), 1);
        // TypeErrors maps to E0308 and is routed to the type-system expert.
        assert_eq!(moe_samples[0].0, "E0308");
        assert_eq!(moe_samples[0].2, ExpertDomain::TypeSystem);
    }
}