organizational_intelligence_plugin/
citl.rs

1//! Compiler-in-the-Loop (CITL) Integration Module
2//!
3//! NLP-014: Integrates Depyler's CITL diagnostic output as ground-truth training labels.
4//!
5//! Provides:
6//! - rustc error code → DefectCategory mapping
7//! - Clippy lint → DefectCategory mapping
8//! - Depyler export import functionality
9//! - Extended training example support
10
11use crate::classifier::DefectCategory;
12use anyhow::{anyhow, Result};
13use serde::{Deserialize, Serialize};
14use std::collections::HashMap;
15use std::path::Path;
16
17/// Error code class for feature extraction (Section 3.4)
18#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
19pub enum ErrorCodeClass {
20    Type = 0,
21    Borrow = 1,
22    Name = 2,
23    Trait = 3,
24    #[default]
25    Other = 4,
26}
27
28impl ErrorCodeClass {
29    pub fn as_u8(&self) -> u8 {
30        *self as u8
31    }
32}
33
34/// Suggestion applicability from rustc/clippy
35#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
36pub enum SuggestionApplicability {
37    #[default]
38    None = 0,
39    MachineApplicable = 1,
40    MaybeIncorrect = 2,
41    HasPlaceholders = 3,
42}
43
44impl SuggestionApplicability {
45    pub fn as_u8(&self) -> u8 {
46        *self as u8
47    }
48
49    pub fn parse(s: &str) -> Self {
50        match s {
51            "MachineApplicable" => Self::MachineApplicable,
52            "MaybeIncorrect" => Self::MaybeIncorrect,
53            "HasPlaceholders" => Self::HasPlaceholders,
54            _ => Self::None,
55        }
56    }
57}
58
59/// Source of a training example
60#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
61pub enum TrainingSource {
62    #[default]
63    CommitMessage,
64    DepylerCitl,
65    Manual,
66}
67
68/// Depyler CITL export record (Section 3.2)
69#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct DepylerExport {
71    pub source_file: String,
72    pub error_code: Option<String>,
73    pub clippy_lint: Option<String>,
74    pub level: String,
75    pub message: String,
76    pub oip_category: Option<String>,
77    pub confidence: f32,
78    pub span: Option<SpanInfo>,
79    pub suggestion: Option<SuggestionInfo>,
80    pub timestamp: i64,
81    pub depyler_version: String,
82}
83
84/// Span information for diagnostic location
85#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
86pub struct SpanInfo {
87    pub line_start: u32,
88    pub column_start: u32,
89}
90
91/// Suggestion information from compiler
92#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
93pub struct SuggestionInfo {
94    pub replacement: String,
95    pub applicability: String,
96}
97
98/// Import statistics
99#[derive(Debug, Clone, Default)]
100pub struct ImportStats {
101    pub total_records: usize,
102    pub imported: usize,
103    pub skipped_low_confidence: usize,
104    pub skipped_unknown_category: usize,
105    pub by_category: HashMap<DefectCategory, usize>,
106    pub by_source: HashMap<String, usize>,
107    pub avg_confidence: f32,
108}
109
/// Confidence assigned to a rustc error code mapping (Appendix A).
///
/// Known codes fall into fixed tiers between 0.95 and 0.75; every other
/// string, including the empty string, gets the fallback value 0.70.
pub fn get_error_code_confidence(code: &str) -> f32 {
    if matches!(code, "E0308" | "E0277" | "E0502" | "E0503" | "E0505") {
        return 0.95;
    }
    if matches!(code, "E0382" | "E0507") {
        return 0.90;
    }
    if matches!(code, "E0425" | "E0433" | "E0412") {
        return 0.85;
    }
    if matches!(code, "E0599" | "E0614" | "E0615") {
        return 0.80;
    }
    if code == "E0658" {
        return 0.75;
    }
    0.70
}
122
123/// Map rustc error code to OIP DefectCategory (Section 3.1)
124///
125/// # Arguments
126/// * `code` - Rustc error code (e.g., "E0308")
127///
128/// # Returns
129/// * `Some(DefectCategory)` if mapping exists
130/// * `None` for unknown codes
131///
132/// # Examples
133/// ```
134/// use organizational_intelligence_plugin::citl::rustc_to_defect_category;
135/// use organizational_intelligence_plugin::classifier::DefectCategory;
136///
137/// assert_eq!(rustc_to_defect_category("E0308"), Some(DefectCategory::TypeErrors));
138/// assert_eq!(rustc_to_defect_category("E0277"), Some(DefectCategory::TraitBounds));
139/// assert_eq!(rustc_to_defect_category("UNKNOWN"), None);
140/// ```
141pub fn rustc_to_defect_category(code: &str) -> Option<DefectCategory> {
142    match code {
143        // Type system
144        "E0308" => Some(DefectCategory::TypeErrors),
145        "E0412" => Some(DefectCategory::TypeAnnotationGaps),
146
147        // Ownership/borrowing
148        "E0502" | "E0503" | "E0505" => Some(DefectCategory::OwnershipBorrow),
149        "E0382" | "E0507" => Some(DefectCategory::MemorySafety),
150
151        // Traits
152        "E0277" => Some(DefectCategory::TraitBounds),
153
154        // Name resolution
155        "E0425" | "E0433" => Some(DefectCategory::StdlibMapping),
156
157        // AST/structure
158        "E0599" | "E0615" => Some(DefectCategory::ASTTransform),
159        "E0614" => Some(DefectCategory::OperatorPrecedence),
160
161        // Configuration
162        "E0658" => Some(DefectCategory::ConfigurationErrors),
163
164        _ => None,
165    }
166}
167
168/// Map Clippy lint to OIP DefectCategory (Section 3.1)
169///
170/// # Arguments
171/// * `lint` - Clippy lint name (e.g., "clippy::unwrap_used")
172///
173/// # Returns
174/// * `Some(DefectCategory)` if mapping exists
175/// * `None` for unknown lints
176///
177/// # Examples
178/// ```
179/// use organizational_intelligence_plugin::citl::clippy_to_defect_category;
180/// use organizational_intelligence_plugin::classifier::DefectCategory;
181///
182/// assert_eq!(clippy_to_defect_category("clippy::unwrap_used"), Some(DefectCategory::ApiMisuse));
183/// assert_eq!(clippy_to_defect_category("clippy::todo"), Some(DefectCategory::LogicErrors));
184/// assert_eq!(clippy_to_defect_category("clippy::unknown"), None);
185/// ```
186pub fn clippy_to_defect_category(lint: &str) -> Option<DefectCategory> {
187    match lint {
188        "clippy::unwrap_used" | "clippy::expect_used" | "clippy::panic" => {
189            Some(DefectCategory::ApiMisuse)
190        }
191        "clippy::todo" | "clippy::unreachable" => Some(DefectCategory::LogicErrors),
192        "clippy::cognitive_complexity" => Some(DefectCategory::PerformanceIssues),
193        "clippy::too_many_arguments" | "clippy::match_single_binding" => {
194            Some(DefectCategory::ASTTransform)
195        }
196        "clippy::needless_collect" => Some(DefectCategory::IteratorChain),
197        "clippy::manual_map" => Some(DefectCategory::ComprehensionBugs),
198        _ => None,
199    }
200}
201
202/// Get error code class for feature extraction (Section 3.4)
203pub fn get_error_code_class(code: &str) -> ErrorCodeClass {
204    match code {
205        // Type errors
206        "E0308" | "E0412" => ErrorCodeClass::Type,
207        // Borrow errors
208        "E0502" | "E0503" | "E0505" | "E0382" | "E0507" => ErrorCodeClass::Borrow,
209        // Name resolution
210        "E0425" | "E0433" => ErrorCodeClass::Name,
211        // Trait errors
212        "E0277" => ErrorCodeClass::Trait,
213        // Other
214        _ => ErrorCodeClass::Other,
215    }
216}
217
218/// Import Depyler CITL corpus from JSONL file
219///
220/// # Arguments
221/// * `path` - Path to JSONL export file
222/// * `min_confidence` - Minimum confidence threshold
223///
224/// # Returns
225/// * `Ok((Vec<DepylerExport>, ImportStats))` on success
226pub fn import_depyler_corpus<P: AsRef<Path>>(
227    path: P,
228    min_confidence: f32,
229) -> Result<(Vec<DepylerExport>, ImportStats)> {
230    let content = std::fs::read_to_string(path.as_ref())
231        .map_err(|e| anyhow!("Failed to read corpus file: {}", e))?;
232
233    let mut exports = Vec::new();
234    let mut stats = ImportStats::default();
235
236    for (line_num, line) in content.lines().enumerate() {
237        if line.trim().is_empty() {
238            continue;
239        }
240
241        stats.total_records += 1;
242
243        let export: DepylerExport = serde_json::from_str(line).map_err(|e| {
244            anyhow!(
245                "Failed to parse JSON at line {}: {} - content: {}",
246                line_num + 1,
247                e,
248                line
249            )
250        })?;
251
252        // Check confidence threshold
253        if export.confidence < min_confidence {
254            stats.skipped_low_confidence += 1;
255            continue;
256        }
257
258        // Resolve category
259        let category = resolve_category(&export);
260        if category.is_none() {
261            stats.skipped_unknown_category += 1;
262            continue;
263        }
264
265        let cat = category.unwrap();
266        *stats.by_category.entry(cat).or_insert(0) += 1;
267        *stats
268            .by_source
269            .entry(export.source_file.clone())
270            .or_insert(0) += 1;
271
272        stats.imported += 1;
273        exports.push(export);
274    }
275
276    // Calculate average confidence
277    if stats.imported > 0 {
278        stats.avg_confidence =
279            exports.iter().map(|e| e.confidence).sum::<f32>() / stats.imported as f32;
280    }
281
282    Ok((exports, stats))
283}
284
285/// Convert DepylerExport records to TrainingExamples (NLP-014)
286///
287/// # Arguments
288/// * `exports` - Vector of DepylerExport records
289///
290/// # Returns
291/// * Vector of TrainingExamples with CITL source
292pub fn convert_to_training_examples(
293    exports: &[DepylerExport],
294) -> Vec<crate::training::TrainingExample> {
295    exports
296        .iter()
297        .filter_map(|export| {
298            let category = resolve_category(export)?;
299            let suggestion_applicability = export
300                .suggestion
301                .as_ref()
302                .map(|s| SuggestionApplicability::parse(&s.applicability));
303
304            Some(crate::training::TrainingExample {
305                message: export.message.clone(),
306                label: category,
307                confidence: export.confidence,
308                commit_hash: String::new(), // CITL doesn't have commit hash
309                author: "depyler".to_string(),
310                timestamp: export.timestamp,
311                lines_added: 0,
312                lines_removed: 0,
313                files_changed: 1,
314                // NLP-014: CITL fields
315                error_code: export.error_code.clone(),
316                clippy_lint: export.clippy_lint.clone(),
317                has_suggestion: export.suggestion.is_some(),
318                suggestion_applicability,
319                source: TrainingSource::DepylerCitl,
320            })
321        })
322        .collect()
323}
324
325/// Resolve DefectCategory from DepylerExport
326fn resolve_category(export: &DepylerExport) -> Option<DefectCategory> {
327    // Try pre-mapped category first
328    if let Some(ref cat_str) = export.oip_category {
329        if let Some(cat) = parse_defect_category(cat_str) {
330            return Some(cat);
331        }
332    }
333
334    // Try error code mapping
335    if let Some(ref code) = export.error_code {
336        if let Some(cat) = rustc_to_defect_category(code) {
337            return Some(cat);
338        }
339    }
340
341    // Try clippy lint mapping
342    if let Some(ref lint) = export.clippy_lint {
343        if let Some(cat) = clippy_to_defect_category(lint) {
344            return Some(cat);
345        }
346    }
347
348    None
349}
350
351/// Parse DefectCategory from string
352fn parse_defect_category(s: &str) -> Option<DefectCategory> {
353    match s {
354        "MemorySafety" => Some(DefectCategory::MemorySafety),
355        "ConcurrencyBugs" => Some(DefectCategory::ConcurrencyBugs),
356        "LogicErrors" => Some(DefectCategory::LogicErrors),
357        "ApiMisuse" => Some(DefectCategory::ApiMisuse),
358        "ResourceLeaks" => Some(DefectCategory::ResourceLeaks),
359        "TypeErrors" => Some(DefectCategory::TypeErrors),
360        "ConfigurationErrors" => Some(DefectCategory::ConfigurationErrors),
361        "SecurityVulnerabilities" => Some(DefectCategory::SecurityVulnerabilities),
362        "PerformanceIssues" => Some(DefectCategory::PerformanceIssues),
363        "IntegrationFailures" => Some(DefectCategory::IntegrationFailures),
364        "OperatorPrecedence" => Some(DefectCategory::OperatorPrecedence),
365        "TypeAnnotationGaps" => Some(DefectCategory::TypeAnnotationGaps),
366        "StdlibMapping" => Some(DefectCategory::StdlibMapping),
367        "ASTTransform" => Some(DefectCategory::ASTTransform),
368        "ComprehensionBugs" => Some(DefectCategory::ComprehensionBugs),
369        "IteratorChain" => Some(DefectCategory::IteratorChain),
370        "OwnershipBorrow" => Some(DefectCategory::OwnershipBorrow),
371        "TraitBounds" => Some(DefectCategory::TraitBounds),
372        _ => None,
373    }
374}
375
376// ==================== Alimentar DataLoader Integration ====================
377
/// How CITL examples are combined with existing training data.
///
/// This type only names the strategy; applying it is left to whatever code
/// consumes the value.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum MergeStrategy {
    /// Add the CITL examples after the existing data (the `Default` value).
    #[default]
    Append,
    /// Use the CITL examples in place of the existing data.
    Replace,
    /// Give CITL examples more weight; the payload is the multiplier.
    Weighted(u32),
}
389
390/// Configuration for CITL DataLoader
391#[derive(Debug, Clone)]
392pub struct CitlLoaderConfig {
393    /// Batch size for data loading
394    pub batch_size: usize,
395    /// Whether to shuffle the data
396    pub shuffle: bool,
397    /// Minimum confidence threshold
398    pub min_confidence: f32,
399    /// Merge strategy for combining with existing data
400    pub merge_strategy: MergeStrategy,
401    /// Weight multiplier for CITL examples (used with Weighted strategy)
402    pub weight: f32,
403}
404
405impl Default for CitlLoaderConfig {
406    fn default() -> Self {
407        Self {
408            batch_size: 128,
409            shuffle: true,
410            min_confidence: 0.75,
411            merge_strategy: MergeStrategy::Append,
412            weight: 1.0,
413        }
414    }
415}
416
417/// CITL DataLoader using alimentar for efficient data loading
418pub struct CitlDataLoader {
419    config: CitlLoaderConfig,
420}
421
422impl CitlDataLoader {
423    /// Create a new CITL DataLoader with default configuration
424    pub fn new() -> Self {
425        Self {
426            config: CitlLoaderConfig::default(),
427        }
428    }
429
430    /// Create a new CITL DataLoader with custom configuration
431    pub fn with_config(config: CitlLoaderConfig) -> Self {
432        Self { config }
433    }
434
435    /// Set batch size
436    pub fn batch_size(mut self, size: usize) -> Self {
437        self.config.batch_size = size;
438        self
439    }
440
441    /// Enable/disable shuffling
442    pub fn shuffle(mut self, shuffle: bool) -> Self {
443        self.config.shuffle = shuffle;
444        self
445    }
446
447    /// Set minimum confidence threshold
448    pub fn min_confidence(mut self, confidence: f32) -> Self {
449        self.config.min_confidence = confidence;
450        self
451    }
452
453    /// Set merge strategy
454    pub fn merge_strategy(mut self, strategy: MergeStrategy) -> Self {
455        self.config.merge_strategy = strategy;
456        self
457    }
458
459    /// Load CITL corpus from Parquet file using alimentar
460    ///
461    /// Returns an iterator over batches of TrainingExamples
462    pub fn load_parquet<P: AsRef<Path>>(&self, path: P) -> Result<CitlBatchIterator> {
463        use alimentar::{ArrowDataset, DataLoader};
464
465        let dataset = ArrowDataset::from_parquet(path.as_ref())
466            .map_err(|e| anyhow!("Failed to load Parquet: {}", e))?;
467
468        let mut loader = DataLoader::new(dataset).batch_size(self.config.batch_size);
469
470        if self.config.shuffle {
471            loader = loader.shuffle(true);
472        }
473
474        Ok(CitlBatchIterator {
475            inner: Box::new(loader.into_iter()),
476            min_confidence: self.config.min_confidence,
477        })
478    }
479
480    /// Load CITL corpus from JSONL file (streaming)
481    pub fn load_jsonl<P: AsRef<Path>>(
482        &self,
483        path: P,
484    ) -> Result<(Vec<crate::training::TrainingExample>, ImportStats)> {
485        let (exports, stats) = import_depyler_corpus(path, self.config.min_confidence)?;
486        let examples = convert_to_training_examples(&exports);
487        Ok((examples, stats))
488    }
489
490    /// Get the configuration
491    pub fn config(&self) -> &CitlLoaderConfig {
492        &self.config
493    }
494}
495
496impl Default for CitlDataLoader {
497    fn default() -> Self {
498        Self::new()
499    }
500}
501
502/// Iterator over batches of training examples from alimentar
503pub struct CitlBatchIterator {
504    inner: Box<dyn Iterator<Item = arrow::array::RecordBatch> + Send>,
505    min_confidence: f32,
506}
507
508impl Iterator for CitlBatchIterator {
509    type Item = Vec<crate::training::TrainingExample>;
510
511    fn next(&mut self) -> Option<Self::Item> {
512        self.inner.next().map(|batch| {
513            // Convert Arrow RecordBatch to TrainingExamples
514            convert_batch_to_examples(&batch, self.min_confidence)
515        })
516    }
517}
518
519/// Convert an Arrow RecordBatch to TrainingExamples
520fn convert_batch_to_examples(
521    batch: &arrow::array::RecordBatch,
522    min_confidence: f32,
523) -> Vec<crate::training::TrainingExample> {
524    use arrow::array::{Array, Float32Array, Int64Array, StringArray};
525
526    let num_rows = batch.num_rows();
527    let mut examples = Vec::with_capacity(num_rows);
528
529    // Downcast columns upfront for efficiency
530    let message_arr = batch
531        .column_by_name("message")
532        .and_then(|c| c.as_any().downcast_ref::<StringArray>());
533    let error_code_arr = batch
534        .column_by_name("error_code")
535        .and_then(|c| c.as_any().downcast_ref::<StringArray>());
536    let clippy_lint_arr = batch
537        .column_by_name("clippy_lint")
538        .and_then(|c| c.as_any().downcast_ref::<StringArray>());
539    let confidence_arr = batch
540        .column_by_name("confidence")
541        .and_then(|c| c.as_any().downcast_ref::<Float32Array>());
542    let timestamp_arr = batch
543        .column_by_name("timestamp")
544        .and_then(|c| c.as_any().downcast_ref::<Int64Array>());
545    let oip_category_arr = batch
546        .column_by_name("oip_category")
547        .and_then(|c| c.as_any().downcast_ref::<StringArray>());
548
549    for i in 0..num_rows {
550        // Extract confidence
551        let confidence = confidence_arr.map(|a| a.value(i)).unwrap_or(0.0);
552
553        if confidence < min_confidence {
554            continue;
555        }
556
557        // Extract message
558        let message = message_arr
559            .and_then(|a| {
560                if a.is_null(i) {
561                    None
562                } else {
563                    Some(a.value(i).to_string())
564                }
565            })
566            .unwrap_or_default();
567
568        // Extract error_code
569        let error_code = error_code_arr.and_then(|a| {
570            if a.is_null(i) {
571                None
572            } else {
573                Some(a.value(i).to_string())
574            }
575        });
576
577        // Extract clippy_lint
578        let clippy_lint = clippy_lint_arr.and_then(|a| {
579            if a.is_null(i) {
580                None
581            } else {
582                Some(a.value(i).to_string())
583            }
584        });
585
586        // Extract timestamp
587        let timestamp = timestamp_arr.map(|a| a.value(i)).unwrap_or(0);
588
589        // Resolve category
590        let oip_category =
591            oip_category_arr.and_then(|a| if a.is_null(i) { None } else { Some(a.value(i)) });
592
593        let category = oip_category
594            .and_then(parse_defect_category)
595            .or_else(|| error_code.as_deref().and_then(rustc_to_defect_category))
596            .or_else(|| clippy_lint.as_deref().and_then(clippy_to_defect_category));
597
598        if let Some(label) = category {
599            examples.push(crate::training::TrainingExample {
600                message,
601                label,
602                confidence,
603                commit_hash: String::new(),
604                author: "depyler".to_string(),
605                timestamp,
606                lines_added: 0,
607                lines_removed: 0,
608                files_changed: 1,
609                error_code,
610                clippy_lint,
611                has_suggestion: false,
612                suggestion_applicability: None,
613                source: TrainingSource::DepylerCitl,
614            });
615        }
616    }
617
618    examples
619}
620
621/// Validate CITL export schema (FR-8)
622pub fn validate_citl_schema<P: AsRef<Path>>(path: P) -> Result<SchemaValidation> {
623    use alimentar::{ArrowDataset, Dataset};
624
625    let ext = path.as_ref().extension().and_then(|e| e.to_str());
626
627    let schema = match ext {
628        Some("parquet") => {
629            let dataset = ArrowDataset::from_parquet(path.as_ref())
630                .map_err(|e| anyhow!("Failed to load Parquet: {}", e))?;
631            dataset.schema()
632        }
633        Some("jsonl") | Some("json") => {
634            // For JSONL, we validate the first line
635            let content = std::fs::read_to_string(path.as_ref())?;
636            let first_line = content
637                .lines()
638                .next()
639                .ok_or_else(|| anyhow!("Empty file"))?;
640            let _: DepylerExport = serde_json::from_str(first_line)
641                .map_err(|e| anyhow!("Invalid JSONL schema: {}", e))?;
642            return Ok(SchemaValidation {
643                is_valid: true,
644                missing_fields: vec![],
645                extra_fields: vec![],
646                format: "jsonl".to_string(),
647            });
648        }
649        _ => return Err(anyhow!("Unsupported file format: {:?}", ext)),
650    };
651
652    // Required fields for CITL schema
653    let required_fields = ["message", "confidence", "timestamp"];
654    let optional_fields = [
655        "error_code",
656        "clippy_lint",
657        "oip_category",
658        "suggestion",
659        "span",
660    ];
661
662    let schema_fields: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
663
664    let missing: Vec<String> = required_fields
665        .iter()
666        .filter(|f| !schema_fields.contains(*f))
667        .map(|s: &&str| (*s).to_string())
668        .collect();
669
670    let known_fields: Vec<&str> = required_fields
671        .iter()
672        .chain(optional_fields.iter())
673        .copied()
674        .collect();
675
676    let extra: Vec<String> = schema_fields
677        .iter()
678        .filter(|f| !known_fields.contains(*f))
679        .map(|s: &&str| (*s).to_string())
680        .collect();
681
682    Ok(SchemaValidation {
683        is_valid: missing.is_empty(),
684        missing_fields: missing,
685        extra_fields: extra,
686        format: "parquet".to_string(),
687    })
688}
689
/// Outcome of validating a CITL export file's schema.
#[derive(Clone, Debug)]
pub struct SchemaValidation {
    /// `true` when no required field is missing
    pub is_valid: bool,
    /// Names of required fields that were not found
    pub missing_fields: Vec<String>,
    /// Names of fields that are not part of the expected schema
    pub extra_fields: Vec<String>,
    /// Format that was detected (e.g. "parquet" or "jsonl")
    pub format: String,
}
702
703#[cfg(test)]
704mod tests {
705    use super::*;
706
707    // ==================== rustc_to_defect_category tests ====================
708
709    #[test]
710    fn test_rustc_type_error_e0308() {
711        assert_eq!(
712            rustc_to_defect_category("E0308"),
713            Some(DefectCategory::TypeErrors)
714        );
715    }
716
717    #[test]
718    fn test_rustc_type_annotation_e0412() {
719        assert_eq!(
720            rustc_to_defect_category("E0412"),
721            Some(DefectCategory::TypeAnnotationGaps)
722        );
723    }
724
725    #[test]
726    fn test_rustc_ownership_borrow_e0502() {
727        assert_eq!(
728            rustc_to_defect_category("E0502"),
729            Some(DefectCategory::OwnershipBorrow)
730        );
731    }
732
733    #[test]
734    fn test_rustc_ownership_borrow_e0503() {
735        assert_eq!(
736            rustc_to_defect_category("E0503"),
737            Some(DefectCategory::OwnershipBorrow)
738        );
739    }
740
741    #[test]
742    fn test_rustc_ownership_borrow_e0505() {
743        assert_eq!(
744            rustc_to_defect_category("E0505"),
745            Some(DefectCategory::OwnershipBorrow)
746        );
747    }
748
749    #[test]
750    fn test_rustc_memory_safety_e0382() {
751        assert_eq!(
752            rustc_to_defect_category("E0382"),
753            Some(DefectCategory::MemorySafety)
754        );
755    }
756
757    #[test]
758    fn test_rustc_memory_safety_e0507() {
759        assert_eq!(
760            rustc_to_defect_category("E0507"),
761            Some(DefectCategory::MemorySafety)
762        );
763    }
764
765    #[test]
766    fn test_rustc_trait_bounds_e0277() {
767        assert_eq!(
768            rustc_to_defect_category("E0277"),
769            Some(DefectCategory::TraitBounds)
770        );
771    }
772
773    #[test]
774    fn test_rustc_stdlib_mapping_e0425() {
775        assert_eq!(
776            rustc_to_defect_category("E0425"),
777            Some(DefectCategory::StdlibMapping)
778        );
779    }
780
781    #[test]
782    fn test_rustc_stdlib_mapping_e0433() {
783        assert_eq!(
784            rustc_to_defect_category("E0433"),
785            Some(DefectCategory::StdlibMapping)
786        );
787    }
788
789    #[test]
790    fn test_rustc_ast_transform_e0599() {
791        assert_eq!(
792            rustc_to_defect_category("E0599"),
793            Some(DefectCategory::ASTTransform)
794        );
795    }
796
797    #[test]
798    fn test_rustc_ast_transform_e0615() {
799        assert_eq!(
800            rustc_to_defect_category("E0615"),
801            Some(DefectCategory::ASTTransform)
802        );
803    }
804
805    #[test]
806    fn test_rustc_operator_precedence_e0614() {
807        assert_eq!(
808            rustc_to_defect_category("E0614"),
809            Some(DefectCategory::OperatorPrecedence)
810        );
811    }
812
813    #[test]
814    fn test_rustc_configuration_e0658() {
815        assert_eq!(
816            rustc_to_defect_category("E0658"),
817            Some(DefectCategory::ConfigurationErrors)
818        );
819    }
820
821    #[test]
822    fn test_rustc_unknown_code_returns_none() {
823        assert_eq!(rustc_to_defect_category("E9999"), None);
824        assert_eq!(rustc_to_defect_category("UNKNOWN"), None);
825        assert_eq!(rustc_to_defect_category(""), None);
826    }
827
828    // ==================== clippy_to_defect_category tests ====================
829
830    #[test]
831    fn test_clippy_api_misuse_unwrap() {
832        assert_eq!(
833            clippy_to_defect_category("clippy::unwrap_used"),
834            Some(DefectCategory::ApiMisuse)
835        );
836    }
837
838    #[test]
839    fn test_clippy_api_misuse_expect() {
840        assert_eq!(
841            clippy_to_defect_category("clippy::expect_used"),
842            Some(DefectCategory::ApiMisuse)
843        );
844    }
845
846    #[test]
847    fn test_clippy_api_misuse_panic() {
848        assert_eq!(
849            clippy_to_defect_category("clippy::panic"),
850            Some(DefectCategory::ApiMisuse)
851        );
852    }
853
854    #[test]
855    fn test_clippy_logic_errors_todo() {
856        assert_eq!(
857            clippy_to_defect_category("clippy::todo"),
858            Some(DefectCategory::LogicErrors)
859        );
860    }
861
862    #[test]
863    fn test_clippy_logic_errors_unreachable() {
864        assert_eq!(
865            clippy_to_defect_category("clippy::unreachable"),
866            Some(DefectCategory::LogicErrors)
867        );
868    }
869
870    #[test]
871    fn test_clippy_performance_cognitive_complexity() {
872        assert_eq!(
873            clippy_to_defect_category("clippy::cognitive_complexity"),
874            Some(DefectCategory::PerformanceIssues)
875        );
876    }
877
878    #[test]
879    fn test_clippy_ast_transform_too_many_arguments() {
880        assert_eq!(
881            clippy_to_defect_category("clippy::too_many_arguments"),
882            Some(DefectCategory::ASTTransform)
883        );
884    }
885
886    #[test]
887    fn test_clippy_ast_transform_match_single_binding() {
888        assert_eq!(
889            clippy_to_defect_category("clippy::match_single_binding"),
890            Some(DefectCategory::ASTTransform)
891        );
892    }
893
894    #[test]
895    fn test_clippy_iterator_chain_needless_collect() {
896        assert_eq!(
897            clippy_to_defect_category("clippy::needless_collect"),
898            Some(DefectCategory::IteratorChain)
899        );
900    }
901
902    #[test]
903    fn test_clippy_comprehension_bugs_manual_map() {
904        assert_eq!(
905            clippy_to_defect_category("clippy::manual_map"),
906            Some(DefectCategory::ComprehensionBugs)
907        );
908    }
909
910    #[test]
911    fn test_clippy_unknown_lint_returns_none() {
912        assert_eq!(clippy_to_defect_category("clippy::unknown_lint"), None);
913        assert_eq!(clippy_to_defect_category("not_clippy"), None);
914        assert_eq!(clippy_to_defect_category(""), None);
915    }
916
917    // ==================== error_code_class tests ====================
918
919    #[test]
920    fn test_error_code_class_type() {
921        assert_eq!(get_error_code_class("E0308"), ErrorCodeClass::Type);
922        assert_eq!(get_error_code_class("E0412"), ErrorCodeClass::Type);
923    }
924
925    #[test]
926    fn test_error_code_class_borrow() {
927        assert_eq!(get_error_code_class("E0502"), ErrorCodeClass::Borrow);
928        assert_eq!(get_error_code_class("E0503"), ErrorCodeClass::Borrow);
929        assert_eq!(get_error_code_class("E0505"), ErrorCodeClass::Borrow);
930        assert_eq!(get_error_code_class("E0382"), ErrorCodeClass::Borrow);
931        assert_eq!(get_error_code_class("E0507"), ErrorCodeClass::Borrow);
932    }
933
934    #[test]
935    fn test_error_code_class_name() {
936        assert_eq!(get_error_code_class("E0425"), ErrorCodeClass::Name);
937        assert_eq!(get_error_code_class("E0433"), ErrorCodeClass::Name);
938    }
939
940    #[test]
941    fn test_error_code_class_trait() {
942        assert_eq!(get_error_code_class("E0277"), ErrorCodeClass::Trait);
943    }
944
945    #[test]
946    fn test_error_code_class_other() {
947        assert_eq!(get_error_code_class("E9999"), ErrorCodeClass::Other);
948        assert_eq!(get_error_code_class("UNKNOWN"), ErrorCodeClass::Other);
949    }
950
951    #[test]
952    fn test_error_code_class_as_u8() {
953        assert_eq!(ErrorCodeClass::Type.as_u8(), 0);
954        assert_eq!(ErrorCodeClass::Borrow.as_u8(), 1);
955        assert_eq!(ErrorCodeClass::Name.as_u8(), 2);
956        assert_eq!(ErrorCodeClass::Trait.as_u8(), 3);
957        assert_eq!(ErrorCodeClass::Other.as_u8(), 4);
958    }
959
960    // ==================== SuggestionApplicability tests ====================
961
962    #[test]
963    fn test_suggestion_applicability_parse() {
964        assert_eq!(
965            SuggestionApplicability::parse("MachineApplicable"),
966            SuggestionApplicability::MachineApplicable
967        );
968        assert_eq!(
969            SuggestionApplicability::parse("MaybeIncorrect"),
970            SuggestionApplicability::MaybeIncorrect
971        );
972        assert_eq!(
973            SuggestionApplicability::parse("HasPlaceholders"),
974            SuggestionApplicability::HasPlaceholders
975        );
976        assert_eq!(
977            SuggestionApplicability::parse("Unknown"),
978            SuggestionApplicability::None
979        );
980    }
981
982    #[test]
983    fn test_suggestion_applicability_as_u8() {
984        assert_eq!(SuggestionApplicability::None.as_u8(), 0);
985        assert_eq!(SuggestionApplicability::MachineApplicable.as_u8(), 1);
986        assert_eq!(SuggestionApplicability::MaybeIncorrect.as_u8(), 2);
987        assert_eq!(SuggestionApplicability::HasPlaceholders.as_u8(), 3);
988    }
989
990    // ==================== get_error_code_confidence tests ====================
991
992    #[test]
993    fn test_error_code_confidence_high() {
994        assert!((get_error_code_confidence("E0308") - 0.95).abs() < 0.001);
995        assert!((get_error_code_confidence("E0277") - 0.95).abs() < 0.001);
996        assert!((get_error_code_confidence("E0502") - 0.95).abs() < 0.001);
997    }
998
999    #[test]
1000    fn test_error_code_confidence_medium() {
1001        assert!((get_error_code_confidence("E0382") - 0.90).abs() < 0.001);
1002        assert!((get_error_code_confidence("E0425") - 0.85).abs() < 0.001);
1003        assert!((get_error_code_confidence("E0599") - 0.80).abs() < 0.001);
1004    }
1005
1006    #[test]
1007    fn test_error_code_confidence_low() {
1008        assert!((get_error_code_confidence("E0658") - 0.75).abs() < 0.001);
1009        assert!((get_error_code_confidence("UNKNOWN") - 0.70).abs() < 0.001);
1010    }
1011
1012    // ==================== DepylerExport parsing tests ====================
1013
1014    #[test]
1015    fn test_depyler_export_parse() {
1016        let json = r#"{
1017            "source_file": "example.py",
1018            "error_code": "E0308",
1019            "clippy_lint": null,
1020            "level": "error",
1021            "message": "mismatched types",
1022            "oip_category": "TypeErrors",
1023            "confidence": 0.95,
1024            "span": {"line_start": 42, "column_start": 12},
1025            "suggestion": {"replacement": ".parse::<i32>()", "applicability": "MaybeIncorrect"},
1026            "timestamp": 1732752000,
1027            "depyler_version": "3.21.0"
1028        }"#;
1029
1030        let export: DepylerExport = serde_json::from_str(json).unwrap();
1031
1032        assert_eq!(export.source_file, "example.py");
1033        assert_eq!(export.error_code, Some("E0308".to_string()));
1034        assert_eq!(export.clippy_lint, None);
1035        assert_eq!(export.level, "error");
1036        assert!((export.confidence - 0.95).abs() < 0.001);
1037        assert_eq!(export.span.as_ref().unwrap().line_start, 42);
1038        assert_eq!(
1039            export.suggestion.as_ref().unwrap().applicability,
1040            "MaybeIncorrect"
1041        );
1042    }
1043
1044    #[test]
1045    fn test_depyler_export_minimal() {
1046        let json = r#"{
1047            "source_file": "test.py",
1048            "error_code": null,
1049            "clippy_lint": "clippy::unwrap_used",
1050            "level": "warning",
1051            "message": "unwrap used",
1052            "oip_category": null,
1053            "confidence": 0.80,
1054            "span": null,
1055            "suggestion": null,
1056            "timestamp": 1732752000,
1057            "depyler_version": "3.21.0"
1058        }"#;
1059
1060        let export: DepylerExport = serde_json::from_str(json).unwrap();
1061
1062        assert_eq!(export.error_code, None);
1063        assert_eq!(export.clippy_lint, Some("clippy::unwrap_used".to_string()));
1064        assert_eq!(export.span, None);
1065        assert_eq!(export.suggestion, None);
1066    }
1067
1068    // ==================== resolve_category tests ====================
1069
1070    #[test]
1071    fn test_resolve_category_from_oip_category() {
1072        let export = DepylerExport {
1073            source_file: "test.py".to_string(),
1074            error_code: None,
1075            clippy_lint: None,
1076            level: "error".to_string(),
1077            message: "test".to_string(),
1078            oip_category: Some("MemorySafety".to_string()),
1079            confidence: 0.90,
1080            span: None,
1081            suggestion: None,
1082            timestamp: 0,
1083            depyler_version: "1.0".to_string(),
1084        };
1085
1086        assert_eq!(
1087            resolve_category(&export),
1088            Some(DefectCategory::MemorySafety)
1089        );
1090    }
1091
1092    #[test]
1093    fn test_resolve_category_from_error_code() {
1094        let export = DepylerExport {
1095            source_file: "test.py".to_string(),
1096            error_code: Some("E0308".to_string()),
1097            clippy_lint: None,
1098            level: "error".to_string(),
1099            message: "test".to_string(),
1100            oip_category: None,
1101            confidence: 0.90,
1102            span: None,
1103            suggestion: None,
1104            timestamp: 0,
1105            depyler_version: "1.0".to_string(),
1106        };
1107
1108        assert_eq!(resolve_category(&export), Some(DefectCategory::TypeErrors));
1109    }
1110
1111    #[test]
1112    fn test_resolve_category_from_clippy_lint() {
1113        let export = DepylerExport {
1114            source_file: "test.py".to_string(),
1115            error_code: None,
1116            clippy_lint: Some("clippy::unwrap_used".to_string()),
1117            level: "warning".to_string(),
1118            message: "test".to_string(),
1119            oip_category: None,
1120            confidence: 0.90,
1121            span: None,
1122            suggestion: None,
1123            timestamp: 0,
1124            depyler_version: "1.0".to_string(),
1125        };
1126
1127        assert_eq!(resolve_category(&export), Some(DefectCategory::ApiMisuse));
1128    }
1129
1130    #[test]
1131    fn test_resolve_category_unknown() {
1132        let export = DepylerExport {
1133            source_file: "test.py".to_string(),
1134            error_code: Some("E9999".to_string()),
1135            clippy_lint: None,
1136            level: "error".to_string(),
1137            message: "test".to_string(),
1138            oip_category: None,
1139            confidence: 0.90,
1140            span: None,
1141            suggestion: None,
1142            timestamp: 0,
1143            depyler_version: "1.0".to_string(),
1144        };
1145
1146        assert_eq!(resolve_category(&export), None);
1147    }
1148
1149    // ==================== parse_defect_category tests ====================
1150
1151    #[test]
1152    fn test_parse_all_defect_categories() {
1153        let categories = vec![
1154            ("MemorySafety", DefectCategory::MemorySafety),
1155            ("ConcurrencyBugs", DefectCategory::ConcurrencyBugs),
1156            ("LogicErrors", DefectCategory::LogicErrors),
1157            ("ApiMisuse", DefectCategory::ApiMisuse),
1158            ("ResourceLeaks", DefectCategory::ResourceLeaks),
1159            ("TypeErrors", DefectCategory::TypeErrors),
1160            ("ConfigurationErrors", DefectCategory::ConfigurationErrors),
1161            (
1162                "SecurityVulnerabilities",
1163                DefectCategory::SecurityVulnerabilities,
1164            ),
1165            ("PerformanceIssues", DefectCategory::PerformanceIssues),
1166            ("IntegrationFailures", DefectCategory::IntegrationFailures),
1167            ("OperatorPrecedence", DefectCategory::OperatorPrecedence),
1168            ("TypeAnnotationGaps", DefectCategory::TypeAnnotationGaps),
1169            ("StdlibMapping", DefectCategory::StdlibMapping),
1170            ("ASTTransform", DefectCategory::ASTTransform),
1171            ("ComprehensionBugs", DefectCategory::ComprehensionBugs),
1172            ("IteratorChain", DefectCategory::IteratorChain),
1173            ("OwnershipBorrow", DefectCategory::OwnershipBorrow),
1174            ("TraitBounds", DefectCategory::TraitBounds),
1175        ];
1176
1177        for (s, expected) in categories {
1178            assert_eq!(
1179                parse_defect_category(s),
1180                Some(expected),
1181                "Failed for: {}",
1182                s
1183            );
1184        }
1185    }
1186
1187    #[test]
1188    fn test_parse_unknown_category() {
1189        assert_eq!(parse_defect_category("Unknown"), None);
1190        assert_eq!(parse_defect_category(""), None);
1191    }
1192
1193    // ==================== TrainingSource tests ====================
1194
1195    #[test]
1196    fn test_training_source_default() {
1197        assert_eq!(TrainingSource::default(), TrainingSource::CommitMessage);
1198    }
1199
1200    #[test]
1201    fn test_training_source_serialization() {
1202        let source = TrainingSource::DepylerCitl;
1203        let json = serde_json::to_string(&source).unwrap();
1204        let parsed: TrainingSource = serde_json::from_str(&json).unwrap();
1205        assert_eq!(parsed, TrainingSource::DepylerCitl);
1206    }
1207
1208    // ==================== import_depyler_corpus tests ====================
1209
1210    #[test]
1211    fn test_import_depyler_corpus_file_not_found() {
1212        let result = import_depyler_corpus("/nonexistent/path.jsonl", 0.75);
1213        assert!(result.is_err());
1214    }
1215
1216    #[test]
1217    fn test_import_stats_default() {
1218        let stats = ImportStats::default();
1219        assert_eq!(stats.total_records, 0);
1220        assert_eq!(stats.imported, 0);
1221        assert_eq!(stats.skipped_low_confidence, 0);
1222        assert_eq!(stats.skipped_unknown_category, 0);
1223        assert!(stats.by_category.is_empty());
1224        assert!((stats.avg_confidence - 0.0).abs() < 0.001);
1225    }
1226
1227    // ==================== convert_to_training_examples tests ====================
1228
1229    #[test]
1230    fn test_convert_to_training_examples_basic() {
1231        let exports = vec![DepylerExport {
1232            source_file: "test.py".to_string(),
1233            error_code: Some("E0308".to_string()),
1234            clippy_lint: None,
1235            level: "error".to_string(),
1236            message: "mismatched types".to_string(),
1237            oip_category: None,
1238            confidence: 0.95,
1239            span: None,
1240            suggestion: None,
1241            timestamp: 1732752000,
1242            depyler_version: "3.21.0".to_string(),
1243        }];
1244
1245        let examples = convert_to_training_examples(&exports);
1246        assert_eq!(examples.len(), 1);
1247        assert_eq!(examples[0].label, DefectCategory::TypeErrors);
1248        assert_eq!(examples[0].message, "mismatched types");
1249        assert!((examples[0].confidence - 0.95).abs() < 0.001);
1250        assert_eq!(examples[0].error_code, Some("E0308".to_string()));
1251        assert_eq!(examples[0].source, TrainingSource::DepylerCitl);
1252    }
1253
1254    #[test]
1255    fn test_convert_to_training_examples_with_suggestion() {
1256        let exports = vec![DepylerExport {
1257            source_file: "test.py".to_string(),
1258            error_code: Some("E0308".to_string()),
1259            clippy_lint: None,
1260            level: "error".to_string(),
1261            message: "type error".to_string(),
1262            oip_category: None,
1263            confidence: 0.90,
1264            span: None,
1265            suggestion: Some(SuggestionInfo {
1266                replacement: ".parse::<i32>()".to_string(),
1267                applicability: "MachineApplicable".to_string(),
1268            }),
1269            timestamp: 1732752000,
1270            depyler_version: "3.21.0".to_string(),
1271        }];
1272
1273        let examples = convert_to_training_examples(&exports);
1274        assert_eq!(examples.len(), 1);
1275        assert!(examples[0].has_suggestion);
1276        assert_eq!(
1277            examples[0].suggestion_applicability,
1278            Some(SuggestionApplicability::MachineApplicable)
1279        );
1280    }
1281
1282    #[test]
1283    fn test_convert_to_training_examples_filters_unknown() {
1284        let exports = vec![
1285            DepylerExport {
1286                source_file: "test.py".to_string(),
1287                error_code: Some("E0308".to_string()),
1288                clippy_lint: None,
1289                level: "error".to_string(),
1290                message: "known error".to_string(),
1291                oip_category: None,
1292                confidence: 0.90,
1293                span: None,
1294                suggestion: None,
1295                timestamp: 0,
1296                depyler_version: "1.0".to_string(),
1297            },
1298            DepylerExport {
1299                source_file: "test.py".to_string(),
1300                error_code: Some("E9999".to_string()), // Unknown error code
1301                clippy_lint: None,
1302                level: "error".to_string(),
1303                message: "unknown error".to_string(),
1304                oip_category: None,
1305                confidence: 0.90,
1306                span: None,
1307                suggestion: None,
1308                timestamp: 0,
1309                depyler_version: "1.0".to_string(),
1310            },
1311        ];
1312
1313        let examples = convert_to_training_examples(&exports);
1314        assert_eq!(examples.len(), 1);
1315        assert_eq!(examples[0].message, "known error");
1316    }
1317
1318    // ==================== MergeStrategy tests ====================
1319
1320    #[test]
1321    fn test_merge_strategy_default() {
1322        let strategy = MergeStrategy::default();
1323        assert!(matches!(strategy, MergeStrategy::Append));
1324    }
1325
1326    #[test]
1327    fn test_merge_strategy_append() {
1328        let strategy = MergeStrategy::Append;
1329        assert!(matches!(strategy, MergeStrategy::Append));
1330    }
1331
1332    #[test]
1333    fn test_merge_strategy_replace() {
1334        let strategy = MergeStrategy::Replace;
1335        assert!(matches!(strategy, MergeStrategy::Replace));
1336    }
1337
1338    #[test]
1339    fn test_merge_strategy_weighted() {
1340        let strategy = MergeStrategy::Weighted(2);
1341        if let MergeStrategy::Weighted(multiplier) = strategy {
1342            assert_eq!(multiplier, 2);
1343        } else {
1344            panic!("Expected MergeStrategy::Weighted");
1345        }
1346    }
1347
1348    // ==================== CitlLoaderConfig tests ====================
1349
1350    #[test]
1351    fn test_citl_loader_config_default() {
1352        let config = CitlLoaderConfig::default();
1353        assert_eq!(config.batch_size, 128);
1354        assert!((config.min_confidence - 0.75).abs() < 0.001);
1355        assert!(matches!(config.merge_strategy, MergeStrategy::Append));
1356        assert!(config.shuffle);
1357        assert!((config.weight - 1.0).abs() < 0.001);
1358    }
1359
1360    #[test]
1361    fn test_citl_loader_config_custom() {
1362        let config = CitlLoaderConfig {
1363            batch_size: 512,
1364            min_confidence: 0.9,
1365            merge_strategy: MergeStrategy::Replace,
1366            shuffle: false,
1367            weight: 2.0,
1368        };
1369        assert_eq!(config.batch_size, 512);
1370        assert!((config.min_confidence - 0.9).abs() < 0.001);
1371        assert!(!config.shuffle);
1372        assert!((config.weight - 2.0).abs() < 0.001);
1373    }
1374
1375    // ==================== CitlDataLoader tests ====================
1376
1377    #[test]
1378    fn test_citl_data_loader_new() {
1379        let loader = CitlDataLoader::new();
1380        assert_eq!(loader.config().batch_size, 128);
1381    }
1382
1383    #[test]
1384    fn test_citl_data_loader_with_config() {
1385        let config = CitlLoaderConfig {
1386            batch_size: 256,
1387            min_confidence: 0.8,
1388            ..CitlLoaderConfig::default()
1389        };
1390        let loader = CitlDataLoader::with_config(config);
1391        assert_eq!(loader.config().batch_size, 256);
1392        assert!((loader.config().min_confidence - 0.8).abs() < 0.001);
1393    }
1394
1395    #[test]
1396    fn test_citl_data_loader_default() {
1397        let loader = CitlDataLoader::default();
1398        assert_eq!(loader.config().batch_size, 128);
1399    }
1400
1401    #[test]
1402    fn test_citl_data_loader_load_jsonl_not_found() {
1403        let loader = CitlDataLoader::new();
1404        let result = loader.load_jsonl("nonexistent.jsonl");
1405        assert!(result.is_err());
1406    }
1407
1408    #[test]
1409    fn test_citl_data_loader_load_parquet_not_found() {
1410        let loader = CitlDataLoader::new();
1411        let result = loader.load_parquet("nonexistent.parquet");
1412        assert!(result.is_err());
1413    }
1414
1415    #[test]
1416    fn test_citl_data_loader_load_jsonl_valid() {
1417        use std::io::Write;
1418        let temp_dir = tempfile::tempdir().unwrap();
1419        let file_path = temp_dir.path().join("valid.jsonl");
1420        let mut file = std::fs::File::create(&file_path).unwrap();
1421
1422        // Write valid CITL entries
1423        writeln!(file, r#"{{"source_file":"test.py","error_code":"E0308","clippy_lint":null,"level":"error","message":"type mismatch","oip_category":null,"confidence":0.95,"span":null,"suggestion":null,"timestamp":1732752000,"depyler_version":"1.0"}}"#).unwrap();
1424        writeln!(file, r#"{{"source_file":"test.py","error_code":null,"clippy_lint":"clippy::unwrap_used","level":"warning","message":"unwrap used","oip_category":null,"confidence":0.85,"span":null,"suggestion":null,"timestamp":1732752001,"depyler_version":"1.0"}}"#).unwrap();
1425
1426        let loader = CitlDataLoader::new();
1427        let result = loader.load_jsonl(&file_path);
1428        assert!(result.is_ok());
1429
1430        let (examples, stats) = result.unwrap();
1431        assert_eq!(examples.len(), 2);
1432        assert_eq!(stats.total_records, 2);
1433        assert_eq!(stats.imported, 2);
1434    }
1435
1436    #[test]
1437    fn test_citl_data_loader_load_parquet_valid() {
1438        use arrow::array::{Float32Array, Int64Array, StringArray};
1439        use arrow::datatypes::{DataType, Field, Schema};
1440        use parquet::arrow::ArrowWriter;
1441        use std::fs::File;
1442        use std::sync::Arc;
1443
1444        let temp_dir = tempfile::tempdir().unwrap();
1445        let file_path = temp_dir.path().join("valid.parquet");
1446
1447        // Create schema
1448        let schema = Arc::new(Schema::new(vec![
1449            Field::new("message", DataType::Utf8, false),
1450            Field::new("confidence", DataType::Float32, false),
1451            Field::new("error_code", DataType::Utf8, true),
1452            Field::new("timestamp", DataType::Int64, false),
1453        ]));
1454
1455        // Create data
1456        let message_arr = StringArray::from(vec!["type mismatch", "api misuse"]);
1457        let confidence_arr = Float32Array::from(vec![0.95, 0.88]);
1458        let error_code_arr = StringArray::from(vec![Some("E0308"), None]);
1459        let timestamp_arr = Int64Array::from(vec![1732752000, 1732752001]);
1460
1461        let batch = arrow::array::RecordBatch::try_new(
1462            schema.clone(),
1463            vec![
1464                Arc::new(message_arr),
1465                Arc::new(confidence_arr),
1466                Arc::new(error_code_arr),
1467                Arc::new(timestamp_arr),
1468            ],
1469        )
1470        .unwrap();
1471
1472        // Write parquet file
1473        let file = File::create(&file_path).unwrap();
1474        let mut writer = ArrowWriter::try_new(file, schema, None).unwrap();
1475        writer.write(&batch).unwrap();
1476        writer.close().unwrap();
1477
1478        // Load using CitlDataLoader (returns iterator)
1479        let loader = CitlDataLoader::new();
1480        let result = loader.load_parquet(&file_path);
1481        assert!(result.is_ok());
1482
1483        // Collect examples from iterator
1484        let iter = result.unwrap();
1485        let all_examples: Vec<_> = iter.flatten().collect();
1486        // Only 1 example should be valid (E0308 maps to TypeErrors, the other has no error_code mapping)
1487        assert_eq!(all_examples.len(), 1);
1488        assert_eq!(all_examples[0].label, DefectCategory::TypeErrors);
1489    }
1490
1491    // ==================== SchemaValidation tests ====================
1492
1493    #[test]
1494    fn test_schema_validation_valid() {
1495        let validation = SchemaValidation {
1496            is_valid: true,
1497            missing_fields: vec![],
1498            extra_fields: vec![],
1499            format: "parquet".to_string(),
1500        };
1501        assert!(validation.is_valid);
1502        assert!(validation.missing_fields.is_empty());
1503    }
1504
1505    #[test]
1506    fn test_schema_validation_invalid() {
1507        let validation = SchemaValidation {
1508            is_valid: false,
1509            missing_fields: vec!["message".to_string(), "confidence".to_string()],
1510            extra_fields: vec![],
1511            format: "parquet".to_string(),
1512        };
1513        assert!(!validation.is_valid);
1514        assert_eq!(validation.missing_fields.len(), 2);
1515    }
1516
1517    #[test]
1518    fn test_validate_citl_schema_unsupported_format() {
1519        let result = validate_citl_schema("test.csv");
1520        assert!(result.is_err());
1521    }
1522
1523    #[test]
1524    fn test_validate_citl_schema_jsonl_valid() {
1525        use std::io::Write;
1526        let temp_dir = tempfile::tempdir().unwrap();
1527        let file_path = temp_dir.path().join("test.jsonl");
1528        let mut file = std::fs::File::create(&file_path).unwrap();
1529        writeln!(file, r#"{{"source_file":"test.py","error_code":"E0308","clippy_lint":null,"level":"error","message":"test","oip_category":null,"confidence":0.9,"span":null,"suggestion":null,"timestamp":0,"depyler_version":"1.0"}}"#).unwrap();
1530
1531        let result = validate_citl_schema(&file_path).unwrap();
1532        assert!(result.is_valid);
1533        assert_eq!(result.format, "jsonl");
1534    }
1535
1536    #[test]
1537    fn test_validate_citl_schema_empty_file() {
1538        let temp_dir = tempfile::tempdir().unwrap();
1539        let file_path = temp_dir.path().join("empty.jsonl");
1540        let _file = std::fs::File::create(&file_path).unwrap();
1541
1542        let result = validate_citl_schema(&file_path);
1543        assert!(result.is_err());
1544    }
1545
1546    // ==================== convert_batch_to_examples tests ====================
1547
1548    #[test]
1549    fn test_convert_batch_empty() {
1550        use arrow::array::RecordBatch;
1551        use arrow::datatypes::{DataType, Field, Schema};
1552        use std::sync::Arc;
1553
1554        let schema = Arc::new(Schema::new(vec![
1555            Field::new("message", DataType::Utf8, false),
1556            Field::new("confidence", DataType::Float32, false),
1557        ]));
1558
1559        let batch = RecordBatch::new_empty(schema);
1560        let examples = convert_batch_to_examples(&batch, 0.0);
1561        assert!(examples.is_empty());
1562    }
1563
1564    #[test]
1565    fn test_convert_batch_with_data() {
1566        use arrow::array::{Float32Array, RecordBatch, StringArray};
1567        use arrow::datatypes::{DataType, Field, Schema};
1568        use std::sync::Arc;
1569
1570        let schema = Arc::new(Schema::new(vec![
1571            Field::new("message", DataType::Utf8, false),
1572            Field::new("confidence", DataType::Float32, false),
1573            Field::new("error_code", DataType::Utf8, true),
1574            Field::new("timestamp", DataType::Int64, false),
1575        ]));
1576
1577        let message_arr = StringArray::from(vec!["type mismatch"]);
1578        let confidence_arr = Float32Array::from(vec![0.95]);
1579        let error_code_arr = StringArray::from(vec![Some("E0308")]);
1580        let timestamp_arr = arrow::array::Int64Array::from(vec![1732752000]);
1581
1582        let batch = RecordBatch::try_new(
1583            schema,
1584            vec![
1585                Arc::new(message_arr),
1586                Arc::new(confidence_arr),
1587                Arc::new(error_code_arr),
1588                Arc::new(timestamp_arr),
1589            ],
1590        )
1591        .unwrap();
1592
1593        let examples = convert_batch_to_examples(&batch, 0.5);
1594        assert_eq!(examples.len(), 1);
1595        assert_eq!(examples[0].message, "type mismatch");
1596        assert_eq!(examples[0].label, DefectCategory::TypeErrors);
1597        assert!((examples[0].confidence - 0.95).abs() < 0.001);
1598    }
1599
1600    #[test]
1601    fn test_convert_batch_filters_low_confidence() {
1602        use arrow::array::{Float32Array, RecordBatch, StringArray};
1603        use arrow::datatypes::{DataType, Field, Schema};
1604        use std::sync::Arc;
1605
1606        let schema = Arc::new(Schema::new(vec![
1607            Field::new("message", DataType::Utf8, false),
1608            Field::new("confidence", DataType::Float32, false),
1609            Field::new("error_code", DataType::Utf8, true),
1610        ]));
1611
1612        let message_arr = StringArray::from(vec!["low conf", "high conf"]);
1613        let confidence_arr = Float32Array::from(vec![0.3, 0.9]);
1614        let error_code_arr = StringArray::from(vec![Some("E0308"), Some("E0308")]);
1615
1616        let batch = RecordBatch::try_new(
1617            schema,
1618            vec![
1619                Arc::new(message_arr),
1620                Arc::new(confidence_arr),
1621                Arc::new(error_code_arr),
1622            ],
1623        )
1624        .unwrap();
1625
1626        let examples = convert_batch_to_examples(&batch, 0.5);
1627        assert_eq!(examples.len(), 1);
1628        assert_eq!(examples[0].message, "high conf");
1629    }
1630}