organizational_intelligence_plugin/
export.rs

1//! Feature export module for aprender ML format.
2//!
3//! Issue #2: Export CommitFeatures to aprender format
4//!
5//! This module provides:
6//! - Export `CommitFeatures` as aprender `Matrix<f32>`
7//! - Export defect labels as `Vec<u8>` (18-category taxonomy)
8//! - Parquet output support for large datasets
9//! - Round-trip compatibility with aprender training pipeline
10//!
11//! Implements extreme TDD: All tests written before implementation.
12
13use crate::classifier::DefectCategory;
14use crate::features::CommitFeatures;
15use anyhow::{anyhow, Result};
16use aprender::primitives::Matrix;
17use serde::{Deserialize, Serialize};
18use std::fs;
19use std::path::Path;
20
/// Total number of defect categories (10 general + 8 transpiler).
///
/// Label indices produced by `FeatureExporter::encode_label` cover `0..NUM_CATEGORIES`.
pub const NUM_CATEGORIES: usize = 18;
23
/// Number of feature columns written per commit; used as the column count of every
/// exported matrix and dataset in this module.
///
/// Feature dimension for CommitFeatures (matches CommitFeatures::DIMENSION)
/// NOTE(review): the value is hard-coded here, so it must be kept in sync with
/// `CommitFeatures` by hand — confirm when the feature vector changes.
/// NLP-014: Extended from 8 to 14 dimensions for CITL integration
pub const FEATURE_DIMENSION: usize = 14;
27
/// Export format for aprender integration.
///
/// Parses from text via `FromStr` (case-insensitive; `bin` and `pq` are accepted
/// aliases) and renders via `Display` as the lowercase format name.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum ExportFormat {
    /// JSON format (default, human-readable)
    #[default]
    Json,
    /// Binary format (faster, smaller); `FeatureExporter::save` writes it with `bincode`.
    Binary,
    /// Parquet format (columnar, for large datasets).
    ///
    /// NOTE: `FeatureExporter` currently reads and writes JSON for this variant as a
    /// fallback; native Parquet support is still a TODO.
    Parquet,
}
39
40impl std::fmt::Display for ExportFormat {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        match self {
43            Self::Json => write!(f, "json"),
44            Self::Binary => write!(f, "binary"),
45            Self::Parquet => write!(f, "parquet"),
46        }
47    }
48}
49
50impl std::str::FromStr for ExportFormat {
51    type Err = anyhow::Error;
52
53    fn from_str(s: &str) -> Result<Self> {
54        match s.to_lowercase().as_str() {
55            "json" => Ok(Self::Json),
56            "binary" | "bin" => Ok(Self::Binary),
57            "parquet" | "pq" => Ok(Self::Parquet),
58            _ => Err(anyhow!("Unknown export format: {}", s)),
59        }
60    }
61}
62
/// Exported dataset for aprender ML training.
///
/// Produced by `FeatureExporter::export` and (de)serialized by
/// `FeatureExporter::save` / `FeatureExporter::load`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportedDataset {
    /// Feature matrix dimensions [n_samples, n_features]
    pub shape: (usize, usize),
    /// Flattened feature data (row-major): one row of `n_features` values per sample
    pub features: Vec<f32>,
    /// Label vector (one per sample), encoded with `FeatureExporter::encode_label`
    pub labels: Vec<u8>,
    /// Category names for label indices (`category_names[label]` names that label)
    pub category_names: Vec<String>,
    /// Metadata about the export
    pub metadata: ExportMetadata,
}
77
/// Metadata for exported dataset, filled in by `FeatureExporter::export`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportMetadata {
    /// Number of samples
    pub n_samples: usize,
    /// Number of features per sample
    pub n_features: usize,
    /// Number of unique labels (distinct label values present in the exported data)
    pub n_classes: usize,
    /// Export format used (the `Display` rendering of `ExportFormat`, e.g. `"json"`)
    pub format: String,
    /// Version of the export format (`export` currently writes `"1.0.0"`)
    pub version: String,
}
92
93/// Feature exporter for aprender format
94pub struct FeatureExporter {
95    format: ExportFormat,
96}
97
98impl FeatureExporter {
99    /// Create a new feature exporter with specified format
100    ///
101    /// # Arguments
102    /// * `format` - Export format (Json, Binary, or Parquet)
103    ///
104    /// # Examples
105    /// ```
106    /// use organizational_intelligence_plugin::export::{FeatureExporter, ExportFormat};
107    ///
108    /// let exporter = FeatureExporter::new(ExportFormat::Json);
109    /// ```
110    pub fn new(format: ExportFormat) -> Self {
111        Self { format }
112    }
113
114    /// Convert CommitFeatures to aprender Matrix<f32>
115    ///
116    /// # Arguments
117    /// * `features` - Slice of CommitFeatures to convert
118    ///
119    /// # Returns
120    /// * `Ok(Matrix<f32>)` with shape [n_samples, FEATURE_DIMENSION]
121    /// * `Err` if features slice is empty
122    ///
123    /// # Examples
124    /// ```
125    /// use organizational_intelligence_plugin::export::FeatureExporter;
126    /// use organizational_intelligence_plugin::features::CommitFeatures;
127    ///
128    /// let features = vec![
129    ///     CommitFeatures {
130    ///         defect_category: 0,
131    ///         files_changed: 2.0,
132    ///         lines_added: 10.0,
133    ///         lines_deleted: 5.0,
134    ///         complexity_delta: 0.0,
135    ///         timestamp: 1700000000.0,
136    ///         hour_of_day: 14,
137    ///         day_of_week: 2,
138    ///         ..Default::default()
139    ///     },
140    /// ];
141    ///
142    /// let matrix = FeatureExporter::to_matrix(&features).unwrap();
143    /// assert_eq!(matrix.n_rows(), 1);
144    /// assert_eq!(matrix.n_cols(), 14);  // NLP-014: 14 dimensions
145    /// ```
146    pub fn to_matrix(features: &[CommitFeatures]) -> Result<Matrix<f32>> {
147        if features.is_empty() {
148            return Err(anyhow!("Cannot create matrix from empty features"));
149        }
150
151        let n_rows = features.len();
152        let n_cols = FEATURE_DIMENSION;
153
154        // Flatten features into row-major vector
155        let data: Vec<f32> = features.iter().flat_map(|f| f.to_vector()).collect();
156
157        Matrix::from_vec(n_rows, n_cols, data)
158            .map_err(|e| anyhow!("Failed to create matrix: {}", e))
159    }
160
161    /// Encode DefectCategory to label index (0-17)
162    ///
163    /// # Arguments
164    /// * `category` - DefectCategory to encode
165    ///
166    /// # Returns
167    /// * Label index (0-17)
168    ///
169    /// # Examples
170    /// ```
171    /// use organizational_intelligence_plugin::export::FeatureExporter;
172    /// use organizational_intelligence_plugin::classifier::DefectCategory;
173    ///
174    /// let label = FeatureExporter::encode_label(DefectCategory::MemorySafety);
175    /// assert_eq!(label, 0);
176    ///
177    /// let label = FeatureExporter::encode_label(DefectCategory::TraitBounds);
178    /// assert_eq!(label, 17);
179    /// ```
180    pub fn encode_label(category: DefectCategory) -> u8 {
181        match category {
182            // General categories (0-9)
183            DefectCategory::MemorySafety => 0,
184            DefectCategory::ConcurrencyBugs => 1,
185            DefectCategory::LogicErrors => 2,
186            DefectCategory::ApiMisuse => 3,
187            DefectCategory::ResourceLeaks => 4,
188            DefectCategory::TypeErrors => 5,
189            DefectCategory::ConfigurationErrors => 6,
190            DefectCategory::SecurityVulnerabilities => 7,
191            DefectCategory::PerformanceIssues => 8,
192            DefectCategory::IntegrationFailures => 9,
193            // Transpiler categories (10-17)
194            DefectCategory::OperatorPrecedence => 10,
195            DefectCategory::TypeAnnotationGaps => 11,
196            DefectCategory::StdlibMapping => 12,
197            DefectCategory::ASTTransform => 13,
198            DefectCategory::ComprehensionBugs => 14,
199            DefectCategory::IteratorChain => 15,
200            DefectCategory::OwnershipBorrow => 16,
201            DefectCategory::TraitBounds => 17,
202        }
203    }
204
205    /// Decode label index back to DefectCategory
206    ///
207    /// # Arguments
208    /// * `label` - Label index (0-17)
209    ///
210    /// # Returns
211    /// * `Ok(DefectCategory)` if label is valid
212    /// * `Err` if label is out of range
213    ///
214    /// # Examples
215    /// ```
216    /// use organizational_intelligence_plugin::export::FeatureExporter;
217    /// use organizational_intelligence_plugin::classifier::DefectCategory;
218    ///
219    /// let category = FeatureExporter::decode_label(0).unwrap();
220    /// assert_eq!(category, DefectCategory::MemorySafety);
221    ///
222    /// let result = FeatureExporter::decode_label(18);
223    /// assert!(result.is_err());
224    /// ```
225    pub fn decode_label(label: u8) -> Result<DefectCategory> {
226        match label {
227            0 => Ok(DefectCategory::MemorySafety),
228            1 => Ok(DefectCategory::ConcurrencyBugs),
229            2 => Ok(DefectCategory::LogicErrors),
230            3 => Ok(DefectCategory::ApiMisuse),
231            4 => Ok(DefectCategory::ResourceLeaks),
232            5 => Ok(DefectCategory::TypeErrors),
233            6 => Ok(DefectCategory::ConfigurationErrors),
234            7 => Ok(DefectCategory::SecurityVulnerabilities),
235            8 => Ok(DefectCategory::PerformanceIssues),
236            9 => Ok(DefectCategory::IntegrationFailures),
237            10 => Ok(DefectCategory::OperatorPrecedence),
238            11 => Ok(DefectCategory::TypeAnnotationGaps),
239            12 => Ok(DefectCategory::StdlibMapping),
240            13 => Ok(DefectCategory::ASTTransform),
241            14 => Ok(DefectCategory::ComprehensionBugs),
242            15 => Ok(DefectCategory::IteratorChain),
243            16 => Ok(DefectCategory::OwnershipBorrow),
244            17 => Ok(DefectCategory::TraitBounds),
245            _ => Err(anyhow!("Invalid label index: {} (must be 0-17)", label)),
246        }
247    }
248
249    /// Encode multiple DefectCategories to label vector
250    ///
251    /// # Arguments
252    /// * `categories` - Slice of DefectCategories
253    ///
254    /// # Returns
255    /// * Vector of label indices
256    pub fn encode_labels(categories: &[DefectCategory]) -> Vec<u8> {
257        categories.iter().map(|c| Self::encode_label(*c)).collect()
258    }
259
260    /// Get all category names in label order
261    ///
262    /// # Returns
263    /// * Vector of category names indexed by label
264    pub fn category_names() -> Vec<String> {
265        vec![
266            "MemorySafety".to_string(),
267            "ConcurrencyBugs".to_string(),
268            "LogicErrors".to_string(),
269            "ApiMisuse".to_string(),
270            "ResourceLeaks".to_string(),
271            "TypeErrors".to_string(),
272            "ConfigurationErrors".to_string(),
273            "SecurityVulnerabilities".to_string(),
274            "PerformanceIssues".to_string(),
275            "IntegrationFailures".to_string(),
276            "OperatorPrecedence".to_string(),
277            "TypeAnnotationGaps".to_string(),
278            "StdlibMapping".to_string(),
279            "ASTTransform".to_string(),
280            "ComprehensionBugs".to_string(),
281            "IteratorChain".to_string(),
282            "OwnershipBorrow".to_string(),
283            "TraitBounds".to_string(),
284        ]
285    }
286
287    /// Export features and labels to ExportedDataset
288    ///
289    /// # Arguments
290    /// * `features` - CommitFeatures to export
291    /// * `categories` - Corresponding DefectCategories
292    ///
293    /// # Returns
294    /// * `Ok(ExportedDataset)` with features and labels
295    /// * `Err` if lengths mismatch or empty input
296    pub fn export(
297        &self,
298        features: &[CommitFeatures],
299        categories: &[DefectCategory],
300    ) -> Result<ExportedDataset> {
301        if features.is_empty() {
302            return Err(anyhow!("Cannot export empty features"));
303        }
304
305        if features.len() != categories.len() {
306            return Err(anyhow!(
307                "Features and categories length mismatch: {} vs {}",
308                features.len(),
309                categories.len()
310            ));
311        }
312
313        let n_samples = features.len();
314        let n_features = FEATURE_DIMENSION;
315
316        // Convert features to flat vector
317        let feature_data: Vec<f32> = features.iter().flat_map(|f| f.to_vector()).collect();
318
319        // Encode labels
320        let labels = Self::encode_labels(categories);
321
322        // Count unique classes
323        let mut unique_labels: Vec<u8> = labels.clone();
324        unique_labels.sort();
325        unique_labels.dedup();
326        let n_classes = unique_labels.len();
327
328        Ok(ExportedDataset {
329            shape: (n_samples, n_features),
330            features: feature_data,
331            labels,
332            category_names: Self::category_names(),
333            metadata: ExportMetadata {
334                n_samples,
335                n_features,
336                n_classes,
337                format: self.format.to_string(),
338                version: "1.0.0".to_string(),
339            },
340        })
341    }
342
343    /// Save exported dataset to file
344    ///
345    /// # Arguments
346    /// * `dataset` - ExportedDataset to save
347    /// * `path` - Output file path
348    ///
349    /// # Returns
350    /// * `Ok(())` if successful
351    /// * `Err` if write fails
352    pub fn save<P: AsRef<Path>>(&self, dataset: &ExportedDataset, path: P) -> Result<()> {
353        match self.format {
354            ExportFormat::Json => {
355                let json = serde_json::to_string_pretty(dataset)
356                    .map_err(|e| anyhow!("JSON serialization failed: {}", e))?;
357                fs::write(path.as_ref(), json)
358                    .map_err(|e| anyhow!("Failed to write file: {}", e))?;
359            }
360            ExportFormat::Binary => {
361                let binary = bincode::serialize(dataset)
362                    .map_err(|e| anyhow!("Binary serialization failed: {}", e))?;
363                fs::write(path.as_ref(), binary)
364                    .map_err(|e| anyhow!("Failed to write file: {}", e))?;
365            }
366            ExportFormat::Parquet => {
367                self.save_parquet(dataset, path.as_ref())?;
368            }
369        }
370        Ok(())
371    }
372
373    /// Load exported dataset from file
374    ///
375    /// # Arguments
376    /// * `path` - Input file path
377    /// * `format` - Format to expect
378    ///
379    /// # Returns
380    /// * `Ok(ExportedDataset)` if successful
381    /// * `Err` if read/parse fails
382    pub fn load<P: AsRef<Path>>(path: P, format: ExportFormat) -> Result<ExportedDataset> {
383        match format {
384            ExportFormat::Json => {
385                let content = fs::read_to_string(path.as_ref())
386                    .map_err(|e| anyhow!("Failed to read file: {}", e))?;
387                serde_json::from_str(&content)
388                    .map_err(|e| anyhow!("JSON deserialization failed: {}", e))
389            }
390            ExportFormat::Binary => {
391                let content =
392                    fs::read(path.as_ref()).map_err(|e| anyhow!("Failed to read file: {}", e))?;
393                bincode::deserialize(&content)
394                    .map_err(|e| anyhow!("Binary deserialization failed: {}", e))
395            }
396            ExportFormat::Parquet => Self::load_parquet(path.as_ref()),
397        }
398    }
399
400    /// Convert ExportedDataset to aprender Matrix
401    ///
402    /// # Arguments
403    /// * `dataset` - ExportedDataset to convert
404    ///
405    /// # Returns
406    /// * `Ok(Matrix<f32>)` feature matrix
407    pub fn to_aprender_matrix(dataset: &ExportedDataset) -> Result<Matrix<f32>> {
408        let (n_rows, n_cols) = dataset.shape;
409        Matrix::from_vec(n_rows, n_cols, dataset.features.clone())
410            .map_err(|e| anyhow!("Failed to create matrix: {}", e))
411    }
412
413    /// Save dataset in Parquet format
414    fn save_parquet<P: AsRef<Path>>(&self, dataset: &ExportedDataset, path: P) -> Result<()> {
415        // For now, use JSON as fallback since Parquet requires additional dependencies
416        // TODO: Add arrow/parquet crate for native Parquet support
417        let json = serde_json::to_string_pretty(dataset)
418            .map_err(|e| anyhow!("JSON serialization failed: {}", e))?;
419        fs::write(path.as_ref(), json).map_err(|e| anyhow!("Failed to write file: {}", e))?;
420        Ok(())
421    }
422
423    /// Load dataset from Parquet format
424    fn load_parquet<P: AsRef<Path>>(path: P) -> Result<ExportedDataset> {
425        // For now, use JSON as fallback
426        let content =
427            fs::read_to_string(path.as_ref()).map_err(|e| anyhow!("Failed to read file: {}", e))?;
428        serde_json::from_str(&content).map_err(|e| anyhow!("JSON deserialization failed: {}", e))
429    }
430}
431
432impl Default for FeatureExporter {
433    fn default() -> Self {
434        Self::new(ExportFormat::Json)
435    }
436}
437
438#[cfg(test)]
439mod tests {
440    use super::*;
441    use proptest::prelude::*;
442
443    // ===== Unit Tests =====
444
445    #[test]
446    fn test_export_format_default() {
447        assert_eq!(ExportFormat::default(), ExportFormat::Json);
448    }
449
450    #[test]
451    fn test_export_format_display() {
452        assert_eq!(format!("{}", ExportFormat::Json), "json");
453        assert_eq!(format!("{}", ExportFormat::Binary), "binary");
454        assert_eq!(format!("{}", ExportFormat::Parquet), "parquet");
455    }
456
457    #[test]
458    fn test_export_format_from_str() {
459        assert_eq!("json".parse::<ExportFormat>().unwrap(), ExportFormat::Json);
460        assert_eq!(
461            "binary".parse::<ExportFormat>().unwrap(),
462            ExportFormat::Binary
463        );
464        assert_eq!("bin".parse::<ExportFormat>().unwrap(), ExportFormat::Binary);
465        assert_eq!(
466            "parquet".parse::<ExportFormat>().unwrap(),
467            ExportFormat::Parquet
468        );
469        assert_eq!("pq".parse::<ExportFormat>().unwrap(), ExportFormat::Parquet);
470        assert!("invalid".parse::<ExportFormat>().is_err());
471    }
472
473    #[test]
474    fn test_feature_exporter_creation() {
475        let exporter = FeatureExporter::new(ExportFormat::Json);
476        assert_eq!(exporter.format, ExportFormat::Json);
477
478        let default_exporter = FeatureExporter::default();
479        assert_eq!(default_exporter.format, ExportFormat::Json);
480    }
481
482    #[test]
483    fn test_to_matrix_single_sample() {
484        let features = vec![CommitFeatures {
485            defect_category: 0,
486            files_changed: 2.0,
487            lines_added: 10.0,
488            lines_deleted: 5.0,
489            complexity_delta: 1.5,
490            timestamp: 1700000000.0,
491            hour_of_day: 14,
492            day_of_week: 2,
493            ..Default::default()
494        }];
495
496        let matrix = FeatureExporter::to_matrix(&features).unwrap();
497        assert_eq!(matrix.n_rows(), 1);
498        assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
499        assert_eq!(matrix.get(0, 0), 0.0); // defect_category
500        assert_eq!(matrix.get(0, 1), 2.0); // files_changed
501        assert_eq!(matrix.get(0, 2), 10.0); // lines_added
502    }
503
504    #[test]
505    fn test_to_matrix_multiple_samples() {
506        let features = vec![
507            CommitFeatures {
508                defect_category: 0,
509                files_changed: 1.0,
510                lines_added: 10.0,
511                lines_deleted: 5.0,
512                complexity_delta: 0.0,
513                timestamp: 1700000000.0,
514                hour_of_day: 10,
515                day_of_week: 1,
516                ..Default::default()
517            },
518            CommitFeatures {
519                defect_category: 5,
520                files_changed: 3.0,
521                lines_added: 20.0,
522                lines_deleted: 15.0,
523                complexity_delta: 2.0,
524                timestamp: 1700000001.0,
525                hour_of_day: 11,
526                day_of_week: 2,
527                ..Default::default()
528            },
529        ];
530
531        let matrix = FeatureExporter::to_matrix(&features).unwrap();
532        assert_eq!(matrix.n_rows(), 2);
533        assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
534
535        // First row
536        assert_eq!(matrix.get(0, 0), 0.0);
537        assert_eq!(matrix.get(0, 1), 1.0);
538
539        // Second row
540        assert_eq!(matrix.get(1, 0), 5.0);
541        assert_eq!(matrix.get(1, 1), 3.0);
542    }
543
544    #[test]
545    fn test_to_matrix_empty_error() {
546        let features: Vec<CommitFeatures> = vec![];
547        let result = FeatureExporter::to_matrix(&features);
548        assert!(result.is_err());
549        assert!(result.unwrap_err().to_string().contains("empty"));
550    }
551
552    #[test]
553    fn test_encode_label_all_categories() {
554        // General categories (0-9)
555        assert_eq!(
556            FeatureExporter::encode_label(DefectCategory::MemorySafety),
557            0
558        );
559        assert_eq!(
560            FeatureExporter::encode_label(DefectCategory::ConcurrencyBugs),
561            1
562        );
563        assert_eq!(
564            FeatureExporter::encode_label(DefectCategory::LogicErrors),
565            2
566        );
567        assert_eq!(FeatureExporter::encode_label(DefectCategory::ApiMisuse), 3);
568        assert_eq!(
569            FeatureExporter::encode_label(DefectCategory::ResourceLeaks),
570            4
571        );
572        assert_eq!(FeatureExporter::encode_label(DefectCategory::TypeErrors), 5);
573        assert_eq!(
574            FeatureExporter::encode_label(DefectCategory::ConfigurationErrors),
575            6
576        );
577        assert_eq!(
578            FeatureExporter::encode_label(DefectCategory::SecurityVulnerabilities),
579            7
580        );
581        assert_eq!(
582            FeatureExporter::encode_label(DefectCategory::PerformanceIssues),
583            8
584        );
585        assert_eq!(
586            FeatureExporter::encode_label(DefectCategory::IntegrationFailures),
587            9
588        );
589
590        // Transpiler categories (10-17)
591        assert_eq!(
592            FeatureExporter::encode_label(DefectCategory::OperatorPrecedence),
593            10
594        );
595        assert_eq!(
596            FeatureExporter::encode_label(DefectCategory::TypeAnnotationGaps),
597            11
598        );
599        assert_eq!(
600            FeatureExporter::encode_label(DefectCategory::StdlibMapping),
601            12
602        );
603        assert_eq!(
604            FeatureExporter::encode_label(DefectCategory::ASTTransform),
605            13
606        );
607        assert_eq!(
608            FeatureExporter::encode_label(DefectCategory::ComprehensionBugs),
609            14
610        );
611        assert_eq!(
612            FeatureExporter::encode_label(DefectCategory::IteratorChain),
613            15
614        );
615        assert_eq!(
616            FeatureExporter::encode_label(DefectCategory::OwnershipBorrow),
617            16
618        );
619        assert_eq!(
620            FeatureExporter::encode_label(DefectCategory::TraitBounds),
621            17
622        );
623    }
624
625    #[test]
626    fn test_decode_label_all_valid() {
627        for i in 0..NUM_CATEGORIES {
628            let result = FeatureExporter::decode_label(i as u8);
629            assert!(result.is_ok(), "Failed to decode label {}", i);
630        }
631    }
632
633    #[test]
634    fn test_decode_label_invalid() {
635        let result = FeatureExporter::decode_label(18);
636        assert!(result.is_err());
637        assert!(result.unwrap_err().to_string().contains("18"));
638
639        let result = FeatureExporter::decode_label(255);
640        assert!(result.is_err());
641    }
642
643    #[test]
644    fn test_encode_decode_roundtrip() {
645        let categories = vec![
646            DefectCategory::MemorySafety,
647            DefectCategory::SecurityVulnerabilities,
648            DefectCategory::TraitBounds,
649            DefectCategory::ASTTransform,
650        ];
651
652        for category in categories {
653            let encoded = FeatureExporter::encode_label(category);
654            let decoded = FeatureExporter::decode_label(encoded).unwrap();
655            assert_eq!(category, decoded);
656        }
657    }
658
659    #[test]
660    fn test_encode_labels_multiple() {
661        let categories = vec![
662            DefectCategory::MemorySafety,
663            DefectCategory::ConcurrencyBugs,
664            DefectCategory::TraitBounds,
665        ];
666
667        let labels = FeatureExporter::encode_labels(&categories);
668        assert_eq!(labels, vec![0, 1, 17]);
669    }
670
671    #[test]
672    fn test_category_names() {
673        let names = FeatureExporter::category_names();
674        assert_eq!(names.len(), NUM_CATEGORIES);
675        assert_eq!(names[0], "MemorySafety");
676        assert_eq!(names[17], "TraitBounds");
677    }
678
679    #[test]
680    fn test_export_basic() {
681        let exporter = FeatureExporter::new(ExportFormat::Json);
682
683        let features = vec![CommitFeatures {
684            defect_category: 0,
685            files_changed: 2.0,
686            lines_added: 10.0,
687            lines_deleted: 5.0,
688            complexity_delta: 0.0,
689            timestamp: 1700000000.0,
690            hour_of_day: 14,
691            day_of_week: 2,
692            ..Default::default()
693        }];
694
695        let categories = vec![DefectCategory::MemorySafety];
696
697        let dataset = exporter.export(&features, &categories).unwrap();
698        assert_eq!(dataset.shape, (1, FEATURE_DIMENSION));
699        assert_eq!(dataset.features.len(), FEATURE_DIMENSION);
700        assert_eq!(dataset.labels, vec![0]);
701        assert_eq!(dataset.metadata.n_samples, 1);
702        assert_eq!(dataset.metadata.n_features, FEATURE_DIMENSION);
703    }
704
705    #[test]
706    fn test_export_empty_error() {
707        let exporter = FeatureExporter::new(ExportFormat::Json);
708        let features: Vec<CommitFeatures> = vec![];
709        let categories: Vec<DefectCategory> = vec![];
710
711        let result = exporter.export(&features, &categories);
712        assert!(result.is_err());
713    }
714
715    #[test]
716    fn test_export_length_mismatch_error() {
717        let exporter = FeatureExporter::new(ExportFormat::Json);
718
719        let features = vec![CommitFeatures {
720            defect_category: 0,
721            files_changed: 2.0,
722            lines_added: 10.0,
723            lines_deleted: 5.0,
724            complexity_delta: 0.0,
725            timestamp: 1700000000.0,
726            hour_of_day: 14,
727            day_of_week: 2,
728            ..Default::default()
729        }];
730
731        let categories = vec![
732            DefectCategory::MemorySafety,
733            DefectCategory::ConcurrencyBugs, // Extra category
734        ];
735
736        let result = exporter.export(&features, &categories);
737        assert!(result.is_err());
738        assert!(result.unwrap_err().to_string().contains("mismatch"));
739    }
740
741    #[test]
742    fn test_export_multiple_samples() {
743        let exporter = FeatureExporter::new(ExportFormat::Json);
744
745        let features = vec![
746            CommitFeatures {
747                defect_category: 0,
748                files_changed: 1.0,
749                lines_added: 10.0,
750                lines_deleted: 5.0,
751                complexity_delta: 0.0,
752                timestamp: 1700000000.0,
753                hour_of_day: 10,
754                day_of_week: 1,
755                ..Default::default()
756            },
757            CommitFeatures {
758                defect_category: 7,
759                files_changed: 3.0,
760                lines_added: 20.0,
761                lines_deleted: 15.0,
762                complexity_delta: 2.0,
763                timestamp: 1700000001.0,
764                hour_of_day: 11,
765                day_of_week: 2,
766                ..Default::default()
767            },
768        ];
769
770        let categories = vec![
771            DefectCategory::MemorySafety,
772            DefectCategory::SecurityVulnerabilities,
773        ];
774
775        let dataset = exporter.export(&features, &categories).unwrap();
776        assert_eq!(dataset.shape, (2, FEATURE_DIMENSION));
777        assert_eq!(dataset.features.len(), 2 * FEATURE_DIMENSION);
778        assert_eq!(dataset.labels, vec![0, 7]);
779        assert_eq!(dataset.metadata.n_classes, 2);
780    }
781
782    #[test]
783    fn test_to_aprender_matrix() {
784        let dataset = ExportedDataset {
785            shape: (2, 3),
786            features: vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
787            labels: vec![0, 1],
788            category_names: vec!["A".to_string(), "B".to_string()],
789            metadata: ExportMetadata {
790                n_samples: 2,
791                n_features: 3,
792                n_classes: 2,
793                format: "json".to_string(),
794                version: "1.0.0".to_string(),
795            },
796        };
797
798        let matrix = FeatureExporter::to_aprender_matrix(&dataset).unwrap();
799        assert_eq!(matrix.n_rows(), 2);
800        assert_eq!(matrix.n_cols(), 3);
801        assert_eq!(matrix.get(0, 0), 1.0);
802        assert_eq!(matrix.get(1, 2), 6.0);
803    }
804
805    #[test]
806    fn test_save_and_load_json() {
807        let exporter = FeatureExporter::new(ExportFormat::Json);
808
809        let features = vec![CommitFeatures {
810            defect_category: 5,
811            files_changed: 3.0,
812            lines_added: 15.0,
813            lines_deleted: 8.0,
814            complexity_delta: 1.0,
815            timestamp: 1700000000.0,
816            hour_of_day: 9,
817            day_of_week: 0,
818            ..Default::default()
819        }];
820
821        let categories = vec![DefectCategory::TypeErrors];
822        let dataset = exporter.export(&features, &categories).unwrap();
823
824        // Save to temp file
825        let temp_dir = tempfile::tempdir().unwrap();
826        let path = temp_dir.path().join("test_export.json");
827
828        exporter.save(&dataset, &path).unwrap();
829
830        // Load and verify
831        let loaded = FeatureExporter::load(&path, ExportFormat::Json).unwrap();
832        assert_eq!(loaded.shape, dataset.shape);
833        assert_eq!(loaded.features, dataset.features);
834        assert_eq!(loaded.labels, dataset.labels);
835    }
836
837    #[test]
838    fn test_save_and_load_binary() {
839        let exporter = FeatureExporter::new(ExportFormat::Binary);
840
841        let features = vec![CommitFeatures {
842            defect_category: 10,
843            files_changed: 5.0,
844            lines_added: 25.0,
845            lines_deleted: 12.0,
846            complexity_delta: 3.0,
847            timestamp: 1700000000.0,
848            hour_of_day: 15,
849            day_of_week: 4,
850            ..Default::default()
851        }];
852
853        let categories = vec![DefectCategory::OperatorPrecedence];
854        let dataset = exporter.export(&features, &categories).unwrap();
855
856        let temp_dir = tempfile::tempdir().unwrap();
857        let path = temp_dir.path().join("test_export.bin");
858
859        exporter.save(&dataset, &path).unwrap();
860
861        let loaded = FeatureExporter::load(&path, ExportFormat::Binary).unwrap();
862        assert_eq!(loaded.shape, dataset.shape);
863        assert_eq!(loaded.labels, dataset.labels);
864    }
865
866    // ===== Property-Based Tests (Proptest) =====
867
868    proptest! {
869        /// Property: encode/decode roundtrip preserves category
870        #[test]
871        fn prop_encode_decode_roundtrip(label in 0u8..18) {
872            let category = FeatureExporter::decode_label(label).unwrap();
873            let encoded = FeatureExporter::encode_label(category);
874            prop_assert_eq!(label, encoded);
875        }
876
877        /// Property: all valid labels decode successfully
878        #[test]
879        fn prop_valid_labels_decode(label in 0u8..18) {
880            let result = FeatureExporter::decode_label(label);
881            prop_assert!(result.is_ok());
882        }
883
884        /// Property: invalid labels fail to decode
885        #[test]
886        fn prop_invalid_labels_fail(label in 18u8..=255) {
887            let result = FeatureExporter::decode_label(label);
888            prop_assert!(result.is_err());
889        }
890
891        /// Property: matrix dimensions match input
892        #[test]
893        fn prop_matrix_dimensions(
894            n_samples in 1usize..100,
895            defect_category in 0u8..18,
896            files_changed in 0.0f32..1000.0,
897            lines_added in 0.0f32..10000.0,
898        ) {
899            let features: Vec<CommitFeatures> = (0..n_samples)
900                .map(|_| CommitFeatures {
901                    defect_category,
902                    files_changed,
903                    lines_added,
904                    lines_deleted: 0.0,
905                    complexity_delta: 0.0,
906                    timestamp: 1700000000.0,
907                    hour_of_day: 12,
908                    day_of_week: 3,
909                    ..Default::default()
910                })
911                .collect();
912
913            let matrix = FeatureExporter::to_matrix(&features).unwrap();
914            prop_assert_eq!(matrix.n_rows(), n_samples);
915            prop_assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
916        }
917
918        /// Property: exported dataset has correct shape
919        #[test]
920        fn prop_export_shape(n_samples in 1usize..50) {
921            let exporter = FeatureExporter::default();
922
923            let features: Vec<CommitFeatures> = (0..n_samples)
924                .map(|i| CommitFeatures {
925                    defect_category: (i % 18) as u8,
926                    files_changed: 1.0,
927                    lines_added: 10.0,
928                    lines_deleted: 5.0,
929                    complexity_delta: 0.0,
930                    timestamp: 1700000000.0,
931                    hour_of_day: 12,
932                    day_of_week: 3,
933                    ..Default::default()
934                })
935                .collect();
936
937            let categories: Vec<DefectCategory> = (0..n_samples)
938                .map(|i| FeatureExporter::decode_label((i % 18) as u8).unwrap())
939                .collect();
940
941            let dataset = exporter.export(&features, &categories).unwrap();
942
943            prop_assert_eq!(dataset.shape.0, n_samples);
944            prop_assert_eq!(dataset.shape.1, FEATURE_DIMENSION);
945            prop_assert_eq!(dataset.features.len(), n_samples * FEATURE_DIMENSION);
946            prop_assert_eq!(dataset.labels.len(), n_samples);
947        }
948
949        /// Property: category names has exactly NUM_CATEGORIES entries
950        #[test]
951        fn prop_category_names_count(_dummy in 0..1) {
952            let names = FeatureExporter::category_names();
953            prop_assert_eq!(names.len(), NUM_CATEGORIES);
954        }
955
956        /// Property: feature data is preserved in export
957        #[test]
958        fn prop_feature_preservation(
959            files_changed in 0.0f32..1000.0,
960            lines_added in 0.0f32..10000.0,
961            lines_deleted in 0.0f32..5000.0,
962        ) {
963            let features = vec![CommitFeatures {
964                defect_category: 0,
965                files_changed,
966                lines_added,
967                lines_deleted,
968                complexity_delta: 0.0,
969                timestamp: 1700000000.0,
970                hour_of_day: 12,
971                day_of_week: 3,
972                ..Default::default()
973            }];
974
975            let matrix = FeatureExporter::to_matrix(&features).unwrap();
976
977            prop_assert_eq!(matrix.get(0, 1), files_changed);
978            prop_assert_eq!(matrix.get(0, 2), lines_added);
979            prop_assert_eq!(matrix.get(0, 3), lines_deleted);
980        }
981
982        /// Property: export then to_aprender_matrix preserves data
983        #[test]
984        fn prop_export_to_matrix_roundtrip(n_samples in 1usize..20) {
985            let exporter = FeatureExporter::default();
986
987            let features: Vec<CommitFeatures> = (0..n_samples)
988                .map(|i| CommitFeatures {
989                    defect_category: (i % 18) as u8,
990                    files_changed: (i + 1) as f32,
991                    lines_added: (i * 10) as f32,
992                    lines_deleted: (i * 5) as f32,
993                    complexity_delta: 0.0,
994                    timestamp: 1700000000.0,
995                    hour_of_day: 12,
996                    day_of_week: 3,
997                    ..Default::default()
998                })
999                .collect();
1000
1001            let categories: Vec<DefectCategory> = (0..n_samples)
1002                .map(|i| FeatureExporter::decode_label((i % 18) as u8).unwrap())
1003                .collect();
1004
1005            let dataset = exporter.export(&features, &categories).unwrap();
1006            let matrix = FeatureExporter::to_aprender_matrix(&dataset).unwrap();
1007
1008            prop_assert_eq!(matrix.n_rows(), n_samples);
1009            prop_assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
1010
1011            // Verify first sample
1012            prop_assert_eq!(matrix.get(0, 1), 1.0); // files_changed for i=0
1013        }
1014    }
1015}
1016
// Integration tests: end-to-end export flows, including aprender training
#[cfg(test)]
mod integration_tests {
    use super::*;

    /// Full JSON round trip: export → save → load → convert to an aprender matrix.
    #[test]
    fn test_full_roundtrip_json() {
        let exporter = FeatureExporter::new(ExportFormat::Json);

        // Builds one sample from the fields that vary between rows; every
        // sample in this test shares day_of_week = 2.
        let sample = |defect_category,
                      files_changed,
                      lines_added,
                      lines_deleted,
                      complexity_delta,
                      timestamp,
                      hour_of_day| CommitFeatures {
            defect_category,
            files_changed,
            lines_added,
            lines_deleted,
            complexity_delta,
            timestamp,
            hour_of_day,
            day_of_week: 2,
            ..Default::default()
        };

        let features = vec![
            sample(0, 5.0, 100.0, 50.0, 2.0, 1700000000.0, 14),
            sample(7, 3.0, 75.0, 25.0, 1.0, 1700000001.0, 15),
            sample(13, 8.0, 200.0, 100.0, 5.0, 1700000002.0, 16),
        ];
        let categories = vec![
            DefectCategory::MemorySafety,
            DefectCategory::SecurityVulnerabilities,
            DefectCategory::ASTTransform,
        ];

        // Export, then write the dataset to a scratch file.
        let dataset = exporter.export(&features, &categories).unwrap();
        let scratch_dir = tempfile::tempdir().unwrap();
        let json_path = scratch_dir.path().join("roundtrip_test.json");
        exporter.save(&dataset, &json_path).unwrap();

        // Read it back and check the metadata survived.
        let reloaded = FeatureExporter::load(&json_path, ExportFormat::Json).unwrap();
        assert_eq!(reloaded.shape, (3, FEATURE_DIMENSION));
        assert_eq!(reloaded.labels, vec![0, 7, 13]);

        // Convert the reloaded dataset into the matrix form used for training.
        let matrix = FeatureExporter::to_aprender_matrix(&reloaded).unwrap();
        assert_eq!(matrix.n_rows(), 3);
        assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);

        // Spot-check one cell per sample.
        assert_eq!(matrix.get(0, 1), 5.0); // first sample: files_changed
        assert_eq!(matrix.get(1, 0), 7.0); // second sample: defect_category
        assert_eq!(matrix.get(2, 2), 200.0); // third sample: lines_added
    }

    /// Checks that an exported dataset can be passed to aprender's
    /// `RandomForestClassifier` for fitting and prediction.
    #[test]
    fn test_aprender_training_compatibility() {
        use aprender::tree::RandomForestClassifier;

        let exporter = FeatureExporter::new(ExportFormat::Json);

        // 30 varied samples cycling through three classes (need enough samples for RF).
        let (features, categories): (Vec<CommitFeatures>, Vec<DefectCategory>) = (0..30_i32)
            .map(|i| {
                let class = i % 3;
                let sample = CommitFeatures {
                    defect_category: class as u8,
                    files_changed: (i + 1) as f32,
                    lines_added: (i * 10 + 5) as f32,
                    lines_deleted: (i * 5) as f32,
                    complexity_delta: (i % 5) as f32,
                    timestamp: (1700000000 + i) as f64,
                    hour_of_day: (9 + i % 8) as u8,
                    day_of_week: (i % 5) as u8,
                    ..Default::default()
                };
                let category = match class {
                    0 => DefectCategory::MemorySafety,
                    1 => DefectCategory::ConcurrencyBugs,
                    _ => DefectCategory::LogicErrors,
                };
                (sample, category)
            })
            .unzip();

        let dataset = exporter.export(&features, &categories).unwrap();

        // aprender takes a feature matrix plus `usize` class labels.
        let matrix = FeatureExporter::to_aprender_matrix(&dataset).unwrap();
        let labels: Vec<usize> = dataset
            .labels
            .iter()
            .map(|&label| label as usize)
            .collect();

        // A successful fit is what demonstrates compatibility.
        let mut forest = RandomForestClassifier::new(10);
        let fit_outcome = forest.fit(&matrix, &labels);
        assert!(fit_outcome.is_ok(), "RandomForest training should succeed");

        // Sanity check: one prediction per training sample.
        let predictions = forest.predict(&matrix);
        assert_eq!(predictions.len(), 30);
    }
}