1use crate::classifier::DefectCategory;
14use crate::features::CommitFeatures;
15use anyhow::{anyhow, Result};
16use aprender::primitives::Matrix;
17use serde::{Deserialize, Serialize};
18use std::fs;
19use std::path::Path;
20
/// Number of defect categories covered by the label encoding (labels `0..=17`).
pub const NUM_CATEGORIES: usize = 18;

/// Number of columns used for every sample when building matrices and exported datasets.
// NOTE(review): presumably equals the length of `CommitFeatures::to_vector()` — confirm.
pub const FEATURE_DIMENSION: usize = 14;
27
/// Serialization format used when saving or loading an exported dataset.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum ExportFormat {
    /// Pretty-printed JSON text; this is the default format.
    #[default]
    Json,
    /// `bincode`-encoded bytes.
    Binary,
    /// Parquet target.
    // NOTE(review): the parquet save/load helpers in this file currently read and write JSON.
    Parquet,
}
39
40impl std::fmt::Display for ExportFormat {
41 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42 match self {
43 Self::Json => write!(f, "json"),
44 Self::Binary => write!(f, "binary"),
45 Self::Parquet => write!(f, "parquet"),
46 }
47 }
48}
49
50impl std::str::FromStr for ExportFormat {
51 type Err = anyhow::Error;
52
53 fn from_str(s: &str) -> Result<Self> {
54 match s.to_lowercase().as_str() {
55 "json" => Ok(Self::Json),
56 "binary" | "bin" => Ok(Self::Binary),
57 "parquet" | "pq" => Ok(Self::Parquet),
58 _ => Err(anyhow!("Unknown export format: {}", s)),
59 }
60 }
61}
62
/// A flattened feature matrix with its labels, ready to be serialized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportedDataset {
    /// Matrix dimensions as `(rows, columns)`; `export` sets this to `(n_samples, n_features)`.
    pub shape: (usize, usize),
    /// Feature values of all samples, concatenated sample by sample.
    pub features: Vec<f32>,
    /// Encoded defect category of each sample, in sample order.
    pub labels: Vec<u8>,
    /// Category names; `export` fills this with the full list from `category_names()`.
    pub category_names: Vec<String>,
    /// Summary information describing this dataset.
    pub metadata: ExportMetadata,
}
77
/// Summary information stored alongside an [`ExportedDataset`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExportMetadata {
    /// Number of samples (rows).
    pub n_samples: usize,
    /// Number of features per sample (columns).
    pub n_features: usize,
    /// Number of distinct labels present in the dataset, as computed by `export`.
    pub n_classes: usize,
    /// Display name of the exporter's format, e.g. `json`.
    pub format: String,
    /// Version string of the export layout; `export` writes `1.0.0`.
    pub version: String,
}
92
93pub struct FeatureExporter {
95 format: ExportFormat,
96}
97
98impl FeatureExporter {
99 pub fn new(format: ExportFormat) -> Self {
111 Self { format }
112 }
113
114 pub fn to_matrix(features: &[CommitFeatures]) -> Result<Matrix<f32>> {
147 if features.is_empty() {
148 return Err(anyhow!("Cannot create matrix from empty features"));
149 }
150
151 let n_rows = features.len();
152 let n_cols = FEATURE_DIMENSION;
153
154 let data: Vec<f32> = features.iter().flat_map(|f| f.to_vector()).collect();
156
157 Matrix::from_vec(n_rows, n_cols, data)
158 .map_err(|e| anyhow!("Failed to create matrix: {}", e))
159 }
160
161 pub fn encode_label(category: DefectCategory) -> u8 {
181 match category {
182 DefectCategory::MemorySafety => 0,
184 DefectCategory::ConcurrencyBugs => 1,
185 DefectCategory::LogicErrors => 2,
186 DefectCategory::ApiMisuse => 3,
187 DefectCategory::ResourceLeaks => 4,
188 DefectCategory::TypeErrors => 5,
189 DefectCategory::ConfigurationErrors => 6,
190 DefectCategory::SecurityVulnerabilities => 7,
191 DefectCategory::PerformanceIssues => 8,
192 DefectCategory::IntegrationFailures => 9,
193 DefectCategory::OperatorPrecedence => 10,
195 DefectCategory::TypeAnnotationGaps => 11,
196 DefectCategory::StdlibMapping => 12,
197 DefectCategory::ASTTransform => 13,
198 DefectCategory::ComprehensionBugs => 14,
199 DefectCategory::IteratorChain => 15,
200 DefectCategory::OwnershipBorrow => 16,
201 DefectCategory::TraitBounds => 17,
202 }
203 }
204
205 pub fn decode_label(label: u8) -> Result<DefectCategory> {
226 match label {
227 0 => Ok(DefectCategory::MemorySafety),
228 1 => Ok(DefectCategory::ConcurrencyBugs),
229 2 => Ok(DefectCategory::LogicErrors),
230 3 => Ok(DefectCategory::ApiMisuse),
231 4 => Ok(DefectCategory::ResourceLeaks),
232 5 => Ok(DefectCategory::TypeErrors),
233 6 => Ok(DefectCategory::ConfigurationErrors),
234 7 => Ok(DefectCategory::SecurityVulnerabilities),
235 8 => Ok(DefectCategory::PerformanceIssues),
236 9 => Ok(DefectCategory::IntegrationFailures),
237 10 => Ok(DefectCategory::OperatorPrecedence),
238 11 => Ok(DefectCategory::TypeAnnotationGaps),
239 12 => Ok(DefectCategory::StdlibMapping),
240 13 => Ok(DefectCategory::ASTTransform),
241 14 => Ok(DefectCategory::ComprehensionBugs),
242 15 => Ok(DefectCategory::IteratorChain),
243 16 => Ok(DefectCategory::OwnershipBorrow),
244 17 => Ok(DefectCategory::TraitBounds),
245 _ => Err(anyhow!("Invalid label index: {} (must be 0-17)", label)),
246 }
247 }
248
249 pub fn encode_labels(categories: &[DefectCategory]) -> Vec<u8> {
257 categories.iter().map(|c| Self::encode_label(*c)).collect()
258 }
259
260 pub fn category_names() -> Vec<String> {
265 vec![
266 "MemorySafety".to_string(),
267 "ConcurrencyBugs".to_string(),
268 "LogicErrors".to_string(),
269 "ApiMisuse".to_string(),
270 "ResourceLeaks".to_string(),
271 "TypeErrors".to_string(),
272 "ConfigurationErrors".to_string(),
273 "SecurityVulnerabilities".to_string(),
274 "PerformanceIssues".to_string(),
275 "IntegrationFailures".to_string(),
276 "OperatorPrecedence".to_string(),
277 "TypeAnnotationGaps".to_string(),
278 "StdlibMapping".to_string(),
279 "ASTTransform".to_string(),
280 "ComprehensionBugs".to_string(),
281 "IteratorChain".to_string(),
282 "OwnershipBorrow".to_string(),
283 "TraitBounds".to_string(),
284 ]
285 }
286
287 pub fn export(
297 &self,
298 features: &[CommitFeatures],
299 categories: &[DefectCategory],
300 ) -> Result<ExportedDataset> {
301 if features.is_empty() {
302 return Err(anyhow!("Cannot export empty features"));
303 }
304
305 if features.len() != categories.len() {
306 return Err(anyhow!(
307 "Features and categories length mismatch: {} vs {}",
308 features.len(),
309 categories.len()
310 ));
311 }
312
313 let n_samples = features.len();
314 let n_features = FEATURE_DIMENSION;
315
316 let feature_data: Vec<f32> = features.iter().flat_map(|f| f.to_vector()).collect();
318
319 let labels = Self::encode_labels(categories);
321
322 let mut unique_labels: Vec<u8> = labels.clone();
324 unique_labels.sort();
325 unique_labels.dedup();
326 let n_classes = unique_labels.len();
327
328 Ok(ExportedDataset {
329 shape: (n_samples, n_features),
330 features: feature_data,
331 labels,
332 category_names: Self::category_names(),
333 metadata: ExportMetadata {
334 n_samples,
335 n_features,
336 n_classes,
337 format: self.format.to_string(),
338 version: "1.0.0".to_string(),
339 },
340 })
341 }
342
343 pub fn save<P: AsRef<Path>>(&self, dataset: &ExportedDataset, path: P) -> Result<()> {
353 match self.format {
354 ExportFormat::Json => {
355 let json = serde_json::to_string_pretty(dataset)
356 .map_err(|e| anyhow!("JSON serialization failed: {}", e))?;
357 fs::write(path.as_ref(), json)
358 .map_err(|e| anyhow!("Failed to write file: {}", e))?;
359 }
360 ExportFormat::Binary => {
361 let binary = bincode::serialize(dataset)
362 .map_err(|e| anyhow!("Binary serialization failed: {}", e))?;
363 fs::write(path.as_ref(), binary)
364 .map_err(|e| anyhow!("Failed to write file: {}", e))?;
365 }
366 ExportFormat::Parquet => {
367 self.save_parquet(dataset, path.as_ref())?;
368 }
369 }
370 Ok(())
371 }
372
373 pub fn load<P: AsRef<Path>>(path: P, format: ExportFormat) -> Result<ExportedDataset> {
383 match format {
384 ExportFormat::Json => {
385 let content = fs::read_to_string(path.as_ref())
386 .map_err(|e| anyhow!("Failed to read file: {}", e))?;
387 serde_json::from_str(&content)
388 .map_err(|e| anyhow!("JSON deserialization failed: {}", e))
389 }
390 ExportFormat::Binary => {
391 let content =
392 fs::read(path.as_ref()).map_err(|e| anyhow!("Failed to read file: {}", e))?;
393 bincode::deserialize(&content)
394 .map_err(|e| anyhow!("Binary deserialization failed: {}", e))
395 }
396 ExportFormat::Parquet => Self::load_parquet(path.as_ref()),
397 }
398 }
399
400 pub fn to_aprender_matrix(dataset: &ExportedDataset) -> Result<Matrix<f32>> {
408 let (n_rows, n_cols) = dataset.shape;
409 Matrix::from_vec(n_rows, n_cols, dataset.features.clone())
410 .map_err(|e| anyhow!("Failed to create matrix: {}", e))
411 }
412
413 fn save_parquet<P: AsRef<Path>>(&self, dataset: &ExportedDataset, path: P) -> Result<()> {
415 let json = serde_json::to_string_pretty(dataset)
418 .map_err(|e| anyhow!("JSON serialization failed: {}", e))?;
419 fs::write(path.as_ref(), json).map_err(|e| anyhow!("Failed to write file: {}", e))?;
420 Ok(())
421 }
422
423 fn load_parquet<P: AsRef<Path>>(path: P) -> Result<ExportedDataset> {
425 let content =
427 fs::read_to_string(path.as_ref()).map_err(|e| anyhow!("Failed to read file: {}", e))?;
428 serde_json::from_str(&content).map_err(|e| anyhow!("JSON deserialization failed: {}", e))
429 }
430}
431
432impl Default for FeatureExporter {
433 fn default() -> Self {
434 Self::new(ExportFormat::Json)
435 }
436}
437
438#[cfg(test)]
439mod tests {
440 use super::*;
441 use proptest::prelude::*;
442
443 #[test]
446 fn test_export_format_default() {
447 assert_eq!(ExportFormat::default(), ExportFormat::Json);
448 }
449
450 #[test]
451 fn test_export_format_display() {
452 assert_eq!(format!("{}", ExportFormat::Json), "json");
453 assert_eq!(format!("{}", ExportFormat::Binary), "binary");
454 assert_eq!(format!("{}", ExportFormat::Parquet), "parquet");
455 }
456
457 #[test]
458 fn test_export_format_from_str() {
459 assert_eq!("json".parse::<ExportFormat>().unwrap(), ExportFormat::Json);
460 assert_eq!(
461 "binary".parse::<ExportFormat>().unwrap(),
462 ExportFormat::Binary
463 );
464 assert_eq!("bin".parse::<ExportFormat>().unwrap(), ExportFormat::Binary);
465 assert_eq!(
466 "parquet".parse::<ExportFormat>().unwrap(),
467 ExportFormat::Parquet
468 );
469 assert_eq!("pq".parse::<ExportFormat>().unwrap(), ExportFormat::Parquet);
470 assert!("invalid".parse::<ExportFormat>().is_err());
471 }
472
473 #[test]
474 fn test_feature_exporter_creation() {
475 let exporter = FeatureExporter::new(ExportFormat::Json);
476 assert_eq!(exporter.format, ExportFormat::Json);
477
478 let default_exporter = FeatureExporter::default();
479 assert_eq!(default_exporter.format, ExportFormat::Json);
480 }
481
482 #[test]
483 fn test_to_matrix_single_sample() {
484 let features = vec![CommitFeatures {
485 defect_category: 0,
486 files_changed: 2.0,
487 lines_added: 10.0,
488 lines_deleted: 5.0,
489 complexity_delta: 1.5,
490 timestamp: 1700000000.0,
491 hour_of_day: 14,
492 day_of_week: 2,
493 ..Default::default()
494 }];
495
496 let matrix = FeatureExporter::to_matrix(&features).unwrap();
497 assert_eq!(matrix.n_rows(), 1);
498 assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
499 assert_eq!(matrix.get(0, 0), 0.0); assert_eq!(matrix.get(0, 1), 2.0); assert_eq!(matrix.get(0, 2), 10.0); }
503
504 #[test]
505 fn test_to_matrix_multiple_samples() {
506 let features = vec![
507 CommitFeatures {
508 defect_category: 0,
509 files_changed: 1.0,
510 lines_added: 10.0,
511 lines_deleted: 5.0,
512 complexity_delta: 0.0,
513 timestamp: 1700000000.0,
514 hour_of_day: 10,
515 day_of_week: 1,
516 ..Default::default()
517 },
518 CommitFeatures {
519 defect_category: 5,
520 files_changed: 3.0,
521 lines_added: 20.0,
522 lines_deleted: 15.0,
523 complexity_delta: 2.0,
524 timestamp: 1700000001.0,
525 hour_of_day: 11,
526 day_of_week: 2,
527 ..Default::default()
528 },
529 ];
530
531 let matrix = FeatureExporter::to_matrix(&features).unwrap();
532 assert_eq!(matrix.n_rows(), 2);
533 assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
534
535 assert_eq!(matrix.get(0, 0), 0.0);
537 assert_eq!(matrix.get(0, 1), 1.0);
538
539 assert_eq!(matrix.get(1, 0), 5.0);
541 assert_eq!(matrix.get(1, 1), 3.0);
542 }
543
544 #[test]
545 fn test_to_matrix_empty_error() {
546 let features: Vec<CommitFeatures> = vec![];
547 let result = FeatureExporter::to_matrix(&features);
548 assert!(result.is_err());
549 assert!(result.unwrap_err().to_string().contains("empty"));
550 }
551
552 #[test]
553 fn test_encode_label_all_categories() {
554 assert_eq!(
556 FeatureExporter::encode_label(DefectCategory::MemorySafety),
557 0
558 );
559 assert_eq!(
560 FeatureExporter::encode_label(DefectCategory::ConcurrencyBugs),
561 1
562 );
563 assert_eq!(
564 FeatureExporter::encode_label(DefectCategory::LogicErrors),
565 2
566 );
567 assert_eq!(FeatureExporter::encode_label(DefectCategory::ApiMisuse), 3);
568 assert_eq!(
569 FeatureExporter::encode_label(DefectCategory::ResourceLeaks),
570 4
571 );
572 assert_eq!(FeatureExporter::encode_label(DefectCategory::TypeErrors), 5);
573 assert_eq!(
574 FeatureExporter::encode_label(DefectCategory::ConfigurationErrors),
575 6
576 );
577 assert_eq!(
578 FeatureExporter::encode_label(DefectCategory::SecurityVulnerabilities),
579 7
580 );
581 assert_eq!(
582 FeatureExporter::encode_label(DefectCategory::PerformanceIssues),
583 8
584 );
585 assert_eq!(
586 FeatureExporter::encode_label(DefectCategory::IntegrationFailures),
587 9
588 );
589
590 assert_eq!(
592 FeatureExporter::encode_label(DefectCategory::OperatorPrecedence),
593 10
594 );
595 assert_eq!(
596 FeatureExporter::encode_label(DefectCategory::TypeAnnotationGaps),
597 11
598 );
599 assert_eq!(
600 FeatureExporter::encode_label(DefectCategory::StdlibMapping),
601 12
602 );
603 assert_eq!(
604 FeatureExporter::encode_label(DefectCategory::ASTTransform),
605 13
606 );
607 assert_eq!(
608 FeatureExporter::encode_label(DefectCategory::ComprehensionBugs),
609 14
610 );
611 assert_eq!(
612 FeatureExporter::encode_label(DefectCategory::IteratorChain),
613 15
614 );
615 assert_eq!(
616 FeatureExporter::encode_label(DefectCategory::OwnershipBorrow),
617 16
618 );
619 assert_eq!(
620 FeatureExporter::encode_label(DefectCategory::TraitBounds),
621 17
622 );
623 }
624
625 #[test]
626 fn test_decode_label_all_valid() {
627 for i in 0..NUM_CATEGORIES {
628 let result = FeatureExporter::decode_label(i as u8);
629 assert!(result.is_ok(), "Failed to decode label {}", i);
630 }
631 }
632
633 #[test]
634 fn test_decode_label_invalid() {
635 let result = FeatureExporter::decode_label(18);
636 assert!(result.is_err());
637 assert!(result.unwrap_err().to_string().contains("18"));
638
639 let result = FeatureExporter::decode_label(255);
640 assert!(result.is_err());
641 }
642
643 #[test]
644 fn test_encode_decode_roundtrip() {
645 let categories = vec![
646 DefectCategory::MemorySafety,
647 DefectCategory::SecurityVulnerabilities,
648 DefectCategory::TraitBounds,
649 DefectCategory::ASTTransform,
650 ];
651
652 for category in categories {
653 let encoded = FeatureExporter::encode_label(category);
654 let decoded = FeatureExporter::decode_label(encoded).unwrap();
655 assert_eq!(category, decoded);
656 }
657 }
658
659 #[test]
660 fn test_encode_labels_multiple() {
661 let categories = vec![
662 DefectCategory::MemorySafety,
663 DefectCategory::ConcurrencyBugs,
664 DefectCategory::TraitBounds,
665 ];
666
667 let labels = FeatureExporter::encode_labels(&categories);
668 assert_eq!(labels, vec![0, 1, 17]);
669 }
670
671 #[test]
672 fn test_category_names() {
673 let names = FeatureExporter::category_names();
674 assert_eq!(names.len(), NUM_CATEGORIES);
675 assert_eq!(names[0], "MemorySafety");
676 assert_eq!(names[17], "TraitBounds");
677 }
678
679 #[test]
680 fn test_export_basic() {
681 let exporter = FeatureExporter::new(ExportFormat::Json);
682
683 let features = vec![CommitFeatures {
684 defect_category: 0,
685 files_changed: 2.0,
686 lines_added: 10.0,
687 lines_deleted: 5.0,
688 complexity_delta: 0.0,
689 timestamp: 1700000000.0,
690 hour_of_day: 14,
691 day_of_week: 2,
692 ..Default::default()
693 }];
694
695 let categories = vec![DefectCategory::MemorySafety];
696
697 let dataset = exporter.export(&features, &categories).unwrap();
698 assert_eq!(dataset.shape, (1, FEATURE_DIMENSION));
699 assert_eq!(dataset.features.len(), FEATURE_DIMENSION);
700 assert_eq!(dataset.labels, vec![0]);
701 assert_eq!(dataset.metadata.n_samples, 1);
702 assert_eq!(dataset.metadata.n_features, FEATURE_DIMENSION);
703 }
704
705 #[test]
706 fn test_export_empty_error() {
707 let exporter = FeatureExporter::new(ExportFormat::Json);
708 let features: Vec<CommitFeatures> = vec![];
709 let categories: Vec<DefectCategory> = vec![];
710
711 let result = exporter.export(&features, &categories);
712 assert!(result.is_err());
713 }
714
715 #[test]
716 fn test_export_length_mismatch_error() {
717 let exporter = FeatureExporter::new(ExportFormat::Json);
718
719 let features = vec![CommitFeatures {
720 defect_category: 0,
721 files_changed: 2.0,
722 lines_added: 10.0,
723 lines_deleted: 5.0,
724 complexity_delta: 0.0,
725 timestamp: 1700000000.0,
726 hour_of_day: 14,
727 day_of_week: 2,
728 ..Default::default()
729 }];
730
731 let categories = vec![
732 DefectCategory::MemorySafety,
733 DefectCategory::ConcurrencyBugs, ];
735
736 let result = exporter.export(&features, &categories);
737 assert!(result.is_err());
738 assert!(result.unwrap_err().to_string().contains("mismatch"));
739 }
740
741 #[test]
742 fn test_export_multiple_samples() {
743 let exporter = FeatureExporter::new(ExportFormat::Json);
744
745 let features = vec![
746 CommitFeatures {
747 defect_category: 0,
748 files_changed: 1.0,
749 lines_added: 10.0,
750 lines_deleted: 5.0,
751 complexity_delta: 0.0,
752 timestamp: 1700000000.0,
753 hour_of_day: 10,
754 day_of_week: 1,
755 ..Default::default()
756 },
757 CommitFeatures {
758 defect_category: 7,
759 files_changed: 3.0,
760 lines_added: 20.0,
761 lines_deleted: 15.0,
762 complexity_delta: 2.0,
763 timestamp: 1700000001.0,
764 hour_of_day: 11,
765 day_of_week: 2,
766 ..Default::default()
767 },
768 ];
769
770 let categories = vec![
771 DefectCategory::MemorySafety,
772 DefectCategory::SecurityVulnerabilities,
773 ];
774
775 let dataset = exporter.export(&features, &categories).unwrap();
776 assert_eq!(dataset.shape, (2, FEATURE_DIMENSION));
777 assert_eq!(dataset.features.len(), 2 * FEATURE_DIMENSION);
778 assert_eq!(dataset.labels, vec![0, 7]);
779 assert_eq!(dataset.metadata.n_classes, 2);
780 }
781
782 #[test]
783 fn test_to_aprender_matrix() {
784 let dataset = ExportedDataset {
785 shape: (2, 3),
786 features: vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
787 labels: vec![0, 1],
788 category_names: vec!["A".to_string(), "B".to_string()],
789 metadata: ExportMetadata {
790 n_samples: 2,
791 n_features: 3,
792 n_classes: 2,
793 format: "json".to_string(),
794 version: "1.0.0".to_string(),
795 },
796 };
797
798 let matrix = FeatureExporter::to_aprender_matrix(&dataset).unwrap();
799 assert_eq!(matrix.n_rows(), 2);
800 assert_eq!(matrix.n_cols(), 3);
801 assert_eq!(matrix.get(0, 0), 1.0);
802 assert_eq!(matrix.get(1, 2), 6.0);
803 }
804
805 #[test]
806 fn test_save_and_load_json() {
807 let exporter = FeatureExporter::new(ExportFormat::Json);
808
809 let features = vec![CommitFeatures {
810 defect_category: 5,
811 files_changed: 3.0,
812 lines_added: 15.0,
813 lines_deleted: 8.0,
814 complexity_delta: 1.0,
815 timestamp: 1700000000.0,
816 hour_of_day: 9,
817 day_of_week: 0,
818 ..Default::default()
819 }];
820
821 let categories = vec![DefectCategory::TypeErrors];
822 let dataset = exporter.export(&features, &categories).unwrap();
823
824 let temp_dir = tempfile::tempdir().unwrap();
826 let path = temp_dir.path().join("test_export.json");
827
828 exporter.save(&dataset, &path).unwrap();
829
830 let loaded = FeatureExporter::load(&path, ExportFormat::Json).unwrap();
832 assert_eq!(loaded.shape, dataset.shape);
833 assert_eq!(loaded.features, dataset.features);
834 assert_eq!(loaded.labels, dataset.labels);
835 }
836
837 #[test]
838 fn test_save_and_load_binary() {
839 let exporter = FeatureExporter::new(ExportFormat::Binary);
840
841 let features = vec![CommitFeatures {
842 defect_category: 10,
843 files_changed: 5.0,
844 lines_added: 25.0,
845 lines_deleted: 12.0,
846 complexity_delta: 3.0,
847 timestamp: 1700000000.0,
848 hour_of_day: 15,
849 day_of_week: 4,
850 ..Default::default()
851 }];
852
853 let categories = vec![DefectCategory::OperatorPrecedence];
854 let dataset = exporter.export(&features, &categories).unwrap();
855
856 let temp_dir = tempfile::tempdir().unwrap();
857 let path = temp_dir.path().join("test_export.bin");
858
859 exporter.save(&dataset, &path).unwrap();
860
861 let loaded = FeatureExporter::load(&path, ExportFormat::Binary).unwrap();
862 assert_eq!(loaded.shape, dataset.shape);
863 assert_eq!(loaded.labels, dataset.labels);
864 }
865
866 proptest! {
869 #[test]
871 fn prop_encode_decode_roundtrip(label in 0u8..18) {
872 let category = FeatureExporter::decode_label(label).unwrap();
873 let encoded = FeatureExporter::encode_label(category);
874 prop_assert_eq!(label, encoded);
875 }
876
877 #[test]
879 fn prop_valid_labels_decode(label in 0u8..18) {
880 let result = FeatureExporter::decode_label(label);
881 prop_assert!(result.is_ok());
882 }
883
884 #[test]
886 fn prop_invalid_labels_fail(label in 18u8..=255) {
887 let result = FeatureExporter::decode_label(label);
888 prop_assert!(result.is_err());
889 }
890
891 #[test]
893 fn prop_matrix_dimensions(
894 n_samples in 1usize..100,
895 defect_category in 0u8..18,
896 files_changed in 0.0f32..1000.0,
897 lines_added in 0.0f32..10000.0,
898 ) {
899 let features: Vec<CommitFeatures> = (0..n_samples)
900 .map(|_| CommitFeatures {
901 defect_category,
902 files_changed,
903 lines_added,
904 lines_deleted: 0.0,
905 complexity_delta: 0.0,
906 timestamp: 1700000000.0,
907 hour_of_day: 12,
908 day_of_week: 3,
909 ..Default::default()
910 })
911 .collect();
912
913 let matrix = FeatureExporter::to_matrix(&features).unwrap();
914 prop_assert_eq!(matrix.n_rows(), n_samples);
915 prop_assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
916 }
917
918 #[test]
920 fn prop_export_shape(n_samples in 1usize..50) {
921 let exporter = FeatureExporter::default();
922
923 let features: Vec<CommitFeatures> = (0..n_samples)
924 .map(|i| CommitFeatures {
925 defect_category: (i % 18) as u8,
926 files_changed: 1.0,
927 lines_added: 10.0,
928 lines_deleted: 5.0,
929 complexity_delta: 0.0,
930 timestamp: 1700000000.0,
931 hour_of_day: 12,
932 day_of_week: 3,
933 ..Default::default()
934 })
935 .collect();
936
937 let categories: Vec<DefectCategory> = (0..n_samples)
938 .map(|i| FeatureExporter::decode_label((i % 18) as u8).unwrap())
939 .collect();
940
941 let dataset = exporter.export(&features, &categories).unwrap();
942
943 prop_assert_eq!(dataset.shape.0, n_samples);
944 prop_assert_eq!(dataset.shape.1, FEATURE_DIMENSION);
945 prop_assert_eq!(dataset.features.len(), n_samples * FEATURE_DIMENSION);
946 prop_assert_eq!(dataset.labels.len(), n_samples);
947 }
948
949 #[test]
951 fn prop_category_names_count(_dummy in 0..1) {
952 let names = FeatureExporter::category_names();
953 prop_assert_eq!(names.len(), NUM_CATEGORIES);
954 }
955
956 #[test]
958 fn prop_feature_preservation(
959 files_changed in 0.0f32..1000.0,
960 lines_added in 0.0f32..10000.0,
961 lines_deleted in 0.0f32..5000.0,
962 ) {
963 let features = vec![CommitFeatures {
964 defect_category: 0,
965 files_changed,
966 lines_added,
967 lines_deleted,
968 complexity_delta: 0.0,
969 timestamp: 1700000000.0,
970 hour_of_day: 12,
971 day_of_week: 3,
972 ..Default::default()
973 }];
974
975 let matrix = FeatureExporter::to_matrix(&features).unwrap();
976
977 prop_assert_eq!(matrix.get(0, 1), files_changed);
978 prop_assert_eq!(matrix.get(0, 2), lines_added);
979 prop_assert_eq!(matrix.get(0, 3), lines_deleted);
980 }
981
982 #[test]
984 fn prop_export_to_matrix_roundtrip(n_samples in 1usize..20) {
985 let exporter = FeatureExporter::default();
986
987 let features: Vec<CommitFeatures> = (0..n_samples)
988 .map(|i| CommitFeatures {
989 defect_category: (i % 18) as u8,
990 files_changed: (i + 1) as f32,
991 lines_added: (i * 10) as f32,
992 lines_deleted: (i * 5) as f32,
993 complexity_delta: 0.0,
994 timestamp: 1700000000.0,
995 hour_of_day: 12,
996 day_of_week: 3,
997 ..Default::default()
998 })
999 .collect();
1000
1001 let categories: Vec<DefectCategory> = (0..n_samples)
1002 .map(|i| FeatureExporter::decode_label((i % 18) as u8).unwrap())
1003 .collect();
1004
1005 let dataset = exporter.export(&features, &categories).unwrap();
1006 let matrix = FeatureExporter::to_aprender_matrix(&dataset).unwrap();
1007
1008 prop_assert_eq!(matrix.n_rows(), n_samples);
1009 prop_assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
1010
1011 prop_assert_eq!(matrix.get(0, 1), 1.0); }
1014 }
1015}
1016
1017#[cfg(test)]
1019mod integration_tests {
1020 use super::*;
1021
1022 #[test]
1024 fn test_full_roundtrip_json() {
1025 let exporter = FeatureExporter::new(ExportFormat::Json);
1026
1027 let features = vec![
1029 CommitFeatures {
1030 defect_category: 0,
1031 files_changed: 5.0,
1032 lines_added: 100.0,
1033 lines_deleted: 50.0,
1034 complexity_delta: 2.0,
1035 timestamp: 1700000000.0,
1036 hour_of_day: 14,
1037 day_of_week: 2,
1038 ..Default::default()
1039 },
1040 CommitFeatures {
1041 defect_category: 7,
1042 files_changed: 3.0,
1043 lines_added: 75.0,
1044 lines_deleted: 25.0,
1045 complexity_delta: 1.0,
1046 timestamp: 1700000001.0,
1047 hour_of_day: 15,
1048 day_of_week: 2,
1049 ..Default::default()
1050 },
1051 CommitFeatures {
1052 defect_category: 13,
1053 files_changed: 8.0,
1054 lines_added: 200.0,
1055 lines_deleted: 100.0,
1056 complexity_delta: 5.0,
1057 timestamp: 1700000002.0,
1058 hour_of_day: 16,
1059 day_of_week: 2,
1060 ..Default::default()
1061 },
1062 ];
1063
1064 let categories = vec![
1065 DefectCategory::MemorySafety,
1066 DefectCategory::SecurityVulnerabilities,
1067 DefectCategory::ASTTransform,
1068 ];
1069
1070 let dataset = exporter.export(&features, &categories).unwrap();
1072
1073 let temp_dir = tempfile::tempdir().unwrap();
1075 let path = temp_dir.path().join("roundtrip_test.json");
1076 exporter.save(&dataset, &path).unwrap();
1077
1078 let loaded = FeatureExporter::load(&path, ExportFormat::Json).unwrap();
1080
1081 assert_eq!(loaded.shape, (3, FEATURE_DIMENSION));
1083 assert_eq!(loaded.labels, vec![0, 7, 13]);
1084
1085 let matrix = FeatureExporter::to_aprender_matrix(&loaded).unwrap();
1087 assert_eq!(matrix.n_rows(), 3);
1088 assert_eq!(matrix.n_cols(), FEATURE_DIMENSION);
1089
1090 assert_eq!(matrix.get(0, 1), 5.0); assert_eq!(matrix.get(1, 0), 7.0); assert_eq!(matrix.get(2, 2), 200.0); }
1095
    /// Checks that an exported dataset can be fed to aprender's `RandomForestClassifier`:
    /// fitting on 30 synthetic samples must succeed and predicting on the same matrix
    /// must return one prediction per sample.
    #[test]
    fn test_aprender_training_compatibility() {
        use aprender::tree::RandomForestClassifier;

        let exporter = FeatureExporter::new(ExportFormat::Json);

        let mut features = Vec::new();
        let mut categories = Vec::new();

        // Build 30 samples whose category cycles through three classes (i % 3).
        for i in 0..30 {
            features.push(CommitFeatures {
                defect_category: (i % 3) as u8,
                files_changed: (i + 1) as f32,
                lines_added: (i * 10 + 5) as f32,
                lines_deleted: (i * 5) as f32,
                complexity_delta: (i % 5) as f32,
                timestamp: (1700000000 + i) as f64,
                hour_of_day: (9 + i % 8) as u8,
                day_of_week: (i % 5) as u8,
                ..Default::default()
            });

            // Keep the category in step with `defect_category` above.
            categories.push(match i % 3 {
                0 => DefectCategory::MemorySafety,
                1 => DefectCategory::ConcurrencyBugs,
                _ => DefectCategory::LogicErrors,
            });
        }

        let dataset = exporter.export(&features, &categories).unwrap();

        let matrix = FeatureExporter::to_aprender_matrix(&dataset).unwrap();
        // The classifier is given `usize` labels, so widen the exported `u8` labels.
        let labels: Vec<usize> = dataset.labels.iter().map(|&l| l as usize).collect();

        // NOTE(review): the argument 10 is presumably the number of trees — confirm.
        let mut classifier = RandomForestClassifier::new(10);
        let result = classifier.fit(&matrix, &labels);

        assert!(result.is_ok(), "RandomForest training should succeed");

        let predictions = classifier.predict(&matrix);
        assert_eq!(predictions.len(), 30);
    }
1144}