1use crate::classifier::DefectCategory;
12use anyhow::{anyhow, Result};
13use serde::{Deserialize, Serialize};
14use std::collections::HashMap;
15use std::path::Path;
16
17#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
19pub enum ErrorCodeClass {
20 Type = 0,
21 Borrow = 1,
22 Name = 2,
23 Trait = 3,
24 #[default]
25 Other = 4,
26}
27
28impl ErrorCodeClass {
29 pub fn as_u8(&self) -> u8 {
30 *self as u8
31 }
32}
33
34#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
36pub enum SuggestionApplicability {
37 #[default]
38 None = 0,
39 MachineApplicable = 1,
40 MaybeIncorrect = 2,
41 HasPlaceholders = 3,
42}
43
44impl SuggestionApplicability {
45 pub fn as_u8(&self) -> u8 {
46 *self as u8
47 }
48
49 pub fn parse(s: &str) -> Self {
50 match s {
51 "MachineApplicable" => Self::MachineApplicable,
52 "MaybeIncorrect" => Self::MaybeIncorrect,
53 "HasPlaceholders" => Self::HasPlaceholders,
54 _ => Self::None,
55 }
56 }
57}
58
59#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
61pub enum TrainingSource {
62 #[default]
63 CommitMessage,
64 DepylerCitl,
65 Manual,
66}
67
68#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct DepylerExport {
71 pub source_file: String,
72 pub error_code: Option<String>,
73 pub clippy_lint: Option<String>,
74 pub level: String,
75 pub message: String,
76 pub oip_category: Option<String>,
77 pub confidence: f32,
78 pub span: Option<SpanInfo>,
79 pub suggestion: Option<SuggestionInfo>,
80 pub timestamp: i64,
81 pub depyler_version: String,
82}
83
84#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
86pub struct SpanInfo {
87 pub line_start: u32,
88 pub column_start: u32,
89}
90
91#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
93pub struct SuggestionInfo {
94 pub replacement: String,
95 pub applicability: String,
96}
97
98#[derive(Debug, Clone, Default)]
100pub struct ImportStats {
101 pub total_records: usize,
102 pub imported: usize,
103 pub skipped_low_confidence: usize,
104 pub skipped_unknown_category: usize,
105 pub by_category: HashMap<DefectCategory, usize>,
106 pub by_source: HashMap<String, usize>,
107 pub avg_confidence: f32,
108}
109
/// Returns the confidence assigned to a rustc error code.
///
/// Known codes fall into fixed tiers between `0.75` and `0.95`; every other
/// string, including the empty string, gets the fallback `0.70`.
pub fn get_error_code_confidence(code: &str) -> f32 {
    // Confidence tiers, highest first. A code appears in at most one tier.
    let tiers: [(&[&str], f32); 5] = [
        (&["E0308", "E0277", "E0502", "E0503", "E0505"], 0.95),
        (&["E0382", "E0507"], 0.90),
        (&["E0425", "E0433", "E0412"], 0.85),
        (&["E0599", "E0614", "E0615"], 0.80),
        (&["E0658"], 0.75),
    ];

    tiers
        .iter()
        .find(|(codes, _)| codes.contains(&code))
        .map_or(0.70, |&(_, confidence)| confidence)
}
122
123pub fn rustc_to_defect_category(code: &str) -> Option<DefectCategory> {
142 match code {
143 "E0308" => Some(DefectCategory::TypeErrors),
145 "E0412" => Some(DefectCategory::TypeAnnotationGaps),
146
147 "E0502" | "E0503" | "E0505" => Some(DefectCategory::OwnershipBorrow),
149 "E0382" | "E0507" => Some(DefectCategory::MemorySafety),
150
151 "E0277" => Some(DefectCategory::TraitBounds),
153
154 "E0425" | "E0433" => Some(DefectCategory::StdlibMapping),
156
157 "E0599" | "E0615" => Some(DefectCategory::ASTTransform),
159 "E0614" => Some(DefectCategory::OperatorPrecedence),
160
161 "E0658" => Some(DefectCategory::ConfigurationErrors),
163
164 _ => None,
165 }
166}
167
168pub fn clippy_to_defect_category(lint: &str) -> Option<DefectCategory> {
187 match lint {
188 "clippy::unwrap_used" | "clippy::expect_used" | "clippy::panic" => {
189 Some(DefectCategory::ApiMisuse)
190 }
191 "clippy::todo" | "clippy::unreachable" => Some(DefectCategory::LogicErrors),
192 "clippy::cognitive_complexity" => Some(DefectCategory::PerformanceIssues),
193 "clippy::too_many_arguments" | "clippy::match_single_binding" => {
194 Some(DefectCategory::ASTTransform)
195 }
196 "clippy::needless_collect" => Some(DefectCategory::IteratorChain),
197 "clippy::manual_map" => Some(DefectCategory::ComprehensionBugs),
198 _ => None,
199 }
200}
201
202pub fn get_error_code_class(code: &str) -> ErrorCodeClass {
204 match code {
205 "E0308" | "E0412" => ErrorCodeClass::Type,
207 "E0502" | "E0503" | "E0505" | "E0382" | "E0507" => ErrorCodeClass::Borrow,
209 "E0425" | "E0433" => ErrorCodeClass::Name,
211 "E0277" => ErrorCodeClass::Trait,
213 _ => ErrorCodeClass::Other,
215 }
216}
217
218pub fn import_depyler_corpus<P: AsRef<Path>>(
227 path: P,
228 min_confidence: f32,
229) -> Result<(Vec<DepylerExport>, ImportStats)> {
230 let content = std::fs::read_to_string(path.as_ref())
231 .map_err(|e| anyhow!("Failed to read corpus file: {}", e))?;
232
233 let mut exports = Vec::new();
234 let mut stats = ImportStats::default();
235
236 for (line_num, line) in content.lines().enumerate() {
237 if line.trim().is_empty() {
238 continue;
239 }
240
241 stats.total_records += 1;
242
243 let export: DepylerExport = serde_json::from_str(line).map_err(|e| {
244 anyhow!(
245 "Failed to parse JSON at line {}: {} - content: {}",
246 line_num + 1,
247 e,
248 line
249 )
250 })?;
251
252 if export.confidence < min_confidence {
254 stats.skipped_low_confidence += 1;
255 continue;
256 }
257
258 let category = resolve_category(&export);
260 if category.is_none() {
261 stats.skipped_unknown_category += 1;
262 continue;
263 }
264
265 let cat = category.unwrap();
266 *stats.by_category.entry(cat).or_insert(0) += 1;
267 *stats
268 .by_source
269 .entry(export.source_file.clone())
270 .or_insert(0) += 1;
271
272 stats.imported += 1;
273 exports.push(export);
274 }
275
276 if stats.imported > 0 {
278 stats.avg_confidence =
279 exports.iter().map(|e| e.confidence).sum::<f32>() / stats.imported as f32;
280 }
281
282 Ok((exports, stats))
283}
284
285pub fn convert_to_training_examples(
293 exports: &[DepylerExport],
294) -> Vec<crate::training::TrainingExample> {
295 exports
296 .iter()
297 .filter_map(|export| {
298 let category = resolve_category(export)?;
299 let suggestion_applicability = export
300 .suggestion
301 .as_ref()
302 .map(|s| SuggestionApplicability::parse(&s.applicability));
303
304 Some(crate::training::TrainingExample {
305 message: export.message.clone(),
306 label: category,
307 confidence: export.confidence,
308 commit_hash: String::new(), author: "depyler".to_string(),
310 timestamp: export.timestamp,
311 lines_added: 0,
312 lines_removed: 0,
313 files_changed: 1,
314 error_code: export.error_code.clone(),
316 clippy_lint: export.clippy_lint.clone(),
317 has_suggestion: export.suggestion.is_some(),
318 suggestion_applicability,
319 source: TrainingSource::DepylerCitl,
320 })
321 })
322 .collect()
323}
324
325fn resolve_category(export: &DepylerExport) -> Option<DefectCategory> {
327 if let Some(ref cat_str) = export.oip_category {
329 if let Some(cat) = parse_defect_category(cat_str) {
330 return Some(cat);
331 }
332 }
333
334 if let Some(ref code) = export.error_code {
336 if let Some(cat) = rustc_to_defect_category(code) {
337 return Some(cat);
338 }
339 }
340
341 if let Some(ref lint) = export.clippy_lint {
343 if let Some(cat) = clippy_to_defect_category(lint) {
344 return Some(cat);
345 }
346 }
347
348 None
349}
350
351fn parse_defect_category(s: &str) -> Option<DefectCategory> {
353 match s {
354 "MemorySafety" => Some(DefectCategory::MemorySafety),
355 "ConcurrencyBugs" => Some(DefectCategory::ConcurrencyBugs),
356 "LogicErrors" => Some(DefectCategory::LogicErrors),
357 "ApiMisuse" => Some(DefectCategory::ApiMisuse),
358 "ResourceLeaks" => Some(DefectCategory::ResourceLeaks),
359 "TypeErrors" => Some(DefectCategory::TypeErrors),
360 "ConfigurationErrors" => Some(DefectCategory::ConfigurationErrors),
361 "SecurityVulnerabilities" => Some(DefectCategory::SecurityVulnerabilities),
362 "PerformanceIssues" => Some(DefectCategory::PerformanceIssues),
363 "IntegrationFailures" => Some(DefectCategory::IntegrationFailures),
364 "OperatorPrecedence" => Some(DefectCategory::OperatorPrecedence),
365 "TypeAnnotationGaps" => Some(DefectCategory::TypeAnnotationGaps),
366 "StdlibMapping" => Some(DefectCategory::StdlibMapping),
367 "ASTTransform" => Some(DefectCategory::ASTTransform),
368 "ComprehensionBugs" => Some(DefectCategory::ComprehensionBugs),
369 "IteratorChain" => Some(DefectCategory::IteratorChain),
370 "OwnershipBorrow" => Some(DefectCategory::OwnershipBorrow),
371 "TraitBounds" => Some(DefectCategory::TraitBounds),
372 _ => None,
373 }
374}
375
/// How loaded data should be combined with existing training data.
///
/// NOTE(review): no code visible in this file consumes the strategy; the
/// variant descriptions below are inferred from the names — confirm against
/// the code that performs the merge.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum MergeStrategy {
    /// Presumably adds the loaded data to the existing data. The `Default` value.
    #[default]
    Append,
    /// Presumably replaces the existing data with the loaded data.
    Replace,
    /// Carries a `u32` payload, presumably a weight multiplier for the loaded data.
    Weighted(u32),
}
389
390#[derive(Debug, Clone)]
392pub struct CitlLoaderConfig {
393 pub batch_size: usize,
395 pub shuffle: bool,
397 pub min_confidence: f32,
399 pub merge_strategy: MergeStrategy,
401 pub weight: f32,
403}
404
405impl Default for CitlLoaderConfig {
406 fn default() -> Self {
407 Self {
408 batch_size: 128,
409 shuffle: true,
410 min_confidence: 0.75,
411 merge_strategy: MergeStrategy::Append,
412 weight: 1.0,
413 }
414 }
415}
416
417pub struct CitlDataLoader {
419 config: CitlLoaderConfig,
420}
421
422impl CitlDataLoader {
423 pub fn new() -> Self {
425 Self {
426 config: CitlLoaderConfig::default(),
427 }
428 }
429
430 pub fn with_config(config: CitlLoaderConfig) -> Self {
432 Self { config }
433 }
434
435 pub fn batch_size(mut self, size: usize) -> Self {
437 self.config.batch_size = size;
438 self
439 }
440
441 pub fn shuffle(mut self, shuffle: bool) -> Self {
443 self.config.shuffle = shuffle;
444 self
445 }
446
447 pub fn min_confidence(mut self, confidence: f32) -> Self {
449 self.config.min_confidence = confidence;
450 self
451 }
452
453 pub fn merge_strategy(mut self, strategy: MergeStrategy) -> Self {
455 self.config.merge_strategy = strategy;
456 self
457 }
458
459 pub fn load_parquet<P: AsRef<Path>>(&self, path: P) -> Result<CitlBatchIterator> {
463 use alimentar::{ArrowDataset, DataLoader};
464
465 let dataset = ArrowDataset::from_parquet(path.as_ref())
466 .map_err(|e| anyhow!("Failed to load Parquet: {}", e))?;
467
468 let mut loader = DataLoader::new(dataset).batch_size(self.config.batch_size);
469
470 if self.config.shuffle {
471 loader = loader.shuffle(true);
472 }
473
474 Ok(CitlBatchIterator {
475 inner: Box::new(loader.into_iter()),
476 min_confidence: self.config.min_confidence,
477 })
478 }
479
480 pub fn load_jsonl<P: AsRef<Path>>(
482 &self,
483 path: P,
484 ) -> Result<(Vec<crate::training::TrainingExample>, ImportStats)> {
485 let (exports, stats) = import_depyler_corpus(path, self.config.min_confidence)?;
486 let examples = convert_to_training_examples(&exports);
487 Ok((examples, stats))
488 }
489
490 pub fn config(&self) -> &CitlLoaderConfig {
492 &self.config
493 }
494}
495
496impl Default for CitlDataLoader {
497 fn default() -> Self {
498 Self::new()
499 }
500}
501
502pub struct CitlBatchIterator {
504 inner: Box<dyn Iterator<Item = arrow::array::RecordBatch> + Send>,
505 min_confidence: f32,
506}
507
508impl Iterator for CitlBatchIterator {
509 type Item = Vec<crate::training::TrainingExample>;
510
511 fn next(&mut self) -> Option<Self::Item> {
512 self.inner.next().map(|batch| {
513 convert_batch_to_examples(&batch, self.min_confidence)
515 })
516 }
517}
518
519fn convert_batch_to_examples(
521 batch: &arrow::array::RecordBatch,
522 min_confidence: f32,
523) -> Vec<crate::training::TrainingExample> {
524 use arrow::array::{Array, Float32Array, Int64Array, StringArray};
525
526 let num_rows = batch.num_rows();
527 let mut examples = Vec::with_capacity(num_rows);
528
529 let message_arr = batch
531 .column_by_name("message")
532 .and_then(|c| c.as_any().downcast_ref::<StringArray>());
533 let error_code_arr = batch
534 .column_by_name("error_code")
535 .and_then(|c| c.as_any().downcast_ref::<StringArray>());
536 let clippy_lint_arr = batch
537 .column_by_name("clippy_lint")
538 .and_then(|c| c.as_any().downcast_ref::<StringArray>());
539 let confidence_arr = batch
540 .column_by_name("confidence")
541 .and_then(|c| c.as_any().downcast_ref::<Float32Array>());
542 let timestamp_arr = batch
543 .column_by_name("timestamp")
544 .and_then(|c| c.as_any().downcast_ref::<Int64Array>());
545 let oip_category_arr = batch
546 .column_by_name("oip_category")
547 .and_then(|c| c.as_any().downcast_ref::<StringArray>());
548
549 for i in 0..num_rows {
550 let confidence = confidence_arr.map(|a| a.value(i)).unwrap_or(0.0);
552
553 if confidence < min_confidence {
554 continue;
555 }
556
557 let message = message_arr
559 .and_then(|a| {
560 if a.is_null(i) {
561 None
562 } else {
563 Some(a.value(i).to_string())
564 }
565 })
566 .unwrap_or_default();
567
568 let error_code = error_code_arr.and_then(|a| {
570 if a.is_null(i) {
571 None
572 } else {
573 Some(a.value(i).to_string())
574 }
575 });
576
577 let clippy_lint = clippy_lint_arr.and_then(|a| {
579 if a.is_null(i) {
580 None
581 } else {
582 Some(a.value(i).to_string())
583 }
584 });
585
586 let timestamp = timestamp_arr.map(|a| a.value(i)).unwrap_or(0);
588
589 let oip_category =
591 oip_category_arr.and_then(|a| if a.is_null(i) { None } else { Some(a.value(i)) });
592
593 let category = oip_category
594 .and_then(parse_defect_category)
595 .or_else(|| error_code.as_deref().and_then(rustc_to_defect_category))
596 .or_else(|| clippy_lint.as_deref().and_then(clippy_to_defect_category));
597
598 if let Some(label) = category {
599 examples.push(crate::training::TrainingExample {
600 message,
601 label,
602 confidence,
603 commit_hash: String::new(),
604 author: "depyler".to_string(),
605 timestamp,
606 lines_added: 0,
607 lines_removed: 0,
608 files_changed: 1,
609 error_code,
610 clippy_lint,
611 has_suggestion: false,
612 suggestion_applicability: None,
613 source: TrainingSource::DepylerCitl,
614 });
615 }
616 }
617
618 examples
619}
620
621pub fn validate_citl_schema<P: AsRef<Path>>(path: P) -> Result<SchemaValidation> {
623 use alimentar::{ArrowDataset, Dataset};
624
625 let ext = path.as_ref().extension().and_then(|e| e.to_str());
626
627 let schema = match ext {
628 Some("parquet") => {
629 let dataset = ArrowDataset::from_parquet(path.as_ref())
630 .map_err(|e| anyhow!("Failed to load Parquet: {}", e))?;
631 dataset.schema()
632 }
633 Some("jsonl") | Some("json") => {
634 let content = std::fs::read_to_string(path.as_ref())?;
636 let first_line = content
637 .lines()
638 .next()
639 .ok_or_else(|| anyhow!("Empty file"))?;
640 let _: DepylerExport = serde_json::from_str(first_line)
641 .map_err(|e| anyhow!("Invalid JSONL schema: {}", e))?;
642 return Ok(SchemaValidation {
643 is_valid: true,
644 missing_fields: vec![],
645 extra_fields: vec![],
646 format: "jsonl".to_string(),
647 });
648 }
649 _ => return Err(anyhow!("Unsupported file format: {:?}", ext)),
650 };
651
652 let required_fields = ["message", "confidence", "timestamp"];
654 let optional_fields = [
655 "error_code",
656 "clippy_lint",
657 "oip_category",
658 "suggestion",
659 "span",
660 ];
661
662 let schema_fields: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
663
664 let missing: Vec<String> = required_fields
665 .iter()
666 .filter(|f| !schema_fields.contains(*f))
667 .map(|s: &&str| (*s).to_string())
668 .collect();
669
670 let known_fields: Vec<&str> = required_fields
671 .iter()
672 .chain(optional_fields.iter())
673 .copied()
674 .collect();
675
676 let extra: Vec<String> = schema_fields
677 .iter()
678 .filter(|f| !known_fields.contains(*f))
679 .map(|s: &&str| (*s).to_string())
680 .collect();
681
682 Ok(SchemaValidation {
683 is_valid: missing.is_empty(),
684 missing_fields: missing,
685 extra_fields: extra,
686 format: "parquet".to_string(),
687 })
688}
689
/// Outcome of [`validate_citl_schema`].
#[derive(Clone, Debug)]
pub struct SchemaValidation {
    /// `true` when no required field is missing.
    pub is_valid: bool,
    /// Required fields absent from the file's schema.
    pub missing_fields: Vec<String>,
    /// Schema fields that are neither required nor optional.
    pub extra_fields: Vec<String>,
    /// Detected format, `"parquet"` or `"jsonl"`.
    pub format: String,
}
702
703#[cfg(test)]
704mod tests {
705 use super::*;
706
707 #[test]
710 fn test_rustc_type_error_e0308() {
711 assert_eq!(
712 rustc_to_defect_category("E0308"),
713 Some(DefectCategory::TypeErrors)
714 );
715 }
716
717 #[test]
718 fn test_rustc_type_annotation_e0412() {
719 assert_eq!(
720 rustc_to_defect_category("E0412"),
721 Some(DefectCategory::TypeAnnotationGaps)
722 );
723 }
724
725 #[test]
726 fn test_rustc_ownership_borrow_e0502() {
727 assert_eq!(
728 rustc_to_defect_category("E0502"),
729 Some(DefectCategory::OwnershipBorrow)
730 );
731 }
732
733 #[test]
734 fn test_rustc_ownership_borrow_e0503() {
735 assert_eq!(
736 rustc_to_defect_category("E0503"),
737 Some(DefectCategory::OwnershipBorrow)
738 );
739 }
740
741 #[test]
742 fn test_rustc_ownership_borrow_e0505() {
743 assert_eq!(
744 rustc_to_defect_category("E0505"),
745 Some(DefectCategory::OwnershipBorrow)
746 );
747 }
748
749 #[test]
750 fn test_rustc_memory_safety_e0382() {
751 assert_eq!(
752 rustc_to_defect_category("E0382"),
753 Some(DefectCategory::MemorySafety)
754 );
755 }
756
757 #[test]
758 fn test_rustc_memory_safety_e0507() {
759 assert_eq!(
760 rustc_to_defect_category("E0507"),
761 Some(DefectCategory::MemorySafety)
762 );
763 }
764
765 #[test]
766 fn test_rustc_trait_bounds_e0277() {
767 assert_eq!(
768 rustc_to_defect_category("E0277"),
769 Some(DefectCategory::TraitBounds)
770 );
771 }
772
773 #[test]
774 fn test_rustc_stdlib_mapping_e0425() {
775 assert_eq!(
776 rustc_to_defect_category("E0425"),
777 Some(DefectCategory::StdlibMapping)
778 );
779 }
780
781 #[test]
782 fn test_rustc_stdlib_mapping_e0433() {
783 assert_eq!(
784 rustc_to_defect_category("E0433"),
785 Some(DefectCategory::StdlibMapping)
786 );
787 }
788
789 #[test]
790 fn test_rustc_ast_transform_e0599() {
791 assert_eq!(
792 rustc_to_defect_category("E0599"),
793 Some(DefectCategory::ASTTransform)
794 );
795 }
796
797 #[test]
798 fn test_rustc_ast_transform_e0615() {
799 assert_eq!(
800 rustc_to_defect_category("E0615"),
801 Some(DefectCategory::ASTTransform)
802 );
803 }
804
805 #[test]
806 fn test_rustc_operator_precedence_e0614() {
807 assert_eq!(
808 rustc_to_defect_category("E0614"),
809 Some(DefectCategory::OperatorPrecedence)
810 );
811 }
812
813 #[test]
814 fn test_rustc_configuration_e0658() {
815 assert_eq!(
816 rustc_to_defect_category("E0658"),
817 Some(DefectCategory::ConfigurationErrors)
818 );
819 }
820
821 #[test]
822 fn test_rustc_unknown_code_returns_none() {
823 assert_eq!(rustc_to_defect_category("E9999"), None);
824 assert_eq!(rustc_to_defect_category("UNKNOWN"), None);
825 assert_eq!(rustc_to_defect_category(""), None);
826 }
827
828 #[test]
831 fn test_clippy_api_misuse_unwrap() {
832 assert_eq!(
833 clippy_to_defect_category("clippy::unwrap_used"),
834 Some(DefectCategory::ApiMisuse)
835 );
836 }
837
838 #[test]
839 fn test_clippy_api_misuse_expect() {
840 assert_eq!(
841 clippy_to_defect_category("clippy::expect_used"),
842 Some(DefectCategory::ApiMisuse)
843 );
844 }
845
846 #[test]
847 fn test_clippy_api_misuse_panic() {
848 assert_eq!(
849 clippy_to_defect_category("clippy::panic"),
850 Some(DefectCategory::ApiMisuse)
851 );
852 }
853
854 #[test]
855 fn test_clippy_logic_errors_todo() {
856 assert_eq!(
857 clippy_to_defect_category("clippy::todo"),
858 Some(DefectCategory::LogicErrors)
859 );
860 }
861
862 #[test]
863 fn test_clippy_logic_errors_unreachable() {
864 assert_eq!(
865 clippy_to_defect_category("clippy::unreachable"),
866 Some(DefectCategory::LogicErrors)
867 );
868 }
869
870 #[test]
871 fn test_clippy_performance_cognitive_complexity() {
872 assert_eq!(
873 clippy_to_defect_category("clippy::cognitive_complexity"),
874 Some(DefectCategory::PerformanceIssues)
875 );
876 }
877
878 #[test]
879 fn test_clippy_ast_transform_too_many_arguments() {
880 assert_eq!(
881 clippy_to_defect_category("clippy::too_many_arguments"),
882 Some(DefectCategory::ASTTransform)
883 );
884 }
885
886 #[test]
887 fn test_clippy_ast_transform_match_single_binding() {
888 assert_eq!(
889 clippy_to_defect_category("clippy::match_single_binding"),
890 Some(DefectCategory::ASTTransform)
891 );
892 }
893
894 #[test]
895 fn test_clippy_iterator_chain_needless_collect() {
896 assert_eq!(
897 clippy_to_defect_category("clippy::needless_collect"),
898 Some(DefectCategory::IteratorChain)
899 );
900 }
901
902 #[test]
903 fn test_clippy_comprehension_bugs_manual_map() {
904 assert_eq!(
905 clippy_to_defect_category("clippy::manual_map"),
906 Some(DefectCategory::ComprehensionBugs)
907 );
908 }
909
910 #[test]
911 fn test_clippy_unknown_lint_returns_none() {
912 assert_eq!(clippy_to_defect_category("clippy::unknown_lint"), None);
913 assert_eq!(clippy_to_defect_category("not_clippy"), None);
914 assert_eq!(clippy_to_defect_category(""), None);
915 }
916
917 #[test]
920 fn test_error_code_class_type() {
921 assert_eq!(get_error_code_class("E0308"), ErrorCodeClass::Type);
922 assert_eq!(get_error_code_class("E0412"), ErrorCodeClass::Type);
923 }
924
925 #[test]
926 fn test_error_code_class_borrow() {
927 assert_eq!(get_error_code_class("E0502"), ErrorCodeClass::Borrow);
928 assert_eq!(get_error_code_class("E0503"), ErrorCodeClass::Borrow);
929 assert_eq!(get_error_code_class("E0505"), ErrorCodeClass::Borrow);
930 assert_eq!(get_error_code_class("E0382"), ErrorCodeClass::Borrow);
931 assert_eq!(get_error_code_class("E0507"), ErrorCodeClass::Borrow);
932 }
933
934 #[test]
935 fn test_error_code_class_name() {
936 assert_eq!(get_error_code_class("E0425"), ErrorCodeClass::Name);
937 assert_eq!(get_error_code_class("E0433"), ErrorCodeClass::Name);
938 }
939
940 #[test]
941 fn test_error_code_class_trait() {
942 assert_eq!(get_error_code_class("E0277"), ErrorCodeClass::Trait);
943 }
944
945 #[test]
946 fn test_error_code_class_other() {
947 assert_eq!(get_error_code_class("E9999"), ErrorCodeClass::Other);
948 assert_eq!(get_error_code_class("UNKNOWN"), ErrorCodeClass::Other);
949 }
950
951 #[test]
952 fn test_error_code_class_as_u8() {
953 assert_eq!(ErrorCodeClass::Type.as_u8(), 0);
954 assert_eq!(ErrorCodeClass::Borrow.as_u8(), 1);
955 assert_eq!(ErrorCodeClass::Name.as_u8(), 2);
956 assert_eq!(ErrorCodeClass::Trait.as_u8(), 3);
957 assert_eq!(ErrorCodeClass::Other.as_u8(), 4);
958 }
959
960 #[test]
963 fn test_suggestion_applicability_parse() {
964 assert_eq!(
965 SuggestionApplicability::parse("MachineApplicable"),
966 SuggestionApplicability::MachineApplicable
967 );
968 assert_eq!(
969 SuggestionApplicability::parse("MaybeIncorrect"),
970 SuggestionApplicability::MaybeIncorrect
971 );
972 assert_eq!(
973 SuggestionApplicability::parse("HasPlaceholders"),
974 SuggestionApplicability::HasPlaceholders
975 );
976 assert_eq!(
977 SuggestionApplicability::parse("Unknown"),
978 SuggestionApplicability::None
979 );
980 }
981
982 #[test]
983 fn test_suggestion_applicability_as_u8() {
984 assert_eq!(SuggestionApplicability::None.as_u8(), 0);
985 assert_eq!(SuggestionApplicability::MachineApplicable.as_u8(), 1);
986 assert_eq!(SuggestionApplicability::MaybeIncorrect.as_u8(), 2);
987 assert_eq!(SuggestionApplicability::HasPlaceholders.as_u8(), 3);
988 }
989
990 #[test]
993 fn test_error_code_confidence_high() {
994 assert!((get_error_code_confidence("E0308") - 0.95).abs() < 0.001);
995 assert!((get_error_code_confidence("E0277") - 0.95).abs() < 0.001);
996 assert!((get_error_code_confidence("E0502") - 0.95).abs() < 0.001);
997 }
998
999 #[test]
1000 fn test_error_code_confidence_medium() {
1001 assert!((get_error_code_confidence("E0382") - 0.90).abs() < 0.001);
1002 assert!((get_error_code_confidence("E0425") - 0.85).abs() < 0.001);
1003 assert!((get_error_code_confidence("E0599") - 0.80).abs() < 0.001);
1004 }
1005
1006 #[test]
1007 fn test_error_code_confidence_low() {
1008 assert!((get_error_code_confidence("E0658") - 0.75).abs() < 0.001);
1009 assert!((get_error_code_confidence("UNKNOWN") - 0.70).abs() < 0.001);
1010 }
1011
1012 #[test]
1015 fn test_depyler_export_parse() {
1016 let json = r#"{
1017 "source_file": "example.py",
1018 "error_code": "E0308",
1019 "clippy_lint": null,
1020 "level": "error",
1021 "message": "mismatched types",
1022 "oip_category": "TypeErrors",
1023 "confidence": 0.95,
1024 "span": {"line_start": 42, "column_start": 12},
1025 "suggestion": {"replacement": ".parse::<i32>()", "applicability": "MaybeIncorrect"},
1026 "timestamp": 1732752000,
1027 "depyler_version": "3.21.0"
1028 }"#;
1029
1030 let export: DepylerExport = serde_json::from_str(json).unwrap();
1031
1032 assert_eq!(export.source_file, "example.py");
1033 assert_eq!(export.error_code, Some("E0308".to_string()));
1034 assert_eq!(export.clippy_lint, None);
1035 assert_eq!(export.level, "error");
1036 assert!((export.confidence - 0.95).abs() < 0.001);
1037 assert_eq!(export.span.as_ref().unwrap().line_start, 42);
1038 assert_eq!(
1039 export.suggestion.as_ref().unwrap().applicability,
1040 "MaybeIncorrect"
1041 );
1042 }
1043
1044 #[test]
1045 fn test_depyler_export_minimal() {
1046 let json = r#"{
1047 "source_file": "test.py",
1048 "error_code": null,
1049 "clippy_lint": "clippy::unwrap_used",
1050 "level": "warning",
1051 "message": "unwrap used",
1052 "oip_category": null,
1053 "confidence": 0.80,
1054 "span": null,
1055 "suggestion": null,
1056 "timestamp": 1732752000,
1057 "depyler_version": "3.21.0"
1058 }"#;
1059
1060 let export: DepylerExport = serde_json::from_str(json).unwrap();
1061
1062 assert_eq!(export.error_code, None);
1063 assert_eq!(export.clippy_lint, Some("clippy::unwrap_used".to_string()));
1064 assert_eq!(export.span, None);
1065 assert_eq!(export.suggestion, None);
1066 }
1067
1068 #[test]
1071 fn test_resolve_category_from_oip_category() {
1072 let export = DepylerExport {
1073 source_file: "test.py".to_string(),
1074 error_code: None,
1075 clippy_lint: None,
1076 level: "error".to_string(),
1077 message: "test".to_string(),
1078 oip_category: Some("MemorySafety".to_string()),
1079 confidence: 0.90,
1080 span: None,
1081 suggestion: None,
1082 timestamp: 0,
1083 depyler_version: "1.0".to_string(),
1084 };
1085
1086 assert_eq!(
1087 resolve_category(&export),
1088 Some(DefectCategory::MemorySafety)
1089 );
1090 }
1091
1092 #[test]
1093 fn test_resolve_category_from_error_code() {
1094 let export = DepylerExport {
1095 source_file: "test.py".to_string(),
1096 error_code: Some("E0308".to_string()),
1097 clippy_lint: None,
1098 level: "error".to_string(),
1099 message: "test".to_string(),
1100 oip_category: None,
1101 confidence: 0.90,
1102 span: None,
1103 suggestion: None,
1104 timestamp: 0,
1105 depyler_version: "1.0".to_string(),
1106 };
1107
1108 assert_eq!(resolve_category(&export), Some(DefectCategory::TypeErrors));
1109 }
1110
1111 #[test]
1112 fn test_resolve_category_from_clippy_lint() {
1113 let export = DepylerExport {
1114 source_file: "test.py".to_string(),
1115 error_code: None,
1116 clippy_lint: Some("clippy::unwrap_used".to_string()),
1117 level: "warning".to_string(),
1118 message: "test".to_string(),
1119 oip_category: None,
1120 confidence: 0.90,
1121 span: None,
1122 suggestion: None,
1123 timestamp: 0,
1124 depyler_version: "1.0".to_string(),
1125 };
1126
1127 assert_eq!(resolve_category(&export), Some(DefectCategory::ApiMisuse));
1128 }
1129
1130 #[test]
1131 fn test_resolve_category_unknown() {
1132 let export = DepylerExport {
1133 source_file: "test.py".to_string(),
1134 error_code: Some("E9999".to_string()),
1135 clippy_lint: None,
1136 level: "error".to_string(),
1137 message: "test".to_string(),
1138 oip_category: None,
1139 confidence: 0.90,
1140 span: None,
1141 suggestion: None,
1142 timestamp: 0,
1143 depyler_version: "1.0".to_string(),
1144 };
1145
1146 assert_eq!(resolve_category(&export), None);
1147 }
1148
1149 #[test]
1152 fn test_parse_all_defect_categories() {
1153 let categories = vec![
1154 ("MemorySafety", DefectCategory::MemorySafety),
1155 ("ConcurrencyBugs", DefectCategory::ConcurrencyBugs),
1156 ("LogicErrors", DefectCategory::LogicErrors),
1157 ("ApiMisuse", DefectCategory::ApiMisuse),
1158 ("ResourceLeaks", DefectCategory::ResourceLeaks),
1159 ("TypeErrors", DefectCategory::TypeErrors),
1160 ("ConfigurationErrors", DefectCategory::ConfigurationErrors),
1161 (
1162 "SecurityVulnerabilities",
1163 DefectCategory::SecurityVulnerabilities,
1164 ),
1165 ("PerformanceIssues", DefectCategory::PerformanceIssues),
1166 ("IntegrationFailures", DefectCategory::IntegrationFailures),
1167 ("OperatorPrecedence", DefectCategory::OperatorPrecedence),
1168 ("TypeAnnotationGaps", DefectCategory::TypeAnnotationGaps),
1169 ("StdlibMapping", DefectCategory::StdlibMapping),
1170 ("ASTTransform", DefectCategory::ASTTransform),
1171 ("ComprehensionBugs", DefectCategory::ComprehensionBugs),
1172 ("IteratorChain", DefectCategory::IteratorChain),
1173 ("OwnershipBorrow", DefectCategory::OwnershipBorrow),
1174 ("TraitBounds", DefectCategory::TraitBounds),
1175 ];
1176
1177 for (s, expected) in categories {
1178 assert_eq!(
1179 parse_defect_category(s),
1180 Some(expected),
1181 "Failed for: {}",
1182 s
1183 );
1184 }
1185 }
1186
1187 #[test]
1188 fn test_parse_unknown_category() {
1189 assert_eq!(parse_defect_category("Unknown"), None);
1190 assert_eq!(parse_defect_category(""), None);
1191 }
1192
1193 #[test]
1196 fn test_training_source_default() {
1197 assert_eq!(TrainingSource::default(), TrainingSource::CommitMessage);
1198 }
1199
1200 #[test]
1201 fn test_training_source_serialization() {
1202 let source = TrainingSource::DepylerCitl;
1203 let json = serde_json::to_string(&source).unwrap();
1204 let parsed: TrainingSource = serde_json::from_str(&json).unwrap();
1205 assert_eq!(parsed, TrainingSource::DepylerCitl);
1206 }
1207
1208 #[test]
1211 fn test_import_depyler_corpus_file_not_found() {
1212 let result = import_depyler_corpus("/nonexistent/path.jsonl", 0.75);
1213 assert!(result.is_err());
1214 }
1215
1216 #[test]
1217 fn test_import_stats_default() {
1218 let stats = ImportStats::default();
1219 assert_eq!(stats.total_records, 0);
1220 assert_eq!(stats.imported, 0);
1221 assert_eq!(stats.skipped_low_confidence, 0);
1222 assert_eq!(stats.skipped_unknown_category, 0);
1223 assert!(stats.by_category.is_empty());
1224 assert!((stats.avg_confidence - 0.0).abs() < 0.001);
1225 }
1226
1227 #[test]
1230 fn test_convert_to_training_examples_basic() {
1231 let exports = vec![DepylerExport {
1232 source_file: "test.py".to_string(),
1233 error_code: Some("E0308".to_string()),
1234 clippy_lint: None,
1235 level: "error".to_string(),
1236 message: "mismatched types".to_string(),
1237 oip_category: None,
1238 confidence: 0.95,
1239 span: None,
1240 suggestion: None,
1241 timestamp: 1732752000,
1242 depyler_version: "3.21.0".to_string(),
1243 }];
1244
1245 let examples = convert_to_training_examples(&exports);
1246 assert_eq!(examples.len(), 1);
1247 assert_eq!(examples[0].label, DefectCategory::TypeErrors);
1248 assert_eq!(examples[0].message, "mismatched types");
1249 assert!((examples[0].confidence - 0.95).abs() < 0.001);
1250 assert_eq!(examples[0].error_code, Some("E0308".to_string()));
1251 assert_eq!(examples[0].source, TrainingSource::DepylerCitl);
1252 }
1253
1254 #[test]
1255 fn test_convert_to_training_examples_with_suggestion() {
1256 let exports = vec![DepylerExport {
1257 source_file: "test.py".to_string(),
1258 error_code: Some("E0308".to_string()),
1259 clippy_lint: None,
1260 level: "error".to_string(),
1261 message: "type error".to_string(),
1262 oip_category: None,
1263 confidence: 0.90,
1264 span: None,
1265 suggestion: Some(SuggestionInfo {
1266 replacement: ".parse::<i32>()".to_string(),
1267 applicability: "MachineApplicable".to_string(),
1268 }),
1269 timestamp: 1732752000,
1270 depyler_version: "3.21.0".to_string(),
1271 }];
1272
1273 let examples = convert_to_training_examples(&exports);
1274 assert_eq!(examples.len(), 1);
1275 assert!(examples[0].has_suggestion);
1276 assert_eq!(
1277 examples[0].suggestion_applicability,
1278 Some(SuggestionApplicability::MachineApplicable)
1279 );
1280 }
1281
/// Feeds two exports that differ only in error code and message and checks
/// that a single example comes back: the one built from `E0308`
/// ("known error"). The `E9999` record is expected to be dropped.
#[test]
fn test_convert_to_training_examples_filters_unknown() {
    // Both records share every field except the error code and the message.
    let export_with_code = |code: &str, message: &str| DepylerExport {
        source_file: "test.py".to_owned(),
        error_code: Some(code.to_owned()),
        clippy_lint: None,
        level: "error".to_owned(),
        message: message.to_owned(),
        oip_category: None,
        confidence: 0.90,
        span: None,
        suggestion: None,
        timestamp: 0,
        depyler_version: "1.0".to_owned(),
    };

    let exports = vec![
        export_with_code("E0308", "known error"),
        export_with_code("E9999", "unknown error"),
    ];

    let examples = convert_to_training_examples(&exports);
    assert_eq!(examples.len(), 1);
    assert_eq!(examples[0].message, "known error");
}
1317
/// `MergeStrategy::default()` must be the `Append` variant.
#[test]
fn test_merge_strategy_default() {
    assert!(matches!(MergeStrategy::default(), MergeStrategy::Append));
}
1325
/// Constructs the `Append` variant and checks it pattern-matches as `Append`.
#[test]
fn test_merge_strategy_append() {
    let append = MergeStrategy::Append;
    let is_append = matches!(append, MergeStrategy::Append);
    assert!(is_append);
}
1331
/// Constructs the `Replace` variant and checks it pattern-matches as `Replace`.
#[test]
fn test_merge_strategy_replace() {
    let replace = MergeStrategy::Replace;
    let is_replace = matches!(replace, MergeStrategy::Replace);
    assert!(is_replace);
}
1337
/// The `Weighted` variant must hand back the multiplier it was built with.
#[test]
fn test_merge_strategy_weighted() {
    match MergeStrategy::Weighted(2) {
        MergeStrategy::Weighted(multiplier) => assert_eq!(multiplier, 2),
        _ => panic!("Expected MergeStrategy::Weighted"),
    }
}
1347
/// Pins the values produced by `CitlLoaderConfig::default()`:
/// batch size 128, minimum confidence 0.75, `Append` merge strategy,
/// shuffling enabled and weight 1.0.
#[test]
fn test_citl_loader_config_default() {
    let defaults = CitlLoaderConfig::default();

    assert_eq!(defaults.batch_size, 128);

    // Floating-point fields are compared with a small tolerance.
    let confidence_gap = (defaults.min_confidence - 0.75).abs();
    assert!(confidence_gap < 0.001);

    assert!(matches!(defaults.merge_strategy, MergeStrategy::Append));
    assert!(defaults.shuffle);

    let weight_gap = (defaults.weight - 1.0).abs();
    assert!(weight_gap < 0.001);
}
1359
/// Builds a `CitlLoaderConfig` with every field set explicitly and checks
/// that each field reads back the value it was given.
#[test]
fn test_citl_loader_config_custom() {
    let config = CitlLoaderConfig {
        batch_size: 512,
        min_confidence: 0.9,
        merge_strategy: MergeStrategy::Replace,
        shuffle: false,
        weight: 2.0,
    };
    assert_eq!(config.batch_size, 512);
    assert!((config.min_confidence - 0.9).abs() < 0.001);
    // Previously the merge strategy was set but never checked; verify it the
    // same way the default-config test does.
    assert!(matches!(config.merge_strategy, MergeStrategy::Replace));
    assert!(!config.shuffle);
    assert!((config.weight - 2.0).abs() < 0.001);
}
1374
/// A loader built with `CitlDataLoader::new()` reports a batch size of 128.
#[test]
fn test_citl_data_loader_new() {
    let loader = CitlDataLoader::new();
    let batch_size = loader.config().batch_size;
    assert_eq!(batch_size, 128);
}
1382
/// A loader built through `with_config` exposes the overridden batch size
/// and minimum confidence via `config()`.
#[test]
fn test_citl_data_loader_with_config() {
    let loader = CitlDataLoader::with_config(CitlLoaderConfig {
        batch_size: 256,
        min_confidence: 0.8,
        // Remaining fields keep their default values.
        ..CitlLoaderConfig::default()
    });

    let applied = loader.config();
    assert_eq!(applied.batch_size, 256);
    assert!((applied.min_confidence - 0.8).abs() < 0.001);
}
1394
/// A loader built with `CitlDataLoader::default()` reports a batch size of 128.
#[test]
fn test_citl_data_loader_default() {
    let loader = CitlDataLoader::default();
    let batch_size = loader.config().batch_size;
    assert_eq!(batch_size, 128);
}
1400
/// `load_jsonl` on the path `nonexistent.jsonl` must return an error.
#[test]
fn test_citl_data_loader_load_jsonl_not_found() {
    let loader = CitlDataLoader::new();
    let outcome = loader.load_jsonl("nonexistent.jsonl");
    let failed = outcome.is_err();
    assert!(failed);
}
1407
/// `load_parquet` on the path `nonexistent.parquet` must return an error.
#[test]
fn test_citl_data_loader_load_parquet_not_found() {
    let loader = CitlDataLoader::new();
    let outcome = loader.load_parquet("nonexistent.parquet");
    let failed = outcome.is_err();
    assert!(failed);
}
1414
/// Writes two well-formed JSONL records (one with a rustc error code, one
/// with a clippy lint) to a temporary file and checks that `load_jsonl`
/// returns two examples and reports two total / two imported records.
#[test]
fn test_citl_data_loader_load_jsonl_valid() {
    use std::io::Write;
    let temp_dir = tempfile::tempdir().unwrap();
    let file_path = temp_dir.path().join("valid.jsonl");

    // Scope the write handle so the file is closed before it is read back.
    {
        let mut file = std::fs::File::create(&file_path).unwrap();

        writeln!(file, r#"{{"source_file":"test.py","error_code":"E0308","clippy_lint":null,"level":"error","message":"type mismatch","oip_category":null,"confidence":0.95,"span":null,"suggestion":null,"timestamp":1732752000,"depyler_version":"1.0"}}"#).unwrap();
        writeln!(file, r#"{{"source_file":"test.py","error_code":null,"clippy_lint":"clippy::unwrap_used","level":"warning","message":"unwrap used","oip_category":null,"confidence":0.85,"span":null,"suggestion":null,"timestamp":1732752001,"depyler_version":"1.0"}}"#).unwrap();
    }

    let loader = CitlDataLoader::new();
    // `expect` instead of `assert!(is_ok())` + `unwrap()`: on failure the
    // underlying error is printed rather than a bare assertion message.
    let (examples, stats) = loader
        .load_jsonl(&file_path)
        .expect("loading a well-formed JSONL file should succeed");

    assert_eq!(examples.len(), 2);
    assert_eq!(stats.total_records, 2);
    assert_eq!(stats.imported, 2);
}
1435
/// Round-trips a two-row record batch through a temporary Parquet file and
/// checks that `load_parquet` yields exactly one example, labelled
/// `DefectCategory::TypeErrors`.
///
/// NOTE(review): the second row has a null `error_code`; presumably that is
/// why it is not converted into an example — confirm against the batch
/// conversion logic.
#[test]
fn test_citl_data_loader_load_parquet_valid() {
    use arrow::array::{Float32Array, Int64Array, StringArray};
    use arrow::datatypes::{DataType, Field, Schema};
    use parquet::arrow::ArrowWriter;
    use std::fs::File;
    use std::sync::Arc;

    let temp_dir = tempfile::tempdir().unwrap();
    let file_path = temp_dir.path().join("valid.parquet");

    // Only `error_code` is nullable in this schema.
    let schema = Arc::new(Schema::new(vec![
        Field::new("message", DataType::Utf8, false),
        Field::new("confidence", DataType::Float32, false),
        Field::new("error_code", DataType::Utf8, true),
        Field::new("timestamp", DataType::Int64, false),
    ]));

    let message_arr = StringArray::from(vec!["type mismatch", "api misuse"]);
    let confidence_arr = Float32Array::from(vec![0.95, 0.88]);
    let error_code_arr = StringArray::from(vec![Some("E0308"), None]);
    let timestamp_arr = Int64Array::from(vec![1732752000, 1732752001]);

    let batch = arrow::array::RecordBatch::try_new(
        Arc::clone(&schema),
        vec![
            Arc::new(message_arr),
            Arc::new(confidence_arr),
            Arc::new(error_code_arr),
            Arc::new(timestamp_arr),
        ],
    )
    .unwrap();

    // `close` finalises the Parquet file before it is read back.
    let file = File::create(&file_path).unwrap();
    let mut writer = ArrowWriter::try_new(file, schema, None).unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();

    let loader = CitlDataLoader::new();
    // `expect` instead of `assert!(is_ok())` + `unwrap()`: on failure the
    // underlying error is printed rather than a bare assertion message.
    let batches = loader
        .load_parquet(&file_path)
        .expect("loading a well-formed Parquet file should succeed");

    let all_examples: Vec<_> = batches.flatten().collect();
    assert_eq!(all_examples.len(), 1);
    assert_eq!(all_examples[0].label, DefectCategory::TypeErrors);
}
1490
/// A `SchemaValidation` built as valid with no missing fields reads back
/// as valid with an empty `missing_fields` list.
#[test]
fn test_schema_validation_valid() {
    let report = SchemaValidation {
        is_valid: true,
        missing_fields: Vec::new(),
        extra_fields: Vec::new(),
        format: String::from("parquet"),
    };

    assert!(report.is_valid);
    assert!(report.missing_fields.is_empty());
}
1504
/// A `SchemaValidation` built as invalid with two missing fields reads back
/// as invalid and reports both missing fields.
#[test]
fn test_schema_validation_invalid() {
    let missing = ["message", "confidence"]
        .iter()
        .map(|name| name.to_string())
        .collect();

    let report = SchemaValidation {
        is_valid: false,
        missing_fields: missing,
        extra_fields: Vec::new(),
        format: String::from("parquet"),
    };

    assert!(!report.is_valid);
    assert_eq!(report.missing_fields.len(), 2);
}
1516
/// `validate_citl_schema` must return an error for the path `test.csv`.
///
/// NOTE(review): `test.csv` is not created by this test, so the error may
/// come from the missing file rather than from the extension — confirm which
/// check `validate_citl_schema` performs first.
#[test]
fn test_validate_citl_schema_unsupported_format() {
    let outcome = validate_citl_schema("test.csv");
    let rejected = outcome.is_err();
    assert!(rejected);
}
1522
/// Writes a single complete JSONL record to a temporary `test.jsonl` and
/// checks that `validate_citl_schema` reports it as valid with format
/// `"jsonl"`.
#[test]
fn test_validate_citl_schema_jsonl_valid() {
    use std::io::Write;

    let scratch = tempfile::tempdir().unwrap();
    let jsonl_path = scratch.path().join("test.jsonl");
    let mut sink = std::fs::File::create(&jsonl_path).unwrap();
    writeln!(sink, r#"{{"source_file":"test.py","error_code":"E0308","clippy_lint":null,"level":"error","message":"test","oip_category":null,"confidence":0.9,"span":null,"suggestion":null,"timestamp":0,"depyler_version":"1.0"}}"#).unwrap();

    let validation = validate_citl_schema(&jsonl_path).unwrap();
    assert!(validation.is_valid);
    assert_eq!(validation.format, "jsonl");
}
1535
/// Creates an empty `empty.jsonl` in a temporary directory and checks that
/// `validate_citl_schema` returns an error for it.
#[test]
fn test_validate_citl_schema_empty_file() {
    let scratch = tempfile::tempdir().unwrap();
    let empty_path = scratch.path().join("empty.jsonl");
    // Creating the file without writing to it leaves it empty.
    let _handle = std::fs::File::create(&empty_path).unwrap();

    let outcome = validate_citl_schema(&empty_path);
    let rejected = outcome.is_err();
    assert!(rejected);
}
1545
/// Converting a record batch with zero rows must produce no examples.
#[test]
fn test_convert_batch_empty() {
    use arrow::array::RecordBatch;
    use arrow::datatypes::{DataType, Field, Schema};
    use std::sync::Arc;

    let fields = vec![
        Field::new("message", DataType::Utf8, false),
        Field::new("confidence", DataType::Float32, false),
    ];
    let empty_batch = RecordBatch::new_empty(Arc::new(Schema::new(fields)));

    let examples = convert_batch_to_examples(&empty_batch, 0.0);
    assert!(examples.is_empty());
}
1563
/// Converts a one-row batch (message, confidence, error code, timestamp)
/// with a 0.5 threshold and checks the resulting example's message, label
/// (`TypeErrors` for `E0308`) and confidence.
#[test]
fn test_convert_batch_with_data() {
    use arrow::array::{Float32Array, RecordBatch, StringArray};
    use arrow::datatypes::{DataType, Field, Schema};
    use std::sync::Arc;

    let fields = vec![
        Field::new("message", DataType::Utf8, false),
        Field::new("confidence", DataType::Float32, false),
        Field::new("error_code", DataType::Utf8, true),
        Field::new("timestamp", DataType::Int64, false),
    ];
    let schema = Arc::new(Schema::new(fields));

    // Columns are listed in the same order as the schema fields.
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(StringArray::from(vec!["type mismatch"])),
            Arc::new(Float32Array::from(vec![0.95])),
            Arc::new(StringArray::from(vec![Some("E0308")])),
            Arc::new(arrow::array::Int64Array::from(vec![1732752000])),
        ],
    )
    .unwrap();

    let examples = convert_batch_to_examples(&batch, 0.5);
    assert_eq!(examples.len(), 1);

    let converted = &examples[0];
    assert_eq!(converted.message, "type mismatch");
    assert_eq!(converted.label, DefectCategory::TypeErrors);
    assert!((converted.confidence - 0.95).abs() < 0.001);
}
1599
/// Converts a two-row batch with confidences 0.3 and 0.9 using a 0.5
/// threshold and checks that only the "high conf" row is returned.
#[test]
fn test_convert_batch_filters_low_confidence() {
    use arrow::array::{Float32Array, RecordBatch, StringArray};
    use arrow::datatypes::{DataType, Field, Schema};
    use std::sync::Arc;

    let fields = vec![
        Field::new("message", DataType::Utf8, false),
        Field::new("confidence", DataType::Float32, false),
        Field::new("error_code", DataType::Utf8, true),
    ];
    let schema = Arc::new(Schema::new(fields));

    // Both rows share the same error code; only the confidence differs.
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(StringArray::from(vec!["low conf", "high conf"])),
            Arc::new(Float32Array::from(vec![0.3, 0.9])),
            Arc::new(StringArray::from(vec![Some("E0308"), Some("E0308")])),
        ],
    )
    .unwrap();

    let examples = convert_batch_to_examples(&batch, 0.5);
    assert_eq!(examples.len(), 1);
    assert_eq!(examples[0].message, "high conf");
}
1630}