1use std::collections::HashMap;
35
36use datafusion::prelude::*;
37use regex::Regex;
38use serde::{Deserialize, Serialize};
39use tracing::{info, instrument};
40
41use crate::analyzers::errors::AnalyzerError;
42
43pub type InferenceResult<T> = Result<T, AnalyzerError>;
45
/// Tunable settings for the type-inference engine.
#[derive(Debug, Clone)]
pub struct InferenceConfig {
    /// Upper bound on the number of rows fetched from a column for analysis.
    pub sample_size: u64,
    /// Minimum share of non-null samples that must match a type for that type
    /// to be selected.
    pub confidence_threshold: f64,
    /// When `true`, precision and scale are recorded for plain decimal values.
    pub detect_decimal_precision: bool,
    /// Largest number of distinct values a column may have and still be
    /// considered categorical.
    pub categorical_threshold: usize,
    /// Toggle for international format handling.
    // NOTE(review): nothing in this file reads this flag; confirm intended use.
    pub international_formats: bool,
}
60
61impl Default for InferenceConfig {
62 fn default() -> Self {
63 Self {
64 sample_size: 1000,
65 confidence_threshold: 0.7,
66 detect_decimal_precision: true,
67 categorical_threshold: 100,
68 international_formats: true,
69 }
70 }
71}
72
73#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
75pub enum InferredDataType {
76 Integer { nullable: bool },
78 Float { nullable: bool },
80 Decimal { precision: u8, scale: u8 },
82 Boolean {
84 true_values: Vec<String>,
85 false_values: Vec<String>,
86 },
87 Date { format: String },
89 DateTime { format: String },
91 Time { format: String },
93 Categorical { cardinality: usize },
95 Text,
97 Mixed { types: HashMap<String, f64> },
99}
100
101impl InferredDataType {
102 pub fn is_nullable(&self) -> bool {
104 match self {
105 InferredDataType::Integer { nullable } => *nullable,
106 InferredDataType::Float { nullable } => *nullable,
107 InferredDataType::Decimal { .. } => true, _ => true, }
110 }
111
112 pub fn type_name(&self) -> &'static str {
114 match self {
115 InferredDataType::Integer { .. } => "Integer",
116 InferredDataType::Float { .. } => "Float",
117 InferredDataType::Decimal { .. } => "Decimal",
118 InferredDataType::Boolean { .. } => "Boolean",
119 InferredDataType::Date { .. } => "Date",
120 InferredDataType::DateTime { .. } => "DateTime",
121 InferredDataType::Time { .. } => "Time",
122 InferredDataType::Categorical { .. } => "Categorical",
123 InferredDataType::Text => "Text",
124 InferredDataType::Mixed { .. } => "Mixed",
125 }
126 }
127}
128
129#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct TypeInferenceResult {
132 pub inferred_type: InferredDataType,
134 pub confidence: f64,
136 pub samples_analyzed: usize,
138 pub null_count: usize,
140 pub alternatives: HashMap<String, f64>,
142}
143
/// Running tallies gathered while scanning a column's samples.
#[derive(Debug)]
pub struct TypeStats {
    /// Total number of samples seen, nulls included.
    pub total_samples: usize,
    /// Number of samples that were null or blank.
    pub null_count: usize,
    /// Samples that matched the integer pattern.
    pub integer_matches: usize,
    /// Samples counted as floating-point values.
    pub float_matches: usize,
    /// Samples that matched a boolean spelling.
    pub boolean_matches: usize,
    /// Samples that matched one of the date patterns.
    pub date_matches: usize,
    /// Samples that matched the date-time pattern.
    pub datetime_matches: usize,
    /// Samples that matched the time pattern.
    pub time_matches: usize,
    /// Occurrence count per distinct (trimmed) value.
    pub unique_values: HashMap<String, usize>,
    /// `(precision, scale)` recorded for decimal values, if any were seen.
    pub decimal_info: Option<(u8, u8)>,
    /// Spellings seen for boolean values: `(true spellings, false spellings)`.
    pub boolean_representations: (Vec<String>, Vec<String>),
    /// Format labels recorded for matching date, date-time and time samples.
    pub detected_formats: Vec<String>,
}
160
161impl Default for TypeStats {
162 fn default() -> Self {
163 Self::new()
164 }
165}
166
167impl TypeStats {
168 pub fn new() -> Self {
169 Self {
170 total_samples: 0,
171 null_count: 0,
172 integer_matches: 0,
173 float_matches: 0,
174 boolean_matches: 0,
175 date_matches: 0,
176 datetime_matches: 0,
177 time_matches: 0,
178 unique_values: HashMap::new(),
179 decimal_info: None,
180 boolean_representations: (Vec::new(), Vec::new()),
181 detected_formats: Vec::new(),
182 }
183 }
184}
185
186pub struct TypeInferenceEngineBuilder {
188 config: InferenceConfig,
189}
190
191impl TypeInferenceEngineBuilder {
192 pub fn sample_size(mut self, size: u64) -> Self {
194 self.config.sample_size = size;
195 self
196 }
197
198 pub fn confidence_threshold(mut self, threshold: f64) -> Self {
200 self.config.confidence_threshold = threshold;
201 self
202 }
203
204 pub fn detect_decimal_precision(mut self, enable: bool) -> Self {
206 self.config.detect_decimal_precision = enable;
207 self
208 }
209
210 pub fn categorical_threshold(mut self, threshold: usize) -> Self {
212 self.config.categorical_threshold = threshold;
213 self
214 }
215
216 pub fn international_formats(mut self, enable: bool) -> Self {
218 self.config.international_formats = enable;
219 self
220 }
221
222 pub fn build(self) -> TypeInferenceEngine {
224 TypeInferenceEngine {
225 config: self.config,
226 patterns: TypePatterns::new(),
227 }
228 }
229}
230
231struct TypePatterns {
233 integer: Regex,
234 float: Regex,
235 decimal: Regex,
236 date_iso: Regex,
237 date_us: Regex,
238 date_eu: Regex,
239 datetime_iso: Regex,
240 time: Regex,
241 boolean_true: Vec<Regex>,
242 boolean_false: Vec<Regex>,
243}
244
245impl TypePatterns {
246 fn new() -> Self {
247 Self {
248 integer: Regex::new(r"^[+-]?\d+$").unwrap(),
249 float: Regex::new(r"^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$").unwrap(),
250 decimal: Regex::new(r"^[+-]?\d+\.\d+$").unwrap(),
251 date_iso: Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(),
252 date_us: Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap(),
253 date_eu: Regex::new(r"^\d{1,2}\.\d{1,2}\.\d{4}$").unwrap(),
254 datetime_iso: Regex::new(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}").unwrap(),
255 time: Regex::new(r"^\d{1,2}:\d{2}(:\d{2})?(\s?(AM|PM))?$").unwrap(),
256 boolean_true: vec![Regex::new(r"(?i)^(true|t|yes|y|1|on|enabled?)$").unwrap()],
257 boolean_false: vec![Regex::new(r"(?i)^(false|f|no|n|0|off|disabled?)$").unwrap()],
258 }
259 }
260}
261
262pub struct TypeInferenceEngine {
264 config: InferenceConfig,
265 patterns: TypePatterns,
266}
267
268impl TypeInferenceEngine {
269 pub fn builder() -> TypeInferenceEngineBuilder {
271 TypeInferenceEngineBuilder {
272 config: InferenceConfig::default(),
273 }
274 }
275
276 pub fn new() -> Self {
278 Self::builder().build()
279 }
280
281 #[instrument(skip(self, ctx))]
283 pub async fn infer_column_type(
284 &self,
285 ctx: &SessionContext,
286 table_name: &str,
287 column_name: &str,
288 ) -> InferenceResult<TypeInferenceResult> {
289 info!(
290 table = table_name,
291 column = column_name,
292 sample_size = self.config.sample_size,
293 "Starting type inference"
294 );
295
296 let samples = self.collect_samples(ctx, table_name, column_name).await?;
298
299 let stats = self.analyze_samples(&samples);
301
302 let result = self.determine_type(&stats);
304
305 info!(
306 table = table_name,
307 column = column_name,
308 inferred_type = result.inferred_type.type_name(),
309 confidence = result.confidence,
310 samples = result.samples_analyzed,
311 "Completed type inference"
312 );
313
314 Ok(result)
315 }
316
317 #[instrument(skip(self, ctx))]
319 pub async fn infer_multiple_columns(
320 &self,
321 ctx: &SessionContext,
322 table_name: &str,
323 column_names: &[String],
324 ) -> InferenceResult<Vec<(String, TypeInferenceResult)>> {
325 let mut handles = Vec::new();
326
327 for column_name in column_names {
328 let ctx = ctx.clone();
329 let table_name = table_name.to_string();
330 let column_name = column_name.clone();
331 let engine = Self {
332 config: self.config.clone(),
333 patterns: TypePatterns::new(), };
335
336 let handle = tokio::spawn(async move {
337 let result = engine
338 .infer_column_type(&ctx, &table_name, &column_name)
339 .await?;
340 Ok::<_, AnalyzerError>((column_name, result))
341 });
342
343 handles.push(handle);
344 }
345
346 let mut results = Vec::new();
347 for handle in handles {
348 match handle.await {
349 Ok(Ok(result)) => results.push(result),
350 Ok(Err(e)) => return Err(e),
351 Err(e) => return Err(AnalyzerError::execution(format!("Task join error: {e}"))),
352 }
353 }
354
355 Ok(results)
356 }
357
358 async fn collect_samples(
360 &self,
361 ctx: &SessionContext,
362 table_name: &str,
363 column_name: &str,
364 ) -> InferenceResult<Vec<Option<String>>> {
365 let sql = format!(
366 "SELECT {column_name} FROM {table_name} LIMIT {}",
367 self.config.sample_size
368 );
369
370 let df = ctx
371 .sql(&sql)
372 .await
373 .map_err(|e| AnalyzerError::execution(e.to_string()))?;
374
375 let batches = df
376 .collect()
377 .await
378 .map_err(|e| AnalyzerError::execution(e.to_string()))?;
379
380 let mut samples = Vec::new();
381 for batch in &batches {
382 if batch.num_rows() > 0 {
383 let column_data = batch.column(0);
384 for i in 0..batch.num_rows() {
385 if column_data.is_null(i) {
386 samples.push(None);
387 } else {
388 let value = self.extract_string_value(column_data, i)?;
389 samples.push(Some(value));
390 }
391 }
392 }
393 }
394
395 Ok(samples)
396 }
397
398 fn extract_string_value(
400 &self,
401 column: &dyn arrow::array::Array,
402 row_idx: usize,
403 ) -> InferenceResult<String> {
404 if column.is_null(row_idx) {
405 return Ok("".to_string());
406 }
407
408 if let Some(arr) = column.as_any().downcast_ref::<arrow::array::StringArray>() {
409 Ok(arr.value(row_idx).to_string())
410 } else if let Some(arr) = column
411 .as_any()
412 .downcast_ref::<arrow::array::StringViewArray>()
413 {
414 Ok(arr.value(row_idx).to_string())
415 } else if let Some(arr) = column.as_any().downcast_ref::<arrow::array::Int64Array>() {
416 Ok(arr.value(row_idx).to_string())
417 } else if let Some(arr) = column.as_any().downcast_ref::<arrow::array::Float64Array>() {
418 Ok(arr.value(row_idx).to_string())
419 } else if let Some(arr) = column.as_any().downcast_ref::<arrow::array::BooleanArray>() {
420 Ok(arr.value(row_idx).to_string())
421 } else {
422 Ok("UNKNOWN".to_string())
424 }
425 }
426
427 fn analyze_samples(&self, samples: &[Option<String>]) -> TypeStats {
429 let mut stats = TypeStats::new();
430 stats.total_samples = samples.len();
431
432 for sample in samples {
433 match sample {
434 None => stats.null_count += 1,
435 Some(value) => {
436 let trimmed = value.trim();
437 if trimmed.is_empty() {
438 stats.null_count += 1;
439 continue;
440 }
441
442 *stats.unique_values.entry(trimmed.to_string()).or_insert(0) += 1;
444
445 self.test_patterns(trimmed, &mut stats);
447 }
448 }
449 }
450
451 stats
452 }
453
454 pub fn test_patterns(&self, value: &str, stats: &mut TypeStats) {
456 if self.patterns.integer.is_match(value) {
458 stats.integer_matches += 1;
459 }
460
461 if self.patterns.float.is_match(value) {
463 if !self.patterns.integer.is_match(value)
465 || value.contains('.')
466 || value.contains('e')
467 || value.contains('E')
468 {
469 stats.float_matches += 1;
470
471 if self.patterns.decimal.is_match(value) && self.config.detect_decimal_precision {
473 if let Some(dot_pos) = value.rfind('.') {
474 let fractional_part = &value[dot_pos + 1..];
475 let scale = fractional_part.len() as u8;
476 let precision = (value.len() - 1) as u8; stats.decimal_info = Some((precision.min(38), scale.min(38)));
479 }
480 }
481 }
482 }
483
484 if self.patterns.date_iso.is_match(value) {
486 stats.date_matches += 1;
487 stats.detected_formats.push("YYYY-MM-DD".to_string());
488 } else if self.patterns.date_us.is_match(value) {
489 stats.date_matches += 1;
490 stats.detected_formats.push("MM/DD/YYYY".to_string());
491 } else if self.patterns.date_eu.is_match(value) {
492 stats.date_matches += 1;
493 stats.detected_formats.push("DD.MM.YYYY".to_string());
494 }
495
496 if self.patterns.datetime_iso.is_match(value) {
498 stats.datetime_matches += 1;
499 stats
500 .detected_formats
501 .push("YYYY-MM-DD HH:MM:SS".to_string());
502 }
503
504 if self.patterns.time.is_match(value) {
506 stats.time_matches += 1;
507 stats.detected_formats.push("HH:MM:SS".to_string());
508 }
509
510 for pattern in &self.patterns.boolean_true {
512 if pattern.is_match(value) {
513 stats.boolean_matches += 1;
514 stats.boolean_representations.0.push(value.to_string());
515 break;
516 }
517 }
518 for pattern in &self.patterns.boolean_false {
519 if pattern.is_match(value) {
520 stats.boolean_matches += 1;
521 stats.boolean_representations.1.push(value.to_string());
522 break;
523 }
524 }
525 }
526
527 pub fn determine_type(&self, stats: &TypeStats) -> TypeInferenceResult {
529 let non_null_samples = stats.total_samples - stats.null_count;
530
531 if non_null_samples == 0 {
532 return TypeInferenceResult {
533 inferred_type: InferredDataType::Text,
534 confidence: 0.0,
535 samples_analyzed: stats.total_samples,
536 null_count: stats.null_count,
537 alternatives: HashMap::new(),
538 };
539 }
540
541 let mut alternatives = HashMap::new();
542
543 let integer_confidence = stats.integer_matches as f64 / non_null_samples as f64;
545 let float_confidence = stats.float_matches as f64 / non_null_samples as f64;
546 let boolean_confidence = stats.boolean_matches as f64 / non_null_samples as f64;
547 let date_confidence = stats.date_matches as f64 / non_null_samples as f64;
548 let datetime_confidence = stats.datetime_matches as f64 / non_null_samples as f64;
549 let time_confidence = stats.time_matches as f64 / non_null_samples as f64;
550
551 let is_categorical = stats.unique_values.len() <= self.config.categorical_threshold;
553 let categorical_confidence = if is_categorical { 1.0 } else { 0.0 };
554
555 if integer_confidence > 0.0 {
557 alternatives.insert("Integer".to_string(), integer_confidence);
558 }
559 if float_confidence > 0.0 {
560 alternatives.insert("Float".to_string(), float_confidence);
561 }
562 if boolean_confidence > 0.0 {
563 alternatives.insert("Boolean".to_string(), boolean_confidence);
564 }
565 if date_confidence > 0.0 {
566 alternatives.insert("Date".to_string(), date_confidence);
567 }
568 if datetime_confidence > 0.0 {
569 alternatives.insert("DateTime".to_string(), datetime_confidence);
570 }
571 if time_confidence > 0.0 {
572 alternatives.insert("Time".to_string(), time_confidence);
573 }
574 if categorical_confidence > 0.0 {
575 alternatives.insert("Categorical".to_string(), categorical_confidence);
576 }
577
578 let nullable = stats.null_count > 0;
580
581 let (inferred_type, confidence) = if datetime_confidence >= self.config.confidence_threshold
583 {
584 let format = stats
585 .detected_formats
586 .first()
587 .unwrap_or(&"YYYY-MM-DD HH:MM:SS".to_string())
588 .clone();
589 (InferredDataType::DateTime { format }, datetime_confidence)
590 } else if date_confidence >= self.config.confidence_threshold {
591 let format = stats
592 .detected_formats
593 .first()
594 .unwrap_or(&"YYYY-MM-DD".to_string())
595 .clone();
596 (InferredDataType::Date { format }, date_confidence)
597 } else if time_confidence >= self.config.confidence_threshold {
598 let format = stats
599 .detected_formats
600 .first()
601 .unwrap_or(&"HH:MM:SS".to_string())
602 .clone();
603 (InferredDataType::Time { format }, time_confidence)
604 } else if boolean_confidence >= self.config.confidence_threshold {
605 let (true_values, false_values) = &stats.boolean_representations;
606 (
607 InferredDataType::Boolean {
608 true_values: true_values.clone(),
609 false_values: false_values.clone(),
610 },
611 boolean_confidence,
612 )
613 } else if float_confidence >= self.config.confidence_threshold && stats.float_matches > 0 {
614 if let Some((precision, scale)) = stats.decimal_info {
616 (
617 InferredDataType::Decimal { precision, scale },
618 float_confidence,
619 )
620 } else {
621 (InferredDataType::Float { nullable }, float_confidence)
622 }
623 } else if integer_confidence >= self.config.confidence_threshold {
624 (InferredDataType::Integer { nullable }, integer_confidence)
625 } else if is_categorical && stats.unique_values.len() > 1 {
626 (
627 InferredDataType::Categorical {
628 cardinality: stats.unique_values.len(),
629 },
630 categorical_confidence,
631 )
632 } else {
633 let mixed_types = alternatives
635 .iter()
636 .filter(|(_, &conf)| conf > 0.1) .map(|(name, &conf)| (name.clone(), conf))
638 .collect::<HashMap<_, _>>();
639
640 if mixed_types.len() > 1 {
641 let max_confidence = mixed_types.values().fold(0.0f64, |a, &b| a.max(b));
642 (
643 InferredDataType::Mixed { types: mixed_types },
644 max_confidence,
645 )
646 } else {
647 (InferredDataType::Text, 1.0)
648 }
649 };
650
651 TypeInferenceResult {
652 inferred_type,
653 confidence,
654 samples_analyzed: stats.total_samples,
655 null_count: stats.null_count,
656 alternatives,
657 }
658 }
659}
660
661impl Default for TypeInferenceEngine {
662 fn default() -> Self {
663 Self::new()
664 }
665}
666
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder setters must each land in the engine's configuration.
    #[test]
    fn test_inference_engine_builder() {
        let engine = TypeInferenceEngine::builder()
            .sample_size(500)
            .confidence_threshold(0.8)
            .categorical_threshold(50)
            .detect_decimal_precision(false)
            .international_formats(false)
            .build();

        assert_eq!(engine.config.sample_size, 500);
        assert_eq!(engine.config.confidence_threshold, 0.8);
        assert_eq!(engine.config.categorical_threshold, 50);
        assert!(!engine.config.detect_decimal_precision);
        assert!(!engine.config.international_formats);
    }

    /// Spot checks for each family of patterns.
    #[test]
    fn test_type_pattern_matching() {
        let patterns = TypePatterns::new();

        // Integers.
        assert!(patterns.integer.is_match("123"));
        assert!(patterns.integer.is_match("-456"));
        assert!(patterns.integer.is_match("+789"));
        assert!(!patterns.integer.is_match("12.34"));

        // Floats.
        assert!(patterns.float.is_match("12.34"));
        assert!(patterns.float.is_match("1.23e10"));
        assert!(patterns.float.is_match(".5"));
        assert!(patterns.float.is_match("123."));

        // Dates.
        assert!(patterns.date_iso.is_match("2023-12-25"));
        assert!(patterns.date_us.is_match("12/25/2023"));
        assert!(patterns.date_eu.is_match("25.12.2023"));

        // Booleans.
        assert!(patterns.boolean_true[0].is_match("true"));
        assert!(patterns.boolean_true[0].is_match("YES"));
        assert!(patterns.boolean_true[0].is_match("1"));
        assert!(patterns.boolean_false[0].is_match("false"));
        assert!(patterns.boolean_false[0].is_match("NO"));
        assert!(patterns.boolean_false[0].is_match("0"));
    }

    #[test]
    fn test_inferred_data_type_methods() {
        let int_type = InferredDataType::Integer { nullable: true };
        assert!(int_type.is_nullable());
        assert_eq!(int_type.type_name(), "Integer");

        let float_type = InferredDataType::Float { nullable: false };
        assert!(!float_type.is_nullable());
        assert_eq!(float_type.type_name(), "Float");

        let bool_type = InferredDataType::Boolean {
            true_values: vec!["yes".to_string()],
            false_values: vec!["no".to_string()],
        };
        assert!(bool_type.is_nullable());
        assert_eq!(bool_type.type_name(), "Boolean");
    }

    #[test]
    fn test_type_stats_creation() {
        let stats = TypeStats::new();
        assert_eq!(stats.total_samples, 0);
        assert_eq!(stats.null_count, 0);
        assert_eq!(stats.integer_matches, 0);
        assert!(stats.unique_values.is_empty());
    }

    #[test]
    fn test_analyze_samples_with_nulls() {
        let engine = TypeInferenceEngine::new();
        let samples = vec![
            Some("123".to_string()),
            None,
            Some("456".to_string()),
            None,
            Some("789".to_string()),
        ];

        let stats = engine.analyze_samples(&samples);
        assert_eq!(stats.total_samples, 5);
        assert_eq!(stats.null_count, 2);
        assert_eq!(stats.integer_matches, 3);
    }

    #[test]
    fn test_analyze_samples_all_nulls() {
        let engine = TypeInferenceEngine::new();
        let samples = vec![None, None, None];

        let stats = engine.analyze_samples(&samples);
        assert_eq!(stats.total_samples, 3);
        assert_eq!(stats.null_count, 3);
        assert_eq!(stats.integer_matches, 0);
    }

    #[test]
    fn test_analyze_samples_mixed_types() {
        let engine = TypeInferenceEngine::new();
        let samples = vec![
            Some("123".to_string()),
            Some("45.67".to_string()),
            Some("true".to_string()),
            Some("2023-12-25".to_string()),
            Some("hello".to_string()),
        ];

        let stats = engine.analyze_samples(&samples);
        assert_eq!(stats.total_samples, 5);
        assert_eq!(stats.null_count, 0);
        assert_eq!(stats.integer_matches, 1);
        // Only "45.67" counts: integer-valued samples are not tallied as floats.
        assert_eq!(stats.float_matches, 1);
        assert_eq!(stats.boolean_matches, 1);
        assert_eq!(stats.date_matches, 1);
    }

    #[test]
    fn test_determine_type_all_nulls() {
        let engine = TypeInferenceEngine::new();
        let mut stats = TypeStats::new();
        stats.total_samples = 3;
        stats.null_count = 3;

        let result = engine.determine_type(&stats);
        assert!(matches!(result.inferred_type, InferredDataType::Text));
        assert_eq!(result.confidence, 0.0);
        assert_eq!(result.null_count, 3);
    }

    #[test]
    fn test_determine_type_single_value() {
        let engine = TypeInferenceEngine::new();
        let mut stats = TypeStats::new();
        stats.total_samples = 1;
        stats.null_count = 0;
        stats.integer_matches = 1;
        stats.unique_values.insert("42".to_string(), 1);

        let result = engine.determine_type(&stats);
        assert!(matches!(
            result.inferred_type,
            InferredDataType::Integer { .. }
        ));
        assert_eq!(result.confidence, 1.0);
    }

    #[test]
    fn test_determine_type_boolean_detection() {
        let engine = TypeInferenceEngine::new();
        let mut stats = TypeStats::new();
        stats.total_samples = 4;
        stats.null_count = 0;
        stats.boolean_matches = 4;
        stats.boolean_representations = (
            vec!["true".to_string(), "yes".to_string()],
            vec!["false".to_string(), "no".to_string()],
        );

        let result = engine.determine_type(&stats);
        assert_eq!(result.confidence, 1.0);

        match result.inferred_type {
            InferredDataType::Boolean {
                true_values,
                false_values,
            } => {
                assert!(!true_values.is_empty());
                assert!(!false_values.is_empty());
            }
            other => panic!("expected Boolean, got {other:?}"),
        }
    }

    #[test]
    fn test_determine_type_categorical_vs_text() {
        let engine = TypeInferenceEngine::builder()
            .categorical_threshold(3)
            .build();

        // Three distinct values: within the threshold, so categorical.
        let mut stats_categorical = TypeStats::new();
        stats_categorical.total_samples = 10;
        stats_categorical.null_count = 0;
        stats_categorical.unique_values.insert("A".to_string(), 5);
        stats_categorical.unique_values.insert("B".to_string(), 3);
        stats_categorical.unique_values.insert("C".to_string(), 2);

        let result_categorical = engine.determine_type(&stats_categorical);
        assert!(matches!(
            result_categorical.inferred_type,
            InferredDataType::Categorical { .. }
        ));

        // Ten distinct values: over the threshold, so plain text.
        let mut stats_text = TypeStats::new();
        stats_text.total_samples = 10;
        stats_text.null_count = 0;
        for i in 0..10 {
            stats_text.unique_values.insert(format!("text_{i}"), 1);
        }

        let result_text = engine.determine_type(&stats_text);
        assert!(matches!(result_text.inferred_type, InferredDataType::Text));
    }

    #[test]
    fn test_determine_type_decimal_precision() {
        let engine = TypeInferenceEngine::builder()
            .detect_decimal_precision(true)
            .build();

        let mut stats = TypeStats::new();
        stats.total_samples = 3;
        stats.null_count = 0;
        stats.float_matches = 3;
        stats.decimal_info = Some((5, 2));

        let result = engine.determine_type(&stats);

        match result.inferred_type {
            InferredDataType::Decimal { precision, scale } => {
                assert_eq!(precision, 5);
                assert_eq!(scale, 2);
            }
            other => panic!("expected Decimal, got {other:?}"),
        }
    }

    #[test]
    fn test_determine_type_mixed_types() {
        // A high threshold keeps any single candidate from winning outright.
        let engine = TypeInferenceEngine::builder()
            .confidence_threshold(0.9)
            .build();

        let mut stats = TypeStats::new();
        stats.total_samples = 10;
        stats.null_count = 0;
        stats.integer_matches = 3;
        stats.float_matches = 4;
        stats.boolean_matches = 3;

        let result = engine.determine_type(&stats);
        assert!(result.confidence > 0.0);

        match result.inferred_type {
            InferredDataType::Mixed { types } => {
                assert!(types.len() > 1);
                assert!(types.contains_key("Integer"));
                assert!(types.contains_key("Float"));
                assert!(types.contains_key("Boolean"));
            }
            other => panic!("expected Mixed, got {other:?}"),
        }
    }

    #[test]
    fn test_date_format_detection() {
        let patterns = TypePatterns::new();

        // ISO dates.
        assert!(patterns.date_iso.is_match("2023-12-25"));
        assert!(!patterns.date_iso.is_match("12/25/2023"));

        // Slash-separated dates.
        assert!(patterns.date_us.is_match("12/25/2023"));
        assert!(patterns.date_us.is_match("1/1/2023"));
        assert!(!patterns.date_us.is_match("2023-12-25"));

        // Dot-separated dates.
        assert!(patterns.date_eu.is_match("25.12.2023"));
        assert!(patterns.date_eu.is_match("1.1.2023"));
        assert!(!patterns.date_eu.is_match("2023-12-25"));

        // Date-times.
        assert!(patterns.datetime_iso.is_match("2023-12-25T10:30:00"));
        assert!(patterns.datetime_iso.is_match("2023-12-25 10:30:00"));
        assert!(!patterns.datetime_iso.is_match("2023-12-25"));
    }

    #[test]
    fn test_boolean_representations() {
        let patterns = TypePatterns::new();

        let true_cases = [
            "true", "TRUE", "True", "t", "T", "yes", "YES", "y", "Y", "1", "on", "enabled",
        ];
        for case in true_cases {
            assert!(
                patterns.boolean_true[0].is_match(case),
                "Failed to match true case: {case}"
            );
        }

        let false_cases = [
            "false", "FALSE", "False", "f", "F", "no", "NO", "n", "N", "0", "off", "disabled",
        ];
        for case in false_cases {
            assert!(
                patterns.boolean_false[0].is_match(case),
                "Failed to match false case: {case}"
            );
        }
    }

    #[test]
    fn test_numeric_edge_cases() {
        let patterns = TypePatterns::new();

        // Integer edge cases.
        assert!(patterns.integer.is_match("0"));
        assert!(patterns.integer.is_match("-0"));
        assert!(patterns.integer.is_match("+0"));
        assert!(patterns.integer.is_match("9223372036854775807"));

        // Float edge cases.
        assert!(patterns.float.is_match("0.0"));
        assert!(patterns.float.is_match(".0"));
        assert!(patterns.float.is_match("0."));
        assert!(patterns.float.is_match("1e10"));
        assert!(patterns.float.is_match("1E-10"));
        assert!(patterns.float.is_match("-1.23e+45"));

        // Non-numeric input.
        assert!(!patterns.integer.is_match(""));
        assert!(!patterns.integer.is_match("abc"));
        assert!(!patterns.float.is_match(""));
        assert!(!patterns.float.is_match("abc"));
    }

    #[test]
    fn test_confidence_calculation() {
        let engine = TypeInferenceEngine::new();

        // Every sample matches.
        let mut stats_perfect = TypeStats::new();
        stats_perfect.total_samples = 5;
        stats_perfect.null_count = 0;
        stats_perfect.integer_matches = 5;

        let result_perfect = engine.determine_type(&stats_perfect);
        assert_eq!(result_perfect.confidence, 1.0);

        // Only part of the samples match.
        let mut stats_partial = TypeStats::new();
        stats_partial.total_samples = 10;
        stats_partial.null_count = 0;
        stats_partial.integer_matches = 6;

        let result_partial = engine.determine_type(&stats_partial);
        assert!(result_partial.confidence >= 0.6);
    }

    #[test]
    fn test_empty_samples() {
        let engine = TypeInferenceEngine::new();
        let samples: Vec<Option<String>> = vec![];

        let stats = engine.analyze_samples(&samples);
        assert_eq!(stats.total_samples, 0);
        assert_eq!(stats.null_count, 0);

        let result = engine.determine_type(&stats);
        assert!(matches!(result.inferred_type, InferredDataType::Text));
        assert_eq!(result.confidence, 0.0);
    }

    #[test]
    fn test_whitespace_handling() {
        let engine = TypeInferenceEngine::new();
        let samples = vec![
            Some(" 123 ".to_string()),
            Some("\t456\n".to_string()),
            Some(" ".to_string()),
            Some("".to_string()),
        ];

        let stats = engine.analyze_samples(&samples);
        assert_eq!(stats.total_samples, 4);
        // Blank and whitespace-only values count as nulls.
        assert_eq!(stats.null_count, 2);
        // Surrounding whitespace is trimmed before matching.
        assert_eq!(stats.integer_matches, 2);
    }
}