term_guard/analyzers/
inference.rs

1//! Data type inference engine for robust type detection from string data.
2//!
3//! This module provides a comprehensive data type inference system that can detect:
4//! - Numeric types (Integer, Float, Decimal)
5//! - Temporal types (Date, DateTime, Time)
6//! - Boolean values with various representations
7//! - Categorical vs. free text strings
8//! - Mixed type columns with confidence scores
9//!
10//! # Example
11//!
12//! ```rust,ignore
13//! use term_guard::analyzers::inference::{TypeInferenceEngine, InferredDataType};
14//! use term_guard::test_fixtures::create_minimal_tpc_h_context;
15//!
16//! # tokio::runtime::Runtime::new().unwrap().block_on(async {
17//! let engine = TypeInferenceEngine::builder()
18//!     .sample_size(1000)
19//!     .confidence_threshold(0.8)
20//!     .build();
21//!
22//! let ctx = create_minimal_tpc_h_context().await.unwrap();
23//! let inference = engine.infer_column_type(&ctx, "lineitem", "l_quantity").await.unwrap();
24//!
25//! match inference.inferred_type {
26//!     InferredDataType::Float { nullable } => println!("Detected float type, nullable: {nullable}"),
27//!     _ => println!("Detected other type"),
28//! }
29//!
30//! println!("Confidence: {:.2}", inference.confidence);
31//! # })
32//! ```
33
34use std::collections::HashMap;
35
36use datafusion::prelude::*;
37use regex::Regex;
38use serde::{Deserialize, Serialize};
39use tracing::{info, instrument};
40
41use crate::analyzers::errors::AnalyzerError;
42
43/// Result type for type inference operations
44pub type InferenceResult<T> = Result<T, AnalyzerError>;
45
/// Tunable settings that control how the type inference engine samples and classifies a column.
#[derive(Clone, Debug)]
pub struct InferenceConfig {
    /// Maximum number of rows sampled from the column for type detection (default: 1000).
    pub sample_size: u64,
    /// Minimum share of non-null samples that must match a type before it is selected
    /// (default: 0.7).
    pub confidence_threshold: f64,
    /// Whether precision and scale are recorded for decimal-looking values (default: true).
    pub detect_decimal_precision: bool,
    /// Largest number of distinct values a column may hold and still count as categorical
    /// (default: 100).
    pub categorical_threshold: usize,
    /// Toggle for international number format detection (default: true).
    pub international_formats: bool,
}

impl Default for InferenceConfig {
    /// Produces the defaults documented on each field.
    fn default() -> Self {
        InferenceConfig {
            confidence_threshold: 0.7,
            categorical_threshold: 100,
            sample_size: 1000,
            detect_decimal_precision: true,
            international_formats: true,
        }
    }
}
72
73/// Inferred data type with specific metadata
74#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
75pub enum InferredDataType {
76    /// Integer numbers
77    Integer { nullable: bool },
78    /// Floating point numbers
79    Float { nullable: bool },
80    /// Decimal numbers with precision and scale
81    Decimal { precision: u8, scale: u8 },
82    /// Boolean values with detected representations
83    Boolean {
84        true_values: Vec<String>,
85        false_values: Vec<String>,
86    },
87    /// Date values with detected format
88    Date { format: String },
89    /// DateTime values with detected format
90    DateTime { format: String },
91    /// Time values with detected format
92    Time { format: String },
93    /// Categorical data with known cardinality
94    Categorical { cardinality: usize },
95    /// Free text data
96    Text,
97    /// Mixed types with confidence scores for each type
98    Mixed { types: HashMap<String, f64> },
99}
100
101impl InferredDataType {
102    /// Check if the type is nullable
103    pub fn is_nullable(&self) -> bool {
104        match self {
105            InferredDataType::Integer { nullable } => *nullable,
106            InferredDataType::Float { nullable } => *nullable,
107            InferredDataType::Decimal { .. } => true, // Decimals can always be null
108            _ => true,                                // Most types can be nullable
109        }
110    }
111
112    /// Get the base type name as a string
113    pub fn type_name(&self) -> &'static str {
114        match self {
115            InferredDataType::Integer { .. } => "Integer",
116            InferredDataType::Float { .. } => "Float",
117            InferredDataType::Decimal { .. } => "Decimal",
118            InferredDataType::Boolean { .. } => "Boolean",
119            InferredDataType::Date { .. } => "Date",
120            InferredDataType::DateTime { .. } => "DateTime",
121            InferredDataType::Time { .. } => "Time",
122            InferredDataType::Categorical { .. } => "Categorical",
123            InferredDataType::Text => "Text",
124            InferredDataType::Mixed { .. } => "Mixed",
125        }
126    }
127}
128
129/// Type inference result with confidence score
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct TypeInferenceResult {
132    /// The inferred data type
133    pub inferred_type: InferredDataType,
134    /// Confidence score (0.0 to 1.0)
135    pub confidence: f64,
136    /// Number of samples analyzed
137    pub samples_analyzed: usize,
138    /// Number of null values encountered
139    pub null_count: usize,
140    /// Alternative types considered with their scores
141    pub alternatives: HashMap<String, f64>,
142}
143
/// Running tallies gathered while classifying the sampled values of one column.
#[derive(Debug, Default)]
pub struct TypeStats {
    /// Number of samples examined, nulls included.
    pub total_samples: usize,
    /// Number of samples counted as null.
    pub null_count: usize,
    /// Samples that matched the integer pattern.
    pub integer_matches: usize,
    /// Samples counted as floating point.
    pub float_matches: usize,
    /// Samples that matched a boolean spelling.
    pub boolean_matches: usize,
    /// Samples that matched one of the date layouts.
    pub date_matches: usize,
    /// Samples that matched the date-time layout.
    pub datetime_matches: usize,
    /// Samples that matched the time-of-day layout.
    pub time_matches: usize,
    /// Occurrence count of every distinct non-null value.
    pub unique_values: HashMap<String, usize>,
    /// Decimal `(precision, scale)`, when one has been recorded.
    pub decimal_info: Option<(u8, u8)>,
    /// Boolean spellings that were seen, as `(true_values, false_values)`.
    pub boolean_representations: (Vec<String>, Vec<String>),
    /// Labels of the date/time layouts that were matched.
    pub detected_formats: Vec<String>,
}

impl TypeStats {
    /// Creates empty statistics: every counter is zero and nothing has been recorded yet.
    pub fn new() -> Self {
        Self::default()
    }
}
185
186/// Builder for TypeInferenceEngine
187pub struct TypeInferenceEngineBuilder {
188    config: InferenceConfig,
189}
190
191impl TypeInferenceEngineBuilder {
192    /// Set the sample size for type detection
193    pub fn sample_size(mut self, size: u64) -> Self {
194        self.config.sample_size = size;
195        self
196    }
197
198    /// Set the confidence threshold
199    pub fn confidence_threshold(mut self, threshold: f64) -> Self {
200        self.config.confidence_threshold = threshold;
201        self
202    }
203
204    /// Enable or disable decimal precision detection
205    pub fn detect_decimal_precision(mut self, enable: bool) -> Self {
206        self.config.detect_decimal_precision = enable;
207        self
208    }
209
210    /// Set the categorical cardinality threshold
211    pub fn categorical_threshold(mut self, threshold: usize) -> Self {
212        self.config.categorical_threshold = threshold;
213        self
214    }
215
216    /// Enable or disable international format detection
217    pub fn international_formats(mut self, enable: bool) -> Self {
218        self.config.international_formats = enable;
219        self
220    }
221
222    /// Build the TypeInferenceEngine
223    pub fn build(self) -> TypeInferenceEngine {
224        TypeInferenceEngine {
225            config: self.config,
226            patterns: TypePatterns::new(),
227        }
228    }
229}
230
231/// Pattern matching utilities for type detection
232struct TypePatterns {
233    integer: Regex,
234    float: Regex,
235    decimal: Regex,
236    date_iso: Regex,
237    date_us: Regex,
238    date_eu: Regex,
239    datetime_iso: Regex,
240    time: Regex,
241    boolean_true: Vec<Regex>,
242    boolean_false: Vec<Regex>,
243}
244
245impl TypePatterns {
246    fn new() -> Self {
247        Self {
248            integer: Regex::new(r"^[+-]?\d+$").unwrap(),
249            float: Regex::new(r"^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$").unwrap(),
250            decimal: Regex::new(r"^[+-]?\d+\.\d+$").unwrap(),
251            date_iso: Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(),
252            date_us: Regex::new(r"^\d{1,2}/\d{1,2}/\d{4}$").unwrap(),
253            date_eu: Regex::new(r"^\d{1,2}\.\d{1,2}\.\d{4}$").unwrap(),
254            datetime_iso: Regex::new(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}").unwrap(),
255            time: Regex::new(r"^\d{1,2}:\d{2}(:\d{2})?(\s?(AM|PM))?$").unwrap(),
256            boolean_true: vec![Regex::new(r"(?i)^(true|t|yes|y|1|on|enabled?)$").unwrap()],
257            boolean_false: vec![Regex::new(r"(?i)^(false|f|no|n|0|off|disabled?)$").unwrap()],
258        }
259    }
260}
261
262/// Main type inference engine
263pub struct TypeInferenceEngine {
264    config: InferenceConfig,
265    patterns: TypePatterns,
266}
267
268impl TypeInferenceEngine {
269    /// Create a new builder for TypeInferenceEngine
270    pub fn builder() -> TypeInferenceEngineBuilder {
271        TypeInferenceEngineBuilder {
272            config: InferenceConfig::default(),
273        }
274    }
275
276    /// Create a TypeInferenceEngine with default configuration
277    pub fn new() -> Self {
278        Self::builder().build()
279    }
280
281    /// Infer the data type of a column in a table
282    #[instrument(skip(self, ctx))]
283    pub async fn infer_column_type(
284        &self,
285        ctx: &SessionContext,
286        table_name: &str,
287        column_name: &str,
288    ) -> InferenceResult<TypeInferenceResult> {
289        info!(
290            table = table_name,
291            column = column_name,
292            sample_size = self.config.sample_size,
293            "Starting type inference"
294        );
295
296        // Sample data for analysis
297        let samples = self.collect_samples(ctx, table_name, column_name).await?;
298
299        // Analyze the samples
300        let stats = self.analyze_samples(&samples);
301
302        // Determine the best type match
303        let result = self.determine_type(&stats);
304
305        info!(
306            table = table_name,
307            column = column_name,
308            inferred_type = result.inferred_type.type_name(),
309            confidence = result.confidence,
310            samples = result.samples_analyzed,
311            "Completed type inference"
312        );
313
314        Ok(result)
315    }
316
317    /// Infer types for multiple columns in parallel
318    #[instrument(skip(self, ctx))]
319    pub async fn infer_multiple_columns(
320        &self,
321        ctx: &SessionContext,
322        table_name: &str,
323        column_names: &[String],
324    ) -> InferenceResult<Vec<(String, TypeInferenceResult)>> {
325        let mut handles = Vec::new();
326
327        for column_name in column_names {
328            let ctx = ctx.clone();
329            let table_name = table_name.to_string();
330            let column_name = column_name.clone();
331            let engine = Self {
332                config: self.config.clone(),
333                patterns: TypePatterns::new(), // Create new patterns for each task
334            };
335
336            let handle = tokio::spawn(async move {
337                let result = engine
338                    .infer_column_type(&ctx, &table_name, &column_name)
339                    .await?;
340                Ok::<_, AnalyzerError>((column_name, result))
341            });
342
343            handles.push(handle);
344        }
345
346        let mut results = Vec::new();
347        for handle in handles {
348            match handle.await {
349                Ok(Ok(result)) => results.push(result),
350                Ok(Err(e)) => return Err(e),
351                Err(e) => return Err(AnalyzerError::execution(format!("Task join error: {e}"))),
352            }
353        }
354
355        Ok(results)
356    }
357
358    /// Collect sample data from the specified column
359    async fn collect_samples(
360        &self,
361        ctx: &SessionContext,
362        table_name: &str,
363        column_name: &str,
364    ) -> InferenceResult<Vec<Option<String>>> {
365        let sql = format!(
366            "SELECT {column_name} FROM {table_name} LIMIT {}",
367            self.config.sample_size
368        );
369
370        let df = ctx
371            .sql(&sql)
372            .await
373            .map_err(|e| AnalyzerError::execution(e.to_string()))?;
374
375        let batches = df
376            .collect()
377            .await
378            .map_err(|e| AnalyzerError::execution(e.to_string()))?;
379
380        let mut samples = Vec::new();
381        for batch in &batches {
382            if batch.num_rows() > 0 {
383                let column_data = batch.column(0);
384                for i in 0..batch.num_rows() {
385                    if column_data.is_null(i) {
386                        samples.push(None);
387                    } else {
388                        let value = self.extract_string_value(column_data, i)?;
389                        samples.push(Some(value));
390                    }
391                }
392            }
393        }
394
395        Ok(samples)
396    }
397
398    /// Extract string value from Arrow column
399    fn extract_string_value(
400        &self,
401        column: &dyn arrow::array::Array,
402        row_idx: usize,
403    ) -> InferenceResult<String> {
404        if column.is_null(row_idx) {
405            return Ok("".to_string());
406        }
407
408        if let Some(arr) = column.as_any().downcast_ref::<arrow::array::StringArray>() {
409            Ok(arr.value(row_idx).to_string())
410        } else if let Some(arr) = column
411            .as_any()
412            .downcast_ref::<arrow::array::StringViewArray>()
413        {
414            Ok(arr.value(row_idx).to_string())
415        } else if let Some(arr) = column.as_any().downcast_ref::<arrow::array::Int64Array>() {
416            Ok(arr.value(row_idx).to_string())
417        } else if let Some(arr) = column.as_any().downcast_ref::<arrow::array::Float64Array>() {
418            Ok(arr.value(row_idx).to_string())
419        } else if let Some(arr) = column.as_any().downcast_ref::<arrow::array::BooleanArray>() {
420            Ok(arr.value(row_idx).to_string())
421        } else {
422            // Generic fallback
423            Ok("UNKNOWN".to_string())
424        }
425    }
426
427    /// Analyze collected samples to gather type statistics
428    fn analyze_samples(&self, samples: &[Option<String>]) -> TypeStats {
429        let mut stats = TypeStats::new();
430        stats.total_samples = samples.len();
431
432        for sample in samples {
433            match sample {
434                None => stats.null_count += 1,
435                Some(value) => {
436                    let trimmed = value.trim();
437                    if trimmed.is_empty() {
438                        stats.null_count += 1;
439                        continue;
440                    }
441
442                    // Track unique values for categorical detection
443                    *stats.unique_values.entry(trimmed.to_string()).or_insert(0) += 1;
444
445                    // Test against various patterns
446                    self.test_patterns(trimmed, &mut stats);
447                }
448            }
449        }
450
451        stats
452    }
453
454    /// Test a value against all type patterns
455    pub fn test_patterns(&self, value: &str, stats: &mut TypeStats) {
456        // Integer test
457        if self.patterns.integer.is_match(value) {
458            stats.integer_matches += 1;
459        }
460
461        // Float test - but only count it if it's actually a float (has decimal point or scientific notation)
462        if self.patterns.float.is_match(value) {
463            // Only count as float if it's not a pure integer
464            if !self.patterns.integer.is_match(value)
465                || value.contains('.')
466                || value.contains('e')
467                || value.contains('E')
468            {
469                stats.float_matches += 1;
470
471                // Check for decimal precision if it's a decimal
472                if self.patterns.decimal.is_match(value) && self.config.detect_decimal_precision {
473                    if let Some(dot_pos) = value.rfind('.') {
474                        let fractional_part = &value[dot_pos + 1..];
475                        let scale = fractional_part.len() as u8;
476                        let precision = (value.len() - 1) as u8; // -1 for the dot
477
478                        stats.decimal_info = Some((precision.min(38), scale.min(38)));
479                    }
480                }
481            }
482        }
483
484        // Date tests
485        if self.patterns.date_iso.is_match(value) {
486            stats.date_matches += 1;
487            stats.detected_formats.push("YYYY-MM-DD".to_string());
488        } else if self.patterns.date_us.is_match(value) {
489            stats.date_matches += 1;
490            stats.detected_formats.push("MM/DD/YYYY".to_string());
491        } else if self.patterns.date_eu.is_match(value) {
492            stats.date_matches += 1;
493            stats.detected_formats.push("DD.MM.YYYY".to_string());
494        }
495
496        // DateTime test
497        if self.patterns.datetime_iso.is_match(value) {
498            stats.datetime_matches += 1;
499            stats
500                .detected_formats
501                .push("YYYY-MM-DD HH:MM:SS".to_string());
502        }
503
504        // Time test
505        if self.patterns.time.is_match(value) {
506            stats.time_matches += 1;
507            stats.detected_formats.push("HH:MM:SS".to_string());
508        }
509
510        // Boolean tests
511        for pattern in &self.patterns.boolean_true {
512            if pattern.is_match(value) {
513                stats.boolean_matches += 1;
514                stats.boolean_representations.0.push(value.to_string());
515                break;
516            }
517        }
518        for pattern in &self.patterns.boolean_false {
519            if pattern.is_match(value) {
520                stats.boolean_matches += 1;
521                stats.boolean_representations.1.push(value.to_string());
522                break;
523            }
524        }
525    }
526
527    /// Determine the best type match from statistics
528    pub fn determine_type(&self, stats: &TypeStats) -> TypeInferenceResult {
529        let non_null_samples = stats.total_samples - stats.null_count;
530
531        if non_null_samples == 0 {
532            return TypeInferenceResult {
533                inferred_type: InferredDataType::Text,
534                confidence: 0.0,
535                samples_analyzed: stats.total_samples,
536                null_count: stats.null_count,
537                alternatives: HashMap::new(),
538            };
539        }
540
541        let mut alternatives = HashMap::new();
542
543        // Calculate confidence scores for each type
544        let integer_confidence = stats.integer_matches as f64 / non_null_samples as f64;
545        let float_confidence = stats.float_matches as f64 / non_null_samples as f64;
546        let boolean_confidence = stats.boolean_matches as f64 / non_null_samples as f64;
547        let date_confidence = stats.date_matches as f64 / non_null_samples as f64;
548        let datetime_confidence = stats.datetime_matches as f64 / non_null_samples as f64;
549        let time_confidence = stats.time_matches as f64 / non_null_samples as f64;
550
551        // Categorical vs Text decision
552        let is_categorical = stats.unique_values.len() <= self.config.categorical_threshold;
553        let categorical_confidence = if is_categorical { 1.0 } else { 0.0 };
554
555        // Add alternatives
556        if integer_confidence > 0.0 {
557            alternatives.insert("Integer".to_string(), integer_confidence);
558        }
559        if float_confidence > 0.0 {
560            alternatives.insert("Float".to_string(), float_confidence);
561        }
562        if boolean_confidence > 0.0 {
563            alternatives.insert("Boolean".to_string(), boolean_confidence);
564        }
565        if date_confidence > 0.0 {
566            alternatives.insert("Date".to_string(), date_confidence);
567        }
568        if datetime_confidence > 0.0 {
569            alternatives.insert("DateTime".to_string(), datetime_confidence);
570        }
571        if time_confidence > 0.0 {
572            alternatives.insert("Time".to_string(), time_confidence);
573        }
574        if categorical_confidence > 0.0 {
575            alternatives.insert("Categorical".to_string(), categorical_confidence);
576        }
577
578        // Determine the best type based on highest confidence
579        let nullable = stats.null_count > 0;
580
581        // Priority order: DateTime > Date > Time > Boolean > Decimal > Float > Integer > Categorical > Text
582        let (inferred_type, confidence) = if datetime_confidence >= self.config.confidence_threshold
583        {
584            let format = stats
585                .detected_formats
586                .first()
587                .unwrap_or(&"YYYY-MM-DD HH:MM:SS".to_string())
588                .clone();
589            (InferredDataType::DateTime { format }, datetime_confidence)
590        } else if date_confidence >= self.config.confidence_threshold {
591            let format = stats
592                .detected_formats
593                .first()
594                .unwrap_or(&"YYYY-MM-DD".to_string())
595                .clone();
596            (InferredDataType::Date { format }, date_confidence)
597        } else if time_confidence >= self.config.confidence_threshold {
598            let format = stats
599                .detected_formats
600                .first()
601                .unwrap_or(&"HH:MM:SS".to_string())
602                .clone();
603            (InferredDataType::Time { format }, time_confidence)
604        } else if boolean_confidence >= self.config.confidence_threshold {
605            let (true_values, false_values) = &stats.boolean_representations;
606            (
607                InferredDataType::Boolean {
608                    true_values: true_values.clone(),
609                    false_values: false_values.clone(),
610                },
611                boolean_confidence,
612            )
613        } else if float_confidence >= self.config.confidence_threshold && stats.float_matches > 0 {
614            // Check if we should prefer decimal over float
615            if let Some((precision, scale)) = stats.decimal_info {
616                (
617                    InferredDataType::Decimal { precision, scale },
618                    float_confidence,
619                )
620            } else {
621                (InferredDataType::Float { nullable }, float_confidence)
622            }
623        } else if integer_confidence >= self.config.confidence_threshold {
624            (InferredDataType::Integer { nullable }, integer_confidence)
625        } else if is_categorical && stats.unique_values.len() > 1 {
626            (
627                InferredDataType::Categorical {
628                    cardinality: stats.unique_values.len(),
629                },
630                categorical_confidence,
631            )
632        } else {
633            // Check for mixed types
634            let mixed_types = alternatives
635                .iter()
636                .filter(|(_, &conf)| conf > 0.1) // At least 10% confidence
637                .map(|(name, &conf)| (name.clone(), conf))
638                .collect::<HashMap<_, _>>();
639
640            if mixed_types.len() > 1 {
641                let max_confidence = mixed_types.values().fold(0.0f64, |a, &b| a.max(b));
642                (
643                    InferredDataType::Mixed { types: mixed_types },
644                    max_confidence,
645                )
646            } else {
647                (InferredDataType::Text, 1.0)
648            }
649        };
650
651        TypeInferenceResult {
652            inferred_type,
653            confidence,
654            samples_analyzed: stats.total_samples,
655            null_count: stats.null_count,
656            alternatives,
657        }
658    }
659}
660
661impl Default for TypeInferenceEngine {
662    fn default() -> Self {
663        Self::new()
664    }
665}
666
667#[cfg(test)]
668mod tests {
669    use super::*;
670
671    #[tokio::test]
672    async fn test_inference_engine_builder() {
673        let engine = TypeInferenceEngine::builder()
674            .sample_size(500)
675            .confidence_threshold(0.8)
676            .categorical_threshold(50)
677            .detect_decimal_precision(false)
678            .international_formats(false)
679            .build();
680
681        assert_eq!(engine.config.sample_size, 500);
682        assert_eq!(engine.config.confidence_threshold, 0.8);
683        assert_eq!(engine.config.categorical_threshold, 50);
684        assert!(!engine.config.detect_decimal_precision);
685        assert!(!engine.config.international_formats);
686    }
687
688    #[tokio::test]
689    async fn test_type_pattern_matching() {
690        let patterns = TypePatterns::new();
691
692        // Integer tests
693        assert!(patterns.integer.is_match("123"));
694        assert!(patterns.integer.is_match("-456"));
695        assert!(patterns.integer.is_match("+789"));
696        assert!(!patterns.integer.is_match("12.34"));
697
698        // Float tests
699        assert!(patterns.float.is_match("12.34"));
700        assert!(patterns.float.is_match("1.23e10"));
701        assert!(patterns.float.is_match(".5"));
702        assert!(patterns.float.is_match("123."));
703
704        // Date tests
705        assert!(patterns.date_iso.is_match("2023-12-25"));
706        assert!(patterns.date_us.is_match("12/25/2023"));
707        assert!(patterns.date_eu.is_match("25.12.2023"));
708
709        // Boolean tests
710        assert!(patterns.boolean_true[0].is_match("true"));
711        assert!(patterns.boolean_true[0].is_match("YES"));
712        assert!(patterns.boolean_true[0].is_match("1"));
713        assert!(patterns.boolean_false[0].is_match("false"));
714        assert!(patterns.boolean_false[0].is_match("NO"));
715        assert!(patterns.boolean_false[0].is_match("0"));
716    }
717
718    #[test]
719    fn test_inferred_data_type_methods() {
720        let int_type = InferredDataType::Integer { nullable: true };
721        assert!(int_type.is_nullable());
722        assert_eq!(int_type.type_name(), "Integer");
723
724        let float_type = InferredDataType::Float { nullable: false };
725        assert!(!float_type.is_nullable());
726        assert_eq!(float_type.type_name(), "Float");
727
728        let bool_type = InferredDataType::Boolean {
729            true_values: vec!["yes".to_string()],
730            false_values: vec!["no".to_string()],
731        };
732        assert!(bool_type.is_nullable());
733        assert_eq!(bool_type.type_name(), "Boolean");
734    }
735
736    #[test]
737    fn test_type_stats_creation() {
738        let stats = TypeStats::new();
739        assert_eq!(stats.total_samples, 0);
740        assert_eq!(stats.null_count, 0);
741        assert_eq!(stats.integer_matches, 0);
742        assert!(stats.unique_values.is_empty());
743    }
744
745    #[test]
746    fn test_analyze_samples_with_nulls() {
747        let engine = TypeInferenceEngine::new();
748        let samples = vec![
749            Some("123".to_string()),
750            None,
751            Some("456".to_string()),
752            None,
753            Some("789".to_string()),
754        ];
755
756        let stats = engine.analyze_samples(&samples);
757        assert_eq!(stats.total_samples, 5);
758        assert_eq!(stats.null_count, 2);
759        assert_eq!(stats.integer_matches, 3);
760    }
761
762    #[test]
763    fn test_analyze_samples_all_nulls() {
764        let engine = TypeInferenceEngine::new();
765        let samples = vec![None, None, None];
766
767        let stats = engine.analyze_samples(&samples);
768        assert_eq!(stats.total_samples, 3);
769        assert_eq!(stats.null_count, 3);
770        assert_eq!(stats.integer_matches, 0);
771    }
772
773    #[test]
774    fn test_analyze_samples_mixed_types() {
775        let engine = TypeInferenceEngine::new();
776        let samples = vec![
777            Some("123".to_string()),        // Integer (also matches float)
778            Some("45.67".to_string()),      // Float
779            Some("true".to_string()),       // Boolean
780            Some("2023-12-25".to_string()), // Date
781            Some("hello".to_string()),      // Text
782        ];
783
784        let stats = engine.analyze_samples(&samples);
785        assert_eq!(stats.total_samples, 5);
786        assert_eq!(stats.null_count, 0);
787        assert_eq!(stats.integer_matches, 1);
788        assert_eq!(stats.float_matches, 1); // Only "45.67" matches float (not "123" anymore)
789        assert_eq!(stats.boolean_matches, 1);
790        assert_eq!(stats.date_matches, 1);
791    }
792
793    #[test]
794    fn test_determine_type_all_nulls() {
795        let engine = TypeInferenceEngine::new();
796        let mut stats = TypeStats::new();
797        stats.total_samples = 3;
798        stats.null_count = 3;
799
800        let result = engine.determine_type(&stats);
801        assert!(matches!(result.inferred_type, InferredDataType::Text));
802        assert_eq!(result.confidence, 0.0);
803        assert_eq!(result.null_count, 3);
804    }
805
806    #[test]
807    fn test_determine_type_single_value() {
808        let engine = TypeInferenceEngine::new();
809        let mut stats = TypeStats::new();
810        stats.total_samples = 1;
811        stats.null_count = 0;
812        stats.integer_matches = 1;
813        stats.unique_values.insert("42".to_string(), 1);
814
815        let result = engine.determine_type(&stats);
816        assert!(matches!(
817            result.inferred_type,
818            InferredDataType::Integer { .. }
819        ));
820        assert_eq!(result.confidence, 1.0);
821    }
822
823    #[test]
824    fn test_determine_type_boolean_detection() {
825        let engine = TypeInferenceEngine::new();
826        let mut stats = TypeStats::new();
827        stats.total_samples = 4;
828        stats.null_count = 0;
829        stats.boolean_matches = 4;
830        stats.boolean_representations = (
831            vec!["true".to_string(), "yes".to_string()],
832            vec!["false".to_string(), "no".to_string()],
833        );
834
835        let result = engine.determine_type(&stats);
836        assert!(matches!(
837            result.inferred_type,
838            InferredDataType::Boolean { .. }
839        ));
840        assert_eq!(result.confidence, 1.0);
841
842        if let InferredDataType::Boolean {
843            true_values,
844            false_values,
845        } = result.inferred_type
846        {
847            assert!(!true_values.is_empty());
848            assert!(!false_values.is_empty());
849        }
850    }
851
852    #[test]
853    fn test_determine_type_categorical_vs_text() {
854        let engine = TypeInferenceEngine::builder()
855            .categorical_threshold(3)
856            .build();
857
858        // Test categorical (low cardinality)
859        let mut stats_categorical = TypeStats::new();
860        stats_categorical.total_samples = 10;
861        stats_categorical.null_count = 0;
862        stats_categorical.unique_values.insert("A".to_string(), 5);
863        stats_categorical.unique_values.insert("B".to_string(), 3);
864        stats_categorical.unique_values.insert("C".to_string(), 2);
865
866        let result_categorical = engine.determine_type(&stats_categorical);
867        assert!(matches!(
868            result_categorical.inferred_type,
869            InferredDataType::Categorical { .. }
870        ));
871
872        // Test text (high cardinality)
873        let mut stats_text = TypeStats::new();
874        stats_text.total_samples = 10;
875        stats_text.null_count = 0;
876        for i in 0..10 {
877            stats_text.unique_values.insert(format!("text_{i}"), 1);
878        }
879
880        let result_text = engine.determine_type(&stats_text);
881        assert!(matches!(result_text.inferred_type, InferredDataType::Text));
882    }
883
884    #[test]
885    fn test_determine_type_decimal_precision() {
886        let engine = TypeInferenceEngine::builder()
887            .detect_decimal_precision(true)
888            .build();
889
890        let mut stats = TypeStats::new();
891        stats.total_samples = 3;
892        stats.null_count = 0;
893        stats.float_matches = 3;
894        stats.decimal_info = Some((5, 2)); // precision=5, scale=2
895
896        let result = engine.determine_type(&stats);
897        assert!(matches!(
898            result.inferred_type,
899            InferredDataType::Decimal { .. }
900        ));
901
902        if let InferredDataType::Decimal { precision, scale } = result.inferred_type {
903            assert_eq!(precision, 5);
904            assert_eq!(scale, 2);
905        }
906    }
907
908    #[test]
909    fn test_determine_type_mixed_types() {
910        let engine = TypeInferenceEngine::builder()
911            .confidence_threshold(0.9) // High threshold to force mixed detection
912            .build();
913
914        let mut stats = TypeStats::new();
915        stats.total_samples = 10;
916        stats.null_count = 0;
917        stats.integer_matches = 3; // 30% integers
918        stats.float_matches = 4; // 40% floats
919        stats.boolean_matches = 3; // 30% booleans
920
921        let result = engine.determine_type(&stats);
922
923        // Should detect as mixed type due to no single type having >90% confidence
924        match result.inferred_type {
925            InferredDataType::Mixed { types } => {
926                assert!(!types.is_empty());
927                assert!(types.len() > 1);
928            }
929            _ => {
930                // Or it might detect the highest confidence type
931                assert!(result.confidence > 0.0);
932            }
933        }
934    }
935
936    #[test]
937    fn test_date_format_detection() {
938        let patterns = TypePatterns::new();
939
940        // ISO format
941        assert!(patterns.date_iso.is_match("2023-12-25"));
942        assert!(!patterns.date_iso.is_match("12/25/2023"));
943
944        // US format
945        assert!(patterns.date_us.is_match("12/25/2023"));
946        assert!(patterns.date_us.is_match("1/1/2023"));
947        assert!(!patterns.date_us.is_match("2023-12-25"));
948
949        // EU format
950        assert!(patterns.date_eu.is_match("25.12.2023"));
951        assert!(patterns.date_eu.is_match("1.1.2023"));
952        assert!(!patterns.date_eu.is_match("2023-12-25"));
953
954        // DateTime format
955        assert!(patterns.datetime_iso.is_match("2023-12-25T10:30:00"));
956        assert!(patterns.datetime_iso.is_match("2023-12-25 10:30:00"));
957        assert!(!patterns.datetime_iso.is_match("2023-12-25"));
958    }
959
960    #[test]
961    fn test_boolean_representations() {
962        let patterns = TypePatterns::new();
963
964        // True values
965        let true_cases = vec![
966            "true", "TRUE", "True", "t", "T", "yes", "YES", "y", "Y", "1", "on", "enabled",
967        ];
968        for case in true_cases {
969            assert!(
970                patterns.boolean_true[0].is_match(case),
971                "Failed to match true case: {case}"
972            );
973        }
974
975        // False values
976        let false_cases = vec![
977            "false", "FALSE", "False", "f", "F", "no", "NO", "n", "N", "0", "off", "disabled",
978        ];
979        for case in false_cases {
980            assert!(
981                patterns.boolean_false[0].is_match(case),
982                "Failed to match false case: {case}"
983            );
984        }
985    }
986
987    #[test]
988    fn test_numeric_edge_cases() {
989        let patterns = TypePatterns::new();
990
991        // Integer edge cases
992        assert!(patterns.integer.is_match("0"));
993        assert!(patterns.integer.is_match("-0"));
994        assert!(patterns.integer.is_match("+0"));
995        assert!(patterns.integer.is_match("9223372036854775807")); // max i64
996
997        // Float edge cases
998        assert!(patterns.float.is_match("0.0"));
999        assert!(patterns.float.is_match(".0"));
1000        assert!(patterns.float.is_match("0."));
1001        assert!(patterns.float.is_match("1e10"));
1002        assert!(patterns.float.is_match("1E-10"));
1003        assert!(patterns.float.is_match("-1.23e+45"));
1004
1005        // Invalid cases
1006        assert!(!patterns.integer.is_match(""));
1007        assert!(!patterns.integer.is_match("abc"));
1008        assert!(!patterns.float.is_match(""));
1009        assert!(!patterns.float.is_match("abc"));
1010    }
1011
1012    #[test]
1013    fn test_confidence_calculation() {
1014        let engine = TypeInferenceEngine::new();
1015
1016        // Perfect match (100% integers)
1017        let mut stats_perfect = TypeStats::new();
1018        stats_perfect.total_samples = 5;
1019        stats_perfect.null_count = 0;
1020        stats_perfect.integer_matches = 5;
1021
1022        let result_perfect = engine.determine_type(&stats_perfect);
1023        assert_eq!(result_perfect.confidence, 1.0);
1024
1025        // Partial match (60% integers)
1026        let mut stats_partial = TypeStats::new();
1027        stats_partial.total_samples = 10;
1028        stats_partial.null_count = 0;
1029        stats_partial.integer_matches = 6;
1030
1031        let result_partial = engine.determine_type(&stats_partial);
1032        assert!(result_partial.confidence >= 0.6);
1033    }
1034
1035    #[test]
1036    fn test_empty_samples() {
1037        let engine = TypeInferenceEngine::new();
1038        let samples: Vec<Option<String>> = vec![];
1039
1040        let stats = engine.analyze_samples(&samples);
1041        assert_eq!(stats.total_samples, 0);
1042        assert_eq!(stats.null_count, 0);
1043
1044        let result = engine.determine_type(&stats);
1045        assert!(matches!(result.inferred_type, InferredDataType::Text));
1046        assert_eq!(result.confidence, 0.0);
1047    }
1048
1049    #[test]
1050    fn test_whitespace_handling() {
1051        let engine = TypeInferenceEngine::new();
1052        let samples = vec![
1053            Some("  123  ".to_string()), // Should be trimmed to "123"
1054            Some("\t456\n".to_string()), // Should be trimmed to "456"
1055            Some("   ".to_string()),     // Should be treated as null
1056            Some("".to_string()),        // Should be treated as null
1057        ];
1058
1059        let stats = engine.analyze_samples(&samples);
1060        assert_eq!(stats.total_samples, 4);
1061        assert_eq!(stats.null_count, 2); // Empty and whitespace-only
1062        assert_eq!(stats.integer_matches, 2); // "123" and "456"
1063    }
1064}