sql_cli/data/
type_inference.rs

1//! Shared type inference logic for data loaders
2//!
3//! This module provides centralized type detection logic to ensure
4//! consistent behavior across CSV, JSON, and other data sources.
5
6use regex::Regex;
7use std::sync::LazyLock;
8
9/// Static compiled regex patterns for date detection
10/// Using LazyLock for thread-safe initialization
11static DATE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
12    vec![
13        // YYYY-MM-DD (year must be 19xx or 20xx, month 01-12, day 01-31)
14        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$").unwrap(),
15        // MM/DD/YYYY
16        Regex::new(r"^(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])/(19|20)\d{2}$").unwrap(),
17        // DD/MM/YYYY
18        Regex::new(r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}$").unwrap(),
19        // DD-MM-YYYY
20        Regex::new(r"^(0[1-9]|[12]\d|3[01])-(0[1-9]|1[0-2])-(19|20)\d{2}$").unwrap(),
21        // YYYY/MM/DD
22        Regex::new(r"^(19|20)\d{2}/(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])$").unwrap(),
23        // ISO 8601 with time: YYYY-MM-DDTHH:MM:SS
24        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}")
25            .unwrap(),
26        // ISO 8601 with timezone: YYYY-MM-DDTHH:MM:SS+/-HH:MM or Z
27        Regex::new(
28            r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$",
29        )
30        .unwrap(),
31    ]
32});
33
34/// Detected data type for a value or column
35#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub enum InferredType {
37    Boolean,
38    Integer,
39    Float,
40    DateTime,
41    String,
42    Null,
43}
44
45/// Type inference utilities
46pub struct TypeInference;
47
48impl TypeInference {
49    /// Infer the type of a single string value
50    ///
51    /// This is the main entry point for type detection.
52    /// Order of checks is important for performance and accuracy.
53    pub fn infer_from_string(value: &str) -> InferredType {
54        // Empty values are null
55        if value.is_empty() {
56            return InferredType::Null;
57        }
58
59        // Check boolean first (fast string comparison)
60        if value.eq_ignore_ascii_case("true") || value.eq_ignore_ascii_case("false") {
61            return InferredType::Boolean;
62        }
63
64        // Try integer (common case, relatively fast)
65        if value.parse::<i64>().is_ok() {
66            return InferredType::Integer;
67        }
68
69        // Try float (includes scientific notation)
70        if value.parse::<f64>().is_ok() {
71            return InferredType::Float;
72        }
73
74        // Check if it looks like a datetime
75        // This is the most expensive check, so we do it last
76        if Self::looks_like_datetime(value) {
77            return InferredType::DateTime;
78        }
79
80        // Default to string
81        InferredType::String
82    }
83
84    /// Check if a string looks like a datetime value
85    ///
86    /// Uses strict regex patterns to avoid false positives with ID strings
87    /// like "BQ-123456" or "ORDER-2024-001"
88    pub fn looks_like_datetime(value: &str) -> bool {
89        // Quick length check - dates are typically 8-30 chars
90        if value.len() < 8 || value.len() > 35 {
91            return false;
92        }
93
94        // Check against our compiled patterns
95        DATE_PATTERNS.iter().any(|pattern| pattern.is_match(value))
96    }
97
98    /// Merge two types when a column has mixed types
99    ///
100    /// Rules:
101    /// - Same type -> keep it
102    /// - Null with anything -> the other type
103    /// - Integer + Float -> Float
104    /// - Any numeric + String -> String
105    /// - DateTime + String -> String
106    /// - Everything else -> String
107    pub fn merge_types(type1: InferredType, type2: InferredType) -> InferredType {
108        use InferredType::*;
109
110        match (type1, type2) {
111            // Same type
112            (t1, t2) if t1 == t2 => t1,
113
114            // Null merges to the other type
115            (Null, t) | (t, Null) => t,
116
117            // Integer and Float -> Float
118            (Integer, Float) | (Float, Integer) => Float,
119
120            // Boolean stays boolean only with itself or null
121            (Boolean, _) | (_, Boolean) => String,
122
123            // DateTime only compatible with itself or null
124            (DateTime, _) | (_, DateTime) => String,
125
126            // Default to String for mixed types
127            _ => String,
128        }
129    }
130
131    /// Infer type from multiple sample values
132    ///
133    /// Useful for determining column type from a sample of rows.
134    /// Returns the most specific type that fits all non-null values.
135    pub fn infer_from_samples<'a, I>(values: I) -> InferredType
136    where
137        I: Iterator<Item = &'a str>,
138    {
139        let mut result_type = InferredType::Null;
140
141        for value in values {
142            let value_type = Self::infer_from_string(value);
143            result_type = Self::merge_types(result_type, value_type);
144
145            // Early exit if we've degraded to String
146            if result_type == InferredType::String {
147                break;
148            }
149        }
150
151        result_type
152    }
153
154    /// Check if a value can be coerced to a specific type
155    pub fn can_coerce_to(value: &str, target_type: InferredType) -> bool {
156        match target_type {
157            InferredType::Boolean => {
158                value.eq_ignore_ascii_case("true")
159                    || value.eq_ignore_ascii_case("false")
160                    || value == "0"
161                    || value == "1"
162            }
163            InferredType::Integer => value.parse::<i64>().is_ok(),
164            InferredType::Float => value.parse::<f64>().is_ok(),
165            InferredType::DateTime => Self::looks_like_datetime(value),
166            InferredType::String => true, // Everything can be a string
167            InferredType::Null => value.is_empty(),
168        }
169    }
170}
171
172#[cfg(test)]
173mod tests {
174    use super::*;
175
176    #[test]
177    fn test_basic_type_inference() {
178        assert_eq!(
179            TypeInference::infer_from_string("123"),
180            InferredType::Integer
181        );
182        assert_eq!(
183            TypeInference::infer_from_string("123.45"),
184            InferredType::Float
185        );
186        assert_eq!(
187            TypeInference::infer_from_string("true"),
188            InferredType::Boolean
189        );
190        assert_eq!(
191            TypeInference::infer_from_string("FALSE"),
192            InferredType::Boolean
193        );
194        assert_eq!(
195            TypeInference::infer_from_string("hello"),
196            InferredType::String
197        );
198        assert_eq!(TypeInference::infer_from_string(""), InferredType::Null);
199    }
200
201    #[test]
202    fn test_datetime_detection() {
203        // Valid dates should be detected
204        assert_eq!(
205            TypeInference::infer_from_string("2024-01-15"),
206            InferredType::DateTime
207        );
208        assert_eq!(
209            TypeInference::infer_from_string("01/15/2024"),
210            InferredType::DateTime
211        );
212        assert_eq!(
213            TypeInference::infer_from_string("15-01-2024"),
214            InferredType::DateTime
215        );
216        assert_eq!(
217            TypeInference::infer_from_string("2024-01-15T10:30:00"),
218            InferredType::DateTime
219        );
220        assert_eq!(
221            TypeInference::infer_from_string("2024-01-15T10:30:00Z"),
222            InferredType::DateTime
223        );
224    }
225
226    #[test]
227    fn test_id_strings_not_detected_as_datetime() {
228        // These should be detected as String, not DateTime
229        assert_eq!(
230            TypeInference::infer_from_string("BQ-81198596"),
231            InferredType::String
232        );
233        assert_eq!(
234            TypeInference::infer_from_string("ORDER-2024-001"),
235            InferredType::String
236        );
237        assert_eq!(
238            TypeInference::infer_from_string("ID-123-456"),
239            InferredType::String
240        );
241        assert_eq!(
242            TypeInference::infer_from_string("ABC-DEF-GHI"),
243            InferredType::String
244        );
245        assert_eq!(
246            TypeInference::infer_from_string("2024-ABC-123"),
247            InferredType::String
248        );
249    }
250
251    #[test]
252    fn test_invalid_dates_not_detected() {
253        // Invalid month/day combinations
254        assert_eq!(
255            TypeInference::infer_from_string("2024-13-01"), // Month 13
256            InferredType::String
257        );
258        assert_eq!(
259            TypeInference::infer_from_string("2024-00-15"), // Month 00
260            InferredType::String
261        );
262        assert_eq!(
263            TypeInference::infer_from_string("2024-01-32"), // Day 32
264            InferredType::String
265        );
266        assert_eq!(
267            TypeInference::infer_from_string("2024-01-00"), // Day 00
268            InferredType::String
269        );
270    }
271
272    #[test]
273    fn test_type_merging() {
274        use InferredType::*;
275
276        // Same type
277        assert_eq!(TypeInference::merge_types(Integer, Integer), Integer);
278        assert_eq!(TypeInference::merge_types(String, String), String);
279
280        // Null with anything
281        assert_eq!(TypeInference::merge_types(Null, Integer), Integer);
282        assert_eq!(TypeInference::merge_types(Float, Null), Float);
283
284        // Integer and Float
285        assert_eq!(TypeInference::merge_types(Integer, Float), Float);
286        assert_eq!(TypeInference::merge_types(Float, Integer), Float);
287
288        // Mixed types degrade to String
289        assert_eq!(TypeInference::merge_types(Integer, String), String);
290        assert_eq!(TypeInference::merge_types(DateTime, Integer), String);
291        assert_eq!(TypeInference::merge_types(Boolean, Float), String);
292    }
293
294    #[test]
295    fn test_infer_from_samples() {
296        // All integers
297        let samples = vec!["1", "2", "3", "4", "5"];
298        assert_eq!(
299            TypeInference::infer_from_samples(samples.into_iter()),
300            InferredType::Integer
301        );
302
303        // Mixed integer and float
304        let samples = vec!["1", "2.5", "3", "4.0"];
305        assert_eq!(
306            TypeInference::infer_from_samples(samples.into_iter()),
307            InferredType::Float
308        );
309
310        // Mixed types degrade to string
311        let samples = vec!["1", "hello", "3"];
312        assert_eq!(
313            TypeInference::infer_from_samples(samples.into_iter()),
314            InferredType::String
315        );
316
317        // With nulls (empty strings)
318        let samples = vec!["", "1", "", "2", "3"];
319        assert_eq!(
320            TypeInference::infer_from_samples(samples.into_iter()),
321            InferredType::Integer
322        );
323    }
324
325    #[test]
326    fn test_can_coerce() {
327        // Boolean coercion
328        assert!(TypeInference::can_coerce_to("true", InferredType::Boolean));
329        assert!(TypeInference::can_coerce_to("1", InferredType::Boolean));
330        assert!(TypeInference::can_coerce_to("0", InferredType::Boolean));
331        assert!(!TypeInference::can_coerce_to(
332            "hello",
333            InferredType::Boolean
334        ));
335
336        // Integer coercion
337        assert!(TypeInference::can_coerce_to("123", InferredType::Integer));
338        assert!(!TypeInference::can_coerce_to(
339            "123.45",
340            InferredType::Integer
341        ));
342        assert!(!TypeInference::can_coerce_to(
343            "hello",
344            InferredType::Integer
345        ));
346
347        // Everything can be a string
348        assert!(TypeInference::can_coerce_to("123", InferredType::String));
349        assert!(TypeInference::can_coerce_to("hello", InferredType::String));
350        assert!(TypeInference::can_coerce_to("", InferredType::String));
351    }
352}