sql_cli/data/
type_inference.rs

1//! Shared type inference logic for data loaders
2//!
3//! This module provides centralized type detection logic to ensure
4//! consistent behavior across CSV, JSON, and other data sources.
5
6use regex::Regex;
7use std::sync::LazyLock;
8
9/// Static compiled regex patterns for date detection
10/// Using `LazyLock` for thread-safe initialization
11static DATE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
12    vec![
13        // YYYY-MM-DD (year must be 19xx or 20xx, month 01-12, day 01-31)
14        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$").unwrap(),
15        // MM/DD/YYYY
16        Regex::new(r"^(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])/(19|20)\d{2}$").unwrap(),
17        // DD/MM/YYYY
18        Regex::new(r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}$").unwrap(),
19        // DD-MM-YYYY
20        Regex::new(r"^(0[1-9]|[12]\d|3[01])-(0[1-9]|1[0-2])-(19|20)\d{2}$").unwrap(),
21        // YYYY/MM/DD
22        Regex::new(r"^(19|20)\d{2}/(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])$").unwrap(),
23        // ISO 8601 with time: YYYY-MM-DDTHH:MM:SS
24        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}")
25            .unwrap(),
26        // ISO 8601 with timezone: YYYY-MM-DDTHH:MM:SS+/-HH:MM or Z
27        Regex::new(
28            r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$",
29        )
30        .unwrap(),
31    ]
32});
33
34/// Detected data type for a value or column
35#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub enum InferredType {
37    Boolean,
38    Integer,
39    Float,
40    DateTime,
41    String,
42    Null,
43}
44
45/// Type inference utilities
46pub struct TypeInference;
47
48impl TypeInference {
49    /// Infer the type of a single string value
50    ///
51    /// This is the main entry point for type detection.
52    /// Order of checks is important for performance and accuracy.
53    #[must_use]
54    pub fn infer_from_string(value: &str) -> InferredType {
55        // Empty values are null
56        if value.is_empty() {
57            return InferredType::Null;
58        }
59
60        // Check boolean first (fast string comparison)
61        if value.eq_ignore_ascii_case("true") || value.eq_ignore_ascii_case("false") {
62            return InferredType::Boolean;
63        }
64
65        // Try integer (common case, relatively fast)
66        if value.parse::<i64>().is_ok() {
67            return InferredType::Integer;
68        }
69
70        // Try float (includes scientific notation)
71        if value.parse::<f64>().is_ok() {
72            return InferredType::Float;
73        }
74
75        // Check if it looks like a datetime
76        // This is the most expensive check, so we do it last
77        if Self::looks_like_datetime(value) {
78            return InferredType::DateTime;
79        }
80
81        // Default to string
82        InferredType::String
83    }
84
85    /// Check if a string looks like a datetime value
86    ///
87    /// Uses strict regex patterns to avoid false positives with ID strings
88    /// like "BQ-123456" or "ORDER-2024-001"
89    pub fn looks_like_datetime(value: &str) -> bool {
90        // Quick length check - dates are typically 8-30 chars
91        if value.len() < 8 || value.len() > 35 {
92            return false;
93        }
94
95        // Check against our compiled patterns
96        DATE_PATTERNS.iter().any(|pattern| pattern.is_match(value))
97    }
98
99    /// Merge two types when a column has mixed types
100    ///
101    /// Rules:
102    /// - Same type -> keep it
103    /// - Null with anything -> the other type
104    /// - Integer + Float -> Float
105    /// - Any numeric + String -> String
106    /// - `DateTime` + String -> String
107    /// - Everything else -> String
108    #[must_use]
109    pub fn merge_types(type1: InferredType, type2: InferredType) -> InferredType {
110        use InferredType::{Boolean, DateTime, Float, Integer, Null, String};
111
112        match (type1, type2) {
113            // Same type
114            (t1, t2) if t1 == t2 => t1,
115
116            // Null merges to the other type
117            (Null, t) | (t, Null) => t,
118
119            // Integer and Float -> Float
120            (Integer, Float) | (Float, Integer) => Float,
121
122            // Boolean stays boolean only with itself or null
123            (Boolean, _) | (_, Boolean) => String,
124
125            // DateTime only compatible with itself or null
126            (DateTime, _) | (_, DateTime) => String,
127
128            // Default to String for mixed types
129            _ => String,
130        }
131    }
132
133    /// Infer type from multiple sample values
134    ///
135    /// Useful for determining column type from a sample of rows.
136    /// Returns the most specific type that fits all non-null values.
137    pub fn infer_from_samples<'a, I>(values: I) -> InferredType
138    where
139        I: Iterator<Item = &'a str>,
140    {
141        let mut result_type = InferredType::Null;
142
143        for value in values {
144            let value_type = Self::infer_from_string(value);
145            result_type = Self::merge_types(result_type, value_type);
146
147            // Early exit if we've degraded to String
148            if result_type == InferredType::String {
149                break;
150            }
151        }
152
153        result_type
154    }
155
156    /// Check if a value can be coerced to a specific type
157    #[must_use]
158    pub fn can_coerce_to(value: &str, target_type: InferredType) -> bool {
159        match target_type {
160            InferredType::Boolean => {
161                value.eq_ignore_ascii_case("true")
162                    || value.eq_ignore_ascii_case("false")
163                    || value == "0"
164                    || value == "1"
165            }
166            InferredType::Integer => value.parse::<i64>().is_ok(),
167            InferredType::Float => value.parse::<f64>().is_ok(),
168            InferredType::DateTime => Self::looks_like_datetime(value),
169            InferredType::String => true, // Everything can be a string
170            InferredType::Null => value.is_empty(),
171        }
172    }
173}
174
175#[cfg(test)]
176mod tests {
177    use super::*;
178
179    #[test]
180    fn test_basic_type_inference() {
181        assert_eq!(
182            TypeInference::infer_from_string("123"),
183            InferredType::Integer
184        );
185        assert_eq!(
186            TypeInference::infer_from_string("123.45"),
187            InferredType::Float
188        );
189        assert_eq!(
190            TypeInference::infer_from_string("true"),
191            InferredType::Boolean
192        );
193        assert_eq!(
194            TypeInference::infer_from_string("FALSE"),
195            InferredType::Boolean
196        );
197        assert_eq!(
198            TypeInference::infer_from_string("hello"),
199            InferredType::String
200        );
201        assert_eq!(TypeInference::infer_from_string(""), InferredType::Null);
202    }
203
204    #[test]
205    fn test_datetime_detection() {
206        // Valid dates should be detected
207        assert_eq!(
208            TypeInference::infer_from_string("2024-01-15"),
209            InferredType::DateTime
210        );
211        assert_eq!(
212            TypeInference::infer_from_string("01/15/2024"),
213            InferredType::DateTime
214        );
215        assert_eq!(
216            TypeInference::infer_from_string("15-01-2024"),
217            InferredType::DateTime
218        );
219        assert_eq!(
220            TypeInference::infer_from_string("2024-01-15T10:30:00"),
221            InferredType::DateTime
222        );
223        assert_eq!(
224            TypeInference::infer_from_string("2024-01-15T10:30:00Z"),
225            InferredType::DateTime
226        );
227    }
228
229    #[test]
230    fn test_id_strings_not_detected_as_datetime() {
231        // These should be detected as String, not DateTime
232        assert_eq!(
233            TypeInference::infer_from_string("BQ-81198596"),
234            InferredType::String
235        );
236        assert_eq!(
237            TypeInference::infer_from_string("ORDER-2024-001"),
238            InferredType::String
239        );
240        assert_eq!(
241            TypeInference::infer_from_string("ID-123-456"),
242            InferredType::String
243        );
244        assert_eq!(
245            TypeInference::infer_from_string("ABC-DEF-GHI"),
246            InferredType::String
247        );
248        assert_eq!(
249            TypeInference::infer_from_string("2024-ABC-123"),
250            InferredType::String
251        );
252    }
253
254    #[test]
255    fn test_invalid_dates_not_detected() {
256        // Invalid month/day combinations
257        assert_eq!(
258            TypeInference::infer_from_string("2024-13-01"), // Month 13
259            InferredType::String
260        );
261        assert_eq!(
262            TypeInference::infer_from_string("2024-00-15"), // Month 00
263            InferredType::String
264        );
265        assert_eq!(
266            TypeInference::infer_from_string("2024-01-32"), // Day 32
267            InferredType::String
268        );
269        assert_eq!(
270            TypeInference::infer_from_string("2024-01-00"), // Day 00
271            InferredType::String
272        );
273    }
274
275    #[test]
276    fn test_type_merging() {
277        use InferredType::*;
278
279        // Same type
280        assert_eq!(TypeInference::merge_types(Integer, Integer), Integer);
281        assert_eq!(TypeInference::merge_types(String, String), String);
282
283        // Null with anything
284        assert_eq!(TypeInference::merge_types(Null, Integer), Integer);
285        assert_eq!(TypeInference::merge_types(Float, Null), Float);
286
287        // Integer and Float
288        assert_eq!(TypeInference::merge_types(Integer, Float), Float);
289        assert_eq!(TypeInference::merge_types(Float, Integer), Float);
290
291        // Mixed types degrade to String
292        assert_eq!(TypeInference::merge_types(Integer, String), String);
293        assert_eq!(TypeInference::merge_types(DateTime, Integer), String);
294        assert_eq!(TypeInference::merge_types(Boolean, Float), String);
295    }
296
297    #[test]
298    fn test_infer_from_samples() {
299        // All integers
300        let samples = vec!["1", "2", "3", "4", "5"];
301        assert_eq!(
302            TypeInference::infer_from_samples(samples.into_iter()),
303            InferredType::Integer
304        );
305
306        // Mixed integer and float
307        let samples = vec!["1", "2.5", "3", "4.0"];
308        assert_eq!(
309            TypeInference::infer_from_samples(samples.into_iter()),
310            InferredType::Float
311        );
312
313        // Mixed types degrade to string
314        let samples = vec!["1", "hello", "3"];
315        assert_eq!(
316            TypeInference::infer_from_samples(samples.into_iter()),
317            InferredType::String
318        );
319
320        // With nulls (empty strings)
321        let samples = vec!["", "1", "", "2", "3"];
322        assert_eq!(
323            TypeInference::infer_from_samples(samples.into_iter()),
324            InferredType::Integer
325        );
326    }
327
328    #[test]
329    fn test_can_coerce() {
330        // Boolean coercion
331        assert!(TypeInference::can_coerce_to("true", InferredType::Boolean));
332        assert!(TypeInference::can_coerce_to("1", InferredType::Boolean));
333        assert!(TypeInference::can_coerce_to("0", InferredType::Boolean));
334        assert!(!TypeInference::can_coerce_to(
335            "hello",
336            InferredType::Boolean
337        ));
338
339        // Integer coercion
340        assert!(TypeInference::can_coerce_to("123", InferredType::Integer));
341        assert!(!TypeInference::can_coerce_to(
342            "123.45",
343            InferredType::Integer
344        ));
345        assert!(!TypeInference::can_coerce_to(
346            "hello",
347            InferredType::Integer
348        ));
349
350        // Everything can be a string
351        assert!(TypeInference::can_coerce_to("123", InferredType::String));
352        assert!(TypeInference::can_coerce_to("hello", InferredType::String));
353        assert!(TypeInference::can_coerce_to("", InferredType::String));
354    }
355}