sql_cli/data/
type_inference.rs

1//! Shared type inference logic for data loaders
2//!
3//! This module provides centralized type detection logic to ensure
4//! consistent behavior across CSV, JSON, and other data sources.
5
6use regex::Regex;
7use std::sync::LazyLock;
8
9/// Static compiled regex patterns for date detection
10/// Using `LazyLock` for thread-safe initialization
11static DATE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
12    vec![
13        // YYYY-MM-DD (year must be 19xx or 20xx, month 01-12, day 01-31)
14        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$").unwrap(),
15        // YYYY-MM-DD HH:MM:SS (with space separator)
16        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])\s+\d{2}:\d{2}:\d{2}$").unwrap(),
17        // YYYY-MM-DD HH:MM:SS.mmm (with milliseconds)
18        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])\s+\d{2}:\d{2}:\d{2}\.\d{1,3}$").unwrap(),
19        // MM/DD/YYYY
20        Regex::new(r"^(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])/(19|20)\d{2}$").unwrap(),
21        // DD/MM/YYYY
22        Regex::new(r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}$").unwrap(),
23        // DD/MM/YYYY HH:MM:SS (UK format with time)
24        Regex::new(r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}\s+\d{2}:\d{2}:\d{2}$").unwrap(),
25        // DD/MM/YYYY HH:MM:SS.mmm (UK format with milliseconds)
26        Regex::new(r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}\s+\d{2}:\d{2}:\d{2}\.\d{1,3}$").unwrap(),
27        // DD-MM-YYYY
28        Regex::new(r"^(0[1-9]|[12]\d|3[01])-(0[1-9]|1[0-2])-(19|20)\d{2}$").unwrap(),
29        // YYYY/MM/DD
30        Regex::new(r"^(19|20)\d{2}/(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])$").unwrap(),
31        // YYYY/MM/DD HH:MM:SS
32        Regex::new(r"^(19|20)\d{2}/(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])\s+\d{2}:\d{2}:\d{2}$").unwrap(),
33        // ISO 8601 with time: YYYY-MM-DDTHH:MM:SS
34        Regex::new(r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}")
35            .unwrap(),
36        // ISO 8601 with timezone: YYYY-MM-DDTHH:MM:SS+/-HH:MM or Z
37        Regex::new(
38            r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$",
39        )
40        .unwrap(),
41    ]
42});
43
44/// Detected data type for a value or column
45#[derive(Debug, Clone, Copy, PartialEq, Eq)]
46pub enum InferredType {
47    Boolean,
48    Integer,
49    Float,
50    DateTime,
51    String,
52    Null,
53}
54
55/// Type inference utilities
56pub struct TypeInference;
57
58impl TypeInference {
59    /// Infer the type of a single string value
60    ///
61    /// This is the main entry point for type detection.
62    /// Order of checks is important for performance and accuracy.
63    #[must_use]
64    pub fn infer_from_string(value: &str) -> InferredType {
65        // Empty values are null
66        if value.is_empty() {
67            return InferredType::Null;
68        }
69
70        // Check boolean first (fast string comparison)
71        if value.eq_ignore_ascii_case("true") || value.eq_ignore_ascii_case("false") {
72            return InferredType::Boolean;
73        }
74
75        // Try integer (common case, relatively fast)
76        if value.parse::<i64>().is_ok() {
77            return InferredType::Integer;
78        }
79
80        // Try float (includes scientific notation)
81        if value.parse::<f64>().is_ok() {
82            return InferredType::Float;
83        }
84
85        // Check if it looks like a datetime
86        // This is the most expensive check, so we do it last
87        if Self::looks_like_datetime(value) {
88            return InferredType::DateTime;
89        }
90
91        // Default to string
92        InferredType::String
93    }
94
95    /// Check if a string looks like a datetime value
96    ///
97    /// Uses strict regex patterns to avoid false positives with ID strings
98    /// like "BQ-123456" or "ORDER-2024-001"
99    pub fn looks_like_datetime(value: &str) -> bool {
100        // Quick length check - dates are typically 8-30 chars
101        if value.len() < 8 || value.len() > 35 {
102            return false;
103        }
104
105        // Check against our compiled patterns
106        DATE_PATTERNS.iter().any(|pattern| pattern.is_match(value))
107    }
108
109    /// Merge two types when a column has mixed types
110    ///
111    /// Rules:
112    /// - Same type -> keep it
113    /// - Null with anything -> the other type
114    /// - Integer + Float -> Float
115    /// - Any numeric + String -> String
116    /// - `DateTime` + String -> String
117    /// - Everything else -> String
118    #[must_use]
119    pub fn merge_types(type1: InferredType, type2: InferredType) -> InferredType {
120        use InferredType::{Boolean, DateTime, Float, Integer, Null, String};
121
122        match (type1, type2) {
123            // Same type
124            (t1, t2) if t1 == t2 => t1,
125
126            // Null merges to the other type
127            (Null, t) | (t, Null) => t,
128
129            // Integer and Float -> Float
130            (Integer, Float) | (Float, Integer) => Float,
131
132            // Boolean stays boolean only with itself or null
133            (Boolean, _) | (_, Boolean) => String,
134
135            // DateTime only compatible with itself or null
136            (DateTime, _) | (_, DateTime) => String,
137
138            // Default to String for mixed types
139            _ => String,
140        }
141    }
142
143    /// Infer type from multiple sample values
144    ///
145    /// Useful for determining column type from a sample of rows.
146    /// Returns the most specific type that fits all non-null values.
147    pub fn infer_from_samples<'a, I>(values: I) -> InferredType
148    where
149        I: Iterator<Item = &'a str>,
150    {
151        let mut result_type = InferredType::Null;
152
153        for value in values {
154            let value_type = Self::infer_from_string(value);
155            result_type = Self::merge_types(result_type, value_type);
156
157            // Early exit if we've degraded to String
158            if result_type == InferredType::String {
159                break;
160            }
161        }
162
163        result_type
164    }
165
166    /// Check if a value can be coerced to a specific type
167    #[must_use]
168    pub fn can_coerce_to(value: &str, target_type: InferredType) -> bool {
169        match target_type {
170            InferredType::Boolean => {
171                value.eq_ignore_ascii_case("true")
172                    || value.eq_ignore_ascii_case("false")
173                    || value == "0"
174                    || value == "1"
175            }
176            InferredType::Integer => value.parse::<i64>().is_ok(),
177            InferredType::Float => value.parse::<f64>().is_ok(),
178            InferredType::DateTime => Self::looks_like_datetime(value),
179            InferredType::String => true, // Everything can be a string
180            InferredType::Null => value.is_empty(),
181        }
182    }
183}
184
185#[cfg(test)]
186mod tests {
187    use super::*;
188
189    #[test]
190    fn test_basic_type_inference() {
191        assert_eq!(
192            TypeInference::infer_from_string("123"),
193            InferredType::Integer
194        );
195        assert_eq!(
196            TypeInference::infer_from_string("123.45"),
197            InferredType::Float
198        );
199        assert_eq!(
200            TypeInference::infer_from_string("true"),
201            InferredType::Boolean
202        );
203        assert_eq!(
204            TypeInference::infer_from_string("FALSE"),
205            InferredType::Boolean
206        );
207        assert_eq!(
208            TypeInference::infer_from_string("hello"),
209            InferredType::String
210        );
211        assert_eq!(TypeInference::infer_from_string(""), InferredType::Null);
212    }
213
214    #[test]
215    fn test_datetime_detection() {
216        // Valid dates should be detected
217        assert_eq!(
218            TypeInference::infer_from_string("2024-01-15"),
219            InferredType::DateTime
220        );
221        assert_eq!(
222            TypeInference::infer_from_string("01/15/2024"),
223            InferredType::DateTime
224        );
225        assert_eq!(
226            TypeInference::infer_from_string("15-01-2024"),
227            InferredType::DateTime
228        );
229        assert_eq!(
230            TypeInference::infer_from_string("2024-01-15T10:30:00"),
231            InferredType::DateTime
232        );
233        assert_eq!(
234            TypeInference::infer_from_string("2024-01-15T10:30:00Z"),
235            InferredType::DateTime
236        );
237    }
238
239    #[test]
240    fn test_id_strings_not_detected_as_datetime() {
241        // These should be detected as String, not DateTime
242        assert_eq!(
243            TypeInference::infer_from_string("BQ-81198596"),
244            InferredType::String
245        );
246        assert_eq!(
247            TypeInference::infer_from_string("ORDER-2024-001"),
248            InferredType::String
249        );
250        assert_eq!(
251            TypeInference::infer_from_string("ID-123-456"),
252            InferredType::String
253        );
254        assert_eq!(
255            TypeInference::infer_from_string("ABC-DEF-GHI"),
256            InferredType::String
257        );
258        assert_eq!(
259            TypeInference::infer_from_string("2024-ABC-123"),
260            InferredType::String
261        );
262    }
263
264    #[test]
265    fn test_invalid_dates_not_detected() {
266        // Invalid month/day combinations
267        assert_eq!(
268            TypeInference::infer_from_string("2024-13-01"), // Month 13
269            InferredType::String
270        );
271        assert_eq!(
272            TypeInference::infer_from_string("2024-00-15"), // Month 00
273            InferredType::String
274        );
275        assert_eq!(
276            TypeInference::infer_from_string("2024-01-32"), // Day 32
277            InferredType::String
278        );
279        assert_eq!(
280            TypeInference::infer_from_string("2024-01-00"), // Day 00
281            InferredType::String
282        );
283    }
284
285    #[test]
286    fn test_type_merging() {
287        use InferredType::*;
288
289        // Same type
290        assert_eq!(TypeInference::merge_types(Integer, Integer), Integer);
291        assert_eq!(TypeInference::merge_types(String, String), String);
292
293        // Null with anything
294        assert_eq!(TypeInference::merge_types(Null, Integer), Integer);
295        assert_eq!(TypeInference::merge_types(Float, Null), Float);
296
297        // Integer and Float
298        assert_eq!(TypeInference::merge_types(Integer, Float), Float);
299        assert_eq!(TypeInference::merge_types(Float, Integer), Float);
300
301        // Mixed types degrade to String
302        assert_eq!(TypeInference::merge_types(Integer, String), String);
303        assert_eq!(TypeInference::merge_types(DateTime, Integer), String);
304        assert_eq!(TypeInference::merge_types(Boolean, Float), String);
305    }
306
307    #[test]
308    fn test_infer_from_samples() {
309        // All integers
310        let samples = vec!["1", "2", "3", "4", "5"];
311        assert_eq!(
312            TypeInference::infer_from_samples(samples.into_iter()),
313            InferredType::Integer
314        );
315
316        // Mixed integer and float
317        let samples = vec!["1", "2.5", "3", "4.0"];
318        assert_eq!(
319            TypeInference::infer_from_samples(samples.into_iter()),
320            InferredType::Float
321        );
322
323        // Mixed types degrade to string
324        let samples = vec!["1", "hello", "3"];
325        assert_eq!(
326            TypeInference::infer_from_samples(samples.into_iter()),
327            InferredType::String
328        );
329
330        // With nulls (empty strings)
331        let samples = vec!["", "1", "", "2", "3"];
332        assert_eq!(
333            TypeInference::infer_from_samples(samples.into_iter()),
334            InferredType::Integer
335        );
336    }
337
338    #[test]
339    fn test_can_coerce() {
340        // Boolean coercion
341        assert!(TypeInference::can_coerce_to("true", InferredType::Boolean));
342        assert!(TypeInference::can_coerce_to("1", InferredType::Boolean));
343        assert!(TypeInference::can_coerce_to("0", InferredType::Boolean));
344        assert!(!TypeInference::can_coerce_to(
345            "hello",
346            InferredType::Boolean
347        ));
348
349        // Integer coercion
350        assert!(TypeInference::can_coerce_to("123", InferredType::Integer));
351        assert!(!TypeInference::can_coerce_to(
352            "123.45",
353            InferredType::Integer
354        ));
355        assert!(!TypeInference::can_coerce_to(
356            "hello",
357            InferredType::Integer
358        ));
359
360        // Everything can be a string
361        assert!(TypeInference::can_coerce_to("123", InferredType::String));
362        assert!(TypeInference::can_coerce_to("hello", InferredType::String));
363        assert!(TypeInference::can_coerce_to("", InferredType::String));
364    }
365}