type_detector/
lib.rs

1use chrono::prelude::*;
2use regex::Regex;
3use serde::Serialize;
4use std::cmp::PartialEq;
5use std::fmt;
6use std::mem;
7
8#[derive(Serialize)]
9#[serde(untagged, rename_all = "camelCase")]
10pub enum DataType {
11    String(String),
12    Int(i64),
13    Float(f64),
14    Bool(bool),
15    DateTime(chrono::DateTime<Utc>),
16}
17
18impl fmt::Debug for DataType {
19    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
20        match self {
21            DataType::String(s) => write!(f, "String({})", s),
22            DataType::Int(i) => write!(f, "Int({})", i),
23            DataType::Float(fl) => write!(f, "Float({})", fl),
24            DataType::Bool(b) => write!(f, "Bool({})", b),
25            DataType::DateTime(d) => write!(f, "DateTime({})", d),
26        }
27    }
28}
29
30impl PartialEq for DataType {
31    fn eq(&self, other: &Self) -> bool {
32        if mem::discriminant(self) != mem::discriminant(other) {
33            return false;
34        }
35        return match (self, other) {
36            (DataType::String(v), DataType::String(v1)) => {
37                return v == v1;
38            }
39            (DataType::Int(v), DataType::Int(v1)) => {
40                return v == v1;
41            }
42            (DataType::Float(v), DataType::Float(v1)) => {
43                return v == v1;
44            }
45            (DataType::Bool(v), DataType::Bool(v1)) => {
46                return v == v1;
47            }
48            (DataType::DateTime(v), DataType::DateTime(v1)) => {
49                return v == v1;
50            }
51            _ => false,
52        };
53    }
54}
55
56pub fn detect_type(s: &str) -> DataType {
57    if let Some(b) = try_get_bool(s) {
58        DataType::Bool(b)
59    } else if let Some(f) = try_get_f64(s) {
60        if f.fract() == 0.0 {
61            if s.len() == 8 {
62                if let Some(d) = try_get_datetime(s) {
63                    DataType::DateTime(d)
64                } else {
65                    DataType::Int(f as i64)
66                }
67            } else {
68                DataType::Int(f as i64)
69            }
70        } else {
71            DataType::Float(f)
72        }
73    } else if let Some(d) = try_get_datetime(s.clone()) {
74        DataType::DateTime(d)
75    } else {
76        DataType::String(s.to_string())
77    }
78}
79
/// Returns `Some(true)` / `Some(false)` when `s` spells `true` / `false` in
/// any letter case, and `None` for every other input.
fn try_get_bool(s: &str) -> Option<bool> {
    if s.eq_ignore_ascii_case("true") {
        Some(true)
    } else if s.eq_ignore_ascii_case("false") {
        Some(false)
    } else {
        None
    }
}
87
/// Parses `s` as an `f64`, returning `None` when the text is not a float.
fn try_get_f64(s: &str) -> Option<f64> {
    s.parse::<f64>().ok()
}
95
/// Parses `s` as an `i64`, returning `None` when the text is not an integer
/// or does not fit in 64 bits.
fn try_get_i64(s: &str) -> Option<i64> {
    s.parse::<i64>().ok()
}
103
104fn try_get_datetime(s: &str) -> Option<DateTime<Utc>> {
105    // pure numbers
106    if let Some(_) = try_get_i64(s) {
107        // yyyyMMdd
108        match s.len() {
109            // yyyyMMdd
110            8 => {
111                let y = &s[0..4].parse::<u32>().unwrap();
112                let m = &s[4..6].parse::<u32>().unwrap();
113                let d = &s[6..8].parse::<u32>().unwrap();
114                if is_date(*y, *m, *d) {
115                    if let Ok(dt) = Utc.datetime_from_str(
116                        format!("{}-{}-{} 00:00:00", &s[0..4], &s[4..6], &s[6..8]).as_str(),
117                        "%Y-%m-%d %H:%M:%S",
118                    ) {
119                        Some(dt)
120                    } else {
121                        None
122                    }
123                } else {
124                    None
125                }
126            }
127            _ => None,
128        }
129    } else {
130        // 2006-03-04, 2006-3-4, 2006/03/04, 2006/3/4, 2006.3.4, 2006.03.04 optional with time
131        if let Ok(re) = Regex::new(
132            r"(?x)
133^(?P<year>\d{4})
134(-|/|\.|年)
135(?P<month>\d{1,2})
136(-|/|\.|月)
137(?P<day>\d{1,2})
138(
139    [^0-9]+(?P<hour>\d{1,2})
140    [:|时](?P<minute>\d{1,2})
141    (
142        [:|分](?P<second>\d{1,2})
143        ([\.](?P<milli>\d{3}))?
144    )?
145)?
146",
147        ) {
148            if let Some(caps) = re.captures(&s) {
149                if let Ok(year) = caps["year"].parse::<u32>() {
150                    if let Ok(month) = caps["month"].parse::<u32>() {
151                        if let Ok(day) = caps["day"].parse::<u32>() {
152                            if is_date(year, month, day) {
153                                let mut f = format!("{}-{}-{} 00:00:00.000", year, month, day);
154                                if let Some(_) = caps.name("hour") {
155                                    if let Some(_) = caps.name("minute") {
156                                        if let Ok(hour) = caps["hour"].parse::<u32>() {
157                                            if let Ok(minute) = caps["minute"].parse::<u32>() {
158                                                if hour < 24 && minute < 60 {
159                                                    f = format!(
160                                                        "{}-{}-{} {}:{}:00.000",
161                                                        year, month, day, hour, minute
162                                                    );
163                                                    if let Some(_) = caps.name("second") {
164                                                        if let Ok(second) =
165                                                            caps["second"].parse::<u32>()
166                                                        {
167                                                            if second < 60 {
168                                                                f = format!(
169                                                                    "{}-{}-{} {}:{}:{}.000",
170                                                                    year,
171                                                                    month,
172                                                                    day,
173                                                                    hour,
174                                                                    minute,
175                                                                    second
176                                                                );
177                                                                if let Some(_) = caps.name("milli")
178                                                                {
179                                                                    if let Ok(milli) =
180                                                                        caps["milli"].parse::<u32>()
181                                                                    {
182                                                                        f = format!(
183                                                                            "{}-{}-{} {}:{}:{}.{}",
184                                                                            year,
185                                                                            month,
186                                                                            day,
187                                                                            hour,
188                                                                            minute,
189                                                                            second,
190                                                                            milli
191                                                                        )
192                                                                    }
193                                                                }
194                                                            }
195                                                        }
196                                                    }
197                                                }
198                                            }
199                                        }
200                                    }
201                                }
202                                if let Ok(dt) =
203                                    Utc.datetime_from_str(f.as_str(), "%Y-%m-%d %H:%M:%S%.3f")
204                                {
205                                    return Some(dt);
206                                }
207                            }
208                        }
209                    }
210                }
211            }
212        }
213        None
214    }
215}
216
217fn is_date(year: u32, month: u32, day: u32) -> bool {
218    if month < 1 || month > 12 {
219        false
220    } else {
221        match month {
222            1 | 3 | 5 | 7 | 8 | 10 | 12 if day > 0 && day < 32 => true,
223            4 | 6 | 9 | 11 if day > 0 && day < 31 => true,
224            2 if is_leap_year(year) && day > 0 && day < 30 => true,
225            2 if !is_leap_year(year) && day > 0 && day < 29 => true,
226            _ => false,
227        }
228    }
229}
230
/// Leap-year rule: every fourth year, except century years that are not
/// divisible by 400.
fn is_leap_year(year: u32) -> bool {
    year % 4 == 0 && (year % 100 != 0 || year % 400 == 0)
}
246
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `detect_type` on each input and compares it with the paired
    /// expectation, naming the offending input on failure.
    fn assert_detected(cases: Vec<(&str, DataType)>) {
        for (input, expected) in cases {
            assert_eq!(detect_type(input), expected, "input: {:?}", input);
        }
    }

    /// Expected value for a UTC timestamp with whole seconds.
    fn utc(year: i32, month: u32, day: u32, hour: u32, minute: u32, second: u32) -> DataType {
        DataType::DateTime(Utc.ymd(year, month, day).and_hms(hour, minute, second))
    }

    #[test]
    fn bool_works() {
        assert_detected(vec![
            ("true", DataType::Bool(true)),
            ("True", DataType::Bool(true)),
            ("TRUE", DataType::Bool(true)),
            ("false", DataType::Bool(false)),
            ("False", DataType::Bool(false)),
            ("FALSE", DataType::Bool(false)),
        ]);
    }

    #[test]
    fn int_works() {
        assert_detected(vec![
            ("123", DataType::Int(123)),
            ("0123", DataType::Int(123)),
            ("465.0", DataType::Int(465)),
            ("-34.0", DataType::Int(-34)),
            ("-27", DataType::Int(-27)),
            ("000", DataType::Int(0)),
            ("0", DataType::Int(0)),
            ("0.0", DataType::Int(0)),
        ]);
    }

    #[test]
    fn float_works() {
        assert_detected(vec![
            ("123.1", DataType::Float(123.1)),
            ("0123.2", DataType::Float(123.2)),
            ("465.389", DataType::Float(465.389)),
            ("-34.2", DataType::Float(-34.2)),
            ("-27.99", DataType::Float(-27.99)),
            ("000.1", DataType::Float(0.1)),
            ("0.00001", DataType::Float(0.00001)),
            ("-.2", DataType::Float(-0.2)),
            (".324", DataType::Float(0.324)),
        ]);
    }

    #[test]
    fn datetime_works() {
        assert_detected(vec![
            // Date only.
            ("20220405", utc(2022, 4, 5, 0, 0, 0)),
            ("20221213", utc(2022, 12, 13, 0, 0, 0)),
            ("2022-03-04", utc(2022, 3, 4, 0, 0, 0)),
            ("2022-12-24", utc(2022, 12, 24, 0, 0, 0)),
            ("2022-1-13", utc(2022, 1, 13, 0, 0, 0)),
            ("2022-3-6", utc(2022, 3, 6, 0, 0, 0)),
            ("2022/03/06", utc(2022, 3, 6, 0, 0, 0)),
            ("2022/3/6", utc(2022, 3, 6, 0, 0, 0)),
            ("2022.03.06", utc(2022, 3, 6, 0, 0, 0)),
            ("2022.3.6", utc(2022, 3, 6, 0, 0, 0)),
            ("2022年03月06日", utc(2022, 3, 6, 0, 0, 0)),
            ("2022年3月6", utc(2022, 3, 6, 0, 0, 0)),
            // Date with time.
            ("2014-11-28T12:00:09Z", utc(2014, 11, 28, 12, 0, 9)),
            ("2022-03-04 13:04:05", utc(2022, 3, 4, 13, 4, 5)),
            ("2022-03-04 1:2:3", utc(2022, 3, 4, 1, 2, 3)),
            ("2022年03月04日 13:4:5", utc(2022, 3, 4, 13, 4, 5)),
            ("2022-03-04 13时04分05秒", utc(2022, 3, 4, 13, 4, 5)),
            ("2022年03月04日13:14", utc(2022, 3, 4, 13, 14, 0)),
            ("2022-03-04 13:25", utc(2022, 3, 4, 13, 25, 0)),
            ("2022-3-4 13:25", utc(2022, 3, 4, 13, 25, 0)),
            ("2022-3-4 1:5", utc(2022, 3, 4, 1, 5, 0)),
            ("2022-3-4 1:5:3", utc(2022, 3, 4, 1, 5, 3)),
            // Date with time and milliseconds.
            (
                "2022年03月04日13:14:15.123Z",
                DataType::DateTime(Utc.ymd(2022, 3, 4).and_hms_milli(13, 14, 15, 123)),
            ),
        ]);
    }

    #[test]
    fn string_works() {
        // NOTE(review): every case below that carries a time also has an
        // invalid day (32), so these inputs are rejected by the date check
        // regardless of how the time part is handled.
        let inputs = [
            "fdsaf",
            "0.3213-",
            "2014-1111",
            "2014-13-11",
            "2014-12-32",
            "2014-12-32 24:00:00",
            "2014-12-32 24:00",
            "2014-12-32 23:60:00",
            "2014-12-32 23:10:60",
            "2014-12-32 23:60",
        ];
        // Each input is expected to come back unchanged as a `String`.
        let cases = inputs
            .iter()
            .map(|&text| (text, DataType::String(text.to_string())))
            .collect();
        assert_detected(cases);
    }
}