use regex::Regex;
use std::sync::LazyLock;
/// Regular expressions for the date and datetime layouts that type inference
/// recognises, compiled once on first use.
///
/// Every pattern is anchored at both ends, so a value only matches when the
/// *whole* string is a date. Years are limited to 1900-2099, months to 01-12
/// and days to 01-31; days are not checked against the month's length and the
/// `HH:MM:SS` part is only checked for shape (two digits each), not range.
static DATE_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
    const PATTERNS: [&str; 12] = [
        // YYYY-MM-DD
        r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$",
        // YYYY-MM-DD HH:MM:SS
        r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])\s+\d{2}:\d{2}:\d{2}$",
        // YYYY-MM-DD HH:MM:SS.mmm (1-3 fractional digits)
        r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])\s+\d{2}:\d{2}:\d{2}\.\d{1,3}$",
        // MM/DD/YYYY
        r"^(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])/(19|20)\d{2}$",
        // DD/MM/YYYY
        r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}$",
        // DD/MM/YYYY HH:MM:SS
        r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}\s+\d{2}:\d{2}:\d{2}$",
        // DD/MM/YYYY HH:MM:SS.mmm (1-3 fractional digits)
        r"^(0[1-9]|[12]\d|3[01])/(0[1-9]|1[0-2])/(19|20)\d{2}\s+\d{2}:\d{2}:\d{2}\.\d{1,3}$",
        // DD-MM-YYYY
        r"^(0[1-9]|[12]\d|3[01])-(0[1-9]|1[0-2])-(19|20)\d{2}$",
        // YYYY/MM/DD
        r"^(19|20)\d{2}/(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])$",
        // YYYY/MM/DD HH:MM:SS
        r"^(19|20)\d{2}/(0[1-9]|1[0-2])/(0[1-9]|[12]\d|3[01])\s+\d{2}:\d{2}:\d{2}$",
        // ISO 8601 with `T` separator and optional fractional seconds, no zone.
        // The trailing `$` matters: without it any string that merely *starts*
        // with a timestamp (e.g. "2024-01-15T10:30:00garbage") would match.
        r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}(\.\d+)?$",
        // ISO 8601 with a zone designator: `Z`, or an offset written as
        // +hh, +hhmm or +hh:mm (and the `-` equivalents).
        r"^(19|20)\d{2}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}(:?\d{2})?)$",
    ];

    PATTERNS
        .iter()
        .map(|pattern| Regex::new(pattern).expect("built-in date pattern is a valid regex"))
        .collect()
});
/// Data type guessed for a raw string value (or for a set of such values).
///
/// The per-variant notes describe how `TypeInference::infer_from_string`
/// assigns each variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InferredType {
/// The value is `true` or `false`, compared ASCII case-insensitively.
Boolean,
/// The value parses as an `i64`.
Integer,
/// The value parses as an `f64` (and did not parse as an `i64`).
Float,
/// The value matches one of the recognised date/datetime layouts.
DateTime,
/// Fallback when no more specific type applies.
String,
/// The value is the empty string.
Null,
}
/// Stateless namespace for the type-inference helpers; it has no fields and
/// all of its functions are associated functions (none take `self`).
pub struct TypeInference;
impl TypeInference {
    /// Classifies a single raw string value.
    ///
    /// Checks run from most to least specific and the first hit wins:
    /// empty string -> `Null`; `true`/`false` (ASCII case-insensitive) ->
    /// `Boolean`; parses as `i64` -> `Integer`; parses as `f64` -> `Float`;
    /// matches a known date layout -> `DateTime`; otherwise `String`.
    ///
    /// The value is not trimmed, so surrounding whitespace makes a number or
    /// date fall through to `String`. Note that `f64`'s parser also accepts
    /// spellings such as `inf` and `NaN`, so those classify as `Float`.
    #[must_use]
    pub fn infer_from_string(value: &str) -> InferredType {
        if value.is_empty() {
            InferredType::Null
        } else if value.eq_ignore_ascii_case("true") || value.eq_ignore_ascii_case("false") {
            InferredType::Boolean
        } else if value.parse::<i64>().is_ok() {
            // Integer must be tried before Float: every i64 literal also parses as f64.
            InferredType::Integer
        } else if value.parse::<f64>().is_ok() {
            InferredType::Float
        } else if Self::looks_like_datetime(value) {
            InferredType::DateTime
        } else {
            InferredType::String
        }
    }

    /// Returns `true` when `value` matches one of the `DATE_PATTERNS`.
    ///
    /// Values shorter than 8 bytes or longer than 35 bytes are rejected up
    /// front so the regexes are never run on them.
    #[must_use]
    pub fn looks_like_datetime(value: &str) -> bool {
        (8..=35).contains(&value.len())
            && DATE_PATTERNS.iter().any(|pattern| pattern.is_match(value))
    }

    /// Combines the types of two observations into one type that fits both.
    ///
    /// * identical types merge to themselves;
    /// * `Null` yields to the other type;
    /// * `Integer` and `Float` widen to `Float`;
    /// * every other mix (including anything with `Boolean` or `DateTime`)
    ///   falls back to `String`.
    #[must_use]
    pub fn merge_types(type1: InferredType, type2: InferredType) -> InferredType {
        match (type1, type2) {
            (left, right) if left == right => left,
            (InferredType::Null, other) | (other, InferredType::Null) => other,
            (InferredType::Integer, InferredType::Float)
            | (InferredType::Float, InferredType::Integer) => InferredType::Float,
            _ => InferredType::String,
        }
    }

    /// Infers one type for a whole sequence of sample values by classifying
    /// each value and merging the results with [`Self::merge_types`].
    ///
    /// Accepts anything iterable over `&str` (an iterator, a `Vec<&str>`, an
    /// array, ...). An empty sequence, or one containing only empty strings,
    /// yields `Null`.
    #[must_use]
    pub fn infer_from_samples<'a, I>(values: I) -> InferredType
    where
        I: IntoIterator<Item = &'a str>,
    {
        let mut result_type = InferredType::Null;
        for value in values {
            result_type = Self::merge_types(result_type, Self::infer_from_string(value));
            // `String` absorbs every other type under `merge_types`, so no
            // later sample can change the answer; stop scanning early.
            if result_type == InferredType::String {
                break;
            }
        }
        result_type
    }

    /// Reports whether `value` could be interpreted as `target_type`.
    ///
    /// This is more lenient than [`Self::infer_from_string`] for booleans:
    /// besides `true`/`false` (ASCII case-insensitive) the literals `0` and
    /// `1` are accepted. Anything coerces to `String`; only the empty string
    /// coerces to `Null`.
    #[must_use]
    pub fn can_coerce_to(value: &str, target_type: InferredType) -> bool {
        match target_type {
            InferredType::Boolean => {
                value.eq_ignore_ascii_case("true")
                    || value.eq_ignore_ascii_case("false")
                    || value == "0"
                    || value == "1"
            }
            InferredType::Integer => value.parse::<i64>().is_ok(),
            InferredType::Float => value.parse::<f64>().is_ok(),
            InferredType::DateTime => Self::looks_like_datetime(value),
            InferredType::String => true,
            InferredType::Null => value.is_empty(),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that each input string is classified as the paired type.
    fn assert_inferred(cases: &[(&str, InferredType)]) {
        for &(input, expected) in cases {
            assert_eq!(
                TypeInference::infer_from_string(input),
                expected,
                "input: {input:?}"
            );
        }
    }

    /// Asserts that every input string falls back to `InferredType::String`.
    fn assert_all_strings(inputs: &[&str]) {
        for &input in inputs {
            assert_eq!(
                TypeInference::infer_from_string(input),
                InferredType::String,
                "input: {input:?}"
            );
        }
    }

    /// Asserts the type inferred for a whole list of samples.
    fn assert_samples(samples: &[&str], expected: InferredType) {
        assert_eq!(
            TypeInference::infer_from_samples(samples.iter().copied()),
            expected,
            "samples: {samples:?}"
        );
    }

    #[test]
    fn test_basic_type_inference() {
        assert_inferred(&[
            ("123", InferredType::Integer),
            ("123.45", InferredType::Float),
            ("true", InferredType::Boolean),
            ("FALSE", InferredType::Boolean),
            ("hello", InferredType::String),
            ("", InferredType::Null),
        ]);
    }

    #[test]
    fn test_datetime_detection() {
        assert_inferred(&[
            ("2024-01-15", InferredType::DateTime),
            ("01/15/2024", InferredType::DateTime),
            ("15-01-2024", InferredType::DateTime),
            ("2024-01-15T10:30:00", InferredType::DateTime),
            ("2024-01-15T10:30:00Z", InferredType::DateTime),
        ]);
    }

    #[test]
    fn test_id_strings_not_detected_as_datetime() {
        // Identifier-like values with dashes must not be mistaken for dates.
        assert_all_strings(&[
            "BQ-81198596",
            "ORDER-2024-001",
            "ID-123-456",
            "ABC-DEF-GHI",
            "2024-ABC-123",
        ]);
    }

    #[test]
    fn test_invalid_dates_not_detected() {
        // Month 13, month 00, day 32 and day 00 respectively.
        assert_all_strings(&["2024-13-01", "2024-00-15", "2024-01-32", "2024-01-00"]);
    }

    #[test]
    fn test_type_merging() {
        use InferredType as T;

        let cases = [
            (T::Integer, T::Integer, T::Integer),
            (T::String, T::String, T::String),
            (T::Null, T::Integer, T::Integer),
            (T::Float, T::Null, T::Float),
            (T::Integer, T::Float, T::Float),
            (T::Float, T::Integer, T::Float),
            (T::Integer, T::String, T::String),
            (T::DateTime, T::Integer, T::String),
            (T::Boolean, T::Float, T::String),
        ];
        for (left, right, expected) in cases.iter().copied() {
            assert_eq!(
                TypeInference::merge_types(left, right),
                expected,
                "merging {left:?} with {right:?}"
            );
        }
    }

    #[test]
    fn test_infer_from_samples() {
        assert_samples(&["1", "2", "3", "4", "5"], InferredType::Integer);
        assert_samples(&["1", "2.5", "3", "4.0"], InferredType::Float);
        assert_samples(&["1", "hello", "3"], InferredType::String);
        assert_samples(&["", "1", "", "2", "3"], InferredType::Integer);
    }

    #[test]
    fn test_can_coerce() {
        let cases = [
            ("true", InferredType::Boolean, true),
            ("1", InferredType::Boolean, true),
            ("0", InferredType::Boolean, true),
            ("hello", InferredType::Boolean, false),
            ("123", InferredType::Integer, true),
            ("123.45", InferredType::Integer, false),
            ("hello", InferredType::Integer, false),
            ("123", InferredType::String, true),
            ("hello", InferredType::String, true),
            ("", InferredType::String, true),
        ];
        for (value, target, expected) in cases.iter().copied() {
            assert_eq!(
                TypeInference::can_coerce_to(value, target),
                expected,
                "coercing {value:?} to {target:?}"
            );
        }
    }
}