use regex::Regex;
pub static NULL_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"(?i)^(null|nil|none|na|n/a|\?|nan|-|--|\.|\.\.|#n/a|#value!|#ref!|#div/0!)$")
.expect("Invalid null pattern")
});
pub static UNSIGNED_PATTERN: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^[+]?\d{1,20}$").expect("Invalid unsigned pattern"));
pub static SIGNED_PATTERN: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^[-+]?\d{1,20}$").expect("Invalid signed pattern"));
pub static FLOAT_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^[-+]?(?:\d+\.?\d*|\d*\.?\d+)(?:[eE][-+]?\d+)?$").expect("Invalid float pattern")
});
pub static FLOAT_EURO_PATTERN: std::sync::LazyLock<Regex> =
std::sync::LazyLock::new(|| Regex::new(r"^[-+]?\d+,\d+$").expect("Invalid euro float pattern"));
pub static FLOAT_THOUSANDS_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^[-+]?(?:\d{1,3}(?:,\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?)$")
.expect("Invalid thousands pattern")
});
pub static BOOLEAN_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"(?i)^(true|false|yes|no|y|n|t|f|1|0|on|off)$").expect("Invalid boolean pattern")
});
pub static DATE_ISO_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^\d{4}[-/]\d{1,2}[-/]\d{1,2}$").expect("Invalid ISO date pattern")
});
pub static DATE_US_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^\d{1,2}[-/]\d{1,2}[-/]\d{2,4}$").expect("Invalid US date pattern")
});
pub static DATE_EURO_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^\d{1,2}\.\d{1,2}\.\d{2,4}$").expect("Invalid Euro date pattern")
});
pub static DATETIME_ISO_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(
r"^\d{4}[-/]\d{1,2}[-/]\d{1,2}[T ]\d{1,2}:\d{2}(:\d{2})?(\.\d+)?(Z|[+-]\d{2}:?\d{2})?$",
)
.expect("Invalid ISO datetime pattern")
});
pub static DATETIME_GENERAL_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^\d{1,4}[-/\.]\d{1,2}[-/\.]\d{1,4}[T ]?\d{1,2}:\d{2}(:\d{2})?(\s*(AM|PM|am|pm))?$")
.expect("Invalid general datetime pattern")
});
pub static TIME_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^\d{1,2}:\d{2}(:\d{2})?(\.\d+)?(\s*(AM|PM|am|pm))?$")
.expect("Invalid time pattern")
});
pub static EMAIL_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").expect("Invalid email pattern")
});
pub static URL_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^(https?|ftp)://[^\s/$.?#].[^\s]*$").expect("Invalid URL pattern")
});
pub static IPV4_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$").expect("Invalid IPv4 pattern")
});
pub static CURRENCY_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^[$€£¥₹]?\s*[-+]?[\d,]+\.?\d*$|^[-+]?[\d,]+\.?\d*\s*[$€£¥₹]$")
.expect("Invalid currency pattern")
});
pub static PERCENTAGE_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^[-+]?\d+\.?\d*\s*%$").expect("Invalid percentage pattern")
});
pub static UUID_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")
.expect("Invalid UUID pattern")
});
pub static ALPHANUM_PATTERN: std::sync::LazyLock<Regex> = std::sync::LazyLock::new(|| {
Regex::new(r"^[A-Za-z0-9_-]+$").expect("Invalid alphanumeric pattern")
});
pub struct PatternCategory {
pub pattern: &'static std::sync::LazyLock<Regex>,
#[allow(dead_code)]
pub category: &'static str,
pub weight: f64,
}
static PATTERN_CATEGORIES: std::sync::LazyLock<Vec<PatternCategory>> =
std::sync::LazyLock::new(|| {
vec![
PatternCategory {
pattern: &NULL_PATTERN,
category: "null",
weight: 0.5,
},
PatternCategory {
pattern: &BOOLEAN_PATTERN,
category: "boolean",
weight: 1.0,
},
PatternCategory {
pattern: &UNSIGNED_PATTERN,
category: "unsigned",
weight: 1.0,
},
PatternCategory {
pattern: &SIGNED_PATTERN,
category: "signed",
weight: 1.0,
},
PatternCategory {
pattern: &FLOAT_PATTERN,
category: "float",
weight: 1.0,
},
PatternCategory {
pattern: &FLOAT_EURO_PATTERN,
category: "float_euro",
weight: 0.9,
},
PatternCategory {
pattern: &FLOAT_THOUSANDS_PATTERN,
category: "float_thousands",
weight: 0.9,
},
PatternCategory {
pattern: &DATE_ISO_PATTERN,
category: "date",
weight: 1.0,
},
PatternCategory {
pattern: &DATE_US_PATTERN,
category: "date",
weight: 0.9,
},
PatternCategory {
pattern: &DATE_EURO_PATTERN,
category: "date",
weight: 0.9,
},
PatternCategory {
pattern: &DATETIME_ISO_PATTERN,
category: "datetime",
weight: 1.0,
},
PatternCategory {
pattern: &DATETIME_GENERAL_PATTERN,
category: "datetime",
weight: 0.9,
},
PatternCategory {
pattern: &TIME_PATTERN,
category: "time",
weight: 0.8,
},
PatternCategory {
pattern: &EMAIL_PATTERN,
category: "email",
weight: 0.8,
},
PatternCategory {
pattern: &URL_PATTERN,
category: "url",
weight: 0.8,
},
PatternCategory {
pattern: &IPV4_PATTERN,
category: "ipv4",
weight: 0.8,
},
PatternCategory {
pattern: &CURRENCY_PATTERN,
category: "currency",
weight: 0.9,
},
PatternCategory {
pattern: &PERCENTAGE_PATTERN,
category: "percentage",
weight: 0.9,
},
PatternCategory {
pattern: &UUID_PATTERN,
category: "uuid",
weight: 0.8,
},
PatternCategory {
pattern: &ALPHANUM_PATTERN,
category: "alphanum",
weight: 0.3,
},
]
});
pub fn get_pattern_categories() -> &'static [PatternCategory] {
&PATTERN_CATEGORIES
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_unsigned_pattern() {
assert!(UNSIGNED_PATTERN.is_match("123"));
assert!(UNSIGNED_PATTERN.is_match("0"));
assert!(UNSIGNED_PATTERN.is_match("+42"));
assert!(!UNSIGNED_PATTERN.is_match("-42"));
assert!(!UNSIGNED_PATTERN.is_match("12.34"));
}
#[test]
fn test_boolean_pattern() {
assert!(BOOLEAN_PATTERN.is_match("true"));
assert!(BOOLEAN_PATTERN.is_match("FALSE"));
assert!(BOOLEAN_PATTERN.is_match("yes"));
assert!(BOOLEAN_PATTERN.is_match("1"));
assert!(!BOOLEAN_PATTERN.is_match("maybe"));
}
#[test]
fn test_date_patterns() {
assert!(DATE_ISO_PATTERN.is_match("2023-12-31"));
assert!(DATE_ISO_PATTERN.is_match("2023/12/31"));
assert!(DATE_US_PATTERN.is_match("12/31/2023"));
assert!(DATE_EURO_PATTERN.is_match("31.12.2023"));
}
#[test]
fn test_datetime_patterns() {
assert!(DATETIME_ISO_PATTERN.is_match("2023-12-31T12:30:45"));
assert!(DATETIME_ISO_PATTERN.is_match("2023-12-31 12:30:45"));
assert!(DATETIME_ISO_PATTERN.is_match("2023-12-31T12:30:45Z"));
assert!(DATETIME_ISO_PATTERN.is_match("2023-12-31T12:30:45+05:30"));
}
#[test]
fn test_null_pattern() {
assert!(NULL_PATTERN.is_match("NULL"));
assert!(NULL_PATTERN.is_match("null"));
assert!(NULL_PATTERN.is_match("NA"));
assert!(NULL_PATTERN.is_match("N/A"));
assert!(NULL_PATTERN.is_match("-"));
assert!(NULL_PATTERN.is_match("NaN"));
}
}