#![allow(dead_code)]
#![allow(clippy::manual_is_multiple_of)]
use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum Format {
Date,
DateTime,
Time,
Email,
Uri,
Uuid,
Ipv4,
Ipv6,
Hostname,
JsonPointer,
Regex,
Base64,
Phone,
CreditCard,
CountryCode,
CurrencyCode,
Semver,
None,
}
impl Format {
pub fn as_json_schema_format(&self) -> Option<&'static str> {
match self {
Format::Date => Some("date"),
Format::DateTime => Some("date-time"),
Format::Time => Some("time"),
Format::Email => Some("email"),
Format::Uri => Some("uri"),
Format::Uuid => Some("uuid"),
Format::Ipv4 => Some("ipv4"),
Format::Ipv6 => Some("ipv6"),
Format::Hostname => Some("hostname"),
Format::JsonPointer => Some("json-pointer"),
Format::Regex => Some("regex"),
Format::None => None,
Format::Base64 => Some("byte"),
Format::Phone => Some("phone"),
Format::CreditCard => None,
Format::CountryCode => None,
Format::CurrencyCode => None,
Format::Semver => None,
}
}
}
impl std::fmt::Display for Format {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Format::Date => write!(f, "date"),
Format::DateTime => write!(f, "date-time"),
Format::Time => write!(f, "time"),
Format::Email => write!(f, "email"),
Format::Uri => write!(f, "uri"),
Format::Uuid => write!(f, "uuid"),
Format::Ipv4 => write!(f, "ipv4"),
Format::Ipv6 => write!(f, "ipv6"),
Format::Hostname => write!(f, "hostname"),
Format::JsonPointer => write!(f, "json-pointer"),
Format::Regex => write!(f, "regex"),
Format::Base64 => write!(f, "base64"),
Format::Phone => write!(f, "phone"),
Format::CreditCard => write!(f, "credit-card"),
Format::CountryCode => write!(f, "country-code"),
Format::CurrencyCode => write!(f, "currency-code"),
Format::Semver => write!(f, "semver"),
Format::None => write!(f, "none"),
}
}
}
static DATE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap());
static DATETIME_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:?\d{2})?$").unwrap()
});
static TIME_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{2}:\d{2}:\d{2}(\.\d+)?$").unwrap());
static EMAIL_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$").unwrap());
static UUID_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")
.unwrap()
});
static URI_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^(https?|ftp|file)://[^\s/$.?#].[^\s]*$").unwrap());
static IPV4_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$",
)
.unwrap()
});
static IPV6_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^([0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}$|^::$|^([0-9a-fA-F]{1,4}:){1,7}:$")
.unwrap()
});
static HOSTNAME_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])(\.[a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])*$").unwrap()
});
static PHONE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\+?[1-9]\d{1,14}$").unwrap());
static BASE64_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Za-z0-9+/]+=*$").unwrap());
static COUNTRY_CODE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Z]{2}$").unwrap());
static CURRENCY_CODE_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Z]{3}$").unwrap());
static SEMVER_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(-[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)*)?(\+[a-zA-Z0-9]+(\.[a-zA-Z0-9]+)*)?$").unwrap()
});
pub fn detect_format(value: &str) -> Format {
let value = value.trim();
if value.is_empty() {
return Format::None;
}
if UUID_REGEX.is_match(value) {
return Format::Uuid;
}
if DATETIME_REGEX.is_match(value) {
return Format::DateTime;
}
if DATE_REGEX.is_match(value) {
return Format::Date;
}
if TIME_REGEX.is_match(value) {
return Format::Time;
}
if EMAIL_REGEX.is_match(value) {
return Format::Email;
}
if URI_REGEX.is_match(value) {
return Format::Uri;
}
if IPV4_REGEX.is_match(value) {
return Format::Ipv4;
}
if IPV6_REGEX.is_match(value) {
return Format::Ipv6;
}
if SEMVER_REGEX.is_match(value) {
return Format::Semver;
}
if PHONE_REGEX.is_match(value) && value.len() >= 8 && value.len() <= 15 {
return Format::Phone;
}
if COUNTRY_CODE_REGEX.is_match(value) {
return Format::CountryCode;
}
if CURRENCY_CODE_REGEX.is_match(value) {
return Format::CurrencyCode;
}
if value.len() >= 4 && value.len() % 4 == 0 && BASE64_REGEX.is_match(value) {
return Format::Base64;
}
if HOSTNAME_REGEX.is_match(value) && value.contains('.') {
return Format::Hostname;
}
Format::None
}
pub fn format_confidence(values: &[&str], format: Format) -> f64 {
if values.is_empty() {
return 0.0;
}
let matches = values.iter().filter(|v| detect_format(v) == format).count();
matches as f64 / values.len() as f64
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_detect_date() {
assert_eq!(detect_format("2024-01-15"), Format::Date);
assert_eq!(detect_format("2024-12-31"), Format::Date);
assert_ne!(detect_format("2024-1-15"), Format::Date); }
#[test]
fn test_detect_datetime() {
assert_eq!(detect_format("2024-01-15T10:30:00"), Format::DateTime);
assert_eq!(detect_format("2024-01-15T10:30:00Z"), Format::DateTime);
assert_eq!(detect_format("2024-01-15T10:30:00+05:00"), Format::DateTime);
assert_eq!(detect_format("2024-01-15 10:30:00"), Format::DateTime);
}
#[test]
fn test_detect_time() {
assert_eq!(detect_format("10:30:00"), Format::Time);
assert_eq!(detect_format("23:59:59.999"), Format::Time);
}
#[test]
fn test_detect_email() {
assert_eq!(detect_format("user@example.com"), Format::Email);
assert_eq!(detect_format("user.name+tag@domain.co.uk"), Format::Email);
}
#[test]
fn test_detect_uuid() {
assert_eq!(
detect_format("550e8400-e29b-41d4-a716-446655440000"),
Format::Uuid
);
assert_eq!(
detect_format("550E8400-E29B-41D4-A716-446655440000"),
Format::Uuid
);
}
#[test]
fn test_detect_uri() {
assert_eq!(detect_format("https://example.com"), Format::Uri);
assert_eq!(detect_format("http://localhost:8080/path"), Format::Uri);
assert_eq!(
detect_format("ftp://files.example.com/file.txt"),
Format::Uri
);
}
#[test]
fn test_detect_ipv4() {
assert_eq!(detect_format("192.168.1.1"), Format::Ipv4);
assert_eq!(detect_format("255.255.255.255"), Format::Ipv4);
assert_eq!(detect_format("0.0.0.0"), Format::Ipv4);
}
#[test]
fn test_detect_semver() {
assert_eq!(detect_format("1.0.0"), Format::Semver);
assert_eq!(detect_format("2.1.3-alpha"), Format::Semver);
assert_eq!(detect_format("0.0.1+build.123"), Format::Semver);
}
#[test]
fn test_detect_country_code() {
assert_eq!(detect_format("US"), Format::CountryCode);
assert_eq!(detect_format("GB"), Format::CountryCode);
assert_eq!(detect_format("DE"), Format::CountryCode);
}
#[test]
fn test_detect_currency_code() {
assert_eq!(detect_format("USD"), Format::CurrencyCode);
assert_eq!(detect_format("EUR"), Format::CurrencyCode);
assert_eq!(detect_format("GBP"), Format::CurrencyCode);
}
#[test]
fn test_format_confidence() {
let dates = vec!["2024-01-01", "2024-02-15", "2024-03-20"];
assert_eq!(format_confidence(&dates, Format::Date), 1.0);
let mixed = vec!["2024-01-01", "not-a-date", "2024-03-20"];
assert!((format_confidence(&mixed, Format::Date) - 0.666).abs() < 0.01);
}
#[test]
fn test_empty_and_whitespace() {
assert_eq!(detect_format(""), Format::None);
assert_eq!(detect_format(" "), Format::None);
}
}