use crate::registrable_domain::IPV4_RE;
use once_cell::sync::Lazy;
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::sync::Mutex;
/// Classification assigned to a single segment.
///
/// Serde serializes the variant names in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SegmentType {
/// Fixed text; also the fallback when nothing else matches.
Literal,
/// A run of digits that is neither a timestamp nor a compact date.
Integer,
/// Digits, a dot, then more digits, with an optional leading minus.
Float,
// NOTE(review): nothing in the visible code produces `Number` — confirm where it is used.
Number,
/// 8-4-4-4-12 hexadecimal UUID.
Uuid,
/// Calendar date (`YYYY-MM-DD`, `YYYY/MM/DD`, `M/D/YYYY` or a plausible `YYYYMMDD`).
Date,
/// 10- or 13-digit integer in the timestamp ranges, or an ISO-style date-time.
Timestamp,
/// 32 or more hexadecimal digits.
Hash,
/// Lowercase alphanumeric runs joined by `-` or `_`.
Slug,
Ipv4,
Ipv6,
Url,
Email,
/// `true` / `false`, case-insensitive.
Boolean,
/// `v`-prefixed dotted version.
Version,
Locale,
Currency,
Phone,
Jwt,
Mime,
/// File name whose extension is in the known-extension table.
File,
/// `#`-prefixed hexadecimal color.
Color,
/// `a,b` numeric pair that fits latitude/longitude bounds in either order.
Coordinate,
Country,
Base64,
// NOTE(review): `Year`, `HttpStatus` and `Enum` are not produced by the visible code — confirm their producers.
Year,
HttpStatus,
Enum,
/// Token of URL-unreserved-style characters that fits no more specific type.
OpaqueId,
}
impl SegmentType {
pub fn as_str(&self) -> &'static str {
use SegmentType::*;
match self {
Literal => "literal",
Integer => "integer",
Float => "float",
Number => "number",
Uuid => "uuid",
Date => "date",
Timestamp => "timestamp",
Hash => "hash",
Slug => "slug",
Ipv4 => "ipv4",
Ipv6 => "ipv6",
Url => "url",
Email => "email",
Boolean => "boolean",
Version => "version",
Locale => "locale",
Currency => "currency",
Phone => "phone",
Jwt => "jwt",
Mime => "mime",
File => "file",
Color => "color",
Coordinate => "coordinate",
Country => "country",
Base64 => "base64",
Year => "year",
HttpStatus => "http_status",
Enum => "enum",
OpaqueId => "opaque_id",
}
}
}
impl std::fmt::Display for SegmentType {
    /// Writes exactly the name returned by `as_str`, ignoring width/precision flags.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name: &'static str = self.as_str();
        f.write_str(name)
    }
}
// Lazily compiled, fully anchored patterns for the basic scalar shapes.
// Note: in the `regex` crate `\d` is Unicode-aware by default, so it also
// matches non-ASCII decimal digits.

// Optional minus, digits, a dot, digits.
static FLOAT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^-?\d+\.\d+$").unwrap());
// `YYYY-MM-DD`, then `T` or a space, `HH:MM`, optional `:SS`, optional
// fractional part, optional `Z` or `+HH:MM` / `-HHMM` style offset.
static ISO_TIME_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(:\d{2})?(\.\d+)?(Z|[+\-]\d{2}:?\d{2})?$")
.unwrap()
});
// 32 or more hexadecimal digits.
static HASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[0-9a-fA-F]{32,}$").unwrap());
// Two or more lowercase-alphanumeric runs joined by `-` or `_`.
static SLUG_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-z0-9]+(?:[-_][a-z0-9]+)+$").unwrap());
// A Unicode letter followed by letters, combining marks or underscores.
static LITERAL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\p{L}[\p{L}\p{M}_]*$").unwrap());
// Four or more ASCII letters, digits, `_`, `-`, `.` or `~`.
static OPAQUE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Za-z0-9_\-.~]{4,}$").unwrap());
// Eight colon-separated groups of 1-4 hex digits (uncompressed IPv6).
static IPV6_FULL_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^[0-9a-fA-F]{1,4}(?::[0-9a-fA-F]{1,4}){7}$").unwrap());
// Deliberately loose: any run of two or more hex digits and colons. The pattern
// itself does not require `::`; callers are expected to check for it.
static IPV6_COMPRESSED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[0-9a-fA-F:]{2,}$").unwrap());
// `scheme://` followed by at least one non-whitespace character.
static URL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://\S+$").unwrap());
// Dotted host name ending in an alphabetic label of 2+ letters, then a `/`
// and an optional non-whitespace path.
static SCHEMELESS_URL_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^[a-zA-Z0-9\-]+(?:\.[a-zA-Z0-9\-]+)*\.[a-zA-Z]{2,}/\S*$").unwrap());
// `local@domain` where the domain has at least two labels and no label
// starts or ends with a hyphen.
static EMAIL_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?(?:\.[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?)+$",
)
.unwrap()
});
// `true` or `false`, case-insensitive.
static BOOLEAN_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^(?i:true|false)$").unwrap());
// `v` + dotted numeric version, with an optional `-`/`+` suffix.
static VERSION_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^v\d+(?:\.\d+)*(?:[-+][A-Za-z0-9.\-]+)?$").unwrap());
// Language (2-3 lowercase letters, capture 1), `-` or `_`, then a 2-4
// character alphanumeric suffix (capture 2).
static LOCALE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^([a-z]{2,3})[-_]([A-Za-z0-9]{2,4})$").unwrap());
// Exactly two lowercase ASCII letters.
static LOCALE_BARE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[a-z]{2}$").unwrap());
// Exactly three ASCII letters of either case.
static CURRENCY_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Za-z]{3}$").unwrap());
// `+` followed by 7-20 digits, spaces, dashes, dots or parentheses.
static PHONE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\+[ \-.()\d]{7,20}$").unwrap());
// 3-3-4 digit groups with optional parentheses and separators; the first
// two groups must start with 2-9.
static PHONE_NANP_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^\(?([2-9]\d{2})\)?[ \-.]?([2-9]\d{2})[ \-.]?(\d{4})$").unwrap());
// Whole-string file name; capture 1 is the 1-8 character alphanumeric extension.
static FILE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^[A-Za-z0-9][A-Za-z0-9_\-.~]*\.([A-Za-z0-9]{1,8})$").unwrap());
// `#` followed by 3, 4, 6 or 8 hexadecimal digits.
static COLOR_HEX_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^#([0-9a-fA-F]{3}|[0-9a-fA-F]{4}|[0-9a-fA-F]{6}|[0-9a-fA-F]{8})$").unwrap()
});
// Two optionally signed decimal numbers separated by a comma (captures 1 and 2).
static COORDINATE_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)$").unwrap());
// Exactly two uppercase ASCII letters.
static COUNTRY_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Z]{2}$").unwrap());
// 16 or more standard-alphabet base64 characters with up to two `=` of padding.
static BASE64_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^[A-Za-z0-9+/]{16,}={0,2}$").unwrap());
// Three dot-separated URL-safe base64 parts, the first starting with `ey`.
static JWT_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^ey[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+\.[A-Za-z0-9_\-]+$").unwrap());
// `type/subtype` where the top-level type is one of the listed names.
static MIME_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(
r"^(?:text|image|video|audio|application|multipart|message|font|model)/[A-Za-z0-9!#$&^_+\-.]+$",
)
.unwrap()
});
// 8-4-4-4-12 hexadecimal groups separated by dashes.
static UUID_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")
.unwrap()
});
// Digit-only and date-shaped patterns. `\d` is Unicode-aware in the `regex`
// crate, so a match does NOT guarantee single-byte ASCII digits; code that
// slices a matched string by byte offset must check `is_ascii()` first.

// One or more digits.
static INTEGER_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d+$").unwrap());
// Exactly eight digits (candidate `YYYYMMDD`).
static COMPACT_DATE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{8}$").unwrap());
// `YYYY-MM-DD`.
static DATE_ISO_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap());
// `YYYY/MM/DD`.
static DATE_SLASH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^\d{4}/\d{2}/\d{2}$").unwrap());
// `M/D/YYYY` with 1-2 digit month (capture 1) and day (capture 2), year in capture 3.
static DATE_US_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^(\d{1,2})/(\d{1,2})/(\d{4})$").unwrap());
// Trailing `.ext` (1-8 alphanumerics, capture 1); not anchored at the start.
static FILE_EXT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\.([A-Za-z0-9]{1,8})$").unwrap());
/// Uppercase two-letter codes that this module accepts as countries.
static COUNTRY_CODES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from([
        "AD", "AE", "AF", "AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", "BE",
        "BG", "BH", "BJ", "BM", "BN", "BO", "BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG",
        "CH", "CI", "CL", "CM", "CN", "CO", "CR", "CU", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO",
        "DZ", "EC", "EE", "EG", "ER", "ES", "ET", "FI", "FJ", "FK", "FM", "FO", "FR", "GA", "GB",
        "GE", "GH", "GI", "GL", "GM", "GN", "GR", "GT", "GU", "GW", "GY", "HK", "HN", "HR", "HT",
        "HU", "ID", "IE", "IL", "IM", "IN", "IQ", "IR", "IS", "IT", "JM", "JO", "JP", "KE", "KG",
        "KH", "KM", "KN", "KP", "KR", "KW", "KY", "KZ", "LA", "LB", "LC", "LI", "LK", "LR", "LS",
        "LT", "LU", "LV", "LY", "MA", "MC", "MD", "ME", "MG", "MK", "ML", "MM", "MN", "MO", "MR",
        "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO", "NP", "NR",
        "NU", "NZ", "OM", "PA", "PE", "PF", "PG", "PH", "PK", "PL", "PR", "PT", "PW", "PY", "QA",
        "RE", "RO", "RS", "RU", "RW", "SA", "SB", "SC", "SD", "SE", "SG", "SI", "SK", "SL", "SM",
        "SN", "SO", "SR", "SS", "ST", "SV", "SY", "SZ", "TD", "TG", "TH", "TJ", "TM", "TN", "TO",
        "TR", "TT", "TV", "TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VA", "VC", "VE", "VG", "VI",
        "VN", "VU", "WS", "YE", "ZA", "ZM", "ZW",
    ])
});
/// Lowercase two-letter language codes accepted as (the language part of) a locale.
static LOCALE_LANGUAGE_CODES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from([
        "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gu",
        "he", "hi", "hr", "hu", "id", "it", "ja", "ka", "kk", "km", "kn", "ko", "lt", "lv", "mk",
        "ml", "mr", "ms", "my", "nb", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "sr",
        "sv", "sw", "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "zh",
    ])
});
/// Uppercase three-letter currency codes this module recognizes.
static CURRENCY_CODES: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    HashSet::from([
        "USD", "EUR", "GBP", "JPY", "CNY", "CHF", "CAD", "AUD", "NZD", "HKD", "SGD", "INR", "KRW",
        "MXN", "BRL", "ZAR", "SEK", "NOK", "DKK", "PLN", "CZK", "HUF", "RUB", "TRY", "ILS", "AED",
        "SAR", "THB", "IDR", "PHP", "VND", "TWD", "MYR", "NGN", "EGP",
    ])
});
/// Broad category of a file, derived from its extension via the
/// extension-to-kind table in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileKind {
/// Raster/vector image extensions such as `png` or `svg`.
Image,
/// Office/document formats such as `pdf` or `docx`.
Document,
/// Structured data formats such as `csv` or `json`.
Data,
/// Plain-text formats such as `txt` or `md`.
Text,
/// Browser-side assets such as `html`, `css` or `js`.
Web,
Audio,
Video,
/// Archive/compression formats such as `zip` or `gz`.
Archive,
/// Source-code extensions such as `rs` or `py`.
Code,
}
impl FileKind {
pub fn as_str(&self) -> &'static str {
match self {
FileKind::Image => "image",
FileKind::Document => "document",
FileKind::Data => "data",
FileKind::Text => "text",
FileKind::Web => "web",
FileKind::Audio => "audio",
FileKind::Video => "video",
FileKind::Archive => "archive",
FileKind::Code => "code",
}
}
}
/// Maps a lowercase file extension (without the dot) to its [`FileKind`].
static FILE_EXTENSION_KIND: Lazy<HashMap<&'static str, FileKind>> = Lazy::new(|| {
    use FileKind::*;
    // One row per kind; rows are inserted in order, exactly like a sequence of loops.
    let groups: &[(FileKind, &[&'static str])] = &[
        (
            Image,
            &[
                "png", "jpg", "jpeg", "gif", "webp", "svg", "bmp", "tiff", "tif", "ico", "avif",
                "heic", "heif",
            ],
        ),
        (
            Document,
            &[
                "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "odt", "ods", "odp", "rtf",
                "epub",
            ],
        ),
        (
            Data,
            &[
                "csv", "tsv", "json", "xml", "yaml", "yml", "parquet", "sqlite", "db", "ndjson",
                "jsonl",
            ],
        ),
        (Text, &["txt", "md", "log", "markdown", "rst"]),
        (
            Web,
            &["html", "htm", "css", "js", "mjs", "cjs", "ts", "jsx", "tsx"],
        ),
        (Audio, &["mp3", "wav", "ogg", "flac", "aac", "m4a", "opus"]),
        (
            Video,
            &["mp4", "mov", "avi", "mkv", "webm", "flv", "wmv", "m4v"],
        ),
        (
            Archive,
            &["zip", "tar", "gz", "bz2", "7z", "rar", "xz", "tgz"],
        ),
        (
            Code,
            &[
                "rb", "py", "go", "java", "c", "cc", "cpp", "h", "hpp", "sh", "swift", "kt", "rs",
            ],
        ),
    ];
    let mut table = HashMap::new();
    for &(kind, extensions) in groups {
        for &ext in extensions {
            table.insert(ext, kind);
        }
    }
    table
});
/// Maps a lowercase parameter name to the segment type that name suggests.
static PARAM_NAME_HINTS: Lazy<HashMap<&'static str, SegmentType>> = Lazy::new(|| {
    use SegmentType::*;
    // One row per hinted type; rows are inserted in order.
    let groups: &[(SegmentType, &[&'static str])] = &[
        (Phone, &["phone", "tel", "telephone", "mobile", "cell"]),
        (Email, &["email", "e_mail", "mail"]),
        (Locale, &["locale", "lang", "language"]),
        (Currency, &["currency", "cur", "curr"]),
        (
            Url,
            &[
                "url",
                "uri",
                "redirect",
                "redirect_url",
                "return_to",
                "return_url",
                "callback",
                "callback_url",
                "next_url",
            ],
        ),
        (Jwt, &["jwt", "bearer", "auth_token"]),
        (Mime, &["mime", "content_type", "media_type"]),
        (
            Color,
            &["color", "colour", "bg", "background", "fg", "foreground"],
        ),
        (
            Coordinate,
            &[
                "coords",
                "coordinates",
                "geo",
                "location",
                "position",
                "latlng",
                "latlon",
            ],
        ),
        (Country, &["country", "country_code", "nation"]),
    ];
    let mut hints = HashMap::new();
    for &(ty, names) in groups {
        for &name in names {
            hints.insert(name, ty);
        }
    }
    hints
});
/// Returns `true` for the types a parameter-name hint may replace:
/// `Literal`, `OpaqueId` and `Slug`.
fn is_overridable(t: SegmentType) -> bool {
    [
        SegmentType::Literal,
        SegmentType::OpaqueId,
        SegmentType::Slug,
    ]
    .contains(&t)
}
/// Suggests a more specific type for a value based on its parameter name.
///
/// Returns `None` when `name` is empty, when `current` is not one of the
/// overridable types, or when the lowercased name has no registered hint.
pub fn param_name_hint(name: &str, current: SegmentType) -> Option<SegmentType> {
    if !name.is_empty() && is_overridable(current) {
        let key = name.to_lowercase();
        PARAM_NAME_HINTS.get(key.as_str()).copied()
    } else {
        None
    }
}
/// Returns the display name for a type: both IP variants collapse to `"ip"`,
/// every other type uses its `as_str` name.
pub fn display_type(t: SegmentType) -> &'static str {
    if matches!(t, SegmentType::Ipv4 | SegmentType::Ipv6) {
        "ip"
    } else {
        t.as_str()
    }
}
/// Returns `Some("hex")` when `value` is a `#`-prefixed hex color, else `None`.
pub fn color_kind(value: &str) -> Option<&'static str> {
    COLOR_HEX_RE.is_match(value).then_some("hex")
}
/// Looks up the [`FileKind`] for the trailing extension of `value`.
///
/// The extension is matched case-insensitively (ASCII). Returns `None` when
/// `value` has no trailing `.ext` or the extension is not in the table.
pub fn file_kind(value: &str) -> Option<FileKind> {
    FILE_EXT_RE
        .captures(value)
        // Group 1 always participates when the pattern matches.
        .map(|caps| caps[1].to_ascii_lowercase())
        .and_then(|ext| FILE_EXTENSION_KIND.get(ext.as_str()).copied())
}
/// Returns the ASCII-uppercased currency code when `value` is a known
/// currency (compared case-insensitively), otherwise `None`.
pub fn canonical_currency(value: &str) -> Option<String> {
    if value.is_empty() {
        return None;
    }
    let code = value.to_ascii_uppercase();
    CURRENCY_CODES.contains(code.as_str()).then_some(code)
}
/// Normalizes a date string to `YYYY-MM-DD`.
///
/// Accepts the separated forms handled by `canonical_date_from_forms`
/// (`YYYY-MM-DD`, `YYYY/MM/DD`, `M/D/YYYY`) and the compact `YYYYMMDD` form.
/// Returns `None` when the value is not date-shaped or fails the
/// plausibility check (year 1900-2100, month 1-12, day 1-31).
pub fn canonical_date(value: &str) -> Option<String> {
    if let Some(canonical) = canonical_date_from_forms(value) {
        return Some(canonical);
    }
    // `\d` in the regex crate also matches multi-byte Unicode digits, so a
    // match alone does not make the byte-offset slices below safe: slicing in
    // the middle of a character panics. Requiring ASCII guarantees that every
    // digit is exactly one byte. Non-ASCII digits could never pass the
    // numeric plausibility check anyway.
    if value.is_ascii() && COMPACT_DATE_RE.is_match(value) {
        let (y, m, d) = (&value[0..4], &value[4..6], &value[6..8]);
        if plausible_date(y, m, d) {
            return Some(format!("{}-{}-{}", y, m, d));
        }
    }
    None
}
/// Normalizes the separated date forms to `YYYY-MM-DD`.
///
/// Handles `YYYY-MM-DD`, `YYYY/MM/DD` and `M/D/YYYY`. Returns `None` when
/// `value` matches none of them or the components are not a plausible date.
fn canonical_date_from_forms(value: &str) -> Option<String> {
    // The date patterns use Unicode-aware `\d`, so they also match multi-byte
    // digits. The fixed byte offsets below would then slice inside a
    // character and panic. Such input can never be a plausible date (the
    // components would not parse as integers), so reject it up front.
    if !value.is_ascii() {
        return None;
    }
    if DATE_ISO_RE.is_match(value) {
        let (y, m, d) = (&value[0..4], &value[5..7], &value[8..10]);
        // Already canonical; only the plausibility check decides.
        return plausible_date(y, m, d).then(|| value.to_string());
    }
    if DATE_SLASH_RE.is_match(value) {
        let (y, m, d) = (&value[0..4], &value[5..7], &value[8..10]);
        return plausible_date(y, m, d).then(|| format!("{}-{}-{}", y, m, d));
    }
    if let Some(caps) = DATE_US_RE.captures(value) {
        // US order is month/day/year; month and day may be a single digit.
        let mon = pad2(&caps[1]);
        let day = pad2(&caps[2]);
        let year = &caps[3];
        if plausible_date(year, &mon, &day) {
            return Some(format!("{}-{}-{}", year, mon, day));
        }
    }
    None
}
/// Left-pads a one-byte string with `0`; any other length is returned unchanged.
fn pad2(s: &str) -> String {
    match s.len() {
        1 => format!("0{}", s),
        _ => s.to_string(),
    }
}
/// Coarse sanity check on date components given as decimal strings.
///
/// Requires year 1900-2100, month 1-12 and day 1-31. The day is not checked
/// against the month length. A component that does not parse as an integer
/// makes the date implausible.
fn plausible_date(y: &str, m: &str, d: &str) -> bool {
    fn within(text: &str, range: std::ops::RangeInclusive<i32>) -> bool {
        text.parse::<i32>().map_or(false, |n| range.contains(&n))
    }
    within(y, 1900..=2100) && within(m, 1..=12) && within(d, 1..=31)
}
// Specificity weights attached to recognizer verdicts. `ensemble` multiplies
// specificity by confidence and keeps the highest-scoring verdict.
// Used by the UUID recognizer.
const SPECIFICITY_SEMANTIC: f64 = 1.0;
// Used for date verdicts.
const SPECIFICITY_STRUCTURED: f64 = 0.8;
// Used for integers that fall inside the timestamp ranges.
const SPECIFICITY_BOUNDED: f64 = 0.7;
// Used for plain integers.
const SPECIFICITY_TYPED: f64 = 0.5;
/// A recognizer's answer for one segment.
///
/// `ensemble` ranks verdicts by `specificity * confidence`.
#[derive(Debug, Clone)]
pub struct Verdict {
/// The segment type the recognizer proposes.
pub ty: SegmentType,
/// How sure the recognizer is; the built-in recognizers always use `1.0`.
pub confidence: f64,
/// How narrow the proposed type is; the built-in recognizers use the
/// `SPECIFICITY_*` constants.
pub specificity: f64,
}
/// A pluggable classifier for a single segment.
///
/// Implementations must be `Send + Sync` because recognizers are stored as
/// `Arc<dyn Recognizer>` inside the shared classifier state.
pub trait Recognizer: Send + Sync {
/// Returns a [`Verdict`] when this recognizer has an opinion about
/// `segment`, or `None` to abstain.
fn try_classify(&self, segment: &str) -> Option<Verdict>;
}
pub fn segment_type_from_str(s: &str) -> Option<SegmentType> {
use SegmentType::*;
Some(match s {
"literal" => Literal,
"integer" => Integer,
"float" => Float,
"number" => Number,
"uuid" => Uuid,
"date" => Date,
"timestamp" => Timestamp,
"hash" => Hash,
"slug" => Slug,
"ipv4" => Ipv4,
"ipv6" => Ipv6,
"url" => Url,
"email" => Email,
"boolean" => Boolean,
"version" => Version,
"locale" => Locale,
"currency" => Currency,
"phone" => Phone,
"jwt" => Jwt,
"mime" => Mime,
"file" => File,
"color" => Color,
"coordinate" => Coordinate,
"country" => Country,
"base64" => Base64,
"year" => Year,
"http_status" => HttpStatus,
"enum" => Enum,
"opaque_id" => OpaqueId,
_ => return None,
})
}
/// Runs every recognizer on `segment` and returns the verdict with the
/// highest `specificity * confidence`.
///
/// Ties keep the earliest recognizer (the comparison is strict). A verdict
/// whose score is not greater than `-1.0` (including NaN) is never selected.
/// Returns `None` when no recognizer produced a selectable verdict.
fn ensemble(segment: &str, recognizers: &[std::sync::Arc<dyn Recognizer>]) -> Option<Verdict> {
    let (winner, _top_score) = recognizers
        .iter()
        .filter_map(|recognizer| recognizer.try_classify(segment))
        .fold(
            (None::<Verdict>, -1.0_f64),
            |(winner, top_score), verdict| {
                let score = verdict.specificity * verdict.confidence;
                if score > top_score {
                    (Some(verdict), score)
                } else {
                    (winner, top_score)
                }
            },
        );
    winner
}
/// Recognizes 36-character dashed hexadecimal UUIDs.
struct UuidRecognizer;

impl Recognizer for UuidRecognizer {
    fn try_classify(&self, segment: &str) -> Option<Verdict> {
        // Cheap length/dash checks run first so the regex only sees likely candidates.
        let is_uuid = segment.len() == 36 && segment.contains('-') && UUID_RE.is_match(segment);
        is_uuid.then(|| Verdict {
            ty: SegmentType::Uuid,
            confidence: 1.0,
            specificity: SPECIFICITY_SEMANTIC,
        })
    }
}
/// Recognizes separated date shapes: `YYYY-MM-DD`, `YYYY/MM/DD` and `M/D/YYYY`.
///
/// Only the shape is checked; the component values are not validated here.
struct DateRecognizer;

impl Recognizer for DateRecognizer {
    fn try_classify(&self, segment: &str) -> Option<Verdict> {
        // Every supported shape contains a `-` or a `/`; skip the regexes otherwise.
        if !(segment.contains('-') || segment.contains('/')) {
            return None;
        }
        let date_shaped = DATE_ISO_RE.is_match(segment)
            || DATE_SLASH_RE.is_match(segment)
            || DATE_US_RE.is_match(segment);
        date_shaped.then(|| Verdict {
            ty: SegmentType::Date,
            confidence: 1.0,
            specificity: SPECIFICITY_STRUCTURED,
        })
    }
}
// Inclusive ranges in which an integer is treated as a timestamp: every
// 10-digit value (seconds) and every 13-digit value (milliseconds).
// NOTE(review): presumably counted from the Unix epoch — confirm.
const TS_SECONDS_MIN: i64 = 1_000_000_000;
const TS_SECONDS_MAX: i64 = 9_999_999_999;
const TS_MILLIS_MIN: i64 = 1_000_000_000_000;
const TS_MILLIS_MAX: i64 = 9_999_999_999_999;
struct IntegerRecognizer;
impl Recognizer for IntegerRecognizer {
fn try_classify(&self, segment: &str) -> Option<Verdict> {
let c = segment.bytes().next()?;
if !c.is_ascii_digit() {
return None;
}
if !INTEGER_RE.is_match(segment) {
return None;
}
if let Ok(n) = segment.parse::<i64>() {
if (TS_MILLIS_MIN..=TS_MILLIS_MAX).contains(&n)
|| (TS_SECONDS_MIN..=TS_SECONDS_MAX).contains(&n)
{
return Some(Verdict {
ty: SegmentType::Timestamp,
confidence: 1.0,
specificity: SPECIFICITY_BOUNDED,
});
}
}
if COMPACT_DATE_RE.is_match(segment) {
let y: i32 = segment[0..4].parse().unwrap_or(-1);
let m: i32 = segment[4..6].parse().unwrap_or(-1);
let d: i32 = segment[6..8].parse().unwrap_or(-1);
if (1900..=2100).contains(&y) && (1..=12).contains(&m) && (1..=31).contains(&d) {
return Some(Verdict {
ty: SegmentType::Date,
confidence: 1.0,
specificity: SPECIFICITY_STRUCTURED,
});
}
}
Some(Verdict {
ty: SegmentType::Integer,
confidence: 1.0,
specificity: SPECIFICITY_TYPED,
})
}
}
// Cache capacity: once the classification cache holds this many entries it
// is cleared entirely before new results are stored.
const CACHE_MAX: usize = 10_000;
/// Classifies segments into [`SegmentType`]s, caching results and allowing
/// extra recognizers to be registered at runtime.
///
/// All mutable state sits behind one mutex, so methods take `&self`.
pub struct SegmentClassifier {
// Cache and recognizer list, guarded together.
state: Mutex<ClassifierState>,
}
/// Mutable state of a [`SegmentClassifier`], kept behind its mutex.
struct ClassifierState {
// Memoized classification per segment text.
cache: HashMap<String, SegmentType>,
// Recognizers consulted before the built-in pattern checks.
recognizers: Vec<std::sync::Arc<dyn Recognizer>>,
}
impl SegmentClassifier {
    /// Creates a classifier with the built-in UUID, date and integer
    /// recognizers and an empty cache.
    pub fn new() -> Self {
        let recognizers: Vec<std::sync::Arc<dyn Recognizer>> = vec![
            std::sync::Arc::new(UuidRecognizer),
            std::sync::Arc::new(DateRecognizer),
            std::sync::Arc::new(IntegerRecognizer),
        ];
        Self {
            state: Mutex::new(ClassifierState {
                cache: HashMap::new(),
                recognizers,
            }),
        }
    }

    /// Returns the type of `segment`, using the cache when possible.
    ///
    /// An empty segment is always `Literal` and is never cached. The actual
    /// classification runs without holding the lock, so recognizers cannot
    /// block other callers.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn classify(&self, segment: &str) -> SegmentType {
        if segment.is_empty() {
            return SegmentType::Literal;
        }
        // One critical section serves both the cache lookup and the
        // recognizer snapshot.
        let recognizers = {
            let st = self.state.lock().unwrap();
            if let Some(&hit) = st.cache.get(segment) {
                return hit;
            }
            st.recognizers.clone()
        };
        let t = compute_classification(segment, &recognizers);
        let mut st = self.state.lock().unwrap();
        // Recognizers are only ever appended, so an unchanged count means no
        // registration happened while the lock was released. If one did, this
        // result comes from a stale snapshot and must not repopulate the cache
        // that `register_recognizer` just cleared.
        if st.recognizers.len() == recognizers.len() {
            // Evict and insert under the same lock so the cache never grows
            // past CACHE_MAX.
            if st.cache.len() >= CACHE_MAX {
                st.cache.clear();
            }
            st.cache.insert(segment.to_string(), t);
        }
        t
    }

    /// Returns `true` for every type except `Literal`.
    pub fn variable(&self, t: SegmentType) -> bool {
        t != SegmentType::Literal
    }

    /// Appends a recognizer and clears the cache, since previously cached
    /// results may no longer hold.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn register_recognizer(&self, r: std::sync::Arc<dyn Recognizer>) {
        let mut st = self.state.lock().unwrap();
        st.recognizers.push(r);
        st.cache.clear();
    }

    /// Returns the number of registered recognizers, built-ins included.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn recognizer_count(&self) -> usize {
        self.state.lock().unwrap().recognizers.len()
    }
}
impl Default for SegmentClassifier {
fn default() -> Self {
Self::new()
}
}
/// Shared classifier, built with [`SegmentClassifier::new`] on first use.
pub static DEFAULT_CLASSIFIER: Lazy<SegmentClassifier> = Lazy::new(SegmentClassifier::new);
/// Classifies one segment without touching the cache.
///
/// The recognizers are consulted first; if any of them yields a verdict, the
/// best one wins outright. Otherwise the segment goes through an ordered
/// chain of pattern checks, and the first check that matches decides the
/// result. The order is significant. Each regex is guarded by a cheap byte
/// test (length, first byte, presence of a separator) so most patterns are
/// never run.
///
/// Falls back to `Literal` when nothing matches.
fn compute_classification(
segment: &str,
recognizers: &[std::sync::Arc<dyn Recognizer>],
) -> SegmentType {
let bytes = segment.as_bytes();
let size = bytes.len();
if size == 0 {
return SegmentType::Literal;
}
// Cheap byte-level features used to gate the regex checks below.
let first = bytes[0];
let digit0 = first.is_ascii_digit();
let has_dash = bytes.contains(&b'-');
let has_dot = bytes.contains(&b'.');
let has_colon = bytes.contains(&b':');
let has_slash = bytes.contains(&b'/');
let has_at = bytes.contains(&b'@');
let has_under = bytes.contains(&b'_');
let has_sep = has_dash || has_under;
let has_comma = bytes.contains(&b',');
let has_eq = bytes.contains(&b'=');
let has_plus = bytes.contains(&b'+');
// Recognizers take precedence over every built-in pattern.
if let Some(v) = ensemble(segment, recognizers) {
return v.ty;
}
// JWT: starts with "ey" and has exactly two dots.
if size > 4
&& bytes[0] == b'e'
&& bytes[1] == b'y'
&& segment.matches('.').count() == 2
&& JWT_RE.is_match(segment)
{
return SegmentType::Jwt;
}
if first == b'#' && COLOR_HEX_RE.is_match(segment) {
return SegmentType::Color;
}
if has_colon && segment.contains("://") && URL_RE.is_match(segment) {
return SegmentType::Url;
}
if has_at && EMAIL_RE.is_match(segment) {
return SegmentType::Email;
}
// MIME is tested before scheme-less URLs; both contain a slash.
if has_slash && MIME_RE.is_match(segment) {
return SegmentType::Mime;
}
if has_dot && has_slash && SCHEMELESS_URL_RE.is_match(segment) {
return SegmentType::Url;
}
// Dotted quad: the octet range check happens in classify_ipv4.
if digit0 && has_dot && IPV4_RE.is_match(segment) {
return classify_ipv4(segment);
}
if has_colon && IPV6_FULL_RE.is_match(segment) {
return SegmentType::Ipv6;
}
// The compressed pattern is loose, so "::" is required explicitly.
if has_colon && segment.contains("::") && IPV6_COMPRESSED_RE.is_match(segment) {
return SegmentType::Ipv6;
}
if has_comma && COORDINATE_RE.is_match(segment) {
return classify_coordinate(segment);
}
if size >= 32 && HASH_RE.is_match(segment) {
return SegmentType::Hash;
}
if first == b'v' && VERSION_RE.is_match(segment) {
return SegmentType::Version;
}
// "true" is 4 bytes, "false" is 5.
if (4..=5).contains(&size) && BOOLEAN_RE.is_match(segment) {
return SegmentType::Boolean;
}
// Locale-shaped pairs are decided here, even when the language is unknown.
if has_sep && LOCALE_RE.is_match(segment) {
return classify_locale_pair(segment);
}
if size == 2 && LOCALE_BARE_RE.is_match(segment) {
return classify_locale_bare(segment);
}
if has_colon && ISO_TIME_RE.is_match(segment) {
return SegmentType::Timestamp;
}
// International phone numbers start with '+'; the digit count is checked in classify_phone.
if first == b'+' && PHONE_RE.is_match(segment) {
return classify_phone(segment);
}
if (has_dash || has_dot || first == b'(') && PHONE_NANP_RE.is_match(segment) {
return SegmentType::Phone;
}
if has_dot && FLOAT_RE.is_match(segment) {
return SegmentType::Float;
}
if size == 3 && CURRENCY_RE.is_match(segment) {
return classify_currency(segment);
}
if size == 2 && COUNTRY_RE.is_match(segment) {
return classify_country(segment);
}
// Require a base64-only character ('=', '+' or '/') so plain alphanumeric
// tokens are not reported as base64.
if size >= 16 && (has_eq || has_plus || has_slash) && BASE64_RE.is_match(segment) {
return SegmentType::Base64;
}
if has_dot && FILE_RE.is_match(segment) {
return classify_file(segment);
}
if has_sep && SLUG_RE.is_match(segment) {
return SegmentType::Slug;
}
if LITERAL_RE.is_match(segment) {
return SegmentType::Literal;
}
if OPAQUE_RE.is_match(segment) {
return SegmentType::OpaqueId;
}
SegmentType::Literal
}
/// Decides whether an `a,b` numeric pair is a coordinate.
///
/// The pair is a `Coordinate` when one value fits latitude bounds (±90) and
/// the other fits longitude bounds (±180), in either order. Anything else,
/// including a pair that fails to parse, is an `OpaqueId`.
fn classify_coordinate(segment: &str) -> SegmentType {
    let caps = match COORDINATE_RE.captures(segment) {
        Some(caps) => caps,
        None => return SegmentType::OpaqueId,
    };
    let component = |group: usize| caps[group].parse::<f64>().unwrap_or(f64::NAN);
    let (first, second) = (component(1), component(2));
    if first.is_nan() || second.is_nan() {
        return SegmentType::OpaqueId;
    }

    let is_latitude = |v: f64| (-90.0..=90.0).contains(&v);
    let is_longitude = |v: f64| (-180.0..=180.0).contains(&v);
    let lat_lon = is_latitude(first) && is_longitude(second);
    let lon_lat = is_longitude(first) && is_latitude(second);
    if lat_lon || lon_lat {
        SegmentType::Coordinate
    } else {
        SegmentType::OpaqueId
    }
}
/// Returns `Country` for a known country code, otherwise `Literal`.
/// The lookup is case-sensitive.
fn classify_country(segment: &str) -> SegmentType {
    if !COUNTRY_CODES.contains(segment) {
        return SegmentType::Literal;
    }
    SegmentType::Country
}
/// Classifies a file-name-shaped segment.
///
/// Returns `File` when the extension (compared in ASCII lowercase) is in the
/// known-extension table, otherwise `Slug` or `OpaqueId`.
fn classify_file(segment: &str) -> SegmentType {
    let extension = match FILE_RE.captures(segment) {
        Some(caps) => caps[1].to_ascii_lowercase(),
        None => return SegmentType::OpaqueId,
    };
    if FILE_EXTENSION_KIND.contains_key(extension.as_str()) {
        SegmentType::File
    } else if SLUG_RE.is_match(segment) {
        // NOTE(review): the file pattern requires a '.', which the slug pattern
        // never allows, so this branch looks unreachable — confirm before removing.
        SegmentType::Slug
    } else {
        SegmentType::OpaqueId
    }
}
/// Returns `Phone` when the segment contains 7 to 15 ASCII digits,
/// otherwise `OpaqueId`.
fn classify_phone(segment: &str) -> SegmentType {
    let digit_count = segment.chars().filter(char::is_ascii_digit).count();
    match digit_count {
        7..=15 => SegmentType::Phone,
        _ => SegmentType::OpaqueId,
    }
}
/// Classifies a three-letter segment: `Currency` when it is a known code
/// (case-insensitive), otherwise `Literal` if it is word-like, else `OpaqueId`.
fn classify_currency(segment: &str) -> SegmentType {
    if CURRENCY_CODES.contains(segment.to_ascii_uppercase().as_str()) {
        SegmentType::Currency
    } else if LITERAL_RE.is_match(segment) {
        SegmentType::Literal
    } else {
        SegmentType::OpaqueId
    }
}
/// Returns `Locale` for a known two-letter language code, otherwise `Literal`.
fn classify_locale_bare(segment: &str) -> SegmentType {
    if !LOCALE_LANGUAGE_CODES.contains(segment) {
        return SegmentType::Literal;
    }
    SegmentType::Locale
}
/// Classifies a `lang-REGION` / `lang_REGION` shaped segment.
///
/// Returns `Locale` when the language part is a known language code.
/// Otherwise the segment is a `Slug` if it matches the slug pattern, else a
/// `Literal` (also the result when the locale pattern does not match).
fn classify_locale_pair(segment: &str) -> SegmentType {
    let caps = match LOCALE_RE.captures(segment) {
        Some(caps) => caps,
        None => return SegmentType::Literal,
    };
    // Group 1 is the language part and always participates in a match.
    let language = &caps[1];
    if LOCALE_LANGUAGE_CODES.contains(language) {
        SegmentType::Locale
    } else if SLUG_RE.is_match(segment) {
        SegmentType::Slug
    } else {
        SegmentType::Literal
    }
}
/// Returns `Ipv4` when every dot-separated part parses as an integer no
/// greater than 255, otherwise `OpaqueId`.
fn classify_ipv4(segment: &str) -> SegmentType {
    let octets_in_range = segment
        .split('.')
        .all(|octet| matches!(octet.parse::<u32>(), Ok(n) if n <= 255));
    if octets_in_range {
        SegmentType::Ipv4
    } else {
        SegmentType::OpaqueId
    }
}