use crate::packs::currency::{detect_currency_format, CurrencyFormat};
use crate::packs::time::{detect_format, DateFormat};
use crate::packs::units::{parse_unit_value, UnitCategory};
/// One candidate interpretation of a string, paired with a score.
///
/// [`guess_type`] returns a list of these, ordered from the highest
/// `confidence` to the lowest.
#[derive(Debug, Clone)]
pub struct TypeGuess {
    /// The type the string is guessed to be.
    pub kind: GuessedType,
    /// Score used to rank this guess against the others; higher ranks first.
    pub confidence: f64,
}
/// The kinds of value that [`guess_type`] can report for a string.
#[derive(Debug, Clone, PartialEq)]
pub enum GuessedType {
    /// The string parses as an `i64`.
    Integer,
    /// The string parses as a finite `f64` (also reported for integers).
    Float,
    /// A boolean-like token such as `true`, `no`, `on`, or `1`/`0`.
    Boolean,
    /// A date/time value, tagged with the format the time pack detected.
    Date(DateFormat),
    /// A monetary amount.
    Currency,
    /// A quantity with a unit, tagged with the unit's category.
    UnitValue(UnitCategory),
    /// A JSON object or array.
    Json,
    /// A hyphenated 8-4-4-4-12 hexadecimal UUID.
    Uuid,
    /// An e-mail address.
    Email,
    /// A URL starting with `http://`, `https://` or `ftp://`.
    Url,
    /// An IP address.
    IpAddress,
    /// A placeholder for a missing value, such as `null`, `n/a` or the empty string.
    NullSentinel,
    /// An IBAN.
    Iban,
    /// A credit card number, as reported by the identifiers pack.
    CreditCard,
    /// An ISBN (the identifiers pack's ISBN-10 and ISBN-13 both map here).
    Isbn,
    /// A US SSN, as reported by the identifiers pack.
    Ssn,
    /// A US EIN, as reported by the identifiers pack.
    Ein,
    /// An EU VAT number, as reported by the identifiers pack.
    VatNumber,
    /// A phone number, as reported by the identifiers pack.
    Phone,
    /// Fallback used when no more specific type applies.
    PlainString,
}
pub fn guess_type(s: &str) -> Vec<TypeGuess> {
let s = s.trim();
let mut guesses = Vec::new();
let lower = s.to_lowercase();
if matches!(
lower.as_str(),
"null" | "none" | "nil" | "n/a" | "na" | "nan" | "undefined" | "-" | "" | "unknown"
) {
guesses.push(TypeGuess {
kind: GuessedType::NullSentinel,
confidence: 0.95,
});
}
if matches!(
lower.as_str(),
"true" | "false" | "yes" | "no" | "y" | "n" | "on" | "off" | "t" | "f"
) {
guesses.push(TypeGuess {
kind: GuessedType::Boolean,
confidence: 0.9,
});
}
if s.parse::<i64>().is_ok() {
let is_bool_like = s == "1" || s == "0";
guesses.push(TypeGuess {
kind: GuessedType::Integer,
confidence: if is_bool_like { 0.5 } else { 0.95 },
});
guesses.push(TypeGuess {
kind: GuessedType::Float,
confidence: if is_bool_like { 0.5 } else { 0.7 },
});
} else if let Ok(f) = s.parse::<f64>() {
if !f.is_nan() && !f.is_infinite() {
guesses.push(TypeGuess {
kind: GuessedType::Float,
confidence: 0.95,
});
}
}
if s == "1" || s == "0" {
guesses.push(TypeGuess {
kind: GuessedType::Boolean,
confidence: 0.6,
});
}
let is_url_prefix =
s.starts_with("http://") || s.starts_with("https://") || s.starts_with("ftp://");
if !is_url_prefix && s.contains('@') && s.contains('.') && !s.contains(' ') {
let parts: Vec<&str> = s.split('@').collect();
if parts.len() == 2 && !parts[0].is_empty() && parts[1].contains('.') {
guesses.push(TypeGuess {
kind: GuessedType::Email,
confidence: 0.9,
});
guesses.sort_by(|a, b| {
b.confidence
.partial_cmp(&a.confidence)
.unwrap_or(std::cmp::Ordering::Equal)
});
return guesses;
}
}
let date_format = detect_format(s);
match date_format {
DateFormat::Unknown => {}
DateFormat::Ambiguous => {
guesses.push(TypeGuess {
kind: GuessedType::Date(date_format),
confidence: 0.6,
});
}
_ => {
guesses.push(TypeGuess {
kind: GuessedType::Date(date_format),
confidence: 0.85,
});
}
}
let currency_format = detect_currency_format(s);
match currency_format {
CurrencyFormat::NotCurrency | CurrencyFormat::PlainNumber => {}
CurrencyFormat::EuropeanLocale { .. } => {
guesses.push(TypeGuess {
kind: GuessedType::Currency,
confidence: 0.7,
});
}
_ => {
guesses.push(TypeGuess {
kind: GuessedType::Currency,
confidence: 0.9,
});
}
}
if s.starts_with('(') && s.ends_with(')') {
let inner = &s[1..s.len() - 1];
let stripped: String = inner.chars().filter(|c| *c != ',').collect();
if stripped.parse::<f64>().is_ok() {
guesses.push(TypeGuess {
kind: GuessedType::Currency,
confidence: 0.85,
});
}
}
if let Some(uv) = parse_unit_value(s) {
guesses.push(TypeGuess {
kind: GuessedType::UnitValue(uv.category),
confidence: 0.85,
});
} else if crate::packs::units::parse_qualified_weight(s).is_some() {
guesses.push(TypeGuess {
kind: GuessedType::UnitValue(crate::packs::units::UnitCategory::Weight),
confidence: 0.85,
});
} else if crate::packs::units::parse_pack_notation(s).is_some() {
guesses.push(TypeGuess {
kind: GuessedType::UnitValue(crate::packs::units::UnitCategory::Weight),
confidence: 0.8,
});
}
if s.len() == 36 && s.chars().filter(|c| *c == '-').count() == 4 {
let hex_parts: Vec<&str> = s.split('-').collect();
if hex_parts.len() == 5
&& hex_parts[0].len() == 8
&& hex_parts[1].len() == 4
&& hex_parts[2].len() == 4
&& hex_parts[3].len() == 4
&& hex_parts[4].len() == 12
&& s.chars().all(|c| c.is_ascii_hexdigit() || c == '-')
{
guesses.push(TypeGuess {
kind: GuessedType::Uuid,
confidence: 0.95,
});
}
}
if s.starts_with("http://") || s.starts_with("https://") || s.starts_with("ftp://") {
guesses.push(TypeGuess {
kind: GuessedType::Url,
confidence: 0.95,
});
}
let looks_like_ipv4 = s.split('.').count() == 4
&& s.split('.')
.all(|p| !p.is_empty() && p.chars().all(|c| c.is_ascii_digit()));
if looks_like_ipv4 {
if s.split('.').all(|p| p.parse::<u8>().is_ok()) {
guesses.push(TypeGuess {
kind: GuessedType::IpAddress,
confidence: 0.9,
});
} else {
guesses.push(TypeGuess {
kind: GuessedType::PlainString,
confidence: 0.95,
});
guesses.sort_by(|a, b| {
b.confidence
.partial_cmp(&a.confidence)
.unwrap_or(std::cmp::Ordering::Equal)
});
return guesses;
}
}
if s.contains(':')
&& s.split(':').count() >= 3
&& s.chars().all(|c| c.is_ascii_hexdigit() || c == ':')
{
guesses.push(TypeGuess {
kind: GuessedType::IpAddress,
confidence: 0.8,
});
}
let trimmed = s.trim();
if ((trimmed.starts_with('{') && trimmed.ends_with('}'))
|| (trimmed.starts_with('[') && trimmed.ends_with(']')))
&& serde_json::from_str::<serde_json::Value>(s).is_ok()
{
guesses.push(TypeGuess {
kind: GuessedType::Json,
confidence: 0.95,
});
}
{
let stripped: String = s.chars().filter(|c| !c.is_whitespace()).collect();
let upper = stripped.to_uppercase();
if upper.len() >= 15
&& upper.len() <= 34
&& upper.is_ascii()
&& upper[..2].chars().all(|c| c.is_ascii_alphabetic())
&& upper[2..4].chars().all(|c| c.is_ascii_digit())
&& upper[4..].chars().all(|c| c.is_ascii_alphanumeric())
{
let check: u32 = upper[2..4].parse().unwrap_or(0);
let body = &upper[4..];
let has_variety = body.chars().collect::<std::collections::HashSet<_>>().len() > 2;
if check >= 2 && has_variety {
guesses.push(TypeGuess {
kind: GuessedType::Iban,
confidence: 0.8,
});
}
}
}
let id_candidates = crate::packs::identifiers::detect(s);
for (id_type, confidence) in id_candidates {
let kind = match id_type {
crate::packs::identifiers::IdentifierType::Iban => GuessedType::Iban,
crate::packs::identifiers::IdentifierType::CreditCard => GuessedType::CreditCard,
crate::packs::identifiers::IdentifierType::Isbn10
| crate::packs::identifiers::IdentifierType::Isbn13 => GuessedType::Isbn,
crate::packs::identifiers::IdentifierType::UsSsn => GuessedType::Ssn,
crate::packs::identifiers::IdentifierType::UsEin => GuessedType::Ein,
crate::packs::identifiers::IdentifierType::UsNpi
| crate::packs::identifiers::IdentifierType::UkNhs => continue, crate::packs::identifiers::IdentifierType::EuVat => GuessedType::VatNumber,
crate::packs::identifiers::IdentifierType::Uuid => continue, crate::packs::identifiers::IdentifierType::Email => continue, crate::packs::identifiers::IdentifierType::Phone => GuessedType::Phone,
};
guesses.push(TypeGuess { kind, confidence });
}
let has_high_confidence_id = guesses.iter().any(|g| {
matches!(g.kind, GuessedType::CreditCard | GuessedType::Isbn) && g.confidence >= 0.85
});
if has_high_confidence_id {
for g in guesses.iter_mut() {
if g.kind == GuessedType::Integer {
g.confidence = 0.5;
}
}
}
let has_specific_date_on_digits = guesses.iter().any(|g| {
matches!(
g.kind,
GuessedType::Date(DateFormat::Hl7Date)
| GuessedType::Date(DateFormat::UnixSeconds)
| GuessedType::Date(DateFormat::UnixMillis)
)
}) && s.chars().all(|c| c.is_ascii_digit());
if has_specific_date_on_digits {
for g in guesses.iter_mut() {
if g.kind == GuessedType::Integer || g.kind == GuessedType::Float {
g.confidence = 0.4;
}
}
}
if guesses.is_empty() {
guesses.push(TypeGuess {
kind: GuessedType::PlainString,
confidence: 1.0,
});
}
guesses.sort_by(|a, b| {
b.confidence
.partial_cmp(&a.confidence)
.unwrap_or(std::cmp::Ordering::Equal)
});
guesses
}