use crate::core::{current_validation_context, Constraint, ConstraintMetadata, ConstraintResult};
use crate::prelude::*;
use crate::security::SqlSecurity;
use arrow::array::Array;
use async_trait::async_trait;
use datafusion::prelude::*;
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::RwLock;
use tracing::instrument;
/// Process-wide cache of resolved regex patterns.
///
/// Entries are keyed by the `Debug` rendering of the `FormatType` that
/// produced them and start out empty.
static PATTERN_CACHE: Lazy<RwLock<HashMap<String, String>>> = Lazy::new(Default::default);
/// Value formats a column can be validated against.
///
/// Every variant resolves to a regular expression that is applied to the
/// column's text; the per-variant notes summarise what that expression accepts.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FormatType {
/// Caller-supplied regular expression. It is checked with
/// `SqlSecurity::validate_regex_pattern` before being used.
Regex(String),
/// Email addresses (local part, `@`, dot-separated domain labels).
Email,
/// `http`/`https` URLs. With `allow_localhost` set, the pattern additionally
/// accepts `localhost` and dotted-quad numeric hosts.
Url { allow_localhost: bool },
/// Credit card numbers, either as bare digits or as four groups of four
/// digits separated by `-` or whitespace. With `detect_only` set, the
/// constraint passes when the match ratio is at or below the threshold
/// instead of at or above it.
CreditCard { detect_only: bool },
/// Phone numbers. `"US"`, `"CA"`, `"UK"`, `"DE"` and `"FR"` have dedicated
/// patterns; any other value (or `None`) uses a generic digits-only pattern
/// with an optional leading `+`.
Phone { country: Option<String> },
/// Postal codes. `"US"`, `"CA"`, `"UK"`, `"DE"`, `"FR"`, `"JP"` and `"AU"`
/// have dedicated patterns; any other value uses a generic pattern of 3-10
/// letters, digits, whitespace or hyphens.
PostalCode { country: String },
/// Hyphenated UUIDs whose version digit is 1-5 and whose variant nibble is
/// one of `8`, `9`, `a`, `b` (either case).
UUID,
/// Dotted-quad IPv4 addresses with each octet in `0..=255`.
IPv4,
/// IPv6 addresses written as colon-separated hexadecimal groups, including
/// `::` abbreviated forms.
IPv6,
/// Text that starts with `{` or `[` and ends with `}` or `]` (surrounding
/// whitespace allowed). This is a shape check only; the content in between
/// is not parsed.
Json,
/// `YYYY-MM-DDTHH:MM:SS` timestamps with optional fractional seconds and a
/// mandatory `Z` or `+HH:MM` / `-HH:MM` offset.
Iso8601DateTime,
/// US Social Security Numbers as nine digits with optional hyphens. Area
/// numbers `000`, `666` and `900`-`999`, group `00` and serial `0000` are
/// rejected.
SocialSecurityNumber,
}
impl FormatType {
fn get_pattern(&self) -> Result<String> {
let cache_key = format!("{self:?}");
{
let cache = PATTERN_CACHE.read().map_err(|_| {
TermError::Internal("Failed to acquire read lock on pattern cache".to_string())
})?;
if let Some(pattern) = cache.get(&cache_key) {
return Ok(pattern.clone());
}
}
let pattern = match self {
FormatType::Regex(pattern) => {
SqlSecurity::validate_regex_pattern(pattern)?;
pattern.clone()
}
FormatType::Email => {
r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$".to_string()
}
FormatType::Url { allow_localhost } => {
if *allow_localhost {
r"^https?://(?:localhost|(?:[a-zA-Z0-9.-]+\.?[a-zA-Z]{2,}|(?:\d{1,3}\.){3}\d{1,3}))(?::\d+)?(?:/[^\s]*)?$".to_string()
} else {
r"^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?::\d+)?(?:/[^\s]*)?$".to_string()
}
}
FormatType::CreditCard { .. } => {
r"^(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|3[0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})$|^(?:\d{4}[-\s]?){3}\d{4}$".to_string()
}
FormatType::Phone { country } => {
match country.as_deref() {
Some("US") | Some("CA") => r"^(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})$".to_string(),
Some("UK") => r"^(\+44\s?)?(?:\(?0\d{4}\)?\s?\d{6}|\(?0\d{3}\)?\s?\d{7}|\(?0\d{2}\)?\s?\d{8})$".to_string(),
Some("DE") => r"^(\+49\s?)?(?:\(?0\d{2,5}\)?\s?\d{4,12})$".to_string(),
Some("FR") => r"^(\+33\s?)?(?:\(?0\d{1}\)?\s?\d{8})$".to_string(),
_ => r"^[\+]?[1-9][\d]{0,15}$".to_string(), }
}
FormatType::PostalCode { country } => {
match country.as_str() {
"US" => r"^\d{5}(-\d{4})?$".to_string(),
"CA" => r"^[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d$".to_string(),
"UK" => r"^[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}$".to_string(),
"DE" => r"^\d{5}$".to_string(),
"FR" => r"^\d{5}$".to_string(),
"JP" => r"^\d{3}-\d{4}$".to_string(),
"AU" => r"^\d{4}$".to_string(),
_ => r"^[A-Za-z0-9\s-]{3,10}$".to_string(), }
}
FormatType::UUID => {
r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$".to_string()
}
FormatType::IPv4 => {
r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$".to_string()
}
FormatType::IPv6 => {
r"^([0-9a-fA-F]{0,4}:){1,7}([0-9a-fA-F]{0,4})?$|^::$|^::1$|^([0-9a-fA-F]{1,4}:)*::([0-9a-fA-F]{1,4}:)*[0-9a-fA-F]{1,4}$".to_string()
}
FormatType::Json => {
r"^\s*[\{\[].*[\}\]]\s*$".to_string()
}
FormatType::Iso8601DateTime => {
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:\d{2})$".to_string()
}
FormatType::SocialSecurityNumber => {
r"^(00[1-9]|0[1-9][0-9]|[1-5][0-9]{2}|6[0-5][0-9]|66[0-5]|667|66[89]|6[7-9][0-9]|[7-8][0-9]{2})-?(0[1-9]|[1-9][0-9])-?(000[1-9]|00[1-9][0-9]|0[1-9][0-9]{2}|[1-9][0-9]{3})$".to_string()
}
};
{
let mut cache = PATTERN_CACHE.write().map_err(|_| {
TermError::Internal("Failed to acquire write lock on pattern cache".to_string())
})?;
cache.insert(cache_key, pattern.clone());
}
Ok(pattern)
}
pub fn name(&self) -> &str {
match self {
FormatType::Regex(_) => "regex",
FormatType::Email => "email",
FormatType::Url { .. } => "url",
FormatType::CreditCard { .. } => "credit_card",
FormatType::Phone { .. } => "phone",
FormatType::PostalCode { .. } => "postal_code",
FormatType::UUID => "uuid",
FormatType::IPv4 => "ipv4",
FormatType::IPv6 => "ipv6",
FormatType::Json => "json",
FormatType::Iso8601DateTime => "iso8601_datetime",
FormatType::SocialSecurityNumber => "social_security_number",
}
}
pub fn description(&self) -> String {
match self {
FormatType::Regex(pattern) => format!("matches pattern '{pattern}'"),
FormatType::Email => "are valid email addresses".to_string(),
FormatType::Url { allow_localhost } => {
if *allow_localhost {
"are valid URLs (including localhost)".to_string()
} else {
"are valid URLs".to_string()
}
}
FormatType::CreditCard { detect_only } => {
if *detect_only {
"contain credit card number patterns".to_string()
} else {
"are valid credit card numbers".to_string()
}
}
FormatType::Phone { country } => match country.as_deref() {
Some(c) => format!("are valid {c} phone numbers"),
None => "are valid phone numbers".to_string(),
},
FormatType::PostalCode { country } => {
format!("are valid {country} postal codes")
}
FormatType::UUID => "are valid UUIDs".to_string(),
FormatType::IPv4 => "are valid IPv4 addresses".to_string(),
FormatType::IPv6 => "are valid IPv6 addresses".to_string(),
FormatType::Json => "are valid JSON documents".to_string(),
FormatType::Iso8601DateTime => "are valid ISO 8601 date-time strings".to_string(),
FormatType::SocialSecurityNumber => {
"contain Social Security Number patterns".to_string()
}
}
}
}
/// Options that control how column values are compared against a format
/// pattern.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FormatOptions {
/// When `true` the pattern is applied with the case-sensitive `~` operator;
/// when `false` the case-insensitive `~*` operator is used instead.
pub case_sensitive: bool,
/// When `true` the column is wrapped in `TRIM(...)` before it is matched.
pub trim_before_check: bool,
/// Controls whether rows whose column is `NULL` may be counted as matching
/// the format.
pub null_is_valid: bool,
}
impl Default for FormatOptions {
fn default() -> Self {
Self {
case_sensitive: true,
trim_before_check: false,
null_is_valid: true, }
}
}
impl FormatOptions {
    /// Creates options with the default settings.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns these options with case sensitivity set to `case_sensitive`.
    pub fn case_sensitive(self, case_sensitive: bool) -> Self {
        Self {
            case_sensitive,
            ..self
        }
    }

    /// Returns these options with trimming before the check set to `trim`.
    pub fn trim_before_check(self, trim: bool) -> Self {
        Self {
            trim_before_check: trim,
            ..self
        }
    }

    /// Returns these options with `NULL` acceptance set to `null_valid`.
    pub fn null_is_valid(self, null_valid: bool) -> Self {
        Self {
            null_is_valid: null_valid,
            ..self
        }
    }

    /// Defaults, except that matching ignores case.
    pub fn case_insensitive() -> Self {
        Self {
            case_sensitive: false,
            ..Self::default()
        }
    }

    /// Defaults, except that `NULL` values are not accepted.
    pub fn strict() -> Self {
        Self {
            null_is_valid: false,
            ..Self::default()
        }
    }

    /// Case-insensitive matching on trimmed values, with `NULL` accepted.
    pub fn lenient() -> Self {
        Self {
            case_sensitive: false,
            trim_before_check: true,
            null_is_valid: true,
        }
    }

    /// Defaults, except that values are trimmed before the check.
    pub fn with_trimming() -> Self {
        Self {
            trim_before_check: true,
            ..Self::default()
        }
    }
}
/// Constraint that measures which fraction of a column's rows match a
/// [`FormatType`] pattern and compares that fraction with a threshold.
#[derive(Debug, Clone)]
pub struct FormatConstraint {
/// Name of the column to validate; checked with
/// `SqlSecurity::validate_identifier` when the constraint is constructed.
column: String,
/// Format the column values are matched against.
format: FormatType,
/// Ratio in `0.0..=1.0` that the observed match ratio is compared with.
threshold: f64,
/// Matching options (case sensitivity, trimming, `NULL` handling).
options: FormatOptions,
}
impl FormatConstraint {
    /// Creates a format constraint for `column`.
    ///
    /// # Errors
    ///
    /// Fails, in this order, when `column` is rejected by
    /// `SqlSecurity::validate_identifier`, when `threshold` lies outside
    /// `0.0..=1.0` (NaN included), or when the pattern for `format` cannot be
    /// resolved.
    pub fn new(
        column: impl Into<String>,
        format: FormatType,
        threshold: f64,
        options: FormatOptions,
    ) -> Result<Self> {
        let column = column.into();
        SqlSecurity::validate_identifier(&column)?;

        let threshold_in_range = (0.0..=1.0).contains(&threshold);
        if !threshold_in_range {
            return Err(TermError::SecurityError(String::from(
                "Threshold must be between 0.0 and 1.0",
            )));
        }

        // Resolve the pattern once up front so a bad custom regex is reported
        // at construction time rather than on first evaluation.
        format.get_pattern()?;

        Ok(Self {
            column,
            format,
            threshold,
            options,
        })
    }

    /// Builds a constraint that uses `FormatOptions::default()`.
    fn with_default_options(
        column: impl Into<String>,
        format: FormatType,
        threshold: f64,
    ) -> Result<Self> {
        Self::new(column, format, threshold, FormatOptions::default())
    }

    /// Builds a constraint whose values are trimmed before matching.
    fn with_trimmed_input(
        column: impl Into<String>,
        format: FormatType,
        threshold: f64,
    ) -> Result<Self> {
        let options = FormatOptions::new().trim_before_check(true);
        Self::new(column, format, threshold, options)
    }

    /// Email-address constraint with default options.
    pub fn email(column: impl Into<String>, threshold: f64) -> Result<Self> {
        Self::with_default_options(column, FormatType::Email, threshold)
    }

    /// URL constraint with default options; `allow_localhost` selects the
    /// pattern that also accepts `localhost`.
    pub fn url(column: impl Into<String>, threshold: f64, allow_localhost: bool) -> Result<Self> {
        Self::with_default_options(column, FormatType::Url { allow_localhost }, threshold)
    }

    /// Credit-card constraint with default options; `detect_only` is stored
    /// on the resulting [`FormatType::CreditCard`].
    pub fn credit_card(
        column: impl Into<String>,
        threshold: f64,
        detect_only: bool,
    ) -> Result<Self> {
        Self::with_default_options(column, FormatType::CreditCard { detect_only }, threshold)
    }

    /// Phone-number constraint for the optional `country`; values are trimmed
    /// before matching.
    pub fn phone(
        column: impl Into<String>,
        threshold: f64,
        country: Option<String>,
    ) -> Result<Self> {
        Self::with_trimmed_input(column, FormatType::Phone { country }, threshold)
    }

    /// Postal-code constraint for `country`; values are trimmed before
    /// matching.
    pub fn postal_code(
        column: impl Into<String>,
        threshold: f64,
        country: impl Into<String>,
    ) -> Result<Self> {
        let format = FormatType::PostalCode {
            country: country.into(),
        };
        Self::with_trimmed_input(column, format, threshold)
    }

    /// UUID constraint with default options.
    pub fn uuid(column: impl Into<String>, threshold: f64) -> Result<Self> {
        Self::with_default_options(column, FormatType::UUID, threshold)
    }

    /// IPv4-address constraint with default options.
    pub fn ipv4(column: impl Into<String>, threshold: f64) -> Result<Self> {
        Self::with_default_options(column, FormatType::IPv4, threshold)
    }

    /// IPv6-address constraint with default options.
    pub fn ipv6(column: impl Into<String>, threshold: f64) -> Result<Self> {
        Self::with_default_options(column, FormatType::IPv6, threshold)
    }

    /// JSON-shape constraint with default options.
    pub fn json(column: impl Into<String>, threshold: f64) -> Result<Self> {
        Self::with_default_options(column, FormatType::Json, threshold)
    }

    /// ISO 8601 date-time constraint with default options.
    pub fn iso8601_datetime(column: impl Into<String>, threshold: f64) -> Result<Self> {
        Self::with_default_options(column, FormatType::Iso8601DateTime, threshold)
    }

    /// Custom regular-expression constraint with default options.
    pub fn regex(
        column: impl Into<String>,
        pattern: impl Into<String>,
        threshold: f64,
    ) -> Result<Self> {
        Self::with_default_options(column, FormatType::Regex(pattern.into()), threshold)
    }

    /// Social Security Number constraint; values are trimmed before matching.
    pub fn social_security_number(column: impl Into<String>, threshold: f64) -> Result<Self> {
        Self::with_trimmed_input(column, FormatType::SocialSecurityNumber, threshold)
    }
}
#[async_trait]
impl Constraint for FormatConstraint {
    /// Runs the format check against the table of the current validation
    /// context.
    ///
    /// A single aggregate query counts the rows whose column matches the
    /// pattern and the total number of rows. The resulting ratio must be at
    /// least the threshold, except for `CreditCard { detect_only: true }`,
    /// where it must be at most the threshold.
    ///
    /// In detect-only mode `NULL` rows are never counted as matches, whatever
    /// `null_is_valid` says: a missing value is not a detected card number and
    /// must not push the detection ratio over the threshold.
    ///
    /// Returns a skipped result when the query yields no rows or the table is
    /// empty.
    ///
    /// # Errors
    ///
    /// Propagates identifier/pattern validation failures, query errors, and an
    /// internal error when a count column is not an `Int64Array`.
    #[instrument(skip(self, ctx), fields(
        column = %self.column,
        format = %self.format.name(),
        threshold = %self.threshold
    ))]
    async fn evaluate(&self, ctx: &SessionContext) -> Result<ConstraintResult> {
        let validation_ctx = current_validation_context();
        // NOTE(review): the table name is interpolated into the SQL as-is;
        // presumably the validation context only hands out trusted names.
        let table_name = validation_ctx.table_name();
        let column_identifier = SqlSecurity::escape_identifier(&self.column)?;
        let pattern = self.format.get_pattern()?;
        let escaped_pattern = SqlSecurity::validate_regex_pattern(&pattern)?;

        let detection_mode = matches!(
            &self.format,
            FormatType::CreditCard { detect_only: true }
        );

        let column_expr = if self.options.trim_before_check {
            format!("TRIM({column_identifier})")
        } else {
            column_identifier.clone()
        };
        let pattern_operator = if self.options.case_sensitive {
            "~"
        } else {
            "~*"
        };

        // NULLs count as matches only when the caller opted in and a match is
        // a good thing; in detection mode a match is a finding.
        let count_nulls_as_match = self.options.null_is_valid && !detection_mode;
        let match_condition = if count_nulls_as_match {
            format!(
                "{column_expr} {pattern_operator} '{escaped_pattern}' OR {column_identifier} IS NULL"
            )
        } else {
            format!("{column_expr} {pattern_operator} '{escaped_pattern}'")
        };
        let sql = format!(
            "SELECT
                COUNT(CASE WHEN {match_condition} THEN 1 END) as matches,
                COUNT(*) as total
            FROM {table_name}"
        );

        let df = ctx.sql(&sql).await?;
        let batches = df.collect().await?;
        let batch = match batches.first() {
            Some(batch) if batch.num_rows() > 0 => batch,
            _ => return Ok(ConstraintResult::skipped("No data to validate")),
        };

        // Both aggregates are read from the first row as 64-bit integers.
        let read_count = |index: usize, label: &str| -> Result<f64> {
            batch
                .column(index)
                .as_any()
                .downcast_ref::<arrow::array::Int64Array>()
                .map(|counts| counts.value(0) as f64)
                .ok_or_else(|| TermError::Internal(format!("Failed to extract {label} count")))
        };
        let matches = read_count(0, "match")?;
        let total = read_count(1, "total")?;

        if total == 0.0 {
            return Ok(ConstraintResult::skipped("No data to validate"));
        }

        let match_ratio = matches / total;
        let is_success = if detection_mode {
            match_ratio <= self.threshold
        } else {
            match_ratio >= self.threshold
        };
        if is_success {
            return Ok(ConstraintResult::success_with_metric(match_ratio));
        }

        let message = if detection_mode {
            format!(
                "Credit card detection ratio {match_ratio:.3} exceeds threshold {:.3}",
                self.threshold
            )
        } else {
            let desc = self.format.description();
            format!(
                "Format validation ratio {match_ratio:.3} is below threshold {:.3} - values that {desc}",
                self.threshold
            )
        };
        Ok(ConstraintResult::failure_with_metric(match_ratio, message))
    }

    /// Returns the name of the underlying format (for example `"email"`).
    fn name(&self) -> &str {
        self.format.name()
    }

    /// Returns the column this constraint validates.
    fn column(&self) -> Option<&str> {
        Some(&self.column)
    }

    /// Describes the constraint and exposes its configuration as custom
    /// metadata entries.
    fn metadata(&self) -> ConstraintMetadata {
        let detection_mode = matches!(
            &self.format,
            FormatType::CreditCard { detect_only: true }
        );
        // Detection mode bounds the ratio from above, everything else from below.
        let bound = if detection_mode {
            "no more than"
        } else {
            "at least"
        };
        let threshold_pct = self.threshold * 100.0;
        let desc = self.format.description();
        let description = format!(
            "Checks that {bound} {threshold_pct:.1}% of values in '{}' {desc}",
            self.column
        );

        ConstraintMetadata::for_column(&self.column)
            .with_description(description)
            .with_custom("format_type", self.format.name())
            .with_custom("threshold", self.threshold.to_string())
            .with_custom("case_sensitive", self.options.case_sensitive.to_string())
            .with_custom(
                "trim_before_check",
                self.options.trim_before_check.to_string(),
            )
            .with_custom("null_is_valid", self.options.null_is_valid.to_string())
            .with_custom("constraint_type", "format")
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::ConstraintStatus;
    use crate::test_helpers::evaluate_constraint_with_context;
    use arrow::array::StringArray;
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use datafusion::datasource::MemTable;
    use std::sync::Arc;

    /// Builds a session whose `data` table has a single nullable Utf8 column,
    /// `text_col`, holding `values`.
    async fn create_test_context(values: Vec<Option<&str>>) -> SessionContext {
        let fields = vec![Field::new("text_col", DataType::Utf8, true)];
        let schema = Arc::new(Schema::new(fields));
        let array = StringArray::from(values);
        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();
        let table = MemTable::try_new(schema, vec![vec![batch]]).unwrap();

        let session = SessionContext::new();
        session.register_table("data", Arc::new(table)).unwrap();
        session
    }

    /// Evaluates `$constraint` against the `data` table of `$ctx`, panicking
    /// when the evaluation itself returns an error.
    macro_rules! evaluate {
        ($constraint:expr, $ctx:expr) => {
            evaluate_constraint_with_context(&$constraint, &$ctx, "data")
                .await
                .unwrap()
        };
    }

    #[tokio::test]
    async fn test_email_format_constraint() {
        // Three of the four values are email addresses.
        let ctx = create_test_context(vec![
            Some("test@example.com"),
            Some("user@domain.org"),
            Some("invalid-email"),
            Some("another@test.net"),
        ])
        .await;
        let constraint = FormatConstraint::email("text_col", 0.7).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "email");
    }

    #[tokio::test]
    async fn test_url_format_constraint() {
        // Three of the four values are URLs.
        let ctx = create_test_context(vec![
            Some("https://example.com"),
            Some("http://test.org"),
            Some("not-a-url"),
            Some("https://another.site.net/path"),
        ])
        .await;
        let constraint = FormatConstraint::url("text_col", 0.7, false).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "url");
    }

    #[tokio::test]
    async fn test_url_with_localhost() {
        // With localhost allowed, only "not-a-url" is rejected.
        let ctx = create_test_context(vec![
            Some("https://localhost:3000"),
            Some("http://localhost"),
            Some("https://example.com"),
            Some("not-a-url"),
        ])
        .await;
        let constraint = FormatConstraint::url("text_col", 0.7, true).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_credit_card_detection() {
        // Detect-only mode: the constraint passes while the share of
        // card-like values stays at or below the threshold.
        let ctx = create_test_context(vec![
            Some("4111-1111-1111-1111"),
            Some("5555 5555 5555 4444"),
            Some("normal text"),
            Some("4111111111111111"),
        ])
        .await;
        let constraint = FormatConstraint::credit_card("text_col", 0.8, true).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(constraint.name(), "credit_card");
    }

    #[tokio::test]
    async fn test_phone_number_us() {
        let ctx = create_test_context(vec![
            Some("(555) 123-4567"),
            Some("555-123-4567"),
            Some("5551234567"),
            Some("invalid-phone"),
        ])
        .await;
        let constraint = FormatConstraint::phone("text_col", 0.7, Some("US".to_string())).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(constraint.name(), "phone");
    }

    #[tokio::test]
    async fn test_postal_code_us() {
        // ZIP and ZIP+4 forms are accepted; "invalid" is not.
        let ctx = create_test_context(vec![
            Some("12345"),
            Some("12345-6789"),
            Some("invalid"),
            Some("98765"),
        ])
        .await;
        let constraint = FormatConstraint::postal_code("text_col", 0.7, "US").unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "postal_code");
    }

    #[tokio::test]
    async fn test_uuid_format() {
        let ctx = create_test_context(vec![
            Some("550e8400-e29b-41d4-a716-446655440000"),
            Some("6ba7b810-9dad-11d1-80b4-00c04fd430c8"),
            Some("invalid-uuid"),
            Some("6ba7b811-9dad-11d1-80b4-00c04fd430c8"),
        ])
        .await;
        let constraint = FormatConstraint::uuid("text_col", 0.7).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "uuid");
    }

    #[tokio::test]
    async fn test_ipv4_format() {
        // "256.256.256.256" has out-of-range octets and must be rejected.
        let ctx = create_test_context(vec![
            Some("192.168.1.1"),
            Some("10.0.0.1"),
            Some("256.256.256.256"),
            Some("172.16.0.1"),
        ])
        .await;
        let constraint = FormatConstraint::ipv4("text_col", 0.7).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "ipv4");
    }

    #[tokio::test]
    async fn test_ipv6_format() {
        let ctx = create_test_context(vec![
            Some("2001:0db8:85a3:0000:0000:8a2e:0370:7334"),
            Some("2001:db8:85a3::8a2e:370:7334"),
            Some("invalid-ipv6"),
            Some("::1"),
        ])
        .await;
        let constraint = FormatConstraint::ipv6("text_col", 0.7).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "ipv6");
    }

    #[tokio::test]
    async fn test_json_format() {
        let ctx = create_test_context(vec![
            Some(r#"{"key": "value"}"#),
            Some(r#"[1, 2, 3]"#),
            Some("not json"),
            Some(r#"{"nested": {"key": "value"}}"#),
        ])
        .await;
        let constraint = FormatConstraint::json("text_col", 0.7).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "json");
    }

    #[tokio::test]
    async fn test_iso8601_datetime_format() {
        let ctx = create_test_context(vec![
            Some("2023-12-25T10:30:00Z"),
            Some("2023-12-25T10:30:00.123Z"),
            Some("invalid-datetime"),
            Some("2023-12-25T10:30:00+05:30"),
        ])
        .await;
        let constraint = FormatConstraint::iso8601_datetime("text_col", 0.7).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "iso8601_datetime");
    }

    #[tokio::test]
    async fn test_custom_regex_format() {
        let ctx = create_test_context(vec![
            Some("ABC123"),
            Some("DEF456"),
            Some("invalid"),
            Some("GHI789"),
        ])
        .await;
        let constraint = FormatConstraint::regex("text_col", r"^[A-Z]{3}\d{3}$", 0.7).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
        assert_eq!(constraint.name(), "regex");
    }

    #[tokio::test]
    async fn test_format_options_case_insensitive() {
        // Lower-case values match an upper-case pattern once case is ignored.
        let ctx = create_test_context(vec![
            Some("abc123"),
            Some("DEF456"),
            Some("invalid"),
            Some("ghi789"),
        ])
        .await;
        let options = FormatOptions::new().case_sensitive(false);
        let format = FormatType::Regex(r"^[A-Z]{3}\d{3}$".to_string());
        let constraint = FormatConstraint::new("text_col", format, 0.7, options).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_format_options_trim_whitespace() {
        // Surrounding blanks are removed before the email pattern is applied.
        let ctx = create_test_context(vec![
            Some(" test@example.com "),
            Some("user@domain.org"),
            Some(" invalid-email "),
            Some(" another@test.net "),
        ])
        .await;
        let options = FormatOptions::new().trim_before_check(true);
        let constraint =
            FormatConstraint::new("text_col", FormatType::Email, 0.7, options).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.75));
    }

    #[tokio::test]
    async fn test_format_options_null_handling() {
        let ctx =
            create_test_context(vec![Some("test@example.com"), None, Some("invalid-email"), None])
                .await;

        // NULLs accepted: one email plus two NULLs out of four rows.
        let nulls_valid = FormatConstraint::new(
            "text_col",
            FormatType::Email,
            0.6,
            FormatOptions::new().null_is_valid(true),
        )
        .unwrap();
        let result1 = evaluate!(nulls_valid, ctx);
        assert_eq!(result1.status, ConstraintStatus::Success);
        assert_eq!(result1.metric, Some(0.75));

        // NULLs rejected: only the single email out of four rows counts.
        let nulls_invalid = FormatConstraint::new(
            "text_col",
            FormatType::Email,
            0.2,
            FormatOptions::new().null_is_valid(false),
        )
        .unwrap();
        let result2 = evaluate!(nulls_invalid, ctx);
        assert_eq!(result2.status, ConstraintStatus::Success);
        assert_eq!(result2.metric, Some(0.25));
    }

    #[tokio::test]
    async fn test_constraint_failure() {
        // No value is an email, so the 50% threshold is missed.
        let ctx = create_test_context(vec![
            Some("invalid"),
            Some("also_invalid"),
            Some("nope"),
            Some("still_invalid"),
        ])
        .await;
        let constraint = FormatConstraint::email("text_col", 0.5).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Failure);
        assert_eq!(result.metric, Some(0.0));
        assert!(result.message.is_some());
    }

    #[tokio::test]
    async fn test_empty_data() {
        let ctx = create_test_context(vec![]).await;
        let constraint = FormatConstraint::email("text_col", 0.9).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Skipped);
    }

    #[test]
    fn test_invalid_threshold() {
        let result = FormatConstraint::email("col", 1.5);

        assert!(result.is_err());
        assert!(result
            .unwrap_err()
            .to_string()
            .contains("Threshold must be between 0.0 and 1.0"));
    }

    #[test]
    fn test_pattern_caching() {
        let format1 = FormatType::Email;
        let format2 = FormatType::Email;

        // Equal formats must resolve to the same pattern.
        let pattern1 = format1.get_pattern().unwrap();
        let pattern2 = format2.get_pattern().unwrap();
        assert_eq!(pattern1, pattern2);

        // Repeated lookups must keep succeeding.
        for _ in 0..100 {
            let _ = format1.get_pattern().unwrap();
        }
    }

    #[test]
    fn test_format_type_descriptions() {
        assert_eq!(FormatType::Email.description(), "are valid email addresses");

        let url = FormatType::Url {
            allow_localhost: true,
        };
        assert_eq!(url.description(), "are valid URLs (including localhost)");

        let phone = FormatType::Phone {
            country: Some("US".to_string()),
        };
        assert_eq!(phone.description(), "are valid US phone numbers");

        let postal_code = FormatType::PostalCode {
            country: "CA".to_string(),
        };
        assert_eq!(postal_code.description(), "are valid CA postal codes");
    }

    #[test]
    fn test_all_format_types_have_patterns() {
        let formats = vec![
            FormatType::Email,
            FormatType::Url {
                allow_localhost: false,
            },
            FormatType::Url {
                allow_localhost: true,
            },
            FormatType::CreditCard { detect_only: false },
            FormatType::Phone { country: None },
            FormatType::Phone {
                country: Some("US".to_string()),
            },
            FormatType::PostalCode {
                country: "US".to_string(),
            },
            FormatType::UUID,
            FormatType::IPv4,
            FormatType::IPv6,
            FormatType::Json,
            FormatType::Iso8601DateTime,
            FormatType::Regex(r"^\d+$".to_string()),
        ];

        for format in formats {
            assert!(
                format.get_pattern().is_ok(),
                "Format {format:?} should have a valid pattern"
            );
        }
    }

    #[test]
    fn test_format_options_convenience_methods() {
        let options = FormatOptions::case_insensitive();
        assert!(!options.case_sensitive);
        assert!(!options.trim_before_check);
        assert!(options.null_is_valid);

        let options = FormatOptions::strict();
        assert!(options.case_sensitive);
        assert!(!options.trim_before_check);
        assert!(!options.null_is_valid);

        let options = FormatOptions::lenient();
        assert!(!options.case_sensitive);
        assert!(options.trim_before_check);
        assert!(options.null_is_valid);

        let options = FormatOptions::with_trimming();
        assert!(options.case_sensitive);
        assert!(options.trim_before_check);
        assert!(options.null_is_valid);
    }

    #[tokio::test]
    async fn test_ssn_format_valid() {
        // Hyphenated and bare nine-digit forms are both accepted.
        let ctx = create_test_context(vec![
            Some("123-45-6789"),
            Some("123456789"),
            Some("456-78-9012"),
            Some("789012345"),
        ])
        .await;
        let constraint = FormatConstraint::social_security_number("text_col", 0.95).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(1.0));
        assert_eq!(constraint.name(), "social_security_number");
    }

    #[tokio::test]
    async fn test_ssn_format_invalid_patterns() {
        let ctx = create_test_context(vec![
            Some("000-12-3456"), // area 000
            Some("666-12-3456"), // area 666
            Some("900-12-3456"), // area 9xx
            Some("123-00-4567"), // group 00
            Some("123-45-0000"), // serial 0000
        ])
        .await;
        let constraint = FormatConstraint::social_security_number("text_col", 0.0).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.0));
    }

    #[tokio::test]
    async fn test_ssn_format_mixed() {
        // Four well-formed numbers plus one NULL (accepted by default) out of
        // eight rows.
        let ctx = create_test_context(vec![
            Some("123-45-6789"),
            Some("not-an-ssn"),
            Some("666-12-3456"),
            Some("456789012"),
            Some("123 45 6789"),
            Some("789-01-2345"),
            None,
            Some("234-56-7890"),
        ])
        .await;
        let constraint = FormatConstraint::social_security_number("text_col", 0.5).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.625));
    }

    #[tokio::test]
    async fn test_ssn_format_threshold() {
        let ctx = create_test_context(vec![
            Some("123-45-6789"),
            Some("invalid"),
            Some("234-56-7890"),
            Some("not-ssn"),
        ])
        .await;

        // Half of the rows match: not enough for an 80% threshold ...
        let constraint = FormatConstraint::social_security_number("text_col", 0.8).unwrap();
        let result = evaluate!(constraint, ctx);
        assert_eq!(result.status, ConstraintStatus::Failure);
        assert_eq!(result.metric, Some(0.5));

        // ... but enough for a 40% threshold.
        let constraint = FormatConstraint::social_security_number("text_col", 0.4).unwrap();
        let result = evaluate!(constraint, ctx);
        assert_eq!(result.status, ConstraintStatus::Success);
        assert_eq!(result.metric, Some(0.5));
    }

    #[tokio::test]
    async fn test_ssn_edge_cases() {
        // Only the first three values are accepted by the pattern.
        let ctx = create_test_context(vec![
            Some("078-05-1120"),
            Some("219-09-9999"),
            Some("457-55-5462"),
            Some("999-99-9999"),
            Some("123-45-67890"),
            Some("12-345-6789"),
            Some("ABC-DE-FGHI"),
            Some(""),
        ])
        .await;
        let constraint = FormatConstraint::social_security_number("text_col", 0.3).unwrap();

        let result = evaluate!(constraint, ctx);

        assert_eq!(result.metric, Some(0.375));
        assert_eq!(result.status, ConstraintStatus::Success);
    }
}