use chrono::NaiveDate;
use datasynth_core::CountryPack;
use rand::Rng;
use rust_decimal::Decimal;
use serde::{Deserialize, Serialize};
/// Textual representations a calendar date can be rendered in.
///
/// The per-variant patterns below are the `strftime`-style patterns passed to
/// `NaiveDate::format` by [`DateFormat::format`].
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DateFormat {
    /// `%Y-%m-%d`, e.g. `2024-01-15`.
    ISO,
    /// `%m/%d/%Y`, e.g. `01/15/2024`.
    US,
    /// `%m-%d-%Y`.
    USDash,
    /// `%d/%m/%Y`, e.g. `15/01/2024`.
    EU,
    /// `%d-%m-%Y`.
    EUDash,
    /// `%d.%m.%Y`.
    EUDot,
    /// `%B %d, %Y` (month name, day, four-digit year).
    Long,
    /// `%m/%d/%y` (two-digit year).
    ShortYear,
    /// `%Y%m%d`, e.g. `20240115`.
    Compact,
    /// Whole days since 1970-01-01 multiplied by 86400, as a decimal integer.
    Unix,
    /// Whole days since 1899-12-30, as a decimal integer.
    ExcelSerial,
}

impl DateFormat {
    /// Returns the nine pattern-based formats.
    ///
    /// The two numeric encodings (`Unix`, `ExcelSerial`) are not part of the list.
    pub fn all() -> Vec<Self> {
        vec![
            Self::ISO,
            Self::US,
            Self::USDash,
            Self::EU,
            Self::EUDash,
            Self::EUDot,
            Self::Long,
            Self::ShortYear,
            Self::Compact,
        ]
    }

    /// Maps a locale's short date pattern (such as `"DD.MM.YYYY"`) to a format.
    ///
    /// Matching is case-insensitive and looks only at the leading field and at
    /// which separator characters occur. Patterns that start with neither
    /// `MM` nor `DD` (including `YYYY…` and the empty string) yield `ISO`.
    pub fn from_locale_short(short: &str) -> Self {
        let pattern = short.to_uppercase();
        let has = |sep: char| pattern.contains(sep);

        if pattern.starts_with("MM") {
            return if has('-') { Self::USDash } else { Self::US };
        }
        if pattern.starts_with("DD") {
            return if has('.') {
                Self::EUDot
            } else if has('-') {
                Self::EUDash
            } else {
                Self::EU
            };
        }
        // Year-first patterns and anything unrecognised share the ISO fallback.
        Self::ISO
    }

    /// Renders `date` according to this format.
    pub fn format(&self, date: NaiveDate) -> String {
        // The two numeric encodings return early; every other variant only
        // differs in the pattern handed to chrono.
        let pattern = match self {
            Self::Unix => {
                let unix_epoch =
                    NaiveDate::from_ymd_opt(1970, 1, 1).expect("valid unix epoch date");
                let elapsed_days = (date - unix_epoch).num_days();
                return (elapsed_days * 86400).to_string();
            }
            Self::ExcelSerial => {
                let serial_epoch =
                    NaiveDate::from_ymd_opt(1899, 12, 30).expect("valid excel epoch date");
                let serial = (date - serial_epoch).num_days();
                return serial.to_string();
            }
            Self::ISO => "%Y-%m-%d",
            Self::US => "%m/%d/%Y",
            Self::USDash => "%m-%d-%Y",
            Self::EU => "%d/%m/%Y",
            Self::EUDash => "%d-%m-%Y",
            Self::EUDot => "%d.%m.%Y",
            Self::Long => "%B %d, %Y",
            Self::ShortYear => "%m/%d/%y",
            Self::Compact => "%Y%m%d",
        };
        date.format(pattern).to_string()
    }
}
/// Textual representations a monetary amount can be rendered in.
#[derive(Debug, Clone, PartialEq)]
pub enum AmountFormat {
    /// Two decimals, no grouping, e.g. `1234567.89`.
    Plain,
    /// `,` as thousands separator and `.` as decimal separator, e.g. `1,234,567.89`.
    USComma,
    /// `.` as thousands separator and `,` as decimal separator, e.g. `1.234.567,89`.
    EUFormat,
    /// A space as thousands separator and `.` as decimal separator.
    SpaceSeparator,
    /// The given symbol placed directly before a `USComma`-style number.
    CurrencyPrefix(String),
    /// A `USComma`-style number followed by a space and the given code.
    CurrencySuffix(String),
    /// `USComma`-style number; negatives are wrapped in parentheses, e.g. `(1,234.56)`.
    Accounting,
    /// Upper-case exponent notation with five fractional digits (`{:.5E}`).
    Scientific,
    /// Rounded to a whole number, no grouping.
    NoDecimals,
    /// Four decimals, no grouping.
    FourDecimals,
}

impl AmountFormat {
    /// Returns a fixed list of eight formats; `Scientific` and `FourDecimals`
    /// are not included.
    pub fn common() -> Vec<Self> {
        vec![
            Self::Plain,
            Self::USComma,
            Self::EUFormat,
            Self::SpaceSeparator,
            Self::CurrencyPrefix("$".to_string()),
            Self::CurrencySuffix("USD".to_string()),
            Self::Accounting,
            Self::NoDecimals,
        ]
    }

    /// Picks a format from a locale's separators and currency symbol.
    ///
    /// * `","` / `"."` yields `EUFormat`
    /// * `"."` / `" "` yields `SpaceSeparator`
    /// * `"."` / `","` yields `CurrencyPrefix(currency_symbol)`, or `USComma`
    ///   when the symbol is empty
    /// * any other combination yields `Plain`
    ///
    /// `_default_currency` is accepted but not consulted.
    pub fn from_locale(
        decimal_sep: &str,
        thousands_sep: &str,
        currency_symbol: &str,
        _default_currency: &str,
    ) -> Self {
        match (decimal_sep, thousands_sep) {
            (",", ".") => Self::EUFormat,
            (".", " ") => Self::SpaceSeparator,
            (".", ",") if currency_symbol.is_empty() => Self::USComma,
            (".", ",") => Self::CurrencyPrefix(currency_symbol.to_string()),
            _ => Self::Plain,
        }
    }

    /// Renders `amount` according to this format.
    ///
    /// The absolute value is converted to `f64` before formatting (falling
    /// back to `0.0` if the conversion fails), and the sign is re-attached
    /// afterwards as a leading `-` or, for `Accounting`, as parentheses.
    pub fn format(&self, amount: Decimal) -> String {
        let negative = amount < Decimal::ZERO;
        let magnitude: f64 = amount.abs().try_into().unwrap_or(0.0);
        let sign = if negative { "-" } else { "" };
        let grouped = |thousands: char, decimal: char| {
            format_with_thousands(magnitude, thousands, decimal)
        };

        match self {
            Self::Plain => format!("{sign}{magnitude:.2}"),
            Self::USComma => {
                let body = grouped(',', '.');
                format!("{sign}{body}")
            }
            Self::EUFormat => {
                let body = grouped('.', ',');
                format!("{sign}{body}")
            }
            Self::SpaceSeparator => {
                let body = grouped(' ', '.');
                format!("{sign}{body}")
            }
            Self::CurrencyPrefix(symbol) => {
                let body = grouped(',', '.');
                format!("{sign}{symbol}{body}")
            }
            Self::CurrencySuffix(code) => {
                let body = grouped(',', '.');
                format!("{sign}{body} {code}")
            }
            Self::Accounting => {
                let body = grouped(',', '.');
                if negative {
                    format!("({body})")
                } else {
                    body
                }
            }
            Self::Scientific => format!("{sign}{magnitude:.5E}"),
            Self::NoDecimals => {
                // `magnitude` is an absolute value, so the rounded integer is
                // never negative and only needs the sign prefix.
                let rounded = magnitude.round() as i64;
                format!("{sign}{rounded}")
            }
            Self::FourDecimals => format!("{sign}{magnitude:.4}"),
        }
    }
}
/// Formats the magnitude of `value` with two decimals and grouped thousands.
///
/// The integer digits are split into groups of three (from the right) joined
/// by `thousand_sep`; `decimal_sep` separates them from the two-digit
/// fraction. The sign of `value` is dropped — callers add their own sign
/// decoration.
///
/// The fraction is rounded to the nearest hundredth. When that rounding
/// reaches a full unit (for example `1.999`), the carry is moved into the
/// integer part so the result is `2.00` rather than a three-digit fraction.
fn format_with_thousands(value: f64, thousand_sep: char, decimal_sep: char) -> String {
    // `unsigned_abs` cannot overflow, unlike `i64::abs` on a saturated cast.
    let mut whole = (value.trunc() as i64).unsigned_abs();
    let mut hundredths = ((value.fract() * 100.0).round() as i64).unsigned_abs();

    // Rounding e.g. 0.996 yields 100 hundredths: carry into the integer part.
    if hundredths >= 100 {
        whole += 1;
        hundredths -= 100;
    }

    let digits = whole.to_string();
    let digit_count = digits.len();
    let mut grouped = String::with_capacity(digit_count + digit_count / 3);
    for (idx, digit) in digits.chars().enumerate() {
        // A separator goes before every digit that starts a full group of three.
        if idx > 0 && (digit_count - idx) % 3 == 0 {
            grouped.push(thousand_sep);
        }
        grouped.push(digit);
    }

    format!("{grouped}{decimal_sep}{hundredths:02}")
}
/// Textual variations that can be applied to an identifier string.
#[derive(Debug, Clone)]
pub enum IdentifierFormat {
    /// The identifier unchanged.
    Original,
    /// Upper-cased.
    Upper,
    /// Lower-cased.
    Lower,
    /// The given string prepended.
    WithPrefix(String),
    /// The given string appended.
    WithSuffix(String),
    /// Left-padded with `0` up to the given number of characters.
    ZeroPadded(usize),
    /// Left-padded with spaces up to the given number of characters.
    SpacePadded(usize),
    /// `separator` inserted after every `interval` characters.
    WithSeparator { separator: char, interval: usize },
}

impl IdentifierFormat {
    /// Applies this variation to `id` and returns the result.
    ///
    /// * Padding widths are measured in characters, so identifiers containing
    ///   multi-byte characters are padded to the same visible width as ASCII
    ///   ones. Identifiers already at or above the width are returned as-is.
    /// * `WithSeparator` with an `interval` of `0` returns the identifier
    ///   unchanged instead of panicking on a remainder by zero.
    pub fn format(&self, id: &str) -> String {
        match self {
            Self::Original => id.to_string(),
            Self::Upper => id.to_uppercase(),
            Self::Lower => id.to_lowercase(),
            Self::WithPrefix(prefix) => format!("{prefix}{id}"),
            Self::WithSuffix(suffix) => format!("{id}{suffix}"),
            Self::ZeroPadded(len) => {
                // `format!` never truncates, so no separate length guard is needed.
                let width = *len;
                format!("{id:0>width$}")
            }
            Self::SpacePadded(len) => {
                let width = *len;
                format!("{id:>width$}")
            }
            Self::WithSeparator {
                separator,
                interval,
            } => {
                let step = *interval;
                if step == 0 {
                    // No meaningful grouping exists for a zero interval.
                    return id.to_string();
                }
                let mut grouped = String::with_capacity(id.len() + id.len() / step);
                for (idx, ch) in id.chars().enumerate() {
                    if idx > 0 && idx % step == 0 {
                        grouped.push(*separator);
                    }
                    grouped.push(ch);
                }
                grouped
            }
        }
    }
}
/// Textual variations that can be applied to free text.
#[derive(Debug, Clone)]
pub enum TextFormat {
    /// The text unchanged.
    Original,
    /// Upper-cased.
    Upper,
    /// Lower-cased.
    Lower,
    /// Each whitespace-separated word capitalised, words joined by one space.
    Title,
    /// The given number of spaces prepended.
    LeadingWhitespace(usize),
    /// The given number of spaces appended.
    TrailingWhitespace(usize),
    /// Whitespace-separated words re-joined with the separator used in `format`.
    ExtraSpaces,
    /// Leading and trailing whitespace removed.
    Trimmed,
    /// Every ASCII space replaced by U+00A0.
    NonBreakingSpaces,
}

impl TextFormat {
    /// Applies this variation to `text` and returns the result.
    pub fn format(&self, text: &str) -> String {
        match self {
            Self::Original => text.to_owned(),
            Self::Upper => text.to_uppercase(),
            Self::Lower => text.to_lowercase(),
            Self::Title => {
                let mut words: Vec<String> = Vec::new();
                for word in text.split_whitespace() {
                    let mut rest = word.chars();
                    let titled = match rest.next() {
                        Some(initial) => {
                            // Upper-casing one char may expand to several chars.
                            let mut buf: String = initial.to_uppercase().collect();
                            buf.push_str(&rest.as_str().to_lowercase());
                            buf
                        }
                        None => String::new(),
                    };
                    words.push(titled);
                }
                words.join(" ")
            }
            Self::LeadingWhitespace(n) => {
                let mut padded = " ".repeat(*n);
                padded.push_str(text);
                padded
            }
            Self::TrailingWhitespace(n) => {
                let mut padded = text.to_owned();
                padded.push_str(&" ".repeat(*n));
                padded
            }
            Self::ExtraSpaces => {
                // NOTE(review): despite the variant name this collapses every
                // whitespace run to the separator below — confirm the
                // separator is the intended one.
                let mut joined = String::new();
                for (idx, word) in text.split_whitespace().enumerate() {
                    if idx > 0 {
                        joined.push_str(" ");
                    }
                    joined.push_str(word);
                }
                joined
            }
            Self::Trimmed => text.trim().to_owned(),
            Self::NonBreakingSpaces => text.replace(' ', "\u{00A0}"),
        }
    }
}
/// Settings that control how often, and into which formats, values are varied.
#[derive(Debug, Clone)]
pub struct FormatVariationConfig {
    /// Threshold compared against a random `f64` draw for each date.
    pub date_variation_rate: f64,
    /// Threshold compared against a random `f64` draw for each amount.
    pub amount_variation_rate: f64,
    /// Threshold compared against a random `f64` draw for each identifier.
    pub identifier_variation_rate: f64,
    /// Threshold compared against a random `f64` draw for each text value.
    pub text_variation_rate: f64,
    /// Formats a varied date is picked from.
    pub allowed_date_formats: Vec<DateFormat>,
    /// Formats a varied amount is picked from.
    pub allowed_amount_formats: Vec<AmountFormat>,
}

impl Default for FormatVariationConfig {
    /// Rates of 0.05 (dates), 0.03 (amounts), 0.02 (identifiers) and 0.05
    /// (text), with `DateFormat::all()` and `AmountFormat::common()` as the
    /// allowed format lists.
    fn default() -> Self {
        let allowed_date_formats = DateFormat::all();
        let allowed_amount_formats = AmountFormat::common();
        FormatVariationConfig {
            date_variation_rate: 0.05,
            amount_variation_rate: 0.03,
            identifier_variation_rate: 0.02,
            text_variation_rate: 0.05,
            allowed_date_formats,
            allowed_amount_formats,
        }
    }
}
/// Renders dates, amounts, identifiers and text as strings, occasionally
/// switching to an alternative format, and counts how often it did so.
pub struct FormatVariationInjector {
// Variation rates and the lists of formats to pick from.
config: FormatVariationConfig,
// Running counters updated by every `vary_*` call.
stats: FormatVariationStats,
// Optional locale source for the baseline (non-varied) date and amount formats.
country_pack: Option<CountryPack>,
}
/// Counters describing the work done by a `FormatVariationInjector`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FormatVariationStats {
/// Number of dates rendered in a varied format.
pub date_variations: usize,
/// Number of amounts rendered in a varied format.
pub amount_variations: usize,
/// Number of identifiers rendered in a varied format.
pub identifier_variations: usize,
/// Number of text values rendered in a varied format.
pub text_variations: usize,
/// Number of `vary_*` calls, whether or not a variation was applied.
pub total_processed: usize,
}
impl FormatVariationInjector {
    /// Creates an injector with the given configuration, zeroed statistics
    /// and no country pack.
    pub fn new(config: FormatVariationConfig) -> Self {
        Self {
            config,
            stats: FormatVariationStats::default(),
            country_pack: None,
        }
    }

    /// Sets the country pack whose locale determines the baseline date and
    /// amount formats.
    pub fn set_country_pack(&mut self, pack: CountryPack) {
        self.country_pack = Some(pack);
    }

    /// Format used for dates that are not varied: derived from the country
    /// pack's short date pattern, or ISO when no pack is set or the pattern
    /// is empty.
    fn baseline_date_format(&self) -> DateFormat {
        match &self.country_pack {
            Some(pack) => {
                let short = &pack.locale.date_format.short;
                if short.is_empty() {
                    DateFormat::ISO
                } else {
                    DateFormat::from_locale_short(short)
                }
            }
            None => DateFormat::ISO,
        }
    }

    /// Format used for amounts that are not varied: derived from the country
    /// pack's number separators and currency, or `Plain` when no pack is set
    /// or both separators are empty.
    fn baseline_amount_format(&self) -> AmountFormat {
        match &self.country_pack {
            Some(pack) => {
                let locale = &pack.locale;
                let dec_sep = &locale.number_format.decimal_separator;
                let thou_sep = &locale.number_format.thousands_separator;
                let symbol = &locale.currency_symbol;
                let currency = &locale.default_currency;
                if dec_sep.is_empty() && thou_sep.is_empty() {
                    AmountFormat::Plain
                } else {
                    AmountFormat::from_locale(dec_sep, thou_sep, symbol, currency)
                }
            }
            None => AmountFormat::Plain,
        }
    }

    /// Renders `date`, using a randomly chosen entry of
    /// `allowed_date_formats` when the random draw falls below
    /// `date_variation_rate` and the baseline format otherwise.
    ///
    /// If `allowed_date_formats` is empty there is nothing to vary into, so
    /// the baseline format is used and no variation is counted (selecting
    /// from an empty range would otherwise panic).
    pub fn vary_date<R: Rng>(&mut self, date: NaiveDate, rng: &mut R) -> String {
        self.stats.total_processed += 1;
        if rng.random::<f64>() < self.config.date_variation_rate {
            let candidates = &self.config.allowed_date_formats;
            if !candidates.is_empty() {
                let chosen = candidates[rng.random_range(0..candidates.len())];
                self.stats.date_variations += 1;
                return chosen.format(date);
            }
        }
        self.baseline_date_format().format(date)
    }

    /// Renders `amount`, using a randomly chosen entry of
    /// `allowed_amount_formats` when the random draw falls below
    /// `amount_variation_rate` and the baseline format otherwise.
    ///
    /// If `allowed_amount_formats` is empty the baseline format is used and
    /// no variation is counted (selecting from an empty range would otherwise
    /// panic).
    pub fn vary_amount<R: Rng>(&mut self, amount: Decimal, rng: &mut R) -> String {
        self.stats.total_processed += 1;
        if rng.random::<f64>() < self.config.amount_variation_rate {
            let candidates = &self.config.allowed_amount_formats;
            if !candidates.is_empty() {
                let rendered = candidates[rng.random_range(0..candidates.len())].format(amount);
                self.stats.amount_variations += 1;
                return rendered;
            }
        }
        self.baseline_amount_format().format(amount)
    }

    /// Returns `id`, altered by one of five fixed identifier variations when
    /// the random draw falls below `identifier_variation_rate`.
    pub fn vary_identifier<R: Rng>(&mut self, id: &str, rng: &mut R) -> String {
        self.stats.total_processed += 1;
        if rng.random::<f64>() < self.config.identifier_variation_rate {
            self.stats.identifier_variations += 1;
            let variations = [
                IdentifierFormat::Upper,
                IdentifierFormat::Lower,
                IdentifierFormat::ZeroPadded(10),
                IdentifierFormat::WithPrefix(" ".to_string()),
                IdentifierFormat::WithSuffix(" ".to_string()),
            ];
            let format = &variations[rng.random_range(0..variations.len())];
            format.format(id)
        } else {
            id.to_string()
        }
    }

    /// Returns `text`, altered by one of six fixed text variations when the
    /// random draw falls below `text_variation_rate`.
    pub fn vary_text<R: Rng>(&mut self, text: &str, rng: &mut R) -> String {
        self.stats.total_processed += 1;
        if rng.random::<f64>() < self.config.text_variation_rate {
            self.stats.text_variations += 1;
            let variations = [
                TextFormat::Upper,
                TextFormat::Lower,
                TextFormat::Title,
                TextFormat::LeadingWhitespace(1),
                TextFormat::TrailingWhitespace(1),
                TextFormat::ExtraSpaces,
            ];
            let format = &variations[rng.random_range(0..variations.len())];
            format.format(text)
        } else {
            text.to_string()
        }
    }

    /// Returns the counters accumulated since construction or the last reset.
    pub fn stats(&self) -> &FormatVariationStats {
        &self.stats
    }

    /// Resets all counters to zero.
    pub fn reset_stats(&mut self) {
        self.stats = FormatVariationStats::default();
    }
}
// Unit tests for the format enums and for `FormatVariationInjector`.
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use super::*;
use rust_decimal_macros::dec;
// Pattern-based date formats render 2024-01-15 as expected.
#[test]
fn test_date_formats() {
let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
assert_eq!(DateFormat::ISO.format(date), "2024-01-15");
assert_eq!(DateFormat::US.format(date), "01/15/2024");
assert_eq!(DateFormat::EU.format(date), "15/01/2024");
assert_eq!(DateFormat::Compact.format(date), "20240115");
}
// Grouping, separators and whole-number rounding for a positive amount.
#[test]
fn test_amount_formats() {
let amount = dec!(1234567.89);
assert_eq!(AmountFormat::Plain.format(amount), "1234567.89");
assert_eq!(AmountFormat::USComma.format(amount), "1,234,567.89");
assert_eq!(AmountFormat::EUFormat.format(amount), "1.234.567,89");
assert_eq!(AmountFormat::NoDecimals.format(amount), "1234568");
}
// Negative amounts get a leading minus, or parentheses for `Accounting`.
#[test]
fn test_negative_amounts() {
let amount = dec!(-1234.56);
assert_eq!(AmountFormat::Plain.format(amount), "-1234.56");
assert_eq!(AmountFormat::Accounting.format(amount), "(1,234.56)");
}
// Upper-casing and zero-padding of an identifier.
#[test]
fn test_identifier_formats() {
let id = "abc123";
assert_eq!(IdentifierFormat::Upper.format(id), "ABC123");
assert_eq!(IdentifierFormat::ZeroPadded(10).format(id), "0000abc123");
}
// Case changes and whitespace re-joining of a two-word text.
#[test]
fn test_text_formats() {
let text = "hello world";
assert_eq!(TextFormat::Upper.format(text), "HELLO WORLD");
assert_eq!(TextFormat::Title.format(text), "Hello World");
assert_eq!(TextFormat::ExtraSpaces.format(text), "hello world");
}
// With a rate of 1.0 every date is varied and counted.
#[test]
fn test_format_injector() {
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
let config = FormatVariationConfig {
date_variation_rate: 1.0, ..Default::default()
};
let mut injector = FormatVariationInjector::new(config);
let mut rng = ChaCha8Rng::seed_from_u64(42);
let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
let formatted = injector.vary_date(date, &mut rng);
assert!(!formatted.is_empty());
assert_eq!(injector.stats().date_variations, 1);
}
// Locale short patterns map to the matching `DateFormat`; empty input falls back to ISO.
#[test]
fn test_date_format_from_locale_short() {
assert_eq!(DateFormat::from_locale_short("MM/DD/YYYY"), DateFormat::US);
assert_eq!(
DateFormat::from_locale_short("MM-DD-YYYY"),
DateFormat::USDash
);
assert_eq!(DateFormat::from_locale_short("DD/MM/YYYY"), DateFormat::EU);
assert_eq!(
DateFormat::from_locale_short("DD-MM-YYYY"),
DateFormat::EUDash
);
assert_eq!(
DateFormat::from_locale_short("DD.MM.YYYY"),
DateFormat::EUDot
);
assert_eq!(DateFormat::from_locale_short("YYYY-MM-DD"), DateFormat::ISO);
assert_eq!(DateFormat::from_locale_short(""), DateFormat::ISO);
}
// Separator pairs map to the matching `AmountFormat`; unknown pairs fall back to `Plain`.
#[test]
fn test_amount_format_from_locale() {
assert_eq!(
AmountFormat::from_locale(",", ".", "\u{20ac}", "EUR"),
AmountFormat::EUFormat
);
assert_eq!(
AmountFormat::from_locale(".", ",", "$", "USD"),
AmountFormat::CurrencyPrefix("$".to_string())
);
assert_eq!(
AmountFormat::from_locale(".", " ", "", "CHF"),
AmountFormat::SpaceSeparator
);
assert_eq!(
AmountFormat::from_locale("X", "Y", "", "XYZ"),
AmountFormat::Plain
);
}
// With a rate of 0.0 the country pack's short date pattern drives the output.
#[test]
fn test_injector_with_country_pack_date_baseline() {
use datasynth_core::country::schema::{DateFormatConfig, LocaleConfig};
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
let pack = CountryPack {
locale: LocaleConfig {
date_format: DateFormatConfig {
short: "DD.MM.YYYY".to_string(),
..Default::default()
},
..Default::default()
},
..Default::default()
};
let config = FormatVariationConfig {
date_variation_rate: 0.0, ..Default::default()
};
let mut injector = FormatVariationInjector::new(config);
injector.set_country_pack(pack);
let mut rng = ChaCha8Rng::seed_from_u64(42);
let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
let formatted = injector.vary_date(date, &mut rng);
assert_eq!(formatted, "15.01.2024");
}
// With a rate of 0.0 the country pack's number separators drive the output.
#[test]
fn test_injector_with_country_pack_amount_baseline() {
use datasynth_core::country::schema::{LocaleConfig, NumberFormatConfig};
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
let pack = CountryPack {
locale: LocaleConfig {
number_format: NumberFormatConfig {
decimal_separator: ",".to_string(),
thousands_separator: ".".to_string(),
..Default::default()
},
currency_symbol: "\u{20ac}".to_string(),
default_currency: "EUR".to_string(),
..Default::default()
},
..Default::default()
};
let config = FormatVariationConfig {
amount_variation_rate: 0.0, ..Default::default()
};
let mut injector = FormatVariationInjector::new(config);
injector.set_country_pack(pack);
let mut rng = ChaCha8Rng::seed_from_u64(42);
let amount = dec!(1234.56);
let formatted = injector.vary_amount(amount, &mut rng);
assert_eq!(formatted, "1.234,56");
}
// Without a country pack the baselines are ISO dates and plain amounts.
#[test]
fn test_injector_without_country_pack_uses_defaults() {
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
let config = FormatVariationConfig {
date_variation_rate: 0.0,
amount_variation_rate: 0.0,
..Default::default()
};
let mut injector = FormatVariationInjector::new(config);
let mut rng = ChaCha8Rng::seed_from_u64(42);
let date = NaiveDate::from_ymd_opt(2024, 1, 15).unwrap();
let formatted_date = injector.vary_date(date, &mut rng);
assert_eq!(formatted_date, "2024-01-15");
let amount = dec!(1234.56);
let formatted_amount = injector.vary_amount(amount, &mut rng);
assert_eq!(formatted_amount, "1234.56"); }
}