use chrono::{Duration, NaiveDate};
use rand::Rng;
use rust_decimal::Decimal;
use serde::{Deserialize, Serialize};
/// Classification of a generated duplicate, describing how it relates to its original.
#[derive(Debug, Clone, PartialEq)]
pub enum DuplicateType {
/// Copy whose fields are left untouched by the generator apart from the new record ID.
Exact,
/// Copy in which a configured set of fields is targeted for small variations.
Near {
/// Names of the fields targeted for variation.
varying_fields: Vec<String>,
},
/// Copy whose amount, date and description are each varied at random.
Fuzzy {
/// Target similarity; the generator uses `1.0 - similarity` as the
/// probability of varying each candidate field.
similarity: f64,
},
/// Copy attributed to a different system than its original.
CrossSystem {
/// System the original record belongs to (not read by the generator in this file).
source_system: String,
/// System written into the duplicate's `"system_id"` field when the record exposes one.
target_system: String,
},
}
/// Tunable parameters controlling how often and in what way duplicates are produced.
#[derive(Debug, Clone)]
pub struct DuplicateConfig {
/// Probability that `DuplicateGenerator::should_duplicate` returns `true` for a record.
pub duplicate_rate: f64,
/// Share of generated duplicates that are `DuplicateType::Exact`.
pub exact_rate: f64,
/// Share of generated duplicates that are `DuplicateType::Near`.
pub near_rate: f64,
/// Intended share of generated duplicates that are `DuplicateType::Fuzzy`
/// (the default rates sum to 1.0).
pub fuzzy_rate: f64,
/// Largest shift, in days in either direction, applied when a near-duplicate varies a date.
pub max_date_offset_days: i64,
/// Field names handed to every `Near` duplicate as the fields to vary.
pub varying_fields: Vec<String>,
/// Largest relative change applied when a near-duplicate varies an amount (0.01 = 1%).
pub amount_variance: f64,
}
impl Default for DuplicateConfig {
    /// Builds the stock configuration: a 0.5% duplication rate split 30/50/20
    /// between exact, near and fuzzy duplicates, date shifts of up to 5 days,
    /// a 1% amount variance, and three default varying fields.
    fn default() -> Self {
        let varying_fields = ["entry_date", "created_by", "description"]
            .iter()
            .copied()
            .map(String::from)
            .collect();

        Self {
            duplicate_rate: 0.005,
            exact_rate: 0.3,
            near_rate: 0.5,
            fuzzy_rate: 0.2,
            max_date_offset_days: 5,
            varying_fields,
            amount_variance: 0.01,
        }
    }
}
/// Pairing of an original record with the duplicate generated from it.
#[derive(Debug, Clone)]
pub struct DuplicateRecord<T: Clone> {
/// Clone of the record the duplicate was derived from.
pub original: T,
/// The generated duplicate, carrying a new ID and any applied variations.
pub duplicate: T,
/// How the duplicate was produced.
pub duplicate_type: DuplicateType,
/// Names of the fields reported as differing from the original; `"id"` is always first.
pub differing_fields: Vec<String>,
/// Identifier of this duplicate pairing: `DUP` followed by a number zero-padded to 8 digits.
pub duplicate_id: String,
}
/// Accessors a record type must provide so duplicates of it can be generated and compared.
///
/// Getters returning `Option` yield `None` when the record has no such value;
/// the generator and detector skip the corresponding step in that case.
pub trait Duplicatable: Clone {
/// Returns the record's identifier.
fn get_id(&self) -> String;
/// Replaces the record's identifier.
fn set_id(&mut self, id: String);
/// Returns the value of the named field as a string, or `None` if unavailable.
fn get_field(&self, field: &str) -> Option<String>;
/// Sets the named field to `value`.
// NOTE(review): implementations may ignore unknown field names (the test fixture does) — confirm per type.
fn set_field(&mut self, field: &str, value: &str);
/// Returns the record's monetary amount, if it has one.
fn get_amount(&self) -> Option<Decimal>;
/// Replaces the record's monetary amount.
fn set_amount(&mut self, amount: Decimal);
/// Returns the record's date, if it has one.
fn get_date(&self) -> Option<NaiveDate>;
/// Replaces the record's date.
fn set_date(&mut self, date: NaiveDate);
}
/// Produces duplicates of `Duplicatable` records and keeps running statistics.
pub struct DuplicateGenerator {
/// Rates and variation limits used when generating duplicates.
config: DuplicateConfig,
/// Counters updated on every generated duplicate.
stats: DuplicateStats,
/// Sequence number appended to the ID of the next duplicate; starts at 1.
next_duplicate_id: u64,
}
/// Running counters describing the duplicates a `DuplicateGenerator` has produced.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DuplicateStats {
/// Number of records passed to `create_duplicate` (incremented once per call).
pub total_processed: usize,
/// Number of duplicates created (also incremented once per `create_duplicate` call).
pub total_duplicates: usize,
/// Number of duplicates of type `Exact`.
pub exact_duplicates: usize,
/// Number of duplicates of type `Near`.
pub near_duplicates: usize,
/// Number of duplicates of type `Fuzzy`.
pub fuzzy_duplicates: usize,
/// Number of duplicates of type `CrossSystem`.
pub cross_system_duplicates: usize,
}
impl DuplicateGenerator {
    /// Creates a generator with the given configuration, zeroed statistics and
    /// the duplicate sequence counter starting at 1.
    pub fn new(config: DuplicateConfig) -> Self {
        Self {
            config,
            stats: DuplicateStats::default(),
            next_duplicate_id: 1,
        }
    }

    /// Randomly decides whether a record should be duplicated.
    ///
    /// Draws one uniform `f64` from `rng` and returns `true` when it is below
    /// the configured `duplicate_rate`.
    pub fn should_duplicate<R: Rng>(&self, rng: &mut R) -> bool {
        rng.random::<f64>() < self.config.duplicate_rate
    }

    /// Creates a duplicate of `record` and updates the statistics.
    ///
    /// The duplicate receives the ID `<original id>-DUP<n>`, where `n` is a
    /// sequence number that is never reset. The pairing's `duplicate_id` is
    /// derived from the same sequence number, so it stays unique even after
    /// [`reset_stats`](Self::reset_stats) has been called.
    ///
    /// `differing_fields` always starts with `"id"`, followed by the fields
    /// whose values were actually observed to change on the duplicate.
    pub fn create_duplicate<T: Duplicatable, R: Rng>(
        &mut self,
        record: &T,
        rng: &mut R,
    ) -> DuplicateRecord<T> {
        self.stats.total_processed += 1;
        self.stats.total_duplicates += 1;

        let duplicate_type = self.select_duplicate_type(rng);
        let mut duplicate = record.clone();

        // One monotonic sequence number feeds both the record ID and the
        // pairing ID; the statistics counter is not used because it restarts
        // on `reset_stats` and would hand out the same ID twice.
        let sequence = self.next_duplicate_id;
        self.next_duplicate_id += 1;
        duplicate.set_id(format!("{}-DUP{}", record.get_id(), sequence));
        let mut differing_fields = vec!["id".to_string()];

        match &duplicate_type {
            DuplicateType::Exact => {
                self.stats.exact_duplicates += 1;
            }
            DuplicateType::Near { varying_fields } => {
                self.stats.near_duplicates += 1;
                let changed =
                    self.apply_near_duplicate_variations(&mut duplicate, varying_fields, rng);
                differing_fields.extend(changed);
            }
            DuplicateType::Fuzzy { similarity } => {
                self.stats.fuzzy_duplicates += 1;
                let changed = self.apply_fuzzy_variations(&mut duplicate, *similarity, rng);
                differing_fields.extend(changed);
            }
            DuplicateType::CrossSystem {
                source_system: _,
                target_system,
            } => {
                self.stats.cross_system_duplicates += 1;
                // Only records that expose a system ID can be re-attributed.
                if duplicate.get_field("system_id").is_some() {
                    duplicate.set_field("system_id", target_system);
                    differing_fields.push("system_id".to_string());
                }
            }
        }

        DuplicateRecord {
            original: record.clone(),
            duplicate,
            duplicate_type,
            differing_fields,
            duplicate_id: format!("DUP{sequence:08}"),
        }
    }

    /// Picks the kind of duplicate to generate.
    ///
    /// `exact_rate`, `near_rate` and `fuzzy_rate` are treated as relative
    /// weights (negative or NaN weights count as 0). When they sum to 1 the
    /// draw is compared against them directly; otherwise the draw is scaled
    /// by their sum so that every configured rate, including `fuzzy_rate`,
    /// is honoured. If the weights do not add up to a positive finite number
    /// the draw is left unscaled.
    fn select_duplicate_type<R: Rng>(&self, rng: &mut R) -> DuplicateType {
        let exact = self.config.exact_rate.max(0.0);
        let near = self.config.near_rate.max(0.0);
        let fuzzy = self.config.fuzzy_rate.max(0.0);

        let total = exact + near + fuzzy;
        let scale = if total.is_finite() && total > 0.0 {
            total
        } else {
            1.0
        };

        let draw = rng.random::<f64>() * scale;
        if draw < exact {
            DuplicateType::Exact
        } else if draw < exact + near {
            DuplicateType::Near {
                varying_fields: self.config.varying_fields.clone(),
            }
        } else {
            DuplicateType::Fuzzy {
                similarity: rng.random_range(0.8..0.95),
            }
        }
    }

    /// Applies a small variation to each field named in `varying_fields`.
    ///
    /// Date fields are shifted by up to `max_date_offset_days`, amount fields
    /// are scaled by up to `amount_variance`, the description gets a padding
    /// or case change, and any other field gets a trailing space appended.
    ///
    /// Returns the names of the fields whose value actually changed; fields
    /// the record does not expose, or whose variation turned out to be a
    /// no-op (for example a zero-day shift), are not reported.
    fn apply_near_duplicate_variations<T: Duplicatable, R: Rng>(
        &self,
        record: &mut T,
        varying_fields: &[String],
        rng: &mut R,
    ) -> Vec<String> {
        let mut changed = Vec::new();

        for field in varying_fields {
            let modified = match field.as_str() {
                "entry_date" | "date" => {
                    Self::shift_date(record, self.config.max_date_offset_days, rng)
                }
                "amount" | "debit_amount" | "credit_amount" => {
                    Self::scale_amount(record, self.config.amount_variance, rng)
                }
                "description" => match record.get_field("description") {
                    Some(desc) => {
                        let variations = [
                            format!("{desc} "),
                            format!(" {desc}"),
                            desc.to_uppercase(),
                            desc.to_lowercase(),
                        ];
                        let choice = rng.random_range(0..variations.len());
                        Self::replace_field(record, "description", &desc, &variations[choice])
                    }
                    None => false,
                },
                name => match record.get_field(name) {
                    Some(value) => {
                        Self::replace_field(record, name, &value, &format!("{value} "))
                    }
                    None => false,
                },
            };

            if modified {
                changed.push(field.clone());
            }
        }

        changed
    }

    /// Applies larger, independently randomised variations to a record.
    ///
    /// Amount (±10%), date (±30 days) and description (abbreviation) are each
    /// varied with probability `1.0 - similarity`. Returns the names of the
    /// fields whose value actually changed.
    fn apply_fuzzy_variations<T: Duplicatable, R: Rng>(
        &self,
        record: &mut T,
        similarity: f64,
        rng: &mut R,
    ) -> Vec<String> {
        let mut varied_fields = Vec::new();
        let change_probability = 1.0 - similarity;

        if rng.random::<f64>() < change_probability && Self::scale_amount(record, 0.1, rng) {
            varied_fields.push("amount".to_string());
        }

        if rng.random::<f64>() < change_probability && Self::shift_date(record, 30, rng) {
            varied_fields.push("date".to_string());
        }

        if rng.random::<f64>() < change_probability {
            if let Some(desc) = record.get_field("description") {
                let abbreviated = abbreviate_text(&desc);
                if Self::replace_field(record, "description", &desc, &abbreviated) {
                    varied_fields.push("description".to_string());
                }
            }
        }

        varied_fields
    }

    /// Shifts the record's date by a random number of days in
    /// `-max_offset_days..=max_offset_days` and reports whether it changed.
    ///
    /// A negative bound is treated as 0 (sampling an empty range would panic),
    /// and a shift that would leave the representable date range is skipped.
    fn shift_date<T: Duplicatable, R: Rng>(
        record: &mut T,
        max_offset_days: i64,
        rng: &mut R,
    ) -> bool {
        let date = match record.get_date() {
            Some(date) => date,
            None => return false,
        };

        let bound = max_offset_days.max(0);
        let offset = rng.random_range(-bound..=bound);
        let shifted = Duration::try_days(offset).and_then(|delta| date.checked_add_signed(delta));

        match shifted {
            Some(new_date) => {
                record.set_date(new_date);
                record.get_date() != Some(date)
            }
            None => false,
        }
    }

    /// Scales the record's amount by a random factor in
    /// `1 - max_variance .. 1 + max_variance`, rounds it to two decimal places
    /// and reports whether the amount changed.
    ///
    /// A bound that is zero, negative or not finite leaves the amount alone
    /// (sampling an empty range would panic), as does a multiplication that
    /// would overflow `Decimal`.
    fn scale_amount<T: Duplicatable, R: Rng>(
        record: &mut T,
        max_variance: f64,
        rng: &mut R,
    ) -> bool {
        let amount = match record.get_amount() {
            Some(amount) => amount,
            None => return false,
        };
        if !(max_variance.is_finite() && max_variance > 0.0) {
            return false;
        }

        let factor = 1.0 + rng.random_range(-max_variance..max_variance);
        let multiplier = Decimal::from_f64_retain(factor).unwrap_or(Decimal::ONE);
        let scaled = match amount.checked_mul(multiplier) {
            Some(value) => value.round_dp(2),
            None => return false,
        };

        record.set_amount(scaled);
        record.get_amount() != Some(amount)
    }

    /// Writes `after` into `field` and reports whether the value read back
    /// differs from `before` (a record may ignore writes to some fields).
    fn replace_field<T: Duplicatable>(
        record: &mut T,
        field: &str,
        before: &str,
        after: &str,
    ) -> bool {
        record.set_field(field, after);
        record.get_field(field).as_deref() != Some(before)
    }

    /// Returns the statistics accumulated so far.
    pub fn stats(&self) -> &DuplicateStats {
        &self.stats
    }

    /// Resets all statistics counters to zero.
    ///
    /// The duplicate sequence counter is left untouched, so IDs issued after
    /// a reset do not repeat earlier ones.
    pub fn reset_stats(&mut self) {
        self.stats = DuplicateStats::default();
    }
}
/// Replaces well-known business terms in `text` with their abbreviations.
///
/// Matching is case-sensitive and purely textual, so a term is also replaced
/// when it occurs inside a longer word. Replacements are applied in table order.
fn abbreviate_text(text: &str) -> String {
    const ABBREVIATIONS: [(&str, &str); 10] = [
        ("Account", "Acct"),
        ("Payment", "Pmt"),
        ("Invoice", "Inv"),
        ("Number", "No"),
        ("Department", "Dept"),
        ("Company", "Co"),
        ("Corporation", "Corp"),
        ("International", "Intl"),
        ("Management", "Mgmt"),
        ("Reference", "Ref"),
    ];

    ABBREVIATIONS
        .iter()
        .fold(text.to_owned(), |shortened, &(full, abbr)| {
            shortened.replace(full, abbr)
        })
}
/// Detects likely duplicates among `Duplicatable` records by averaging per-field similarities.
pub struct DuplicateDetector {
/// Minimum average similarity at which two records are considered duplicates.
similarity_threshold: f64,
/// Names of the string fields looked up on both records and compared.
comparison_fields: Vec<String>,
}
impl DuplicateDetector {
pub fn new(similarity_threshold: f64, comparison_fields: Vec<String>) -> Self {
Self {
similarity_threshold,
comparison_fields,
}
}
pub fn string_similarity(&self, a: &str, b: &str) -> f64 {
if a == b {
return 1.0;
}
let a_chars: std::collections::HashSet<char> = a.chars().collect();
let b_chars: std::collections::HashSet<char> = b.chars().collect();
let intersection = a_chars.intersection(&b_chars).count();
let union = a_chars.union(&b_chars).count();
if union == 0 {
0.0
} else {
intersection as f64 / union as f64
}
}
pub fn are_duplicates<T: Duplicatable>(&self, a: &T, b: &T) -> bool {
let mut total_similarity = 0.0;
let mut field_count = 0;
for field in &self.comparison_fields {
if let (Some(val_a), Some(val_b)) = (a.get_field(field), b.get_field(field)) {
total_similarity += self.string_similarity(&val_a, &val_b);
field_count += 1;
}
}
if let (Some(amt_a), Some(amt_b)) = (a.get_amount(), b.get_amount()) {
let amt_a_f64: f64 = amt_a.try_into().unwrap_or(0.0);
let amt_b_f64: f64 = amt_b.try_into().unwrap_or(0.0);
if amt_a_f64.abs() > 0.0 {
let ratio = (amt_a_f64 - amt_b_f64).abs() / amt_a_f64.abs();
total_similarity += 1.0 - ratio.min(1.0);
field_count += 1;
}
}
if field_count == 0 {
return false;
}
let avg_similarity = total_similarity / field_count as f64;
avg_similarity >= self.similarity_threshold
}
pub fn find_duplicates<T: Duplicatable>(&self, records: &[T]) -> Vec<(usize, usize, f64)> {
let mut duplicates = Vec::new();
for i in 0..records.len() {
for j in (i + 1)..records.len() {
if self.are_duplicates(&records[i], &records[j]) {
let mut similarity = 0.0;
let mut count = 0;
for field in &self.comparison_fields {
if let (Some(a), Some(b)) =
(records[i].get_field(field), records[j].get_field(field))
{
similarity += self.string_similarity(&a, &b);
count += 1;
}
}
if count > 0 {
duplicates.push((i, j, similarity / count as f64));
}
}
}
}
duplicates
}
}
// Unit tests for duplicate generation, string similarity and text abbreviation.
#[cfg(test)]
// `unwrap` is acceptable here: it is only used on a hard-coded, valid calendar date.
#[allow(clippy::unwrap_used)]
mod tests {
use super::*;
// Minimal record used as a test fixture.
#[derive(Clone)]
struct TestRecord {
id: String,
description: String,
amount: Decimal,
date: NaiveDate,
}
// Only "description" and "id" are readable by name, and only "description" is writable.
impl Duplicatable for TestRecord {
fn get_id(&self) -> String {
self.id.clone()
}
fn set_id(&mut self, id: String) {
self.id = id;
}
fn get_field(&self, field: &str) -> Option<String> {
match field {
"description" => Some(self.description.clone()),
"id" => Some(self.id.clone()),
_ => None,
}
}
fn set_field(&mut self, field: &str, value: &str) {
// Writes to any field other than "description" are silently ignored.
if field == "description" {
self.description = value.to_string();
}
}
fn get_amount(&self) -> Option<Decimal> {
Some(self.amount)
}
fn set_amount(&mut self, amount: Decimal) {
self.amount = amount;
}
fn get_date(&self) -> Option<NaiveDate> {
Some(self.date)
}
fn set_date(&mut self, date: NaiveDate) {
self.date = date;
}
}
// A generated duplicate must get a new ID and be counted in the statistics.
#[test]
fn test_duplicate_generation() {
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;
use rust_decimal_macros::dec;
let config = DuplicateConfig::default();
let mut generator = DuplicateGenerator::new(config);
// Fixed seed keeps the test deterministic.
let mut rng = ChaCha8Rng::seed_from_u64(42);
let record = TestRecord {
id: "JE001".to_string(),
description: "Test Entry".to_string(),
amount: dec!(1000),
date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
};
let duplicate = generator.create_duplicate(&record, &mut rng);
assert_ne!(duplicate.duplicate.get_id(), record.get_id());
assert_eq!(generator.stats().total_duplicates, 1);
}
// Identical strings score 1.0; strings with the same character set score high;
// strings with no characters in common score low.
#[test]
fn test_string_similarity() {
let detector = DuplicateDetector::new(0.8, vec!["description".to_string()]);
assert_eq!(detector.string_similarity("hello", "hello"), 1.0);
assert!(detector.string_similarity("hello", "helo") > 0.8);
assert!(detector.string_similarity("abc", "xyz") < 0.5);
}
// Known terms are replaced by their abbreviations.
#[test]
fn test_abbreviate_text() {
let text = "Account Payment Invoice";
let abbreviated = abbreviate_text(text);
assert_eq!(abbreviated, "Acct Pmt Inv");
}
}