1use chrono::{Duration, NaiveDate};
10use rand::Rng;
11use rust_decimal::Decimal;
12
/// Classifies how a generated duplicate relates to the record it was copied from.
#[derive(Debug, Clone, PartialEq)]
pub enum DuplicateType {
    /// A copy that receives no field variations.
    Exact,
    /// A copy in which a configured list of fields is selected for slight variation.
    Near {
        /// Names of the fields selected for variation.
        varying_fields: Vec<String>,
    },
    /// A copy whose fields are varied at random, driven by a similarity score.
    Fuzzy {
        /// Similarity score attached to the duplicate.
        similarity: f64,
    },
    /// A copy attributed to another system.
    CrossSystem {
        /// Name of the system the record is copied from.
        source_system: String,
        /// Name of the system the copy is attributed to.
        target_system: String,
    },
}
36
/// Tunable parameters for duplicate generation.
#[derive(Debug, Clone)]
pub struct DuplicateConfig {
    /// Probability that a record is selected for duplication.
    pub duplicate_rate: f64,
    /// Share of duplicates that are exact copies.
    pub exact_rate: f64,
    /// Share of duplicates that are near duplicates.
    pub near_rate: f64,
    /// Share of duplicates that are fuzzy duplicates.
    pub fuzzy_rate: f64,
    /// Largest date shift, in days, applied to a near duplicate (in either direction).
    pub max_date_offset_days: i64,
    /// Names of the fields a near duplicate varies.
    pub varying_fields: Vec<String>,
    /// Relative amount variance for near duplicates (e.g. `0.01` for about ±1%).
    pub amount_variance: f64,
}
55
56impl Default for DuplicateConfig {
57 fn default() -> Self {
58 Self {
59 duplicate_rate: 0.005, exact_rate: 0.3, near_rate: 0.5, fuzzy_rate: 0.2, max_date_offset_days: 5,
64 varying_fields: vec![
65 "entry_date".to_string(),
66 "created_by".to_string(),
67 "description".to_string(),
68 ],
69 amount_variance: 0.01, }
71 }
72}
73
74#[derive(Debug, Clone)]
76pub struct DuplicateRecord<T: Clone> {
77 pub original: T,
79 pub duplicate: T,
81 pub duplicate_type: DuplicateType,
83 pub differing_fields: Vec<String>,
85 pub duplicate_id: String,
87}
88
89pub trait Duplicatable: Clone {
91 fn get_id(&self) -> String;
93
94 fn set_id(&mut self, id: String);
96
97 fn get_field(&self, field: &str) -> Option<String>;
99
100 fn set_field(&mut self, field: &str, value: &str);
102
103 fn get_amount(&self) -> Option<Decimal>;
105
106 fn set_amount(&mut self, amount: Decimal);
108
109 fn get_date(&self) -> Option<NaiveDate>;
111
112 fn set_date(&mut self, date: NaiveDate);
114}
115
116pub struct DuplicateGenerator {
118 config: DuplicateConfig,
119 stats: DuplicateStats,
120 next_duplicate_id: u64,
121}
122
/// Running counters for generated duplicates; every counter defaults to zero.
#[derive(Debug, Clone, Default)]
pub struct DuplicateStats {
    /// Number of records processed.
    pub total_processed: usize,
    /// Number of duplicates created, across all kinds.
    pub total_duplicates: usize,
    /// Number of exact duplicates created.
    pub exact_duplicates: usize,
    /// Number of near duplicates created.
    pub near_duplicates: usize,
    /// Number of fuzzy duplicates created.
    pub fuzzy_duplicates: usize,
    /// Number of cross-system duplicates created.
    pub cross_system_duplicates: usize,
}
139
140impl DuplicateGenerator {
141 pub fn new(config: DuplicateConfig) -> Self {
143 Self {
144 config,
145 stats: DuplicateStats::default(),
146 next_duplicate_id: 1,
147 }
148 }
149
150 pub fn should_duplicate<R: Rng>(&self, rng: &mut R) -> bool {
152 rng.gen::<f64>() < self.config.duplicate_rate
153 }
154
155 pub fn create_duplicate<T: Duplicatable, R: Rng>(
157 &mut self,
158 record: &T,
159 rng: &mut R,
160 ) -> DuplicateRecord<T> {
161 self.stats.total_processed += 1;
162 self.stats.total_duplicates += 1;
163
164 let duplicate_type = self.select_duplicate_type(rng);
165 let mut duplicate = record.clone();
166 let mut differing_fields = Vec::new();
167
168 let new_id = format!("{}-DUP{}", record.get_id(), self.next_duplicate_id);
170 self.next_duplicate_id += 1;
171 duplicate.set_id(new_id);
172 differing_fields.push("id".to_string());
173
174 match &duplicate_type {
175 DuplicateType::Exact => {
176 self.stats.exact_duplicates += 1;
177 }
179 DuplicateType::Near { varying_fields } => {
180 self.stats.near_duplicates += 1;
181 self.apply_near_duplicate_variations(&mut duplicate, varying_fields, rng);
182 differing_fields.extend(varying_fields.clone());
183 }
184 DuplicateType::Fuzzy { similarity } => {
185 self.stats.fuzzy_duplicates += 1;
186 let varied = self.apply_fuzzy_variations(&mut duplicate, *similarity, rng);
187 differing_fields.extend(varied);
188 }
189 DuplicateType::CrossSystem {
190 source_system: _,
191 target_system,
192 } => {
193 self.stats.cross_system_duplicates += 1;
194 if let Some(_current_id) = duplicate.get_field("system_id") {
196 duplicate.set_field("system_id", target_system);
197 differing_fields.push("system_id".to_string());
198 }
199 }
200 }
201
202 let duplicate_id = format!("DUP{:08}", self.stats.total_duplicates);
203
204 DuplicateRecord {
205 original: record.clone(),
206 duplicate,
207 duplicate_type,
208 differing_fields,
209 duplicate_id,
210 }
211 }
212
213 fn select_duplicate_type<R: Rng>(&self, rng: &mut R) -> DuplicateType {
215 let r = rng.gen::<f64>();
216
217 if r < self.config.exact_rate {
218 DuplicateType::Exact
219 } else if r < self.config.exact_rate + self.config.near_rate {
220 DuplicateType::Near {
221 varying_fields: self.config.varying_fields.clone(),
222 }
223 } else {
224 DuplicateType::Fuzzy {
225 similarity: rng.gen_range(0.8..0.95),
226 }
227 }
228 }
229
230 fn apply_near_duplicate_variations<T: Duplicatable, R: Rng>(
232 &self,
233 record: &mut T,
234 varying_fields: &[String],
235 rng: &mut R,
236 ) {
237 for field in varying_fields {
238 match field.as_str() {
239 "entry_date" | "date" => {
240 if let Some(date) = record.get_date() {
241 let offset = rng.gen_range(
242 -self.config.max_date_offset_days..=self.config.max_date_offset_days,
243 );
244 record.set_date(date + Duration::days(offset));
245 }
246 }
247 "amount" | "debit_amount" | "credit_amount" => {
248 if let Some(amount) = record.get_amount() {
249 let variance = 1.0
250 + rng.gen_range(
251 -self.config.amount_variance..self.config.amount_variance,
252 );
253 let new_amount =
254 amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
255 record.set_amount(new_amount.round_dp(2));
256 }
257 }
258 "description" => {
259 if let Some(desc) = record.get_field("description") {
260 let variations = [
262 format!("{} ", desc),
263 format!(" {}", desc),
264 desc.to_uppercase(),
265 desc.to_lowercase(),
266 ];
267 let variation = &variations[rng.gen_range(0..variations.len())];
268 record.set_field("description", variation);
269 }
270 }
271 _ => {
272 if let Some(value) = record.get_field(field) {
274 record.set_field(field, &format!("{} ", value));
275 }
276 }
277 }
278 }
279 }
280
281 fn apply_fuzzy_variations<T: Duplicatable, R: Rng>(
283 &self,
284 record: &mut T,
285 similarity: f64,
286 rng: &mut R,
287 ) -> Vec<String> {
288 let mut varied_fields = Vec::new();
289 let change_probability = 1.0 - similarity;
290
291 if rng.gen::<f64>() < change_probability {
293 if let Some(amount) = record.get_amount() {
294 let variance = 1.0 + rng.gen_range(-0.1..0.1); let new_amount =
296 amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
297 record.set_amount(new_amount.round_dp(2));
298 varied_fields.push("amount".to_string());
299 }
300 }
301
302 if rng.gen::<f64>() < change_probability {
304 if let Some(date) = record.get_date() {
305 let offset = rng.gen_range(-30..=30);
306 record.set_date(date + Duration::days(offset));
307 varied_fields.push("date".to_string());
308 }
309 }
310
311 if rng.gen::<f64>() < change_probability {
313 if let Some(desc) = record.get_field("description") {
314 let abbreviated = abbreviate_text(&desc);
316 record.set_field("description", &abbreviated);
317 varied_fields.push("description".to_string());
318 }
319 }
320
321 varied_fields
322 }
323
324 pub fn stats(&self) -> &DuplicateStats {
326 &self.stats
327 }
328
329 pub fn reset_stats(&mut self) {
331 self.stats = DuplicateStats::default();
332 }
333}
334
/// Replaces common business terms in `text` with their abbreviations.
///
/// Matching is case-sensitive and substring-based: every occurrence of a term
/// is replaced, including inside longer words. Terms are applied in table order.
fn abbreviate_text(text: &str) -> String {
    const ABBREVIATIONS: [(&str, &str); 10] = [
        ("Account", "Acct"),
        ("Payment", "Pmt"),
        ("Invoice", "Inv"),
        ("Number", "No"),
        ("Department", "Dept"),
        ("Company", "Co"),
        ("Corporation", "Corp"),
        ("International", "Intl"),
        ("Management", "Mgmt"),
        ("Reference", "Ref"),
    ];

    ABBREVIATIONS
        .iter()
        .fold(text.to_string(), |shortened, &(full, abbr)| {
            shortened.replace(full, abbr)
        })
}
356
/// Detects likely duplicates by comparing selected fields of two records.
pub struct DuplicateDetector {
    /// Minimum average similarity at which two records count as duplicates.
    similarity_threshold: f64,
    /// Names of the text fields that are compared.
    comparison_fields: Vec<String>,
}
364
365impl DuplicateDetector {
366 pub fn new(similarity_threshold: f64, comparison_fields: Vec<String>) -> Self {
368 Self {
369 similarity_threshold,
370 comparison_fields,
371 }
372 }
373
374 pub fn string_similarity(&self, a: &str, b: &str) -> f64 {
376 if a == b {
377 return 1.0;
378 }
379
380 let a_chars: std::collections::HashSet<char> = a.chars().collect();
381 let b_chars: std::collections::HashSet<char> = b.chars().collect();
382
383 let intersection = a_chars.intersection(&b_chars).count();
384 let union = a_chars.union(&b_chars).count();
385
386 if union == 0 {
387 0.0
388 } else {
389 intersection as f64 / union as f64
390 }
391 }
392
393 pub fn are_duplicates<T: Duplicatable>(&self, a: &T, b: &T) -> bool {
395 let mut total_similarity = 0.0;
396 let mut field_count = 0;
397
398 for field in &self.comparison_fields {
399 if let (Some(val_a), Some(val_b)) = (a.get_field(field), b.get_field(field)) {
400 total_similarity += self.string_similarity(&val_a, &val_b);
401 field_count += 1;
402 }
403 }
404
405 if let (Some(amt_a), Some(amt_b)) = (a.get_amount(), b.get_amount()) {
407 let amt_a_f64: f64 = amt_a.try_into().unwrap_or(0.0);
408 let amt_b_f64: f64 = amt_b.try_into().unwrap_or(0.0);
409
410 if amt_a_f64.abs() > 0.0 {
411 let ratio = (amt_a_f64 - amt_b_f64).abs() / amt_a_f64.abs();
412 total_similarity += 1.0 - ratio.min(1.0);
413 field_count += 1;
414 }
415 }
416
417 if field_count == 0 {
418 return false;
419 }
420
421 let avg_similarity = total_similarity / field_count as f64;
422 avg_similarity >= self.similarity_threshold
423 }
424
425 pub fn find_duplicates<T: Duplicatable>(&self, records: &[T]) -> Vec<(usize, usize, f64)> {
427 let mut duplicates = Vec::new();
428
429 for i in 0..records.len() {
430 for j in (i + 1)..records.len() {
431 if self.are_duplicates(&records[i], &records[j]) {
432 let mut similarity = 0.0;
433 let mut count = 0;
434
435 for field in &self.comparison_fields {
436 if let (Some(a), Some(b)) =
437 (records[i].get_field(field), records[j].get_field(field))
438 {
439 similarity += self.string_similarity(&a, &b);
440 count += 1;
441 }
442 }
443
444 if count > 0 {
445 duplicates.push((i, j, similarity / count as f64));
446 }
447 }
448 }
449 }
450
451 duplicates
452 }
453}
454
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;
    use rand_chacha::ChaCha8Rng;
    use rust_decimal_macros::dec;

    /// Minimal record used to exercise the generator and detector.
    #[derive(Clone)]
    struct TestRecord {
        id: String,
        description: String,
        amount: Decimal,
        date: NaiveDate,
    }

    impl Duplicatable for TestRecord {
        fn get_id(&self) -> String {
            self.id.clone()
        }

        fn set_id(&mut self, id: String) {
            self.id = id;
        }

        // Only `description` and `id` are readable as text fields.
        fn get_field(&self, field: &str) -> Option<String> {
            let value = match field {
                "description" => &self.description,
                "id" => &self.id,
                _ => return None,
            };
            Some(value.clone())
        }

        // Only `description` is writable; other field names are ignored.
        fn set_field(&mut self, field: &str, value: &str) {
            match field {
                "description" => self.description = value.to_string(),
                _ => {}
            }
        }

        fn get_amount(&self) -> Option<Decimal> {
            Some(self.amount)
        }

        fn set_amount(&mut self, amount: Decimal) {
            self.amount = amount;
        }

        fn get_date(&self) -> Option<NaiveDate> {
            Some(self.date)
        }

        fn set_date(&mut self, date: NaiveDate) {
            self.date = date;
        }
    }

    /// Builds the journal-entry fixture shared by the generator test.
    fn sample_record() -> TestRecord {
        TestRecord {
            id: "JE001".to_string(),
            description: "Test Entry".to_string(),
            amount: dec!(1000),
            date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        }
    }

    #[test]
    fn test_duplicate_generation() {
        let mut generator = DuplicateGenerator::new(DuplicateConfig::default());
        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let record = sample_record();

        let duplicate = generator.create_duplicate(&record, &mut rng);

        // The copy must carry a new id and be counted exactly once.
        assert_ne!(duplicate.duplicate.get_id(), record.get_id());
        assert_eq!(generator.stats().total_duplicates, 1);
    }

    #[test]
    fn test_string_similarity() {
        let detector = DuplicateDetector::new(0.8, vec!["description".to_string()]);

        assert_eq!(detector.string_similarity("hello", "hello"), 1.0);
        assert!(detector.string_similarity("hello", "helo") > 0.8);
        assert!(detector.string_similarity("abc", "xyz") < 0.5);
    }

    #[test]
    fn test_abbreviate_text() {
        let abbreviated = abbreviate_text("Account Payment Invoice");
        assert_eq!(abbreviated, "Acct Pmt Inv");
    }
}