1use chrono::{Duration, NaiveDate};
10use rand::{Rng, RngExt};
11use rust_decimal::Decimal;
12use serde::{Deserialize, Serialize};
13
/// Classification of how a generated duplicate relates to its original.
#[derive(Debug, Clone, PartialEq)]
pub enum DuplicateType {
    /// Straight copy of the original; the generator changes nothing besides
    /// the record id.
    Exact,
    /// Copy in which a configured set of fields is perturbed.
    Near {
        /// Names of the fields selected for variation.
        varying_fields: Vec<String>,
    },
    /// Copy whose fields are perturbed probabilistically.
    Fuzzy {
        /// Target similarity to the original; the generator uses
        /// `1.0 - similarity` as the per-field change probability.
        similarity: f64,
    },
    /// Copy representing the same record in another system.
    CrossSystem {
        /// System the original record is attributed to.
        source_system: String,
        /// System identifier written to the duplicate's `"system_id"` field.
        target_system: String,
    },
}
37
/// Tunable parameters for duplicate generation.
#[derive(Debug, Clone)]
pub struct DuplicateConfig {
    /// Probability threshold compared against a uniform draw to decide
    /// whether a record gets duplicated.
    pub duplicate_rate: f64,
    /// Share of duplicates generated as [`DuplicateType::Exact`].
    pub exact_rate: f64,
    /// Share of duplicates generated as [`DuplicateType::Near`].
    pub near_rate: f64,
    /// Nominal share of [`DuplicateType::Fuzzy`] duplicates.
    ///
    /// NOTE(review): the type selection in this file never reads this field;
    /// fuzzy duplicates receive whatever probability remains after
    /// `exact_rate + near_rate`.
    pub fuzzy_rate: f64,
    /// Largest date shift, in days either way, applied to near duplicates.
    pub max_date_offset_days: i64,
    /// Field names perturbed when a near duplicate is produced.
    pub varying_fields: Vec<String>,
    /// Largest relative amount change applied to near duplicates when an
    /// amount field is listed in `varying_fields`.
    pub amount_variance: f64,
}
56
57impl Default for DuplicateConfig {
58 fn default() -> Self {
59 Self {
60 duplicate_rate: 0.005, exact_rate: 0.3, near_rate: 0.5, fuzzy_rate: 0.2, max_date_offset_days: 5,
65 varying_fields: vec![
66 "entry_date".to_string(),
67 "created_by".to_string(),
68 "description".to_string(),
69 ],
70 amount_variance: 0.01, }
72 }
73}
74
75#[derive(Debug, Clone)]
77pub struct DuplicateRecord<T: Clone> {
78 pub original: T,
80 pub duplicate: T,
82 pub duplicate_type: DuplicateType,
84 pub differing_fields: Vec<String>,
86 pub duplicate_id: String,
88}
89
90pub trait Duplicatable: Clone {
92 fn get_id(&self) -> String;
94
95 fn set_id(&mut self, id: String);
97
98 fn get_field(&self, field: &str) -> Option<String>;
100
101 fn set_field(&mut self, field: &str, value: &str);
103
104 fn get_amount(&self) -> Option<Decimal>;
106
107 fn set_amount(&mut self, amount: Decimal);
109
110 fn get_date(&self) -> Option<NaiveDate>;
112
113 fn set_date(&mut self, date: NaiveDate);
115}
116
117pub struct DuplicateGenerator {
119 config: DuplicateConfig,
120 stats: DuplicateStats,
121 next_duplicate_id: u64,
122}
123
124#[derive(Debug, Clone, Default, Serialize, Deserialize)]
126pub struct DuplicateStats {
127 pub total_processed: usize,
129 pub total_duplicates: usize,
131 pub exact_duplicates: usize,
133 pub near_duplicates: usize,
135 pub fuzzy_duplicates: usize,
137 pub cross_system_duplicates: usize,
139}
140
141impl DuplicateGenerator {
142 pub fn new(config: DuplicateConfig) -> Self {
144 Self {
145 config,
146 stats: DuplicateStats::default(),
147 next_duplicate_id: 1,
148 }
149 }
150
151 pub fn should_duplicate<R: Rng>(&self, rng: &mut R) -> bool {
153 rng.random::<f64>() < self.config.duplicate_rate
154 }
155
156 pub fn create_duplicate<T: Duplicatable, R: Rng>(
158 &mut self,
159 record: &T,
160 rng: &mut R,
161 ) -> DuplicateRecord<T> {
162 self.stats.total_processed += 1;
163 self.stats.total_duplicates += 1;
164
165 let duplicate_type = self.select_duplicate_type(rng);
166 let mut duplicate = record.clone();
167 let mut differing_fields = Vec::new();
168
169 let new_id = format!("{}-DUP{}", record.get_id(), self.next_duplicate_id);
171 self.next_duplicate_id += 1;
172 duplicate.set_id(new_id);
173 differing_fields.push("id".to_string());
174
175 match &duplicate_type {
176 DuplicateType::Exact => {
177 self.stats.exact_duplicates += 1;
178 }
180 DuplicateType::Near { varying_fields } => {
181 self.stats.near_duplicates += 1;
182 self.apply_near_duplicate_variations(&mut duplicate, varying_fields, rng);
183 differing_fields.extend(varying_fields.clone());
184 }
185 DuplicateType::Fuzzy { similarity } => {
186 self.stats.fuzzy_duplicates += 1;
187 let varied = self.apply_fuzzy_variations(&mut duplicate, *similarity, rng);
188 differing_fields.extend(varied);
189 }
190 DuplicateType::CrossSystem {
191 source_system: _,
192 target_system,
193 } => {
194 self.stats.cross_system_duplicates += 1;
195 if let Some(_current_id) = duplicate.get_field("system_id") {
197 duplicate.set_field("system_id", target_system);
198 differing_fields.push("system_id".to_string());
199 }
200 }
201 }
202
203 let duplicate_id = format!("DUP{:08}", self.stats.total_duplicates);
204
205 DuplicateRecord {
206 original: record.clone(),
207 duplicate,
208 duplicate_type,
209 differing_fields,
210 duplicate_id,
211 }
212 }
213
214 fn select_duplicate_type<R: Rng>(&self, rng: &mut R) -> DuplicateType {
216 let r = rng.random::<f64>();
217
218 if r < self.config.exact_rate {
219 DuplicateType::Exact
220 } else if r < self.config.exact_rate + self.config.near_rate {
221 DuplicateType::Near {
222 varying_fields: self.config.varying_fields.clone(),
223 }
224 } else {
225 DuplicateType::Fuzzy {
226 similarity: rng.random_range(0.8..0.95),
227 }
228 }
229 }
230
231 fn apply_near_duplicate_variations<T: Duplicatable, R: Rng>(
233 &self,
234 record: &mut T,
235 varying_fields: &[String],
236 rng: &mut R,
237 ) {
238 for field in varying_fields {
239 match field.as_str() {
240 "entry_date" | "date" => {
241 if let Some(date) = record.get_date() {
242 let offset = rng.random_range(
243 -self.config.max_date_offset_days..=self.config.max_date_offset_days,
244 );
245 record.set_date(date + Duration::days(offset));
246 }
247 }
248 "amount" | "debit_amount" | "credit_amount" => {
249 if let Some(amount) = record.get_amount() {
250 let variance = 1.0
251 + rng.random_range(
252 -self.config.amount_variance..self.config.amount_variance,
253 );
254 let new_amount =
255 amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
256 record.set_amount(new_amount.round_dp(2));
257 }
258 }
259 "description" => {
260 if let Some(desc) = record.get_field("description") {
261 let variations = [
263 format!("{desc} "),
264 format!(" {desc}"),
265 desc.to_uppercase(),
266 desc.to_lowercase(),
267 ];
268 let variation = &variations[rng.random_range(0..variations.len())];
269 record.set_field("description", variation);
270 }
271 }
272 _ => {
273 if let Some(value) = record.get_field(field) {
275 record.set_field(field, &format!("{value} "));
276 }
277 }
278 }
279 }
280 }
281
282 fn apply_fuzzy_variations<T: Duplicatable, R: Rng>(
284 &self,
285 record: &mut T,
286 similarity: f64,
287 rng: &mut R,
288 ) -> Vec<String> {
289 let mut varied_fields = Vec::new();
290 let change_probability = 1.0 - similarity;
291
292 if rng.random::<f64>() < change_probability {
294 if let Some(amount) = record.get_amount() {
295 let variance = 1.0 + rng.random_range(-0.1..0.1); let new_amount =
297 amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
298 record.set_amount(new_amount.round_dp(2));
299 varied_fields.push("amount".to_string());
300 }
301 }
302
303 if rng.random::<f64>() < change_probability {
305 if let Some(date) = record.get_date() {
306 let offset = rng.random_range(-30..=30);
307 record.set_date(date + Duration::days(offset));
308 varied_fields.push("date".to_string());
309 }
310 }
311
312 if rng.random::<f64>() < change_probability {
314 if let Some(desc) = record.get_field("description") {
315 let abbreviated = abbreviate_text(&desc);
317 record.set_field("description", &abbreviated);
318 varied_fields.push("description".to_string());
319 }
320 }
321
322 varied_fields
323 }
324
325 pub fn stats(&self) -> &DuplicateStats {
327 &self.stats
328 }
329
330 pub fn reset_stats(&mut self) {
332 self.stats = DuplicateStats::default();
333 }
334}
335
/// Replaces common business terms in `text` with their abbreviations.
///
/// Matching is case-sensitive and purely textual, so a term is also replaced
/// where it occurs inside a longer word (`"Accounts"` becomes `"Accts"`).
/// Replacements are applied one term after another in table order.
fn abbreviate_text(text: &str) -> String {
    const ABBREVIATIONS: [(&str, &str); 10] = [
        ("Account", "Acct"),
        ("Payment", "Pmt"),
        ("Invoice", "Inv"),
        ("Number", "No"),
        ("Department", "Dept"),
        ("Company", "Co"),
        ("Corporation", "Corp"),
        ("International", "Intl"),
        ("Management", "Mgmt"),
        ("Reference", "Ref"),
    ];

    ABBREVIATIONS
        .iter()
        .fold(text.to_owned(), |acc, &(full, abbr)| acc.replace(full, abbr))
}
357
/// Flags record pairs as duplicates based on averaged field similarity.
#[derive(Debug, Clone)]
pub struct DuplicateDetector {
    /// Minimum average similarity at which two records count as duplicates.
    similarity_threshold: f64,
    /// Names of the string fields compared between two records.
    comparison_fields: Vec<String>,
}
365
366impl DuplicateDetector {
367 pub fn new(similarity_threshold: f64, comparison_fields: Vec<String>) -> Self {
369 Self {
370 similarity_threshold,
371 comparison_fields,
372 }
373 }
374
375 pub fn string_similarity(&self, a: &str, b: &str) -> f64 {
377 if a == b {
378 return 1.0;
379 }
380
381 let a_chars: std::collections::HashSet<char> = a.chars().collect();
382 let b_chars: std::collections::HashSet<char> = b.chars().collect();
383
384 let intersection = a_chars.intersection(&b_chars).count();
385 let union = a_chars.union(&b_chars).count();
386
387 if union == 0 {
388 0.0
389 } else {
390 intersection as f64 / union as f64
391 }
392 }
393
394 pub fn are_duplicates<T: Duplicatable>(&self, a: &T, b: &T) -> bool {
396 let mut total_similarity = 0.0;
397 let mut field_count = 0;
398
399 for field in &self.comparison_fields {
400 if let (Some(val_a), Some(val_b)) = (a.get_field(field), b.get_field(field)) {
401 total_similarity += self.string_similarity(&val_a, &val_b);
402 field_count += 1;
403 }
404 }
405
406 if let (Some(amt_a), Some(amt_b)) = (a.get_amount(), b.get_amount()) {
408 let amt_a_f64: f64 = amt_a.try_into().unwrap_or(0.0);
409 let amt_b_f64: f64 = amt_b.try_into().unwrap_or(0.0);
410
411 if amt_a_f64.abs() > 0.0 {
412 let ratio = (amt_a_f64 - amt_b_f64).abs() / amt_a_f64.abs();
413 total_similarity += 1.0 - ratio.min(1.0);
414 field_count += 1;
415 }
416 }
417
418 if field_count == 0 {
419 return false;
420 }
421
422 let avg_similarity = total_similarity / field_count as f64;
423 avg_similarity >= self.similarity_threshold
424 }
425
426 pub fn find_duplicates<T: Duplicatable>(&self, records: &[T]) -> Vec<(usize, usize, f64)> {
428 let mut duplicates = Vec::new();
429
430 for i in 0..records.len() {
431 for j in (i + 1)..records.len() {
432 if self.are_duplicates(&records[i], &records[j]) {
433 let mut similarity = 0.0;
434 let mut count = 0;
435
436 for field in &self.comparison_fields {
437 if let (Some(a), Some(b)) =
438 (records[i].get_field(field), records[j].get_field(field))
439 {
440 similarity += self.string_similarity(&a, &b);
441 count += 1;
442 }
443 }
444
445 if count > 0 {
446 duplicates.push((i, j, similarity / count as f64));
447 }
448 }
449 }
450 }
451
452 duplicates
453 }
454}
455
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal in-memory record used to exercise the generator.
    #[derive(Clone)]
    struct TestRecord {
        id: String,
        description: String,
        amount: Decimal,
        date: NaiveDate,
    }

    impl Duplicatable for TestRecord {
        fn get_id(&self) -> String {
            self.id.clone()
        }

        fn set_id(&mut self, id: String) {
            self.id = id;
        }

        fn get_field(&self, field: &str) -> Option<String> {
            let value = match field {
                "description" => &self.description,
                "id" => &self.id,
                _ => return None,
            };
            Some(value.clone())
        }

        // Only the description is writable through the generic setter.
        fn set_field(&mut self, field: &str, value: &str) {
            if field == "description" {
                self.description = value.to_owned();
            }
        }

        fn get_amount(&self) -> Option<Decimal> {
            Some(self.amount)
        }

        fn set_amount(&mut self, amount: Decimal) {
            self.amount = amount;
        }

        fn get_date(&self) -> Option<NaiveDate> {
            Some(self.date)
        }

        fn set_date(&mut self, date: NaiveDate) {
            self.date = date;
        }
    }

    /// A generated duplicate gets its own id and is counted in the stats.
    #[test]
    fn test_duplicate_generation() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;
        use rust_decimal_macros::dec;

        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let mut generator = DuplicateGenerator::new(DuplicateConfig::default());

        let record = TestRecord {
            id: "JE001".to_string(),
            description: "Test Entry".to_string(),
            amount: dec!(1000),
            date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        };

        let duplicate = generator.create_duplicate(&record, &mut rng);

        assert_ne!(duplicate.duplicate.get_id(), record.get_id());
        assert_eq!(generator.stats().total_duplicates, 1);
    }

    /// Identical and near-identical strings score high, disjoint ones low.
    #[test]
    fn test_string_similarity() {
        let detector = DuplicateDetector::new(0.8, vec!["description".to_string()]);
        let score = |a: &str, b: &str| detector.string_similarity(a, b);

        assert_eq!(score("hello", "hello"), 1.0);
        assert!(score("hello", "helo") > 0.8);
        assert!(score("abc", "xyz") < 0.5);
    }

    /// Every known term in the text is replaced by its abbreviation.
    #[test]
    fn test_abbreviate_text() {
        assert_eq!(abbreviate_text("Account Payment Invoice"), "Acct Pmt Inv");
    }
}
547}