1use chrono::{Duration, NaiveDate};
10use rand::Rng;
11use rust_decimal::Decimal;
12use serde::{Deserialize, Serialize};
13
/// Classification of a generated duplicate, describing how the copy relates to its original.
#[derive(Debug, Clone, PartialEq)]
pub enum DuplicateType {
    /// Copy for which the generator applies no variation beyond assigning a new ID.
    Exact,
    /// Copy with small variations applied to a list of named fields.
    Near {
        /// Names of the fields selected for variation.
        varying_fields: Vec<String>,
    },
    /// Copy whose amount, date and description may each be altered at random.
    Fuzzy {
        /// Similarity level; the generator alters each candidate field with
        /// probability `1.0 - similarity`.
        similarity: f64,
    },
    /// Copy attributed to another system.
    CrossSystem {
        /// Name of the originating system.
        /// NOTE(review): not read by the generator code in this file — confirm intended use.
        source_system: String,
        /// Value the generator writes into the record's `system_id` field when that field exists.
        target_system: String,
    },
}
37
/// Settings that control how often duplicates are produced and how they are varied.
///
/// NOTE(review): `exact_rate`, `near_rate` and `fuzzy_rate` are presumably meant to
/// sum to 1.0 (the defaults do) — confirm against callers.
#[derive(Debug, Clone)]
pub struct DuplicateConfig {
    /// Probability, compared against a uniform draw, that a record is duplicated.
    pub duplicate_rate: f64,
    /// Share of duplicates generated as exact copies.
    pub exact_rate: f64,
    /// Share of duplicates generated as near duplicates.
    pub near_rate: f64,
    /// Share of duplicates intended to be fuzzy duplicates.
    pub fuzzy_rate: f64,
    /// Largest number of days, in either direction, a near duplicate's date may be shifted.
    pub max_date_offset_days: i64,
    /// Names of the fields varied when a near duplicate is created.
    pub varying_fields: Vec<String>,
    /// Bound on the relative amount change for near duplicates (0.01 means within ±1%).
    pub amount_variance: f64,
}
56
57impl Default for DuplicateConfig {
58 fn default() -> Self {
59 Self {
60 duplicate_rate: 0.005, exact_rate: 0.3, near_rate: 0.5, fuzzy_rate: 0.2, max_date_offset_days: 5,
65 varying_fields: vec![
66 "entry_date".to_string(),
67 "created_by".to_string(),
68 "description".to_string(),
69 ],
70 amount_variance: 0.01, }
72 }
73}
74
/// Result of duplicating one record: the source, the copy and metadata about the copy.
#[derive(Debug, Clone)]
pub struct DuplicateRecord<T: Clone> {
    /// Clone of the record that was duplicated.
    pub original: T,
    /// The generated duplicate, carrying a new ID and any applied variations.
    pub duplicate: T,
    /// Kind of duplicate that was generated.
    pub duplicate_type: DuplicateType,
    /// Names of the fields recorded as differing from the original; always contains `"id"`.
    pub differing_fields: Vec<String>,
    /// Identifier of this duplication, formatted as `DUP` followed by a
    /// zero-padded 8-digit number.
    pub duplicate_id: String,
}
89
/// Accessors a record type must provide so that duplicates of it can be generated
/// and compared.
///
/// NOTE(review): the exact semantics of each accessor are implementation-defined;
/// the notes below describe only how the code in this file uses them.
pub trait Duplicatable: Clone {
    /// Returns the record's identifier.
    fn get_id(&self) -> String;

    /// Replaces the record's identifier.
    fn set_id(&mut self, id: String);

    /// Returns the value of the named field as a string, or `None`
    /// (presumably when the record does not expose that field — confirm per implementation).
    fn get_field(&self, field: &str) -> Option<String>;

    /// Sets the named field to `value`.
    /// NOTE(review): implementations may ignore field names they do not support.
    fn set_field(&mut self, field: &str, value: &str);

    /// Returns the record's monetary amount, if it has one.
    fn get_amount(&self) -> Option<Decimal>;

    /// Replaces the record's monetary amount.
    fn set_amount(&mut self, amount: Decimal);

    /// Returns the record's date, if it has one.
    fn get_date(&self) -> Option<NaiveDate>;

    /// Replaces the record's date.
    fn set_date(&mut self, date: NaiveDate);
}
116
/// Produces duplicates of [`Duplicatable`] records according to a [`DuplicateConfig`]
/// and keeps running statistics about what it generated.
pub struct DuplicateGenerator {
    /// Rates and variation limits used when generating duplicates.
    config: DuplicateConfig,
    /// Counters describing the duplicates generated so far.
    stats: DuplicateStats,
    /// Sequence number embedded in the next duplicate's record ID; starts at 1
    /// and increases by one per generated duplicate.
    next_duplicate_id: u64,
}
123
/// Counters describing the duplicates a [`DuplicateGenerator`] has produced.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DuplicateStats {
    /// Number of records handed to duplicate creation.
    pub total_processed: usize,
    /// Number of duplicates created.
    pub total_duplicates: usize,
    /// Number of duplicates created with type `Exact`.
    pub exact_duplicates: usize,
    /// Number of duplicates created with type `Near`.
    pub near_duplicates: usize,
    /// Number of duplicates created with type `Fuzzy`.
    pub fuzzy_duplicates: usize,
    /// Number of duplicates created with type `CrossSystem`.
    pub cross_system_duplicates: usize,
}
140
141impl DuplicateGenerator {
142 pub fn new(config: DuplicateConfig) -> Self {
144 Self {
145 config,
146 stats: DuplicateStats::default(),
147 next_duplicate_id: 1,
148 }
149 }
150
151 pub fn should_duplicate<R: Rng>(&self, rng: &mut R) -> bool {
153 rng.gen::<f64>() < self.config.duplicate_rate
154 }
155
156 pub fn create_duplicate<T: Duplicatable, R: Rng>(
158 &mut self,
159 record: &T,
160 rng: &mut R,
161 ) -> DuplicateRecord<T> {
162 self.stats.total_processed += 1;
163 self.stats.total_duplicates += 1;
164
165 let duplicate_type = self.select_duplicate_type(rng);
166 let mut duplicate = record.clone();
167 let mut differing_fields = Vec::new();
168
169 let new_id = format!("{}-DUP{}", record.get_id(), self.next_duplicate_id);
171 self.next_duplicate_id += 1;
172 duplicate.set_id(new_id);
173 differing_fields.push("id".to_string());
174
175 match &duplicate_type {
176 DuplicateType::Exact => {
177 self.stats.exact_duplicates += 1;
178 }
180 DuplicateType::Near { varying_fields } => {
181 self.stats.near_duplicates += 1;
182 self.apply_near_duplicate_variations(&mut duplicate, varying_fields, rng);
183 differing_fields.extend(varying_fields.clone());
184 }
185 DuplicateType::Fuzzy { similarity } => {
186 self.stats.fuzzy_duplicates += 1;
187 let varied = self.apply_fuzzy_variations(&mut duplicate, *similarity, rng);
188 differing_fields.extend(varied);
189 }
190 DuplicateType::CrossSystem {
191 source_system: _,
192 target_system,
193 } => {
194 self.stats.cross_system_duplicates += 1;
195 if let Some(_current_id) = duplicate.get_field("system_id") {
197 duplicate.set_field("system_id", target_system);
198 differing_fields.push("system_id".to_string());
199 }
200 }
201 }
202
203 let duplicate_id = format!("DUP{:08}", self.stats.total_duplicates);
204
205 DuplicateRecord {
206 original: record.clone(),
207 duplicate,
208 duplicate_type,
209 differing_fields,
210 duplicate_id,
211 }
212 }
213
214 fn select_duplicate_type<R: Rng>(&self, rng: &mut R) -> DuplicateType {
216 let r = rng.gen::<f64>();
217
218 if r < self.config.exact_rate {
219 DuplicateType::Exact
220 } else if r < self.config.exact_rate + self.config.near_rate {
221 DuplicateType::Near {
222 varying_fields: self.config.varying_fields.clone(),
223 }
224 } else {
225 DuplicateType::Fuzzy {
226 similarity: rng.gen_range(0.8..0.95),
227 }
228 }
229 }
230
231 fn apply_near_duplicate_variations<T: Duplicatable, R: Rng>(
233 &self,
234 record: &mut T,
235 varying_fields: &[String],
236 rng: &mut R,
237 ) {
238 for field in varying_fields {
239 match field.as_str() {
240 "entry_date" | "date" => {
241 if let Some(date) = record.get_date() {
242 let offset = rng.gen_range(
243 -self.config.max_date_offset_days..=self.config.max_date_offset_days,
244 );
245 record.set_date(date + Duration::days(offset));
246 }
247 }
248 "amount" | "debit_amount" | "credit_amount" => {
249 if let Some(amount) = record.get_amount() {
250 let variance = 1.0
251 + rng.gen_range(
252 -self.config.amount_variance..self.config.amount_variance,
253 );
254 let new_amount =
255 amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
256 record.set_amount(new_amount.round_dp(2));
257 }
258 }
259 "description" => {
260 if let Some(desc) = record.get_field("description") {
261 let variations = [
263 format!("{} ", desc),
264 format!(" {}", desc),
265 desc.to_uppercase(),
266 desc.to_lowercase(),
267 ];
268 let variation = &variations[rng.gen_range(0..variations.len())];
269 record.set_field("description", variation);
270 }
271 }
272 _ => {
273 if let Some(value) = record.get_field(field) {
275 record.set_field(field, &format!("{} ", value));
276 }
277 }
278 }
279 }
280 }
281
282 fn apply_fuzzy_variations<T: Duplicatable, R: Rng>(
284 &self,
285 record: &mut T,
286 similarity: f64,
287 rng: &mut R,
288 ) -> Vec<String> {
289 let mut varied_fields = Vec::new();
290 let change_probability = 1.0 - similarity;
291
292 if rng.gen::<f64>() < change_probability {
294 if let Some(amount) = record.get_amount() {
295 let variance = 1.0 + rng.gen_range(-0.1..0.1); let new_amount =
297 amount * Decimal::from_f64_retain(variance).unwrap_or(Decimal::ONE);
298 record.set_amount(new_amount.round_dp(2));
299 varied_fields.push("amount".to_string());
300 }
301 }
302
303 if rng.gen::<f64>() < change_probability {
305 if let Some(date) = record.get_date() {
306 let offset = rng.gen_range(-30..=30);
307 record.set_date(date + Duration::days(offset));
308 varied_fields.push("date".to_string());
309 }
310 }
311
312 if rng.gen::<f64>() < change_probability {
314 if let Some(desc) = record.get_field("description") {
315 let abbreviated = abbreviate_text(&desc);
317 record.set_field("description", &abbreviated);
318 varied_fields.push("description".to_string());
319 }
320 }
321
322 varied_fields
323 }
324
325 pub fn stats(&self) -> &DuplicateStats {
327 &self.stats
328 }
329
330 pub fn reset_stats(&mut self) {
332 self.stats = DuplicateStats::default();
333 }
334}
335
/// Replaces common business terms in `text` with their usual abbreviations.
///
/// Matching is case-sensitive and substring-based; the replacements are applied
/// one after another in table order. Text containing none of the terms is
/// returned unchanged.
fn abbreviate_text(text: &str) -> String {
    /// Long form paired with the abbreviation that replaces it.
    const ABBREVIATIONS: [(&str, &str); 10] = [
        ("Account", "Acct"),
        ("Payment", "Pmt"),
        ("Invoice", "Inv"),
        ("Number", "No"),
        ("Department", "Dept"),
        ("Company", "Co"),
        ("Corporation", "Corp"),
        ("International", "Intl"),
        ("Management", "Mgmt"),
        ("Reference", "Ref"),
    ];

    ABBREVIATIONS
        .iter()
        .fold(text.to_owned(), |shortened, &(long_form, short_form)| {
            shortened.replace(long_form, short_form)
        })
}
357
/// Detects likely duplicates among [`Duplicatable`] records by averaging
/// per-field similarity scores.
pub struct DuplicateDetector {
    /// Minimum average similarity (inclusive) for two records to count as duplicates.
    similarity_threshold: f64,
    /// Names of the fields read through `get_field` and compared as strings.
    comparison_fields: Vec<String>,
}
365
366impl DuplicateDetector {
367 pub fn new(similarity_threshold: f64, comparison_fields: Vec<String>) -> Self {
369 Self {
370 similarity_threshold,
371 comparison_fields,
372 }
373 }
374
375 pub fn string_similarity(&self, a: &str, b: &str) -> f64 {
377 if a == b {
378 return 1.0;
379 }
380
381 let a_chars: std::collections::HashSet<char> = a.chars().collect();
382 let b_chars: std::collections::HashSet<char> = b.chars().collect();
383
384 let intersection = a_chars.intersection(&b_chars).count();
385 let union = a_chars.union(&b_chars).count();
386
387 if union == 0 {
388 0.0
389 } else {
390 intersection as f64 / union as f64
391 }
392 }
393
394 pub fn are_duplicates<T: Duplicatable>(&self, a: &T, b: &T) -> bool {
396 let mut total_similarity = 0.0;
397 let mut field_count = 0;
398
399 for field in &self.comparison_fields {
400 if let (Some(val_a), Some(val_b)) = (a.get_field(field), b.get_field(field)) {
401 total_similarity += self.string_similarity(&val_a, &val_b);
402 field_count += 1;
403 }
404 }
405
406 if let (Some(amt_a), Some(amt_b)) = (a.get_amount(), b.get_amount()) {
408 let amt_a_f64: f64 = amt_a.try_into().unwrap_or(0.0);
409 let amt_b_f64: f64 = amt_b.try_into().unwrap_or(0.0);
410
411 if amt_a_f64.abs() > 0.0 {
412 let ratio = (amt_a_f64 - amt_b_f64).abs() / amt_a_f64.abs();
413 total_similarity += 1.0 - ratio.min(1.0);
414 field_count += 1;
415 }
416 }
417
418 if field_count == 0 {
419 return false;
420 }
421
422 let avg_similarity = total_similarity / field_count as f64;
423 avg_similarity >= self.similarity_threshold
424 }
425
426 pub fn find_duplicates<T: Duplicatable>(&self, records: &[T]) -> Vec<(usize, usize, f64)> {
428 let mut duplicates = Vec::new();
429
430 for i in 0..records.len() {
431 for j in (i + 1)..records.len() {
432 if self.are_duplicates(&records[i], &records[j]) {
433 let mut similarity = 0.0;
434 let mut count = 0;
435
436 for field in &self.comparison_fields {
437 if let (Some(a), Some(b)) =
438 (records[i].get_field(field), records[j].get_field(field))
439 {
440 similarity += self.string_similarity(&a, &b);
441 count += 1;
442 }
443 }
444
445 if count > 0 {
446 duplicates.push((i, j, similarity / count as f64));
447 }
448 }
449 }
450 }
451
452 duplicates
453 }
454}
455
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Minimal in-memory record used to exercise the `Duplicatable` contract.
    #[derive(Clone)]
    struct TestRecord {
        id: String,
        description: String,
        amount: Decimal,
        date: NaiveDate,
    }

    impl Duplicatable for TestRecord {
        fn get_id(&self) -> String {
            self.id.to_owned()
        }

        fn set_id(&mut self, id: String) {
            self.id = id;
        }

        // Only `description` and `id` are readable by name.
        fn get_field(&self, field: &str) -> Option<String> {
            let value = match field {
                "description" => &self.description,
                "id" => &self.id,
                _ => return None,
            };
            Some(value.clone())
        }

        // Only `description` is writable by name; other names are ignored.
        fn set_field(&mut self, field: &str, value: &str) {
            match field {
                "description" => self.description = value.to_owned(),
                _ => {}
            }
        }

        fn get_amount(&self) -> Option<Decimal> {
            Some(self.amount)
        }

        fn set_amount(&mut self, amount: Decimal) {
            self.amount = amount;
        }

        fn get_date(&self) -> Option<NaiveDate> {
            Some(self.date)
        }

        fn set_date(&mut self, date: NaiveDate) {
            self.date = date;
        }
    }

    /// A generated duplicate gets a fresh ID and is counted in the statistics.
    #[test]
    fn test_duplicate_generation() {
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;
        use rust_decimal_macros::dec;

        let mut rng = ChaCha8Rng::seed_from_u64(42);
        let mut generator = DuplicateGenerator::new(DuplicateConfig::default());

        let source = TestRecord {
            id: "JE001".to_string(),
            description: "Test Entry".to_string(),
            amount: dec!(1000),
            date: NaiveDate::from_ymd_opt(2024, 1, 15).unwrap(),
        };

        let outcome = generator.create_duplicate(&source, &mut rng);

        assert_ne!(outcome.duplicate.get_id(), source.get_id());
        assert_eq!(generator.stats().total_duplicates, 1);
    }

    /// Identical strings score 1.0, close strings score high, disjoint strings score low.
    #[test]
    fn test_string_similarity() {
        let fields = vec!["description".to_string()];
        let detector = DuplicateDetector::new(0.8, fields);

        let identical = detector.string_similarity("hello", "hello");
        let close = detector.string_similarity("hello", "helo");
        let disjoint = detector.string_similarity("abc", "xyz");

        assert_eq!(identical, 1.0);
        assert!(close > 0.8);
        assert!(disjoint < 0.5);
    }

    /// Each known term is replaced by its abbreviation.
    #[test]
    fn test_abbreviate_text() {
        let shortened = abbreviate_text("Account Payment Invoice");
        assert_eq!(shortened, "Acct Pmt Inv");
    }
}