1use std::collections::HashSet;
16
17use rust_decimal::Decimal;
18use rustledger_plugin_types::{DirectiveData, DirectiveWrapper, PluginError, TransactionData};
19
20use crate::fingerprint::structural_hash;
21
22#[derive(Debug)]
24pub struct StructuralDuplicate {
25 pub index: usize,
27 pub date: String,
29 pub narration: String,
31}
32
33impl StructuralDuplicate {
34 #[must_use]
36 pub fn to_plugin_error(&self) -> PluginError {
37 PluginError::error(format!(
38 "Duplicate transaction: {} \"{}\"",
39 self.date, self.narration
40 ))
41 }
42}
43
44#[must_use]
50pub fn find_structural_duplicates(directives: &[DirectiveWrapper]) -> Vec<StructuralDuplicate> {
51 let mut seen: HashSet<u64> = HashSet::new();
52 let mut duplicates = Vec::new();
53
54 for (i, wrapper) in directives.iter().enumerate() {
55 if let DirectiveData::Transaction(txn) = &wrapper.data {
56 let hash = structural_hash(&wrapper.date, txn);
57 if !seen.insert(hash) {
58 duplicates.push(StructuralDuplicate {
59 index: i,
60 date: wrapper.date.clone(),
61 narration: txn.narration.clone(),
62 });
63 }
64 }
65 }
66
67 duplicates
68}
69
70#[derive(Debug, Clone)]
76pub struct FuzzyDedupConfig {
77 pub text_similarity_threshold: f64,
80}
81
82impl Default for FuzzyDedupConfig {
83 fn default() -> Self {
84 Self {
85 text_similarity_threshold: 0.5,
86 }
87 }
88}
89
90#[derive(Debug)]
92pub struct FuzzyDuplicateMatch {
93 pub new_index: usize,
95 pub existing_index: usize,
97}
98
99#[must_use]
105pub fn find_fuzzy_duplicates(
106 new_directives: &[DirectiveWrapper],
107 existing_directives: &[DirectiveWrapper],
108 config: &FuzzyDedupConfig,
109) -> Vec<FuzzyDuplicateMatch> {
110 let mut matches = Vec::new();
111
112 let existing: Vec<(usize, &str, Option<Decimal>, String)> = existing_directives
114 .iter()
115 .enumerate()
116 .filter_map(|(i, w)| {
117 if let DirectiveData::Transaction(txn) = &w.data {
118 Some((i, w.date.as_str(), first_posting_amount(txn), txn_text(txn)))
119 } else {
120 None
121 }
122 })
123 .collect();
124
125 for (new_i, wrapper) in new_directives.iter().enumerate() {
126 if let DirectiveData::Transaction(txn) = &wrapper.data {
127 let new_amount = first_posting_amount(txn);
128 let new_text = txn_text(txn);
129
130 for &(existing_i, existing_date, ref existing_amount, ref existing_text) in &existing {
131 if wrapper.date != existing_date {
132 continue;
133 }
134 if new_amount != *existing_amount {
135 continue;
136 }
137 if fuzzy_text_match(&new_text, existing_text, config.text_similarity_threshold) {
138 matches.push(FuzzyDuplicateMatch {
139 new_index: new_i,
140 existing_index: existing_i,
141 });
142 break; }
144 }
145 }
146 }
147
148 matches
149}
150
151fn first_posting_amount(txn: &TransactionData) -> Option<Decimal> {
153 txn.postings.first().and_then(|p| {
154 p.units
155 .as_ref()
156 .and_then(|u| u.number.parse::<Decimal>().ok())
157 })
158}
159
160fn txn_text(txn: &TransactionData) -> String {
162 let mut text = String::new();
163 if let Some(ref payee) = txn.payee {
164 text.push_str(payee);
165 text.push(' ');
166 }
167 text.push_str(&txn.narration);
168 text.to_lowercase()
169}
170
171fn fuzzy_text_match(a: &str, b: &str, threshold: f64) -> bool {
174 if a.is_empty() || b.is_empty() {
175 return false;
176 }
177 if a == b {
178 return true;
179 }
180 if a.contains(b) || b.contains(a) {
181 return true;
182 }
183 let a_words: Vec<&str> = a.split_whitespace().collect();
184 let b_words: Vec<&str> = b.split_whitespace().collect();
185 let (shorter, longer) = if a_words.len() <= b_words.len() {
186 (&a_words, &b_words)
187 } else {
188 (&b_words, &a_words)
189 };
190 if shorter.is_empty() {
191 return false;
192 }
193 let match_count = shorter.iter().filter(|w| longer.contains(w)).count();
194 #[allow(clippy::cast_precision_loss)]
195 let ratio = match_count as f64 / shorter.len() as f64;
196 ratio >= threshold
197}
198
199#[cfg(test)]
200mod tests {
201 use super::*;
202 use rustledger_plugin_types::{AmountData, DirectiveData, PostingData, TransactionData};
203
204 fn make_directive(
205 date: &str,
206 payee: Option<&str>,
207 narration: &str,
208 amount: &str,
209 ) -> DirectiveWrapper {
210 DirectiveWrapper {
211 directive_type: "transaction".to_string(),
212 date: date.to_string(),
213 filename: None,
214 lineno: None,
215 data: DirectiveData::Transaction(TransactionData {
216 flag: "*".to_string(),
217 payee: payee.map(String::from),
218 narration: narration.to_string(),
219 tags: vec![],
220 links: vec![],
221 metadata: vec![],
222 postings: vec![PostingData {
223 account: "Assets:Bank".to_string(),
224 units: Some(AmountData {
225 number: amount.to_string(),
226 currency: "USD".to_string(),
227 }),
228 cost: None,
229 price: None,
230 flag: None,
231 metadata: vec![],
232 }],
233 }),
234 }
235 }
236
237 #[test]
240 fn structural_finds_exact_duplicates() {
241 let directives = vec![
242 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
243 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
244 ];
245 let dups = find_structural_duplicates(&directives);
246 assert_eq!(dups.len(), 1);
247 assert_eq!(dups[0].index, 1);
248 }
249
250 #[test]
251 fn structural_no_false_positives() {
252 let directives = vec![
253 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
254 make_directive("2024-01-15", Some("Store"), "Groceries", "-51.00"),
255 ];
256 let dups = find_structural_duplicates(&directives);
257 assert!(dups.is_empty());
258 }
259
260 #[test]
261 fn structural_duplicate_to_plugin_error() {
262 let dup = StructuralDuplicate {
263 index: 1,
264 date: "2024-01-15".to_string(),
265 narration: "Test".to_string(),
266 };
267 let err = dup.to_plugin_error();
268 assert!(err.message.contains("Duplicate transaction"));
269 assert!(err.message.contains("2024-01-15"));
270 }
271
272 #[test]
275 fn fuzzy_finds_matching_transactions() {
276 let new = vec![make_directive(
277 "2024-01-15",
278 Some("WHOLE FOODS"),
279 "Groceries",
280 "-50.00",
281 )];
282 let existing = vec![make_directive(
283 "2024-01-15",
284 Some("Whole Foods Market"),
285 "Groceries",
286 "-50.00",
287 )];
288 let config = FuzzyDedupConfig::default();
289 let matches = find_fuzzy_duplicates(&new, &existing, &config);
290 assert_eq!(matches.len(), 1);
291 }
292
293 #[test]
294 fn fuzzy_no_match_different_date() {
295 let new = vec![make_directive(
296 "2024-01-15",
297 Some("Store"),
298 "Groceries",
299 "-50.00",
300 )];
301 let existing = vec![make_directive(
302 "2024-01-16",
303 Some("Store"),
304 "Groceries",
305 "-50.00",
306 )];
307 let config = FuzzyDedupConfig::default();
308 let matches = find_fuzzy_duplicates(&new, &existing, &config);
309 assert!(matches.is_empty());
310 }
311
312 #[test]
313 fn fuzzy_no_match_different_amount() {
314 let new = vec![make_directive(
315 "2024-01-15",
316 Some("Store"),
317 "Groceries",
318 "-50.00",
319 )];
320 let existing = vec![make_directive(
321 "2024-01-15",
322 Some("Store"),
323 "Groceries",
324 "-51.00",
325 )];
326 let config = FuzzyDedupConfig::default();
327 let matches = find_fuzzy_duplicates(&new, &existing, &config);
328 assert!(matches.is_empty());
329 }
330
331 #[test]
332 fn fuzzy_text_match_exact() {
333 assert!(fuzzy_text_match("hello world", "hello world", 0.5));
334 }
335
336 #[test]
337 fn fuzzy_text_match_contains() {
338 assert!(fuzzy_text_match("hello", "hello world", 0.5));
339 assert!(fuzzy_text_match("hello world", "hello", 0.5));
340 }
341
342 #[test]
343 fn fuzzy_text_match_word_overlap() {
344 assert!(fuzzy_text_match("whole foods", "whole foods market", 0.5));
346 }
347
348 #[test]
349 fn fuzzy_text_match_insufficient_overlap() {
350 assert!(!fuzzy_text_match("alpha", "beta gamma", 0.5));
352 }
353
354 #[test]
355 fn fuzzy_text_match_empty() {
356 assert!(!fuzzy_text_match("", "hello", 0.5));
357 assert!(!fuzzy_text_match("hello", "", 0.5));
358 }
359
360 #[test]
363 fn fuzzy_multiple_matches_only_first_returned() {
364 let new = vec![make_directive(
366 "2024-01-15",
367 Some("Store"),
368 "Groceries",
369 "-50.00",
370 )];
371 let existing = vec![
372 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
373 make_directive("2024-01-15", Some("Store"), "Groceries run", "-50.00"),
374 ];
375 let config = FuzzyDedupConfig::default();
376 let matches = find_fuzzy_duplicates(&new, &existing, &config);
377 assert_eq!(matches.len(), 1);
379 assert_eq!(matches[0].new_index, 0);
380 assert_eq!(matches[0].existing_index, 0);
381 }
382
383 #[test]
384 fn fuzzy_non_transaction_directives_are_skipped() {
385 let note_directive = DirectiveWrapper {
386 directive_type: "note".to_string(),
387 date: "2024-01-15".to_string(),
388 filename: None,
389 lineno: None,
390 data: DirectiveData::Note(rustledger_plugin_types::NoteData {
391 account: "Assets:Bank".to_string(),
392 comment: "A note".to_string(),
393 metadata: vec![],
394 }),
395 };
396 let txn = make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00");
397
398 let matches = find_fuzzy_duplicates(
400 std::slice::from_ref(¬e_directive),
401 std::slice::from_ref(&txn),
402 &FuzzyDedupConfig::default(),
403 );
404 assert!(matches.is_empty());
405
406 let matches =
408 find_fuzzy_duplicates(&[txn], &[note_directive], &FuzzyDedupConfig::default());
409 assert!(matches.is_empty());
410 }
411
412 #[test]
413 fn fuzzy_empty_directives_list() {
414 let config = FuzzyDedupConfig::default();
415
416 let matches = find_fuzzy_duplicates(&[], &[], &config);
418 assert!(matches.is_empty());
419
420 let existing = vec![make_directive(
422 "2024-01-15",
423 Some("Store"),
424 "Test",
425 "-50.00",
426 )];
427 let matches = find_fuzzy_duplicates(&[], &existing, &config);
428 assert!(matches.is_empty());
429
430 let new = vec![make_directive(
432 "2024-01-15",
433 Some("Store"),
434 "Test",
435 "-50.00",
436 )];
437 let matches = find_fuzzy_duplicates(&new, &[], &config);
438 assert!(matches.is_empty());
439 }
440
441 #[test]
442 fn fuzzy_matching_with_narration_only() {
443 let new = vec![make_directive(
445 "2024-01-15",
446 None,
447 "whole foods market",
448 "-50.00",
449 )];
450 let existing = vec![make_directive(
451 "2024-01-15",
452 None,
453 "Whole Foods Market #123",
454 "-50.00",
455 )];
456 let config = FuzzyDedupConfig::default();
457 let matches = find_fuzzy_duplicates(&new, &existing, &config);
458 assert_eq!(matches.len(), 1);
459 }
460
461 #[test]
462 fn fuzzy_matching_payee_vs_narration() {
463 let new = vec![make_directive(
465 "2024-01-15",
466 Some("Whole Foods"),
467 "Payment",
468 "-50.00",
469 )];
470 let existing = vec![make_directive(
471 "2024-01-15",
472 None,
473 "whole foods market",
474 "-50.00",
475 )];
476 let config = FuzzyDedupConfig::default();
477 let matches = find_fuzzy_duplicates(&new, &existing, &config);
478 assert_eq!(matches.len(), 1);
480 }
481
482 #[test]
483 fn structural_empty_directives_list() {
484 let dups = find_structural_duplicates(&[]);
485 assert!(dups.is_empty());
486 }
487
488 #[test]
489 fn structural_non_transaction_not_duplicated() {
490 let note1 = DirectiveWrapper {
491 directive_type: "note".to_string(),
492 date: "2024-01-15".to_string(),
493 filename: None,
494 lineno: None,
495 data: DirectiveData::Note(rustledger_plugin_types::NoteData {
496 account: "Assets:Bank".to_string(),
497 comment: "Same note".to_string(),
498 metadata: vec![],
499 }),
500 };
501 let note2 = note1.clone();
502 let dups = find_structural_duplicates(&[note1, note2]);
503 assert!(dups.is_empty());
504 }
505
506 #[test]
507 fn fuzzy_dedup_config_default() {
508 let config = FuzzyDedupConfig::default();
509 assert!((config.text_similarity_threshold - 0.5).abs() < f64::EPSILON);
510 }
511
512 #[test]
513 fn fuzzy_high_threshold_rejects_partial_matches() {
514 let new = vec![make_directive("2024-01-15", None, "whole foods", "-50.00")];
515 let existing = vec![make_directive(
516 "2024-01-15",
517 None,
518 "whole foods market special",
519 "-50.00",
520 )];
521 let config = FuzzyDedupConfig {
524 text_similarity_threshold: 0.9,
525 };
526 let matches = find_fuzzy_duplicates(&new, &existing, &config);
527 assert_eq!(matches.len(), 1);
528
529 let new = vec![make_directive("2024-01-15", None, "alpha beta", "-50.00")];
531 let existing = vec![make_directive(
532 "2024-01-15",
533 None,
534 "alpha gamma delta",
535 "-50.00",
536 )];
537 let matches = find_fuzzy_duplicates(&new, &existing, &config);
538 assert!(matches.is_empty());
540 }
541}