1use std::collections::HashSet;
16
17use rust_decimal::Decimal;
18use rustledger_plugin_types::{DirectiveData, DirectiveWrapper, PluginError, TransactionData};
19
20use crate::fingerprint::structural_hash;
21
22#[derive(Debug)]
24pub struct StructuralDuplicate {
25 pub index: usize,
27 pub date: String,
29 pub narration: String,
31}
32
33impl StructuralDuplicate {
34 #[must_use]
36 pub fn to_plugin_error(&self) -> PluginError {
37 PluginError::error(format!(
38 "Duplicate transaction: {} \"{}\"",
39 self.date, self.narration
40 ))
41 }
42}
43
44#[must_use]
50pub fn find_structural_duplicates(directives: &[DirectiveWrapper]) -> Vec<StructuralDuplicate> {
51 let mut seen: HashSet<u64> = HashSet::new();
52 let mut duplicates = Vec::new();
53
54 for (i, wrapper) in directives.iter().enumerate() {
55 if let DirectiveData::Transaction(txn) = &wrapper.data {
56 let hash = structural_hash(&wrapper.date, txn);
57 if !seen.insert(hash) {
58 duplicates.push(StructuralDuplicate {
59 index: i,
60 date: wrapper.date.clone(),
61 narration: txn.narration.clone(),
62 });
63 }
64 }
65 }
66
67 duplicates
68}
69
70#[derive(Debug, Clone)]
76pub struct FuzzyDedupConfig {
77 pub text_similarity_threshold: f64,
80}
81
82impl Default for FuzzyDedupConfig {
83 fn default() -> Self {
84 Self {
85 text_similarity_threshold: 0.5,
86 }
87 }
88}
89
90#[derive(Debug)]
92pub struct FuzzyDuplicateMatch {
93 pub new_index: usize,
95 pub existing_index: usize,
97}
98
99#[must_use]
105pub fn find_fuzzy_duplicates(
106 new_directives: &[DirectiveWrapper],
107 existing_directives: &[DirectiveWrapper],
108 config: &FuzzyDedupConfig,
109) -> Vec<FuzzyDuplicateMatch> {
110 let mut matches = Vec::new();
111
112 let existing: Vec<(usize, &str, Option<Decimal>, String)> = existing_directives
114 .iter()
115 .enumerate()
116 .filter_map(|(i, w)| {
117 if let DirectiveData::Transaction(txn) = &w.data {
118 Some((i, w.date.as_str(), first_posting_amount(txn), txn_text(txn)))
119 } else {
120 None
121 }
122 })
123 .collect();
124
125 for (new_i, wrapper) in new_directives.iter().enumerate() {
126 if let DirectiveData::Transaction(txn) = &wrapper.data {
127 let new_amount = first_posting_amount(txn);
128 let new_text = txn_text(txn);
129
130 for &(existing_i, existing_date, ref existing_amount, ref existing_text) in &existing {
131 if wrapper.date != existing_date {
132 continue;
133 }
134 if new_amount != *existing_amount {
135 continue;
136 }
137 if fuzzy_text_match(&new_text, existing_text, config.text_similarity_threshold) {
138 matches.push(FuzzyDuplicateMatch {
139 new_index: new_i,
140 existing_index: existing_i,
141 });
142 break; }
144 }
145 }
146 }
147
148 matches
149}
150
151fn first_posting_amount(txn: &TransactionData) -> Option<Decimal> {
153 txn.postings.first().and_then(|p| {
154 p.units
155 .as_ref()
156 .and_then(|u| u.number.parse::<Decimal>().ok())
157 })
158}
159
160fn txn_text(txn: &TransactionData) -> String {
162 let mut text = String::new();
163 if let Some(ref payee) = txn.payee {
164 text.push_str(payee);
165 text.push(' ');
166 }
167 text.push_str(&txn.narration);
168 text.to_lowercase()
169}
170
171fn fuzzy_text_match(a: &str, b: &str, threshold: f64) -> bool {
174 if a.is_empty() || b.is_empty() {
175 return false;
176 }
177 if a == b {
178 return true;
179 }
180 if a.contains(b) || b.contains(a) {
181 return true;
182 }
183 let a_words: Vec<&str> = a.split_whitespace().collect();
184 let b_words: Vec<&str> = b.split_whitespace().collect();
185 let (shorter, longer) = if a_words.len() <= b_words.len() {
186 (&a_words, &b_words)
187 } else {
188 (&b_words, &a_words)
189 };
190 if shorter.is_empty() {
191 return false;
192 }
193 let match_count = shorter.iter().filter(|w| longer.contains(w)).count();
194 #[allow(clippy::cast_precision_loss)]
195 let ratio = match_count as f64 / shorter.len() as f64;
196 ratio >= threshold
197}
198
199#[cfg(test)]
200mod tests {
201 use super::*;
202 use rustledger_plugin_types::{AmountData, DirectiveData, PostingData, TransactionData};
203
204 fn make_directive(
205 date: &str,
206 payee: Option<&str>,
207 narration: &str,
208 amount: &str,
209 ) -> DirectiveWrapper {
210 DirectiveWrapper {
211 directive_type: "transaction".to_string(),
212 date: date.to_string(),
213 filename: None,
214 lineno: None,
215 data: DirectiveData::Transaction(TransactionData {
216 flag: "*".to_string(),
217 payee: payee.map(String::from),
218 narration: narration.to_string(),
219 tags: vec![],
220 links: vec![],
221 metadata: vec![],
222 postings: vec![PostingData {
223 account: "Assets:Bank".to_string(),
224 units: Some(AmountData {
225 number: amount.to_string(),
226 currency: "USD".to_string(),
227 }),
228 cost: None,
229 price: None,
230 flag: None,
231 metadata: vec![],
232 span: None,
233 }],
234 }),
235 }
236 }
237
238 #[test]
241 fn structural_finds_exact_duplicates() {
242 let directives = vec![
243 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
244 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
245 ];
246 let dups = find_structural_duplicates(&directives);
247 assert_eq!(dups.len(), 1);
248 assert_eq!(dups[0].index, 1);
249 }
250
251 #[test]
252 fn structural_no_false_positives() {
253 let directives = vec![
254 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
255 make_directive("2024-01-15", Some("Store"), "Groceries", "-51.00"),
256 ];
257 let dups = find_structural_duplicates(&directives);
258 assert!(dups.is_empty());
259 }
260
261 #[test]
262 fn structural_duplicate_to_plugin_error() {
263 let dup = StructuralDuplicate {
264 index: 1,
265 date: "2024-01-15".to_string(),
266 narration: "Test".to_string(),
267 };
268 let err = dup.to_plugin_error();
269 assert!(err.message.contains("Duplicate transaction"));
270 assert!(err.message.contains("2024-01-15"));
271 }
272
273 #[test]
276 fn fuzzy_finds_matching_transactions() {
277 let new = vec![make_directive(
278 "2024-01-15",
279 Some("WHOLE FOODS"),
280 "Groceries",
281 "-50.00",
282 )];
283 let existing = vec![make_directive(
284 "2024-01-15",
285 Some("Whole Foods Market"),
286 "Groceries",
287 "-50.00",
288 )];
289 let config = FuzzyDedupConfig::default();
290 let matches = find_fuzzy_duplicates(&new, &existing, &config);
291 assert_eq!(matches.len(), 1);
292 }
293
294 #[test]
295 fn fuzzy_no_match_different_date() {
296 let new = vec![make_directive(
297 "2024-01-15",
298 Some("Store"),
299 "Groceries",
300 "-50.00",
301 )];
302 let existing = vec![make_directive(
303 "2024-01-16",
304 Some("Store"),
305 "Groceries",
306 "-50.00",
307 )];
308 let config = FuzzyDedupConfig::default();
309 let matches = find_fuzzy_duplicates(&new, &existing, &config);
310 assert!(matches.is_empty());
311 }
312
313 #[test]
314 fn fuzzy_no_match_different_amount() {
315 let new = vec![make_directive(
316 "2024-01-15",
317 Some("Store"),
318 "Groceries",
319 "-50.00",
320 )];
321 let existing = vec![make_directive(
322 "2024-01-15",
323 Some("Store"),
324 "Groceries",
325 "-51.00",
326 )];
327 let config = FuzzyDedupConfig::default();
328 let matches = find_fuzzy_duplicates(&new, &existing, &config);
329 assert!(matches.is_empty());
330 }
331
332 #[test]
333 fn fuzzy_text_match_exact() {
334 assert!(fuzzy_text_match("hello world", "hello world", 0.5));
335 }
336
337 #[test]
338 fn fuzzy_text_match_contains() {
339 assert!(fuzzy_text_match("hello", "hello world", 0.5));
340 assert!(fuzzy_text_match("hello world", "hello", 0.5));
341 }
342
343 #[test]
344 fn fuzzy_text_match_word_overlap() {
345 assert!(fuzzy_text_match("whole foods", "whole foods market", 0.5));
347 }
348
349 #[test]
350 fn fuzzy_text_match_insufficient_overlap() {
351 assert!(!fuzzy_text_match("alpha", "beta gamma", 0.5));
353 }
354
355 #[test]
356 fn fuzzy_text_match_empty() {
357 assert!(!fuzzy_text_match("", "hello", 0.5));
358 assert!(!fuzzy_text_match("hello", "", 0.5));
359 }
360
361 #[test]
364 fn fuzzy_multiple_matches_only_first_returned() {
365 let new = vec![make_directive(
367 "2024-01-15",
368 Some("Store"),
369 "Groceries",
370 "-50.00",
371 )];
372 let existing = vec![
373 make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
374 make_directive("2024-01-15", Some("Store"), "Groceries run", "-50.00"),
375 ];
376 let config = FuzzyDedupConfig::default();
377 let matches = find_fuzzy_duplicates(&new, &existing, &config);
378 assert_eq!(matches.len(), 1);
380 assert_eq!(matches[0].new_index, 0);
381 assert_eq!(matches[0].existing_index, 0);
382 }
383
384 #[test]
385 fn fuzzy_non_transaction_directives_are_skipped() {
386 let note_directive = DirectiveWrapper {
387 directive_type: "note".to_string(),
388 date: "2024-01-15".to_string(),
389 filename: None,
390 lineno: None,
391 data: DirectiveData::Note(rustledger_plugin_types::NoteData {
392 account: "Assets:Bank".to_string(),
393 comment: "A note".to_string(),
394 metadata: vec![],
395 }),
396 };
397 let txn = make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00");
398
399 let matches = find_fuzzy_duplicates(
401 std::slice::from_ref(¬e_directive),
402 std::slice::from_ref(&txn),
403 &FuzzyDedupConfig::default(),
404 );
405 assert!(matches.is_empty());
406
407 let matches =
409 find_fuzzy_duplicates(&[txn], &[note_directive], &FuzzyDedupConfig::default());
410 assert!(matches.is_empty());
411 }
412
413 #[test]
414 fn fuzzy_empty_directives_list() {
415 let config = FuzzyDedupConfig::default();
416
417 let matches = find_fuzzy_duplicates(&[], &[], &config);
419 assert!(matches.is_empty());
420
421 let existing = vec![make_directive(
423 "2024-01-15",
424 Some("Store"),
425 "Test",
426 "-50.00",
427 )];
428 let matches = find_fuzzy_duplicates(&[], &existing, &config);
429 assert!(matches.is_empty());
430
431 let new = vec![make_directive(
433 "2024-01-15",
434 Some("Store"),
435 "Test",
436 "-50.00",
437 )];
438 let matches = find_fuzzy_duplicates(&new, &[], &config);
439 assert!(matches.is_empty());
440 }
441
442 #[test]
443 fn fuzzy_matching_with_narration_only() {
444 let new = vec![make_directive(
446 "2024-01-15",
447 None,
448 "whole foods market",
449 "-50.00",
450 )];
451 let existing = vec![make_directive(
452 "2024-01-15",
453 None,
454 "Whole Foods Market #123",
455 "-50.00",
456 )];
457 let config = FuzzyDedupConfig::default();
458 let matches = find_fuzzy_duplicates(&new, &existing, &config);
459 assert_eq!(matches.len(), 1);
460 }
461
462 #[test]
463 fn fuzzy_matching_payee_vs_narration() {
464 let new = vec![make_directive(
466 "2024-01-15",
467 Some("Whole Foods"),
468 "Payment",
469 "-50.00",
470 )];
471 let existing = vec![make_directive(
472 "2024-01-15",
473 None,
474 "whole foods market",
475 "-50.00",
476 )];
477 let config = FuzzyDedupConfig::default();
478 let matches = find_fuzzy_duplicates(&new, &existing, &config);
479 assert_eq!(matches.len(), 1);
481 }
482
483 #[test]
484 fn structural_empty_directives_list() {
485 let dups = find_structural_duplicates(&[]);
486 assert!(dups.is_empty());
487 }
488
489 #[test]
490 fn structural_non_transaction_not_duplicated() {
491 let note1 = DirectiveWrapper {
492 directive_type: "note".to_string(),
493 date: "2024-01-15".to_string(),
494 filename: None,
495 lineno: None,
496 data: DirectiveData::Note(rustledger_plugin_types::NoteData {
497 account: "Assets:Bank".to_string(),
498 comment: "Same note".to_string(),
499 metadata: vec![],
500 }),
501 };
502 let note2 = note1.clone();
503 let dups = find_structural_duplicates(&[note1, note2]);
504 assert!(dups.is_empty());
505 }
506
507 #[test]
508 fn fuzzy_dedup_config_default() {
509 let config = FuzzyDedupConfig::default();
510 assert!((config.text_similarity_threshold - 0.5).abs() < f64::EPSILON);
511 }
512
513 #[test]
514 fn fuzzy_high_threshold_rejects_partial_matches() {
515 let new = vec![make_directive("2024-01-15", None, "whole foods", "-50.00")];
516 let existing = vec![make_directive(
517 "2024-01-15",
518 None,
519 "whole foods market special",
520 "-50.00",
521 )];
522 let config = FuzzyDedupConfig {
525 text_similarity_threshold: 0.9,
526 };
527 let matches = find_fuzzy_duplicates(&new, &existing, &config);
528 assert_eq!(matches.len(), 1);
529
530 let new = vec![make_directive("2024-01-15", None, "alpha beta", "-50.00")];
532 let existing = vec![make_directive(
533 "2024-01-15",
534 None,
535 "alpha gamma delta",
536 "-50.00",
537 )];
538 let matches = find_fuzzy_duplicates(&new, &existing, &config);
539 assert!(matches.is_empty());
541 }
542}