Skip to main content

rustledger_ops/
dedup.rs

1//! Duplicate transaction detection.
2//!
3//! Provides three deduplication strategies:
4//!
5//! - **Structural** — exact hash match using [`crate::fingerprint::structural_hash`].
6//!   Finds transactions that are byte-for-byte identical (excluding metadata).
7//!
8//! - **Fuzzy** — approximate match using date + amount + text similarity.
9//!   Finds transactions that are likely the same despite minor differences
10//!   (e.g., different payee formatting between bank and ledger).
11//!
12//! - **Fingerprint** (future, Phase 1) — stable BLAKE3 fingerprint match for
13//!   import deduplication across runs.
14
15use std::collections::HashSet;
16
17use rust_decimal::Decimal;
18use rustledger_plugin_types::{DirectiveData, DirectiveWrapper, PluginError, TransactionData};
19
20use crate::fingerprint::structural_hash;
21
22/// Result of finding a structural duplicate.
23#[derive(Debug)]
24pub struct StructuralDuplicate {
25    /// Index of the duplicate directive in the input slice.
26    pub index: usize,
27    /// Date of the duplicate transaction.
28    pub date: String,
29    /// Narration of the duplicate transaction.
30    pub narration: String,
31}
32
33impl StructuralDuplicate {
34    /// Convert to a [`PluginError`] for use in plugin wrappers.
35    #[must_use]
36    pub fn to_plugin_error(&self) -> PluginError {
37        PluginError::error(format!(
38            "Duplicate transaction: {} \"{}\"",
39            self.date, self.narration
40        ))
41    }
42}
43
44/// Find structurally duplicate transactions in a directive list.
45///
46/// Returns the indices and details of transactions whose structural hash
47/// matches an earlier transaction. The first occurrence is kept; subsequent
48/// duplicates are reported.
49#[must_use]
50pub fn find_structural_duplicates(directives: &[DirectiveWrapper]) -> Vec<StructuralDuplicate> {
51    let mut seen: HashSet<u64> = HashSet::new();
52    let mut duplicates = Vec::new();
53
54    for (i, wrapper) in directives.iter().enumerate() {
55        if let DirectiveData::Transaction(txn) = &wrapper.data {
56            let hash = structural_hash(&wrapper.date, txn);
57            if !seen.insert(hash) {
58                duplicates.push(StructuralDuplicate {
59                    index: i,
60                    date: wrapper.date.clone(),
61                    narration: txn.narration.clone(),
62                });
63            }
64        }
65    }
66
67    duplicates
68}
69
70// ============================================================================
71// Fuzzy dedup — for matching imported transactions against existing ledger
72// ============================================================================
73
74/// Configuration for fuzzy duplicate detection.
75#[derive(Debug, Clone)]
76pub struct FuzzyDedupConfig {
77    /// Minimum word overlap ratio to consider text a match (0.0 to 1.0).
78    /// Default: 0.5 (50% of the shorter text's words must appear in the longer).
79    pub text_similarity_threshold: f64,
80}
81
82impl Default for FuzzyDedupConfig {
83    fn default() -> Self {
84        Self {
85            text_similarity_threshold: 0.5,
86        }
87    }
88}
89
90/// Result of a fuzzy duplicate match.
91#[derive(Debug)]
92pub struct FuzzyDuplicateMatch {
93    /// Index of the new transaction that is a duplicate.
94    pub new_index: usize,
95    /// Index of the existing transaction it matches.
96    pub existing_index: usize,
97}
98
99/// Find fuzzy duplicates between new and existing transactions.
100///
101/// Matches on: same date, same first-posting amount, and fuzzy text match
102/// on payee/narration. Returns indices of new transactions that are
103/// probable duplicates of existing ones.
104#[must_use]
105pub fn find_fuzzy_duplicates(
106    new_directives: &[DirectiveWrapper],
107    existing_directives: &[DirectiveWrapper],
108    config: &FuzzyDedupConfig,
109) -> Vec<FuzzyDuplicateMatch> {
110    let mut matches = Vec::new();
111
112    // Pre-compute existing transaction info for efficient comparison
113    let existing: Vec<(usize, &str, Option<Decimal>, String)> = existing_directives
114        .iter()
115        .enumerate()
116        .filter_map(|(i, w)| {
117            if let DirectiveData::Transaction(txn) = &w.data {
118                Some((i, w.date.as_str(), first_posting_amount(txn), txn_text(txn)))
119            } else {
120                None
121            }
122        })
123        .collect();
124
125    for (new_i, wrapper) in new_directives.iter().enumerate() {
126        if let DirectiveData::Transaction(txn) = &wrapper.data {
127            let new_amount = first_posting_amount(txn);
128            let new_text = txn_text(txn);
129
130            for &(existing_i, existing_date, ref existing_amount, ref existing_text) in &existing {
131                if wrapper.date != existing_date {
132                    continue;
133                }
134                if new_amount != *existing_amount {
135                    continue;
136                }
137                if fuzzy_text_match(&new_text, existing_text, config.text_similarity_threshold) {
138                    matches.push(FuzzyDuplicateMatch {
139                        new_index: new_i,
140                        existing_index: existing_i,
141                    });
142                    break; // One match is enough
143                }
144            }
145        }
146    }
147
148    matches
149}
150
151/// Get the decimal amount from the first posting of a transaction.
152fn first_posting_amount(txn: &TransactionData) -> Option<Decimal> {
153    txn.postings.first().and_then(|p| {
154        p.units
155            .as_ref()
156            .and_then(|u| u.number.parse::<Decimal>().ok())
157    })
158}
159
160/// Build a lowercase string combining payee and narration for fuzzy matching.
161fn txn_text(txn: &TransactionData) -> String {
162    let mut text = String::new();
163    if let Some(ref payee) = txn.payee {
164        text.push_str(payee);
165        text.push(' ');
166    }
167    text.push_str(&txn.narration);
168    text.to_lowercase()
169}
170
171/// Fuzzy text match: returns true if either string contains the other,
172/// or if they share significant word overlap.
173fn fuzzy_text_match(a: &str, b: &str, threshold: f64) -> bool {
174    if a.is_empty() || b.is_empty() {
175        return false;
176    }
177    if a == b {
178        return true;
179    }
180    if a.contains(b) || b.contains(a) {
181        return true;
182    }
183    let a_words: Vec<&str> = a.split_whitespace().collect();
184    let b_words: Vec<&str> = b.split_whitespace().collect();
185    let (shorter, longer) = if a_words.len() <= b_words.len() {
186        (&a_words, &b_words)
187    } else {
188        (&b_words, &a_words)
189    };
190    if shorter.is_empty() {
191        return false;
192    }
193    let match_count = shorter.iter().filter(|w| longer.contains(w)).count();
194    #[allow(clippy::cast_precision_loss)]
195    let ratio = match_count as f64 / shorter.len() as f64;
196    ratio >= threshold
197}
198
199#[cfg(test)]
200mod tests {
201    use super::*;
202    use rustledger_plugin_types::{AmountData, DirectiveData, PostingData, TransactionData};
203
204    fn make_directive(
205        date: &str,
206        payee: Option<&str>,
207        narration: &str,
208        amount: &str,
209    ) -> DirectiveWrapper {
210        DirectiveWrapper {
211            directive_type: "transaction".to_string(),
212            date: date.to_string(),
213            filename: None,
214            lineno: None,
215            data: DirectiveData::Transaction(TransactionData {
216                flag: "*".to_string(),
217                payee: payee.map(String::from),
218                narration: narration.to_string(),
219                tags: vec![],
220                links: vec![],
221                metadata: vec![],
222                postings: vec![PostingData {
223                    account: "Assets:Bank".to_string(),
224                    units: Some(AmountData {
225                        number: amount.to_string(),
226                        currency: "USD".to_string(),
227                    }),
228                    cost: None,
229                    price: None,
230                    flag: None,
231                    metadata: vec![],
232                }],
233            }),
234        }
235    }
236
237    // ===== Structural dedup tests =====
238
239    #[test]
240    fn structural_finds_exact_duplicates() {
241        let directives = vec![
242            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
243            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
244        ];
245        let dups = find_structural_duplicates(&directives);
246        assert_eq!(dups.len(), 1);
247        assert_eq!(dups[0].index, 1);
248    }
249
250    #[test]
251    fn structural_no_false_positives() {
252        let directives = vec![
253            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
254            make_directive("2024-01-15", Some("Store"), "Groceries", "-51.00"),
255        ];
256        let dups = find_structural_duplicates(&directives);
257        assert!(dups.is_empty());
258    }
259
260    #[test]
261    fn structural_duplicate_to_plugin_error() {
262        let dup = StructuralDuplicate {
263            index: 1,
264            date: "2024-01-15".to_string(),
265            narration: "Test".to_string(),
266        };
267        let err = dup.to_plugin_error();
268        assert!(err.message.contains("Duplicate transaction"));
269        assert!(err.message.contains("2024-01-15"));
270    }
271
272    // ===== Fuzzy dedup tests =====
273
274    #[test]
275    fn fuzzy_finds_matching_transactions() {
276        let new = vec![make_directive(
277            "2024-01-15",
278            Some("WHOLE FOODS"),
279            "Groceries",
280            "-50.00",
281        )];
282        let existing = vec![make_directive(
283            "2024-01-15",
284            Some("Whole Foods Market"),
285            "Groceries",
286            "-50.00",
287        )];
288        let config = FuzzyDedupConfig::default();
289        let matches = find_fuzzy_duplicates(&new, &existing, &config);
290        assert_eq!(matches.len(), 1);
291    }
292
293    #[test]
294    fn fuzzy_no_match_different_date() {
295        let new = vec![make_directive(
296            "2024-01-15",
297            Some("Store"),
298            "Groceries",
299            "-50.00",
300        )];
301        let existing = vec![make_directive(
302            "2024-01-16",
303            Some("Store"),
304            "Groceries",
305            "-50.00",
306        )];
307        let config = FuzzyDedupConfig::default();
308        let matches = find_fuzzy_duplicates(&new, &existing, &config);
309        assert!(matches.is_empty());
310    }
311
312    #[test]
313    fn fuzzy_no_match_different_amount() {
314        let new = vec![make_directive(
315            "2024-01-15",
316            Some("Store"),
317            "Groceries",
318            "-50.00",
319        )];
320        let existing = vec![make_directive(
321            "2024-01-15",
322            Some("Store"),
323            "Groceries",
324            "-51.00",
325        )];
326        let config = FuzzyDedupConfig::default();
327        let matches = find_fuzzy_duplicates(&new, &existing, &config);
328        assert!(matches.is_empty());
329    }
330
331    #[test]
332    fn fuzzy_text_match_exact() {
333        assert!(fuzzy_text_match("hello world", "hello world", 0.5));
334    }
335
336    #[test]
337    fn fuzzy_text_match_contains() {
338        assert!(fuzzy_text_match("hello", "hello world", 0.5));
339        assert!(fuzzy_text_match("hello world", "hello", 0.5));
340    }
341
342    #[test]
343    fn fuzzy_text_match_word_overlap() {
344        // "whole foods" shares 2/2 words with "whole foods market" -> 100%
345        assert!(fuzzy_text_match("whole foods", "whole foods market", 0.5));
346    }
347
348    #[test]
349    fn fuzzy_text_match_insufficient_overlap() {
350        // "alpha" shares 0/1 words with "beta gamma" -> 0%
351        assert!(!fuzzy_text_match("alpha", "beta gamma", 0.5));
352    }
353
354    #[test]
355    fn fuzzy_text_match_empty() {
356        assert!(!fuzzy_text_match("", "hello", 0.5));
357        assert!(!fuzzy_text_match("hello", "", 0.5));
358    }
359
360    // ===== Additional fuzzy dedup tests =====
361
362    #[test]
363    fn fuzzy_multiple_matches_only_first_returned() {
364        // Two existing transactions match the same new one; only first match per new txn
365        let new = vec![make_directive(
366            "2024-01-15",
367            Some("Store"),
368            "Groceries",
369            "-50.00",
370        )];
371        let existing = vec![
372            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
373            make_directive("2024-01-15", Some("Store"), "Groceries run", "-50.00"),
374        ];
375        let config = FuzzyDedupConfig::default();
376        let matches = find_fuzzy_duplicates(&new, &existing, &config);
377        // Should only return one match (the first existing match, due to `break`)
378        assert_eq!(matches.len(), 1);
379        assert_eq!(matches[0].new_index, 0);
380        assert_eq!(matches[0].existing_index, 0);
381    }
382
383    #[test]
384    fn fuzzy_non_transaction_directives_are_skipped() {
385        let note_directive = DirectiveWrapper {
386            directive_type: "note".to_string(),
387            date: "2024-01-15".to_string(),
388            filename: None,
389            lineno: None,
390            data: DirectiveData::Note(rustledger_plugin_types::NoteData {
391                account: "Assets:Bank".to_string(),
392                comment: "A note".to_string(),
393                metadata: vec![],
394            }),
395        };
396        let txn = make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00");
397
398        // Note in new directives - should be skipped
399        let matches = find_fuzzy_duplicates(
400            std::slice::from_ref(&note_directive),
401            std::slice::from_ref(&txn),
402            &FuzzyDedupConfig::default(),
403        );
404        assert!(matches.is_empty());
405
406        // Note in existing directives - should be skipped
407        let matches =
408            find_fuzzy_duplicates(&[txn], &[note_directive], &FuzzyDedupConfig::default());
409        assert!(matches.is_empty());
410    }
411
412    #[test]
413    fn fuzzy_empty_directives_list() {
414        let config = FuzzyDedupConfig::default();
415
416        // Both empty
417        let matches = find_fuzzy_duplicates(&[], &[], &config);
418        assert!(matches.is_empty());
419
420        // New empty
421        let existing = vec![make_directive(
422            "2024-01-15",
423            Some("Store"),
424            "Test",
425            "-50.00",
426        )];
427        let matches = find_fuzzy_duplicates(&[], &existing, &config);
428        assert!(matches.is_empty());
429
430        // Existing empty
431        let new = vec![make_directive(
432            "2024-01-15",
433            Some("Store"),
434            "Test",
435            "-50.00",
436        )];
437        let matches = find_fuzzy_duplicates(&new, &[], &config);
438        assert!(matches.is_empty());
439    }
440
441    #[test]
442    fn fuzzy_matching_with_narration_only() {
443        // No payee, match on narration text only
444        let new = vec![make_directive(
445            "2024-01-15",
446            None,
447            "whole foods market",
448            "-50.00",
449        )];
450        let existing = vec![make_directive(
451            "2024-01-15",
452            None,
453            "Whole Foods Market #123",
454            "-50.00",
455        )];
456        let config = FuzzyDedupConfig::default();
457        let matches = find_fuzzy_duplicates(&new, &existing, &config);
458        assert_eq!(matches.len(), 1);
459    }
460
461    #[test]
462    fn fuzzy_matching_payee_vs_narration() {
463        // Payee in new matches narration in existing (both lowercased)
464        let new = vec![make_directive(
465            "2024-01-15",
466            Some("Whole Foods"),
467            "Payment",
468            "-50.00",
469        )];
470        let existing = vec![make_directive(
471            "2024-01-15",
472            None,
473            "whole foods market",
474            "-50.00",
475        )];
476        let config = FuzzyDedupConfig::default();
477        let matches = find_fuzzy_duplicates(&new, &existing, &config);
478        // "whole foods payment" vs "whole foods market" — shares 2/3 words (67%) > 50%
479        assert_eq!(matches.len(), 1);
480    }
481
482    #[test]
483    fn structural_empty_directives_list() {
484        let dups = find_structural_duplicates(&[]);
485        assert!(dups.is_empty());
486    }
487
488    #[test]
489    fn structural_non_transaction_not_duplicated() {
490        let note1 = DirectiveWrapper {
491            directive_type: "note".to_string(),
492            date: "2024-01-15".to_string(),
493            filename: None,
494            lineno: None,
495            data: DirectiveData::Note(rustledger_plugin_types::NoteData {
496                account: "Assets:Bank".to_string(),
497                comment: "Same note".to_string(),
498                metadata: vec![],
499            }),
500        };
501        let note2 = note1.clone();
502        let dups = find_structural_duplicates(&[note1, note2]);
503        assert!(dups.is_empty());
504    }
505
506    #[test]
507    fn fuzzy_dedup_config_default() {
508        let config = FuzzyDedupConfig::default();
509        assert!((config.text_similarity_threshold - 0.5).abs() < f64::EPSILON);
510    }
511
512    #[test]
513    fn fuzzy_high_threshold_rejects_partial_matches() {
514        let new = vec![make_directive("2024-01-15", None, "whole foods", "-50.00")];
515        let existing = vec![make_directive(
516            "2024-01-15",
517            None,
518            "whole foods market special",
519            "-50.00",
520        )];
521        // At threshold 0.9, "whole foods" (2 words) needs 90% of 2 = 1.8 → 2 matches in longer
522        // "whole foods" are both in longer text, so 2/2 = 100% → still passes
523        let config = FuzzyDedupConfig {
524            text_similarity_threshold: 0.9,
525        };
526        let matches = find_fuzzy_duplicates(&new, &existing, &config);
527        assert_eq!(matches.len(), 1);
528
529        // Totally different words at high threshold should not match
530        let new = vec![make_directive("2024-01-15", None, "alpha beta", "-50.00")];
531        let existing = vec![make_directive(
532            "2024-01-15",
533            None,
534            "alpha gamma delta",
535            "-50.00",
536        )];
537        let matches = find_fuzzy_duplicates(&new, &existing, &config);
538        // "alpha beta" shares 1/2 words with "alpha gamma delta" → 50% < 90%
539        assert!(matches.is_empty());
540    }
541}