Skip to main content

rustledger_ops/
dedup.rs

1//! Duplicate transaction detection.
2//!
3//! Provides three deduplication strategies:
4//!
5//! - **Structural** — exact hash match using [`crate::fingerprint::structural_hash`].
6//!   Finds transactions that are byte-for-byte identical (excluding metadata).
7//!
8//! - **Fuzzy** — approximate match using date + amount + text similarity.
9//!   Finds transactions that are likely the same despite minor differences
10//!   (e.g., different payee formatting between bank and ledger).
11//!
12//! - **Fingerprint** (future, Phase 1) — stable BLAKE3 fingerprint match for
13//!   import deduplication across runs.
14
15use std::collections::HashSet;
16
17use rust_decimal::Decimal;
18use rustledger_plugin_types::{DirectiveData, DirectiveWrapper, PluginError, TransactionData};
19
20use crate::fingerprint::structural_hash;
21
22/// Result of finding a structural duplicate.
23#[derive(Debug)]
24pub struct StructuralDuplicate {
25    /// Index of the duplicate directive in the input slice.
26    pub index: usize,
27    /// Date of the duplicate transaction.
28    pub date: String,
29    /// Narration of the duplicate transaction.
30    pub narration: String,
31}
32
33impl StructuralDuplicate {
34    /// Convert to a [`PluginError`] for use in plugin wrappers.
35    #[must_use]
36    pub fn to_plugin_error(&self) -> PluginError {
37        PluginError::error(format!(
38            "Duplicate transaction: {} \"{}\"",
39            self.date, self.narration
40        ))
41    }
42}
43
44/// Find structurally duplicate transactions in a directive list.
45///
46/// Returns the indices and details of transactions whose structural hash
47/// matches an earlier transaction. The first occurrence is kept; subsequent
48/// duplicates are reported.
49#[must_use]
50pub fn find_structural_duplicates(directives: &[DirectiveWrapper]) -> Vec<StructuralDuplicate> {
51    let mut seen: HashSet<u64> = HashSet::new();
52    let mut duplicates = Vec::new();
53
54    for (i, wrapper) in directives.iter().enumerate() {
55        if let DirectiveData::Transaction(txn) = &wrapper.data {
56            let hash = structural_hash(&wrapper.date, txn);
57            if !seen.insert(hash) {
58                duplicates.push(StructuralDuplicate {
59                    index: i,
60                    date: wrapper.date.clone(),
61                    narration: txn.narration.clone(),
62                });
63            }
64        }
65    }
66
67    duplicates
68}
69
70// ============================================================================
71// Fuzzy dedup — for matching imported transactions against existing ledger
72// ============================================================================
73
74/// Configuration for fuzzy duplicate detection.
75#[derive(Debug, Clone)]
76pub struct FuzzyDedupConfig {
77    /// Minimum word overlap ratio to consider text a match (0.0 to 1.0).
78    /// Default: 0.5 (50% of the shorter text's words must appear in the longer).
79    pub text_similarity_threshold: f64,
80}
81
82impl Default for FuzzyDedupConfig {
83    fn default() -> Self {
84        Self {
85            text_similarity_threshold: 0.5,
86        }
87    }
88}
89
90/// Result of a fuzzy duplicate match.
91#[derive(Debug)]
92pub struct FuzzyDuplicateMatch {
93    /// Index of the new transaction that is a duplicate.
94    pub new_index: usize,
95    /// Index of the existing transaction it matches.
96    pub existing_index: usize,
97}
98
99/// Find fuzzy duplicates between new and existing transactions.
100///
101/// Matches on: same date, same first-posting amount, and fuzzy text match
102/// on payee/narration. Returns indices of new transactions that are
103/// probable duplicates of existing ones.
104#[must_use]
105pub fn find_fuzzy_duplicates(
106    new_directives: &[DirectiveWrapper],
107    existing_directives: &[DirectiveWrapper],
108    config: &FuzzyDedupConfig,
109) -> Vec<FuzzyDuplicateMatch> {
110    let mut matches = Vec::new();
111
112    // Pre-compute existing transaction info for efficient comparison
113    let existing: Vec<(usize, &str, Option<Decimal>, String)> = existing_directives
114        .iter()
115        .enumerate()
116        .filter_map(|(i, w)| {
117            if let DirectiveData::Transaction(txn) = &w.data {
118                Some((i, w.date.as_str(), first_posting_amount(txn), txn_text(txn)))
119            } else {
120                None
121            }
122        })
123        .collect();
124
125    for (new_i, wrapper) in new_directives.iter().enumerate() {
126        if let DirectiveData::Transaction(txn) = &wrapper.data {
127            let new_amount = first_posting_amount(txn);
128            let new_text = txn_text(txn);
129
130            for &(existing_i, existing_date, ref existing_amount, ref existing_text) in &existing {
131                if wrapper.date != existing_date {
132                    continue;
133                }
134                if new_amount != *existing_amount {
135                    continue;
136                }
137                if fuzzy_text_match(&new_text, existing_text, config.text_similarity_threshold) {
138                    matches.push(FuzzyDuplicateMatch {
139                        new_index: new_i,
140                        existing_index: existing_i,
141                    });
142                    break; // One match is enough
143                }
144            }
145        }
146    }
147
148    matches
149}
150
151/// Get the decimal amount from the first posting of a transaction.
152fn first_posting_amount(txn: &TransactionData) -> Option<Decimal> {
153    txn.postings.first().and_then(|p| {
154        p.units
155            .as_ref()
156            .and_then(|u| u.number.parse::<Decimal>().ok())
157    })
158}
159
160/// Build a lowercase string combining payee and narration for fuzzy matching.
161fn txn_text(txn: &TransactionData) -> String {
162    let mut text = String::new();
163    if let Some(ref payee) = txn.payee {
164        text.push_str(payee);
165        text.push(' ');
166    }
167    text.push_str(&txn.narration);
168    text.to_lowercase()
169}
170
171/// Fuzzy text match: returns true if either string contains the other,
172/// or if they share significant word overlap.
173fn fuzzy_text_match(a: &str, b: &str, threshold: f64) -> bool {
174    if a.is_empty() || b.is_empty() {
175        return false;
176    }
177    if a == b {
178        return true;
179    }
180    if a.contains(b) || b.contains(a) {
181        return true;
182    }
183    let a_words: Vec<&str> = a.split_whitespace().collect();
184    let b_words: Vec<&str> = b.split_whitespace().collect();
185    let (shorter, longer) = if a_words.len() <= b_words.len() {
186        (&a_words, &b_words)
187    } else {
188        (&b_words, &a_words)
189    };
190    if shorter.is_empty() {
191        return false;
192    }
193    let match_count = shorter.iter().filter(|w| longer.contains(w)).count();
194    #[allow(clippy::cast_precision_loss)]
195    let ratio = match_count as f64 / shorter.len() as f64;
196    ratio >= threshold
197}
198
199#[cfg(test)]
200mod tests {
201    use super::*;
202    use rustledger_plugin_types::{AmountData, DirectiveData, PostingData, TransactionData};
203
204    fn make_directive(
205        date: &str,
206        payee: Option<&str>,
207        narration: &str,
208        amount: &str,
209    ) -> DirectiveWrapper {
210        DirectiveWrapper {
211            directive_type: "transaction".to_string(),
212            date: date.to_string(),
213            filename: None,
214            lineno: None,
215            data: DirectiveData::Transaction(TransactionData {
216                flag: "*".to_string(),
217                payee: payee.map(String::from),
218                narration: narration.to_string(),
219                tags: vec![],
220                links: vec![],
221                metadata: vec![],
222                postings: vec![PostingData {
223                    account: "Assets:Bank".to_string(),
224                    units: Some(AmountData {
225                        number: amount.to_string(),
226                        currency: "USD".to_string(),
227                    }),
228                    cost: None,
229                    price: None,
230                    flag: None,
231                    metadata: vec![],
232                    span: None,
233                }],
234            }),
235        }
236    }
237
238    // ===== Structural dedup tests =====
239
240    #[test]
241    fn structural_finds_exact_duplicates() {
242        let directives = vec![
243            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
244            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
245        ];
246        let dups = find_structural_duplicates(&directives);
247        assert_eq!(dups.len(), 1);
248        assert_eq!(dups[0].index, 1);
249    }
250
251    #[test]
252    fn structural_no_false_positives() {
253        let directives = vec![
254            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
255            make_directive("2024-01-15", Some("Store"), "Groceries", "-51.00"),
256        ];
257        let dups = find_structural_duplicates(&directives);
258        assert!(dups.is_empty());
259    }
260
261    #[test]
262    fn structural_duplicate_to_plugin_error() {
263        let dup = StructuralDuplicate {
264            index: 1,
265            date: "2024-01-15".to_string(),
266            narration: "Test".to_string(),
267        };
268        let err = dup.to_plugin_error();
269        assert!(err.message.contains("Duplicate transaction"));
270        assert!(err.message.contains("2024-01-15"));
271    }
272
273    // ===== Fuzzy dedup tests =====
274
275    #[test]
276    fn fuzzy_finds_matching_transactions() {
277        let new = vec![make_directive(
278            "2024-01-15",
279            Some("WHOLE FOODS"),
280            "Groceries",
281            "-50.00",
282        )];
283        let existing = vec![make_directive(
284            "2024-01-15",
285            Some("Whole Foods Market"),
286            "Groceries",
287            "-50.00",
288        )];
289        let config = FuzzyDedupConfig::default();
290        let matches = find_fuzzy_duplicates(&new, &existing, &config);
291        assert_eq!(matches.len(), 1);
292    }
293
294    #[test]
295    fn fuzzy_no_match_different_date() {
296        let new = vec![make_directive(
297            "2024-01-15",
298            Some("Store"),
299            "Groceries",
300            "-50.00",
301        )];
302        let existing = vec![make_directive(
303            "2024-01-16",
304            Some("Store"),
305            "Groceries",
306            "-50.00",
307        )];
308        let config = FuzzyDedupConfig::default();
309        let matches = find_fuzzy_duplicates(&new, &existing, &config);
310        assert!(matches.is_empty());
311    }
312
313    #[test]
314    fn fuzzy_no_match_different_amount() {
315        let new = vec![make_directive(
316            "2024-01-15",
317            Some("Store"),
318            "Groceries",
319            "-50.00",
320        )];
321        let existing = vec![make_directive(
322            "2024-01-15",
323            Some("Store"),
324            "Groceries",
325            "-51.00",
326        )];
327        let config = FuzzyDedupConfig::default();
328        let matches = find_fuzzy_duplicates(&new, &existing, &config);
329        assert!(matches.is_empty());
330    }
331
332    #[test]
333    fn fuzzy_text_match_exact() {
334        assert!(fuzzy_text_match("hello world", "hello world", 0.5));
335    }
336
337    #[test]
338    fn fuzzy_text_match_contains() {
339        assert!(fuzzy_text_match("hello", "hello world", 0.5));
340        assert!(fuzzy_text_match("hello world", "hello", 0.5));
341    }
342
343    #[test]
344    fn fuzzy_text_match_word_overlap() {
345        // "whole foods" shares 2/2 words with "whole foods market" -> 100%
346        assert!(fuzzy_text_match("whole foods", "whole foods market", 0.5));
347    }
348
349    #[test]
350    fn fuzzy_text_match_insufficient_overlap() {
351        // "alpha" shares 0/1 words with "beta gamma" -> 0%
352        assert!(!fuzzy_text_match("alpha", "beta gamma", 0.5));
353    }
354
355    #[test]
356    fn fuzzy_text_match_empty() {
357        assert!(!fuzzy_text_match("", "hello", 0.5));
358        assert!(!fuzzy_text_match("hello", "", 0.5));
359    }
360
361    // ===== Additional fuzzy dedup tests =====
362
363    #[test]
364    fn fuzzy_multiple_matches_only_first_returned() {
365        // Two existing transactions match the same new one; only first match per new txn
366        let new = vec![make_directive(
367            "2024-01-15",
368            Some("Store"),
369            "Groceries",
370            "-50.00",
371        )];
372        let existing = vec![
373            make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00"),
374            make_directive("2024-01-15", Some("Store"), "Groceries run", "-50.00"),
375        ];
376        let config = FuzzyDedupConfig::default();
377        let matches = find_fuzzy_duplicates(&new, &existing, &config);
378        // Should only return one match (the first existing match, due to `break`)
379        assert_eq!(matches.len(), 1);
380        assert_eq!(matches[0].new_index, 0);
381        assert_eq!(matches[0].existing_index, 0);
382    }
383
384    #[test]
385    fn fuzzy_non_transaction_directives_are_skipped() {
386        let note_directive = DirectiveWrapper {
387            directive_type: "note".to_string(),
388            date: "2024-01-15".to_string(),
389            filename: None,
390            lineno: None,
391            data: DirectiveData::Note(rustledger_plugin_types::NoteData {
392                account: "Assets:Bank".to_string(),
393                comment: "A note".to_string(),
394                metadata: vec![],
395            }),
396        };
397        let txn = make_directive("2024-01-15", Some("Store"), "Groceries", "-50.00");
398
399        // Note in new directives - should be skipped
400        let matches = find_fuzzy_duplicates(
401            std::slice::from_ref(&note_directive),
402            std::slice::from_ref(&txn),
403            &FuzzyDedupConfig::default(),
404        );
405        assert!(matches.is_empty());
406
407        // Note in existing directives - should be skipped
408        let matches =
409            find_fuzzy_duplicates(&[txn], &[note_directive], &FuzzyDedupConfig::default());
410        assert!(matches.is_empty());
411    }
412
413    #[test]
414    fn fuzzy_empty_directives_list() {
415        let config = FuzzyDedupConfig::default();
416
417        // Both empty
418        let matches = find_fuzzy_duplicates(&[], &[], &config);
419        assert!(matches.is_empty());
420
421        // New empty
422        let existing = vec![make_directive(
423            "2024-01-15",
424            Some("Store"),
425            "Test",
426            "-50.00",
427        )];
428        let matches = find_fuzzy_duplicates(&[], &existing, &config);
429        assert!(matches.is_empty());
430
431        // Existing empty
432        let new = vec![make_directive(
433            "2024-01-15",
434            Some("Store"),
435            "Test",
436            "-50.00",
437        )];
438        let matches = find_fuzzy_duplicates(&new, &[], &config);
439        assert!(matches.is_empty());
440    }
441
442    #[test]
443    fn fuzzy_matching_with_narration_only() {
444        // No payee, match on narration text only
445        let new = vec![make_directive(
446            "2024-01-15",
447            None,
448            "whole foods market",
449            "-50.00",
450        )];
451        let existing = vec![make_directive(
452            "2024-01-15",
453            None,
454            "Whole Foods Market #123",
455            "-50.00",
456        )];
457        let config = FuzzyDedupConfig::default();
458        let matches = find_fuzzy_duplicates(&new, &existing, &config);
459        assert_eq!(matches.len(), 1);
460    }
461
462    #[test]
463    fn fuzzy_matching_payee_vs_narration() {
464        // Payee in new matches narration in existing (both lowercased)
465        let new = vec![make_directive(
466            "2024-01-15",
467            Some("Whole Foods"),
468            "Payment",
469            "-50.00",
470        )];
471        let existing = vec![make_directive(
472            "2024-01-15",
473            None,
474            "whole foods market",
475            "-50.00",
476        )];
477        let config = FuzzyDedupConfig::default();
478        let matches = find_fuzzy_duplicates(&new, &existing, &config);
479        // "whole foods payment" vs "whole foods market" — shares 2/3 words (67%) > 50%
480        assert_eq!(matches.len(), 1);
481    }
482
483    #[test]
484    fn structural_empty_directives_list() {
485        let dups = find_structural_duplicates(&[]);
486        assert!(dups.is_empty());
487    }
488
489    #[test]
490    fn structural_non_transaction_not_duplicated() {
491        let note1 = DirectiveWrapper {
492            directive_type: "note".to_string(),
493            date: "2024-01-15".to_string(),
494            filename: None,
495            lineno: None,
496            data: DirectiveData::Note(rustledger_plugin_types::NoteData {
497                account: "Assets:Bank".to_string(),
498                comment: "Same note".to_string(),
499                metadata: vec![],
500            }),
501        };
502        let note2 = note1.clone();
503        let dups = find_structural_duplicates(&[note1, note2]);
504        assert!(dups.is_empty());
505    }
506
507    #[test]
508    fn fuzzy_dedup_config_default() {
509        let config = FuzzyDedupConfig::default();
510        assert!((config.text_similarity_threshold - 0.5).abs() < f64::EPSILON);
511    }
512
513    #[test]
514    fn fuzzy_high_threshold_rejects_partial_matches() {
515        let new = vec![make_directive("2024-01-15", None, "whole foods", "-50.00")];
516        let existing = vec![make_directive(
517            "2024-01-15",
518            None,
519            "whole foods market special",
520            "-50.00",
521        )];
522        // At threshold 0.9, "whole foods" (2 words) needs 90% of 2 = 1.8 → 2 matches in longer
523        // "whole foods" are both in longer text, so 2/2 = 100% → still passes
524        let config = FuzzyDedupConfig {
525            text_similarity_threshold: 0.9,
526        };
527        let matches = find_fuzzy_duplicates(&new, &existing, &config);
528        assert_eq!(matches.len(), 1);
529
530        // Totally different words at high threshold should not match
531        let new = vec![make_directive("2024-01-15", None, "alpha beta", "-50.00")];
532        let existing = vec![make_directive(
533            "2024-01-15",
534            None,
535            "alpha gamma delta",
536            "-50.00",
537        )];
538        let matches = find_fuzzy_duplicates(&new, &existing, &config);
539        // "alpha beta" shares 1/2 words with "alpha gamma delta" → 50% < 90%
540        assert!(matches.is_empty());
541    }
542}