Skip to main content

rustledger_plugin/native/plugins/
no_duplicates.rs

//! Hash-based duplicate transaction detection.
//!
//! Mirrors Python beancount's `beancount.plugins.noduplicates`, which uses
//! `beancount.core.compare.hash_entry` to identify structurally identical
//! transactions. `hash_entry` hashes every field that contributes to a
//! transaction's structural identity: flag, payee, narration, tags, links,
//! and each posting's account, units, cost, price, and flag. Here the
//! directive's date, which is stored on the directive wrapper rather than in
//! `TransactionData`, is hashed too. Metadata is deliberately excluded
//! (beancount's `hash_entry` passes `exclude_meta=True`).
//!
//! The hash helpers below use exhaustive struct destructuring so that adding
//! a field to `TransactionData`, `PostingData`, `CostData`, `AmountData`, or
//! `PriceAnnotationData` causes a compile error here, forcing whoever adds
//! the field to decide explicitly whether it contributes to structural
//! identity (add it to the hash) or not (bind it with `_` and document why).
15
16use crate::types::{
17    AmountData, CostData, DirectiveData, PluginError, PluginInput, PluginOutput, PostingData,
18    PriceAnnotationData, TransactionData,
19};
20
21use super::super::NativePlugin;
22
23/// Plugin that detects duplicate transactions based on hash.
24pub struct NoDuplicatesPlugin;
25
26impl NativePlugin for NoDuplicatesPlugin {
27    fn name(&self) -> &'static str {
28        "noduplicates"
29    }
30
31    fn description(&self) -> &'static str {
32        "Hash-based duplicate transaction detection"
33    }
34
35    fn process(&self, input: PluginInput) -> PluginOutput {
36        use std::collections::HashSet;
37        use std::collections::hash_map::DefaultHasher;
38        use std::hash::{Hash, Hasher};
39
40        // Sentinel bytes used to discriminate `None` from `Some` before each
41        // optional component. Otherwise `(None, Some(x))` and `(Some(x), None)`
42        // could collide for adjacent fields. Python's tuple hash achieves the
43        // equivalent via `hash(None)` being a distinct fixed value.
44        const ABSENT: u8 = 0;
45        const PRESENT: u8 = 1;
46
47        fn hash_amount<H: Hasher>(amount: &AmountData, hasher: &mut H) {
48            let AmountData { number, currency } = amount;
49            number.hash(hasher);
50            currency.hash(hasher);
51        }
52
53        fn hash_cost<H: Hasher>(cost: &CostData, hasher: &mut H) {
54            let CostData {
55                number_per,
56                number_total,
57                currency,
58                date,
59                label,
60                merge,
61            } = cost;
62            number_per.hash(hasher);
63            number_total.hash(hasher);
64            currency.hash(hasher);
65            date.hash(hasher);
66            label.hash(hasher);
67            merge.hash(hasher);
68        }
69
70        fn hash_price<H: Hasher>(price: &PriceAnnotationData, hasher: &mut H) {
71            let PriceAnnotationData {
72                is_total,
73                amount,
74                number,
75                currency,
76            } = price;
77            is_total.hash(hasher);
78            match amount {
79                Some(a) => {
80                    PRESENT.hash(hasher);
81                    hash_amount(a, hasher);
82                }
83                None => ABSENT.hash(hasher),
84            }
85            number.hash(hasher);
86            currency.hash(hasher);
87        }
88
89        fn hash_posting<H: Hasher>(posting: &PostingData, hasher: &mut H) {
90            // Destructure so any future field added to `PostingData` causes a
91            // compile error here and the maintainer must explicitly decide
92            // whether it's part of structural identity.
93            let PostingData {
94                account,
95                units,
96                cost,
97                price,
98                flag,
99                // Metadata is intentionally NOT hashed — matches beancount's
100                // hash_entry(exclude_meta=True) default. Bind to `_` so adding
101                // a new field in the future is still a compile error.
102                metadata: _,
103            } = posting;
104
105            account.hash(hasher);
106            match units {
107                Some(u) => {
108                    PRESENT.hash(hasher);
109                    hash_amount(u, hasher);
110                }
111                None => ABSENT.hash(hasher),
112            }
113            match cost {
114                Some(c) => {
115                    PRESENT.hash(hasher);
116                    hash_cost(c, hasher);
117                }
118                None => ABSENT.hash(hasher),
119            }
120            match price {
121                Some(p) => {
122                    PRESENT.hash(hasher);
123                    hash_price(p, hasher);
124                }
125                None => ABSENT.hash(hasher),
126            }
127            flag.hash(hasher);
128        }
129
130        fn hash_transaction(date: &str, txn: &TransactionData) -> u64 {
131            // Destructure so any future field added to `TransactionData`
132            // causes a compile error here.
133            let TransactionData {
134                flag,
135                payee,
136                narration,
137                tags,
138                links,
139                // Metadata is intentionally NOT hashed — matches beancount's
140                // hash_entry(exclude_meta=True) default.
141                metadata: _,
142                postings,
143            } = txn;
144
145            let mut hasher = DefaultHasher::new();
146            date.hash(&mut hasher);
147            flag.hash(&mut hasher);
148            payee.hash(&mut hasher);
149            narration.hash(&mut hasher);
150
151            // Tags and links are unordered sets in beancount (`frozenset`),
152            // so:
153            //   1. Sort + dedup so the hash is stable regardless of parser
154            //      order and collapses any accidental duplicates the parser
155            //      might emit (matching beancount set semantics).
156            //   2. Each collection is prefixed with its length so the two
157            //      streams can't be merged or swapped without changing the
158            //      resulting hash — e.g. `tags={a,b}, links={}` no longer
159            //      collides with `tags={a}, links={b}`.
160            let mut sorted_tags: Vec<&String> = tags.iter().collect();
161            sorted_tags.sort();
162            sorted_tags.dedup();
163            sorted_tags.len().hash(&mut hasher);
164            for tag in sorted_tags {
165                tag.hash(&mut hasher);
166            }
167
168            let mut sorted_links: Vec<&String> = links.iter().collect();
169            sorted_links.sort();
170            sorted_links.dedup();
171            sorted_links.len().hash(&mut hasher);
172            for link in sorted_links {
173                link.hash(&mut hasher);
174            }
175
176            // Prefix postings with their count so the posting stream can't
177            // collide with trailing fields of the set streams above.
178            postings.len().hash(&mut hasher);
179            for posting in postings {
180                hash_posting(posting, &mut hasher);
181            }
182
183            hasher.finish()
184        }
185
186        let mut seen: HashSet<u64> = HashSet::new();
187        let mut errors = Vec::new();
188
189        for wrapper in &input.directives {
190            if let DirectiveData::Transaction(txn) = &wrapper.data {
191                let hash = hash_transaction(&wrapper.date, txn);
192                if !seen.insert(hash) {
193                    errors.push(PluginError::error(format!(
194                        "Duplicate transaction: {} \"{}\"",
195                        wrapper.date, txn.narration
196                    )));
197                }
198            }
199        }
200
201        PluginOutput {
202            directives: input.directives,
203            errors,
204        }
205    }
206}