rustledger_plugin/native/plugins/no_duplicates.rs
1//! Hash-based duplicate transaction detection.
2//!
3//! Mirrors Python beancount's `beancount.plugins.noduplicates`, which uses
4//! `beancount.core.compare.hash_entry` to identify structurally identical
5//! transactions. `hash_entry` hashes every field that contributes to a
6//! transaction's structural identity: flag, payee, narration, tags, links,
7//! and each posting's account, units, cost, price, and flag. Metadata is
8//! deliberately excluded (beancount's `hash_entry` passes `exclude_meta=True`).
9//!
10//! The hash helpers below use exhaustive struct destructuring so that adding
11//! a field to `TransactionData`, `PostingData`, `CostData`, `AmountData`, or
12//! `PriceAnnotationData` causes a compile error here — forcing whoever adds
13//! the field to explicitly decide whether it contributes to structural
14//! identity (add to the hash) or not (bind with `_` and document why).
15
16use crate::types::{
17 AmountData, CostData, DirectiveData, PluginError, PluginInput, PluginOutput, PostingData,
18 PriceAnnotationData, TransactionData,
19};
20
21use super::super::NativePlugin;
22
/// Plugin that detects duplicate transactions based on hash.
///
/// This is a field-less unit struct: it carries no configuration or state,
/// so every value of the type is interchangeable. The common traits are
/// derived so the plugin can be debug-printed, copied, and default-constructed
/// like any other plain value type.
#[derive(Debug, Clone, Copy, Default)]
pub struct NoDuplicatesPlugin;
25
26impl NativePlugin for NoDuplicatesPlugin {
27 fn name(&self) -> &'static str {
28 "noduplicates"
29 }
30
31 fn description(&self) -> &'static str {
32 "Hash-based duplicate transaction detection"
33 }
34
35 fn process(&self, input: PluginInput) -> PluginOutput {
36 use std::collections::HashSet;
37 use std::collections::hash_map::DefaultHasher;
38 use std::hash::{Hash, Hasher};
39
40 // Sentinel bytes used to discriminate `None` from `Some` before each
41 // optional component. Otherwise `(None, Some(x))` and `(Some(x), None)`
42 // could collide for adjacent fields. Python's tuple hash achieves the
43 // equivalent via `hash(None)` being a distinct fixed value.
44 const ABSENT: u8 = 0;
45 const PRESENT: u8 = 1;
46
47 fn hash_amount<H: Hasher>(amount: &AmountData, hasher: &mut H) {
48 let AmountData { number, currency } = amount;
49 number.hash(hasher);
50 currency.hash(hasher);
51 }
52
53 fn hash_cost<H: Hasher>(cost: &CostData, hasher: &mut H) {
54 let CostData {
55 number_per,
56 number_total,
57 currency,
58 date,
59 label,
60 merge,
61 } = cost;
62 number_per.hash(hasher);
63 number_total.hash(hasher);
64 currency.hash(hasher);
65 date.hash(hasher);
66 label.hash(hasher);
67 merge.hash(hasher);
68 }
69
70 fn hash_price<H: Hasher>(price: &PriceAnnotationData, hasher: &mut H) {
71 let PriceAnnotationData {
72 is_total,
73 amount,
74 number,
75 currency,
76 } = price;
77 is_total.hash(hasher);
78 match amount {
79 Some(a) => {
80 PRESENT.hash(hasher);
81 hash_amount(a, hasher);
82 }
83 None => ABSENT.hash(hasher),
84 }
85 number.hash(hasher);
86 currency.hash(hasher);
87 }
88
89 fn hash_posting<H: Hasher>(posting: &PostingData, hasher: &mut H) {
90 // Destructure so any future field added to `PostingData` causes a
91 // compile error here and the maintainer must explicitly decide
92 // whether it's part of structural identity.
93 let PostingData {
94 account,
95 units,
96 cost,
97 price,
98 flag,
99 // Metadata is intentionally NOT hashed — matches beancount's
100 // hash_entry(exclude_meta=True) default. Bind to `_` so adding
101 // a new field in the future is still a compile error.
102 metadata: _,
103 } = posting;
104
105 account.hash(hasher);
106 match units {
107 Some(u) => {
108 PRESENT.hash(hasher);
109 hash_amount(u, hasher);
110 }
111 None => ABSENT.hash(hasher),
112 }
113 match cost {
114 Some(c) => {
115 PRESENT.hash(hasher);
116 hash_cost(c, hasher);
117 }
118 None => ABSENT.hash(hasher),
119 }
120 match price {
121 Some(p) => {
122 PRESENT.hash(hasher);
123 hash_price(p, hasher);
124 }
125 None => ABSENT.hash(hasher),
126 }
127 flag.hash(hasher);
128 }
129
130 fn hash_transaction(date: &str, txn: &TransactionData) -> u64 {
131 // Destructure so any future field added to `TransactionData`
132 // causes a compile error here.
133 let TransactionData {
134 flag,
135 payee,
136 narration,
137 tags,
138 links,
139 // Metadata is intentionally NOT hashed — matches beancount's
140 // hash_entry(exclude_meta=True) default.
141 metadata: _,
142 postings,
143 } = txn;
144
145 let mut hasher = DefaultHasher::new();
146 date.hash(&mut hasher);
147 flag.hash(&mut hasher);
148 payee.hash(&mut hasher);
149 narration.hash(&mut hasher);
150
151 // Tags and links are unordered sets in beancount (`frozenset`),
152 // so:
153 // 1. Sort + dedup so the hash is stable regardless of parser
154 // order and collapses any accidental duplicates the parser
155 // might emit (matching beancount set semantics).
156 // 2. Each collection is prefixed with its length so the two
157 // streams can't be merged or swapped without changing the
158 // resulting hash — e.g. `tags={a,b}, links={}` no longer
159 // collides with `tags={a}, links={b}`.
160 let mut sorted_tags: Vec<&String> = tags.iter().collect();
161 sorted_tags.sort();
162 sorted_tags.dedup();
163 sorted_tags.len().hash(&mut hasher);
164 for tag in sorted_tags {
165 tag.hash(&mut hasher);
166 }
167
168 let mut sorted_links: Vec<&String> = links.iter().collect();
169 sorted_links.sort();
170 sorted_links.dedup();
171 sorted_links.len().hash(&mut hasher);
172 for link in sorted_links {
173 link.hash(&mut hasher);
174 }
175
176 // Prefix postings with their count so the posting stream can't
177 // collide with trailing fields of the set streams above.
178 postings.len().hash(&mut hasher);
179 for posting in postings {
180 hash_posting(posting, &mut hasher);
181 }
182
183 hasher.finish()
184 }
185
186 let mut seen: HashSet<u64> = HashSet::new();
187 let mut errors = Vec::new();
188
189 for wrapper in &input.directives {
190 if let DirectiveData::Transaction(txn) = &wrapper.data {
191 let hash = hash_transaction(&wrapper.date, txn);
192 if !seen.insert(hash) {
193 errors.push(PluginError::error(format!(
194 "Duplicate transaction: {} \"{}\"",
195 wrapper.date, txn.narration
196 )));
197 }
198 }
199 }
200
201 PluginOutput {
202 directives: input.directives,
203 errors,
204 }
205 }
206}