rustledger_loader/dedup.rs
1//! Cross-file `InternedStr` deduplication.
2//!
3//! Each parsed file has its own per-file [`StringInterner`], so the
4//! same string (account name, currency code, tag, link, payee,
5//! narration) appearing in two included files lands in two different
6//! `Arc<str>` allocations. The [`InternedStr`] `PartialEq` fast path
7//! (`Arc::ptr_eq`) then fails on cross-file equality and falls back
8//! to byte comparison.
9//!
10//! This module provides the merge step: walk a slice of directives
11//! through a single shared [`StringInterner`] so identical strings
12//! share one `Arc`. After this pass, equality checks across the entire
13//! directive list hit the pointer-equality fast path.
14//!
15//! Coverage: every `InternedStr` and `Vec<InternedStr>` field reachable
16//! from a `Directive` is re-interned — including `Transaction.payee`,
17//! `Transaction.narration`, `Transaction.tags`, `Transaction.links`,
18//! and `Document.tags` / `Document.links`. (Earlier versions of this
19//! pass only covered posting-level `account` / `currency` fields;
20//! Copilot review on PR #1081 expanded the walk.)
21//!
22//! The dedup walk is feature-independent (no `cache` / `rkyv`
23//! dependency) so it can run on every load path, not just cache hits.
24//! [`Loader::load`](crate::Loader::load) invokes it automatically; the
25//! cache-hit path in `rustledger`'s `check` command and the WASM
26//! parsed-ledger constructor call it explicitly.
27
28use rustledger_core::Directive;
29use rustledger_core::intern::{InternedStr, StringInterner};
30use rustledger_core::{IncompleteAmount, PriceAnnotation};
31use rustledger_parser::Spanned;
32
33/// Re-intern all strings in directives to deduplicate memory.
34///
35/// Walks through all directives and re-interns account names and
36/// currencies using a shared [`StringInterner`], so identical strings
37/// share a single `Arc<str>` allocation. Returns the number of strings
38/// that were deduplicated (i.e., strings that were found to already
39/// exist in the interner).
40pub fn reintern_directives(directives: &mut [Spanned<Directive>]) -> usize {
41 let mut interner = StringInterner::with_capacity(1024);
42 let mut dedup_count = 0;
43 for spanned in directives.iter_mut() {
44 dedup_count += reintern_directive(&mut spanned.value, &mut interner);
45 }
46 dedup_count
47}
48
49/// Re-intern strings in a slice of plain directives (without `Spanned` wrapper).
50///
51/// Used by WASM caching where `Spanned<Directive>` is not present.
52pub fn reintern_plain_directives(directives: &mut [Directive]) -> usize {
53 let mut interner = StringInterner::with_capacity(1024);
54 let mut dedup_count = 0;
55 for directive in directives.iter_mut() {
56 dedup_count += reintern_directive(directive, &mut interner);
57 }
58 dedup_count
59}
60
61/// Single-lookup helper used by [`reintern_directive`]. The
62/// `intern_with_status` API on [`StringInterner`] does one hash probe
63/// and returns both the interned value and a "was it already there?"
64/// flag — replacing the earlier `contains` + `intern` double-lookup.
65/// Caught by Copilot review on PR #1081. Returns `true` when the
66/// string was already present (i.e., this call contributed a dedup
67/// hit).
68fn do_intern(s: &mut InternedStr, interner: &mut StringInterner) -> bool {
69 let (new, was_new) = interner.intern_with_status(s.as_str());
70 *s = new;
71 !was_new
72}
73
74/// Re-intern every entry of a `Vec<InternedStr>`, tallying the dedup
75/// hits into `dedup_count`. Hoisted to module scope rather than nested
76/// inside [`reintern_directive`] so clippy's `items_after_statements`
77/// lint stays happy.
78fn intern_vec(v: &mut [InternedStr], interner: &mut StringInterner, dedup_count: &mut usize) {
79 for s in v.iter_mut() {
80 if do_intern(s, interner) {
81 *dedup_count += 1;
82 }
83 }
84}
85
86/// Re-intern all `InternedStr` fields in a single directive,
87/// deduplicating identical strings to share a single `Arc<str>`
88/// allocation. Returns the count of strings that were already present
89/// in the interner (i.e., this directive's contribution to the
90/// dedup-hit total).
91fn reintern_directive(directive: &mut Directive, interner: &mut StringInterner) -> usize {
92 let mut dedup_count = 0;
93
94 match directive {
95 Directive::Transaction(txn) => {
96 // Transaction-level InternedStr fields. The pre-Copilot
97 // version of this walk skipped these — cross-file payees /
98 // narrations / tags / links never hit `Arc::ptr_eq`.
99 if let Some(ref mut payee) = txn.payee
100 && do_intern(payee, interner)
101 {
102 dedup_count += 1;
103 }
104 if do_intern(&mut txn.narration, interner) {
105 dedup_count += 1;
106 }
107 intern_vec(&mut txn.tags, interner, &mut dedup_count);
108 intern_vec(&mut txn.links, interner, &mut dedup_count);
109
110 for posting in &mut txn.postings {
111 if do_intern(&mut posting.account, interner) {
112 dedup_count += 1;
113 }
114 // Units
115 if let Some(ref mut units) = posting.units {
116 match units {
117 IncompleteAmount::Complete(amt) => {
118 if do_intern(&mut amt.currency, interner) {
119 dedup_count += 1;
120 }
121 }
122 IncompleteAmount::CurrencyOnly(cur) => {
123 if do_intern(cur, interner) {
124 dedup_count += 1;
125 }
126 }
127 IncompleteAmount::NumberOnly(_) => {}
128 }
129 }
130 // Cost spec
131 if let Some(ref mut cost) = posting.cost
132 && let Some(ref mut cur) = cost.currency
133 && do_intern(cur, interner)
134 {
135 dedup_count += 1;
136 }
137 // Price annotation
138 if let Some(ref mut price) = posting.price {
139 match price {
140 PriceAnnotation::Unit(amt) | PriceAnnotation::Total(amt) => {
141 if do_intern(&mut amt.currency, interner) {
142 dedup_count += 1;
143 }
144 }
145 PriceAnnotation::UnitIncomplete(inc)
146 | PriceAnnotation::TotalIncomplete(inc) => match inc {
147 IncompleteAmount::Complete(amt) => {
148 if do_intern(&mut amt.currency, interner) {
149 dedup_count += 1;
150 }
151 }
152 IncompleteAmount::CurrencyOnly(cur) => {
153 if do_intern(cur, interner) {
154 dedup_count += 1;
155 }
156 }
157 IncompleteAmount::NumberOnly(_) => {}
158 },
159 PriceAnnotation::UnitEmpty | PriceAnnotation::TotalEmpty => {}
160 }
161 }
162 }
163 }
164 Directive::Balance(bal) => {
165 if do_intern(&mut bal.account, interner) {
166 dedup_count += 1;
167 }
168 if do_intern(&mut bal.amount.currency, interner) {
169 dedup_count += 1;
170 }
171 }
172 Directive::Open(open) => {
173 if do_intern(&mut open.account, interner) {
174 dedup_count += 1;
175 }
176 intern_vec(&mut open.currencies, interner, &mut dedup_count);
177 }
178 Directive::Close(close) => {
179 if do_intern(&mut close.account, interner) {
180 dedup_count += 1;
181 }
182 }
183 Directive::Commodity(comm) => {
184 if do_intern(&mut comm.currency, interner) {
185 dedup_count += 1;
186 }
187 }
188 Directive::Pad(pad) => {
189 if do_intern(&mut pad.account, interner) {
190 dedup_count += 1;
191 }
192 if do_intern(&mut pad.source_account, interner) {
193 dedup_count += 1;
194 }
195 }
196 Directive::Note(note) => {
197 if do_intern(&mut note.account, interner) {
198 dedup_count += 1;
199 }
200 }
201 Directive::Document(doc) => {
202 if do_intern(&mut doc.account, interner) {
203 dedup_count += 1;
204 }
205 // Pre-Copilot this skipped tags/links. They're now covered.
206 intern_vec(&mut doc.tags, interner, &mut dedup_count);
207 intern_vec(&mut doc.links, interner, &mut dedup_count);
208 }
209 Directive::Price(price) => {
210 if do_intern(&mut price.currency, interner) {
211 dedup_count += 1;
212 }
213 if do_intern(&mut price.amount.currency, interner) {
214 dedup_count += 1;
215 }
216 }
217 Directive::Event(_) | Directive::Query(_) | Directive::Custom(_) => {
218 // These don't contain InternedStr fields
219 }
220 }
221
222 dedup_count
223}