Skip to main content

rustledger_loader/
dedup.rs

//! Cross-file `InternedStr` deduplication.
//!
//! Each parsed file has its own per-file [`StringInterner`], so the
//! same string (account name, currency code, tag, link, payee,
//! narration) appearing in two included files lands in two different
//! `Arc<str>` allocations. The [`InternedStr`] `PartialEq` fast path
//! (`Arc::ptr_eq`) then fails on cross-file equality and falls back
//! to byte comparison.
//!
//! This module provides the merge step: walk a slice of directives
//! through a single shared [`StringInterner`] so identical strings
//! share one `Arc`. After this pass, equality checks across the entire
//! directive list hit the pointer-equality fast path.
//!
//! Coverage: every `InternedStr` and `Vec<InternedStr>` field reachable
//! from a `Directive` is re-interned — including `Transaction.payee`,
//! `Transaction.narration`, `Transaction.tags`, `Transaction.links`,
//! and `Document.tags` / `Document.links`. (Earlier versions of this
//! pass only covered posting-level `account` / `currency` fields;
//! Copilot review on PR #1081 expanded the walk.)
//!
//! The dedup walk is feature-independent (no `cache` / `rkyv`
//! dependency) so it can run on every load path, not just cache hits.
//! [`Loader::load`](crate::Loader::load) invokes it automatically; the
//! cache-hit path in `rustledger`'s `check` command and the WASM
//! parsed-ledger constructor call it explicitly.
27
28use rustledger_core::Directive;
29use rustledger_core::intern::{InternedStr, StringInterner};
30use rustledger_core::{IncompleteAmount, PriceAnnotation};
31use rustledger_parser::Spanned;
32
33/// Re-intern all strings in directives to deduplicate memory.
34///
35/// Walks through all directives and re-interns account names and
36/// currencies using a shared [`StringInterner`], so identical strings
37/// share a single `Arc<str>` allocation. Returns the number of strings
38/// that were deduplicated (i.e., strings that were found to already
39/// exist in the interner).
40pub fn reintern_directives(directives: &mut [Spanned<Directive>]) -> usize {
41    let mut interner = StringInterner::with_capacity(1024);
42    let mut dedup_count = 0;
43    for spanned in directives.iter_mut() {
44        dedup_count += reintern_directive(&mut spanned.value, &mut interner);
45    }
46    dedup_count
47}
48
49/// Re-intern strings in a slice of plain directives (without `Spanned` wrapper).
50///
51/// Used by WASM caching where `Spanned<Directive>` is not present.
52pub fn reintern_plain_directives(directives: &mut [Directive]) -> usize {
53    let mut interner = StringInterner::with_capacity(1024);
54    let mut dedup_count = 0;
55    for directive in directives.iter_mut() {
56        dedup_count += reintern_directive(directive, &mut interner);
57    }
58    dedup_count
59}
60
61/// Single-lookup helper used by [`reintern_directive`]. The
62/// `intern_with_status` API on [`StringInterner`] does one hash probe
63/// and returns both the interned value and a "was it already there?"
64/// flag — replacing the earlier `contains` + `intern` double-lookup.
65/// Caught by Copilot review on PR #1081. Returns `true` when the
66/// string was already present (i.e., this call contributed a dedup
67/// hit).
68fn do_intern(s: &mut InternedStr, interner: &mut StringInterner) -> bool {
69    let (new, was_new) = interner.intern_with_status(s.as_str());
70    *s = new;
71    !was_new
72}
73
74/// Re-intern every entry of a `Vec<InternedStr>`, tallying the dedup
75/// hits into `dedup_count`. Hoisted to module scope rather than nested
76/// inside [`reintern_directive`] so clippy's `items_after_statements`
77/// lint stays happy.
78fn intern_vec(v: &mut [InternedStr], interner: &mut StringInterner, dedup_count: &mut usize) {
79    for s in v.iter_mut() {
80        if do_intern(s, interner) {
81            *dedup_count += 1;
82        }
83    }
84}
85
86/// Re-intern all `InternedStr` fields in a single directive,
87/// deduplicating identical strings to share a single `Arc<str>`
88/// allocation. Returns the count of strings that were already present
89/// in the interner (i.e., this directive's contribution to the
90/// dedup-hit total).
91fn reintern_directive(directive: &mut Directive, interner: &mut StringInterner) -> usize {
92    let mut dedup_count = 0;
93
94    match directive {
95        Directive::Transaction(txn) => {
96            // Transaction-level InternedStr fields. The pre-Copilot
97            // version of this walk skipped these — cross-file payees /
98            // narrations / tags / links never hit `Arc::ptr_eq`.
99            if let Some(ref mut payee) = txn.payee
100                && do_intern(payee, interner)
101            {
102                dedup_count += 1;
103            }
104            if do_intern(&mut txn.narration, interner) {
105                dedup_count += 1;
106            }
107            intern_vec(&mut txn.tags, interner, &mut dedup_count);
108            intern_vec(&mut txn.links, interner, &mut dedup_count);
109
110            for posting in &mut txn.postings {
111                if do_intern(&mut posting.account, interner) {
112                    dedup_count += 1;
113                }
114                // Units
115                if let Some(ref mut units) = posting.units {
116                    match units {
117                        IncompleteAmount::Complete(amt) => {
118                            if do_intern(&mut amt.currency, interner) {
119                                dedup_count += 1;
120                            }
121                        }
122                        IncompleteAmount::CurrencyOnly(cur) => {
123                            if do_intern(cur, interner) {
124                                dedup_count += 1;
125                            }
126                        }
127                        IncompleteAmount::NumberOnly(_) => {}
128                    }
129                }
130                // Cost spec
131                if let Some(ref mut cost) = posting.cost
132                    && let Some(ref mut cur) = cost.currency
133                    && do_intern(cur, interner)
134                {
135                    dedup_count += 1;
136                }
137                // Price annotation
138                if let Some(ref mut price) = posting.price {
139                    match price {
140                        PriceAnnotation::Unit(amt) | PriceAnnotation::Total(amt) => {
141                            if do_intern(&mut amt.currency, interner) {
142                                dedup_count += 1;
143                            }
144                        }
145                        PriceAnnotation::UnitIncomplete(inc)
146                        | PriceAnnotation::TotalIncomplete(inc) => match inc {
147                            IncompleteAmount::Complete(amt) => {
148                                if do_intern(&mut amt.currency, interner) {
149                                    dedup_count += 1;
150                                }
151                            }
152                            IncompleteAmount::CurrencyOnly(cur) => {
153                                if do_intern(cur, interner) {
154                                    dedup_count += 1;
155                                }
156                            }
157                            IncompleteAmount::NumberOnly(_) => {}
158                        },
159                        PriceAnnotation::UnitEmpty | PriceAnnotation::TotalEmpty => {}
160                    }
161                }
162            }
163        }
164        Directive::Balance(bal) => {
165            if do_intern(&mut bal.account, interner) {
166                dedup_count += 1;
167            }
168            if do_intern(&mut bal.amount.currency, interner) {
169                dedup_count += 1;
170            }
171        }
172        Directive::Open(open) => {
173            if do_intern(&mut open.account, interner) {
174                dedup_count += 1;
175            }
176            intern_vec(&mut open.currencies, interner, &mut dedup_count);
177        }
178        Directive::Close(close) => {
179            if do_intern(&mut close.account, interner) {
180                dedup_count += 1;
181            }
182        }
183        Directive::Commodity(comm) => {
184            if do_intern(&mut comm.currency, interner) {
185                dedup_count += 1;
186            }
187        }
188        Directive::Pad(pad) => {
189            if do_intern(&mut pad.account, interner) {
190                dedup_count += 1;
191            }
192            if do_intern(&mut pad.source_account, interner) {
193                dedup_count += 1;
194            }
195        }
196        Directive::Note(note) => {
197            if do_intern(&mut note.account, interner) {
198                dedup_count += 1;
199            }
200        }
201        Directive::Document(doc) => {
202            if do_intern(&mut doc.account, interner) {
203                dedup_count += 1;
204            }
205            // Pre-Copilot this skipped tags/links. They're now covered.
206            intern_vec(&mut doc.tags, interner, &mut dedup_count);
207            intern_vec(&mut doc.links, interner, &mut dedup_count);
208        }
209        Directive::Price(price) => {
210            if do_intern(&mut price.currency, interner) {
211                dedup_count += 1;
212            }
213            if do_intern(&mut price.amount.currency, interner) {
214                dedup_count += 1;
215            }
216        }
217        Directive::Event(_) | Directive::Query(_) | Directive::Custom(_) => {
218            // These don't contain InternedStr fields
219        }
220    }
221
222    dedup_count
223}