Skip to main content

rustledger_core/
intern.rs

1//! String interning for accounts and currencies.
2//!
3//! String interning reduces memory usage by storing each unique string once
4//! and using references to that single copy. This is especially useful for
5//! account names and currencies which appear repeatedly throughout a ledger.
6//!
7//! # Example
8//!
9//! ```
10//! use rustledger_core::intern::StringInterner;
11//!
12//! let mut interner = StringInterner::new();
13//!
14//! let s1 = interner.intern("Expenses:Food");
15//! let s2 = interner.intern("Expenses:Food");
16//! let s3 = interner.intern("Assets:Bank");
17//!
18//! // s1 and s2 point to the same string
19//! assert!(std::ptr::eq(s1.as_str().as_ptr(), s2.as_str().as_ptr()));
20//!
21//! // s3 is different
22//! assert!(!std::ptr::eq(s1.as_str().as_ptr(), s3.as_str().as_ptr()));
23//! ```
24
25use rustc_hash::FxHashSet;
26use std::sync::Arc;
27
28use serde::{Deserialize, Deserializer, Serialize, Serializer};
29
/// An interned string reference.
///
/// This is a thin wrapper around `Arc<str>`, so cloning only bumps a
/// reference count. Values handed out by the same `StringInterner` for equal
/// text share one allocation. Values built directly with `InternedStr::new`
/// each own their allocation, yet still compare equal by content.
#[derive(Debug, Clone, Eq)]
pub struct InternedStr(Arc<str>);
37
// rkyv support: the `AsInternedStr` wrapper archives an `InternedStr` as an rkyv string
39#[cfg(feature = "rkyv")]
40pub use rkyv_impl::AsInternedStr;
41
/// rkyv `with`-wrapper for `Option<InternedStr>` fields.
///
/// This is an alias for `rkyv::with::Map<AsInternedStr>`.
/// Use: `#[rkyv(with = rkyv::with::Map<AsInternedStr>)]`
#[cfg(feature = "rkyv")]
pub type AsOptionInternedStr = rkyv::with::Map<AsInternedStr>;
46
/// rkyv `with`-wrapper for `Vec<InternedStr>` fields.
///
/// This is an alias for `rkyv::with::Map<AsInternedStr>`.
/// Use: `#[rkyv(with = rkyv::with::Map<AsInternedStr>)]`
#[cfg(feature = "rkyv")]
pub type AsVecInternedStr = rkyv::with::Map<AsInternedStr>;
51
#[cfg(feature = "rkyv")]
mod rkyv_impl {
    use super::InternedStr;
    use rkyv::Place;
    use rkyv::rancor::Fallible;
    use rkyv::string::{ArchivedString, StringResolver};
    use rkyv::with::{ArchiveWith, DeserializeWith, SerializeWith};

    /// rkyv `with`-wrapper that archives an `InternedStr` as an `ArchivedString`.
    ///
    /// Apply it to `InternedStr` fields with `#[rkyv(with = AsInternedStr)]`.
    pub struct AsInternedStr;

    impl ArchiveWith<InternedStr> for AsInternedStr {
        type Archived = ArchivedString;
        type Resolver = StringResolver;

        /// Writes the archived form of the field's text into `out`.
        fn resolve_with(field: &InternedStr, resolver: Self::Resolver, out: Place<Self::Archived>) {
            let text: &str = field.as_str();
            ArchivedString::resolve_from_str(text, resolver, out);
        }
    }

    impl<S> SerializeWith<InternedStr, S> for AsInternedStr
    where
        S: Fallible + rkyv::ser::Writer + rkyv::ser::Allocator + ?Sized,
        S::Error: rkyv::rancor::Source,
    {
        /// Serializes the field's text and returns the string resolver.
        fn serialize_with(
            field: &InternedStr,
            serializer: &mut S,
        ) -> Result<Self::Resolver, S::Error> {
            let text: &str = field.as_str();
            ArchivedString::serialize_from_str(text, serializer)
        }
    }

    impl<D> DeserializeWith<ArchivedString, InternedStr, D> for AsInternedStr
    where
        D: Fallible + ?Sized,
    {
        /// Rebuilds an `InternedStr` from the archived text.
        fn deserialize_with(
            field: &ArchivedString,
            _deserializer: &mut D,
        ) -> Result<InternedStr, D::Error> {
            // The archive carries only the text, so the result is a fresh
            // allocation that is not registered with any `StringInterner`.
            let text: &str = field.as_str();
            Ok(InternedStr::from(text))
        }
    }
}
98
99impl Serialize for InternedStr {
100    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
101        self.0.serialize(serializer)
102    }
103}
104
105impl<'de> Deserialize<'de> for InternedStr {
106    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
107        let s = String::deserialize(deserializer)?;
108        Ok(Self::new(s))
109    }
110}
111
112impl PartialOrd for InternedStr {
113    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
114        Some(self.cmp(other))
115    }
116}
117
118impl Ord for InternedStr {
119    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
120        self.0.cmp(&other.0)
121    }
122}
123
124impl InternedStr {
125    /// Create a new interned string (without using an interner).
126    /// Prefer using `StringInterner::intern` for deduplication.
127    pub fn new(s: impl Into<Arc<str>>) -> Self {
128        Self(s.into())
129    }
130
131    /// Get the string slice.
132    pub fn as_str(&self) -> &str {
133        &self.0
134    }
135
136    /// Check if two interned strings share the same allocation.
137    /// This is O(1) pointer comparison.
138    pub fn ptr_eq(&self, other: &Self) -> bool {
139        Arc::ptr_eq(&self.0, &other.0)
140    }
141}
142
143impl PartialEq for InternedStr {
144    fn eq(&self, other: &Self) -> bool {
145        // Fast path: pointer comparison
146        if Arc::ptr_eq(&self.0, &other.0) {
147            return true;
148        }
149        // Slow path: string comparison
150        self.0 == other.0
151    }
152}
153
154impl std::hash::Hash for InternedStr {
155    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
156        self.0.hash(state);
157    }
158}
159
160impl std::fmt::Display for InternedStr {
161    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
162        write!(f, "{}", self.0)
163    }
164}
165
166impl AsRef<str> for InternedStr {
167    fn as_ref(&self) -> &str {
168        &self.0
169    }
170}
171
172impl std::ops::Deref for InternedStr {
173    type Target = str;
174
175    fn deref(&self) -> &Self::Target {
176        &self.0
177    }
178}
179
180impl From<&str> for InternedStr {
181    fn from(s: &str) -> Self {
182        Self::new(s)
183    }
184}
185
186impl From<String> for InternedStr {
187    fn from(s: String) -> Self {
188        Self::new(s)
189    }
190}
191
192impl From<&String> for InternedStr {
193    fn from(s: &String) -> Self {
194        Self::new(s.as_str())
195    }
196}
197
198impl From<&Self> for InternedStr {
199    fn from(s: &Self) -> Self {
200        s.clone()
201    }
202}
203
204impl PartialEq<str> for InternedStr {
205    fn eq(&self, other: &str) -> bool {
206        self.as_str() == other
207    }
208}
209
210impl PartialEq<&str> for InternedStr {
211    fn eq(&self, other: &&str) -> bool {
212        self.as_str() == *other
213    }
214}
215
216impl PartialEq<String> for InternedStr {
217    fn eq(&self, other: &String) -> bool {
218        self.as_str() == other
219    }
220}
221
222impl Default for InternedStr {
223    fn default() -> Self {
224        Self::new("")
225    }
226}
227
228impl std::borrow::Borrow<str> for InternedStr {
229    fn borrow(&self) -> &str {
230        self.as_str()
231    }
232}
233
234/// A string interner that deduplicates strings.
235///
236/// This is useful for reducing memory usage when many strings with the
237/// same content are created, such as account names and currencies in
238/// a large ledger.
239#[derive(Debug, Default)]
240pub struct StringInterner {
241    /// Set of all interned strings.
242    strings: FxHashSet<Arc<str>>,
243}
244
245impl StringInterner {
246    /// Create a new empty interner.
247    pub fn new() -> Self {
248        Self {
249            strings: FxHashSet::default(),
250        }
251    }
252
253    /// Create an interner with pre-allocated capacity.
254    pub fn with_capacity(capacity: usize) -> Self {
255        Self {
256            strings: FxHashSet::with_capacity_and_hasher(capacity, Default::default()),
257        }
258    }
259
260    /// Intern a string.
261    ///
262    /// If the string already exists in the interner, returns a reference
263    /// to the existing copy. Otherwise, stores the string and returns
264    /// a reference to the new copy.
265    pub fn intern(&mut self, s: &str) -> InternedStr {
266        self.intern_with_status(s).0
267    }
268
269    /// Intern a string, also returning whether it was newly inserted.
270    ///
271    /// Equivalent to [`Self::intern`] but exposes the insertion bit
272    /// without a second hash lookup. Useful for dedup-counting passes
273    /// (see `rustledger_loader::dedup`) that previously called
274    /// `contains` then `intern` — a redundant double lookup. Returns
275    /// `(interned, was_new)`.
276    pub fn intern_with_status(&mut self, s: &str) -> (InternedStr, bool) {
277        if let Some(existing) = self.strings.get(s) {
278            (InternedStr(existing.clone()), false)
279        } else {
280            let arc: Arc<str> = s.into();
281            self.strings.insert(arc.clone());
282            (InternedStr(arc), true)
283        }
284    }
285
286    /// Intern a string, taking ownership.
287    pub fn intern_string(&mut self, s: String) -> InternedStr {
288        if let Some(existing) = self.strings.get(s.as_str()) {
289            InternedStr(existing.clone())
290        } else {
291            let arc: Arc<str> = s.into();
292            self.strings.insert(arc.clone());
293            InternedStr(arc)
294        }
295    }
296
297    /// Check if a string is already interned.
298    pub fn contains(&self, s: &str) -> bool {
299        self.strings.contains(s)
300    }
301
302    /// Get the number of unique strings.
303    pub fn len(&self) -> usize {
304        self.strings.len()
305    }
306
307    /// Check if the interner is empty.
308    pub fn is_empty(&self) -> bool {
309        self.strings.is_empty()
310    }
311
312    /// Get an iterator over all interned strings.
313    pub fn iter(&self) -> impl Iterator<Item = &str> {
314        self.strings.iter().map(std::convert::AsRef::as_ref)
315    }
316
317    /// Clear all interned strings.
318    pub fn clear(&mut self) {
319        self.strings.clear();
320    }
321}
322
323/// A specialized interner for account names.
324///
325/// Account names follow a specific pattern (Type:Component:Component)
326/// and this interner can provide additional functionality like
327/// extracting components.
328#[derive(Debug, Default)]
329pub struct AccountInterner {
330    interner: StringInterner,
331}
332
333impl AccountInterner {
334    /// Create a new account interner.
335    pub fn new() -> Self {
336        Self {
337            interner: StringInterner::new(),
338        }
339    }
340
341    /// Intern an account name.
342    pub fn intern(&mut self, account: &str) -> InternedStr {
343        self.interner.intern(account)
344    }
345
346    /// Get the number of unique accounts.
347    pub fn len(&self) -> usize {
348        self.interner.len()
349    }
350
351    /// Check if empty.
352    pub fn is_empty(&self) -> bool {
353        self.interner.is_empty()
354    }
355
356    /// Get all interned accounts.
357    pub fn accounts(&self) -> impl Iterator<Item = &str> {
358        self.interner.iter()
359    }
360
361    /// Get accounts matching a prefix.
362    pub fn accounts_with_prefix<'a>(&'a self, prefix: &'a str) -> impl Iterator<Item = &'a str> {
363        self.interner.iter().filter(move |s| s.starts_with(prefix))
364    }
365}
366
367/// A specialized interner for currency codes.
368///
369/// Currency codes are typically short (3-4 characters) and uppercase.
370#[derive(Debug, Default)]
371pub struct CurrencyInterner {
372    interner: StringInterner,
373}
374
375impl CurrencyInterner {
376    /// Create a new currency interner.
377    pub fn new() -> Self {
378        Self {
379            interner: StringInterner::new(),
380        }
381    }
382
383    /// Intern a currency code.
384    pub fn intern(&mut self, currency: &str) -> InternedStr {
385        self.interner.intern(currency)
386    }
387
388    /// Get the number of unique currencies.
389    pub fn len(&self) -> usize {
390        self.interner.len()
391    }
392
393    /// Check if empty.
394    pub fn is_empty(&self) -> bool {
395        self.interner.is_empty()
396    }
397
398    /// Get all interned currencies.
399    pub fn currencies(&self) -> impl Iterator<Item = &str> {
400        self.interner.iter()
401    }
402}
403
#[cfg(test)]
mod tests {
    use super::*;

    /// Equality is by content, both between separately allocated values and
    /// against plain string types.
    #[test]
    fn test_interned_str_equality() {
        let hello = InternedStr::new("hello");
        let hello_again = InternedStr::new("hello");
        let world = InternedStr::new("world");

        assert_eq!(hello, hello_again);
        assert_ne!(hello, world);
        assert_eq!(hello, "hello");
        assert_eq!(hello, String::from("hello"));
    }

    /// Interning the same text twice yields handles to one allocation.
    #[test]
    fn test_interner_deduplication() {
        let mut interner = StringInterner::new();

        let food = interner.intern("Expenses:Food");
        let food_again = interner.intern("Expenses:Food");
        let bank = interner.intern("Assets:Bank");

        // Same text: one shared allocation.
        assert!(food.ptr_eq(&food_again));

        // Different text: a different allocation.
        assert!(!food.ptr_eq(&bank));

        // Two distinct strings were stored in total.
        assert_eq!(interner.len(), 2);
    }

    /// `contains` only reports strings that were actually interned.
    #[test]
    fn test_interner_contains() {
        let mut interner = StringInterner::new();

        interner.intern("hello");

        assert!(interner.contains("hello"));
        assert!(!interner.contains("world"));
    }

    /// Accounts are counted once each and can be filtered by prefix.
    #[test]
    fn test_account_interner() {
        let mut interner = AccountInterner::new();

        for account in [
            "Expenses:Food:Coffee",
            "Expenses:Food:Groceries",
            "Assets:Bank:Checking",
        ] {
            interner.intern(account);
        }

        assert_eq!(interner.len(), 3);

        let expense_accounts = interner.accounts_with_prefix("Expenses:").count();
        assert_eq!(expense_accounts, 2);
    }

    /// Currency codes are deduplicated like any other interned string.
    #[test]
    fn test_currency_interner() {
        let mut interner = CurrencyInterner::new();

        let usd = interner.intern("USD");
        let usd_again = interner.intern("USD");
        let eur = interner.intern("EUR");

        assert!(usd.ptr_eq(&usd_again));
        assert!(!usd.ptr_eq(&eur));
        assert_eq!(interner.len(), 2);
    }

    /// Separately allocated values with equal text hash identically.
    #[test]
    fn test_interned_str_hash() {
        use std::collections::HashMap;

        let stored_key = InternedStr::new("key");
        let lookup_key = InternedStr::new("key");

        let mut map = HashMap::new();
        map.insert(stored_key, 1);

        // The lookup key must land on the entry inserted under the stored key.
        assert_eq!(map.get(&lookup_key), Some(&1));
    }
}
488
489// rkyv wrapper for rust_decimal::Decimal - serialize as fixed 16 bytes
490#[cfg(feature = "rkyv")]
491pub use rkyv_decimal::AsDecimal;
492
#[cfg(feature = "rkyv")]
mod rkyv_decimal {
    use rkyv::Place;
    use rkyv::rancor::Fallible;
    use rkyv::with::{ArchiveWith, DeserializeWith, SerializeWith};
    use rust_decimal::Decimal;

    /// Size in bytes of the binary form produced by `Decimal::serialize`.
    const DECIMAL_BYTES: usize = 16;

    /// rkyv `with`-wrapper that archives a `Decimal` as its fixed 16-byte
    /// binary form instead of a string.
    pub struct AsDecimal;

    impl ArchiveWith<Decimal> for AsDecimal {
        type Archived = [u8; DECIMAL_BYTES];
        type Resolver = [(); DECIMAL_BYTES];

        /// Writes the decimal's 16 bytes straight into the archive slot.
        fn resolve_with(field: &Decimal, resolver: Self::Resolver, out: Place<Self::Archived>) {
            // Go through rkyv's own `Archive` impl for byte arrays so the
            // write into `out` stays in safe code.
            let raw: [u8; DECIMAL_BYTES] = field.serialize();
            rkyv::Archive::resolve(&raw, resolver, out);
        }
    }

    impl<S> SerializeWith<Decimal, S> for AsDecimal
    where
        S: Fallible + ?Sized,
    {
        /// Produces the unit resolver; nothing is written to the serializer.
        fn serialize_with(
            _field: &Decimal,
            _serializer: &mut S,
        ) -> Result<Self::Resolver, S::Error> {
            // The bytes are stored inline by `resolve_with`, so there is no
            // out-of-line data to emit here.
            Ok([(); DECIMAL_BYTES])
        }
    }

    impl<D> DeserializeWith<[u8; DECIMAL_BYTES], Decimal, D> for AsDecimal
    where
        D: Fallible + ?Sized,
    {
        /// Rebuilds the `Decimal` from its archived 16 bytes.
        fn deserialize_with(
            field: &[u8; DECIMAL_BYTES],
            _deserializer: &mut D,
        ) -> Result<Decimal, D::Error> {
            let raw: [u8; DECIMAL_BYTES] = *field;
            Ok(Decimal::deserialize(raw))
        }
    }
}
537
// rkyv wrapper for NaiveDate - serialize as i32 (days since the Unix epoch, 1970-01-01)
539#[cfg(feature = "rkyv")]
540pub use rkyv_date::AsNaiveDate;
541
#[cfg(feature = "rkyv")]
mod rkyv_date {
    use crate::NaiveDate;
    use rkyv::Place;
    use rkyv::rancor::Fallible;
    use rkyv::with::{ArchiveWith, DeserializeWith, SerializeWith};

    /// rkyv `with`-wrapper that archives a `NaiveDate` as an `i32` count of
    /// days since the Unix epoch (1970-01-01).
    ///
    /// That is 4 bytes per date rather than the 10+ a textual date needs.
    pub struct AsNaiveDate;

    /// Day zero of the archived representation.
    const UNIX_EPOCH: NaiveDate = jiff::civil::date(1970, 1, 1);

    /// Signed number of days from `UNIX_EPOCH` to `date`.
    ///
    /// If the span cannot be computed, the default (zero) span is used, so
    /// the date is archived as the epoch itself.
    fn days_since_epoch(date: &NaiveDate) -> i32 {
        let elapsed = date.since(UNIX_EPOCH).unwrap_or_default();
        elapsed.get_days()
    }

    impl ArchiveWith<NaiveDate> for AsNaiveDate {
        type Archived = rkyv::Archived<i32>;
        type Resolver = ();

        /// Writes the day offset of `field` into the archive slot.
        fn resolve_with(field: &NaiveDate, _resolver: Self::Resolver, out: Place<Self::Archived>) {
            let day_offset = days_since_epoch(field);
            rkyv::Archive::resolve(&day_offset, (), out);
        }
    }

    impl<S> SerializeWith<NaiveDate, S> for AsNaiveDate
    where
        S: Fallible + ?Sized,
    {
        /// Nothing is written to the serializer; the day count is stored
        /// inline by `resolve_with`.
        fn serialize_with(
            _field: &NaiveDate,
            _serializer: &mut S,
        ) -> Result<Self::Resolver, S::Error> {
            Ok(())
        }
    }

    impl<D> DeserializeWith<rkyv::Archived<i32>, NaiveDate, D> for AsNaiveDate
    where
        D: Fallible + ?Sized,
    {
        /// Rebuilds the date by adding the archived day offset to the epoch.
        ///
        /// # Panics
        ///
        /// Panics if the offset does not land on a representable date.
        fn deserialize_with(
            field: &rkyv::Archived<i32>,
            _deserializer: &mut D,
        ) -> Result<NaiveDate, D::Error> {
            let day_offset = i64::from(field.to_native());
            let offset_span = jiff::Span::new().days(day_offset);
            let date = UNIX_EPOCH.checked_add(offset_span).expect("valid date");
            Ok(date)
        }
    }
}