serde_intern/
lib.rs

1// SPDX-License-Identifier: BSD-2-Clause-Patent OR MIT OR Apache-2.0
2#![deny(missing_docs)]
3//! A [Serde](https://serde.rs) addon that allows *interning* of strings and
4//! byte sequences behind `Arc`s during deserialization.
5//!
6//! Unlike the stock `Rc` / `Arc` deserialization available in the main Serde
7//! crate, these custom deserializer functions **will find duplicate values**
8//! and instead of wrapping each of them into an individual `Arc` it **will
9//! reuse the existing arcs**.
10//!
11//! ## Example
12//!
13//! ```rust
14//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
15//! # use std::sync::Arc;
16//! # use serde_derive::Deserialize;
17//! use serde_intern::{clear_arc_cache, intern_arc_str};
18//!
19//! #[derive(Deserialize)]
20//! struct Person {
21//!     // add a custom deserializer hook
22//!     #[serde(deserialize_with="intern_arc_str")]
23//!     name: Arc<str>,
24//! }
25//!
26//! // when deserializing:
27//! let json = r#"[
28//!     { "name": "Yenna" },
29//!     { "name": "Yenna" },
30//!     { "name": "Yenna" }
31//! ]"#;
32//! let people: Vec<Person> = serde_json::from_str(json)?;
33//!
34//! // All structs share the same text sequence "Yenna" through reference
35//! // counting. There's an extra reference used by internal lookup table.
36//! let first = &people[0];
37//! assert_eq!(Arc::strong_count(&first.name), 4);
38//!
39//! // This function clears up the lookup table.
40//! clear_arc_cache();
41//! assert_eq!(Arc::strong_count(&first.name), 3);
42//! # Ok(())
43//! # }
44//! ```
45//!
46//! Currently `serde-intern` supports string slices and slices of bytes.
47//! More types can be added later.
48//!
49//! ### Note:
50//!
51//! While this library allows sharing a common data storage across multiple
52//! deserialized entities **it is NOT a Zero-copy**.
53//! The first time a new sequence is encountered it is copied to the newly
54//! created heap region administered by `Arc`.
55//! To avoid copying data and instead refer to text sequences in the underlying
56//! buffer you should use Serde's built-in `borrow` deserializer attribute
57//! instead:
58//!
59//! ```rust
60//! # fn main() {
61//! # use std::borrow::Cow;
62//! # use serde_derive::Deserialize;
63//!
64//! #[derive(Deserialize)]
65//! struct Person<'storage> {
66//!     #[serde(borrow)]
67//!     name: Cow<'storage, str>,
68//! }
69//! # }
70//! ```
71//! Note that in this case the deserialized struct needs to keep the raw data
72//! in memory, as denoted by `'storage` lifetime annotation.
73//!
74//! `serde-intern` lets you drop the underlying buffer at a cost of a single
75//! copy.
76//!
77//! ## Implementation details
78//!
79//! To track the previously observed string slices and compare them with
80//! a currently deserializing slice the library maintains a lookup table.
81//! Its memory overhead is fairy small: it's a `HashMap<u64, Arc<str>>`, so
82//! each entity is a pair of `(u64, usize)` behind the scenes.
83//! We use string hashes  for keys to avoid extra memory overhead and not to
84//! force storing string references for a long time.
85//! In case of a hash collision the library will wrap the string into
86//! a separate new `Arc`.
87//!
88//! To speed things up we use a non-standard fast hash function from
89//! [`rustc-hash`](https://docs.rs/rustc-hash/2.0.0/rustc_hash/) crate.
90//! The lookup table is stored as a thread-local to avoid synchronizations.
91//! While the overhead is minimal, the library does offer [`clear_arc_cache`]
92//! hook to clear up lookup tables.
93
94use std::{
95    cell::RefCell, collections::HashMap, error::Error, hash::Hasher, sync::Arc,
96};
97
98use rustc_hash::FxHasher;
99use serde::{de::Visitor, Deserializer};
100
101thread_local! {
102    static INTERNED_U8S: RefCell<HashMap<u64, Arc<[u8]>>>
103        = RefCell::new(HashMap::new());
104    static INTERNED_STRINGS: RefCell<HashMap<u64, Arc<str>>>
105        = RefCell::new(HashMap::new());
106}
107
108struct ArcU8sVisitor {}
109
110impl<'de> Visitor<'de> for ArcU8sVisitor {
111    type Value = Arc<[u8]>;
112
113    fn expecting(
114        &self,
115        formatter: &mut std::fmt::Formatter,
116    ) -> std::fmt::Result {
117        write!(formatter, "Expected a slice of bytes")
118    }
119
120    fn visit_bytes<E>(self, buffer: &[u8]) -> Result<Self::Value, E>
121    where
122        E: Error,
123    {
124        let hash = quick_hash(buffer);
125        INTERNED_U8S.with_borrow_mut(
126            |lookup_table: &mut HashMap<_, Arc<[u8]>>| {
127                lookup_table
128                    .entry(hash)
129                    .or_insert_with(|| Arc::from(buffer));
130                match lookup_table.get(&hash) {
131                    Some(arc) if arc.as_ref() == buffer => Ok(arc.clone()),
132                    _ => Ok(Arc::from(buffer)),
133                }
134            },
135        )
136    }
137}
138
139/// A Serde deserializer hook that allows multiple structs to share the same
140/// slice of data between them.
141///
142/// # Errors
143///
144/// This function will return an error if you place it in an attribute
145/// of a field that is not an `Arc<[u8]>`
146pub fn intern_arc_u8s<'de, D>(deserializer: D) -> Result<Arc<[u8]>, D::Error>
147where
148    D: Deserializer<'de>,
149{
150    deserializer.deserialize_str(ArcU8sVisitor {})
151}
152
153struct ArcStrVisitor {}
154
155impl<'de> Visitor<'de> for ArcStrVisitor {
156    type Value = Arc<str>;
157
158    fn expecting(
159        &self,
160        formatter: &mut std::fmt::Formatter,
161    ) -> std::fmt::Result {
162        write!(formatter, "Expected a string")
163    }
164
165    fn visit_str<E>(self, buffer: &str) -> Result<Self::Value, E>
166    where
167        E: Error,
168    {
169        let hash = quick_hash(buffer.as_bytes());
170        INTERNED_STRINGS.with_borrow_mut(
171            |lookup_table: &mut HashMap<_, Arc<str>>| {
172                lookup_table
173                    .entry(hash)
174                    .or_insert_with(|| Arc::from(buffer));
175                match lookup_table.get(&hash) {
176                    Some(arc) if arc.as_ref() == buffer => Ok(arc.clone()),
177                    _ => Ok(Arc::from(buffer)),
178                }
179            },
180        )
181    }
182}
183
184/// A Serde deserializer hook that allows multiple structs to share the same
185/// string slice between them.
186///
187/// # Errors
188///
189/// This function will return an error if you place it in an attribute
190/// of a field that is not an `Arc<str>`
191pub fn intern_arc_str<'de, D>(deserializer: D) -> Result<Arc<str>, D::Error>
192where
193    D: Deserializer<'de>,
194{
195    deserializer.deserialize_str(ArcStrVisitor {})
196}
197
198/// This function will clear up lookup tables for the current-thread only!
199///
200/// You can either call it after every call to [`serde_json::from_str`][1]
201/// or a similar function for other formats, or you can keep the tables alive
202/// for longer periods.
203///
204/// Note that after the tables are cleared there is no way to intern previously
205/// observed slices, so extra copies will be created again.
206///
207/// [1]: https://docs.rs/serde_json/1.0.122/serde_json/fn.from_str.html
208pub fn clear_arc_cache() {
209    INTERNED_U8S.with_borrow_mut(|lookup_table: &mut HashMap<_, _>| {
210        lookup_table.clear()
211    });
212    INTERNED_STRINGS.with_borrow_mut(|lookup_table: &mut HashMap<_, _>| {
213        lookup_table.clear()
214    });
215}
216
217fn quick_hash(data: &[u8]) -> u64 {
218    let mut hasher = FxHasher::default();
219    hasher.write(data);
220    hasher.finish()
221}
222
223#[cfg(test)]
224mod tests {
225    use super::*;
226
227    #[test]
228    fn deserialize_strings() {
229        #[derive(serde_derive::Deserialize)]
230        struct Person {
231            #[serde(deserialize_with = "intern_arc_str")]
232            name: Arc<str>,
233        }
234
235        let json = r#"
236            [
237                { "name": "Yenna" },
238                { "name": "Yenna" },
239                { "name": "Yenna" }
240            ]
241        "#;
242
243        let people: Vec<Person> = serde_json::from_str(json).unwrap();
244        let first = &people[0];
245        assert_eq!(Arc::strong_count(&first.name), 4);
246        clear_arc_cache();
247        assert_eq!(Arc::strong_count(&first.name), 3);
248    }
249}