serde_intern/lib.rs
1// SPDX-License-Identifier: BSD-2-Clause-Patent OR MIT OR Apache-2.0
2#![deny(missing_docs)]
3//! A [Serde](https://serde.rs) addon that allows *interning* of strings and
4//! byte sequences behind `Arc`s during deserialization.
5//!
6//! Unlike the stock `Rc` / `Arc` deserialization available in the main Serde
7//! crate, these custom deserializer functions **will find duplicate values**
8//! and instead of wrapping each of them into an individual `Arc` it **will
9//! reuse the existing arcs**.
10//!
11//! ## Example
12//!
13//! ```rust
14//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
15//! # use std::sync::Arc;
16//! # use serde_derive::Deserialize;
17//! use serde_intern::{clear_arc_cache, intern_arc_str};
18//!
19//! #[derive(Deserialize)]
20//! struct Person {
21//! // add a custom deserializer hook
22//! #[serde(deserialize_with="intern_arc_str")]
23//! name: Arc<str>,
24//! }
25//!
26//! // when deserializing:
27//! let json = r#"[
28//! { "name": "Yenna" },
29//! { "name": "Yenna" },
30//! { "name": "Yenna" }
31//! ]"#;
32//! let people: Vec<Person> = serde_json::from_str(json)?;
33//!
34//! // All structs share the same text sequence "Yenna" through reference
35//! // counting. There's an extra reference used by internal lookup table.
36//! let first = &people[0];
37//! assert_eq!(Arc::strong_count(&first.name), 4);
38//!
39//! // This function clears up the lookup table.
40//! clear_arc_cache();
41//! assert_eq!(Arc::strong_count(&first.name), 3);
42//! # Ok(())
43//! # }
44//! ```
45//!
46//! Currently `serde-intern` supports string slices and slices of bytes.
47//! More types can be added later.
48//!
49//! ### Note:
50//!
51//! While this library allows sharing a common data storage across multiple
52//! deserialized entities **it is NOT a Zero-copy**.
53//! The first time a new sequence is encountered it is copied to the newly
54//! created heap region administered by `Arc`.
55//! To avoid copying data and instead refer to text sequences in the underlying
56//! buffer you should use Serde's built-in `borrow` deserializer attribute
57//! instead:
58//!
59//! ```rust
60//! # fn main() {
61//! # use std::borrow::Cow;
62//! # use serde_derive::Deserialize;
63//!
64//! #[derive(Deserialize)]
65//! struct Person<'storage> {
66//! #[serde(borrow)]
67//! name: Cow<'storage, str>,
68//! }
69//! # }
70//! ```
71//! Note that in this case the deserialized struct needs to keep the raw data
72//! in memory, as denoted by `'storage` lifetime annotation.
73//!
74//! `serde-intern` lets you drop the underlying buffer at a cost of a single
75//! copy.
76//!
77//! ## Implementation details
78//!
79//! To track the previously observed string slices and compare them with
80//! a currently deserializing slice the library maintains a lookup table.
81//! Its memory overhead is fairy small: it's a `HashMap<u64, Arc<str>>`, so
82//! each entity is a pair of `(u64, usize)` behind the scenes.
83//! We use string hashes for keys to avoid extra memory overhead and not to
84//! force storing string references for a long time.
85//! In case of a hash collision the library will wrap the string into
86//! a separate new `Arc`.
87//!
88//! To speed things up we use a non-standard fast hash function from
89//! [`rustc-hash`](https://docs.rs/rustc-hash/2.0.0/rustc_hash/) crate.
90//! The lookup table is stored as a thread-local to avoid synchronizations.
91//! While the overhead is minimal, the library does offer [`clear_arc_cache`]
92//! hook to clear up lookup tables.
93
94use std::{
95 cell::RefCell, collections::HashMap, error::Error, hash::Hasher, sync::Arc,
96};
97
98use rustc_hash::FxHasher;
99use serde::{de::Visitor, Deserializer};
100
101thread_local! {
102 static INTERNED_U8S: RefCell<HashMap<u64, Arc<[u8]>>>
103 = RefCell::new(HashMap::new());
104 static INTERNED_STRINGS: RefCell<HashMap<u64, Arc<str>>>
105 = RefCell::new(HashMap::new());
106}
107
108struct ArcU8sVisitor {}
109
110impl<'de> Visitor<'de> for ArcU8sVisitor {
111 type Value = Arc<[u8]>;
112
113 fn expecting(
114 &self,
115 formatter: &mut std::fmt::Formatter,
116 ) -> std::fmt::Result {
117 write!(formatter, "Expected a slice of bytes")
118 }
119
120 fn visit_bytes<E>(self, buffer: &[u8]) -> Result<Self::Value, E>
121 where
122 E: Error,
123 {
124 let hash = quick_hash(buffer);
125 INTERNED_U8S.with_borrow_mut(
126 |lookup_table: &mut HashMap<_, Arc<[u8]>>| {
127 lookup_table
128 .entry(hash)
129 .or_insert_with(|| Arc::from(buffer));
130 match lookup_table.get(&hash) {
131 Some(arc) if arc.as_ref() == buffer => Ok(arc.clone()),
132 _ => Ok(Arc::from(buffer)),
133 }
134 },
135 )
136 }
137}
138
139/// A Serde deserializer hook that allows multiple structs to share the same
140/// slice of data between them.
141///
142/// # Errors
143///
144/// This function will return an error if you place it in an attribute
145/// of a field that is not an `Arc<[u8]>`
146pub fn intern_arc_u8s<'de, D>(deserializer: D) -> Result<Arc<[u8]>, D::Error>
147where
148 D: Deserializer<'de>,
149{
150 deserializer.deserialize_str(ArcU8sVisitor {})
151}
152
153struct ArcStrVisitor {}
154
155impl<'de> Visitor<'de> for ArcStrVisitor {
156 type Value = Arc<str>;
157
158 fn expecting(
159 &self,
160 formatter: &mut std::fmt::Formatter,
161 ) -> std::fmt::Result {
162 write!(formatter, "Expected a string")
163 }
164
165 fn visit_str<E>(self, buffer: &str) -> Result<Self::Value, E>
166 where
167 E: Error,
168 {
169 let hash = quick_hash(buffer.as_bytes());
170 INTERNED_STRINGS.with_borrow_mut(
171 |lookup_table: &mut HashMap<_, Arc<str>>| {
172 lookup_table
173 .entry(hash)
174 .or_insert_with(|| Arc::from(buffer));
175 match lookup_table.get(&hash) {
176 Some(arc) if arc.as_ref() == buffer => Ok(arc.clone()),
177 _ => Ok(Arc::from(buffer)),
178 }
179 },
180 )
181 }
182}
183
184/// A Serde deserializer hook that allows multiple structs to share the same
185/// string slice between them.
186///
187/// # Errors
188///
189/// This function will return an error if you place it in an attribute
190/// of a field that is not an `Arc<str>`
191pub fn intern_arc_str<'de, D>(deserializer: D) -> Result<Arc<str>, D::Error>
192where
193 D: Deserializer<'de>,
194{
195 deserializer.deserialize_str(ArcStrVisitor {})
196}
197
198/// This function will clear up lookup tables for the current-thread only!
199///
200/// You can either call it after every call to [`serde_json::from_str`][1]
201/// or a similar function for other formats, or you can keep the tables alive
202/// for longer periods.
203///
204/// Note that after the tables are cleared there is no way to intern previously
205/// observed slices, so extra copies will be created again.
206///
207/// [1]: https://docs.rs/serde_json/1.0.122/serde_json/fn.from_str.html
208pub fn clear_arc_cache() {
209 INTERNED_U8S.with_borrow_mut(|lookup_table: &mut HashMap<_, _>| {
210 lookup_table.clear()
211 });
212 INTERNED_STRINGS.with_borrow_mut(|lookup_table: &mut HashMap<_, _>| {
213 lookup_table.clear()
214 });
215}
216
217fn quick_hash(data: &[u8]) -> u64 {
218 let mut hasher = FxHasher::default();
219 hasher.write(data);
220 hasher.finish()
221}
222
223#[cfg(test)]
224mod tests {
225 use super::*;
226
227 #[test]
228 fn deserialize_strings() {
229 #[derive(serde_derive::Deserialize)]
230 struct Person {
231 #[serde(deserialize_with = "intern_arc_str")]
232 name: Arc<str>,
233 }
234
235 let json = r#"
236 [
237 { "name": "Yenna" },
238 { "name": "Yenna" },
239 { "name": "Yenna" }
240 ]
241 "#;
242
243 let people: Vec<Person> = serde_json::from_str(json).unwrap();
244 let first = &people[0];
245 assert_eq!(Arc::strong_count(&first.name), 4);
246 clear_arc_cache();
247 assert_eq!(Arc::strong_count(&first.name), 3);
248 }
249}