serde-intern 1.0.0

A Serde addon that allows interning of strings and byte sequences behind `Arc`s during deserialization.
Documentation
// SPDX-License-Identifier: BSD-2-Clause-Patent OR MIT OR Apache-2.0
#![deny(missing_docs)]
//! A [Serde](https://serde.rs) addon that allows *interning* of strings and
//! byte sequences behind `Arc`s during deserialization.
//!
//! Unlike the stock `Rc` / `Arc` deserialization available in the main Serde
//! crate, these custom deserializer functions **will find duplicate values**
//! and instead of wrapping each of them into an individual `Arc` it **will
//! reuse the existing arcs**.
//!
//! ## Example
//!
//! ```rust
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! # use std::sync::Arc;
//! # use serde_derive::Deserialize;
//! use serde_intern::{clear_arc_cache, intern_arc_str};
//!
//! #[derive(Deserialize)]
//! struct Person {
//!     // add a custom deserializer hook
//!     #[serde(deserialize_with="intern_arc_str")]
//!     name: Arc<str>,
//! }
//!
//! // when deserializing:
//! let json = r#"[
//!     { "name": "Yenna" },
//!     { "name": "Yenna" },
//!     { "name": "Yenna" }
//! ]"#;
//! let people: Vec<Person> = serde_json::from_str(json)?;
//!
//! // All structs share the same text sequence "Yenna" through reference
//! // counting. There's an extra reference used by internal lookup table.
//! let first = &people[0];
//! assert_eq!(Arc::strong_count(&first.name), 4);
//!
//! // This function clears up the lookup table.
//! clear_arc_cache();
//! assert_eq!(Arc::strong_count(&first.name), 3);
//! # Ok(())
//! # }
//! ```
//!
//! Currently `serde-intern` supports string slices and slices of bytes.
//! More types can be added later.
//!
//! ### Note:
//!
//! While this library allows sharing a common data storage across multiple
//! deserialized entities, **it is NOT zero-copy**.
//! The first time a new sequence is encountered, it is copied to a newly
//! created heap region managed by an `Arc`.
//! To avoid copying data and instead refer to text sequences in the underlying
//! buffer you should use Serde's built-in `borrow` deserializer attribute
//! instead:
//!
//! ```rust
//! # fn main() {
//! # use std::borrow::Cow;
//! # use serde_derive::Deserialize;
//!
//! #[derive(Deserialize)]
//! struct Person<'storage> {
//!     #[serde(borrow)]
//!     name: Cow<'storage, str>,
//! }
//! # }
//! ```
//! Note that in this case the deserialized struct needs to keep the raw data
//! in memory, as denoted by `'storage` lifetime annotation.
//!
//! `serde-intern` lets you drop the underlying buffer at a cost of a single
//! copy.
//!
//! ## Implementation details
//!
//! To track the previously observed string slices and compare them with
//! a currently deserializing slice the library maintains a lookup table.
//! Its memory overhead is fairly small: it's a `HashMap<u64, Arc<str>>`, so
//! each entry is just a `u64` hash paired with an `Arc` pointer behind the
//! scenes.
//! We use string hashes for keys to avoid extra memory overhead and to avoid
//! holding on to string references for a long time.
//! In case of a hash collision the library will wrap the string into
//! a separate new `Arc`.
//!
//! To speed things up we use a non-standard fast hash function from
//! [`rustc-hash`](https://docs.rs/rustc-hash/2.0.0/rustc_hash/) crate.
//! The lookup table is stored as a thread-local to avoid synchronizations.
//! While the overhead is minimal, the library does offer a
//! [`clear_arc_cache`] hook to clear the lookup tables.

use std::{
    cell::RefCell, collections::HashMap, error::Error, hash::Hasher, sync::Arc,
};

use rustc_hash::FxHasher;
use serde::{de::Visitor, Deserializer};

thread_local! {
    // Per-thread lookup table of interned byte slices. The key is the
    // 64-bit hash of the slice contents (see `quick_hash`), the value is
    // the shared `Arc` handed out to every field with equal contents.
    static INTERNED_U8S: RefCell<HashMap<u64, Arc<[u8]>>>
        = RefCell::new(HashMap::new());
    // Per-thread lookup table of interned strings, keyed the same way by
    // the hash of the string's UTF-8 bytes.
    static INTERNED_STRINGS: RefCell<HashMap<u64, Arc<str>>>
        = RefCell::new(HashMap::new());
}

/// Serde visitor that turns a byte slice into an interned `Arc<[u8]>`.
///
/// Equal byte sequences seen on the same thread resolve to clones of one
/// shared `Arc` stored in the `INTERNED_U8S` lookup table.
struct ArcU8sVisitor {}

impl<'de> Visitor<'de> for ArcU8sVisitor {
    type Value = Arc<[u8]>;

    /// Describes the expected input.
    ///
    /// Serde prefixes this text with "expected ", so it is phrased as
    /// a lowercase noun phrase.
    fn expecting(
        &self,
        formatter: &mut std::fmt::Formatter,
    ) -> std::fmt::Result {
        formatter.write_str("a slice of bytes")
    }

    /// Returns the interned `Arc` for `buffer`, creating and caching it the
    /// first time this byte sequence is seen on the current thread.
    ///
    /// If another sequence with the same hash already occupies the table
    /// slot (a hash collision), `buffer` is wrapped into a fresh, uncached
    /// `Arc` and the table is left untouched.
    fn visit_bytes<E>(self, buffer: &[u8]) -> Result<Self::Value, E>
    where
        E: Error,
    {
        let hash = quick_hash(buffer);
        INTERNED_U8S.with_borrow_mut(
            |lookup_table: &mut HashMap<u64, Arc<[u8]>>| {
                // A single probe: either finds the cached arc or stores
                // a new one for this hash.
                let cached = lookup_table
                    .entry(hash)
                    .or_insert_with(|| Arc::from(buffer));
                if **cached == *buffer {
                    Ok(Arc::clone(cached))
                } else {
                    // Hash collision: the slot holds different content.
                    Ok(Arc::from(buffer))
                }
            },
        )
    }
}

/// A Serde deserializer hook that allows multiple structs to share the same
/// slice of bytes between them.
///
/// Use it on an `Arc<[u8]>` field via
/// `#[serde(deserialize_with = "intern_arc_u8s")]`.
///
/// # Errors
///
/// Returns the deserializer's error if the input at this position cannot be
/// presented to the visitor as a sequence of bytes.
pub fn intern_arc_u8s<'de, D>(deserializer: D) -> Result<Arc<[u8]>, D::Error>
where
    D: Deserializer<'de>,
{
    // The visitor only implements `visit_bytes`, so the deserializer must be
    // hinted for bytes. A string hint would route into the visitor's default
    // `visit_str`, which rejects the input.
    deserializer.deserialize_bytes(ArcU8sVisitor {})
}

/// Serde visitor that turns a string slice into an interned `Arc<str>`.
///
/// Equal strings seen on the same thread resolve to clones of one shared
/// `Arc` stored in the `INTERNED_STRINGS` lookup table.
struct ArcStrVisitor {}

impl<'de> Visitor<'de> for ArcStrVisitor {
    type Value = Arc<str>;

    /// Describes the expected input.
    ///
    /// Serde prefixes this text with "expected ", so it is phrased as
    /// a lowercase noun phrase.
    fn expecting(
        &self,
        formatter: &mut std::fmt::Formatter,
    ) -> std::fmt::Result {
        formatter.write_str("a string")
    }

    /// Returns the interned `Arc` for `buffer`, creating and caching it the
    /// first time this string is seen on the current thread.
    ///
    /// If another string with the same hash already occupies the table slot
    /// (a hash collision), `buffer` is wrapped into a fresh, uncached `Arc`
    /// and the table is left untouched.
    fn visit_str<E>(self, buffer: &str) -> Result<Self::Value, E>
    where
        E: Error,
    {
        let hash = quick_hash(buffer.as_bytes());
        INTERNED_STRINGS.with_borrow_mut(
            |lookup_table: &mut HashMap<u64, Arc<str>>| {
                // A single probe: either finds the cached arc or stores
                // a new one for this hash.
                let cached = lookup_table
                    .entry(hash)
                    .or_insert_with(|| Arc::from(buffer));
                if **cached == *buffer {
                    Ok(Arc::clone(cached))
                } else {
                    // Hash collision: the slot holds a different string.
                    Ok(Arc::from(buffer))
                }
            },
        )
    }
}

/// Deserializes a string into an `Arc<str>` that is shared with every other
/// equal string interned on the current thread.
///
/// Use it on an `Arc<str>` field via
/// `#[serde(deserialize_with = "intern_arc_str")]`.
///
/// # Errors
///
/// Returns the deserializer's error if the input at this position is not
/// a string.
pub fn intern_arc_str<'de, D>(deserializer: D) -> Result<Arc<str>, D::Error>
where
    D: Deserializer<'de>,
{
    let visitor = ArcStrVisitor {};
    deserializer.deserialize_str(visitor)
}

/// Empties both lookup tables (bytes and strings) of the **current thread
/// only**.
///
/// You may call it after every [`serde_json::from_str`][1] (or the
/// equivalent entry point of another format), or keep the tables alive for
/// longer periods to intern across several inputs.
///
/// Once the tables are cleared, previously observed slices are forgotten:
/// encountering them again creates new copies.
///
/// [1]: https://docs.rs/serde_json/1.0.122/serde_json/fn.from_str.html
pub fn clear_arc_cache() {
    INTERNED_U8S.with_borrow_mut(HashMap::clear);
    INTERNED_STRINGS.with_borrow_mut(HashMap::clear);
}

/// Computes a 64-bit hash of `data` using a default-constructed `FxHasher`.
fn quick_hash(data: &[u8]) -> u64 {
    let mut state = FxHasher::default();
    Hasher::write(&mut state, data);
    Hasher::finish(&state)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Three records carrying the same name must share one `Arc<str>`.
    #[test]
    fn deserialize_strings() {
        #[derive(serde_derive::Deserialize)]
        struct Person {
            #[serde(deserialize_with = "intern_arc_str")]
            name: Arc<str>,
        }

        let json = r#"
            [
                { "name": "Yenna" },
                { "name": "Yenna" },
                { "name": "Yenna" }
            ]
        "#;

        let people: Vec<Person> = serde_json::from_str(json).unwrap();
        let shared_name = &people[0].name;

        // One reference per struct plus the one held by the lookup table.
        assert_eq!(Arc::strong_count(shared_name), 4);

        // Clearing the table drops only the table's own reference.
        clear_arc_cache();
        assert_eq!(Arc::strong_count(shared_name), 3);
    }
}