tinymist_std/adt/
fmap.rs

1//! A map that shards items by their fingerprint.
2
3use std::{collections::HashMap, num::NonZeroU32};
4
5use crate::hash::Fingerprint;
6
7/// A global upper bound on the shard size.
8/// If there are too many shards, the memory overhead is unacceptable.
9const MAX_SHARD_SIZE: u32 = 512;
10
11/// Returns a read-only default shard size.
12fn default_shard_size() -> NonZeroU32 {
13    static ITEM_SHARD_SIZE: std::sync::OnceLock<NonZeroU32> = std::sync::OnceLock::new();
14
15    /// By testing, we found that the optimal shard size is 2 * number of
16    /// threads.
17    fn determine_default_shard_size() -> NonZeroU32 {
18        // This detection is from rayon.
19        let thread_cnt = {
20            std::thread::available_parallelism()
21                .map(|n| n.get())
22                .unwrap_or(1)
23        };
24
25        // A valid shard size is a power of two.
26        let size = (thread_cnt.next_power_of_two() * 2) as u32;
27        // Perform early non-zero check to avoid panics.
28        NonZeroU32::new(size.min(MAX_SHARD_SIZE)).unwrap()
29    }
30
31    *ITEM_SHARD_SIZE.get_or_init(determine_default_shard_size)
32}
33
34type FMapBase<V> = parking_lot::RwLock<HashMap<Fingerprint, V>>;
35
36/// A map that shards items by their fingerprint. This is faster
37/// than the dashmap in some cases.
38///
39/// It is fast since a fingerprint could split items into different shards
40/// efficiently.
41///
42/// Note: If a fingerprint is not calculated from a hash function, it is not
43/// guaranteed that the fingerprint is evenly distributed. Thus, in that case,
44/// the performance of this map is not guaranteed.
45pub struct FingerprintMap<V> {
46    mask: u32,
47    shards: Vec<parking_lot::RwLock<HashMap<Fingerprint, V>>>,
48}
49
50impl<V> Default for FingerprintMap<V> {
51    fn default() -> Self {
52        Self::new(default_shard_size())
53    }
54}
55
56impl<V> FingerprintMap<V> {
57    /// Creates a new `FingerprintMap` with the given shard size.
58    pub fn new(shard_size: NonZeroU32) -> Self {
59        let shard_size = shard_size.get().next_power_of_two();
60        let shard_size = shard_size.min(MAX_SHARD_SIZE);
61
62        assert!(
63            shard_size.is_power_of_two(),
64            "shard size must be a power of two"
65        );
66        assert!(shard_size > 0, "shard size must be greater than zero");
67        Self {
68            mask: shard_size - 1,
69            shards: (0..shard_size)
70                .map(|_| parking_lot::RwLock::new(HashMap::new()))
71                .collect(),
72        }
73    }
74
75    /// Iterates over all items in the map.
76    pub fn into_items(self) -> impl Iterator<Item = (Fingerprint, V)> {
77        self.shards
78            .into_iter()
79            .flat_map(|shard| shard.into_inner().into_iter())
80    }
81
82    /// Gets the shard
83    pub fn shard(&self, fg: Fingerprint) -> &FMapBase<V> {
84        let shards = &self.shards;
85        let route_idx = (fg.lower32() & self.mask) as usize;
86
87        // check that the route index is within the bounds of the shards
88        debug_assert!(route_idx < shards.len());
89        // SAFETY: `fg` is a valid index into `shards`, as shards size is never changed
90        // and mask is always a power of two.
91        unsafe { shards.get_unchecked(route_idx) }
92    }
93
94    /// Gets the mutable shard slice useful for parallel iteration
95    pub fn as_mut_slice(&mut self) -> &mut [FMapBase<V>] {
96        &mut self.shards
97    }
98
99    /// Checks if the map is empty.
100    pub fn contains_key(&self, fg: &Fingerprint) -> bool {
101        self.shard(*fg).read().contains_key(fg)
102    }
103}
104
105#[cfg(test)]
106mod tests {
107    #[test]
108    fn test_default_shard_size() {
109        let size = super::default_shard_size().get();
110
111        eprintln!("size = {size}");
112
113        assert!(size > 0);
114        assert_eq!(size & (size - 1), 0);
115    }
116}