tinymist_std/
hash.rs

1//! The hash extension module. It provides extra concepts like `Fingerprint` and
2//! `HashedTrait`.
3
4use core::fmt;
5use std::any::Any;
6use std::hash::{Hash, Hasher};
7use std::ops::Deref;
8
9use base64::Engine;
10use fxhash::FxHasher32;
11use siphasher::sip128::{Hasher128, SipHasher13};
12
13#[cfg(feature = "rkyv")]
14use rkyv::{Archive, Deserialize as rDeser, Serialize as rSer};
15
16use crate::error::prelude::Result;
17
18pub(crate) type FxBuildHasher = std::hash::BuildHasherDefault<FxHasher>;
19pub use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
20// pub type FxIndexSet<K> = indexmap::IndexSet<K, FxHasher>;
21// pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, FxHasher>;
22/// A dashmap that uses the FxHasher as the underlying hasher.
23pub type FxDashMap<K, V> = dashmap::DashMap<K, V, FxBuildHasher>;
24
25/// See <https://github.com/rust-lang/rust/blob/master/compiler/rustc_hir/src/stable_hash_impls.rs#L22>
26/// The fingerprint conflicts should be very rare and should be handled by the
27/// compiler.
28///
29/// > That being said, given a high quality hash function, the collision
30/// > probabilities in question are very small. For example, for a big crate
31/// > like `rustc_middle` (with ~50000 `LocalDefId`s as of the time of writing)
32/// > there is a probability of roughly 1 in 14,750,000,000 of a crate-internal
33/// > collision occurring. For a big crate graph with 1000 crates in it, there
34/// > is a probability of 1 in 36,890,000,000,000 of a `StableCrateId`
35/// > collision.
36#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
37#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
38#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
39pub struct Fingerprint {
40    lo: u64,
41    hi: u64,
42}
43
44impl fmt::Debug for Fingerprint {
45    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
46        f.write_str(&self.as_svg_id("fg"))
47    }
48}
49
50impl serde::Serialize for Fingerprint {
51    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
52        serializer.serialize_str(&self.as_svg_id(""))
53    }
54}
55
56impl<'de> serde::Deserialize<'de> for Fingerprint {
57    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
58        let s = <std::string::String as serde::Deserialize>::deserialize(deserializer)?;
59        Fingerprint::try_from_str(&s).map_err(serde::de::Error::custom)
60    }
61}
62
63impl Fingerprint {
64    /// Create a new fingerprint from the given pair of 64-bit integers.
65    pub fn from_pair(lo: u64, hi: u64) -> Self {
66        Self { lo, hi }
67    }
68
69    /// Create a new fingerprint from the given 128-bit integer.
70    pub const fn from_u128(hash: u128) -> Self {
71        // Self(hash as u64, (hash >> 64) as u64)
72        Self {
73            lo: hash as u64,
74            hi: (hash >> 64) as u64,
75        }
76    }
77
78    /// Get the fingerprint as a 128-bit integer.
79    pub fn to_u128(self) -> u128 {
80        ((self.hi as u128) << 64) | self.lo as u128
81    }
82
83    /// Cut the fingerprint into a 32-bit integer.
84    /// It could be used as a hash value if the fingerprint is calculated from a
85    /// stable hash function.
86    pub fn lower32(self) -> u32 {
87        self.lo as u32
88    }
89
90    /// Creates a new `Fingerprint` from a svg id that **doesn't have prefix**.
91    pub fn try_from_str(s: &str) -> Result<Self> {
92        let bytes = base64::engine::general_purpose::STANDARD_NO_PAD
93            .decode(&s.as_bytes()[..11])
94            .expect("invalid base64 string");
95        let lo = u64::from_le_bytes(bytes.try_into().unwrap());
96        let mut bytes = base64::engine::general_purpose::STANDARD_NO_PAD
97            .decode(&s.as_bytes()[11..])
98            .expect("invalid base64 string");
99        bytes.resize(8, 0);
100        let hi = u64::from_le_bytes(bytes.try_into().unwrap());
101        Ok(Self::from_pair(lo, hi))
102    }
103
104    /// Create a xml id from the given prefix and the fingerprint of this
105    /// reference. Note that the entire html document shares namespace for
106    /// ids.
107    #[comemo::memoize]
108    pub fn as_svg_id(self, prefix: &'static str) -> String {
109        let fingerprint_lo =
110            base64::engine::general_purpose::STANDARD_NO_PAD.encode(self.lo.to_le_bytes());
111        if self.hi == 0 {
112            return [prefix, &fingerprint_lo].join("");
113        }
114
115        // possible the id in the lower 64 bits.
116        let fingerprint_hi = {
117            let id = self.hi.to_le_bytes();
118            // truncate zero
119            let rev_zero = id.iter().rev().skip_while(|&&b| b == 0).count();
120            let id = &id[..rev_zero];
121            base64::engine::general_purpose::STANDARD_NO_PAD.encode(id)
122        };
123        [prefix, &fingerprint_lo, &fingerprint_hi].join("")
124    }
125}
126
127/// A fingerprint hasher that extends the [`std::hash::Hasher`] trait.
128pub trait FingerprintHasher: std::hash::Hasher {
129    /// Finish the fingerprint and return the fingerprint and the data.
130    /// The data is used to resolve the conflict.
131    fn finish_fingerprint(self) -> (Fingerprint, Vec<u8>);
132}
133
134/// A fingerprint hasher that uses the [`SipHasher13`] algorithm.
135#[derive(Default)]
136pub struct FingerprintSipHasher {
137    /// The underlying data passed to the hasher.
138    data: Vec<u8>,
139}
140
141/// The base hasher for the [`FingerprintSipHasher`].
142pub type FingerprintSipHasherBase = SipHasher13;
143
144impl FingerprintSipHasher {
145    /// Get the fast hash value and the underlying data.
146    pub fn fast_hash(&self) -> (u32, &Vec<u8>) {
147        let mut inner = FxHasher32::default();
148        self.data.hash(&mut inner);
149        (inner.finish() as u32, &self.data)
150    }
151}
152
153impl std::hash::Hasher for FingerprintSipHasher {
154    fn write(&mut self, bytes: &[u8]) {
155        self.data.extend_from_slice(bytes);
156    }
157
158    fn finish(&self) -> u64 {
159        let mut inner = FingerprintSipHasherBase::default();
160        self.data.hash(&mut inner);
161        inner.finish()
162    }
163}
164
165impl FingerprintHasher for FingerprintSipHasher {
166    fn finish_fingerprint(self) -> (Fingerprint, Vec<u8>) {
167        let buffer = self.data.clone();
168        let mut inner = FingerprintSipHasherBase::default();
169        buffer.hash(&mut inner);
170        let hash = inner.finish128();
171        (
172            Fingerprint {
173                lo: hash.h1,
174                hi: hash.h2,
175            },
176            buffer,
177        )
178    }
179}
180
181/// A fingerprint builder that produces unique fingerprint for each item.
182/// It resolves the conflict by checking the underlying data.
183/// See [`Fingerprint`] for more information.
184#[derive(Default)]
185pub struct FingerprintBuilder {
186    /// The fast conflict checker mapping fingerprints to their underlying data.
187    #[cfg(feature = "bi-hash")]
188    fast_conflict_checker: crate::adt::CHashMap<u32, Vec<u8>>,
189    /// The conflict checker mapping fingerprints to their underlying data.
190    conflict_checker: crate::adt::CHashMap<Fingerprint, Vec<u8>>,
191}
192
193#[cfg(not(feature = "bi-hash"))]
194impl FingerprintBuilder {
195    /// Resolve the fingerprint without checking the conflict.
196    pub fn resolve_unchecked<T: Hash>(&self, item: &T) -> Fingerprint {
197        let mut s = FingerprintSipHasher { data: Vec::new() };
198        item.hash(&mut s);
199        let (fingerprint, _featured_data) = s.finish_fingerprint();
200        fingerprint
201    }
202
203    /// Resolve the fingerprint and check the conflict.
204    pub fn resolve<T: Hash + 'static>(&self, item: &T) -> Fingerprint {
205        let mut s = FingerprintSipHasher { data: Vec::new() };
206        item.type_id().hash(&mut s);
207        item.hash(&mut s);
208
209        let (fingerprint, featured_data) = s.finish_fingerprint();
210        let Some(prev_featured_data) = self.conflict_checker.get(&fingerprint) else {
211            self.conflict_checker.insert(fingerprint, featured_data);
212            return fingerprint;
213        };
214
215        if *prev_featured_data == *featured_data {
216            return fingerprint;
217        }
218
219        // todo: soft error
220        panic!("Fingerprint conflict detected!");
221    }
222}
223
224#[cfg(feature = "bi-hash")]
225impl FingerprintBuilder {
226    /// Resolve the fingerprint without checking the conflict.
227    pub fn resolve_unchecked<T: Hash>(&self, item: &T) -> Fingerprint {
228        let mut s = FingerprintSipHasher { data: Vec::new() };
229        item.hash(&mut s);
230        let (fingerprint, featured_data) = s.fast_hash();
231        let Some(prev_featured_data) = self.fast_conflict_checker.get(&fingerprint) else {
232            self.fast_conflict_checker.insert(fingerprint, s.data);
233            return Fingerprint::from_pair(fingerprint as u64, 0);
234        };
235
236        if *prev_featured_data == *featured_data {
237            return Fingerprint::from_pair(fingerprint as u64, 0);
238        }
239
240        let (fingerprint, _featured_data) = s.finish_fingerprint();
241        fingerprint
242    }
243
244    /// Resolve the fingerprint and check the conflict.
245    pub fn resolve<T: Hash + 'static>(&self, item: &T) -> Fingerprint {
246        let mut s = FingerprintSipHasher { data: Vec::new() };
247        item.type_id().hash(&mut s);
248        item.hash(&mut s);
249        let (fingerprint, featured_data) = s.fast_hash();
250        let Some(prev_featured_data) = self.fast_conflict_checker.get(&fingerprint) else {
251            self.fast_conflict_checker.insert(fingerprint, s.data);
252            return Fingerprint::from_pair(fingerprint as u64, 0);
253        };
254
255        if *prev_featured_data == *featured_data {
256            return Fingerprint::from_pair(fingerprint as u64, 0);
257        }
258
259        let (fingerprint, featured_data) = s.finish_fingerprint();
260        let Some(prev_featured_data) = self.conflict_checker.get(&fingerprint) else {
261            self.conflict_checker.insert(fingerprint, featured_data);
262            return fingerprint;
263        };
264
265        if *prev_featured_data == *featured_data {
266            return fingerprint;
267        }
268
269        // todo: soft error
270        panic!("Fingerprint conflict detected!");
271    }
272}
273
274/// This function provides a hash function for items, which also includes a type
275/// id as part of the hash. Note: This function is not stable across different
276/// versions of typst-ts, so it is preferred to be always used in memory.
277/// Currently, this function use [`SipHasher13`] as the underlying hash
278/// algorithm.
279pub fn item_hash128<T: Hash + 'static>(item: &T) -> u128 {
280    // Also hash the TypeId because the type might be converted
281    // through an unsized coercion.
282    let mut state = SipHasher13::new();
283    item.type_id().hash(&mut state);
284    item.hash(&mut state);
285    state.finish128().as_u128()
286}
287
288/// Calculate a 128-bit siphash of a value.
289/// Currently, this function use [`SipHasher13`] as the underlying hash
290/// algorithm.
291#[inline]
292pub fn hash128<T: std::hash::Hash>(value: &T) -> u128 {
293    let mut state = SipHasher13::new();
294    value.hash(&mut state);
295    state.finish128().as_u128()
296}
297
298/// A convenience function for when you need a quick 64-bit hash.
299#[inline]
300pub fn hash64<T: Hash + ?Sized>(v: &T) -> u64 {
301    let mut state = FxHasher::default();
302    v.hash(&mut state);
303    state.finish()
304}
305
306// todo: rustc hash doesn't have 32-bit hash
307pub use fxhash::hash32;
308
309/// A trait that provides a static prehashed 128-bit hash.
310pub trait StaticHash128 {
311    /// Get the prehashed 128-bit hash.
312    fn get_hash(&self) -> u128;
313}
314
315impl Hash for dyn StaticHash128 {
316    #[inline]
317    fn hash<H: Hasher>(&self, state: &mut H) {
318        state.write_u128(self.get_hash());
319    }
320}
321
322/// A trait that provides a static prehashed 64-bit hash for any internal `T`.
323///
324/// Please ensure that the `T` is really mapped to the hash. Use it at your own
325/// risk.
326pub struct HashedTrait<T: ?Sized> {
327    hash: u128,
328    t: Box<T>,
329}
330
331impl<T: ?Sized> HashedTrait<T> {
332    /// Create a new `HashedTrait` with the given hash and the trait object.
333    pub fn new(hash: u128, t: Box<T>) -> Self {
334        Self { hash, t }
335    }
336}
337
338impl<T: ?Sized> Deref for HashedTrait<T> {
339    type Target = T;
340
341    fn deref(&self) -> &Self::Target {
342        &self.t
343    }
344}
345
346impl<T> Hash for HashedTrait<T> {
347    #[inline]
348    fn hash<H: Hasher>(&self, state: &mut H) {
349        state.write_u128(self.hash);
350    }
351}
352
353impl<T: Hash + Default + 'static> Default for HashedTrait<T> {
354    fn default() -> Self {
355        let t = T::default();
356        Self {
357            hash: item_hash128(&t),
358            t: Box::new(t),
359        }
360    }
361}
362
363impl<T: ?Sized> StaticHash128 for HashedTrait<T> {
364    fn get_hash(&self) -> u128 {
365        self.hash
366    }
367}
368
369#[test]
370fn test_fingerprint() {
371    let t = Fingerprint::from_pair(0, 1);
372    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
373
374    let t = Fingerprint::from_pair(1, 1);
375    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
376
377    let t = Fingerprint::from_pair(1, 0);
378    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
379
380    let t = Fingerprint::from_pair(0, 0);
381    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
382}