tinymist_std/
hash.rs

1//! The hash extension module. It provides extra concepts like `Fingerprint` and
2//! `HashedTrait`.
3
4use core::fmt;
5use std::any::Any;
6use std::hash::{Hash, Hasher};
7use std::ops::Deref;
8
9use base64::Engine;
10use fxhash::FxHasher32;
11use siphasher::sip128::{Hasher128, SipHasher13};
12
13#[cfg(feature = "rkyv")]
14use rkyv::{Archive, Deserialize as rDeser, Serialize as rSer};
15
16use crate::error::prelude::Result;
17
18pub(crate) type FxBuildHasher = std::hash::BuildHasherDefault<FxHasher>;
19pub use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
20// pub type FxIndexSet<K> = indexmap::IndexSet<K, FxHasher>;
21// pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, FxHasher>;
22/// A dashmap that uses the FxHasher as the underlying hasher.
23pub type FxDashMap<K, V> = dashmap::DashMap<K, V, FxBuildHasher>;
24
25/// See <https://github.com/rust-lang/rust/blob/master/compiler/rustc_hir/src/stable_hash_impls.rs#L22>
26/// The fingerprint conflicts should be very rare and should be handled by the
27/// compiler.
28///
29/// > That being said, given a high quality hash function, the collision
30/// > probabilities in question are very small. For example, for a big crate
31/// > like `rustc_middle` (with ~50000 `LocalDefId`s as of the time of writing)
32/// > there is a probability of roughly 1 in 14,750,000,000 of a crate-internal
33/// > collision occurring. For a big crate graph with 1000 crates in it, there
34/// > is a probability of 1 in 36,890,000,000,000 of a `StableCrateId`
35/// > collision.
36///
37/// This stores the 16-bytes data in a pair of `u64` instead of single `u128` to avoid heavily affecting `align` of the embedding structs.
38#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
39#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
40#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
41pub struct Fingerprint {
42    lo: u64,
43    hi: u64,
44}
45
46impl fmt::Debug for Fingerprint {
47    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
48        f.write_str(&self.as_svg_id("fg"))
49    }
50}
51
52impl serde::Serialize for Fingerprint {
53    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
54        serializer.serialize_str(&self.as_svg_id(""))
55    }
56}
57
58impl<'de> serde::Deserialize<'de> for Fingerprint {
59    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
60        let s = <std::string::String as serde::Deserialize>::deserialize(deserializer)?;
61        Fingerprint::try_from_str(&s).map_err(serde::de::Error::custom)
62    }
63}
64
65impl Fingerprint {
66    /// Creates a new fingerprint from the given pair of 64-bit integers.
67    pub fn from_pair(lo: u64, hi: u64) -> Self {
68        Self { lo, hi }
69    }
70
71    /// Creates a new fingerprint from the given 128-bit integer.
72    pub const fn from_u128(hash: u128) -> Self {
73        // Self(hash as u64, (hash >> 64) as u64)
74        Self {
75            lo: hash as u64,
76            hi: (hash >> 64) as u64,
77        }
78    }
79
80    /// Gets the fingerprint as a 128-bit integer.
81    pub fn to_u128(self) -> u128 {
82        ((self.hi as u128) << 64) | self.lo as u128
83    }
84
85    /// Cuts the fingerprint into a 32-bit integer.
86    ///
87    /// It could be used as a hash value if the fingerprint is calculated from a
88    /// stable hash function, otherwise, the result has pool quality.
89    pub fn lower32(self) -> u32 {
90        self.lo as u32
91    }
92
93    /// Creates a new `Fingerprint` from a svg id that **doesn't have prefix**.
94    // todo: why do we split bytes decoding into two parts?
95    // todo: this causes panic if the str is not correct...
96    pub fn try_from_str(s: &str) -> Result<Self> {
97        let bytes = base64::engine::general_purpose::STANDARD_NO_PAD
98            .decode(&s.as_bytes()[..11])
99            .expect("invalid base64 string");
100        let lo = u64::from_le_bytes(bytes.try_into().unwrap());
101        let mut bytes = base64::engine::general_purpose::STANDARD_NO_PAD
102            .decode(&s.as_bytes()[11..])
103            .expect("invalid base64 string");
104        bytes.resize(8, 0);
105        let hi = u64::from_le_bytes(bytes.try_into().unwrap());
106        Ok(Self::from_pair(lo, hi))
107    }
108
109    /// Creates a XML id from the given prefix and the fingerprint of this
110    /// reference. Note that the entire html document shares namespace for
111    /// ids.
112    #[comemo::memoize]
113    pub fn as_svg_id(self, prefix: &'static str) -> String {
114        let fingerprint_lo =
115            base64::engine::general_purpose::STANDARD_NO_PAD.encode(self.lo.to_le_bytes());
116        if self.hi == 0 {
117            return [prefix, &fingerprint_lo].join("");
118        }
119
120        // the possible id in the lower 64 bits.
121        let fingerprint_hi = {
122            let id = self.hi.to_le_bytes();
123            // truncate zero
124            let rev_zero = id.iter().rev().skip_while(|&&b| b == 0).count();
125            let id = &id[..rev_zero];
126            base64::engine::general_purpose::STANDARD_NO_PAD.encode(id)
127        };
128        [prefix, &fingerprint_lo, &fingerprint_hi].join("")
129    }
130}
131
132/// A fingerprint hasher that extends the [`std::hash::Hasher`] trait.
133pub trait FingerprintHasher: std::hash::Hasher {
134    /// Finish the fingerprint and return the fingerprint and the data.
135    /// The data is used to resolve the conflict.
136    fn finish_fingerprint(self) -> (Fingerprint, Vec<u8>);
137}
138
139/// A fingerprint hasher that uses the [`SipHasher13`] algorithm.
140#[derive(Default)]
141pub struct FingerprintSipHasher {
142    /// The underlying data passed to the hasher.
143    data: Vec<u8>,
144}
145
146/// The base hasher for the [`FingerprintSipHasher`].
147pub type FingerprintSipHasherBase = SipHasher13;
148
149impl FingerprintSipHasher {
150    /// Gets the fast hash value and the underlying data.
151    pub fn fast_hash(&self) -> (u32, &Vec<u8>) {
152        let mut inner = FxHasher32::default();
153        self.data.hash(&mut inner);
154        (inner.finish() as u32, &self.data)
155    }
156}
157
158impl std::hash::Hasher for FingerprintSipHasher {
159    fn write(&mut self, bytes: &[u8]) {
160        self.data.extend_from_slice(bytes);
161    }
162
163    fn finish(&self) -> u64 {
164        let mut inner = FingerprintSipHasherBase::default();
165        self.data.hash(&mut inner);
166        inner.finish()
167    }
168}
169
170impl FingerprintHasher for FingerprintSipHasher {
171    fn finish_fingerprint(self) -> (Fingerprint, Vec<u8>) {
172        let buffer = self.data.clone();
173        let mut inner = FingerprintSipHasherBase::default();
174        buffer.hash(&mut inner);
175        let hash = inner.finish128();
176        (
177            Fingerprint {
178                lo: hash.h1,
179                hi: hash.h2,
180            },
181            buffer,
182        )
183    }
184}
185
186/// A fingerprint builder that produces unique fingerprint for each item.
187/// It resolves the conflict by checking the underlying data.
188/// See [`Fingerprint`] for more information.
189#[derive(Default)]
190pub struct FingerprintBuilder {
191    /// The fast conflict checker mapping fingerprints to their underlying data.
192    #[cfg(feature = "bi-hash")]
193    fast_conflict_checker: crate::adt::CHashMap<u32, Vec<u8>>,
194    /// The conflict checker mapping fingerprints to their underlying data.
195    conflict_checker: crate::adt::CHashMap<Fingerprint, Vec<u8>>,
196}
197
198#[cfg(not(feature = "bi-hash"))]
199impl FingerprintBuilder {
200    /// Resolves the fingerprint without checking the conflict.
201    pub fn resolve_unchecked<T: Hash>(&self, item: &T) -> Fingerprint {
202        let mut s = FingerprintSipHasher { data: Vec::new() };
203        item.hash(&mut s);
204        let (fingerprint, _featured_data) = s.finish_fingerprint();
205        fingerprint
206    }
207
208    /// Resolves the fingerprint and check the conflict.
209    pub fn resolve<T: Hash + 'static>(&self, item: &T) -> Fingerprint {
210        let mut s = FingerprintSipHasher { data: Vec::new() };
211        item.type_id().hash(&mut s);
212        item.hash(&mut s);
213
214        let (fingerprint, featured_data) = s.finish_fingerprint();
215        let Some(prev_featured_data) = self.conflict_checker.get(&fingerprint) else {
216            self.conflict_checker.insert(fingerprint, featured_data);
217            return fingerprint;
218        };
219
220        if *prev_featured_data == *featured_data {
221            return fingerprint;
222        }
223
224        // todo: soft error
225        panic!("Fingerprint conflict detected!");
226    }
227}
228
229#[cfg(feature = "bi-hash")]
230impl FingerprintBuilder {
231    /// Resolves the fingerprint without checking the conflict.
232    pub fn resolve_unchecked<T: Hash>(&self, item: &T) -> Fingerprint {
233        let mut s = FingerprintSipHasher { data: Vec::new() };
234        item.hash(&mut s);
235        let (fingerprint, featured_data) = s.fast_hash();
236        let Some(prev_featured_data) = self.fast_conflict_checker.get(&fingerprint) else {
237            self.fast_conflict_checker.insert(fingerprint, s.data);
238            return Fingerprint::from_pair(fingerprint as u64, 0);
239        };
240
241        if *prev_featured_data == *featured_data {
242            return Fingerprint::from_pair(fingerprint as u64, 0);
243        }
244
245        let (fingerprint, _featured_data) = s.finish_fingerprint();
246        fingerprint
247    }
248
249    /// Resolves the fingerprint and check the conflict.
250    // todo: bi-hash affects the stability?
251    pub fn resolve<T: Hash + 'static>(&self, item: &T) -> Fingerprint {
252        let mut s = FingerprintSipHasher { data: Vec::new() };
253        item.type_id().hash(&mut s);
254        item.hash(&mut s);
255        let (fingerprint, featured_data) = s.fast_hash();
256        let Some(prev_featured_data) = self.fast_conflict_checker.get(&fingerprint) else {
257            self.fast_conflict_checker.insert(fingerprint, s.data);
258            return Fingerprint::from_pair(fingerprint as u64, 0);
259        };
260
261        if *prev_featured_data == *featured_data {
262            return Fingerprint::from_pair(fingerprint as u64, 0);
263        }
264
265        let (fingerprint, featured_data) = s.finish_fingerprint();
266        let Some(prev_featured_data) = self.conflict_checker.get(&fingerprint) else {
267            self.conflict_checker.insert(fingerprint, featured_data);
268            return fingerprint;
269        };
270
271        if *prev_featured_data == *featured_data {
272            return fingerprint;
273        }
274
275        // todo: soft error
276        panic!("Fingerprint conflict detected!");
277    }
278}
279
280/// This function provides a hash function for items, which also includes a type
281/// id as part of the hash. Note: This function is not stable across different
282/// versions of typst-ts, so it is preferred to be always used in memory.
283/// Currently, this function use [`SipHasher13`] as the underlying hash
284/// algorithm.
285pub fn item_hash128<T: Hash + 'static>(item: &T) -> u128 {
286    // Also hash the TypeId because the type might be converted
287    // through an unsized coercion.
288    let mut state = SipHasher13::new();
289    item.type_id().hash(&mut state);
290    item.hash(&mut state);
291    state.finish128().as_u128()
292}
293
294/// Calculate a 128-bit siphash of a value.
295/// Currently, this function use [`SipHasher13`] as the underlying hash
296/// algorithm.
297#[inline]
298pub fn hash128<T: std::hash::Hash>(value: &T) -> u128 {
299    let mut state = SipHasher13::new();
300    value.hash(&mut state);
301    state.finish128().as_u128()
302}
303
304/// A convenience function for when you need a quick 64-bit hash.
305#[inline]
306pub fn hash64<T: Hash + ?Sized>(v: &T) -> u64 {
307    let mut state = FxHasher::default();
308    v.hash(&mut state);
309    state.finish()
310}
311
312// todo: rustc hash doesn't have 32-bit hash
313pub use fxhash::hash32;
314
315/// A trait that provides a static prehashed 128-bit hash.
316pub trait StaticHash128 {
317    /// Get the prehashed 128-bit hash.
318    fn get_hash(&self) -> u128;
319}
320
321impl Hash for dyn StaticHash128 {
322    #[inline]
323    fn hash<H: Hasher>(&self, state: &mut H) {
324        state.write_u128(self.get_hash());
325    }
326}
327
328/// A trait that provides a static prehashed 64-bit hash for any internal `T`.
329///
330/// Please ensure that the `T` is really mapped to the hash. Use it at your own
331/// risk.
332pub struct HashedTrait<T: ?Sized> {
333    hash: u128,
334    t: Box<T>,
335}
336
337impl<T: ?Sized> HashedTrait<T> {
338    /// Create a new `HashedTrait` with the given hash and the trait object.
339    pub fn new(hash: u128, t: Box<T>) -> Self {
340        Self { hash, t }
341    }
342}
343
344impl<T: ?Sized> Deref for HashedTrait<T> {
345    type Target = T;
346
347    fn deref(&self) -> &Self::Target {
348        &self.t
349    }
350}
351
352impl<T> Hash for HashedTrait<T> {
353    #[inline]
354    fn hash<H: Hasher>(&self, state: &mut H) {
355        state.write_u128(self.hash);
356    }
357}
358
359impl<T: Hash + Default + 'static> Default for HashedTrait<T> {
360    fn default() -> Self {
361        let t = T::default();
362        Self {
363            hash: item_hash128(&t),
364            t: Box::new(t),
365        }
366    }
367}
368
369impl<T: ?Sized> StaticHash128 for HashedTrait<T> {
370    fn get_hash(&self) -> u128 {
371        self.hash
372    }
373}
374
375#[test]
376fn test_fingerprint() {
377    let t = Fingerprint::from_pair(0, 1);
378    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
379
380    let t = Fingerprint::from_pair(1, 1);
381    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
382
383    let t = Fingerprint::from_pair(1, 0);
384    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
385
386    let t = Fingerprint::from_pair(0, 0);
387    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
388}