reflexo/
hash.rs

1use core::fmt;
2use std::{
3    any::Any,
4    hash::{Hash, Hasher},
5};
6
7use base64::Engine;
8use fxhash::FxHasher32;
9use siphasher::sip128::{Hasher128, SipHasher13};
10
11#[cfg(feature = "rkyv")]
12use rkyv::{Archive, Deserialize as rDeser, Serialize as rSer};
13
14use crate::error::prelude::ZResult;
15
16pub(crate) type FxBuildHasher = std::hash::BuildHasherDefault<FxHasher>;
17pub use rustc_hash::{FxHashMap, FxHashSet, FxHasher};
18// pub type FxIndexSet<K> = indexmap::IndexSet<K, FxHasher>;
19// pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, FxHasher>;
20pub type FxDashMap<K, V> = dashmap::DashMap<K, V, FxBuildHasher>;
21
22/// See <https://github.com/rust-lang/rust/blob/master/compiler/rustc_hir/src/stable_hash_impls.rs#L22>
23/// The fingerprint conflicts should be very rare and should be handled by the
24/// compiler.
25///
26/// > That being said, given a high quality hash function, the collision
27/// > probabilities in question are very small. For example, for a big crate
28/// > like `rustc_middle` (with ~50000 `LocalDefId`s as of the time of writing)
29/// > there is a probability of roughly 1 in 14,750,000,000 of a crate-internal
30/// > collision occurring. For a big crate graph with 1000 crates in it, there
31/// > is a probability of 1 in 36,890,000,000,000 of a `StableCrateId`
32/// > collision.
33#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
34#[cfg_attr(feature = "rkyv", derive(Archive, rDeser, rSer))]
35#[cfg_attr(feature = "rkyv-validation", archive(check_bytes))]
36pub struct Fingerprint {
37    lo: u64,
38    hi: u64,
39}
40
41impl fmt::Debug for Fingerprint {
42    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
43        f.write_str(&self.as_svg_id("fg"))
44    }
45}
46
47impl serde::Serialize for Fingerprint {
48    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
49        serializer.serialize_str(&self.as_svg_id(""))
50    }
51}
52
53impl<'de> serde::Deserialize<'de> for Fingerprint {
54    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
55        let s = <std::string::String as serde::Deserialize>::deserialize(deserializer)?;
56        Fingerprint::try_from_str(&s).map_err(serde::de::Error::custom)
57    }
58}
59
60impl Fingerprint {
61    /// Create a new fingerprint from the given pair of 64-bit integers.
62    pub fn from_pair(lo: u64, hi: u64) -> Self {
63        Self { lo, hi }
64    }
65
66    /// Create a new fingerprint from the given 128-bit integer.
67    pub const fn from_u128(hash: u128) -> Self {
68        // Self(hash as u64, (hash >> 64) as u64)
69        Self {
70            lo: hash as u64,
71            hi: (hash >> 64) as u64,
72        }
73    }
74
75    /// Get the fingerprint as a 128-bit integer.
76    pub fn to_u128(self) -> u128 {
77        ((self.hi as u128) << 64) | self.lo as u128
78    }
79
80    /// Cut the fingerprint into a 32-bit integer.
81    /// It could be used as a hash value if the fingerprint is calculated from a
82    /// stable hash function.
83    pub fn lower32(self) -> u32 {
84        self.lo as u32
85    }
86
87    /// Creates a new `Fingerprint` from a svg id that **doesn't have prefix**.
88    pub fn try_from_str(s: &str) -> ZResult<Self> {
89        let bytes = base64::engine::general_purpose::STANDARD_NO_PAD
90            .decode(&s.as_bytes()[..11])
91            .expect("invalid base64 string");
92        let lo = u64::from_le_bytes(bytes.try_into().unwrap());
93        let mut bytes = base64::engine::general_purpose::STANDARD_NO_PAD
94            .decode(&s.as_bytes()[11..])
95            .expect("invalid base64 string");
96        bytes.resize(8, 0);
97        let hi = u64::from_le_bytes(bytes.try_into().unwrap());
98        Ok(Self::from_pair(lo, hi))
99    }
100
101    /// Create a xml id from the given prefix and the fingerprint of this
102    /// reference. Note that the entire html document shares namespace for
103    /// ids.
104    #[comemo::memoize]
105    pub fn as_svg_id(self, prefix: &'static str) -> String {
106        let fingerprint_lo =
107            base64::engine::general_purpose::STANDARD_NO_PAD.encode(self.lo.to_le_bytes());
108        if self.hi == 0 {
109            return [prefix, &fingerprint_lo].join("");
110        }
111
112        // possible the id in the lower 64 bits.
113        let fingerprint_hi = {
114            let id = self.hi.to_le_bytes();
115            // truncate zero
116            let rev_zero = id.iter().rev().skip_while(|&&b| b == 0).count();
117            let id = &id[..rev_zero];
118            base64::engine::general_purpose::STANDARD_NO_PAD.encode(id)
119        };
120        [prefix, &fingerprint_lo, &fingerprint_hi].join("")
121    }
122}
123
124/// A fingerprint hasher that extends the [`std::hash::Hasher`] trait.
125pub trait FingerprintHasher: std::hash::Hasher {
126    /// Finish the fingerprint and return the fingerprint and the data.
127    /// The data is used to resolve the conflict.
128    fn finish_fingerprint(self) -> (Fingerprint, Vec<u8>);
129}
130
131/// A fingerprint hasher that uses the [`SipHasher13`] algorithm.
132#[derive(Default)]
133pub struct FingerprintSipHasher {
134    /// The underlying data passed to the hasher.
135    data: Vec<u8>,
136}
137
138pub type FingerprintSipHasherBase = SipHasher13;
139
140impl FingerprintSipHasher {
141    pub fn fast_hash(&self) -> (u32, &Vec<u8>) {
142        let mut inner = FxHasher32::default();
143        self.data.hash(&mut inner);
144        (inner.finish() as u32, &self.data)
145    }
146}
147
148impl std::hash::Hasher for FingerprintSipHasher {
149    fn write(&mut self, bytes: &[u8]) {
150        self.data.extend_from_slice(bytes);
151    }
152
153    fn finish(&self) -> u64 {
154        let mut inner = FingerprintSipHasherBase::default();
155        self.data.hash(&mut inner);
156        inner.finish()
157    }
158}
159
160impl FingerprintHasher for FingerprintSipHasher {
161    fn finish_fingerprint(self) -> (Fingerprint, Vec<u8>) {
162        let buffer = self.data.clone();
163        let mut inner = FingerprintSipHasherBase::default();
164        buffer.hash(&mut inner);
165        let hash = inner.finish128();
166        (
167            Fingerprint {
168                lo: hash.h1,
169                hi: hash.h2,
170            },
171            buffer,
172        )
173    }
174}
175
176/// A fingerprint builder that produces unique fingerprint for each item.
177/// It resolves the conflict by checking the underlying data.
178/// See [`Fingerprint`] for more information.
179#[derive(Default)]
180pub struct FingerprintBuilder {
181    /// The fast conflict checker mapping fingerprints to their underlying data.
182    #[cfg(feature = "bi-hash")]
183    fast_conflict_checker: crate::adt::CHashMap<u32, Vec<u8>>,
184    /// The conflict checker mapping fingerprints to their underlying data.
185    conflict_checker: crate::adt::CHashMap<Fingerprint, Vec<u8>>,
186}
187
188#[cfg(not(feature = "bi-hash"))]
189impl FingerprintBuilder {
190    pub fn resolve_unchecked<T: Hash>(&self, item: &T) -> Fingerprint {
191        let mut s = FingerprintSipHasher { data: Vec::new() };
192        item.hash(&mut s);
193        let (fingerprint, _featured_data) = s.finish_fingerprint();
194        fingerprint
195    }
196
197    pub fn resolve<T: Hash + 'static>(&self, item: &T) -> Fingerprint {
198        let mut s = FingerprintSipHasher { data: Vec::new() };
199        item.type_id().hash(&mut s);
200        item.hash(&mut s);
201
202        let (fingerprint, featured_data) = s.finish_fingerprint();
203        let Some(prev_featured_data) = self.conflict_checker.get(&fingerprint) else {
204            self.conflict_checker.insert(fingerprint, featured_data);
205            return fingerprint;
206        };
207
208        if *prev_featured_data == *featured_data {
209            return fingerprint;
210        }
211
212        // todo: soft error
213        panic!("Fingerprint conflict detected!");
214    }
215}
216
217#[cfg(feature = "bi-hash")]
218impl FingerprintBuilder {
219    pub fn resolve_unchecked<T: Hash>(&self, item: &T) -> Fingerprint {
220        let mut s = FingerprintSipHasher { data: Vec::new() };
221        item.hash(&mut s);
222        let (fingerprint, featured_data) = s.fast_hash();
223        let Some(prev_featured_data) = self.fast_conflict_checker.get(&fingerprint) else {
224            self.fast_conflict_checker.insert(fingerprint, s.data);
225            return Fingerprint::from_pair(fingerprint as u64, 0);
226        };
227
228        if *prev_featured_data == *featured_data {
229            return Fingerprint::from_pair(fingerprint as u64, 0);
230        }
231
232        let (fingerprint, _featured_data) = s.finish_fingerprint();
233        fingerprint
234    }
235
236    pub fn resolve<T: Hash + 'static>(&self, item: &T) -> Fingerprint {
237        let mut s = FingerprintSipHasher { data: Vec::new() };
238        item.type_id().hash(&mut s);
239        item.hash(&mut s);
240        let (fingerprint, featured_data) = s.fast_hash();
241        let Some(prev_featured_data) = self.fast_conflict_checker.get(&fingerprint) else {
242            self.fast_conflict_checker.insert(fingerprint, s.data);
243            return Fingerprint::from_pair(fingerprint as u64, 0);
244        };
245
246        if *prev_featured_data == *featured_data {
247            return Fingerprint::from_pair(fingerprint as u64, 0);
248        }
249
250        let (fingerprint, featured_data) = s.finish_fingerprint();
251        let Some(prev_featured_data) = self.conflict_checker.get(&fingerprint) else {
252            self.conflict_checker.insert(fingerprint, featured_data);
253            return fingerprint;
254        };
255
256        if *prev_featured_data == *featured_data {
257            return fingerprint;
258        }
259
260        // todo: soft error
261        panic!("Fingerprint conflict detected!");
262    }
263}
264
265/// This function provides a hash function for items, which also includes a type
266/// id as part of the hash. Note: This function is not stable across different
267/// versions of typst-ts, so it is preferred to be always used in memory.
268/// Currently, this function use [`SipHasher13`] as the underlying hash
269/// algorithm.
270pub fn item_hash128<T: Hash + 'static>(item: &T) -> u128 {
271    // Also hash the TypeId because the type might be converted
272    // through an unsized coercion.
273    let mut state = SipHasher13::new();
274    item.type_id().hash(&mut state);
275    item.hash(&mut state);
276    state.finish128().as_u128()
277}
278
279/// Calculate a 128-bit siphash of a value.
280/// Currently, this function use [`SipHasher13`] as the underlying hash
281/// algorithm.
282#[inline]
283pub fn hash128<T: std::hash::Hash>(value: &T) -> u128 {
284    let mut state = SipHasher13::new();
285    value.hash(&mut state);
286    state.finish128().as_u128()
287}
288
289/// A convenience function for when you need a quick 64-bit hash.
290#[inline]
291pub fn hash64<T: Hash + ?Sized>(v: &T) -> u64 {
292    let mut state = FxHasher::default();
293    v.hash(&mut state);
294    state.finish()
295}
296
297// todo: rustc hash doesn't have 32-bit hash
298pub use fxhash::hash32;
299
300#[test]
301fn test_fingerprint() {
302    let t = Fingerprint::from_pair(0, 1);
303    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
304
305    let t = Fingerprint::from_pair(1, 1);
306    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
307
308    let t = Fingerprint::from_pair(1, 0);
309    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
310
311    let t = Fingerprint::from_pair(0, 0);
312    assert_eq!(Fingerprint::try_from_str(&t.as_svg_id("")).unwrap(), t);
313}