Skip to main content

llm_tokenizer/cache/
fingerprint.rs

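//! Tokenizer Fingerprinting for Cache Invalidation
//!
//! Builds a compact fingerprint of a tokenizer's configuration so that a
//! change of tokenizer can be detected and the cache cleared. The vocabulary
//! is only sampled, so matching fingerprints indicate (but do not guarantee)
//! an identical tokenizer.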
5
6use std::{
7    collections::hash_map::DefaultHasher,
8    hash::{Hash, Hasher},
9};
10
11use crate::traits::Tokenizer;
12
/// Compact summary of a tokenizer's configuration.
///
/// Two fingerprints compare equal only when all three fields match.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct TokenizerFingerprint {
    /// Number of entries in the vocabulary.
    pub vocab_size: usize,
    /// Hash taken over a sample of the vocabulary tokens (sampled for speed).
    pub vocab_hash: u64,
    /// Hash taken over the special tokens.
    pub special_tokens_hash: u64,
}
23
24impl TokenizerFingerprint {
25    /// Create a fingerprint from a tokenizer
26    pub fn from_tokenizer(tokenizer: &dyn Tokenizer) -> Self {
27        let vocab_size = tokenizer.vocab_size();
28        let vocab_hash = Self::compute_vocab_hash(tokenizer);
29        let special_tokens_hash = Self::compute_special_tokens_hash(tokenizer);
30
31        Self {
32            vocab_size,
33            vocab_hash,
34            special_tokens_hash,
35        }
36    }
37
38    /// Compute a hash of the vocabulary by sampling tokens
39    fn compute_vocab_hash(tokenizer: &dyn Tokenizer) -> u64 {
40        let mut hasher = DefaultHasher::new();
41        let vocab_size = tokenizer.vocab_size();
42
43        // Sample up to 1000 tokens for speed
44        let sample_size = vocab_size.min(1000);
45        let step = vocab_size.checked_div(sample_size).unwrap_or(1);
46
47        for i in (0..vocab_size).step_by(step.max(1)) {
48            if let Some(token) = tokenizer.id_to_token(i as u32) {
49                token.hash(&mut hasher);
50            }
51        }
52
53        hasher.finish()
54    }
55
56    /// Compute a hash of special tokens
57    fn compute_special_tokens_hash(tokenizer: &dyn Tokenizer) -> u64 {
58        let mut hasher = DefaultHasher::new();
59        let special_tokens = tokenizer.get_special_tokens();
60
61        special_tokens.bos_token.hash(&mut hasher);
62        special_tokens.eos_token.hash(&mut hasher);
63        special_tokens.unk_token.hash(&mut hasher);
64        special_tokens.sep_token.hash(&mut hasher);
65        special_tokens.pad_token.hash(&mut hasher);
66        special_tokens.cls_token.hash(&mut hasher);
67        special_tokens.mask_token.hash(&mut hasher);
68        special_tokens.additional_special_tokens.hash(&mut hasher);
69
70        hasher.finish()
71    }
72}
73
#[cfg(test)]
mod tests {
    use crate::{mock::MockTokenizer, *};

    /// Two separately constructed mock tokenizers must fingerprint alike.
    #[test]
    fn test_fingerprint_equality() {
        let left_tokenizer = MockTokenizer::new();
        let right_tokenizer = MockTokenizer::new();

        let left = TokenizerFingerprint::from_tokenizer(&left_tokenizer);
        let right = TokenizerFingerprint::from_tokenizer(&right_tokenizer);

        // Equal configuration => equal fingerprint
        assert_eq!(left, right);
    }

    /// Fingerprinting one tokenizer twice must give the same answer, and the
    /// recorded size must be the tokenizer's own.
    #[test]
    fn test_fingerprint_consistency() {
        let mock = MockTokenizer::new();

        let first_pass = TokenizerFingerprint::from_tokenizer(&mock);
        let second_pass = TokenizerFingerprint::from_tokenizer(&mock);

        // Repeated runs agree with each other...
        assert_eq!(first_pass, second_pass);
        // ...and report the vocabulary size the tokenizer itself reports
        assert_eq!(first_pass.vocab_size, mock.vocab_size());
    }
}
101}