Skip to main content

llm_tokenizer/cache/
fingerprint.rs

1//! Tokenizer Fingerprinting for Cache Invalidation
2//!
3//! Creates a unique fingerprint of a tokenizer's configuration to detect
4//! when the tokenizer has changed and the cache needs to be cleared.
5
6use std::{
7    collections::hash_map::DefaultHasher,
8    hash::{Hash, Hasher},
9};
10
11use crate::traits::Tokenizer;
12
/// Compact summary of a tokenizer's configuration.
///
/// Used to detect that a tokenizer has changed so that cached data derived
/// from it can be cleared. Two fingerprints compare equal only when every
/// field matches.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct TokenizerFingerprint {
    /// Number of entries the tokenizer reports for its vocabulary.
    pub vocab_size: usize,
    /// Digest over a sample of vocabulary tokens (sampled rather than
    /// exhaustive, for speed).
    pub vocab_hash: u64,
    /// Digest over the tokenizer's special tokens.
    pub special_tokens_hash: u64,
}
23
24impl TokenizerFingerprint {
25    /// Create a fingerprint from a tokenizer
26    pub fn from_tokenizer(tokenizer: &dyn Tokenizer) -> Self {
27        let vocab_size = tokenizer.vocab_size();
28        let vocab_hash = Self::compute_vocab_hash(tokenizer);
29        let special_tokens_hash = Self::compute_special_tokens_hash(tokenizer);
30
31        Self {
32            vocab_size,
33            vocab_hash,
34            special_tokens_hash,
35        }
36    }
37
38    /// Compute a hash of the vocabulary by sampling tokens
39    fn compute_vocab_hash(tokenizer: &dyn Tokenizer) -> u64 {
40        let mut hasher = DefaultHasher::new();
41        let vocab_size = tokenizer.vocab_size();
42
43        // Sample up to 1000 tokens for speed
44        let sample_size = vocab_size.min(1000);
45        let step = if sample_size > 0 {
46            vocab_size / sample_size
47        } else {
48            1
49        };
50
51        for i in (0..vocab_size).step_by(step.max(1)) {
52            if let Some(token) = tokenizer.id_to_token(i as u32) {
53                token.hash(&mut hasher);
54            }
55        }
56
57        hasher.finish()
58    }
59
60    /// Compute a hash of special tokens
61    fn compute_special_tokens_hash(tokenizer: &dyn Tokenizer) -> u64 {
62        let mut hasher = DefaultHasher::new();
63        let special_tokens = tokenizer.get_special_tokens();
64
65        special_tokens.bos_token.hash(&mut hasher);
66        special_tokens.eos_token.hash(&mut hasher);
67        special_tokens.unk_token.hash(&mut hasher);
68        special_tokens.sep_token.hash(&mut hasher);
69        special_tokens.pad_token.hash(&mut hasher);
70        special_tokens.cls_token.hash(&mut hasher);
71        special_tokens.mask_token.hash(&mut hasher);
72        special_tokens.additional_special_tokens.hash(&mut hasher);
73
74        hasher.finish()
75    }
76}
77
#[cfg(test)]
mod tests {
    use crate::{mock::MockTokenizer, *};

    /// Two separately constructed mock tokenizers must yield equal fingerprints.
    #[test]
    fn test_fingerprint_equality() {
        let left = MockTokenizer::new();
        let right = MockTokenizer::new();

        let left_fp = TokenizerFingerprint::from_tokenizer(&left);
        let right_fp = TokenizerFingerprint::from_tokenizer(&right);

        // Equal configuration in, equal fingerprint out.
        assert_eq!(left_fp, right_fp);
    }

    /// Fingerprinting the same tokenizer twice must give the same result, and
    /// the recorded vocabulary size must match what the tokenizer reports.
    #[test]
    fn test_fingerprint_consistency() {
        let mock = MockTokenizer::new();

        let first_pass = TokenizerFingerprint::from_tokenizer(&mock);
        let second_pass = TokenizerFingerprint::from_tokenizer(&mock);

        // Repeated fingerprinting of one instance is stable.
        assert_eq!(first_pass, second_pass);
        assert_eq!(first_pass.vocab_size, mock.vocab_size());
    }
}
105}