llm_tokenizer/cache/
fingerprint.rs1use std::{
7 collections::hash_map::DefaultHasher,
8 hash::{Hash, Hasher},
9};
10
11use crate::traits::Tokenizer;
12
/// Compact identity summary of a tokenizer.
///
/// Two fingerprints compare equal only when all three components match, which
/// makes the type usable as a map key (`Eq` + `Hash`) for per-tokenizer state.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerFingerprint {
    /// Number of entries the tokenizer reports for its vocabulary.
    pub vocab_size: usize,
    /// Digest computed over the tokenizer's vocabulary entries.
    pub vocab_hash: u64,
    /// Digest computed over the tokenizer's special-token configuration.
    pub special_tokens_hash: u64,
}
23
24impl TokenizerFingerprint {
25 pub fn from_tokenizer(tokenizer: &dyn Tokenizer) -> Self {
27 let vocab_size = tokenizer.vocab_size();
28 let vocab_hash = Self::compute_vocab_hash(tokenizer);
29 let special_tokens_hash = Self::compute_special_tokens_hash(tokenizer);
30
31 Self {
32 vocab_size,
33 vocab_hash,
34 special_tokens_hash,
35 }
36 }
37
38 fn compute_vocab_hash(tokenizer: &dyn Tokenizer) -> u64 {
40 let mut hasher = DefaultHasher::new();
41 let vocab_size = tokenizer.vocab_size();
42
43 let sample_size = vocab_size.min(1000);
45 let step = vocab_size.checked_div(sample_size).unwrap_or(1);
46
47 for i in (0..vocab_size).step_by(step.max(1)) {
48 if let Some(token) = tokenizer.id_to_token(i as u32) {
49 token.hash(&mut hasher);
50 }
51 }
52
53 hasher.finish()
54 }
55
56 fn compute_special_tokens_hash(tokenizer: &dyn Tokenizer) -> u64 {
58 let mut hasher = DefaultHasher::new();
59 let special_tokens = tokenizer.get_special_tokens();
60
61 special_tokens.bos_token.hash(&mut hasher);
62 special_tokens.eos_token.hash(&mut hasher);
63 special_tokens.unk_token.hash(&mut hasher);
64 special_tokens.sep_token.hash(&mut hasher);
65 special_tokens.pad_token.hash(&mut hasher);
66 special_tokens.cls_token.hash(&mut hasher);
67 special_tokens.mask_token.hash(&mut hasher);
68 special_tokens.additional_special_tokens.hash(&mut hasher);
69
70 hasher.finish()
71 }
72}
73
#[cfg(test)]
mod tests {
    use crate::{mock::MockTokenizer, *};

    /// Separately constructed mock tokenizers must yield equal fingerprints.
    #[test]
    fn test_fingerprint_equality() {
        let first = TokenizerFingerprint::from_tokenizer(&MockTokenizer::new());
        let second = TokenizerFingerprint::from_tokenizer(&MockTokenizer::new());

        assert_eq!(first, second);
    }

    /// Fingerprinting the same tokenizer twice must give the same result, and the
    /// recorded vocabulary size must match what the tokenizer reports.
    #[test]
    fn test_fingerprint_consistency() {
        let mock = MockTokenizer::new();

        let initial = TokenizerFingerprint::from_tokenizer(&mock);
        let repeated = TokenizerFingerprint::from_tokenizer(&mock);

        assert_eq!(initial, repeated);
        assert_eq!(initial.vocab_size, mock.vocab_size());
    }
}