llm_tokenizer/cache/
fingerprint.rs1use std::{
7 collections::hash_map::DefaultHasher,
8 hash::{Hash, Hasher},
9};
10
11use crate::traits::Tokenizer;
12
/// Compact identity summary of a tokenizer.
///
/// Two fingerprints compare equal only when all three components match. The
/// type derives `Eq` and `Hash`, so it can be used directly as a map or set key.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct TokenizerFingerprint {
    /// Vocabulary size as reported by the tokenizer.
    pub vocab_size: usize,
    /// Hash computed over a sampled subset of the tokenizer's vocabulary.
    pub vocab_hash: u64,
    /// Hash computed over the tokenizer's special-token configuration.
    pub special_tokens_hash: u64,
}
23
24impl TokenizerFingerprint {
25 pub fn from_tokenizer(tokenizer: &dyn Tokenizer) -> Self {
27 let vocab_size = tokenizer.vocab_size();
28 let vocab_hash = Self::compute_vocab_hash(tokenizer);
29 let special_tokens_hash = Self::compute_special_tokens_hash(tokenizer);
30
31 Self {
32 vocab_size,
33 vocab_hash,
34 special_tokens_hash,
35 }
36 }
37
38 fn compute_vocab_hash(tokenizer: &dyn Tokenizer) -> u64 {
40 let mut hasher = DefaultHasher::new();
41 let vocab_size = tokenizer.vocab_size();
42
43 let sample_size = vocab_size.min(1000);
45 let step = if sample_size > 0 {
46 vocab_size / sample_size
47 } else {
48 1
49 };
50
51 for i in (0..vocab_size).step_by(step.max(1)) {
52 if let Some(token) = tokenizer.id_to_token(i as u32) {
53 token.hash(&mut hasher);
54 }
55 }
56
57 hasher.finish()
58 }
59
60 fn compute_special_tokens_hash(tokenizer: &dyn Tokenizer) -> u64 {
62 let mut hasher = DefaultHasher::new();
63 let special_tokens = tokenizer.get_special_tokens();
64
65 special_tokens.bos_token.hash(&mut hasher);
66 special_tokens.eos_token.hash(&mut hasher);
67 special_tokens.unk_token.hash(&mut hasher);
68 special_tokens.sep_token.hash(&mut hasher);
69 special_tokens.pad_token.hash(&mut hasher);
70 special_tokens.cls_token.hash(&mut hasher);
71 special_tokens.mask_token.hash(&mut hasher);
72 special_tokens.additional_special_tokens.hash(&mut hasher);
73
74 hasher.finish()
75 }
76}
77
#[cfg(test)]
mod tests {
    use crate::{mock::MockTokenizer, *};

    /// Separately constructed mock tokenizers must yield equal fingerprints.
    #[test]
    fn test_fingerprint_equality() {
        let first = TokenizerFingerprint::from_tokenizer(&MockTokenizer::new());
        let second = TokenizerFingerprint::from_tokenizer(&MockTokenizer::new());

        assert_eq!(first, second);
    }

    /// Fingerprinting the same tokenizer twice is repeatable, and the recorded
    /// size matches what the tokenizer reports.
    #[test]
    fn test_fingerprint_consistency() {
        let mock = MockTokenizer::new();

        let initial = TokenizerFingerprint::from_tokenizer(&mock);
        let repeated = TokenizerFingerprint::from_tokenizer(&mock);

        assert_eq!(initial, repeated);
        assert_eq!(initial.vocab_size, mock.vocab_size());
    }
}