llm-tokenizer 1.3.1

LLM tokenizer library with caching and chat template support
Documentation
//! Tokenizer Fingerprinting for Cache Invalidation
//!
//! Creates a unique fingerprint of a tokenizer's configuration to detect
//! when the tokenizer has changed and the cache needs to be cleared.

use std::{
    collections::hash_map::DefaultHasher,
    hash::{Hash, Hasher},
};

use crate::traits::Tokenizer;

/// A fingerprint of a tokenizer's configuration
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenizerFingerprint {
    /// Size of the vocabulary
    pub vocab_size: usize,
    /// Hash of a sample of vocabulary tokens (for speed)
    pub vocab_hash: u64,
    /// Hash of special tokens
    pub special_tokens_hash: u64,
}

impl TokenizerFingerprint {
    /// Create a fingerprint from a tokenizer
    pub fn from_tokenizer(tokenizer: &dyn Tokenizer) -> Self {
        let vocab_size = tokenizer.vocab_size();
        let vocab_hash = Self::compute_vocab_hash(tokenizer);
        let special_tokens_hash = Self::compute_special_tokens_hash(tokenizer);

        Self {
            vocab_size,
            vocab_hash,
            special_tokens_hash,
        }
    }

    /// Compute a hash of the vocabulary by sampling tokens
    fn compute_vocab_hash(tokenizer: &dyn Tokenizer) -> u64 {
        let mut hasher = DefaultHasher::new();
        let vocab_size = tokenizer.vocab_size();

        // Sample up to 1000 tokens for speed
        let sample_size = vocab_size.min(1000);
        let step = if sample_size > 0 {
            vocab_size / sample_size
        } else {
            1
        };

        for i in (0..vocab_size).step_by(step.max(1)) {
            if let Some(token) = tokenizer.id_to_token(i as u32) {
                token.hash(&mut hasher);
            }
        }

        hasher.finish()
    }

    /// Compute a hash of special tokens
    fn compute_special_tokens_hash(tokenizer: &dyn Tokenizer) -> u64 {
        let mut hasher = DefaultHasher::new();
        let special_tokens = tokenizer.get_special_tokens();

        special_tokens.bos_token.hash(&mut hasher);
        special_tokens.eos_token.hash(&mut hasher);
        special_tokens.unk_token.hash(&mut hasher);
        special_tokens.sep_token.hash(&mut hasher);
        special_tokens.pad_token.hash(&mut hasher);
        special_tokens.cls_token.hash(&mut hasher);
        special_tokens.mask_token.hash(&mut hasher);
        special_tokens.additional_special_tokens.hash(&mut hasher);

        hasher.finish()
    }
}

#[cfg(test)]
mod tests {
    use crate::{mock::MockTokenizer, *};

    #[test]
    fn test_fingerprint_equality() {
        let tokenizer1 = MockTokenizer::new();
        let tokenizer2 = MockTokenizer::new();

        let fp1 = TokenizerFingerprint::from_tokenizer(&tokenizer1);
        let fp2 = TokenizerFingerprint::from_tokenizer(&tokenizer2);

        // Same tokenizer config should produce same fingerprint
        assert_eq!(fp1, fp2);
    }

    #[test]
    fn test_fingerprint_consistency() {
        let tokenizer = MockTokenizer::new();

        let fp1 = TokenizerFingerprint::from_tokenizer(&tokenizer);
        let fp2 = TokenizerFingerprint::from_tokenizer(&tokenizer);

        // Fingerprint should be consistent
        assert_eq!(fp1, fp2);
        assert_eq!(fp1.vocab_size, tokenizer.vocab_size());
    }
}