tekken/lib.rs
1//! # Tekken - Rust Implementation of Mistral's Multimodal Tokenizer
2//!
3//! `tekken` is a Rust implementation of Mistral's Tekken tokenizer with full support
4//! for both text and audio tokenization. It provides high-performance, memory-safe
5//! tokenization that is fully compatible with the Python implementation.
6//!
7//! ## Features
8//!
9//! - **Text Tokenization**: Full BPE (Byte Pair Encoding) support with special tokens
10//! - **Audio Processing**: Convert audio waveforms to token sequences using mel-scale spectrograms
11//! - **Multimodal Support**: Mix text and audio tokens in a single sequence
12//! - **Version Compatibility**: Support for multiple tokenizer versions (V3, V7, V11, V13)
13//! - **Special Tokens**: Comprehensive handling of control, instruction, tool, and media tokens
14//!
15//! ## Quick Start
16//!
17//! ### Basic Text Tokenization
18//!
19//! ```rust,no_run
20//! use tekken::{Tekkenizer, SpecialTokenPolicy};
21//!
22//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
23//! // Load tokenizer from configuration file
24//! let tokenizer = Tekkenizer::from_file("tekken.json")?;
25//!
26//! // Encode text with BOS/EOS tokens
27//! let text = "Hello, world!";
28//! let tokens = tokenizer.encode(text, true, true)?;
29//! println!("Tokens: {:?}", tokens);
30//!
31//! // Decode back to text
32//! let decoded = tokenizer.decode(&tokens, SpecialTokenPolicy::Keep)?;
33//! println!("Decoded: {}", decoded);
34//! # Ok(())
35//! # }
36//! ```
37//!
38//! ### Audio Tokenization
39//!
40//! ```rust,no_run
41//! use tekken::{Audio, AudioConfig, AudioSpectrogramConfig, AudioEncoder};
42//!
43//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
44//! // Load audio file
45//! let audio = Audio::from_file("audio.wav")?;
46//!
47//! // Configure audio processing
48//! let spectrogram_config = AudioSpectrogramConfig::new(80, 160, 400)?;
49//! let audio_config = AudioConfig::new(16000, 12.5, spectrogram_config, None)?;
50//!
51//! // Create encoder and process audio
52//! let encoder = AudioEncoder::new(audio_config, 1000, 1001); // 1000 = audio_token_id, 1001 = begin_audio_token_id (example IDs)
53//! let encoding = encoder.encode(audio)?;
54//!
55//! println!("Audio encoded to {} tokens", encoding.tokens.len());
56//! # Ok(())
57//! # }
58//! ```
59//!
60//! ### Multimodal Tokenization
61//!
62//! ```rust,no_run
63//! use tekken::{Tekkenizer, Audio, SpecialTokenPolicy};
64//!
65//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
66//! let tokenizer = Tekkenizer::from_file("tekken.json")?;
67//!
68//! // Text tokens
69//! let text_tokens = tokenizer.encode("Please transcribe this audio:", true, false)?;
70//!
71//! // Audio tokens (if tokenizer has audio support)
72//! if tokenizer.has_audio_support() {
73//!     let audio = Audio::from_file("speech.wav")?;
74//!     let audio_encoding = tokenizer.encode_audio(audio)?;
75//!
76//!     // Combine text and audio tokens
77//!     let mut combined_tokens = text_tokens;
78//!     combined_tokens.extend(audio_encoding.tokens);
79//!
80//!     println!("Combined sequence: {} tokens", combined_tokens.len());
81//! }
82//! # Ok(())
83//! # }
84//! ```
85//!
86//! ## Architecture
87//!
88//! The library is organized into several modules:
89//!
90//! - [`tekkenizer`]: Main tokenizer implementation and text processing
91//! - [`audio`]: Audio processing, mel-scale spectrograms, and audio tokenization
92//! - [`special_tokens`]: Special token definitions and handling policies
93//! - [`config`]: Configuration structures and version management
94//! - [`errors`]: Comprehensive error handling
95//!
96//! ## Compatibility
97//!
98//! This Rust implementation is designed to be fully compatible with Mistral's Python
99//! tokenizer implementation:
100//!
101//! - Identical tokenization results for text
102//! - Same audio processing pipeline and token generation
103//! - Compatible special token handling
104//! - Matching mel filter bank computations
105//!
106//! ## Performance
107//!
108//! The Rust implementation provides significant performance improvements over Python:
109//!
110//! - Memory-safe processing with zero-copy operations where possible
111//! - Efficient audio processing with optimized mel-scale computations
112//! - Fast BPE tokenization using proven algorithms
113//! - Minimal allocations and efficient data structures
114
115pub mod audio; // Audio processing, mel-scale spectrograms, and audio tokenization
116pub mod config; // Configuration structures and version management
117pub mod errors; // Error handling; source of the re-exported `Result` and `TokenizerError`
118pub mod special_tokens; // Special token definitions and handling policies
119pub mod tekkenizer; // Main tokenizer implementation and text processing
120
121// Re-export commonly used types at the crate root so callers can write e.g. `use tekken::Tekkenizer;`
122pub use audio::{Audio, AudioConfig, AudioEncoder, AudioSpectrogramConfig};
123pub use config::{TekkenConfig, TokenInfo};
124pub use errors::{Result, TokenizerError}; // NOTE(review): under `use tekken::*` this `Result` shadows the prelude `Result` — presumably a crate-specific alias; confirm
125pub use special_tokens::SpecialTokenInfo; // same module as the next line; kept as a separate `use`
126pub use special_tokens::{SpecialTokenPolicy, SpecialTokens};
127pub use tekkenizer::Tekkenizer;