// kizzasi_tokenizer/lib.rs

1//! # kizzasi-tokenizer
2//!
3//! Signal quantization and tokenization for Kizzasi AGSP.
4//!
5//! This crate provides methods for converting continuous signals into
6//! representations suitable for autoregressive prediction:
7//!
8//! - **Continuous Embedding**: Direct float-to-latent projection (no discretization)
9//! - **VQ-VAE**: Vector Quantized embeddings with learned codebook
10//! - **μ-law**: Logarithmic quantization for audio signals
11//! - **Linear Quantization**: Simple uniform quantization
12//!
13//! ## AGSP Philosophy
14//!
15//! Unlike LLMs that tokenize text into discrete vocabulary indices,
16//! AGSP models can work with continuous signals directly. However,
17//! discretization can still be useful for:
18//!
19//! - Reducing model complexity
20//! - Enabling cross-modal transfer
21//! - Improving training stability
22//!
23//! ## COOLJAPAN Ecosystem
24//!
25//! This crate follows KIZZASI_POLICY.md and uses `scirs2-core` for all
26//! array and numerical operations.
27
28pub mod advanced_features;
29pub mod advanced_quant;
30pub mod batch;
31pub mod compat;
32mod continuous;
33pub mod domain_specific;
34pub mod enhanced_multiscale;
35pub mod entropy;
36mod error;
37pub mod gpu_quant;
38pub mod metrics;
39mod mulaw;
40mod multiscale;
41#[cfg(feature = "vqvae")]
42pub mod neural_codec;
43pub mod persistence;
44pub mod pretraining;
45pub mod profiling;
46mod quantizer;
47pub mod serde_utils;
48pub mod simd_quant;
49pub mod specialized;
50pub mod transformer;
51pub mod types;
52pub mod utils;
53
54#[cfg(feature = "vqvae")]
55pub mod vqvae;
56
57pub use continuous::{
58    ContinuousTokenizer, ReconstructionMetrics, TrainableContinuousTokenizer, TrainingConfig,
59};
60pub use error::{TokenizerError, TokenizerResult};
61pub use mulaw::MuLawCodec;
62pub use multiscale::{
63    MultiScaleTokenizer, PoolMethod, PyramidTokenizer, ScaleLevel, UpsampleMethod,
64};
65pub use quantizer::{LinearQuantizer, Quantizer};
66
67// Re-export advanced quantizers
68pub use advanced_quant::{AdaptiveQuantizer, DeadZoneQuantizer, NonUniformQuantizer};
69
70#[cfg(feature = "vqvae")]
71pub use vqvae::{
72    ProductQuantizer, ProductQuantizerConfig, RVQVAETokenizer, ResidualVQ, VQConfig,
73    VQVAETokenizer, VectorQuantizer,
74};
75
76// Re-export batch types
77pub use batch::{BatchTokenizer, StreamingTokenizer};
78
79// Re-export entropy coding
80pub use entropy::{
81    compression_ratio, compute_frequencies, ArithmeticDecoder, ArithmeticEncoder,
82    BitrateController, HuffmanDecoder, HuffmanEncoder, RangeDecoder, RangeEncoder,
83};
84
85// Re-export persistence types
86pub use persistence::{load_config, save_config, ModelCheckpoint, ModelMetadata, ModelVersion};
87
88// Re-export specialized tokenizers
89pub use specialized::{
90    DCTConfig, DCTTokenizer, FourierConfig, FourierTokenizer, KMeansConfig, KMeansTokenizer,
91    WaveletConfig, WaveletFamily, WaveletTokenizer,
92};
93
94// Re-export advanced features
95pub use advanced_features::{
96    add_batch_jitter, add_jitter, apply_batch_token_dropout, apply_temporal_coherence,
97    apply_token_dropout, HierarchicalConfig, HierarchicalTokenizer, JitterConfig,
98    TemporalCoherenceConfig, TemporalFilterType, TokenDropoutConfig,
99};
100
101// Re-export compatibility types
102pub use compat::{AudioMetadata, DType, ModelConfig, OnnxConfig, PyTorchCompat, TensorInfo};
103
104// Re-export neural codec types
105#[cfg(feature = "vqvae")]
106pub use neural_codec::{NeuralCodec, NeuralCodecConfig};
107
108// Re-export domain-specific tokenizers
109pub use domain_specific::{
110    EnvironmentalTokenizer, EnvironmentalTokenizerConfig, MusicTokenizer, MusicTokenizerConfig,
111    SpeechTokenizer, SpeechTokenizerConfig,
112};
113
114// Re-export transformer tokenizer
115pub use transformer::{
116    FeedForward, LayerNorm, MultiHeadAttention, PositionalEncoding, TransformerConfig,
117    TransformerEncoderLayer, TransformerTokenizer,
118};
119
120// Re-export pre-training utilities
121pub use pretraining::{
122    ContrastiveConfig, ContrastiveLearning, MSMConfig, MaskedSignalModeling, TemporalPrediction,
123    TemporalPredictionConfig,
124};
125
126// Re-export profiling utilities
127pub use profiling::{
128    AllocationEvent, EventType, MemoryProfiler, MemorySnapshot, ProfileScope, ScopeStats,
129    TimelineAnalyzer,
130};
131
132// Re-export core types
133pub use scirs2_core::ndarray::{Array1, Array2};
134
/// Common interface for signal tokenization.
///
/// Implementors convert a 1-D `f32` signal into a token/embedding
/// representation (`encode`) and map that representation back to a
/// continuous signal (`decode`). Both continuous and discrete
/// tokenizers share this interface; discrete tokenizers report a
/// non-zero `vocab_size`, continuous ones report 0.
pub trait SignalTokenizer {
    /// Encode a continuous signal into tokens/embeddings.
    ///
    /// # Errors
    /// Returns a `TokenizerError` on failure. Exact failure modes are
    /// implementor-specific — not visible from this trait definition.
    fn encode(&self, signal: &Array1<f32>) -> TokenizerResult<Array1<f32>>;

    /// Decode tokens/embeddings back to a continuous signal.
    ///
    /// # Errors
    /// Returns a `TokenizerError` on failure (implementor-specific).
    // NOTE(review): round-trip fidelity (decode(encode(x)) ≈ x) is
    // presumably lossy for quantizing implementations — confirm per impl.
    fn decode(&self, tokens: &Array1<f32>) -> TokenizerResult<Array1<f32>>;

    /// Get the embedding dimension of the encoded representation.
    fn embed_dim(&self) -> usize;

    /// Get the vocabulary size (for discrete tokenizers; 0 for continuous)
    fn vocab_size(&self) -> usize;
}
149
/// Configuration for tokenizer selection.
///
/// Serializable via `serde` so the chosen tokenizer and its
/// hyperparameters can be persisted (e.g. alongside model checkpoints).
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub enum TokenizerType {
    /// Direct continuous embedding (no discretization);
    /// `embed_dim` is the latent dimensionality.
    Continuous { embed_dim: usize },
    /// μ-law companding (8-bit or 16-bit); `bits` selects the precision.
    MuLaw { bits: u8 },
    /// Linear uniform quantization with `bits` bits per sample over the
    /// input range from `min` to `max`.
    Linear { bits: u8, min: f32, max: f32 },
    /// Vector quantized (VQ-VAE style): a learned codebook of
    /// `codebook_size` entries, each of dimension `embed_dim`.
    VectorQuantized {
        codebook_size: usize,
        embed_dim: usize,
    },
    /// Multi-scale hierarchical tokenization across `num_levels`
    /// resolutions with `embed_dim_per_level` dimensions at each level.
    MultiScale {
        embed_dim_per_level: usize,
        num_levels: usize,
    },
    /// Pyramid tokenization with residual encoding between levels.
    Pyramid {
        embed_dim_per_level: usize,
        num_levels: usize,
    },
}
175
176impl Default for TokenizerType {
177    fn default() -> Self {
178        TokenizerType::Continuous { embed_dim: 256 }
179    }
180}
181
#[cfg(test)]
mod tests {
    use super::*;

    /// The default tokenizer type must be `Continuous` with a
    /// 256-dimensional embedding.
    #[test]
    fn test_tokenizer_type_default() {
        let default_type = TokenizerType::default();
        assert!(
            matches!(default_type, TokenizerType::Continuous { embed_dim: 256 }),
            "Expected Continuous tokenizer"
        );
    }
}