st/tokenizer.rs

// Smart Tree Tokenizer - Turn patterns into bytes! 🗜️
// "Like 6502 opcodes - LDA is $A9, not 'LOAD ACCUMULATOR'" - Hue

use std::collections::HashMap;

/// Common patterns tokenized to single bytes
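///
/// Illustrative round-trip (a sketch; token values refer to the tables in
/// `new()` below):
/// ```ignore
/// let t = Tokenizer::new();
/// let tokens = t.tokenize("src/main.rs"); // → [0xC0, 0xA3]
/// assert_eq!(t.decode(&tokens), "src/main.rs");
/// ```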
pub struct Tokenizer {
    /// Pattern → Token mapping
    patterns: HashMap<String, u8>,
    /// Token → Pattern for decoding
    tokens: HashMap<u8, String>,
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer {
    pub fn new() -> Self {
        let mut t = Tokenizer {
            patterns: HashMap::new(),
            tokens: HashMap::new(),
        };

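        // Byte-space layout used in this file: 0x00-0x7F pass through as
        // literal ASCII, 0x80-0xCF are single-byte pattern tokens, and
        // 0xE0-0xEF are combo tokens (see QuantumTokenizer below).
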
        // Directory tokens (0x80-0x8F)
        t.add(0x80, "node_modules");
        t.add(0x81, ".git");
        t.add(0x82, "src");
        t.add(0x83, "target");
        t.add(0x84, "dist");
        t.add(0x85, "build");
        t.add(0x86, "docs");
        t.add(0x87, "tests");
        t.add(0x88, "examples");
        t.add(0x89, ".vscode");
        t.add(0x8A, ".github");

        // File extensions (0x90-0x9F)
        t.add(0x90, ".js");
        t.add(0x91, ".rs");
        t.add(0x92, ".py");
        t.add(0x93, ".ts");
        t.add(0x94, ".json");
        t.add(0x95, ".md");
        t.add(0x96, ".toml");
        t.add(0x97, ".yaml");
        t.add(0x98, ".tsx");
        t.add(0x99, ".jsx");
        t.add(0x9A, ".go");
        t.add(0x9B, ".java");
        t.add(0x9C, ".cpp");
        t.add(0x9D, ".c");
        t.add(0x9E, ".h");

        // Common filenames (0xA0-0xAF)
        t.add(0xA0, "README.md");
        t.add(0xA1, "package.json");
        t.add(0xA2, "Cargo.toml");
        t.add(0xA3, "main.rs");
        t.add(0xA4, "index.js");
        t.add(0xA5, "app.js");
        t.add(0xA6, ".gitignore");
        t.add(0xA7, "LICENSE");
        t.add(0xA8, "Makefile");
        t.add(0xA9, "Dockerfile");
        t.add(0xAA, "tsconfig.json");
        t.add(0xAB, "setup.py");
        t.add(0xAC, "go.mod");

        // Patterns (0xB0-0xBF)
        t.add(0xB0, "test_");
        t.add(0xB1, "_test");
        t.add(0xB2, ".min.");
        t.add(0xB3, ".spec.");
        t.add(0xB4, "TODO");
        t.add(0xB5, "FIXME");
        t.add(0xB6, "function");
        t.add(0xB7, "async");
        t.add(0xB8, "import");
        t.add(0xB9, "export");
        t.add(0xBA, "class");
        t.add(0xBB, "struct");
        t.add(0xBC, "impl");
        t.add(0xBD, "trait");

        // Common paths (0xC0-0xCF)
        t.add(0xC0, "src/");
        t.add(0xC1, "tests/");
        t.add(0xC2, "docs/");
        t.add(0xC3, "../");
        t.add(0xC4, "./");
        t.add(0xC5, "~/");

        t
    }

    fn add(&mut self, token: u8, pattern: &str) {
        self.patterns.insert(pattern.to_string(), token);
        self.tokens.insert(token, pattern.to_string());
    }

    /// Tokenize a string
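    ///
    /// Greedy longest-match: at each position the longest known pattern
    /// wins, so `"src/"` (0xC0) beats `"src"` (0x82). Unmatched characters
    /// are emitted as raw bytes.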
    pub fn tokenize(&self, text: &str) -> Vec<u8> {
        let mut result = Vec::new();
        let mut remaining = text;

        while !remaining.is_empty() {
            let mut found = false;

            // Try to match the longest pattern first; `get` returns None
            // off UTF-8 boundaries, so multi-byte text is skipped safely
            for len in (1..=remaining.len()).rev() {
                if let Some(chunk) = remaining.get(0..len) {
                    if let Some(&token) = self.patterns.get(chunk) {
                        result.push(token);
                        remaining = &remaining[len..];
                        found = true;
                        break;
                    }
                }
            }

            if !found {
                // No pattern matched: emit the character's bytes verbatim,
                // advancing by whole chars so slicing stays on UTF-8
                // boundaries. Non-ASCII bytes collide with the 0x80+ token
                // range, so lossless round-trips assume ASCII input.
                let ch_len = remaining.chars().next().map_or(1, |c| c.len_utf8());
                result.extend_from_slice(&remaining.as_bytes()[..ch_len]);
                remaining = &remaining[ch_len..];
            }
        }

        result
    }

    /// Decode tokens back to string
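    ///
    /// Known tokens expand to their pattern, bytes below 0x80 are literal
    /// ASCII, and anything else renders as a `<XX>` placeholder, e.g.
    /// `decode(&[0xC0, 0xA3])` → `"src/main.rs"`.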
    pub fn decode(&self, tokens: &[u8]) -> String {
        let mut result = String::new();

        for &token in tokens {
            if let Some(pattern) = self.tokens.get(&token) {
                result.push_str(pattern);
            } else if token < 128 {
                // ASCII character
                result.push(token as char);
            } else {
                // Unknown token
                result.push_str(&format!("<{:02X}>", token));
            }
        }

        result
    }

    /// Calculate compression ratio
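    ///
    /// Tokenized bytes divided by original bytes, so lower is better:
    /// "node_modules" (12 bytes) tokenizes to one byte, a ratio of ~0.083.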
    pub fn compression_ratio(&self, original: &str) -> f64 {
        let tokenized = self.tokenize(original);
        // .max(1) guards against dividing by zero on empty input
        tokenized.len() as f64 / original.len().max(1) as f64
    }
}

/// Quantum tokenizer - even more compression!
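///
/// Runs a second pass over the base tokens, folding known token sequences
/// into a single combo byte (0xE0-0xEF). Note that `Tokenizer::decode`
/// knows nothing about combos, so decoding quantum output would require
/// reversing the `combos` map (not implemented here).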
pub struct QuantumTokenizer {
    base: Tokenizer,
    /// Multi-pattern combinations
    combos: HashMap<Vec<u8>, u8>,
}

impl Default for QuantumTokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl QuantumTokenizer {
    pub fn new() -> Self {
        let mut qt = QuantumTokenizer {
            base: Tokenizer::new(),
            combos: HashMap::new(),
        };

        // Common combinations (0xE0-0xEF)
        qt.add_combo(0xE0, &[0x82, 0xC0]); // "src" + "src/"
        qt.add_combo(0xE1, &[0x91, 0xA3]); // ".rs" + "main.rs"
        qt.add_combo(0xE2, &[0x90, 0xA4]); // ".js" + "index.js"
        qt.add_combo(0xE3, &[0x80, 0xC4]); // "node_modules" + "./"

        qt
    }

    fn add_combo(&mut self, token: u8, pattern: &[u8]) {
        self.combos.insert(pattern.to_vec(), token);
    }

    pub fn quantum_tokenize(&self, text: &str) -> Vec<u8> {
        let tokens = self.base.tokenize(text);

        // Second pass: combine tokens
        let mut result = Vec::new();
        let mut i = 0;

        while i < tokens.len() {
            let mut found = false;

            // Try to match combo patterns
            for len in (2..=4).rev() {
                if i + len <= tokens.len() {
                    if let Some(&combo_token) = self.combos.get(&tokens[i..i + len]) {
                        result.push(combo_token);
                        i += len;
                        found = true;
                        break;
                    }
                }
            }

            if !found {
                result.push(tokens[i]);
                i += 1;
            }
        }

        result
    }
}

/// Statistics for tokenization
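///
/// Example `display()` output for "node_modules/package.json"
/// (illustrative): `Tokenization: 25 → 3 bytes (12.0% ratio), 2 patterns`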
pub struct TokenStats {
    pub original_size: usize,
    pub tokenized_size: usize,
    pub compression_ratio: f64,
    pub patterns_found: usize,
}

impl TokenStats {
    pub fn calculate(original: &str, tokenizer: &Tokenizer) -> Self {
        let tokens = tokenizer.tokenize(original);
        let patterns_found = tokens.iter().filter(|&&t| t >= 0x80).count();

        TokenStats {
            original_size: original.len(),
            tokenized_size: tokens.len(),
            // .max(1) guards against dividing by zero on empty input
            compression_ratio: tokens.len() as f64 / original.len().max(1) as f64,
            patterns_found,
        }
    }

    pub fn display(&self) -> String {
        format!(
            "Tokenization: {} → {} bytes ({:.1}% ratio), {} patterns",
            self.original_size,
            self.tokenized_size,
            self.compression_ratio * 100.0,
            self.patterns_found
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tokenization() {
        let t = Tokenizer::new();

        // Test directory tokenization
        let tokens = t.tokenize("node_modules");
        assert_eq!(tokens, vec![0x80]);

        // Test decoding
        let decoded = t.decode(&tokens);
        assert_eq!(decoded, "node_modules");
    }

    #[test]
    fn test_path_tokenization() {
        let t = Tokenizer::new();

        let original = "src/main.rs";
        let tokens = t.tokenize(original);
        assert!(tokens.len() < original.len());

        let decoded = t.decode(&tokens);
        assert_eq!(decoded, original);
    }

    #[test]
    fn test_compression_ratio() {
        let t = Tokenizer::new();

        let text = "node_modules/package.json";
        let ratio = t.compression_ratio(text);
        assert!(ratio < 0.5); // Should compress to less than 50%
    }

    #[test]
    fn test_quantum_tokenization() {
        let qt = QuantumTokenizer::new();

        let text = "src/main.rs";
        let tokens = qt.quantum_tokenize(text);
        assert!(tokens.len() <= 3); // Should be highly compressed
    }
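
    // Illustrative check: text with no known patterns should pass through
    // as raw ASCII bytes and round-trip exactly.
    #[test]
    fn test_raw_ascii_roundtrip() {
        let t = Tokenizer::new();
        let original = "hello_world.xyz";
        assert_eq!(t.decode(&t.tokenize(original)), original);
    }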
}