1use std::collections::HashMap;
5
/// Dictionary-based tokenizer that replaces common filesystem / source-code
/// substrings with single bytes in the 0x80..=0xC5 range.
///
/// ASCII bytes (< 0x80) pass through unchanged, so tokenized ASCII text
/// round-trips exactly through `decode`. The scheme is only lossless for
/// ASCII input: raw UTF-8 continuation bytes (>= 0x80) can collide with
/// token values on decode.
pub struct Tokenizer {
    /// pattern string -> token byte (used when encoding)
    patterns: HashMap<String, u8>,
    /// token byte -> pattern string (used when decoding)
    tokens: HashMap<u8, String>,
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer {
    /// Creates a tokenizer pre-loaded with the built-in pattern dictionary.
    pub fn new() -> Self {
        let mut t = Tokenizer {
            patterns: HashMap::new(),
            tokens: HashMap::new(),
        };

        // 0x80..: common directory names.
        t.add(0x80, "node_modules");
        t.add(0x81, ".git");
        t.add(0x82, "src");
        t.add(0x83, "target");
        t.add(0x84, "dist");
        t.add(0x85, "build");
        t.add(0x86, "docs");
        t.add(0x87, "tests");
        t.add(0x88, "examples");
        t.add(0x89, ".vscode");
        t.add(0x8A, ".github");

        // 0x90..: file extensions.
        t.add(0x90, ".js");
        t.add(0x91, ".rs");
        t.add(0x92, ".py");
        t.add(0x93, ".ts");
        t.add(0x94, ".json");
        t.add(0x95, ".md");
        t.add(0x96, ".toml");
        t.add(0x97, ".yaml");
        t.add(0x98, ".tsx");
        t.add(0x99, ".jsx");
        t.add(0x9A, ".go");
        t.add(0x9B, ".java");
        t.add(0x9C, ".cpp");
        t.add(0x9D, ".c");
        t.add(0x9E, ".h");

        // 0xA0..: well-known file names.
        t.add(0xA0, "README.md");
        t.add(0xA1, "package.json");
        t.add(0xA2, "Cargo.toml");
        t.add(0xA3, "main.rs");
        t.add(0xA4, "index.js");
        t.add(0xA5, "app.js");
        t.add(0xA6, ".gitignore");
        t.add(0xA7, "LICENSE");
        t.add(0xA8, "Makefile");
        t.add(0xA9, "Dockerfile");
        t.add(0xAA, "tsconfig.json");
        t.add(0xAB, "setup.py");
        t.add(0xAC, "go.mod");

        // 0xB0..: frequent code fragments and markers.
        t.add(0xB0, "test_");
        t.add(0xB1, "_test");
        t.add(0xB2, ".min.");
        t.add(0xB3, ".spec.");
        t.add(0xB4, "TODO");
        t.add(0xB5, "FIXME");
        t.add(0xB6, "function");
        t.add(0xB7, "async");
        t.add(0xB8, "import");
        t.add(0xB9, "export");
        t.add(0xBA, "class");
        t.add(0xBB, "struct");
        t.add(0xBC, "impl");
        t.add(0xBD, "trait");

        // 0xC0..: path prefixes.
        t.add(0xC0, "src/");
        t.add(0xC1, "tests/");
        t.add(0xC2, "docs/");
        t.add(0xC3, "../");
        t.add(0xC4, "./");
        t.add(0xC5, "~/");

        t
    }

    /// Registers `pattern` <-> `token` in both lookup directions.
    fn add(&mut self, token: u8, pattern: &str) {
        self.patterns.insert(pattern.to_string(), token);
        self.tokens.insert(token, pattern.to_string());
    }

    /// Greedily encodes `text`, always preferring the longest dictionary
    /// pattern at the current position. Characters that match no pattern
    /// are copied through as their raw UTF-8 bytes.
    ///
    /// NOTE(review): non-ASCII input no longer panics (the previous code
    /// sliced at byte offset 1, which panics mid-character), but its bytes
    /// >= 0x80 can still collide with token values on decode — the encoding
    /// is only lossless for ASCII text.
    pub fn tokenize(&self, text: &str) -> Vec<u8> {
        let mut result = Vec::new();
        let mut remaining = text;

        // Longest pattern in the dictionary; bounds the match search so we
        // never probe candidate lengths no pattern could have (the original
        // scanned every length up to remaining.len()).
        let max_pattern = self.patterns.keys().map(String::len).max().unwrap_or(0);

        while !remaining.is_empty() {
            let mut found = false;

            for len in (1..=remaining.len().min(max_pattern)).rev() {
                // `get` returns None when `len` splits a multi-byte
                // character, safely skipping that candidate length.
                if let Some(chunk) = remaining.get(0..len) {
                    if let Some(&token) = self.patterns.get(chunk) {
                        result.push(token);
                        remaining = &remaining[len..];
                        found = true;
                        break;
                    }
                }
            }

            if !found {
                // No pattern matched: emit the first character's raw UTF-8
                // bytes and advance by its full width (always a valid char
                // boundary, so this never panics).
                let width = remaining
                    .chars()
                    .next()
                    .map(char::len_utf8)
                    .unwrap_or(1);
                result.extend_from_slice(remaining[..width].as_bytes());
                remaining = &remaining[width..];
            }
        }

        result
    }

    /// Decodes a token stream produced by `tokenize`.
    ///
    /// Bytes below 0x80 are emitted as ASCII; unassigned high bytes are
    /// rendered as "<XX>" placeholders so corruption is visible rather than
    /// silent.
    pub fn decode(&self, tokens: &[u8]) -> String {
        let mut result = String::new();

        for &token in tokens {
            if let Some(pattern) = self.tokens.get(&token) {
                result.push_str(pattern);
            } else if token < 128 {
                // Plain ASCII byte that passed through tokenize unchanged.
                result.push(token as char);
            } else {
                result.push_str(&format!("<{:02X}>", token));
            }
        }

        result
    }

    /// Ratio of tokenized size to original size (smaller is better).
    ///
    /// Returns 1.0 (no compression) for empty input instead of the NaN the
    /// previous 0/0 division produced.
    pub fn compression_ratio(&self, original: &str) -> f64 {
        if original.is_empty() {
            return 1.0;
        }
        let tokenized = self.tokenize(original);
        tokenized.len() as f64 / original.len() as f64
    }
}
159
160pub struct QuantumTokenizer {
162 base: Tokenizer,
163 combos: HashMap<Vec<u8>, u8>,
165}
166
167impl Default for QuantumTokenizer {
168 fn default() -> Self {
169 Self::new()
170 }
171}
172
173impl QuantumTokenizer {
174 pub fn new() -> Self {
175 let mut qt = QuantumTokenizer {
176 base: Tokenizer::new(),
177 combos: HashMap::new(),
178 };
179
180 qt.add_combo(0xE0, &[0x82, 0xC0]); qt.add_combo(0xE1, &[0x91, 0xA3]); qt.add_combo(0xE2, &[0x90, 0xA4]); qt.add_combo(0xE3, &[0x80, 0xC4]); qt
187 }
188
189 fn add_combo(&mut self, token: u8, pattern: &[u8]) {
190 self.combos.insert(pattern.to_vec(), token);
191 }
192
193 pub fn quantum_tokenize(&self, text: &str) -> Vec<u8> {
194 let tokens = self.base.tokenize(text);
195
196 let mut result = Vec::new();
198 let mut i = 0;
199
200 while i < tokens.len() {
201 let mut found = false;
202
203 for len in (2..=4).rev() {
205 if i + len <= tokens.len() {
206 if let Some(&combo_token) = self.combos.get(&tokens[i..i + len]) {
207 result.push(combo_token);
208 i += len;
209 found = true;
210 break;
211 }
212 }
213 }
214
215 if !found {
216 result.push(tokens[i]);
217 i += 1;
218 }
219 }
220
221 result
222 }
223}
224
225pub struct TokenStats {
227 pub original_size: usize,
228 pub tokenized_size: usize,
229 pub compression_ratio: f64,
230 pub patterns_found: usize,
231}
232
233impl TokenStats {
234 pub fn calculate(original: &str, tokenizer: &Tokenizer) -> Self {
235 let tokens = tokenizer.tokenize(original);
236 let patterns_found = tokens.iter().filter(|&&t| t >= 0x80).count();
237
238 TokenStats {
239 original_size: original.len(),
240 tokenized_size: tokens.len(),
241 compression_ratio: tokens.len() as f64 / original.len() as f64,
242 patterns_found,
243 }
244 }
245
246 pub fn display(&self) -> String {
247 format!(
248 "Tokenization: {} → {} bytes ({:.1}% ratio), {} patterns",
249 self.original_size,
250 self.tokenized_size,
251 self.compression_ratio * 100.0,
252 self.patterns_found
253 )
254 }
255}
256
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_tokenization() {
        // A single dictionary entry must encode to one byte and round-trip.
        let tokenizer = Tokenizer::new();
        let encoded = tokenizer.tokenize("node_modules");
        assert_eq!(encoded, vec![0x80]);
        assert_eq!(tokenizer.decode(&encoded), "node_modules");
    }

    #[test]
    fn test_path_tokenization() {
        // A common source path should both shrink and round-trip exactly.
        let tokenizer = Tokenizer::new();
        let input = "src/main.rs";
        let encoded = tokenizer.tokenize(input);
        assert!(encoded.len() < input.len());
        assert_eq!(tokenizer.decode(&encoded), input);
    }

    #[test]
    fn test_compression_ratio() {
        // Two dictionary hits out of 25 bytes keeps the ratio well under 0.5.
        let tokenizer = Tokenizer::new();
        assert!(tokenizer.compression_ratio("node_modules/package.json") < 0.5);
    }

    #[test]
    fn test_quantum_tokenization() {
        // The combo pass must not expand the base encoding of "src/main.rs".
        let quantum = QuantumTokenizer::new();
        assert!(quantum.quantum_tokenize("src/main.rs").len() <= 3);
    }
}