1use crate::core::config::EncodingMode;
2use crate::simd::variants::DictionaryMetadata;
3use std::collections::HashMap;
4
/// Number of slots in the direct-indexed decode table.
///
/// A dictionary gets such a table only when every one of its characters has
/// a code point strictly below this value, so a character's code point can
/// be used directly as the table index.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
6
7#[derive(Debug, Clone)]
13pub struct Dictionary {
14 chars: Vec<char>,
15 char_to_index: HashMap<char, usize>,
16 lookup_table: Option<Box<[Option<usize>; 256]>>,
18 mode: EncodingMode,
19 padding: Option<char>,
20 start_codepoint: Option<u32>,
21}
22
23impl Dictionary {
24 pub fn new(chars: Vec<char>) -> Result<Self, String> {
34 Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
35 }
36
37 pub fn new_with_mode(
51 chars: Vec<char>,
52 mode: EncodingMode,
53 padding: Option<char>,
54 ) -> Result<Self, String> {
55 Self::new_with_mode_and_range(chars, mode, padding, None)
56 }
57
58 pub fn new_with_mode_and_range(
71 chars: Vec<char>,
72 mode: EncodingMode,
73 padding: Option<char>,
74 start_codepoint: Option<u32>,
75 ) -> Result<Self, String> {
76 if mode == EncodingMode::ByteRange {
78 if let Some(start) = start_codepoint {
79 if let Some(end_codepoint) = start.checked_add(255) {
81 if std::char::from_u32(end_codepoint).is_none() {
82 return Err(format!(
83 "Invalid Unicode range: {}-{}",
84 start, end_codepoint
85 ));
86 }
87 for offset in 0..=255 {
89 if std::char::from_u32(start + offset).is_none() {
90 return Err(format!(
91 "Invalid Unicode codepoint in range: {}",
92 start + offset
93 ));
94 }
95 }
96 } else {
97 return Err("Start codepoint too high for 256-byte range".to_string());
98 }
99
100 return Ok(Dictionary {
101 chars: Vec::new(),
102 char_to_index: HashMap::new(),
103 lookup_table: None,
104 mode,
105 padding,
106 start_codepoint: Some(start),
107 });
108 } else {
109 return Err("ByteRange mode requires start_codepoint".to_string());
110 }
111 }
112
113 if chars.is_empty() {
114 return Err("Dictionary cannot be empty".to_string());
115 }
116
117 if mode == EncodingMode::Chunked {
119 let base = chars.len();
120 if !base.is_power_of_two() {
121 return Err(format!(
122 "Chunked mode requires power-of-two dictionary size, got {}",
123 base
124 ));
125 }
126 if base != 2
128 && base != 4
129 && base != 8
130 && base != 16
131 && base != 32
132 && base != 64
133 && base != 128
134 && base != 256
135 {
136 return Err(format!("Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
137 }
138 }
139
140 let mut char_to_index = HashMap::new();
142 for (i, &c) in chars.iter().enumerate() {
143 if char_to_index.insert(c, i).is_some() {
145 return Err(format!(
146 "Duplicate character in dictionary: '{}' (U+{:04X})",
147 c, c as u32
148 ));
149 }
150
151 if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
153 return Err(format!(
154 "Control character not allowed in dictionary: U+{:04X}",
155 c as u32
156 ));
157 }
158
159 if c.is_whitespace() {
161 return Err(format!(
162 "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
163 c, c as u32
164 ));
165 }
166 }
167
168 if let Some(pad) = padding {
170 if char_to_index.contains_key(&pad) {
171 return Err(format!(
172 "Padding character '{}' conflicts with dictionary characters",
173 pad
174 ));
175 }
176 if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
177 return Err(format!(
178 "Control character not allowed as padding: U+{:04X}",
179 pad as u32
180 ));
181 }
182 }
183
184 let lookup_table = if chars
186 .iter()
187 .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
188 {
189 let mut table = Box::new([None; 256]);
190 for (i, &c) in chars.iter().enumerate() {
191 table[c as usize] = Some(i);
192 }
193 Some(table)
194 } else {
195 None
196 };
197
198 Ok(Dictionary {
199 chars,
200 char_to_index,
201 lookup_table,
202 mode,
203 padding,
204 start_codepoint: None,
205 })
206 }
207
208 pub fn from_str(s: &str) -> Result<Self, String> {
214 let chars: Vec<char> = s.chars().collect();
215 Self::new(chars)
216 }
217
218 pub fn base(&self) -> usize {
222 match self.mode {
223 EncodingMode::ByteRange => 256,
224 _ => self.chars.len(),
225 }
226 }
227
228 pub fn mode(&self) -> &EncodingMode {
230 &self.mode
231 }
232
233 pub fn padding(&self) -> Option<char> {
235 self.padding
236 }
237
238 pub fn start_codepoint(&self) -> Option<u32> {
240 self.start_codepoint
241 }
242
243 pub fn encode_digit(&self, digit: usize) -> Option<char> {
247 match self.mode {
248 EncodingMode::ByteRange => {
249 if let Some(start) = self.start_codepoint {
250 if digit < 256 {
251 return std::char::from_u32(start + digit as u32);
252 }
253 }
254 None
255 }
256 _ => self.chars.get(digit).copied(),
257 }
258 }
259
260 pub fn decode_char(&self, c: char) -> Option<usize> {
264 match self.mode {
265 EncodingMode::ByteRange => {
266 if let Some(start) = self.start_codepoint {
267 let codepoint = c as u32;
268 if codepoint >= start && codepoint < start + 256 {
269 return Some((codepoint - start) as usize);
270 }
271 }
272 None
273 }
274 _ => {
275 if let Some(ref table) = self.lookup_table {
277 let char_val = c as u32;
278 if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
279 return table[char_val as usize];
280 }
281 }
282 self.char_to_index.get(&c).copied()
284 }
285 }
286 }
287
288 pub fn simd_metadata(&self) -> DictionaryMetadata {
293 DictionaryMetadata::from_dictionary(self)
294 }
295
296 pub fn simd_available(&self) -> bool {
301 self.simd_metadata().simd_available()
302 }
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `size` distinct characters that are neither control characters
    /// nor whitespace: runs of 26 code points starting at 'A', spaced 100
    /// apart so consecutive runs never overlap.
    fn distinct_chars(size: u32) -> Vec<char> {
        (0..size)
            .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
            .collect()
    }

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let chars = vec![];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // Three symbols is not a power of two.
        let chars = vec!['a', 'b', 'c'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Every size the constructor accepts, including the two largest.
        for &size in &[2, 4, 8, 16, 32, 64, 128, 256] {
            let result =
                Dictionary::new_with_mode(distinct_chars(size), EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_single_symbol() {
        // One is a power of two but is below the smallest supported size.
        let result = Dictionary::new_with_mode(vec!['a'], EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("dictionary size of 2, 4"));
    }

    #[test]
    fn test_control_character_rejection() {
        // NUL is a control character.
        let chars = vec!['a', 'b', '\x00', 'c'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('b'));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('='));
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 runs past U+10FFFF, the last Unicode code point.
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80),
        );
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Invalid Unicode range"));
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result =
            Dictionary::new_with_mode_and_range(Vec::new(), EncodingMode::ByteRange, None, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // The duplicate error names the character both literally and by code point.
        let chars = vec!['a', 'b', 'a'];
        let err = Dictionary::new(chars).unwrap_err();
        assert!(err.contains("'a'"));
        assert!(err.contains("U+0061"));
    }
}