1use crate::core::config::EncodingMode;
2use crate::simd::variants::DictionaryMetadata;
3use std::collections::HashMap;
4
5const MAX_LOOKUP_TABLE_SIZE: usize = 256;
6
7#[derive(Debug, Clone)]
13pub struct Dictionary {
14 chars: Vec<char>,
15 char_to_index: HashMap<char, usize>,
16 lookup_table: Option<Box<[Option<usize>; 256]>>,
18 mode: EncodingMode,
19 padding: Option<char>,
20 start_codepoint: Option<u32>,
21}
22
23impl Dictionary {
24 pub fn builder() -> DictionaryBuilder {
37 DictionaryBuilder::new()
38 }
39
40 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
54 #[allow(deprecated)]
55 pub fn new(chars: Vec<char>) -> Result<Self, String> {
56 Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
57 }
58
59 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
77 #[allow(deprecated)]
78 pub fn new_with_mode(
79 chars: Vec<char>,
80 mode: EncodingMode,
81 padding: Option<char>,
82 ) -> Result<Self, String> {
83 Self::new_with_mode_and_range(chars, mode, padding, None)
84 }
85
86 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
103 pub fn new_with_mode_and_range(
104 chars: Vec<char>,
105 mode: EncodingMode,
106 padding: Option<char>,
107 start_codepoint: Option<u32>,
108 ) -> Result<Self, String> {
109 if mode == EncodingMode::ByteRange {
111 if let Some(start) = start_codepoint {
112 if let Some(end_codepoint) = start.checked_add(255) {
114 if std::char::from_u32(end_codepoint).is_none() {
115 return Err(format!(
116 "Invalid Unicode range: {}-{}",
117 start, end_codepoint
118 ));
119 }
120 for offset in 0..=255 {
122 if std::char::from_u32(start + offset).is_none() {
123 return Err(format!(
124 "Invalid Unicode codepoint in range: {}",
125 start + offset
126 ));
127 }
128 }
129 } else {
130 return Err("Start codepoint too high for 256-byte range".to_string());
131 }
132
133 return Ok(Dictionary {
134 chars: Vec::new(),
135 char_to_index: HashMap::new(),
136 lookup_table: None,
137 mode,
138 padding,
139 start_codepoint: Some(start),
140 });
141 } else {
142 return Err("ByteRange mode requires start_codepoint".to_string());
143 }
144 }
145
146 if chars.is_empty() {
147 return Err("Dictionary cannot be empty".to_string());
148 }
149
150 if mode == EncodingMode::Chunked {
152 let base = chars.len();
153 if !base.is_power_of_two() {
154 return Err(format!(
155 "Chunked mode requires power-of-two dictionary size, got {}",
156 base
157 ));
158 }
159 if base != 2
161 && base != 4
162 && base != 8
163 && base != 16
164 && base != 32
165 && base != 64
166 && base != 128
167 && base != 256
168 {
169 return Err(format!(
170 "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
171 base
172 ));
173 }
174 }
175
176 let mut char_to_index = HashMap::new();
178 for (i, &c) in chars.iter().enumerate() {
179 if char_to_index.insert(c, i).is_some() {
181 return Err(format!(
182 "Duplicate character in dictionary: '{}' (U+{:04X})",
183 c, c as u32
184 ));
185 }
186
187 if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
189 return Err(format!(
190 "Control character not allowed in dictionary: U+{:04X}",
191 c as u32
192 ));
193 }
194
195 if c.is_whitespace() {
197 return Err(format!(
198 "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
199 c, c as u32
200 ));
201 }
202 }
203
204 if let Some(pad) = padding {
206 if char_to_index.contains_key(&pad) {
207 return Err(format!(
208 "Padding character '{}' conflicts with dictionary characters",
209 pad
210 ));
211 }
212 if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
213 return Err(format!(
214 "Control character not allowed as padding: U+{:04X}",
215 pad as u32
216 ));
217 }
218 }
219
220 let lookup_table = if chars
222 .iter()
223 .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
224 {
225 let mut table = Box::new([None; 256]);
226 for (i, &c) in chars.iter().enumerate() {
227 table[c as usize] = Some(i);
228 }
229 Some(table)
230 } else {
231 None
232 };
233
234 Ok(Dictionary {
235 chars,
236 char_to_index,
237 lookup_table,
238 mode,
239 padding,
240 start_codepoint: None,
241 })
242 }
243
244 #[deprecated(
254 since = "0.1.0",
255 note = "Use Dictionary::builder().chars_from_str(s).build() instead"
256 )]
257 #[allow(deprecated, clippy::should_implement_trait)]
258 pub fn from_str(s: &str) -> Result<Self, String> {
259 let chars: Vec<char> = s.chars().collect();
260 Self::new(chars)
261 }
262
263 pub fn base(&self) -> usize {
267 match self.mode {
268 EncodingMode::ByteRange => 256,
269 _ => self.chars.len(),
270 }
271 }
272
273 pub fn mode(&self) -> &EncodingMode {
275 &self.mode
276 }
277
278 pub fn padding(&self) -> Option<char> {
280 self.padding
281 }
282
283 pub fn start_codepoint(&self) -> Option<u32> {
285 self.start_codepoint
286 }
287
288 pub fn encode_digit(&self, digit: usize) -> Option<char> {
292 match self.mode {
293 EncodingMode::ByteRange => {
294 if let Some(start) = self.start_codepoint
295 && digit < 256
296 {
297 return std::char::from_u32(start + digit as u32);
298 }
299 None
300 }
301 _ => self.chars.get(digit).copied(),
302 }
303 }
304
305 pub fn decode_char(&self, c: char) -> Option<usize> {
309 match self.mode {
310 EncodingMode::ByteRange => {
311 if let Some(start) = self.start_codepoint {
312 let codepoint = c as u32;
313 if codepoint >= start && codepoint < start + 256 {
314 return Some((codepoint - start) as usize);
315 }
316 }
317 None
318 }
319 _ => {
320 if let Some(ref table) = self.lookup_table {
322 let char_val = c as u32;
323 if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
324 return table[char_val as usize];
325 }
326 }
327 self.char_to_index.get(&c).copied()
329 }
330 }
331 }
332
333 pub fn simd_metadata(&self) -> DictionaryMetadata {
338 DictionaryMetadata::from_dictionary(self)
339 }
340
341 pub fn simd_available(&self) -> bool {
346 self.simd_metadata().simd_available()
347 }
348}
349
350#[derive(Debug, Default)]
363pub struct DictionaryBuilder {
364 chars: Option<Vec<char>>,
365 mode: Option<EncodingMode>,
366 padding: Option<char>,
367 start_codepoint: Option<u32>,
368}
369
370impl DictionaryBuilder {
371 pub fn new() -> Self {
373 Self {
374 chars: None,
375 mode: None,
376 padding: None,
377 start_codepoint: None,
378 }
379 }
380
381 pub fn chars(mut self, chars: Vec<char>) -> Self {
387 self.chars = Some(chars);
388 self
389 }
390
391 pub fn chars_from_str(mut self, s: &str) -> Self {
397 self.chars = Some(s.chars().collect());
398 self
399 }
400
401 pub fn mode(mut self, mode: EncodingMode) -> Self {
407 self.mode = Some(mode);
408 self
409 }
410
411 pub fn padding(mut self, padding: char) -> Self {
417 self.padding = Some(padding);
418 self
419 }
420
421 pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
427 self.start_codepoint = Some(start_codepoint);
428 self
429 }
430
431 #[allow(deprecated)]
440 pub fn build(self) -> Result<Dictionary, String> {
441 let mode = self.mode.unwrap_or(EncodingMode::BaseConversion);
442 let chars = self.chars.unwrap_or_default();
443
444 Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
445 }
446}
447
#[cfg(test)]
mod tests {
    use super::*;

    /// Produces `count` distinct characters that are neither whitespace nor
    /// control characters, so they pass per-character validation.
    fn distinct_chars(count: u32) -> Vec<char> {
        (0..count)
            .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
            .collect()
    }

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let chars = vec![];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // Three symbols: not a power of two.
        let chars = vec!['a', 'b', 'c'];
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::Chunked)
            .build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Every size the constructor documents as valid, including 128 and 256.
        for &size in &[2, 4, 8, 16, 32, 64, 128, 256] {
            let result = Dictionary::builder()
                .chars(distinct_chars(size))
                .mode(EncodingMode::Chunked)
                .build();
            assert!(result.is_ok(), "Size {} should be valid", size);
            assert_eq!(result.unwrap().base(), size as usize);
        }
    }

    #[test]
    fn test_chunked_mode_rejects_sizes_outside_2_to_256() {
        // Powers of two, but outside the supported range.
        for &size in &[1, 512] {
            let result = Dictionary::builder()
                .chars(distinct_chars(size))
                .mode(EncodingMode::Chunked)
                .build();
            assert!(result.is_err(), "Size {} should be rejected", size);
            assert!(result.unwrap_err().contains("dictionary size of"));
        }
    }

    #[test]
    fn test_control_character_rejection() {
        // NUL is a control character.
        let chars = vec!['a', 'b', '\x00', 'c'];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::BaseConversion)
            .padding('b')
            .build();
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::BaseConversion)
            .padding('=')
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 is past U+10FFFF, the last valid code point.
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x10FF80)
            .build();
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_rejects_surrogates() {
        // The range ends on a valid code point (U+E07F) but starts inside the
        // surrogate block, so the per-code-point check must catch it.
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0xDF80)
            .build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Invalid Unicode codepoint"));
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_byte_range_encode_decode() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.encode_digit(0), Some('\u{1F300}'));
        assert_eq!(dict.encode_digit(255), Some('\u{1F3FF}'));
        assert_eq!(dict.encode_digit(256), None);

        assert_eq!(dict.decode_char('\u{1F300}'), Some(0));
        assert_eq!(dict.decode_char('\u{1F3FF}'), Some(255));
        assert_eq!(dict.decode_char('\u{1F2FF}'), None);
        assert_eq!(dict.decode_char('\u{1F400}'), None);
    }

    #[test]
    fn test_encode_decode_round_trip() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        for digit in 0..dict.base() {
            let symbol = dict.encode_digit(digit).unwrap();
            assert_eq!(dict.decode_char(symbol), Some(digit));
        }
        assert_eq!(dict.encode_digit(16), None);
        assert_eq!(dict.decode_char('G'), None);
        // Below 256 but not in the alphabet, and above 256.
        assert_eq!(dict.decode_char('\u{E9}'), None);
        assert_eq!(dict.decode_char('\u{8A9E}'), None);
    }

    #[test]
    fn test_decode_non_latin_dictionary() {
        // Code points above 255, so decoding cannot use the 256-entry table.
        let dict = Dictionary::builder()
            .chars(vec!['\u{3B1}', '\u{3B2}', '\u{3B3}'])
            .build()
            .unwrap();

        assert_eq!(dict.decode_char('\u{3B1}'), Some(0));
        assert_eq!(dict.decode_char('\u{3B3}'), Some(2));
        assert_eq!(dict.decode_char('a'), None);
        assert_eq!(dict.encode_digit(1), Some('\u{3B2}'));
        assert_eq!(dict.encode_digit(3), None);
    }

    #[test]
    fn test_detailed_error_messages() {
        // The duplicate error should identify the offending character.
        let chars = vec!['a', 'b', 'a'];
        let err = Dictionary::builder().chars(chars).build().unwrap_err();
        assert!(err.contains("'a'") || err.contains("U+"));
    }

    #[test]
    fn test_builder_basic() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(dict.base(), 4);
        assert_eq!(dict.mode(), &EncodingMode::BaseConversion);
        assert_eq!(dict.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(dict.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let dict = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::ByteRange);
        assert_eq!(dict.start_codepoint(), Some(0x1F300));
        assert_eq!(dict.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let result = Dictionary::builder().chars(vec!['a', 'b', 'a']).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        // Three symbols: not a power of two.
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked)
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b')
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let dict = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.base(), 2);
        assert_eq!(dict.mode(), &EncodingMode::Chunked);
        assert_eq!(dict.padding(), Some('='));
    }
}