1use crate::core::config::EncodingMode;
2#[cfg(feature = "simd")]
3use crate::simd::variants::DictionaryMetadata;
4use std::collections::HashMap;
5
6const MAX_LOOKUP_TABLE_SIZE: usize = 256;
7
/// Reports whether the 256 consecutive codepoints beginning at `start` are
/// acceptable for ByteRange encoding.
///
/// The window `start..=start + 255` is rejected when any of these holds:
///
/// * `start` is below U+00A0, so the window would begin in the range that
///   contains NUL and the C1 controls (U+0080-U+009F);
/// * `start + 255` does not fit in a `u32` or lies above U+10FFFF;
/// * the window overlaps the surrogate block U+D800-U+DFFF.
///
/// Returns `true` only when none of the above applies.
pub fn is_safe_byte_range(start: u32) -> bool {
    const LOWEST_ALLOWED_START: u32 = 0x00A0;
    const HIGHEST_CODEPOINT: u32 = 0x10FFFF;
    const SURROGATE_FIRST: u32 = 0xD800;
    const SURROGATE_LAST: u32 = 0xDFFF;

    match start.checked_add(255) {
        // `start + 255` overflowed, so the window cannot be valid Unicode.
        None => false,
        Some(last) => {
            let starts_too_low = start < LOWEST_ALLOWED_START;
            let ends_too_high = last > HIGHEST_CODEPOINT;
            // Two closed intervals intersect when each begins before the other ends.
            let touches_surrogates = start <= SURROGATE_LAST && last >= SURROGATE_FIRST;

            !(starts_too_low || ends_too_high || touches_surrogates)
        }
    }
}
47
48#[derive(Debug, Clone)]
54pub struct Dictionary {
55 chars: Vec<char>,
56 char_to_index: HashMap<char, usize>,
57 lookup_table: Option<Box<[Option<usize>; 256]>>,
59 mode: EncodingMode,
60 padding: Option<char>,
61 start_codepoint: Option<u32>,
62}
63
64impl Dictionary {
65 pub fn builder() -> DictionaryBuilder {
78 DictionaryBuilder::new()
79 }
80
81 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
95 #[allow(deprecated)]
96 pub fn new(chars: Vec<char>) -> Result<Self, String> {
97 Self::new_with_mode(chars, EncodingMode::Radix, None)
98 }
99
100 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
118 #[allow(deprecated)]
119 pub fn new_with_mode(
120 chars: Vec<char>,
121 mode: EncodingMode,
122 padding: Option<char>,
123 ) -> Result<Self, String> {
124 Self::new_with_mode_and_range(chars, mode, padding, None)
125 }
126
127 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
144 pub fn new_with_mode_and_range(
145 chars: Vec<char>,
146 mode: EncodingMode,
147 padding: Option<char>,
148 start_codepoint: Option<u32>,
149 ) -> Result<Self, String> {
150 if mode == EncodingMode::ByteRange {
152 if let Some(start) = start_codepoint {
153 if !is_safe_byte_range(start) {
158 return Err(format!(
159 "Unsafe ByteRange start_codepoint U+{:04X}: mapped range U+{:04X}..U+{:04X} \
160 overlaps with dangerous codepoints (NUL U+0000, C1 controls U+0080-U+009F, \
161 or surrogates U+D800-U+DFFF)",
162 start,
163 start,
164 start + 255
165 ));
166 }
167
168 return Ok(Dictionary {
169 chars: Vec::new(),
170 char_to_index: HashMap::new(),
171 lookup_table: None,
172 mode,
173 padding,
174 start_codepoint: Some(start),
175 });
176 } else {
177 return Err("ByteRange mode requires start_codepoint".to_string());
178 }
179 }
180
181 if chars.is_empty() {
182 return Err("Dictionary cannot be empty".to_string());
183 }
184
185 if mode == EncodingMode::Chunked {
187 let base = chars.len();
188 if !base.is_power_of_two() {
189 return Err(format!(
190 "Chunked mode requires power-of-two dictionary size, got {}",
191 base
192 ));
193 }
194 if base != 2
196 && base != 4
197 && base != 8
198 && base != 16
199 && base != 32
200 && base != 64
201 && base != 128
202 && base != 256
203 {
204 return Err(format!(
205 "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
206 base
207 ));
208 }
209 }
210
211 let mut char_to_index = HashMap::new();
213 for (i, &c) in chars.iter().enumerate() {
214 if char_to_index.insert(c, i).is_some() {
216 return Err(format!(
217 "Duplicate character in dictionary: '{}' (U+{:04X})",
218 c, c as u32
219 ));
220 }
221
222 if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
224 return Err(format!(
225 "Control character not allowed in dictionary: U+{:04X}",
226 c as u32
227 ));
228 }
229
230 if c.is_whitespace() && c != ' ' {
232 return Err(format!(
233 "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
234 c, c as u32
235 ));
236 }
237 }
238
239 if let Some(pad) = padding {
241 if char_to_index.contains_key(&pad) {
242 return Err(format!(
243 "Padding character '{}' conflicts with dictionary characters",
244 pad
245 ));
246 }
247 if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
248 return Err(format!(
249 "Control character not allowed as padding: U+{:04X}",
250 pad as u32
251 ));
252 }
253 }
254
255 let lookup_table = if chars
257 .iter()
258 .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
259 {
260 let mut table = Box::new([None; 256]);
261 for (i, &c) in chars.iter().enumerate() {
262 table[c as usize] = Some(i);
263 }
264 Some(table)
265 } else {
266 None
267 };
268
269 Ok(Dictionary {
270 chars,
271 char_to_index,
272 lookup_table,
273 mode,
274 padding,
275 start_codepoint: None,
276 })
277 }
278
279 #[deprecated(
289 since = "0.1.0",
290 note = "Use Dictionary::builder().chars_from_str(s).build() instead"
291 )]
292 #[allow(deprecated, clippy::should_implement_trait)]
293 pub fn from_str(s: &str) -> Result<Self, String> {
294 let chars: Vec<char> = s.chars().collect();
295 Self::new(chars)
296 }
297
298 pub fn base(&self) -> usize {
302 match self.mode {
303 EncodingMode::ByteRange => 256,
304 _ => self.chars.len(),
305 }
306 }
307
308 pub fn mode(&self) -> &EncodingMode {
310 &self.mode
311 }
312
313 pub fn padding(&self) -> Option<char> {
315 self.padding
316 }
317
318 pub fn start_codepoint(&self) -> Option<u32> {
320 self.start_codepoint
321 }
322
323 pub fn encode_digit(&self, digit: usize) -> Option<char> {
327 match self.mode {
328 EncodingMode::ByteRange => {
329 if let Some(start) = self.start_codepoint
330 && digit < 256
331 {
332 return std::char::from_u32(start + digit as u32);
333 }
334 None
335 }
336 _ => self.chars.get(digit).copied(),
337 }
338 }
339
340 pub fn decode_char(&self, c: char) -> Option<usize> {
344 match self.mode {
345 EncodingMode::ByteRange => {
346 if let Some(start) = self.start_codepoint {
347 let codepoint = c as u32;
348 if codepoint >= start && codepoint < start + 256 {
349 return Some((codepoint - start) as usize);
350 }
351 }
352 None
353 }
354 _ => {
355 if let Some(ref table) = self.lookup_table {
357 let char_val = c as u32;
358 if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
359 return table[char_val as usize];
360 }
361 }
362 self.char_to_index.get(&c).copied()
364 }
365 }
366 }
367
368 #[cfg(feature = "simd")]
373 pub fn simd_metadata(&self) -> DictionaryMetadata {
374 DictionaryMetadata::from_dictionary(self)
375 }
376
377 #[cfg(feature = "simd")]
382 pub fn simd_available(&self) -> bool {
383 self.simd_metadata().simd_available()
384 }
385
386 #[cfg(not(feature = "simd"))]
390 pub fn simd_available(&self) -> bool {
391 false
392 }
393}
394
395#[derive(Debug, Default)]
408pub struct DictionaryBuilder {
409 chars: Option<Vec<char>>,
410 mode: Option<EncodingMode>,
411 padding: Option<char>,
412 start_codepoint: Option<u32>,
413}
414
415impl DictionaryBuilder {
416 pub fn new() -> Self {
418 Self {
419 chars: None,
420 mode: None,
421 padding: None,
422 start_codepoint: None,
423 }
424 }
425
426 pub fn chars(mut self, chars: Vec<char>) -> Self {
432 self.chars = Some(chars);
433 self
434 }
435
436 pub fn chars_from_str(mut self, s: &str) -> Self {
442 self.chars = Some(s.chars().collect());
443 self
444 }
445
446 pub fn mode(mut self, mode: EncodingMode) -> Self {
452 self.mode = Some(mode);
453 self
454 }
455
456 pub fn padding(mut self, padding: char) -> Self {
462 self.padding = Some(padding);
463 self
464 }
465
466 pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
472 self.start_codepoint = Some(start_codepoint);
473 self
474 }
475
476 #[allow(deprecated)]
485 pub fn build(self) -> Result<Dictionary, String> {
486 let mode = self.mode.unwrap_or(EncodingMode::Radix);
487 let chars = self.chars.unwrap_or_default();
488
489 Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
490 }
491}
492
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds `builder`, requires the build to fail, and returns the message.
    fn build_error(builder: DictionaryBuilder) -> String {
        match builder.build() {
            Ok(_) => panic!("expected the dictionary build to fail"),
            Err(message) => message,
        }
    }

    /// Requires `builder` to fail with a message containing `needle`.
    fn assert_build_fails_with(builder: DictionaryBuilder, needle: &str) {
        let message = build_error(builder);
        assert!(
            message.contains(needle),
            "error {:?} should mention {:?}",
            message,
            needle
        );
    }

    /// A ByteRange-mode builder whose window begins at `start`.
    fn byte_range_at(start: u32) -> DictionaryBuilder {
        Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(start)
    }

    // ---- validation of explicit character lists ----

    #[test]
    fn test_duplicate_character_detection() {
        let alphabet = vec!['a', 'b', 'c', 'a'];
        assert_build_fails_with(Dictionary::builder().chars(alphabet), "Duplicate character");
    }

    #[test]
    fn test_empty_dictionary() {
        let alphabet: Vec<char> = Vec::new();
        assert_build_fails_with(Dictionary::builder().chars(alphabet), "cannot be empty");
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // Three characters: not a power of two.
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked);
        assert_build_fails_with(builder, "power-of-two");
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        for size in [2u32, 4, 8, 16, 32, 64] {
            // Blocks of 26 letters, each block shifted by 100 codepoints so
            // that every generated character is distinct.
            let alphabet: Vec<char> = (0..size)
                .map(|i| {
                    let codepoint = 'A' as u32 + (i % 26) + ((i / 26) * 100);
                    char::from_u32(codepoint).unwrap()
                })
                .collect();
            let outcome = Dictionary::builder()
                .chars(alphabet)
                .mode(EncodingMode::Chunked)
                .build();
            assert!(outcome.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_control_character_rejection() {
        // NUL is a control character.
        let alphabet = vec!['a', 'b', '\0', 'c'];
        assert_build_fails_with(Dictionary::builder().chars(alphabet), "Control character");
    }

    #[test]
    fn test_whitespace_rejection() {
        // A tab must be refused as whitespace ...
        let with_tab = vec!['a', 'b', '\t', 'c'];
        assert_build_fails_with(Dictionary::builder().chars(with_tab), "Whitespace");

        // ... while a plain space is accepted.
        let with_space = vec!['a', 'b', ' ', 'c'];
        let outcome = Dictionary::builder().chars(with_space).build();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('b');
        let message = build_error(builder);
        assert!(message.contains("Padding character"));
        assert!(message.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('=')
            .build();
        assert!(outcome.is_ok());
    }

    // ---- ByteRange construction ----

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 lies beyond U+10FFFF.
        let outcome = byte_range_at(0x10FF80).build();
        assert!(outcome.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        let outcome = byte_range_at(0x1F300).build();
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let builder = Dictionary::builder().mode(EncodingMode::ByteRange);
        assert_build_fails_with(builder, "requires start_codepoint");
    }

    #[test]
    fn test_detailed_error_messages() {
        let message = build_error(Dictionary::builder().chars(vec!['a', 'b', 'a']));
        let names_character = message.contains("'a'");
        let names_codepoint = message.contains("U+");
        assert!(names_character || names_codepoint);
    }

    // ---- builder behaviour ----

    #[test]
    fn test_builder_basic() {
        let built = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(built.base(), 4);
        assert_eq!(*built.mode(), EncodingMode::Radix);
        assert_eq!(built.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let built = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(built.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let built = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(*built.mode(), EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let built = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(built.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let built = byte_range_at(0x1F300).build().unwrap();

        assert_eq!(*built.mode(), EncodingMode::ByteRange);
        assert_eq!(built.start_codepoint(), Some(0x1F300));
        assert_eq!(built.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let builder = Dictionary::builder().mode(EncodingMode::ByteRange);
        assert_build_fails_with(builder, "requires start_codepoint");
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let builder = Dictionary::builder().chars(vec!['a', 'b', 'a']);
        assert_build_fails_with(builder, "Duplicate character");
    }

    #[test]
    fn test_builder_chunked_validation() {
        // Three characters: not a power of two.
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked);
        assert_build_fails_with(builder, "power-of-two");
    }

    #[test]
    fn test_builder_padding_conflict() {
        let builder = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b');
        assert_build_fails_with(builder, "Padding character");
    }

    #[test]
    fn test_builder_full_config() {
        let built = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(built.base(), 2);
        assert_eq!(*built.mode(), EncodingMode::Chunked);
        assert_eq!(built.padding(), Some('='));
    }

    // ---- is_safe_byte_range boundaries ----

    #[test]
    fn test_is_safe_byte_range_nul() {
        let safe = is_safe_byte_range(0);
        assert!(!safe);
    }

    #[test]
    fn test_is_safe_byte_range_end_of_c1() {
        let safe = is_safe_byte_range(0x009F);
        assert!(!safe);
    }

    #[test]
    fn test_is_safe_byte_range_first_safe() {
        let safe = is_safe_byte_range(0x00A0);
        assert!(safe);
    }

    #[test]
    fn test_is_safe_byte_range_just_below_surrogates() {
        // Window ends at U+D7FF, one short of the surrogate block.
        let safe = is_safe_byte_range(0xD700);
        assert!(safe);
    }

    #[test]
    fn test_is_safe_byte_range_overlaps_surrogate_start() {
        // Window ends exactly on U+D800.
        let safe = is_safe_byte_range(0xD701);
        assert!(!safe);
    }

    #[test]
    fn test_is_safe_byte_range_above_surrogates() {
        let safe = is_safe_byte_range(0xE000);
        assert!(safe);
    }

    #[test]
    fn test_is_safe_byte_range_at_unicode_max() {
        // Window ends exactly on U+10FFFF.
        let safe = is_safe_byte_range(0x10FF00);
        assert!(safe);
    }

    #[test]
    fn test_is_safe_byte_range_exceeds_unicode_max() {
        let safe = is_safe_byte_range(0x10FF01);
        assert!(!safe);
    }
}