1use crate::core::config::EncodingMode;
2#[cfg(feature = "simd")]
3use crate::simd::variants::DictionaryMetadata;
4use std::collections::HashMap;
5
/// Exclusive upper bound on the code points served by the array-based
/// fast path: construction only builds the lookup table when every
/// dictionary character is below this value, and decoding only consults
/// the table for characters below it.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
7
/// A validated character set that maps digit values to characters and back.
///
/// Instances are created through [`Dictionary::builder`] (or the deprecated
/// constructors), which reject empty, duplicated, control and whitespace
/// characters before the value is built.
#[derive(Debug, Clone)]
pub struct Dictionary {
    /// Dictionary characters in digit order: the character at index `i`
    /// encodes digit `i`. Left empty for `ByteRange` dictionaries.
    chars: Vec<char>,
    /// Reverse mapping from character to its index in `chars`.
    char_to_index: HashMap<char, usize>,
    /// Decode table indexed by code point; only present when every
    /// dictionary character has a code point below 256.
    lookup_table: Option<Box<[Option<usize>; 256]>>,
    /// Encoding mode this dictionary was built for.
    mode: EncodingMode,
    /// Optional padding character; for explicit character lists it is
    /// checked not to collide with any dictionary character.
    padding: Option<char>,
    /// First code point of the 256-character window used in `ByteRange`
    /// mode; `None` for every other mode.
    start_codepoint: Option<u32>,
}
23
24impl Dictionary {
25 pub fn builder() -> DictionaryBuilder {
38 DictionaryBuilder::new()
39 }
40
41 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
55 #[allow(deprecated)]
56 pub fn new(chars: Vec<char>) -> Result<Self, String> {
57 Self::new_with_mode(chars, EncodingMode::Radix, None)
58 }
59
60 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
78 #[allow(deprecated)]
79 pub fn new_with_mode(
80 chars: Vec<char>,
81 mode: EncodingMode,
82 padding: Option<char>,
83 ) -> Result<Self, String> {
84 Self::new_with_mode_and_range(chars, mode, padding, None)
85 }
86
87 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
104 pub fn new_with_mode_and_range(
105 chars: Vec<char>,
106 mode: EncodingMode,
107 padding: Option<char>,
108 start_codepoint: Option<u32>,
109 ) -> Result<Self, String> {
110 if mode == EncodingMode::ByteRange {
112 if let Some(start) = start_codepoint {
113 if let Some(end_codepoint) = start.checked_add(255) {
115 if std::char::from_u32(end_codepoint).is_none() {
116 return Err(format!(
117 "Invalid Unicode range: {}-{}",
118 start, end_codepoint
119 ));
120 }
121 for offset in 0..=255 {
123 if std::char::from_u32(start + offset).is_none() {
124 return Err(format!(
125 "Invalid Unicode codepoint in range: {}",
126 start + offset
127 ));
128 }
129 }
130 } else {
131 return Err("Start codepoint too high for 256-byte range".to_string());
132 }
133
134 return Ok(Dictionary {
135 chars: Vec::new(),
136 char_to_index: HashMap::new(),
137 lookup_table: None,
138 mode,
139 padding,
140 start_codepoint: Some(start),
141 });
142 } else {
143 return Err("ByteRange mode requires start_codepoint".to_string());
144 }
145 }
146
147 if chars.is_empty() {
148 return Err("Dictionary cannot be empty".to_string());
149 }
150
151 if mode == EncodingMode::Chunked {
153 let base = chars.len();
154 if !base.is_power_of_two() {
155 return Err(format!(
156 "Chunked mode requires power-of-two dictionary size, got {}",
157 base
158 ));
159 }
160 if base != 2
162 && base != 4
163 && base != 8
164 && base != 16
165 && base != 32
166 && base != 64
167 && base != 128
168 && base != 256
169 {
170 return Err(format!(
171 "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
172 base
173 ));
174 }
175 }
176
177 let mut char_to_index = HashMap::new();
179 for (i, &c) in chars.iter().enumerate() {
180 if char_to_index.insert(c, i).is_some() {
182 return Err(format!(
183 "Duplicate character in dictionary: '{}' (U+{:04X})",
184 c, c as u32
185 ));
186 }
187
188 if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
190 return Err(format!(
191 "Control character not allowed in dictionary: U+{:04X}",
192 c as u32
193 ));
194 }
195
196 if c.is_whitespace() && c != ' ' {
198 return Err(format!(
199 "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
200 c, c as u32
201 ));
202 }
203 }
204
205 if let Some(pad) = padding {
207 if char_to_index.contains_key(&pad) {
208 return Err(format!(
209 "Padding character '{}' conflicts with dictionary characters",
210 pad
211 ));
212 }
213 if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
214 return Err(format!(
215 "Control character not allowed as padding: U+{:04X}",
216 pad as u32
217 ));
218 }
219 }
220
221 let lookup_table = if chars
223 .iter()
224 .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
225 {
226 let mut table = Box::new([None; 256]);
227 for (i, &c) in chars.iter().enumerate() {
228 table[c as usize] = Some(i);
229 }
230 Some(table)
231 } else {
232 None
233 };
234
235 Ok(Dictionary {
236 chars,
237 char_to_index,
238 lookup_table,
239 mode,
240 padding,
241 start_codepoint: None,
242 })
243 }
244
245 #[deprecated(
255 since = "0.1.0",
256 note = "Use Dictionary::builder().chars_from_str(s).build() instead"
257 )]
258 #[allow(deprecated, clippy::should_implement_trait)]
259 pub fn from_str(s: &str) -> Result<Self, String> {
260 let chars: Vec<char> = s.chars().collect();
261 Self::new(chars)
262 }
263
264 pub fn base(&self) -> usize {
268 match self.mode {
269 EncodingMode::ByteRange => 256,
270 _ => self.chars.len(),
271 }
272 }
273
274 pub fn mode(&self) -> &EncodingMode {
276 &self.mode
277 }
278
279 pub fn padding(&self) -> Option<char> {
281 self.padding
282 }
283
284 pub fn start_codepoint(&self) -> Option<u32> {
286 self.start_codepoint
287 }
288
289 pub fn encode_digit(&self, digit: usize) -> Option<char> {
293 match self.mode {
294 EncodingMode::ByteRange => {
295 if let Some(start) = self.start_codepoint
296 && digit < 256
297 {
298 return std::char::from_u32(start + digit as u32);
299 }
300 None
301 }
302 _ => self.chars.get(digit).copied(),
303 }
304 }
305
306 pub fn decode_char(&self, c: char) -> Option<usize> {
310 match self.mode {
311 EncodingMode::ByteRange => {
312 if let Some(start) = self.start_codepoint {
313 let codepoint = c as u32;
314 if codepoint >= start && codepoint < start + 256 {
315 return Some((codepoint - start) as usize);
316 }
317 }
318 None
319 }
320 _ => {
321 if let Some(ref table) = self.lookup_table {
323 let char_val = c as u32;
324 if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
325 return table[char_val as usize];
326 }
327 }
328 self.char_to_index.get(&c).copied()
330 }
331 }
332 }
333
334 #[cfg(feature = "simd")]
339 pub fn simd_metadata(&self) -> DictionaryMetadata {
340 DictionaryMetadata::from_dictionary(self)
341 }
342
343 #[cfg(feature = "simd")]
348 pub fn simd_available(&self) -> bool {
349 self.simd_metadata().simd_available()
350 }
351
352 #[cfg(not(feature = "simd"))]
356 pub fn simd_available(&self) -> bool {
357 false
358 }
359}
360
/// Step-by-step configuration for a [`Dictionary`].
///
/// Every option starts unset; `build` substitutes `EncodingMode::Radix` for a
/// missing mode and an empty character list for missing characters before
/// running validation.
#[derive(Debug, Default)]
pub struct DictionaryBuilder {
    /// Dictionary characters in digit order, if provided.
    chars: Option<Vec<char>>,
    /// Requested encoding mode, if provided.
    mode: Option<EncodingMode>,
    /// Padding character, if provided.
    padding: Option<char>,
    /// First code point for `ByteRange` mode, if provided.
    start_codepoint: Option<u32>,
}
380
381impl DictionaryBuilder {
382 pub fn new() -> Self {
384 Self {
385 chars: None,
386 mode: None,
387 padding: None,
388 start_codepoint: None,
389 }
390 }
391
392 pub fn chars(mut self, chars: Vec<char>) -> Self {
398 self.chars = Some(chars);
399 self
400 }
401
402 pub fn chars_from_str(mut self, s: &str) -> Self {
408 self.chars = Some(s.chars().collect());
409 self
410 }
411
412 pub fn mode(mut self, mode: EncodingMode) -> Self {
418 self.mode = Some(mode);
419 self
420 }
421
422 pub fn padding(mut self, padding: char) -> Self {
428 self.padding = Some(padding);
429 self
430 }
431
432 pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
438 self.start_codepoint = Some(start_codepoint);
439 self
440 }
441
442 #[allow(deprecated)]
451 pub fn build(self) -> Result<Dictionary, String> {
452 let mode = self.mode.unwrap_or(EncodingMode::Radix);
453 let chars = self.chars.unwrap_or_default();
454
455 Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
456 }
457}
458
#[cfg(test)]
mod tests {
    use super::*;

    /// A repeated character is rejected with a "Duplicate character" error.
    #[test]
    fn test_duplicate_character_detection() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'a'])
            .build();
        let message = outcome.expect_err("repeated 'a' must be rejected");
        assert!(message.contains("Duplicate character"));
    }

    /// An empty character list is rejected.
    #[test]
    fn test_empty_dictionary() {
        let outcome = Dictionary::builder().chars(Vec::new()).build();
        let message = outcome.expect_err("empty dictionary must be rejected");
        assert!(message.contains("cannot be empty"));
    }

    /// Chunked mode refuses a three-character dictionary.
    #[test]
    fn test_chunked_mode_power_of_two() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked)
            .build();
        let message = outcome.expect_err("size 3 is not a power of two");
        assert!(message.contains("power-of-two"));
    }

    /// Chunked mode accepts dictionaries of 2 through 64 characters.
    #[test]
    fn test_chunked_mode_valid_sizes() {
        for size in [2u32, 4, 8, 16, 32, 64] {
            // Each run of 26 letters is shifted by a further 100 code points
            // so that all generated characters are distinct.
            let alphabet: Vec<char> = (0..size)
                .map(|i| {
                    let codepoint = 'A' as u32 + (i % 26) + ((i / 26) * 100);
                    char::from_u32(codepoint).unwrap()
                })
                .collect();

            let outcome = Dictionary::builder()
                .chars(alphabet)
                .mode(EncodingMode::Chunked)
                .build();
            assert!(outcome.is_ok(), "Size {} should be valid", size);
        }
    }

    /// A NUL character inside the dictionary is rejected.
    #[test]
    fn test_control_character_rejection() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', '\x00', 'c'])
            .build();
        let message = outcome.expect_err("NUL must be rejected");
        assert!(message.contains("Control character"));
    }

    /// A tab is rejected as whitespace, while a plain space is accepted.
    #[test]
    fn test_whitespace_rejection() {
        let with_tab = Dictionary::builder()
            .chars(vec!['a', 'b', '\t', 'c'])
            .build();
        let message = with_tab.expect_err("tab must be rejected");
        assert!(message.contains("Whitespace"));

        let with_space = Dictionary::builder()
            .chars(vec!['a', 'b', ' ', 'c'])
            .build();
        assert!(with_space.is_ok());
    }

    /// Padding that is also a dictionary character is rejected.
    #[test]
    fn test_padding_conflict_with_dictionary() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('b')
            .build();
        let message = outcome.expect_err("padding 'b' collides with the dictionary");
        assert!(message.contains("Padding character"));
        assert!(message.contains("conflicts"));
    }

    /// Padding outside the dictionary is accepted.
    #[test]
    fn test_valid_padding() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c', 'd'])
            .mode(EncodingMode::Radix)
            .padding('=')
            .build();
        assert!(outcome.is_ok());
    }

    /// A byte range whose last code point passes U+10FFFF is rejected.
    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 = 0x11007F, which is above the maximum code point.
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x10FF80)
            .build();
        assert!(outcome.is_err());
    }

    /// A byte range starting at U+1F300 is accepted.
    #[test]
    fn test_byte_range_valid_start() {
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build();
        assert!(outcome.is_ok());
    }

    /// ByteRange mode without a start code point is rejected.
    #[test]
    fn test_byte_range_no_start_codepoint() {
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .build();
        let message = outcome.expect_err("start code point is mandatory");
        assert!(message.contains("requires start_codepoint"));
    }

    /// The duplicate error names the offending character.
    #[test]
    fn test_detailed_error_messages() {
        let message = Dictionary::builder()
            .chars(vec!['a', 'b', 'a'])
            .build()
            .unwrap_err();
        let names_char = message.contains("'a'");
        let names_codepoint = message.contains("U+");
        assert!(names_char || names_codepoint);
    }

    /// Defaults: Radix mode and no padding.
    #[test]
    fn test_builder_basic() {
        let built = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .expect("four distinct digits form a valid dictionary");

        assert_eq!(built.base(), 4);
        assert_eq!(built.mode(), &EncodingMode::Radix);
        assert_eq!(built.padding(), None);
    }

    /// `chars_from_str` uses every character of the string.
    #[test]
    fn test_builder_from_str() {
        let built = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .expect("hex alphabet is a valid dictionary");

        assert_eq!(built.base(), 16);
    }

    /// The configured mode is reported back by the dictionary.
    #[test]
    fn test_builder_with_mode() {
        let built = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .expect("two characters are valid for chunked mode");

        assert_eq!(built.mode(), &EncodingMode::Chunked);
    }

    /// The configured padding is reported back by the dictionary.
    #[test]
    fn test_builder_with_padding() {
        let built = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .expect("'=' does not collide with ABCD");

        assert_eq!(built.padding(), Some('='));
    }

    /// A byte-range dictionary reports its start code point and base 256.
    #[test]
    fn test_builder_byte_range() {
        let built = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .expect("0x1F300 starts a valid 256-code-point window");

        assert_eq!(built.mode(), &EncodingMode::ByteRange);
        assert_eq!(built.start_codepoint(), Some(0x1F300));
        assert_eq!(built.base(), 256);
    }

    /// The builder surfaces the missing-start error for ByteRange mode.
    #[test]
    fn test_builder_byte_range_missing_start() {
        let outcome = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .build();

        let message = outcome.expect_err("start code point is mandatory");
        assert!(message.contains("requires start_codepoint"));
    }

    /// The builder surfaces the duplicate-character error.
    #[test]
    fn test_builder_validation_duplicates() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'a'])
            .build();

        let message = outcome.expect_err("repeated 'a' must be rejected");
        assert!(message.contains("Duplicate character"));
    }

    /// The builder surfaces the chunked-size error.
    #[test]
    fn test_builder_chunked_validation() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked)
            .build();

        let message = outcome.expect_err("size 3 is not a power of two");
        assert!(message.contains("power-of-two"));
    }

    /// The builder surfaces the padding-conflict error.
    #[test]
    fn test_builder_padding_conflict() {
        let outcome = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b')
            .build();

        let message = outcome.expect_err("padding 'b' collides with the dictionary");
        assert!(message.contains("Padding character"));
    }

    /// Characters, mode and padding can all be configured together.
    #[test]
    fn test_builder_full_config() {
        let built = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .expect("binary alphabet with '=' padding is valid");

        assert_eq!(built.base(), 2);
        assert_eq!(built.mode(), &EncodingMode::Chunked);
        assert_eq!(built.padding(), Some('='));
    }
}