1use crate::core::config::EncodingMode;
2use crate::simd::variants::DictionaryMetadata;
3use std::collections::HashMap;
4
/// Upper bound (exclusive) on the code points served by the array-based
/// decode table: only characters whose code point is below this value are
/// looked up by direct indexing.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
6
7#[derive(Debug, Clone)]
13pub struct Dictionary {
14 chars: Vec<char>,
15 char_to_index: HashMap<char, usize>,
16 lookup_table: Option<Box<[Option<usize>; 256]>>,
18 mode: EncodingMode,
19 padding: Option<char>,
20 start_codepoint: Option<u32>,
21}
22
23impl Dictionary {
24 pub fn builder() -> DictionaryBuilder {
37 DictionaryBuilder::new()
38 }
39
40 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
54 pub fn new(chars: Vec<char>) -> Result<Self, String> {
55 Self::new_with_mode(chars, EncodingMode::BaseConversion, None)
56 }
57
58 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
76 pub fn new_with_mode(
77 chars: Vec<char>,
78 mode: EncodingMode,
79 padding: Option<char>,
80 ) -> Result<Self, String> {
81 Self::new_with_mode_and_range(chars, mode, padding, None)
82 }
83
84 #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
101 pub fn new_with_mode_and_range(
102 chars: Vec<char>,
103 mode: EncodingMode,
104 padding: Option<char>,
105 start_codepoint: Option<u32>,
106 ) -> Result<Self, String> {
107 if mode == EncodingMode::ByteRange {
109 if let Some(start) = start_codepoint {
110 if let Some(end_codepoint) = start.checked_add(255) {
112 if std::char::from_u32(end_codepoint).is_none() {
113 return Err(format!(
114 "Invalid Unicode range: {}-{}",
115 start, end_codepoint
116 ));
117 }
118 for offset in 0..=255 {
120 if std::char::from_u32(start + offset).is_none() {
121 return Err(format!(
122 "Invalid Unicode codepoint in range: {}",
123 start + offset
124 ));
125 }
126 }
127 } else {
128 return Err("Start codepoint too high for 256-byte range".to_string());
129 }
130
131 return Ok(Dictionary {
132 chars: Vec::new(),
133 char_to_index: HashMap::new(),
134 lookup_table: None,
135 mode,
136 padding,
137 start_codepoint: Some(start),
138 });
139 } else {
140 return Err("ByteRange mode requires start_codepoint".to_string());
141 }
142 }
143
144 if chars.is_empty() {
145 return Err("Dictionary cannot be empty".to_string());
146 }
147
148 if mode == EncodingMode::Chunked {
150 let base = chars.len();
151 if !base.is_power_of_two() {
152 return Err(format!(
153 "Chunked mode requires power-of-two dictionary size, got {}",
154 base
155 ));
156 }
157 if base != 2
159 && base != 4
160 && base != 8
161 && base != 16
162 && base != 32
163 && base != 64
164 && base != 128
165 && base != 256
166 {
167 return Err(format!("Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}", base));
168 }
169 }
170
171 let mut char_to_index = HashMap::new();
173 for (i, &c) in chars.iter().enumerate() {
174 if char_to_index.insert(c, i).is_some() {
176 return Err(format!(
177 "Duplicate character in dictionary: '{}' (U+{:04X})",
178 c, c as u32
179 ));
180 }
181
182 if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
184 return Err(format!(
185 "Control character not allowed in dictionary: U+{:04X}",
186 c as u32
187 ));
188 }
189
190 if c.is_whitespace() {
192 return Err(format!(
193 "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
194 c, c as u32
195 ));
196 }
197 }
198
199 if let Some(pad) = padding {
201 if char_to_index.contains_key(&pad) {
202 return Err(format!(
203 "Padding character '{}' conflicts with dictionary characters",
204 pad
205 ));
206 }
207 if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
208 return Err(format!(
209 "Control character not allowed as padding: U+{:04X}",
210 pad as u32
211 ));
212 }
213 }
214
215 let lookup_table = if chars
217 .iter()
218 .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
219 {
220 let mut table = Box::new([None; 256]);
221 for (i, &c) in chars.iter().enumerate() {
222 table[c as usize] = Some(i);
223 }
224 Some(table)
225 } else {
226 None
227 };
228
229 Ok(Dictionary {
230 chars,
231 char_to_index,
232 lookup_table,
233 mode,
234 padding,
235 start_codepoint: None,
236 })
237 }
238
239 #[deprecated(
249 since = "0.1.0",
250 note = "Use Dictionary::builder().chars_from_str(s).build() instead"
251 )]
252 pub fn from_str(s: &str) -> Result<Self, String> {
253 let chars: Vec<char> = s.chars().collect();
254 Self::new(chars)
255 }
256
257 pub fn base(&self) -> usize {
261 match self.mode {
262 EncodingMode::ByteRange => 256,
263 _ => self.chars.len(),
264 }
265 }
266
267 pub fn mode(&self) -> &EncodingMode {
269 &self.mode
270 }
271
272 pub fn padding(&self) -> Option<char> {
274 self.padding
275 }
276
277 pub fn start_codepoint(&self) -> Option<u32> {
279 self.start_codepoint
280 }
281
282 pub fn encode_digit(&self, digit: usize) -> Option<char> {
286 match self.mode {
287 EncodingMode::ByteRange => {
288 if let Some(start) = self.start_codepoint {
289 if digit < 256 {
290 return std::char::from_u32(start + digit as u32);
291 }
292 }
293 None
294 }
295 _ => self.chars.get(digit).copied(),
296 }
297 }
298
299 pub fn decode_char(&self, c: char) -> Option<usize> {
303 match self.mode {
304 EncodingMode::ByteRange => {
305 if let Some(start) = self.start_codepoint {
306 let codepoint = c as u32;
307 if codepoint >= start && codepoint < start + 256 {
308 return Some((codepoint - start) as usize);
309 }
310 }
311 None
312 }
313 _ => {
314 if let Some(ref table) = self.lookup_table {
316 let char_val = c as u32;
317 if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
318 return table[char_val as usize];
319 }
320 }
321 self.char_to_index.get(&c).copied()
323 }
324 }
325 }
326
327 pub fn simd_metadata(&self) -> DictionaryMetadata {
332 DictionaryMetadata::from_dictionary(self)
333 }
334
335 pub fn simd_available(&self) -> bool {
340 self.simd_metadata().simd_available()
341 }
342}
343
344#[derive(Debug, Default)]
357pub struct DictionaryBuilder {
358 chars: Option<Vec<char>>,
359 mode: Option<EncodingMode>,
360 padding: Option<char>,
361 start_codepoint: Option<u32>,
362}
363
364impl DictionaryBuilder {
365 pub fn new() -> Self {
367 Self {
368 chars: None,
369 mode: None,
370 padding: None,
371 start_codepoint: None,
372 }
373 }
374
375 pub fn chars(mut self, chars: Vec<char>) -> Self {
381 self.chars = Some(chars);
382 self
383 }
384
385 pub fn chars_from_str(mut self, s: &str) -> Self {
391 self.chars = Some(s.chars().collect());
392 self
393 }
394
395 pub fn mode(mut self, mode: EncodingMode) -> Self {
401 self.mode = Some(mode);
402 self
403 }
404
405 pub fn padding(mut self, padding: char) -> Self {
411 self.padding = Some(padding);
412 self
413 }
414
415 pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
421 self.start_codepoint = Some(start_codepoint);
422 self
423 }
424
425 pub fn build(self) -> Result<Dictionary, String> {
434 let mode = self.mode.unwrap_or(EncodingMode::BaseConversion);
435 let chars = self.chars.unwrap_or_default();
436
437 #[allow(deprecated)]
438 Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
439 }
440}
441
#[cfg(test)]
// These tests deliberately drive the deprecated constructors directly, so the
// deprecation lint is silenced for this module only.
#[allow(deprecated)]
mod tests {
    use super::*;

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let chars = vec![];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        // Three symbols: not a power of two.
        let chars = vec!['a', 'b', 'c'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::Chunked, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Covers every size the Chunked validation accepts.
        for &size in &[2, 4, 8, 16, 32, 64, 128, 256] {
            // Distinct symbols: runs of 26 code points starting at 'A',
            // with each run offset by a further 100.
            let chars: Vec<char> = (0..size)
                .map(|i| char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap())
                .collect();
            let result = Dictionary::new_with_mode(chars, EncodingMode::Chunked, None);
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        let chars = vec!['a', 'b', ' ', 'c'];
        let result = Dictionary::new(chars);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('b'));
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::new_with_mode(chars, EncodingMode::BaseConversion, Some('='));
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // 0x10FF80 + 255 is past U+10FFFF, the largest valid code point.
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x10FF80),
        );
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Dictionary::new_with_mode_and_range(
            Vec::new(),
            EncodingMode::ByteRange,
            None,
            Some(0x1F300),
        );
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result =
            Dictionary::new_with_mode_and_range(Vec::new(), EncodingMode::ByteRange, None, None);
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        let chars = vec!['a', 'b', 'a'];
        let err = Dictionary::new(chars).unwrap_err();
        assert!(err.contains("'a'") || err.contains("U+"));
    }

    #[test]
    fn test_builder_basic() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(dict.base(), 4);
        assert_eq!(dict.mode(), &EncodingMode::BaseConversion);
        assert_eq!(dict.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(dict.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let dict = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::ByteRange);
        assert_eq!(dict.start_codepoint(), Some(0x1F300));
        assert_eq!(dict.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let result = Dictionary::builder().chars(vec!['a', 'b', 'a']).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        // Three symbols: not a power of two.
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .mode(EncodingMode::Chunked)
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b')
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let dict = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.base(), 2);
        assert_eq!(dict.mode(), &EncodingMode::Chunked);
        assert_eq!(dict.padding(), Some('='));
    }
}