Skip to main content

base_d/core/
dictionary.rs

1use crate::core::config::EncodingMode;
2#[cfg(feature = "simd")]
3use crate::simd::variants::DictionaryMetadata;
4use std::collections::HashMap;
5
/// Number of slots in the direct-index decode table; characters whose
/// codepoint is below this value can be decoded without a hash lookup.
const MAX_LOOKUP_TABLE_SIZE: usize = 256;
7
/// Decides whether a ByteRange dictionary beginning at `start` can encode all
/// 256 byte values (0-255) without producing a problematic codepoint.
///
/// The mapped window is `start..=start + 255`. It is rejected when any of the
/// following holds:
/// - `start < 0x00A0` -- the window would begin at or before NUL (U+0000), the
///   C0 controls (U+0001..=U+001F), DEL (U+007F) or the C1 controls
///   (U+0080..=U+009F)
/// - `start + 255` overflows `u32` or is greater than U+10FFFF
/// - the window intersects the surrogate block U+D800..=U+DFFF, which has no
///   valid `char` representation
///
/// Returns `true` only when none of these conditions apply.
pub fn is_safe_byte_range(start: u32) -> bool {
    const FIRST_SAFE: u32 = 0x00A0;
    const SURROGATE_FIRST: u32 = 0xD800;
    const SURROGATE_LAST: u32 = 0xDFFF;
    const UNICODE_MAX: u32 = 0x10FFFF;

    match start.checked_add(255) {
        // `last` is the codepoint that byte 255 maps to.
        Some(last) => {
            let above_controls = start >= FIRST_SAFE;
            let within_unicode = last <= UNICODE_MAX;
            // Disjoint from the surrogates: entirely below or entirely above them.
            let avoids_surrogates = last < SURROGATE_FIRST || start > SURROGATE_LAST;
            above_controls && within_unicode && avoids_surrogates
        }
        // The window does not even fit in a u32.
        None => false,
    }
}
47
48/// Represents an encoding dictionary with its characters and configuration.
49///
50/// An dictionary defines the character set and encoding mode used for converting
51/// binary data to text. Supports three modes: mathematical base conversion,
52/// chunked (RFC 4648), and byte-range mapping.
53#[derive(Debug, Clone)]
54pub struct Dictionary {
55    chars: Vec<char>,
56    char_to_index: HashMap<char, usize>,
57    // Fast lookup table for ASCII/extended ASCII characters
58    lookup_table: Option<Box<[Option<usize>; 256]>>,
59    mode: EncodingMode,
60    padding: Option<char>,
61    start_codepoint: Option<u32>,
62}
63
64impl Dictionary {
65    /// Creates a new DictionaryBuilder for constructing a Dictionary.
66    ///
67    /// # Example
68    ///
69    /// ```
70    /// use base_d::{Dictionary, EncodingMode};
71    /// let dict = Dictionary::builder()
72    ///     .chars_from_str("0123456789ABCDEF")
73    ///     .mode(EncodingMode::Radix)
74    ///     .build()
75    ///     .unwrap();
76    /// ```
77    pub fn builder() -> DictionaryBuilder {
78        DictionaryBuilder::new()
79    }
80
81    /// Creates a new dictionary with default settings (Radix mode, no padding).
82    ///
83    /// # Arguments
84    ///
85    /// * `chars` - Vector of characters to use in the dictionary
86    ///
87    /// # Errors
88    ///
89    /// Returns an error if the dictionary is empty or contains duplicate characters.
90    ///
91    /// # Deprecated
92    ///
93    /// Use `Dictionary::builder()` instead for more flexible configuration.
94    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
95    #[allow(deprecated)]
96    pub fn new(chars: Vec<char>) -> Result<Self, String> {
97        Self::new_with_mode(chars, EncodingMode::Radix, None)
98    }
99
100    /// Creates a new dictionary with specified encoding mode and optional padding.
101    ///
102    /// # Arguments
103    ///
104    /// * `chars` - Vector of characters to use in the dictionary
105    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
106    /// * `padding` - Optional padding character (typically '=' for RFC modes)
107    ///
108    /// # Errors
109    ///
110    /// Returns an error if:
111    /// - The dictionary is empty or contains duplicates
112    /// - Chunked mode is used with a non-power-of-two dictionary size
113    ///
114    /// # Deprecated
115    ///
116    /// Use `Dictionary::builder()` instead for more flexible configuration.
117    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
118    #[allow(deprecated)]
119    pub fn new_with_mode(
120        chars: Vec<char>,
121        mode: EncodingMode,
122        padding: Option<char>,
123    ) -> Result<Self, String> {
124        Self::new_with_mode_and_range(chars, mode, padding, None)
125    }
126
127    /// Creates a new dictionary with full configuration including byte-range support.
128    ///
129    /// # Arguments
130    ///
131    /// * `chars` - Vector of characters (empty for ByteRange mode)
132    /// * `mode` - Encoding mode
133    /// * `padding` - Optional padding character
134    /// * `start_codepoint` - Starting Unicode codepoint for ByteRange mode
135    ///
136    /// # Errors
137    ///
138    /// Returns an error if configuration is invalid for the specified mode.
139    ///
140    /// # Deprecated
141    ///
142    /// Use `Dictionary::builder()` instead for more flexible configuration.
143    #[deprecated(since = "0.1.0", note = "Use Dictionary::builder() instead")]
144    pub fn new_with_mode_and_range(
145        chars: Vec<char>,
146        mode: EncodingMode,
147        padding: Option<char>,
148        start_codepoint: Option<u32>,
149    ) -> Result<Self, String> {
150        // ByteRange mode doesn't need chars, just validates start_codepoint
151        if mode == EncodingMode::ByteRange {
152            if let Some(start) = start_codepoint {
153                // Validate that we can represent all 256 bytes
154                // Validate safety: reject ranges that map bytes to NUL, C1 controls,
155                // or surrogates. These produce encoded strings that break git, terminals,
156                // and other text-processing tools.
157                if !is_safe_byte_range(start) {
158                    return Err(format!(
159                        "Unsafe ByteRange start_codepoint U+{:04X}: mapped range U+{:04X}..U+{:04X} \
160                         overlaps with dangerous codepoints (NUL U+0000, C1 controls U+0080-U+009F, \
161                         or surrogates U+D800-U+DFFF)",
162                        start,
163                        start,
164                        start + 255
165                    ));
166                }
167
168                return Ok(Dictionary {
169                    chars: Vec::new(),
170                    char_to_index: HashMap::new(),
171                    lookup_table: None,
172                    mode,
173                    padding,
174                    start_codepoint: Some(start),
175                });
176            } else {
177                return Err("ByteRange mode requires start_codepoint".to_string());
178            }
179        }
180
181        if chars.is_empty() {
182            return Err("Dictionary cannot be empty".to_string());
183        }
184
185        // Validate dictionary size for chunked mode
186        if mode == EncodingMode::Chunked {
187            let base = chars.len();
188            if !base.is_power_of_two() {
189                return Err(format!(
190                    "Chunked mode requires power-of-two dictionary size, got {}",
191                    base
192                ));
193            }
194            // Additional check: ensure we have valid sizes for chunked mode
195            if base != 2
196                && base != 4
197                && base != 8
198                && base != 16
199                && base != 32
200                && base != 64
201                && base != 128
202                && base != 256
203            {
204                return Err(format!(
205                    "Chunked mode requires dictionary size of 2, 4, 8, 16, 32, 64, 128, or 256, got {}",
206                    base
207                ));
208            }
209        }
210
211        // Validate character properties
212        let mut char_to_index = HashMap::new();
213        for (i, &c) in chars.iter().enumerate() {
214            // Check for duplicate characters
215            if char_to_index.insert(c, i).is_some() {
216                return Err(format!(
217                    "Duplicate character in dictionary: '{}' (U+{:04X})",
218                    c, c as u32
219                ));
220            }
221
222            // Check for invalid Unicode characters
223            if c.is_control() && c != '\t' && c != '\n' && c != '\r' {
224                return Err(format!(
225                    "Control character not allowed in dictionary: U+{:04X}",
226                    c as u32
227                ));
228            }
229
230            // Check for whitespace (allow space for RFC-compliant encodings like Base45)
231            if c.is_whitespace() && c != ' ' {
232                return Err(format!(
233                    "Whitespace character not allowed in dictionary: '{}' (U+{:04X})",
234                    c, c as u32
235                ));
236            }
237        }
238
239        // Validate padding character if present
240        if let Some(pad) = padding {
241            if char_to_index.contains_key(&pad) {
242                return Err(format!(
243                    "Padding character '{}' conflicts with dictionary characters",
244                    pad
245                ));
246            }
247            if pad.is_control() && pad != '\t' && pad != '\n' && pad != '\r' {
248                return Err(format!(
249                    "Control character not allowed as padding: U+{:04X}",
250                    pad as u32
251                ));
252            }
253        }
254
255        // Build fast lookup table for ASCII characters
256        let lookup_table = if chars
257            .iter()
258            .all(|&c| (c as u32) < MAX_LOOKUP_TABLE_SIZE as u32)
259        {
260            let mut table = Box::new([None; 256]);
261            for (i, &c) in chars.iter().enumerate() {
262                table[c as usize] = Some(i);
263            }
264            Some(table)
265        } else {
266            None
267        };
268
269        Ok(Dictionary {
270            chars,
271            char_to_index,
272            lookup_table,
273            mode,
274            padding,
275            start_codepoint: None,
276        })
277    }
278
279    /// Creates an dictionary from a string of characters.
280    ///
281    /// # Arguments
282    ///
283    /// * `s` - String containing the dictionary characters
284    ///
285    /// # Deprecated
286    ///
287    /// Use `Dictionary::builder().chars_from_str(s).build()` instead.
288    #[deprecated(
289        since = "0.1.0",
290        note = "Use Dictionary::builder().chars_from_str(s).build() instead"
291    )]
292    #[allow(deprecated, clippy::should_implement_trait)]
293    pub fn from_str(s: &str) -> Result<Self, String> {
294        let chars: Vec<char> = s.chars().collect();
295        Self::new(chars)
296    }
297
298    /// Returns the base (radix) of the dictionary.
299    ///
300    /// For ByteRange mode, always returns 256. Otherwise returns the number of characters.
301    pub fn base(&self) -> usize {
302        match self.mode {
303            EncodingMode::ByteRange => 256,
304            _ => self.chars.len(),
305        }
306    }
307
308    /// Returns the encoding mode of this dictionary.
309    pub fn mode(&self) -> &EncodingMode {
310        &self.mode
311    }
312
313    /// Returns the padding character, if any.
314    pub fn padding(&self) -> Option<char> {
315        self.padding
316    }
317
318    /// Returns the starting Unicode codepoint for ByteRange mode.
319    pub fn start_codepoint(&self) -> Option<u32> {
320        self.start_codepoint
321    }
322
323    /// Encodes a digit (0 to base-1) as a character.
324    ///
325    /// Returns `None` if the digit is out of range.
326    pub fn encode_digit(&self, digit: usize) -> Option<char> {
327        match self.mode {
328            EncodingMode::ByteRange => {
329                if let Some(start) = self.start_codepoint
330                    && digit < 256
331                {
332                    return std::char::from_u32(start + digit as u32);
333                }
334                None
335            }
336            _ => self.chars.get(digit).copied(),
337        }
338    }
339
340    /// Decodes a character back to its digit value.
341    ///
342    /// Returns `None` if the character is not in the dictionary.
343    pub fn decode_char(&self, c: char) -> Option<usize> {
344        match self.mode {
345            EncodingMode::ByteRange => {
346                if let Some(start) = self.start_codepoint {
347                    let codepoint = c as u32;
348                    if codepoint >= start && codepoint < start + 256 {
349                        return Some((codepoint - start) as usize);
350                    }
351                }
352                None
353            }
354            _ => {
355                // Use fast lookup table for ASCII characters
356                if let Some(ref table) = self.lookup_table {
357                    let char_val = c as u32;
358                    if char_val < MAX_LOOKUP_TABLE_SIZE as u32 {
359                        return table[char_val as usize];
360                    }
361                }
362                // Fall back to HashMap for non-ASCII
363                self.char_to_index.get(&c).copied()
364            }
365        }
366    }
367
368    /// Returns SIMD metadata for this dictionary.
369    ///
370    /// This provides information about whether SIMD acceleration is available
371    /// for this dictionary and which implementation to use.
372    #[cfg(feature = "simd")]
373    pub fn simd_metadata(&self) -> DictionaryMetadata {
374        DictionaryMetadata::from_dictionary(self)
375    }
376
377    /// Returns whether SIMD acceleration is available for this dictionary.
378    ///
379    /// This is a convenience method that checks if SIMD can be used with
380    /// the current CPU features and dictionary configuration.
381    #[cfg(feature = "simd")]
382    pub fn simd_available(&self) -> bool {
383        self.simd_metadata().simd_available()
384    }
385
386    /// Returns whether SIMD acceleration is available for this dictionary.
387    ///
388    /// When the `simd` feature is disabled, this always returns `false`.
389    #[cfg(not(feature = "simd"))]
390    pub fn simd_available(&self) -> bool {
391        false
392    }
393}
394
395/// Builder for constructing a Dictionary with flexible configuration.
396///
397/// # Example
398///
399/// ```
400/// use base_d::{Dictionary, EncodingMode};
401/// let dict = Dictionary::builder()
402///     .chars_from_str("0123456789ABCDEF")
403///     .mode(EncodingMode::Radix)
404///     .build()
405///     .unwrap();
406/// ```
407#[derive(Debug, Default)]
408pub struct DictionaryBuilder {
409    chars: Option<Vec<char>>,
410    mode: Option<EncodingMode>,
411    padding: Option<char>,
412    start_codepoint: Option<u32>,
413}
414
415impl DictionaryBuilder {
416    /// Creates a new DictionaryBuilder with default settings.
417    pub fn new() -> Self {
418        Self {
419            chars: None,
420            mode: None,
421            padding: None,
422            start_codepoint: None,
423        }
424    }
425
426    /// Sets the dictionary characters from a vector.
427    ///
428    /// # Arguments
429    ///
430    /// * `chars` - Vector of characters to use in the dictionary
431    pub fn chars(mut self, chars: Vec<char>) -> Self {
432        self.chars = Some(chars);
433        self
434    }
435
436    /// Sets the dictionary characters from a string.
437    ///
438    /// # Arguments
439    ///
440    /// * `s` - String containing the dictionary characters
441    pub fn chars_from_str(mut self, s: &str) -> Self {
442        self.chars = Some(s.chars().collect());
443        self
444    }
445
446    /// Sets the encoding mode.
447    ///
448    /// # Arguments
449    ///
450    /// * `mode` - Encoding mode (Radix, Chunked, or ByteRange)
451    pub fn mode(mut self, mode: EncodingMode) -> Self {
452        self.mode = Some(mode);
453        self
454    }
455
456    /// Sets the padding character.
457    ///
458    /// # Arguments
459    ///
460    /// * `padding` - Padding character (typically '=' for RFC modes)
461    pub fn padding(mut self, padding: char) -> Self {
462        self.padding = Some(padding);
463        self
464    }
465
466    /// Sets the starting Unicode codepoint for ByteRange mode.
467    ///
468    /// # Arguments
469    ///
470    /// * `start_codepoint` - Starting Unicode codepoint
471    pub fn start_codepoint(mut self, start_codepoint: u32) -> Self {
472        self.start_codepoint = Some(start_codepoint);
473        self
474    }
475
476    /// Builds the Dictionary with the configured settings.
477    ///
478    /// # Errors
479    ///
480    /// Returns an error if:
481    /// - The configuration is invalid for the specified mode
482    /// - Required fields are missing
483    /// - Validation fails (duplicates, invalid characters, etc.)
484    #[allow(deprecated)]
485    pub fn build(self) -> Result<Dictionary, String> {
486        let mode = self.mode.unwrap_or(EncodingMode::Radix);
487        let chars = self.chars.unwrap_or_default();
488
489        Dictionary::new_with_mode_and_range(chars, mode, self.padding, self.start_codepoint)
490    }
491}
492
/// Unit tests for dictionary validation, the builder API, and the
/// `is_safe_byte_range` boundary conditions.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_duplicate_character_detection() {
        let chars = vec!['a', 'b', 'c', 'a'];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_empty_dictionary() {
        let chars = vec![];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("cannot be empty"));
    }

    #[test]
    fn test_chunked_mode_power_of_two() {
        let chars = vec!['a', 'b', 'c']; // 3 is not power of 2
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::Chunked)
            .build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_chunked_mode_valid_sizes() {
        // Test all valid chunked sizes, including the two largest (128 and 256).
        for &size in &[2u32, 4, 8, 16, 32, 64, 128, 256] {
            let chars: Vec<char> = (0..size)
                .map(|i| {
                    // Runs of 26 codepoints spaced 100 apart, starting at 'A':
                    // every value is distinct and none is a control or
                    // whitespace character.
                    char::from_u32('A' as u32 + (i % 26) + ((i / 26) * 100)).unwrap()
                })
                .collect();
            let result = Dictionary::builder()
                .chars(chars)
                .mode(EncodingMode::Chunked)
                .build();
            assert!(result.is_ok(), "Size {} should be valid", size);
        }
    }

    #[test]
    fn test_control_character_rejection() {
        let chars = vec!['a', 'b', '\x00', 'c']; // null character
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Control character"));
    }

    #[test]
    fn test_whitespace_rejection() {
        // Tab should be rejected (only space is allowed for RFC encodings like Base45)
        let chars = vec!['a', 'b', '\t', 'c'];
        let result = Dictionary::builder().chars(chars).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Whitespace"));

        // But space should be allowed (for Base45 RFC 9285 compliance)
        let chars_with_space = vec!['a', 'b', ' ', 'c'];
        let result_space = Dictionary::builder().chars(chars_with_space).build();
        assert!(result_space.is_ok());
    }

    #[test]
    fn test_padding_conflict_with_dictionary() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::Radix)
            .padding('b')
            .build();
        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.contains("Padding character"));
        assert!(err.contains("conflicts"));
    }

    #[test]
    fn test_valid_padding() {
        let chars = vec!['a', 'b', 'c', 'd'];
        let result = Dictionary::builder()
            .chars(chars)
            .mode(EncodingMode::Radix)
            .padding('=')
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_exceeds_unicode() {
        // Test with a start codepoint so high that start + 255 exceeds max valid Unicode (0x10FFFF)
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x10FF80) // 0x10FF80 + 255 = 0x11007F, exceeds 0x10FFFF
            .build();
        assert!(result.is_err());
    }

    #[test]
    fn test_byte_range_valid_start() {
        let result = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300) // Valid start in emoji range
            .build();
        assert!(result.is_ok());
    }

    #[test]
    fn test_byte_range_no_start_codepoint() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_detailed_error_messages() {
        // Test that error messages include useful information
        let chars = vec!['a', 'b', 'a'];
        let err = Dictionary::builder().chars(chars).build().unwrap_err();
        assert!(err.contains("'a'") || err.contains("U+"));
    }

    // DictionaryBuilder tests
    #[test]
    fn test_builder_basic() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1', '2', '3'])
            .build()
            .unwrap();

        assert_eq!(dict.base(), 4);
        assert_eq!(dict.mode(), &EncodingMode::Radix);
        assert_eq!(dict.padding(), None);
    }

    #[test]
    fn test_builder_from_str() {
        let dict = Dictionary::builder()
            .chars_from_str("0123456789ABCDEF")
            .build()
            .unwrap();

        assert_eq!(dict.base(), 16);
    }

    #[test]
    fn test_builder_with_mode() {
        let dict = Dictionary::builder()
            .chars(vec!['0', '1'])
            .mode(EncodingMode::Chunked)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::Chunked);
    }

    #[test]
    fn test_builder_with_padding() {
        let dict = Dictionary::builder()
            .chars_from_str("ABCD")
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.padding(), Some('='));
    }

    #[test]
    fn test_builder_byte_range() {
        let dict = Dictionary::builder()
            .mode(EncodingMode::ByteRange)
            .start_codepoint(0x1F300)
            .build()
            .unwrap();

        assert_eq!(dict.mode(), &EncodingMode::ByteRange);
        assert_eq!(dict.start_codepoint(), Some(0x1F300));
        assert_eq!(dict.base(), 256);
    }

    #[test]
    fn test_builder_byte_range_missing_start() {
        let result = Dictionary::builder().mode(EncodingMode::ByteRange).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("requires start_codepoint"));
    }

    #[test]
    fn test_builder_validation_duplicates() {
        let result = Dictionary::builder().chars(vec!['a', 'b', 'a']).build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Duplicate character"));
    }

    #[test]
    fn test_builder_chunked_validation() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c']) // 3 is not power of 2
            .mode(EncodingMode::Chunked)
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("power-of-two"));
    }

    #[test]
    fn test_builder_padding_conflict() {
        let result = Dictionary::builder()
            .chars(vec!['a', 'b', 'c'])
            .padding('b')
            .build();

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Padding character"));
    }

    #[test]
    fn test_builder_full_config() {
        let dict = Dictionary::builder()
            .chars_from_str("01")
            .mode(EncodingMode::Chunked)
            .padding('=')
            .build()
            .unwrap();

        assert_eq!(dict.base(), 2);
        assert_eq!(dict.mode(), &EncodingMode::Chunked);
        assert_eq!(dict.padding(), Some('='));
    }

    // --- is_safe_byte_range boundary tests ---

    #[test]
    fn test_is_safe_byte_range_nul() {
        // start=0 maps byte 0 to U+0000 (NUL) -- unsafe
        assert!(!is_safe_byte_range(0));
    }

    #[test]
    fn test_is_safe_byte_range_end_of_c1() {
        // start=0x009F: end = 0x009F+255 = 0x019E, but start itself is in C1 range -- unsafe
        assert!(!is_safe_byte_range(0x009F));
    }

    #[test]
    fn test_is_safe_byte_range_first_safe() {
        // start=0x00A0: end = 0x00A0+255 = 0x019F, all valid printable codepoints -- safe
        assert!(is_safe_byte_range(0x00A0));
    }

    #[test]
    fn test_is_safe_byte_range_just_below_surrogates() {
        // start=0xD700: end = 0xD700+255 = 0xD7FF, just below surrogate start -- safe
        assert!(is_safe_byte_range(0xD700));
    }

    #[test]
    fn test_is_safe_byte_range_overlaps_surrogate_start() {
        // start=0xD701: end = 0xD701+255 = 0xD800, overlaps surrogate start -- unsafe
        assert!(!is_safe_byte_range(0xD701));
    }

    #[test]
    fn test_is_safe_byte_range_above_surrogates() {
        // start=0xE000: above surrogate range, all valid -- safe
        assert!(is_safe_byte_range(0xE000));
    }

    #[test]
    fn test_is_safe_byte_range_at_unicode_max() {
        // start=0x10FF00: end = 0x10FF00+255 = 0x10FFFF, exactly at Unicode max -- safe
        assert!(is_safe_byte_range(0x10FF00));
    }

    #[test]
    fn test_is_safe_byte_range_exceeds_unicode_max() {
        // start=0x10FF01: end = 0x10FF01+255 = 0x110000, exceeds Unicode max -- unsafe
        assert!(!is_safe_byte_range(0x10FF01));
    }
}