cesu8str/
encoding.rs

1use std::io;
2use std::str::Utf8Error;
3
4use crate::ngstr::prims::enc_surrogates;
5use crate::unicode::utf8_char_width;
6use crate::Cesu8Error;
7use crate::Variant;
8
9/// Validates UTF-8 string as CESU-8, erroring if any non-CESU-8 sequences are found.
10pub(crate) fn utf8_as_cesu8_spec<const ENCODE_NUL: bool>(
11    text: &str,
12) -> Result<(), Cesu8Error> {
13    let mut i = 0;
14    let text_bytes = text.as_bytes();
15    while i < text_bytes.len() {
16        // eprintln!("[{}:{}, encode_nul = {}] i = {}, slice = {:?}, whole = {:?}", file!(), line!(), ENCODE_NUL, i, &text_bytes[i..], &text_bytes);
17        let b = text_bytes[i];
18        if ENCODE_NUL && b == b'\0' {
19            return Err(Cesu8Error::new(i, Some(1), Ok(())));
20        }
21
22        // ascii "fast-path"
23        if b.is_ascii() {
24            i += 1;
25            continue;
26        }
27
28        let w = utf8_char_width(b);
29
30        // if width = 4 then we'd have to re-encode
31        if w == 4 {
32            // str is always valid UTF8, so there was enough characters, and there was exactly four of them (not None)
33            return Err(Cesu8Error::new(i, Some(4), Ok(())));
34        }
35
36        // skip the continuation bytes of the character
37        // this should always be at least 1 for valid UTF8, which &str provides
38        assert_ne!(w, 0, "utf8 char length was 0, this is illegal in well-formed utf8 strings (byte {b:x?}, bytes[{i}] from {text_bytes:x?})");
39        i += w;
40    }
41
42    Ok(())
43}
44
45/// Validates UTF-8 string as CESU-8, erroring if any non-CESU-8 sequences are found.
46#[inline]
47pub(crate) fn utf8_as_cesu8(
48    text: &str,
49    variant: Variant,
50) -> Result<(), Cesu8Error> {
51    match variant {
52        Variant::Standard => utf8_as_cesu8_spec::<false>(text),
53        Variant::Java => utf8_as_cesu8_spec::<true>(text),
54    }
55}
56
57/// Re-encodes UTF-8 bytes as CESU-8, returning the first created Utf8Error
58///
59/// Depends on the caller to provide a writable object of appropriate size, and to cast the written bytes to a Cesu8Str
60///
61/// # Safety
62/// `assume_good` should be an index into `text`, where all the bytes within `&text[..assume_good]` are valid CESU-8.
63///
64/// As this range will be written to `encoded` literally, and not checked, then providing a range with invalid CESU-8 may result in undefined behavior.
65///
66/// A value of `0` for `assume_good` will always be safe.
67pub(crate) unsafe fn utf8_to_cesu8_spec<W: io::Write, const ENCODE_NUL: bool>(
68    text: &str,
69    assume_good: usize,
70    encoded: &mut W,
71) -> io::Result<Result<(), Utf8Error>> {
72    // make an internal function so unsafe parts can still be checked
73    if assume_good != 0 {
74        // check that this is correct on debug builds
75        debug_assert_eq!(
76            utf8_as_cesu8_spec::<ENCODE_NUL>(text)
77                .unwrap_err()
78                .valid_up_to(),
79            assume_good,
80            "tried to assume invalid CESU-8 as good"
81        );
82        debug_assert!(
83            assume_good <= text.len(),
84            "tried to assume_good a chunk larger than the source"
85        );
86    }
87
88    #[inline(always)]
89    fn utf8_to_cesu8_prealloc_internal<W: io::Write, const ENCODE_NUL: bool>(
90        text: &str,
91        assume_good: usize,
92        encoded: &mut W,
93    ) -> io::Result<Result<(), Utf8Error>> {
94        let bytes = text.as_bytes();
95
96        encoded.write_all(&bytes[..assume_good])?;
97
98        // start after we've already decoded some bits
99
100        // index into `text`
101        let mut i = assume_good;
102        let mut utf8_seg = 0;
103        let mut utf8_err = Ok(());
104
105        // how much we've written to 'encoded', for a utf8_err index if necessary
106        let mut written = assume_good;
107
108        macro_rules! write_cesu8 {
109            ($cesu8_slice: expr, $text_len: expr) => {
110                let sl: &[u8] = $cesu8_slice;
111                encoded.write_all(sl)?;
112                written += sl.len();
113                i += $text_len;
114            };
115        }
116
117        macro_rules! push_utf8 {
118            ($errlen: expr) => {
119                if utf8_seg > 0 {
120                    // push utf8_segment
121                    write_cesu8!(&bytes[i..i + utf8_seg], utf8_seg);
122
123                    utf8_seg = 0;
124                }
125
126                // update utf8_err if this is the first error
127                if let Some(err) = $errlen {
128                    if utf8_err.is_ok() {
129                        utf8_err = Err(utf8err_new(written, err));
130                    }
131                }
132            };
133        }
134
135        // while i+utf8_seg < bytes.len() {
136        while let Some(&b) = bytes.get(i + utf8_seg) {
137            // let b = bytes[i+utf8_seg];
138            if ENCODE_NUL && b == b'\0' {
139                push_utf8!(Some(Some(1))); // injected 0xC0,0x80 will be invalid UTF-8
140
141                // re-encode nul, skip it
142                write_cesu8!(&[0xC0, 0x80], 1);
143            } else if b.is_ascii() {
144                // ascii range
145                utf8_seg += 1;
146            } else {
147                match utf8_char_width(b) {
148                    4 => {
149                        push_utf8!(Some(Some(1)));
150
151                        // re-encode character, skip it
152                        let s = unsafe { std::str::from_utf8_unchecked(&bytes[i..i + 4]) };
153                        let c = s.chars().next().unwrap() as u32;
154
155                        write_cesu8!(&enc_surrogates(c), 4);
156                    }
157                    w => {
158                        // w should only be in range 1..=3
159                        utf8_seg += w;
160                    }
161                }
162            }
163        }
164
165        push_utf8!(None);
166
167        // more to prevent unused_assignment warnings in push_utf8 macro than anything
168        debug_assert_eq!(i, text.len(), "did not fully consume the input text bytes");
169        debug_assert_eq!(
170            utf8_seg, 0,
171            "did not fully consume the current utf8 segment"
172        );
173
174        Ok(utf8_err)
175    }
176
177    utf8_to_cesu8_prealloc_internal::<W, ENCODE_NUL>(text, assume_good, encoded)
178}
179
180#[inline]
181pub(crate) fn utf8_to_cesu8_safe(
182    text: &str,
183    encoded: &mut Vec<u8>,
184    variant: Variant,
185) -> Result<(), Utf8Error> {
186    unsafe {
187        utf8_to_cesu8(text, 0, encoded, variant).expect(
188            "io::Error occured within Vec's io::Write implementation. This should not happen.",
189        )
190    }
191}
192
193#[inline]
194pub(crate) unsafe fn utf8_to_cesu8<W: io::Write>(
195    text: &str,
196    assume_good: usize,
197    encoded: &mut W,
198    variant: Variant,
199) -> io::Result<Result<(), Utf8Error>> {
200    match variant {
201        Variant::Standard => utf8_to_cesu8_spec::<W, false>(text, assume_good, encoded),
202        Variant::Java => utf8_to_cesu8_spec::<W, true>(text, assume_good, encoded),
203    }
204}
205
206
207
/// There is no way to create a Utf8Error outside the standard library, so unsafely and artificially create one
///
/// This is useful for marking a specific index/length as a UTF8Error without performing O(n) string validation through stdlib
///
/// * `valid_up_to` - the value the returned error reports from `Utf8Error::valid_up_to`.
/// * `err_len` - the value the returned error reports from `Utf8Error::error_len`.
///
/// The error is produced by transmuting a local look-alike struct. On debug builds the size,
/// the alignment and a read-back of both fields are asserted, so a change to the private
/// layout of `Utf8Error` is caught instead of silently yielding a garbage error.
#[inline]
pub(crate) fn utf8err_new(valid_up_to: usize, err_len: Option<u8>) -> Utf8Error {
    /// Local stand-in that is expected to have the same layout as `Utf8Error`.
    #[allow(dead_code)] // the fields are only ever read through the transmuted `Utf8Error`
    struct CustomUtf8Error {
        valid_up_to: usize,
        err_len: Option<u8>,
    }

    // (loosely) ensure that Utf8Error does not change
    debug_assert_eq!(
        std::mem::align_of::<CustomUtf8Error>(),
        std::mem::align_of::<Utf8Error>(),
        "std::str::Utf8Error has unexpectedly changed alignment"
    );
    debug_assert_eq!(
        std::mem::size_of::<CustomUtf8Error>(),
        std::mem::size_of::<Utf8Error>(),
        "std::str::Utf8Error has unexpectedly changed size"
    );

    // SAFETY: this relies on `Utf8Error` being laid out exactly like `CustomUtf8Error`. The
    // standard library does not promise that layout; `transmute` itself rejects a size
    // mismatch at compile time, and the debug assertions around this call cover alignment and
    // field placement.
    let err: Utf8Error = unsafe {
        std::mem::transmute(CustomUtf8Error {
            valid_up_to,
            err_len,
        })
    };

    // read both fields back through the public API to confirm they landed where expected
    debug_assert_eq!(
        err.valid_up_to(),
        valid_up_to,
        "std::str::Utf8Error has unexpectedly changed field layout"
    );
    debug_assert_eq!(
        err.error_len(),
        err_len.map(usize::from),
        "std::str::Utf8Error has unexpectedly changed field layout"
    );

    err
}
238
239#[inline]
240pub(crate) fn utf8err_inc(err: &Utf8Error, incby: usize) -> Utf8Error {
241    utf8err_new(incby + err.valid_up_to(), err.error_len().map(|b| b as u8))
242}