cesu8str 0.2.4

Convert to and from CESU-8 or Modified UTF-8 encodings, only when necessary.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
#![allow(deprecated)]

use std::borrow::Cow;
use std::error::Error;
use std::fmt;
use std::num::NonZeroUsize;
use std::str::Utf8Error;

use crate::from_utf8_slice;
use crate::from_utf8_vec;
use crate::encoding::utf8_as_cesu8_spec;
use crate::encoding::utf8err_inc;
use crate::encoding::utf8err_new;
use crate::ngstr::prims::CONT_MASK;
use crate::ngstr::prims::TAG_CONT_U8;
use crate::ngstr::prims::dec_surrogates_infallable;
use crate::string::Cesu8Str;
use crate::Variant;

/// Error describing why a `str` or sequence of `u8` could not be interpreted
/// as a CESU-8 string.
///
/// The error records how far the input was valid CESU-8, how long the
/// offending sequence is (when known), and any UTF-8 error that was recorded
/// alongside it.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Cesu8Error {
    /// `bytes[..valid_up_to]` is a valid CESU-8 string.
    pub(crate) valid_up_to: usize,

    /// Length of the invalid sequence starting at `valid_up_to`; decoding
    /// should resume at `bytes[valid_up_to + error_len.unwrap()]`.
    ///
    /// If `None`, more data is needed. The length is never zero, which is
    /// enforced by the `NonZeroUsize` type.
    pub(crate) error_len: Option<NonZeroUsize>,

    /// Any UTF-8 error that would have occurred within the CESU-8 slice,
    /// or `Ok(())` if none was recorded.
    pub(crate) utf8_error: Result<(), Utf8Error>,
}
// Both methods overridden here are deprecated members of `std::error::Error`;
// the file-level `#![allow(deprecated)]` keeps the compiler quiet about them.
impl Error for Cesu8Error {
    /// Returns a fixed, short description of the error.
    fn description(&self) -> &str {
        "decoding error"
    }
    /// Always returns `None`: no underlying cause is exposed through this trait.
    /// Use `Cesu8Error::utf8_error` to inspect the recorded UTF-8 error instead.
    fn cause(&self) -> Option<&dyn Error> {
        None
    }
}

impl fmt::Display for Cesu8Error {
    /// Writes a fixed, human-readable message; the error's fields are not
    /// included in the output.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("could not convert CESU-8 data to UTF-8")
    }
}
impl Cesu8Error {
    /// Builds a new error from its raw parts.
    ///
    /// * `valid_up_to` - number of leading bytes that are valid CESU-8.
    /// * `error_len` - length of the invalid sequence, or `None` if more data is needed.
    /// * `utf8_error` - the UTF-8 error to record alongside this CESU-8 error, if any.
    ///
    /// # Panics
    ///
    /// Panics if `error_len` is `Some(0)`; an error with a known length must
    /// cover at least one byte.
    #[inline] // inlined in the hope that the `.expect` is optimized out when constants (or numbers verifiably over zero) are passed
    pub(crate) fn new(
        valid_up_to: usize,
        error_len: Option<usize>,
        utf8_error: Result<(), Utf8Error>,
    ) -> Cesu8Error {
        Cesu8Error {
            valid_up_to,
            error_len: error_len.map(|el| {
                NonZeroUsize::new(el).expect("attempted to create zero-size endoing error")
            }),
            utf8_error,
        }
    }

    /// Returns a copy of this error with its recorded UTF-8 error replaced by `err`.
    pub(crate) fn with_utf8_error(&self, err: Result<(), Utf8Error>) -> Cesu8Error {
        let mut cesuerr = *self;
        cesuerr.utf8_error = err;
        cesuerr
    }

    /// Creates a new `Cesu8Error`, with `beginning` added to its `valid_up_to` field.
    ///
    /// The `valid_up_to` of the recorded UTF-8 error (if any) is shifted by the
    /// same amount, so both indices stay relative to the same buffer.
    ///
    /// This will generally only be used for better error reporting (such as in
    /// streams where data may be handled in chunks).
    pub fn increase_valid_index(&self, beginning: usize) -> Cesu8Error {
        Cesu8Error {
            valid_up_to: self.valid_up_to + beginning,
            error_len: self.error_len,
            utf8_error: self.utf8_error.map_err(|e| {
                utf8err_new(e.valid_up_to() + beginning, e.error_len().map(|u| u as u8))
            }),
        }
    }

    /// Returns the index in the given string up to which valid CESU-8 was
    /// verified.
    ///
    /// It is the maximum index such that
    /// `Cesu8Str::from_cesu8(&input[..index], variant)` would return `Ok(_)`.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use cesu8str::{LegacyCesu8Str as Cesu8Str, Variant};
    ///
    /// // some invalid bytes, in a byte string
    /// // the '\xC0' is the first half to an embedded Java-style nul sequence
    /// let sparkle_heart: &[u8] = b"my \xC0 string";
    ///
    /// // cesu8str::Cesu8Str::from_cesu8 returns a Cesu8Error
    /// let error = Cesu8Str::from_cesu8(&sparkle_heart, Variant::Java).unwrap_err();
    ///
    /// // the byte at index 3 is invalid here, the error is 1 byte long
    /// assert_eq!(3, error.valid_up_to());
    /// assert_eq!(Some(1), error.error_len());
    /// ```
    #[inline]
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }

    /// Provides more information about the failure:
    ///
    /// * `None`: the end of the input was reached unexpectedly.
    ///   If a byte stream (such as a file or a network socket) is being decoded incrementally,
    ///   this could be a valid `char` whose CESU-8 byte sequence is spanning multiple chunks.
    ///
    /// * `Some(len)`: an unexpected byte or byte sequence was encountered.
    ///   The length provided is that of the invalid byte sequence
    ///   that starts at the index given by `valid_up_to()`.
    ///   Decoding should resume after that sequence
    ///   (after inserting a [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]) in case of
    ///   lossy decoding.
    ///
    ///   Note that `Cesu8Error::error_len` differs from `Utf8Error::error_len` in that
    ///   the CESU-8 version reports the length until the next valid UTF-8 sequence (or end of string)
    ///   while the UTF-8 version often reports each erroring byte individually.
    ///
    /// [U+FFFD]: std::char::REPLACEMENT_CHARACTER
    #[inline]
    pub fn error_len(&self) -> Option<usize> {
        self.error_len.map(|nzus| nzus.get())
    }

    /// Access an underlying UTF-8 error that may have occurred before this CESU-8 error.
    /// If this returns `Ok(())` then it can be assumed that the source slice is valid
    /// CESU-8 and UTF-8 up to `self.valid_up_to()`.
    ///
    /// If a UTF-8 error would occur at the same index as this CESU-8 error, and they both
    /// need more data, this should be `Ok(())`. In other words, `error_len() == None` can
    /// also be considered an implicit UTF-8 error.
    #[inline]
    pub fn utf8_error(&self) -> Result<(), Utf8Error> {
        self.utf8_error
    }
}

/// Decodes an already-validated CESU-8 string into a freshly allocated UTF-8 `String`.
///
/// `ENCODE_NUL` selects the flavour being decoded and must agree with
/// `cesu.variant` (checked in debug builds). When it is `true`, the two-byte
/// sequence `0xC0 0x80` is decoded to a nul character.
///
/// Debug builds additionally re-run `cesu8_validate` over the stored bytes and
/// panic if the bytes are invalid or if the cached UTF-8 error differs from
/// the freshly computed one.
pub(crate) fn cesu8_to_utf8_const<const ENCODE_NUL: bool>(cesu: &Cesu8Str<'_>) -> String {
    // The input is expected to be well-formed CESU-8; everything below relies on that.
    debug_assert_eq!(
        Variant::from(ENCODE_NUL),
        cesu.variant,
        "ran wrong const-generic routine for cesu type"
    );
    if cfg!(debug_assertions) {
        match cesu8_validate::<ENCODE_NUL>(&cesu.bytes) {
            Err(_) => panic!("stored invalid CESU-8 within Cesu8Str (cesu8 str: variant={:?}, utf8_err={:?}, bytes={:X?})", cesu.variant, cesu.utf8_error, cesu.bytes),
            Ok(recomputed) => {
                if recomputed != cesu.utf8_error {
                    panic!("internal CESU-8 contains different UTF-8 values than cached after debug validation");
                }
            }
        }
    }

    let bytes = cesu.as_bytes();

    // Fast path: no cached UTF-8 error means the bytes are already UTF-8,
    // so they can be copied into a new String as-is.
    let cached_err = match cesu.utf8_error {
        Ok(()) => {
            // SAFETY: our bytes have already been checked as valid UTF-8
            return from_utf8_vec(
                cesu.bytes.to_vec(),
                "invalid UTF8 is in a CESU-8 string without UTF-8 errors",
            );
        }
        Err(err) => err,
    };

    // Seed the output with the prefix that is known to be valid UTF-8 and
    // make room for the remainder of the input.
    let mut pos = cached_err.valid_up_to();
    // SAFETY: bytes up to the cached error have already been validated as UTF-8
    let mut out = from_utf8_slice(
        &cesu.bytes[..pos],
        "invalid UTF8 is in a CESU-8 string before the recorded UTF-8 error",
    )
    .to_owned();
    out.reserve(cesu.bytes.len() - pos);

    while pos < bytes.len() {
        // Lean on the fast stdlib decoder wherever possible. Both the 6-byte
        // surrogate pairs and Java's 0xC0,0x80 nul are invalid UTF-8, so it
        // stops exactly at the sequences that need special handling.
        let stop = match std::str::from_utf8(&bytes[pos..]) {
            Ok(tail) => {
                // Everything left is plain UTF-8: append it and finish.
                out.push_str(tail);
                return out;
            }
            Err(err) => err,
        };

        let clean_len = stop.valid_up_to();

        // SAFETY: `from_utf8` just reported this range as valid UTF-8
        out.push_str(unsafe { std::str::from_utf8_unchecked(&bytes[pos..pos + clean_len]) });
        debug_assert!(
            stop.error_len().is_some(),
            "reached unterminated sequence, this should be impossible for validated CESU8"
        );
        pos += clean_len;

        let rest = &bytes[pos..];
        debug_assert!(
            !rest.is_empty(),
            "found no bytes to consume without consuming whole string"
        );

        // Java-style encoded nul.
        if ENCODE_NUL && rest.starts_with(&[0xC0, 0x80]) {
            out.push('\0');
            pos += 2;
            continue;
        }

        // Otherwise this must be a supplementary character stored as two
        // three-byte surrogate halves.
        match rest.get(..6) {
            Some(&[first, second, third, fourth, fifth, sixth]) => {
                debug_assert!(
                    first == 0xED && fourth == 0xED,
                    "expected surrogate pair, recieved something else (err bytes[..6]: {:X?})",
                    &rest[..6]
                );

                // Valid three-byte sequences were already consumed by
                // `from_utf8`; only the surrogate halves end up here.
                let utf8bytes: [u8; 4] = dec_surrogates_infallable(second, third, fifth, sixth);

                out.push_str(from_utf8_slice(
                    &utf8bytes,
                    "dec_surrogates did not return valid UTF8",
                ));
                pos += 6;
            }
            _ => {
                unreachable!("unknown CESU8 decoding error. Was a Cesu8Str made with invalid CESU-8 bytes? (next (i={})..6 bytes: {:X?}) (cesu8 str: {:X?})", pos, &rest[..6.min(rest.len())], bytes);
            }
        }
    }

    debug_assert_eq!(
        bytes.len(),
        pos,
        "did not consume expected number of bytes while converting cesu8 to utf8"
    );
    out
}

/// Decodes a CESU-8 string into an owned UTF-8 `String`, dispatching on the
/// string's own variant to the matching const-generic implementation
/// (`true` for `Variant::Java`, `false` for `Variant::Standard`).
pub(crate) fn cesu8_to_utf8(cesu: &Cesu8Str<'_>) -> String {
    match cesu.variant {
        Variant::Java => cesu8_to_utf8_const::<true>(cesu),
        Variant::Standard => cesu8_to_utf8_const::<false>(cesu),
    }
}

/// Validates raw bytes as CESU8, reporting any errors if found. Will not allocate.
///
/// If ENCODE_NUL is enabled, this will error on any interior nuls, even those
/// acting as a nul-terminator. It is not recommended to pass a nul-terminated
/// slice to this function.
///
/// # Returns
///
/// * `Ok(Ok(()))` - the bytes are valid CESU-8 and `std::str::from_utf8` never
///   reported an error while scanning them.
/// * `Ok(Err(utf8_error))` - the bytes are valid CESU-8, and `utf8_error` is the
///   first UTF-8 error encountered while scanning (i.e. the first
///   CESU-8-specific sequence).
/// * `Err(cesu8_error)` - the bytes are not valid CESU-8. An `error_len()` of
///   `None` means the input ended in the middle of a sequence and more data is
///   needed. The error also carries the first UTF-8 error seen so far.
pub(crate) fn cesu8_validate<const ENCODE_NUL: bool>(
    bytes: &[u8],
) -> Result<Result<(), Utf8Error>, Cesu8Error> {
    let mut i = 0;
    let mut first_utf8_error: Result<(), Utf8Error> = Ok(());

    while i < bytes.len() {
        // Scan the remainder as UTF-8. The error is passed through `utf8err_inc`
        // with `i`; the code below treats its `valid_up_to()` as an index into
        // `bytes` rather than into `bytes[i..]`.
        let try_utf8 = std::str::from_utf8(&bytes[i..]).map_err(|uerr| utf8err_inc(&uerr, i));

        // update first_utf8_error
        if let Err(uerr) = &try_utf8 {
            if first_utf8_error.is_ok() {
                first_utf8_error = Err(*uerr);
            }
        }

        // Check the valid UTF-8 portion for sequences that are legal UTF-8 but
        // not legal in this CESU-8 variant.
        let try_cesu8: Result<Cesu8Str, Cesu8Error> = {
            // SAFETY: `from_utf8` reported `bytes[i..valid_up_to]` as valid UTF-8.
            let s = try_utf8.unwrap_or_else(|uerr| unsafe {
                std::str::from_utf8_unchecked(&bytes[i..uerr.valid_up_to()])
            });

            utf8_as_cesu8_spec::<ENCODE_NUL>(s)
                .map_err(|e| e.increase_valid_index(i))
                .map(|()| Cesu8Str {
                    variant: ENCODE_NUL.into(),
                    bytes: Cow::Borrowed(s.as_bytes()),
                    // would have returned Err(_) if there was a utf8/cesu8 incompatibility
                    utf8_error: Ok(())
                })
        };

        // This should either return, or explicitly `continue`
        // It doesn't matter what type this is, as long as it's not `()`
        match (try_utf8, try_cesu8) {
            (Ok(_), Ok(c)) => {
                // valid UTF-8/CESU-8 -> consumed rest of string
                i += c.bytes.len();
                debug_assert_eq!(
                    i,
                    bytes.len(),
                    "found valid UTF-8 & CESU-8 that did not consume rest of string"
                );
                return Ok(first_utf8_error);
            }
            (Ok(_) | Err(_), Err(cerr)) => {
                // recieved a chunk of valid UTF-8, which contained a CESU-8 error

                // the CESU-8 error should be prioritized, so return that

                return Err(cerr.with_utf8_error(first_utf8_error));
            }
            (Err(uerr), Ok(c)) => {
                // UTF-8 error, but we have a valid CESU-8 chunk from the valid UTF-8 portion
                debug_assert_eq!(
                    uerr.valid_up_to(),
                    i + c.bytes.len(),
                    "CESU-8 string valid with unexpected length"
                );
                i += c.bytes.len();

                // need to process a new UTF-8 error
                // may simply be that we need more data

                /// Checks the byte at `start + offset` with `check`.
                ///
                /// A missing byte yields a "need more data" error (length `None`)
                /// anchored at `start`; a failing byte yields an error spanning
                /// from `start` to the next non-continuation byte.
                fn validate_byte<F: FnOnce(u8) -> bool>(
                    bytes: &[u8],
                    start: usize,
                    offset: usize,
                    first_utf8_error: Result<(), Utf8Error>,
                    check: F,
                ) -> Result<(), Cesu8Error> {
                    match bytes.get(start + offset) {
                        None => Err(Cesu8Error::new(start, None, first_utf8_error)),
                        Some(b) if check(*b) => Ok(()),
                        Some(_) => Err(until_next_codepoint(bytes, start, first_utf8_error)),
                    }
                }

                if uerr.error_len().is_none() {
                    return Err(Cesu8Error::new(i, None, first_utf8_error));
                }

                // A UTF-8 error with a length covers at least the byte at `i`, so
                // that byte must exist. Nothing is guaranteed about `i + 1`: a
                // lone lead byte at the very end of the input is handled by
                // `validate_byte` returning a "need more data" error.
                debug_assert!(
                    i < bytes.len(),
                    "there were no more bytes after a UTF-8 error with a length"
                );

                // do not try to loop this - there are valid UTF-8 sequences starting with 0xED that we could falsely try to interpret as CESU-8
                match bytes[i] {
                    0xC0 if ENCODE_NUL => {
                        // Java-style encoded nul: 0xC0 0x80
                        validate_byte(bytes, i, 1, first_utf8_error, |b| b == 0x80)?;
                        i += 2;
                    }
                    0xED => {
                        // from_utf8 should consume any valid three-bytes sequences
                        // our three-byte surrogate pairs should be invalid, and caught here

                        // assert our continuation bytes are indeed continuations
                        // assert our second & fifth bytes are on the right side of each other

                        // note that the way that validate_byte works, if there is an error in the first half,
                        // then the error length only accounts for the first half. An unpaired second half will be emitted
                        // as a separate error

                        // could split these up, but these have to occur in pairs - if they don't, it's invalid
                        validate_byte(bytes, i, 1, first_utf8_error, |b| {
                            b & !CONT_MASK == TAG_CONT_U8
                        })?;
                        validate_byte(bytes, i, 1, first_utf8_error, |b| {
                            b & 0b1111_0000 == 0b1010_0000
                        })?; // first half
                        validate_byte(bytes, i, 2, first_utf8_error, |b| {
                            b & !CONT_MASK == TAG_CONT_U8
                        })?;

                        validate_byte(bytes, i, 3, first_utf8_error, |b| b == 0xED)?;
                        validate_byte(bytes, i, 4, first_utf8_error, |b| {
                            b & !CONT_MASK == TAG_CONT_U8
                        })?;
                        validate_byte(bytes, i, 4, first_utf8_error, |b| {
                            b & 0b1111_0000 == 0b1011_0000
                        })?; // second half
                        validate_byte(bytes, i, 5, first_utf8_error, |b| {
                            b & !CONT_MASK == TAG_CONT_U8
                        })?;
                        i += 6;
                    }
                    _ => {
                        // not valid UTF-8 or CESU-8
                        debug_assert!(
                            std::str::from_utf8(&bytes[i..]).is_err(),
                            "could be solved with more data, but thats not is reported"
                        );
                        return Err(until_next_codepoint(bytes, i, first_utf8_error));
                    }
                }

                continue;
            }
        }
    }

    assert_eq!(bytes.len(), i, "did not error, but reached end without consuming entire byte slice (expected {}, processed {})", bytes.len(), i);
    Ok(first_utf8_error)
}



/// Given a byte buffer and a byte index, returns a `Cesu8Error` that
/// states `&bytes[..start]` is valid, and that the error lasts until the
/// start of the next ascii character, UTF-8 codepoint, or the end of the string.
///
/// The reported length is one (for the byte at `start`) plus the number of
/// continuation bytes that immediately follow it. The error may therefore
/// extend exactly to the end of `bytes`; that is a regular error with a known
/// length, not a panic.
///
/// * `bytes` - the buffer being validated.
/// * `start` - index of the first invalid byte; must be within `bytes`.
/// * `utf8_err` - the UTF-8 error to record on the returned error.
pub(crate) fn until_next_codepoint(bytes: &[u8], start: usize, utf8_err: Result<(), Utf8Error>) -> Cesu8Error {
    // TODO: should semantics of this return None if there is no next valid character/UTF-8 codepoint?
    // eg: return error_len() == None if we haven't found valid data
    debug_assert!(
        start < bytes.len(),
        "error start index must be inside the byte buffer"
    );

    // Continuation bytes directly after the erroring byte belong to the same
    // broken sequence.
    let trailing_continuations = bytes
        .iter()
        .skip(start + 1) // skip start byte of error
        .take_while(|&&b| b & !CONT_MASK == TAG_CONT_U8)
        .count();

    // +1 for the skipped start byte, so the length is never zero
    let skip = trailing_continuations + 1;
    Cesu8Error::new(start, Some(skip), utf8_err)
}