str-queue 0.0.1

//! UTF-8 related types, functions, and constants.

use core::convert::TryFrom;
use core::str;

/// `U+FFFD REPLACEMENT CHARACTER` of Unicode, as a string slice.
///
/// Holds the same code point as `REPLACEMENT_CHAR`, in `&str` form.
pub(crate) const REPLACEMENT_CHAR_STR: &str = "\u{FFFD}";

/// `U+FFFD REPLACEMENT CHARACTER` of Unicode, as a single `char`.
///
/// Holds the same code point as `REPLACEMENT_CHAR_STR`, in `char` form.
pub(crate) const REPLACEMENT_CHAR: char = '\u{FFFD}';

/// Guesses the encoded length of a character from its leading byte.
///
/// Only the most significant four bits of `first` are inspected:
///
/// * `0x0..=0x7` (ASCII range) gives 1,
/// * `0x8..=0xB` (continuation byte, cannot start a character) gives 0,
/// * `0xC` and `0xD` give 2,
/// * `0xE` gives 3,
/// * `0xF` gives 4.
///
/// The result is therefore always in `0..=4`, and 0 means the byte cannot be
/// the first byte of a UTF-8 sequence.
///
/// This is a guess based on the leading byte alone: a non-zero result does not
/// prove that a well-formed UTF-8 sequence follows. Callers must validate the
/// actual bytes before relying on the value.
#[inline]
pub(crate) fn expected_char_len(first: u8) -> u8 {
    // Zero is used for bytes that cannot start a character, so that callers
    // can treat "no character" and "invalid start byte" uniformly.
    match first >> 4 {
        0x0..=0x7 => 1,
        0x8..=0xB => 0,
        0xC | 0xD => 2,
        0xE => 3,
        // Only `0xF` remains, since a `u8` shifted right by four fits in four bits.
        _ => 4,
    }
}

/// Decodes one character from the front of a byte iterator.
///
/// On success, returns the character together with its encoded length in bytes.
///
/// Returns `None` when:
///
/// * the iterator yields no bytes,
/// * the first byte cannot start a UTF-8 sequence,
/// * the iterator ends before the expected number of bytes has been read, or
/// * the bytes that were read do not form a valid UTF-8 sequence.
///
/// Bytes are pulled from the iterator as they are needed, so when a borrowed
/// iterator is passed in, bytes read before a failure remain consumed.
pub(crate) fn take_char<I>(mut bytes: I) -> Option<(char, u8)>
where
    I: Iterator<Item = u8>,
{
    let lead = bytes.next()?;
    let char_len = expected_char_len(lead);

    // Settle the trivial cases immediately; only multi-byte sequences go on.
    let byte_count = match char_len {
        0 => return None,
        1 => {
            debug_assert!(lead.is_ascii());
            let ascii =
                char::try_from(lead).expect("[consistency] the byte is valid ASCII character");
            return Some((ascii, 1));
        }
        2..=4 => usize::from(char_len),
        _ => unreachable!(
            "[validity] `expected_char_len()` must return the value less than or equal to 4"
        ),
    };
    debug_assert!(char_len >= 2);

    // Collect the leading byte plus the following bytes into a scratch buffer.
    let mut encoded = [lead, 0, 0, 0];
    for slot in encoded.iter_mut().take(byte_count).skip(1) {
        *slot = bytes.next()?;
    }

    // Validation fails (and `None` is returned) for ill-formed sequences.
    let decoded = str::from_utf8(&encoded[..byte_count]).ok()?;
    let ch = decoded
        .chars()
        .next()
        .expect("[consistency] the string is not empty");

    Some((ch, char_len))
}

/// Separates a byte slice into its valid UTF-8 prefix and an incomplete tail.
///
/// The first element of the returned pair is the leading part of `bytes` as a
/// string. The second element is `Some` with the remaining bytes when the last
/// character is not complete, and `None` otherwise.
///
/// # Panics
///
/// Panics if no character start byte is found in the last four bytes, or if
/// the part before the incomplete tail is not valid UTF-8. In other words,
/// `bytes` must be well-formed UTF-8 apart from a possibly incomplete last
/// character.
pub(crate) fn split_incomplete_suffix(bytes: &[u8]) -> (&str, Option<&[u8]>) {
    /// Panic message shared by both invariant checks.
    const INVARIANT: &str = "[consistency] bytes should be valid UTF-8 sequence, \
         except for the possible trailing incomplete character";

    let tail_info = last_char_len_in_last_4bytes(bytes).expect(INVARIANT);

    // Cut off only the bytes of an incomplete last character (possibly none).
    let (head, tail) = bytes.split_at(bytes.len() - tail_info.len_incomplete());
    let valid = str::from_utf8(head).expect(INVARIANT);

    let incomplete = if tail_info.is_complete() {
        None
    } else {
        Some(tail)
    };

    (valid, incomplete)
}

/// Information about the length of the last (possibly incomplete) character.
///
/// `available` and `expected` are both 0 when there is no character at all.
/// For a complete character they are equal; for an incomplete character
/// `available` is smaller than `expected`.
#[derive(Debug, Clone, Copy)]
pub(crate) struct LastCharLen {
    /// Number of available bytes for the last (possibly incomplete) character.
    ///
    /// This should be less than or equal to 4.
    pub(crate) available: u8,
    /// Number of expected bytes for the last (possibly incomplete) character.
    ///
    /// This should be less than or equal to 4.
    pub(crate) expected: u8,
}

impl LastCharLen {
    /// Returns true if the last character is complete or no characters exist.
    #[inline]
    #[must_use]
    pub(crate) fn is_complete(self) -> bool {
        self.available == self.expected
    }

    /// Returns the length of available bytes for incomplete characters.
    ///
    /// Returns 0 if the character is complete, and also when more bytes are
    /// available than expected.
    #[inline]
    #[must_use]
    pub(crate) fn len_incomplete(self) -> usize {
        if self.available < self.expected {
            usize::from(self.available)
        } else {
            0
        }
    }

    /// Returns the length of bytes necessary to make the incomplete character complete.
    ///
    /// Returns 0 if the character is already complete, and also when more
    /// bytes are available than expected.
    #[inline]
    #[must_use]
    pub(crate) fn len_missing(self) -> usize {
        // Saturate instead of subtracting directly: `available` can exceed
        // `expected` for ill-formed input, and a plain `u8` subtraction would
        // then overflow (panic with overflow checks, wrap around without).
        usize::from(self.expected.saturating_sub(self.available))
    }
}

/// Measures the last (possibly incomplete) character at the end of `bytes`.
///
/// The final four bytes (or fewer, if the slice is shorter) are scanned from
/// the end for a byte that is not of the form `0b10xx_xxxx`. That byte is
/// taken as the start of the last character: `available` is the number of
/// bytes from it to the end of the slice, and `expected` is the length
/// suggested by that byte.
///
/// An empty slice yields `Some` with both lengths set to 0.
///
/// Returns `None` if none of the scanned bytes can be taken as a start byte.
///
/// This function does not check that the bytes really are a valid UTF-8
/// sequence. Callers must validate them, or otherwise guarantee validity,
/// before treating them as UTF-8.
// Lengths are carried as `u8`: it is the smallest integer type that can hold
// them, and it converts into `usize` infallibly.
pub(crate) fn last_char_len_in_last_4bytes(bytes: &[u8]) -> Option<LastCharLen> {
    if bytes.is_empty() {
        return Some(LastCharLen {
            available: 0,
            expected: 0,
        });
    }

    // Only the final four bytes are candidates for the last character.
    let tail = &bytes[bytes.len().saturating_sub(4)..];
    let start_in_tail = tail
        .iter()
        .rposition(|&byte| (byte & 0b1100_0000) != 0x80)?;
    let available_len = tail.len() - start_in_tail;

    debug_assert!(
        (1..=4).contains(&available_len),
        "[validity] the iterator is limited to emit at most 4 elements"
    );
    // The cast is lossless: `tail` holds at most four bytes.
    let available = available_len as u8;
    let expected = expected_char_len(tail[start_in_tail]);
    debug_assert!(
        available <= expected,
        "[consistency] the character must not be longer than expected"
    );

    Some(LastCharLen {
        available,
        expected,
    })
}