os_str_bytes 7.1.1

Lossless functionality for platform-native strings
Documentation
use std::char;
use std::char::DecodeUtf16;
use std::iter::FusedIterator;
use std::num::NonZeroU16;

use crate::util::BYTE_SHIFT;
use crate::util::CONT_MASK;
use crate::util::CONT_TAG;
use crate::util::MAX_UTF8_LENGTH;

use super::CodePoints;
use super::Result;

const MIN_HIGH_SURROGATE: u16 = 0xD800;

const MIN_LOW_SURROGATE: u16 = 0xDC00;

const MIN_SURROGATE_CODE: u32 = (u16::MAX as u32) + 1;

const MAX_WTF8_LENGTH: usize = MAX_UTF8_LENGTH;

macro_rules! static_assert {
    ( $condition:expr ) => {
        const _: () = assert!($condition, "static assertion failed");
    };
}

pub(in super::super) struct DecodeWide<I>
where
    I: Iterator<Item = u16>,
{
    iter: DecodeUtf16<I>,
    code_point: u32,
    shifts: u8,
}

impl<I> DecodeWide<I>
where
    I: Iterator<Item = u16>,
{
    pub(in super::super) fn new<S>(string: S) -> Self
    where
        S: IntoIterator<IntoIter = I, Item = I::Item>,
    {
        Self {
            iter: char::decode_utf16(string),
            code_point: 0,
            shifts: 0,
        }
    }

    #[inline(always)]
    const fn get_raw_byte(&self) -> u8 {
        (self.code_point >> (self.shifts * BYTE_SHIFT)) as u8
    }
}

impl<I> Iterator for DecodeWide<I>
where
    I: Iterator<Item = u16>,
{
    type Item = u8;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(shifts) = self.shifts.checked_sub(1) {
            self.shifts = shifts;
            return Some((self.get_raw_byte() & CONT_MASK) | CONT_TAG);
        }

        self.code_point = self
            .iter
            .next()?
            .map(Into::into)
            .unwrap_or_else(|x| x.unpaired_surrogate().into());

        macro_rules! decode {
            ( $tag:expr ) => {
                Some(self.get_raw_byte() | $tag)
            };
        }
        macro_rules! try_decode {
            ( $tag:expr , $upper_bound:expr ) => {
                if self.code_point < $upper_bound {
                    return decode!($tag);
                }
                self.shifts += 1;
            };
        }
        try_decode!(0, 0x80);
        try_decode!(0xC0, 0x800);
        try_decode!(0xE0, MIN_SURROGATE_CODE);
        decode!(0xF0)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.size_hint();
        let shifts = self.shifts.into();
        (
            low.saturating_add(shifts),
            high.and_then(|x| x.checked_mul(MAX_WTF8_LENGTH))
                .and_then(|x| x.checked_add(shifts)),
        )
    }
}

pub(in super::super) struct EncodeWide<I>
where
    I: Iterator<Item = u8>,
{
    iter: CodePoints<I>,
    surrogate: Option<NonZeroU16>,
}

impl<I> EncodeWide<I>
where
    I: Iterator<Item = u8>,
{
    fn new<S>(string: S) -> Self
    where
        S: IntoIterator<IntoIter = I>,
    {
        Self {
            iter: CodePoints::new(string),
            surrogate: None,
        }
    }

    pub(in super::super) const fn is_still_utf8(&self) -> bool {
        self.iter.is_still_utf8()
    }
}

impl<I> FusedIterator for EncodeWide<I> where
    I: FusedIterator + Iterator<Item = u8>
{
}

impl<I> Iterator for EncodeWide<I>
where
    I: Iterator<Item = u8>,
{
    type Item = Result<u16>;

    fn next(&mut self) -> Option<Self::Item> {
        if let Some(surrogate) = self.surrogate.take() {
            return Some(Ok(surrogate.get()));
        }

        self.iter.next().map(|code_point| {
            code_point.map(|code_point| {
                code_point
                    .checked_sub(MIN_SURROGATE_CODE)
                    .map(|offset| {
                        static_assert!(MIN_LOW_SURROGATE != 0);

                        // SAFETY: The above static assertion guarantees that
                        // this value will not be zero.
                        self.surrogate = Some(unsafe {
                            NonZeroU16::new_unchecked(
                                (offset & 0x3FF) as u16 | MIN_LOW_SURROGATE,
                            )
                        });
                        (offset >> 10) as u16 | MIN_HIGH_SURROGATE
                    })
                    .unwrap_or(code_point as u16)
            })
        })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.inner_size_hint();
        let additional = self.surrogate.is_some().into();
        (
            (low.saturating_add(2) / 3).saturating_add(additional),
            high.and_then(|x| x.checked_add(additional)),
        )
    }
}

pub(in super::super) fn encode_wide(
    string: &[u8],
) -> EncodeWide<impl '_ + Iterator<Item = u8>> {
    EncodeWide::new(string.iter().copied())
}