// os_str_bytes 7.1.1
//
// Lossless functionality for platform-native strings
// Documentation
use std::iter::FusedIterator;
use std::iter::Peekable;
use std::mem;

use crate::util::is_continuation;
use crate::util::BYTE_SHIFT;
use crate::util::CONT_MASK;

use super::EncodingError;
use super::Result;

/// Iterator adapter that decodes a stream of bytes into code points.
///
/// Each item is a `Result<u32>`: either the decoded code point or the
/// [`EncodingError`] describing why decoding failed at that position.
/// Surrogate code points are accepted, but their presence is recorded so the
/// caller can tell afterwards whether the input was plain UTF-8.
pub(super) struct CodePoints<I: Iterator<Item = u8>> {
    /// Byte source. It is peekable so that a byte which turns out not to be
    /// a continuation byte can be left in place for the next decode attempt.
    iter: Peekable<I>,
    /// `true` when the code point decoded immediately before this one was a
    /// surrogate with the `0x10` bit of its first two bytes clear (the
    /// leading half of a pair).
    surrogate: bool,
    /// Starts as `true` and is cleared the first time a surrogate is seen.
    still_utf8: bool,
}

impl<I> CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    /// Creates a decoder over the bytes produced by `string`.
    ///
    /// The decoder starts with no pending surrogate and with the
    /// "still UTF-8" flag set.
    pub(super) fn new<S>(string: S) -> Self
    where
        S: IntoIterator<IntoIter = I>,
    {
        let iter = string.into_iter().peekable();
        Self {
            iter,
            surrogate: false,
            still_utf8: true,
        }
    }

    /// Returns `true` until the decoder has encountered a surrogate.
    ///
    /// The flag is only ever cleared, never set again.
    pub(super) const fn is_still_utf8(&self) -> bool {
        self.still_utf8
    }

    /// Folds the next byte of the input into `code_point`.
    ///
    /// When the upcoming byte is accepted by `is_continuation`, its
    /// `CONT_MASK` bits are appended below the existing bits of `code_point`
    /// (which are shifted up by `BYTE_SHIFT`) and the byte is consumed.
    ///
    /// # Errors
    ///
    /// - `EncodingError::End` if the input is exhausted.
    /// - `EncodingError::Byte` if the upcoming byte is not a continuation
    ///   byte. The byte is left in the input and any pending surrogate state
    ///   is cleared.
    fn consume_next(&mut self, code_point: &mut u32) -> Result<()> {
        let upcoming = match self.iter.peek() {
            Some(&upcoming) => upcoming,
            None => return Err(EncodingError::End()),
        };

        if is_continuation(upcoming) {
            let payload = u32::from(upcoming & CONT_MASK);
            *code_point = (*code_point << BYTE_SHIFT) | payload;

            // Only now is the byte actually taken out of the input.
            let removed = self.iter.next();
            debug_assert_eq!(Some(upcoming), removed);

            return Ok(());
        }

        self.surrogate = false;
        // Not consuming this byte will be useful if this crate ever offers
        // a way to encode lossily.
        Err(EncodingError::Byte(upcoming))
    }

    /// Returns the size hint of the underlying byte iterator.
    ///
    /// The bounds count remaining bytes, not remaining code points.
    pub(super) fn inner_size_hint(&self) -> (usize, Option<usize>) {
        Iterator::size_hint(&self.iter)
    }
}

/// `CodePoints::next` starts by pulling a byte from the inner iterator and
/// returns `None` as soon as that yields `None`, so the adapter is fused
/// whenever the byte source is.
impl<I> FusedIterator for CodePoints<I>
where
    I: FusedIterator<Item = u8>,
{
}

impl<I> Iterator for CodePoints<I>
where
    I: Iterator<Item = u8>,
{
    type Item = Result<u32>;

    /// Decodes the next code point from the byte stream.
    ///
    /// Returns `None` once the input is exhausted. Otherwise the item is:
    ///
    /// - `Ok(code_point)` for a successfully decoded sequence;
    /// - `Err(EncodingError::Byte(_))` for a lead byte in `0x80..=0xC1`, or
    ///   for a byte found where a continuation byte was required (that byte
    ///   stays in the input and starts the next decode attempt);
    /// - `Err(EncodingError::End(..))` when the input ends mid-sequence;
    /// - `Err(EncodingError::CodePoint(_))` when every byte was structurally
    ///   fine but the value is rejected: an overlong three-byte form, a
    ///   four-byte form outside the accepted range, or a trailing surrogate
    ///   directly after a leading one.
    ///
    /// Errors from reading continuation bytes take precedence over the
    /// `CodePoint` error, because the value check is only reported after the
    /// whole sequence has been read.
    fn next(&mut self) -> Option<Self::Item> {
        let lead = self.iter.next()?;

        // The pending-surrogate marker only ever applies to the code point
        // that immediately follows, so it is cleared for every new item.
        let after_leading_surrogate = mem::replace(&mut self.surrogate, false);

        let mut code_point = u32::from(lead);

        // Reads one continuation byte into `code_point`, bailing out of
        // `next` with the error if that is not possible.
        macro_rules! pull_continuation {
            () => {
                if let Err(failure) = self.consume_next(&mut code_point) {
                    return Some(Err(failure));
                }
            };
        }

        // Every arm that falls through still owes one final continuation
        // byte, which is read after the `match`.
        let malformed = match lead {
            // ASCII stands for itself.
            0x00..=0x7F => return Some(Ok(code_point)),

            // Stray continuation bytes and the overlong leads 0xC0/0xC1.
            0x80..=0xC1 => return Some(Err(EncodingError::Byte(lead))),

            // Two-byte sequence.
            0xC2..=0xDF => {
                code_point &= 0x1F;
                false
            }

            // Three-byte sequence.
            0xE0..=0xEF => {
                code_point &= 0x0F;
                pull_continuation!();

                let mut broken_pair = false;
                // This condition is optimized to detect surrogate code points.
                if code_point & 0xFE0 == 0x360 {
                    self.still_utf8 = false;
                    if code_point & 0x10 != 0 {
                        // Decoding a broken surrogate pair would be lossy.
                        broken_pair = after_leading_surrogate;
                    } else {
                        self.surrogate = true;
                    }
                }

                broken_pair || code_point < 0x20
            }

            // Four-byte sequence (leads above 0xF4 fail the range check).
            0xF0..=0xFF => {
                code_point &= 0x0F;
                pull_continuation!();
                let out_of_range = code_point.wrapping_sub(0x10) >= 0x100;
                pull_continuation!();

                out_of_range || code_point < 0x20
            }
        };
        pull_continuation!();

        if malformed {
            Some(Err(EncodingError::CodePoint(code_point)))
        } else {
            Some(Ok(code_point))
        }
    }
}