// devela 0.27.0 - Docs.rs

// devela::text::char::iter::bytes

use crate::{Char, CharIter, char7, char8, char16, charu, is, unwrap};

/// Methods available when constructed from a byte slice.
impl<'a> CharIter<'a, &[u8]> {
    /* constructors */

    /// Creates an iterator over the Unicode scalars encoded in `bytes`,
    /// positioned at the very first byte of the slice.
    pub const fn new(bytes: &'a [u8]) -> Self {
        // Iteration always begins at offset zero for this constructor.
        let start = 0;
        Self::_new(bytes, start)
    }

    /// Creates an iterator over the Unicode scalars encoded in `bytes`,
    /// positioned at the byte offset `index`.
    ///
    /// Yields `None` when `index` is rejected by `Char::is_utf8_boundary`,
    /// i.e. when it is not a valid character boundary.
    #[must_use] #[inline(always)] #[rustfmt::skip]
    pub const fn new_at(bytes: &'a [u8], index: usize) -> Option<Self> {
        // Reject offsets that would start in the middle of a sequence.
        if !Char(bytes).is_utf8_boundary(index) {
            return None;
        }
        Some(Self::_new(bytes, index))
    }

    /* misc. */

    /// Consumes the iterator and returns how many Unicode scalars it yields.
    ///
    /// Counting starts at the iterator's current position and stops at the
    /// end of the bytes, or at the first sequence that `Char::to_scalar`
    /// fails to decode.
    pub const fn count(mut self) -> usize {
        let mut total = 0;
        loop {
            // Exhausted: every remaining byte has been accounted for.
            if self.pos >= self.bytes.len() {
                return total;
            }
            // An undecodable sequence ends the count early.
            let Some((_, width)) = Char(self.bytes).to_scalar(self.pos) else {
                return total;
            };
            self.pos += width;
            total += 1;
        }
    }

    /* next_char* methods */

    /// Decodes and returns the next Unicode scalar as a `char`.
    ///
    /// Returns `None` when the iterator is exhausted or when the bytes at the
    /// current position can't be decoded; in the latter case the position is
    /// left untouched.
    ///
    /// This is implemented via `Char::`[`to_char`][Char::to_char].
    ///
    /// # Features
    /// Uses the `unsafe_niche` feature to skip duplicated validation checks.
    #[must_use]
    pub const fn next_char(&mut self) -> Option<char> {
        if self.pos >= self.bytes.len() {
            return None;
        }
        match Char(self.bytes).to_char(self.pos) {
            Some((scalar, width)) => {
                // Only advance past a successfully decoded sequence.
                self.pos += width;
                Some(scalar)
            }
            None => None,
        }
    }

    /// Returns the next Unicode scalar without performing full UTF-8
    /// validation; only the final decoded code point is checked.
    ///
    /// If the leading byte is invalid it returns the replacement character (`�`).
    ///
    /// Returns `None` when the iterator is exhausted, or when the decoded
    /// value is rejected by `char::from_u32`; in the latter case the position
    /// is not advanced.
    ///
    /// The bytes are decoded with
    /// `Char::`[`to_scalar_unchecked`][Char::to_scalar_unchecked].
    #[must_use]
    pub const fn next_char_lenient(&mut self) -> Option<char> {
        if self.pos >= self.bytes.len() {
            return None;
        }
        let (code, width) = Char(self.bytes).to_scalar_unchecked(self.pos);
        match char::from_u32(code) {
            Some(scalar) => {
                self.pos += width;
                Some(scalar)
            }
            // Not a valid scalar value: leave the position where it was.
            None => None,
        }
    }

    /// Returns the next Unicode scalar as a `char`, skipping UTF-8 validation.
    ///
    /// Returns `None` once the iterator is exhausted.
    ///
    /// # Safety
    /// The caller must ensure that:
    /// - the bytes starting at the iterator's current position contain a
    ///   valid UTF-8 sequence.
    /// - The decoded value is a valid Unicode scalar.
    ///
    /// Violating these conditions may lead to undefined behavior.
    #[must_use]
    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
    #[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
    pub const unsafe fn next_char_unchecked(&mut self) -> Option<char> {
        if self.pos >= self.bytes.len() {
            return None;
        }
        // SAFETY: the position is in bounds (checked above); the validity of
        // the sequence is the caller's obligation, per this method's contract.
        let decoded = unsafe { Char(self.bytes).to_char_unchecked(self.pos) };
        self.pos += decoded.1;
        Some(decoded.0)
    }

    /// Returns the next 7-bit Unicode scalar.
    ///
    /// Returns `None` once there are no more characters left,
    /// or if the next character is not ASCII. In the non-ASCII case the
    /// position is not advanced.
    ///
    /// # Features
    /// Uses the `unsafe_niche` feature to skip validation checks.
    #[must_use]
    pub const fn next_char7(&mut self) -> Option<char7> {
        is![self.pos >= self.bytes.len(), return None];
        // Inspect the byte under the cursor (not the first byte of the slice),
        // so successive calls walk through the input.
        let byte = self.bytes[self.pos];
        is![
            byte.is_ascii(),
            {
                // An ASCII scalar always occupies exactly one byte.
                self.pos += 1;
                Some(char7::new_unchecked(byte))
            },
            None
        ]
    }

    /// Decodes the next Unicode scalar and returns it as an 8-bit `char8`.
    ///
    /// Returns `None` when the iterator is exhausted, when the bytes can't be
    /// decoded by `Char::to_scalar`, or when the decoded scalar doesn't fit
    /// in 1 byte. The position only advances on success.
    #[must_use]
    pub const fn next_char8(&mut self) -> Option<char8> {
        if self.pos >= self.bytes.len() {
            return None;
        }
        let Some((code, width)) = Char(self.bytes).to_scalar(self.pos) else {
            return None;
        };
        // Anything needing more than one byte of storage can't be a `char8`.
        if Char(code).len_bytes() != 1 {
            return None;
        }
        self.pos += width;
        Some(char8(code as u8))
    }

    /// Returns the next 8-bit Unicode scalar, without performing UTF-8 validation.
    ///
    /// Returns `None` once there are no more characters left,
    /// or if the next character can't fit in 1 byte. The position only
    /// advances on success.
    ///
    /// # Safety
    /// The caller must ensure that:
    /// - the bytes starting at the iterator's current position contain a
    ///   valid UTF-8 sequence.
    /// - The decoded value is a valid Unicode scalar.
    ///
    /// Violating these conditions may lead to undefined behavior.
    #[must_use]
    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
    #[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
    pub const unsafe fn next_char8_unchecked(&mut self) -> Option<char8> {
        // Honor the documented "`None` once exhausted" contract (and match the
        // sibling `next_*` methods) instead of decoding past the end.
        is![self.pos >= self.bytes.len(), return None];
        let (cp, len) = Char(self.bytes).to_scalar_unchecked(self.pos);
        is![
            Char(cp).len_bytes() == 1,
            {
                self.pos += len;
                Some(char8(cp as u8))
            },
            None
        ]
    }

    /// Decodes the next Unicode scalar and returns it as a 16-bit `char16`.
    ///
    /// Returns `None` when the iterator is exhausted, when the bytes can't be
    /// decoded by `Char::to_scalar`, or when the decoded scalar doesn't fit
    /// in 2 bytes. The position only advances on success.
    ///
    /// # Features
    /// Uses the `unsafe_niche` feature to skip validation checks.
    #[must_use]
    pub const fn next_char16(&mut self) -> Option<char16> {
        if self.pos >= self.bytes.len() {
            return None;
        }
        let Some((code, width)) = Char(self.bytes).to_scalar(self.pos) else {
            return None;
        };
        // Anything needing more than two bytes of storage can't be a `char16`.
        if Char(code).len_bytes() > 2 {
            return None;
        }
        self.pos += width;
        Some(char16::new_unchecked(code as u16))
    }

    /// Returns the next 16-bit Unicode scalar, without performing UTF-8 validation.
    ///
    /// Returns `None` once there are no more characters left,
    /// or if the next character can't fit in 2 bytes. The position only
    /// advances on success.
    ///
    /// # Safety
    /// The caller must ensure that:
    /// - the bytes starting at the iterator's current position contain a
    ///   valid UTF-8 sequence.
    /// - The decoded value is a valid Unicode scalar.
    ///
    /// Violating these conditions may lead to undefined behavior.
    ///
    /// # Features
    /// Uses the `unsafe_niche` feature to skip validation checks.
    #[must_use]
    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
    #[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
    pub const unsafe fn next_char16_unchecked(&mut self) -> Option<char16> {
        // Honor the documented "`None` once exhausted" contract (and match the
        // sibling `next_*` methods) instead of decoding past the end.
        is![self.pos >= self.bytes.len(), return None];
        let (cp, len) = Char(self.bytes).to_scalar_unchecked(self.pos);
        if Char(cp).len_bytes() <= 2 {
            self.pos += len;
            Some(char16::new_unchecked(cp as u16))
        } else {
            None
        }
    }
    /// Returns the next Unicode scalar using a UTF-8 representation.
    ///
    /// Returns `None` once there are no more characters left, or if the bytes
    /// at the current position can't be decoded by
    /// `charu::from_utf8_bytes_with_len`.
    ///
    /// # Features
    /// Uses the `unsafe_hint` feature to optimize out unreachable branches.
    #[must_use] #[rustfmt::skip]
    pub const fn next_charu(&mut self) -> Option<charu> {
        is![self.pos >= self.bytes.len(), return None];
        // Decode from the cursor onwards; passing the whole slice would yield
        // the first character on every call. `split_at` cannot panic here
        // because `self.pos < self.bytes.len()` was just checked.
        let rest = self.bytes.split_at(self.pos).1;
        let (ch, len) = unwrap![some? charu::from_utf8_bytes_with_len(rest)];
        self.pos += len as usize;
        Some(ch)
    }

    /// Returns the next Unicode scalar, without performing UTF-8 validation.
    ///
    /// Returns `None` once there are no more characters left.
    ///
    /// # Safety
    /// The caller must ensure that:
    /// - the bytes starting at the iterator's current position contain a
    ///   valid UTF-8 sequence.
    /// - The decoded value is a valid Unicode scalar.
    ///
    /// Violating these conditions may lead to undefined behavior.
    ///
    /// # Features
    /// Uses the `unsafe_hint` feature to optimize out unreachable branches.
    #[must_use]
    #[cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))]
    #[cfg_attr(nightly_doc, doc(cfg(all(not(feature = "safe_text"), feature = "unsafe_str"))))]
    pub const unsafe fn next_charu_unchecked(&mut self) -> Option<charu> {
        is![self.pos >= self.bytes.len(), return None];
        // Decode from the cursor onwards; passing the whole slice would yield
        // the first character on every call. `split_at` cannot panic here
        // because `self.pos < self.bytes.len()` was just checked.
        let rest = self.bytes.split_at(self.pos).1;
        // SAFETY: the validity of the sequence starting at the current
        // position is the caller's obligation, per this method's contract.
        let (ch, len) = unsafe { charu::from_utf8_bytes_with_len_unchecked(rest) };
        self.pos += len as usize;
        Some(ch)
    }

    /* next_scalar* methods */

    /// Decodes and returns the next Unicode scalar value as a `u32`.
    ///
    /// Returns `None` when the iterator is exhausted or when the bytes at the
    /// current position can't be decoded; in the latter case the position is
    /// left untouched.
    ///
    /// This is implemented via `Char::`[`to_scalar`][Char::to_scalar].
    ///
    /// # Features
    /// Uses the `unsafe_niche` feature to skip duplicated validation checks.
    #[must_use]
    pub const fn next_scalar(&mut self) -> Option<u32> {
        if self.pos >= self.bytes.len() {
            return None;
        }
        match Char(self.bytes).to_scalar(self.pos) {
            Some((code, width)) => {
                // Only advance past a successfully decoded sequence.
                self.pos += width;
                Some(code)
            }
            None => None,
        }
    }

    /// Returns the next Unicode scalar value, skipping UTF-8 validation.
    ///
    /// Returns `None` once the iterator is exhausted.
    ///
    /// This is implemented via `Char::`[`to_scalar_unchecked`][Char::to_scalar_unchecked].
    ///
    /// It assumes the bytes at the current position contain a valid UTF-8
    /// sequence, and it doesn't validate the resulting Unicode scalar.
    ///
    /// If the leading byte is invalid it returns the replacement character (`�`).
    ///
    /// # Panics
    /// It will panic if the index is out of bounds.
    #[must_use]
    pub const fn next_scalar_unchecked(&mut self) -> Option<u32> {
        if self.pos >= self.bytes.len() {
            return None;
        }
        let (code, width) = Char(self.bytes).to_scalar_unchecked(self.pos);
        // The position always advances, whatever value was decoded.
        self.pos += width;
        Some(code)
    }
}