utf8proc 0.1.2

Rust bindings to the utf8proc library
Documentation
//! Unicode [grapheme] handling.
//!
//! [grapheme]: https://en.wikipedia.org/wiki/Grapheme

/// Iterate over the byte indexes of `text` at which a grapheme break is permitted.
///
/// Index zero and the final index are never yielded.
///
/// Breaks follow the extended grapheme cluster rules of UAX#29.
#[inline]
pub fn grapheme_breaks(text: &str) -> GraphemeBreaks<'_> {
    // `char_indices` supplies the `(byte index, codepoint)` pairs the iterator consumes.
    let indexed_chars = text.char_indices();
    GraphemeBreakIter::from_char_indexes(indexed_chars)
}

/// The iterator type returned by [`grapheme_breaks`].
///
/// This is a [`GraphemeBreakIter`] driven by the [`char_indices`](str::char_indices) of a `&str`.
pub type GraphemeBreaks<'a> = GraphemeBreakIter<std::str::CharIndices<'a>>;

/// Yields indexes where grapheme breaks are allowed,
/// as defined by the extended grapheme clusters in UAX#29.
///
/// Does not give the final index or the zero index.
///
/// Wraps an underlying iterator, which is responsible for giving tuples of `(index, codepoint)`.
/// This allows handling multiple string formats (UTF-8, UTF-16, UTF-32, etc.).
#[derive(Clone, Debug)]
pub struct GraphemeBreakIter<I> {
    /// The fused source of `(index, codepoint)` pairs.
    src: std::iter::Fuse<I>,
    /// The pair most recently pulled from `src`; its codepoint is the left-hand side
    /// of the next break test. `None` before the first pull and once `src` has run dry.
    last_value: Option<(usize, char)>,
    /// Break state carried from one break test to the next.
    state: GraphemeBreakState,
}
impl<I> GraphemeBreakIter<I>
where
    I: Iterator<Item = (usize, char)>,
{
    /// Wrap an iterator over `(index, codepoint)` pairs, producing an iterator
    /// over the indexes at which a grapheme break is permitted.
    ///
    /// The source is fused, no pair has been read from it yet,
    /// and the break state starts out freshly initialized.
    #[inline]
    pub fn from_char_indexes(iter: I) -> Self {
        let src = iter.fuse();
        let state = GraphemeBreakState::new();
        Self {
            src,
            last_value: None,
            state,
        }
    }
}
impl<I> Iterator for GraphemeBreakIter<I>
where
    I: Iterator<Item = (usize, char)>,
{
    type Item = usize;

    /// Advance to the next index at which a grapheme break is permitted.
    ///
    /// Each adjacent pair of codepoints is tested in order; the index of the
    /// right-hand codepoint is returned for the first pair that permits a break.
    /// Returns `None` once the source has no further pairs.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Obtain the left-hand codepoint: either the one remembered from the
        // previous call, or (when nothing is remembered) a fresh one from the source.
        let mut left = match self.last_value {
            Some((_, codepoint)) => codepoint,
            None => {
                let first = self.src.next();
                self.last_value = first;
                first?.1
            }
        };
        loop {
            // Remember whatever the source produced *before* bailing out, so an
            // exhausted source leaves `last_value` cleared.
            let upcoming = self.src.next();
            self.last_value = upcoming;
            let (boundary, right) = upcoming?;
            if grapheme_break_stateful(left, right, &mut self.state) {
                return Some(boundary);
            }
            // No break here: the right-hand codepoint becomes the next left-hand one.
            left = right;
        }
    }

    /// Lower bound is zero (no pair is guaranteed to permit a break);
    /// the upper bound is taken from the source's upper bound.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (_, upper) = self.src.size_hint();
        (0, upper)
    }
}
/// Once exhausted, this iterator keeps returning `None`.
///
/// This holds for any source iterator, because the source is stored wrapped in
/// [`std::iter::Fuse`] (i.e. it is implicitly fused).
impl<I: Iterator<Item = (usize, char)>> std::iter::FusedIterator for GraphemeBreakIter<I> {}

/// State threaded through successive calls to [`grapheme_break_stateful`].
///
/// Required as of Unicode 9.0.
///
/// The wrapped option is `Some` when a state value is present
/// and `None` when the state is marked as missing.
#[derive(Clone, Debug)]
pub struct GraphemeBreakState(pub(crate) Option<i32>);

impl GraphemeBreakState {
    /// Create the standard initial state: a present state value of zero.
    #[inline]
    pub const fn new() -> Self {
        Self(Some(0))
    }

    /// Create a marker indicating that no state is supplied.
    #[inline]
    pub const fn missing() -> Self {
        Self(None)
    }
}

impl Default for GraphemeBreakState {
    /// Equivalent to [`GraphemeBreakState::new`].
    fn default() -> Self {
        GraphemeBreakState::new()
    }
}

/// Given a pair of consecutive codepoints, return whether a grapheme break is
/// permitted between them (as defined by the extended grapheme clusters in UAX#29).
///
/// Prefer the high-level [`grapheme_breaks`] and [`GraphemeBreakIter::from_char_indexes`] iterators,
/// which implicitly handle the necessary state.
///
/// ## State
/// Beginning with Version 29 (Unicode 9.0.0), this algorithm requires state to break graphemes.
/// This state is passed as a [`&mut GraphemeBreakState`](GraphemeBreakState),
/// and is initialized with [`GraphemeBreakState::new`].
/// If the state is not passed in (i.e. [`GraphemeBreakState::missing`] is passed),
/// UAX#29 rules GB10/12/13, which require this state, will not be applied,
/// essentially matching the rules in Unicode 8.0.0.
///
/// **WARNING**: If the state parameter is used, [`grapheme_break_stateful`] must
/// be called *in order* on *all* potential breaks in a string.
/// However, it is safe to reset the state to zero after a grapheme break.
#[inline]
pub fn grapheme_break_stateful(codepoint1: char, codepoint2: char, state: &mut GraphemeBreakState) -> bool {
    // Build the raw pointer in safe code so the `unsafe` block covers only the FFI call.
    // A present state is passed by pointer so the callee can update it in place;
    // a missing state is passed as a null pointer.
    let state_ptr: *mut i32 = match state.0 {
        Some(ref mut value) => std::ptr::from_mut::<i32>(value),
        None => std::ptr::null_mut(),
    };
    // SAFETY: `state_ptr` is either null (the "state not passed in" case described above)
    // or derived from the exclusive borrow `state`, which outlives this call, so it is
    // valid for reads and writes for the duration of the call.
    unsafe {
        utf8proc_sys::utf8proc_grapheme_break_stateful(
            // A `char` is at most 0x10FFFF, so these casts to `i32` are lossless.
            codepoint1 as i32,
            codepoint2 as i32,
            state_ptr,
        )
    }
}