utf8proc 0.1.2

Rust bindings to the utf8proc library
Documentation
//! Operations to transform strings,
//! including the [`map`] function.
//!
//! Contains unicode normalization functionality.

#[allow(unused_imports, reason = "used by docs")]
use crate::ErrorKind;
use crate::transform::advanced::{MaybeMarkerCodepoint, TransformBufferError};
use crate::transform::buffer::{MaybeUninitSlice, SplitInitBuffer};
use bstr::BStr;
use buffer::MaybeUninitSliceExt;
use std::alloc::Layout;
use std::fmt::{Debug, Display, Formatter};
use std::mem::MaybeUninit;

pub mod advanced;
pub mod buffer;
mod options;
#[cfg(feature = "unstable-redundant")]
pub mod redundant;

pub use options::*;

/// A callback to a transform function,
/// allowing the user to change codepoints before
/// utf8proc starts processing them.
///
/// The callback is a mutably borrowed `FnMut(char) -> char` trait object:
/// it receives a codepoint and returns the codepoint to use in its place.
/// Because it is `FnMut`, it may keep and update state between invocations.
pub type TransformCallback<'a> = &'a mut dyn FnMut(char) -> char;

/// Indicates that there was insufficient space in the destination buffer,
/// and indicates the amount of space that would be necessary.
///
/// Both quantities are counted in elements of the destination buffer,
/// matching the wording of the error message.
#[derive(Clone, Debug, thiserror::Error)]
#[error("Insufficient space: Need room for {needed_space} elements, but only have space for {actual_space}")]
pub struct InsufficientSpaceError {
    /// The amount of space the destination buffer actually had.
    pub(crate) actual_space: usize,
    /// The amount of space that would have been required.
    pub(crate) needed_space: usize,
}
impl InsufficientSpaceError {
    /// Indicates the amount of space that would be needed.
    ///
    /// This is the value reported after "Need room for" in the error message.
    #[inline]
    pub fn needed_space(&self) -> usize {
        self.needed_space
    }
}

/// The codepoints produced by decomposing a single character
/// with the [`decompose_char`] function.
///
/// The storage is a fixed-size inline array, so creating this value
/// does not require allocation: the maximum size is known ahead-of-time.
#[derive(Clone)]
pub struct DecomposedChar {
    /// Inline storage; only the first `len` entries are exposed.
    codepoints: [char; advanced::MAX_DECOMPOSE_CHAR_LENGTH],
    /// Number of leading entries of `codepoints` that belong to the result.
    len: usize,
}
impl DecomposedChar {
    /// Iterate over the decomposed characters, yielding them by value.
    #[inline]
    pub fn iter(&self) -> std::iter::Copied<std::slice::Iter<'_, char>> {
        self.as_slice().iter().copied()
    }
    /// Borrow the decomposed characters as a slice.
    #[inline]
    pub fn as_slice(&self) -> &[char] {
        let Self { codepoints, len } = self;
        &codepoints[..*len]
    }
}
impl AsRef<[char]> for DecomposedChar {
    #[inline]
    fn as_ref(&self) -> &[char] {
        Self::as_slice(self)
    }
}
impl Debug for DecomposedChar {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Formats exactly like a `[char]` slice of the exposed characters.
        Debug::fmt(self.as_slice(), f)
    }
}
impl<'a> IntoIterator for &'a DecomposedChar {
    type Item = char;
    type IntoIter = std::iter::Copied<std::slice::Iter<'a, char>>;

    #[inline]
    fn into_iter(self) -> Self::IntoIter {
        DecomposedChar::iter(self)
    }
}
impl IntoIterator for DecomposedChar {
    type Item = char;
    type IntoIter = std::iter::Take<std::array::IntoIter<char, { advanced::MAX_DECOMPOSE_CHAR_LENGTH }>>;

    #[inline]
    fn into_iter(self) -> Self::IntoIter {
        let Self { codepoints, len } = self;
        debug_assert!(len <= advanced::MAX_DECOMPOSE_CHAR_LENGTH);
        // Walk the whole array by value, but stop after the exposed prefix.
        codepoints.into_iter().take(len)
    }
}

/// Decompose a codepoint into an array of codepoints,
/// applying the specified [`TransformOptions`].
///
/// The work is delegated to [`advanced::decompose_char`] using a stack buffer
/// of [`advanced::MAX_DECOMPOSE_CHAR_LENGTH`] entries, without a callback.
///
/// ## Errors
/// Any error from the advanced routine other than insufficient space
/// is returned unchanged.
///
/// ## Panics
/// - If the decomposition is empty.
/// - If [`MaybeMarkerCodepoint`] and [`char`] do not share size and alignment.
/// - If the advanced routine reports insufficient space,
///   which the fixed buffer size is meant to rule out.
///
/// NOTE(review): `options.validate_utf8()` is defined elsewhere;
/// presumably it panics when a forbidden option is set — confirm.
///
/// ## Limitations
/// In order to guarantee the result is valid UTF8, some options are forbidden.
/// Use the [advanced] interface to work around this.
#[cfg_attr(feature = "inline-more", inline)] // potential for constant folding, avoiding moves
pub fn decompose_char(codepoint: char, options: &TransformOptions) -> Result<DecomposedChar, crate::Error> {
    // Must come first: the unsafe copy below relies on the options having been checked.
    options.validate_utf8();
    let mut buffer = MaybeUninit::<[MaybeMarkerCodepoint; advanced::MAX_DECOMPOSE_CHAR_LENGTH]>::uninit();
    let buffer = MaybeUninitSliceExt::from_uninit_array_mut(&mut buffer);
    match advanced::decompose_char(codepoint, buffer, options, None) {
        Ok((init, _uninit)) => {
            assert!(!init.is_empty(), "result has zero length");
            // Debug builds additionally verify every produced value converts to `char`.
            #[cfg(debug_assertions)]
            {
                for value in &*init {
                    debug_assert!(value.to_char().is_ok(), "produced invalid codepoint");
                }
            }
            // Start from an all-NUL array; entries past `init.len()` stay NUL.
            let mut result = [0 as char; advanced::MAX_DECOMPOSE_CHAR_LENGTH];
            // The pointer cast below is only meaningful if both types share a layout.
            assert_eq!(Layout::new::<MaybeMarkerCodepoint>(), Layout::new::<char>(),);
            // SAFETY: Buffer contains only valid codepoints
            // `init` cannot be longer than `result`, since it is a prefix of a
            // buffer with the same number of entries, and the two do not overlap.
            unsafe {
                init.as_ptr()
                    .cast::<char>()
                    .copy_to_nonoverlapping(result.as_mut_ptr(), init.len());
            }
            Ok(DecomposedChar {
                // Only the first `init.len()` entries were copied from the result.
                codepoints: result,
                len: init.len(),
            })
        }
        Err(TransformBufferError::InsufficientSpace(cause)) => {
            // The buffer already has the maximum decomposition length.
            unreachable!("{cause}")
        }
        Err(TransformBufferError::Other(cause)) => Err(cause),
    }
}

/// Decompose a UTF8 string into an array of codepoints,
/// applying the specified [`TransformOptions`].
///
/// The specified string is not statically required to be UTF8.
/// If invalid UTF8 is encountered,
/// a [`ErrorKind::InvalidUtf8`] will be returned.
/// The result is guaranteed to only contain valid codepoints.
///
/// May apply a user-specified transformation to each codepoint,
/// before utf8proc does its own transformations.
///
/// ## Returns
/// On success, the destination buffer split in two:
/// the initialized part holding the decomposed codepoints,
/// and the remaining part that is still uninitialized.
///
/// ## Errors
/// Any [`TransformBufferError`] from [`advanced::decompose_buffer`] is propagated.
///
/// ## Panics
/// If [`char`] and [`MaybeMarkerCodepoint`] do not share size and alignment.
///
/// NOTE(review): `options.validate_utf8()` is defined elsewhere;
/// presumably it panics when a forbidden option is set — confirm.
///
/// ## Limitations
/// In order to guarantee the result is valid UTF8, some options are forbidden.
/// Use the [advanced] interface to work around this.
#[cfg_attr(feature = "inline-more", inline)] // potential for constant folding
pub fn decompose_buffer<'a>(
    text: &BStr,
    buffer: &'a mut MaybeUninitSlice<char>,
    options: &TransformOptions,
    func: Option<TransformCallback>,
) -> Result<SplitInitBuffer<'a, char>, TransformBufferError> {
    // Must come first: the final cast back to `char` relies on the options having been checked.
    options.validate_utf8();
    // Both reinterpretations below are only meaningful if the two types share a layout.
    assert_eq!(Layout::new::<char>(), Layout::new::<MaybeMarkerCodepoint>(),);
    // SAFETY: MaybeMarkerCodepoint is less strict than char
    let buffer: &mut MaybeUninitSlice<MaybeMarkerCodepoint> = unsafe {
        std::slice::from_raw_parts_mut(
            buffer.as_mut_ptr().cast::<MaybeUninit<MaybeMarkerCodepoint>>(),
            buffer.len(),
        )
    };
    let (init, uninit) = advanced::decompose_buffer(text, buffer, options, func)?;
    // Debug builds additionally verify every produced value converts to `char`.
    #[cfg(debug_assertions)]
    {
        for &val in &*init {
            debug_assert!(val.to_char().is_ok(), "produced invalid codepoint");
        }
    }
    // SAFETY: Guaranteed to be valid codepoints
    // Both halves are cast back to the element type the caller handed in.
    unsafe {
        Ok((
            std::slice::from_raw_parts_mut(init.as_mut_ptr().cast::<char>(), init.len()),
            std::slice::from_raw_parts_mut(uninit.as_mut_ptr().cast::<MaybeUninit<char>>(), uninit.len()),
        ))
    }
}

/// Apply a transformation to a string, indicated by the [`TransformOptions`],
/// returning the result as a new [`String`].
///
/// This is a very thin wrapper around [`map_into`],
/// which writes the output into a freshly created string.
/// It does not support a [`TransformCallback`] to reduce verbosity.
///
/// ## Errors
/// Any error reported by [`map_into`] is returned unchanged.
///
/// ## Limitations
/// In order to guarantee the result is valid UTF8, some options are forbidden.
/// Use the [advanced] interface to work around this.
#[inline]
pub fn map(text: impl AsRef<BStr>, options: &TransformOptions) -> Result<String, crate::Error> {
    let mut output = String::new();
    // No callback is passed; on success, hand the filled string to the caller.
    map_into(text.as_ref(), &mut output, options, None).map(|()| output)
}

/// Apply a transformation to a string, indicated by the [`TransformOptions`],
/// writing the result into the specified destination string.
///
/// The input string is not required to be valid UTF8.
/// If invalid UTF8 is encountered, an [`ErrorKind::InvalidUtf8`] error will be returned.
///
/// May apply a user-specified transformation to each codepoint,
/// before utf8proc does its own transformations.
///
/// The actual work is done by [`advanced::map_into`],
/// operating on the bytes backing `dest`.
///
/// ## Limitations
/// In order to guarantee the result is valid UTF8, some options are forbidden.
/// Use the [advanced] interface to work around this.
#[inline]
pub fn map_into(
    text: impl AsRef<BStr>,
    dest: &mut String,
    options: &TransformOptions,
    func: Option<TransformCallback>,
) -> Result<(), crate::Error> {
    // The options are checked before the raw bytes of `dest` are exposed.
    options.validate_utf8();
    let input: &BStr = text.as_ref();
    // SAFETY: Guaranteed to only write valid UTF8, due to checking options
    let raw_bytes = unsafe { dest.as_mut_vec() };
    advanced::map_into(input, raw_bytes, options, func)
}

/// Normalize a buffer of [`MaybeMarkerCodepoint`] in-place,
/// respecting the [`TransformOptions`]
///
/// Returns the number of codepoints that are valid.
///
/// This function operates in-place because the underlying CAPI function
/// [`utf8proc_normalize_utf32`] does so.
///
/// ## Limitations
/// This function only accepts valid codepoints,
/// because that is what the API requires.
/// While it appears to accept [`advanced::MaybeMarkerCodepoint`] in practice,
/// this is not guaranteed as part of the documentation,
/// which demands valid codepoints.
/// Feel free to open an issue upstream, if this is an issue for you.
/// Once their docs change, I will update this library accordingly.
///
/// [`utf8proc_normalize_utf32`]: utf8proc_sys::utf8proc_normalize_utf32
#[cfg_attr(feature = "inline-more", inline)] // thin FFi wrapper, potential for constant folding in to_ffi
pub fn normalize_utf32(items: &mut [char], options: &TransformOptions) -> Result<usize, crate::Error> {
    // SAFETY: Don't care about UTF8 or codepoint validity
    let options = unsafe { options.to_ffi() };
    assert_eq!(Layout::new::<MaybeMarkerCodepoint>(), Layout::new::<i32>());
    // SAFETY: Input is either valid UTF32, and so is output
    let res_code = unsafe {
        utf8proc_sys::utf8proc_normalize_utf32(items.as_mut_ptr().cast::<i32>(), items.len().cast_signed(), options)
    };
    if res_code < 0 {
        Err(crate::Error::from_code(res_code))
    } else {
        let len = res_code.cast_unsigned();
        assert!(len <= items.len());
        Ok(len)
    }
}

/// The type of [unicode normalization form].
///
/// Used to determine [unicode equivalence] (wikipedia).
///
/// The [`Display`] output is the bare name of the form, e.g. `NFC`.
///
/// [unicode normalization form]: https://unicode.org/reports/tr15/#Norm_Forms
/// [unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum UnicodeNormalizationForm {
    /// Normalization Form D: Canonical Decomposition.
    ///
    /// Characters are decomposed by canonical equivalence,
    /// and multiple combining characters are arranged in a specific order.
    NFD,
    /// Normalization Form C: Canonical Composition.
    ///
    /// Characters are decomposed and then recomposed by canonical equivalence.
    NFC,
    /// Normalization Form KD: Compatibility Decomposition.
    ///
    /// Characters are decomposed by compatibility equivalence,
    /// and multiple combining characters are arranged in a specific order.
    NFKD,
    /// Normalization Form KC: Compatibility Composition.
    ///
    /// Characters are decomposed by compatibility equivalence,
    /// then recomposed by canonical equivalence.
    NFKC,
}
impl Display for UnicodeNormalizationForm {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Exhaustive on purpose: a new variant must pick its name here.
        let name = match self {
            Self::NFD => "NFD",
            Self::NFC => "NFC",
            Self::NFKD => "NFKD",
            Self::NFKC => "NFKC",
        };
        f.write_str(name)
    }
}

/// Apply normalization to the specified string.
#[inline] // likely to constant-fold
pub fn normalize(text: &str, form: UnicodeNormalizationForm) -> Result<String, crate::Error> {
    let mut options = TransformOptions::default();
    match form {
        UnicodeNormalizationForm::NFD => {
            options.composition = Some(CompositionOptions::decompose());
        }
        UnicodeNormalizationForm::NFC => {
            options.composition = Some(CompositionOptions::compose());
        }
        UnicodeNormalizationForm::NFKD => {
            options.composition = Some(CompositionOptions::decompose());
            options.compat = true;
        }
        UnicodeNormalizationForm::NFKC => {
            options.composition = Some(CompositionOptions::compose());
            options.compat = true;
        }
    }
    map(text, &options)
}