// utf8proc 0.1.2 - Docs.rs

//! The "advanced" interface to transformations,
//! producing potentially non-UTF8 data.

use super::{InsufficientSpaceError, TransformCallback, TransformOptions};
#[allow(unused_imports, reason = "used by docs")]
use crate::ErrorKind;
use crate::transform::buffer::{MaybeUninitSlice, MaybeUninitSliceExt, SplitInitBuffer};
use bstr::BStr;
use num_enum::{IntoPrimitive, TryFromPrimitive};
use std::ffi::{c_int, c_void};
use std::fmt::{Debug, Formatter};
use std::mem::MaybeUninit;
use utf8proc_sys::utf8proc_custom_func;

/// A special marker value used in the advanced interface.
///
/// Markers are represented by their `i32` discriminant.
/// The only marker currently defined uses `-1`,
/// which is not a valid Unicode scalar value and so cannot be confused with a [`char`].
///
/// The enum is `#[non_exhaustive]`: additional markers may be added later.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, TryFromPrimitive, IntoPrimitive)]
#[non_exhaustive]
#[repr(i32)]
pub enum SpecialMarker {
    /// Used to mark grapheme boundaries when using the
    /// [`TransformOptions::grapheme_boundary_markers`] option.
    GraphemeBoundary = -1,
}
impl SpecialMarker {
    /// The byte sequence that stands in for this marker inside UTF8-like output.
    ///
    /// The sequence is deliberately *not* valid UTF8,
    /// so it can never be mistaken for encoded text.
    #[inline]
    pub fn utf8_marker(&self) -> &'static BStr {
        const GRAPHEME_BOUNDARY_BYTES: &[u8] = &[0xFF];
        match *self {
            Self::GraphemeBoundary => BStr::new(GRAPHEME_BOUNDARY_BYTES),
        }
    }

    /// Check whether `x` begins with a special marker,
    /// returning which marker it is.
    ///
    /// Only the first byte of `x` is inspected; the receiver is not consulted.
    ///
    /// If `x` holds data that was not produced by [`MaybeMarkerCodepoint::encode_utf8`],
    /// the answer may be a false positive:
    /// arbitrary bytes can coincidentally look like a marker.
    #[inline]
    pub fn detect_special_marker_starting(&self, x: &BStr) -> Option<SpecialMarker> {
        let leading_byte = *x.first()?;
        (leading_byte == 0xFF).then_some(SpecialMarker::GraphemeBoundary)
    }

    /// This special marker, wrapped as a [`MaybeMarkerCodepoint`].
    #[inline]
    pub fn codepoint_marker(&self) -> MaybeMarkerCodepoint {
        let discriminant: i32 = (*self).into();
        // SAFETY: Every marker discriminant is a valid `MaybeMarkerCodepoint`
        unsafe { MaybeMarkerCodepoint::from_u32_unchecked(discriminant.cast_unsigned()) }
    }
}

/// A value which is either [`char`] or a [special marker](SpecialMarker).
///
/// Stored as a single `i32` (the type is `#[repr(transparent)]`).
/// The wrapped value is either a Unicode scalar value
/// or the discriminant of a [`SpecialMarker`];
/// the unsafe constructors and accessors rely on this invariant.
#[derive(Copy, Clone, Eq, PartialEq)]
#[repr(transparent)]
pub struct MaybeMarkerCodepoint(i32);
impl From<char> for MaybeMarkerCodepoint {
    /// Wrap an ordinary character; this conversion cannot fail.
    #[inline]
    fn from(value: char) -> Self {
        // SAFETY: Every `char` is a valid codepoint
        unsafe { Self::from_u32_unchecked(u32::from(value)) }
    }
}
impl From<SpecialMarker> for MaybeMarkerCodepoint {
    /// Wrap a special marker; this conversion cannot fail.
    #[inline]
    fn from(value: SpecialMarker) -> Self {
        let discriminant = i32::from(value);
        // SAFETY: Every marker discriminant is a valid `MaybeMarkerCodepoint`
        unsafe { Self::from_u32_unchecked(discriminant.cast_unsigned()) }
    }
}
impl MaybeMarkerCodepoint {
    /// Create this value from a `u32`,
    /// returning `None` if invalid.
    #[inline]
    pub fn from_u32(x: u32) -> Option<Self> {
        if char::from_u32(x).is_some() || SpecialMarker::try_from_primitive(x.cast_signed()).is_ok() {
            // NOTE: Don't use from_u32_unchecked as that will cause infinite recursion
            Some(MaybeMarkerCodepoint(x.cast_signed()))
        } else {
            None // invalid
        }
    }

    /// Convert from a standard character.
    ///
    /// Cannot fail.
    #[inline]
    pub fn from_char(c: char) -> Self {
        // SAFETY: A valid character
        unsafe { Self::from_u32_unchecked(c as u32) }
    }

    /// Create this value from a raw `u32`,
    /// without checking for validity.
    ///
    /// ## Safety
    /// Undefined behavior if neither a valid character nor a marker.
    #[inline]
    pub unsafe fn from_u32_unchecked(u: u32) -> Self {
        debug_assert!(Self::from_u32(u).is_some(), "invalid codepoint");
        MaybeMarkerCodepoint(u.cast_signed())
    }

    /// Convert this value into a [`char`],
    /// returning the corresponding [marker value](SpecialMarker) otherwise.
    #[inline]
    pub fn to_char(&self) -> Result<char, SpecialMarker> {
        if let Some(x) = char::from_u32(self.0.cast_unsigned()) {
            Ok(x)
        } else {
            let maybe_marker = SpecialMarker::try_from_primitive(self.0);
            // SAFETY: Either marker or char by type invariant
            Err(unsafe { maybe_marker.unwrap_unchecked() })
        }
    }

    /// Convert this value into a [`SpecialMarker`] value,
    /// returning a [`char`] otherwise.
    #[inline]
    pub fn to_marker(&self) -> Result<SpecialMarker, char> {
        match self.to_char() {
            Err(marker) => Ok(marker),
            Ok(char) => Err(char),
        }
    }

    /// The maximum length of this value, when encoded as UTF8.
    ///
    /// Currently 4 bytes, the same as [`char::MAX_LEN_UTF8`].
    pub const MAX_LEN_UTF8: usize = 4;

    /// UTF8 encode this character into a buffer,
    /// returning the number of characters which were written.
    ///
    /// Differs from [`char::encode_utf8`],
    /// because marker characters are encoded specially
    /// as the corresponding [`SpecialMarker::utf8_marker`].
    ///
    /// ## Panics
    /// Panics if there is insufficient space to encode this value.
    /// Using [`Self::MAX_LEN_UTF8`] bytes of space is always sufficient.
    #[inline]
    pub fn encode_utf8(&self, output: &mut [u8]) -> usize {
        match self.to_char() {
            Ok(c) => c.encode_utf8(output).len(),
            Err(marker) => {
                let marker_str = marker.utf8_marker();
                debug_assert_eq!(marker_str.len(), 1); // currently always one byte
                assert!(marker_str.len() <= output.len(), "insufficient length");
                output[..marker_str.len()].copy_from_slice(marker_str);
                marker_str.len()
            }
        }
    }
}
impl Debug for MaybeMarkerCodepoint {
    /// Formats as the `Debug` output of the wrapped `char` or marker.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self.to_marker() {
            Ok(marker) => f.write_fmt(format_args!("{marker:?}")),
            Err(character) => f.write_fmt(format_args!("{character:?}")),
        }
    }
}
impl PartialEq<char> for MaybeMarkerCodepoint {
    #[inline]
    fn eq(&self, other: &char) -> bool {
        self.to_char() == Ok(*other)
    }
}

/// State used for determining grapheme boundaries when using the [`TransformOptions::grapheme_boundary_markers`] option.
///
/// If the string is being processed in order, this can be initialized with [`BoundaryState::new`]
/// at the beginning of the string, and is thereafter updated automatically.
///
/// The default value (a zeroed state) is the same as the one returned by [`BoundaryState::new`].
#[derive(Debug, Default)]
pub struct BoundaryState {
    /// The previous codepoint's `(boundclass + indic_conjunct_break << 1)`.
    ///
    /// May be manually set if you know what you are doing.
    pub last_bound_class: isize,
}
impl BoundaryState {
    /// Initialize the boundary state for the processing of a new string.
    ///
    /// Equivalent to [`BoundaryState::default`].
    #[inline]
    pub fn new() -> BoundaryState {
        BoundaryState::default()
    }
}

/// An error that occurs when decomposing directly into a buffer.
///
/// Both variants are transparent wrappers:
/// their display output and source are those of the wrapped error.
#[derive(Clone, Debug, thiserror::Error)]
pub enum TransformBufferError {
    /// Indicates there is insufficient space in the provided buffer.
    #[error(transparent)]
    InsufficientSpace(#[from] InsufficientSpaceError),
    /// An error that occurs while applying decomposition.
    #[error(transparent)]
    Other(#[from] crate::Error),
}

/// Handle the result of either [`decompose_char`] or [`decompose_buffer`].
///
/// ## Safety
/// Result code must accurately indicate either initialized length,
/// insufficient length, or an error,
/// in accordance with the convention of the utf8proc library.
#[allow(clippy::needless_lifetimes)]
#[inline]
unsafe fn handle_decompose_buffer_result(
    res_code: isize,
    dest: &mut MaybeUninitSlice<MaybeMarkerCodepoint>,
) -> Result<SplitInitBuffer<'_, MaybeMarkerCodepoint>, TransformBufferError> {
    if res_code < 0 {
        Err(crate::Error::from_code(res_code).into())
    } else {
        let res_length = res_code.cast_unsigned();
        let original_len = dest.len();
        match dest.split_at_mut_checked(res_length) {
            None => Err(TransformBufferError::InsufficientSpace(InsufficientSpaceError {
                needed_space: res_length,
                actual_space: original_len,
            })),
            Some((initialized_part, uninit_part)) => {
                Ok((
                    // SAFETY: Guaranteed to be initialized
                    unsafe { MaybeUninitSliceExt::assume_init_mut(initialized_part) },
                    uninit_part,
                ))
            }
        }
    }
}

/// The maximum length of the result from the [`decompose_char`] function.
///
/// The length is counted in buffer entries (codepoints), not bytes.
///
/// This length is implicitly used by the simple interface ([`super::decompose_char`]),
/// which returns an iterator using a fixed-length buffer.
///
/// This may increase if certain options are enabled.
/// See [`decompose_char`] for details.
pub const MAX_DECOMPOSE_CHAR_LENGTH: usize = 4;

/// Decompose a codepoint into an array of codepoints.
///
/// Since this is the advanced interface,
/// the result may not be valid Unicode,
/// and could contain a [`SpecialMarker`].
///
/// The [`BoundaryState`] is only used for the [`TransformOptions::grapheme_boundary_markers`],
/// and can be `None` otherwise.
/// When given, its [`last_bound_class`](BoundaryState::last_bound_class)
/// is handed to utf8proc and updated with whatever value the library leaves behind.
///
/// A buffer of length [`MAX_DECOMPOSE_CHAR_LENGTH`] should never fail
/// unless the [`TransformOptions::grapheme_boundary_markers`] is enabled.
/// In that case double this space is needed.
///
/// On success, returns the initialized prefix of `dest` along with the unused remainder.
///
/// ## Errors
/// Returns [`TransformBufferError::InsufficientSpace`] if `dest` is too short,
/// or [`TransformBufferError::Other`] if utf8proc reports an error.
#[cfg_attr(feature = "inline-more", inline)] // thin ffi wrapper - potential for constant folding
pub fn decompose_char<'a>(
    codepoint: char,
    dest: &'a mut MaybeUninitSlice<MaybeMarkerCodepoint>,
    options: &TransformOptions,
    boundary_state: Option<&mut BoundaryState>,
) -> Result<SplitInitBuffer<'a, MaybeMarkerCodepoint>, TransformBufferError> {
    // SAFETY: Don't care about UTF8 validity
    let options = unsafe { options.to_ffi() };
    // utf8proc keeps this state in a C `int`, while `BoundaryState` exposes an `isize`.
    // Reinterpreting the `isize` in place would make the C side touch only part of it
    // (the most significant half on 64-bit big-endian targets),
    // so marshal the value through a real `c_int` instead.
    #[allow(
        clippy::cast_possible_truncation,
        reason = "utf8proc only ever stores `int`-sized values in this state"
    )]
    let mut raw_state: c_int = boundary_state
        .as_deref()
        .map_or(0, |state| state.last_bound_class as c_int);
    let state_ptr: *mut c_int = if boundary_state.is_some() {
        std::ptr::from_mut(&mut raw_state)
    } else {
        std::ptr::null_mut()
    };
    // SAFETY: Passed valid pointer/length
    let res_code = unsafe {
        utf8proc_sys::utf8proc_decompose_char(
            codepoint as i32,
            dest.as_mut_ptr().cast::<i32>(),
            dest.len().cast_signed(),
            options,
            state_ptr, // okay if null
        )
    };
    // Publish the (possibly updated) state back to the caller, even on error,
    // matching what a direct in-place write by the C library would have done.
    if let Some(state) = boundary_state {
        state.last_bound_class = isize::try_from(raw_state).expect("c_int must fit in isize");
    }
    // SAFETY: Result correctly indicates length or error
    unsafe { handle_decompose_buffer_result(res_code, dest) }
}

/// Decompose a string into a caller-provided, fixed-size codepoint buffer.
///
/// Since this is the advanced interface,
/// the result may not be valid Unicode.
/// The input is not statically required to be valid UTF8 either,
/// and invalid UTF8 will return a [`ErrorKind::InvalidUtf8`].
///
/// May apply a user-specified transformation to each codepoint,
/// before utf8proc does its own transformations.
///
/// On success, returns the initialized prefix of `dest` along with the unused remainder.
#[cfg_attr(feature = "inline-more", inline)] // thin ffi wrapper - potential for constant folding
pub fn decompose_buffer<'a>(
    text: &BStr,
    dest: &'a mut MaybeUninitSlice<MaybeMarkerCodepoint>,
    options: &TransformOptions,
    mut func: Option<TransformCallback>,
) -> Result<SplitInitBuffer<'a, MaybeMarkerCodepoint>, TransformBufferError> {
    // SAFETY: No assumption about UTF8 validity
    let ffi_options = unsafe { options.to_ffi() };
    // SAFETY: callback trusted to be used correctly
    let (trampoline, trampoline_data) = unsafe { convert_callback(&mut func) };
    let input_ptr = text.as_ptr();
    let input_len = text.len().cast_signed();
    let output_len = dest.len().cast_signed();
    let output_ptr = dest.as_mut_ptr().cast::<i32>();
    // SAFETY: Passed valid pointer/length and callback, result is either UTF32 or marker
    let res_code = unsafe {
        utf8proc_sys::utf8proc_decompose_custom(
            input_ptr,
            input_len,
            output_ptr,
            output_len,
            ffi_options,
            trampoline,
            trampoline_data,
        )
    };
    // SAFETY: Result correctly indicates length or error
    unsafe { handle_decompose_buffer_result(res_code, dest) }
}

/// Apply a transformation to a string, indicated by the [`TransformOptions`],
/// writing the result into the specified destination byte-buffer.
///
/// Since this is the advanced interface,
/// the result may not be valid Unicode.
/// The input is not statically required to be valid UTF8 either,
/// and invalid UTF8 will return a [`ErrorKind::InvalidUtf8`].
///
/// May apply a user-specified transformation to each codepoint,
/// before utf8proc does its own transformations.
/// The callback is expected to be deterministic.
/// If not, it could trigger unexpected panics (but not undefined behavior).
///
/// Implicitly allocates necessary space, so a [`InsufficientSpaceError`] is impossible.
///
/// ## Implementation
/// This method is behaviorally equivalent to the [`utf8proc_map_custom`] function in the C library,
/// but is reimplemented to have a couple major advantages:
/// - Reuses a destination buffer instead of freshly allocating each time
/// - Avoids calling [`decompose_buffer`] twice if buffer is already of sufficient length
/// - Does not require the input buffer to be word-aligned
/// - Uses rust allocator instead of C allocator
/// - Does not implicitly add null terminator
///
/// [`utf8proc_map_custom`]: utf8proc_sys::utf8proc_map_custom
/*
 * Not marked as #[inline], because there are multiple FFI calls involved.
 * Allocation and copying will likely dwarf smaller costs.
 *
 * TODO: This implementation has a lot of unsafe code.
 * How can we reduce usage of unsafe code?
 * Almost all of it comes from trying to do everything in-place at the end.
 * Is that really worth the safety cost?
 * Honestly, it doesn't seem that much worse than what C does all the time.
 * It's just annoying we need to sprinkle unsafe everywhere.
 */
pub fn map_into(
    text: &BStr,
    dest: &mut Vec<u8>,
    options: &TransformOptions,
    mut func: Option<TransformCallback>,
) -> Result<(), crate::Error> {
    #[inline]
    fn buffer_from_uninit_vec(vec: &mut Vec<u8>) -> &mut MaybeUninitSlice<u8> {
        // SAFETY: Safe to access the uninitialized elements of a vec,
        // the lifetime guarantees temporal validity
        unsafe {
            std::slice::from_raw_parts_mut(
                vec.as_mut_ptr().add(vec.len()).cast::<MaybeUninit<u8>>(),
                vec.capacity() - vec.len(),
            )
        }
    }
    /// I am running into borrow-checker issues having issues with the callback
    /// being used twice (its a mutable reference).
    ///
    /// Since I can't solve these issues, I've come up with the next best thing:
    /// Another layer of indirection.
    #[inline]
    fn callback_add_indirection<'a>(func: &'a mut Option<TransformCallback>) -> Option<TransformCallback<'a>> {
        match *func {
            None => None,
            Some(ref mut callback) => Some(callback as TransformCallback<'a>),
        }
    }
    // either points to a buffer filled with the decoded codepoints,
    // or an error that indicates more space is needed(
    //
    // This is its own block for lifetime purposes.
    // A Vec::reserve call could invalidate the old &mut [...] buffer
    let decomposed_codepoints: Result<*mut [MaybeMarkerCodepoint], InsufficientSpaceError> = {
        // SAFETY: valid to cast from u8 -> u32, subject to alignment
        let (_, codepoint_buffer, _) =
            unsafe { buffer_from_uninit_vec(dest).align_to_mut::<MaybeUninit<MaybeMarkerCodepoint>>() };
        let func = callback_add_indirection(&mut func);
        match decompose_buffer(text, codepoint_buffer, options, func) {
            Ok((valid_codepoints, _)) => Ok(std::ptr::from_mut(valid_codepoints)),
            Err(TransformBufferError::InsufficientSpace(space_error)) => Err(space_error),
            Err(TransformBufferError::Other(cause)) => return Err(cause),
        }
    };
    // a buffer filled with the decoded components
    //
    // can not run out of space, because
    let decomposed_codepoints = match decomposed_codepoints {
        Ok(buffer_ptr) => buffer_ptr, // nothing needed
        Err(InsufficientSpaceError {
            needed_space: needed_elements,
            actual_space: _,
        }) => {
            // need alignment - 1 potential adding bytes,
            // do not need null terminator unlike the C code
            const WORST_CASE_OVERHEAD_BYTES: usize = (align_of::<MaybeMarkerCodepoint>() - 1) + 1;
            let needed_bytes = needed_elements
                .checked_mul(size_of::<MaybeMarkerCodepoint>())
                .and_then(|bytes| bytes.checked_add(WORST_CASE_OVERHEAD_BYTES))
                .expect("needed size overflowed as usize");
            dest.reserve(needed_bytes);
            // SAFETY: valid to cast from u8 -> u32, subject to alignment
            let (_prefix_bytes, codepoint_buffer, _suffix_bytes) =
                unsafe { buffer_from_uninit_vec(dest).align_to_mut::<MaybeUninit<MaybeMarkerCodepoint>>() };
            // possible if there is a bug on our end, or a non-deterministic callback
            assert!(codepoint_buffer.len() >= needed_elements, "allocated less than needed");
            let func = callback_add_indirection(&mut func);
            match decompose_buffer(text, codepoint_buffer, options, func) {
                Ok((valid_codepoints, _)) => valid_codepoints as *mut [_],
                Err(TransformBufferError::InsufficientSpace(space_error)) => {
                    unreachable!("insufficient space after allocating {needed_elements}: {space_error}")
                }
                Err(TransformBufferError::Other(cause)) => return Err(cause),
            }
        }
    };
    // Now normalize decoded codepoints in-place
    {
        // SAFETY: Looking at source, input appears to accept markers in practice
        // TODO: Open an issue upstream to get this behavior documented,
        // then we can switch to using a safe wrapper
        let res_code = unsafe {
            utf8proc_sys::utf8proc_normalize_utf32(
                decomposed_codepoints.cast::<i32>(),
                decomposed_codepoints.len().cast_signed(),
                // SAFETY: Don't care about UTF8 validity here, markers are acceptable
                options.to_ffi(),
            )
        };
        if res_code < 0 {
            return Err(crate::Error::from_code(res_code));
        }
        let normalized_codepoints_len = res_code.cast_unsigned();
        assert!(
            normalized_codepoints_len <= decomposed_codepoints.len(),
            "normalized length can shrink but not grow"
        );
        // now convert from codepoints to UTF8 in-place
        // Since we are using the same buffer, we have to be really careful
        // and can't use &mut slices because that requires exclusive access
        {
            let src_start = decomposed_codepoints.cast::<MaybeMarkerCodepoint>().cast_const();
            // SAFETY: Length is in bounds
            let src_end = unsafe { src_start.add(normalized_codepoints_len) };
            // SAFETY: Pointer in bounds
            let dest_start = unsafe { dest.as_mut_ptr().add(dest.len()) };
            // SAFETY: Capacity is in bounds, and indicates end of allocated data
            let dest_end = unsafe { dest_start.add(dest.capacity()) };
            assert!(dest_start <= dest_end);
            // The destination potentially points before the source due to alignment,
            // but never points later
            assert!(dest_start.cast_const() <= src_start.cast::<u8>());
            let mut src_current = src_start;
            let mut dest_current = dest_start;
            while src_current < src_end {
                // SAFETY: Checked in loop condition the pointer is in bounds
                let src_entry = unsafe { src_current.read() };
                // SAFETY: Checked in loop condition the pointer is in bounds
                src_current = unsafe { src_current.add(1) };
                // SAFETY: current pointer is always less than end pointer
                let dest_remaining_len = unsafe { dest_end.offset_from_unsigned(dest_current) };
                // This should never happen, but prefer assert to UB
                assert!(
                    dest_remaining_len >= MaybeMarkerCodepoint::MAX_LEN_UTF8,
                    "not enough space left to write entry"
                );
                // creating a &mut slice for a block scope is fine,
                // as long as we are not reading while the reference is live
                {
                    // SAFETY: Checked length is sufficient, dest pointer is valid
                    let buffer =
                        unsafe { std::slice::from_raw_parts_mut(dest_current, MaybeMarkerCodepoint::MAX_LEN_UTF8) };
                    let written_len = src_entry.encode_utf8(buffer);
                    assert!(written_len <= MaybeMarkerCodepoint::MAX_LEN_UTF8);
                    // SAFETY: Verified length is in-bounds
                    unsafe { dest_current = dest_current.add(written_len) };
                    assert!(dest_current.cast_const() <= src_current.cast::<u8>());
                }
            }
            // SAFETY: The dest_current pointer is in-bounds
            let written_len = unsafe { dest_current.offset_from_unsigned(dest_start) };
            // add this length to the value of the
            // SAFETY: All within the allocated buffer,
            // and we just initialized it
            unsafe {
                dest.set_len(dest.len().unchecked_add(written_len));
            }
            Ok(())
        }
    }
}

/// Convert a rust-style [`TransformCallback`] into a C-style [`utf8proc_sys::utf8proc_custom_func`].
///
/// The C side only carries a thin `void *`, whereas the rust callback is a fat pointer.
/// The data pointer therefore points *at* the callback reference (double indirection).
/// When no callback is given, both the function and the data pointer are null.
///
/// ## Safety
/// While invoking this function is technically safe, the returned callback is highly unsafe.
///
/// Caller must guarantee that the lifetime of the `&mut Option<&mut dyn FnMut(...)>`
/// will be live whenever the callback is invoked.
/// This includes both pointers, the outer one and the inner one.
/// The callback must only be passed valid Unicode codepoints,
/// that can be represented as a rust [`char`].
/// Whenever the callback is invoked from C, the data pointer must be preserved as-is,
/// and not tampered with or changed.
pub(crate) unsafe fn convert_callback(func: &mut Option<TransformCallback>) -> (utf8proc_custom_func, *mut c_void) {
    type TrampolineCallbackData<'a> = &'a mut dyn FnMut(char) -> char;
    unsafe extern "C" fn callback_trampoline(orig: i32, data: *mut c_void) -> i32 {
        let callback_slot = data.cast::<TrampolineCallbackData<'static>>();
        // SAFETY: Caller is trusted to preserve the `data` pointer as-is.
        let callback = unsafe { callback_slot.read() };
        // SAFETY: Caller guarantees that codepoint is valid
        let input = unsafe { char::from_u32_unchecked(orig.cast_unsigned()) };
        callback(input) as i32
    }
    let Some(callback) = func.as_mut() else {
        return (None, std::ptr::null_mut());
    };
    let callback_data = std::ptr::from_mut::<TrampolineCallbackData>(callback).cast();
    (Some(callback_trampoline), callback_data)
}

#[cfg(test)]
mod test {
    use crate::transform::advanced::{MaybeMarkerCodepoint, SpecialMarker};

    /// Checks the range limit of `from_u32` and the marker round-trip.
    #[test]
    fn maybe_marker_codepoint_conversions() {
        let largest_scalar = char::MAX as u32;
        // one past the largest scalar value is neither a char nor a marker
        assert_eq!(MaybeMarkerCodepoint::from_u32(largest_scalar + 1), None);
        // the largest scalar value itself converts just like the `char` does
        let from_largest_char = MaybeMarkerCodepoint::from(char::MAX);
        assert_eq!(MaybeMarkerCodepoint::from_u32(largest_scalar), Some(from_largest_char));
        // a marker survives the trip through `MaybeMarkerCodepoint`
        let boundary = MaybeMarkerCodepoint::from(SpecialMarker::GraphemeBoundary);
        assert_eq!(boundary.to_marker(), Ok(SpecialMarker::GraphemeBoundary));
    }
}