use super::{InsufficientSpaceError, TransformCallback, TransformOptions};
#[allow(unused_imports, reason = "used by docs")]
use crate::ErrorKind;
use crate::transform::buffer::{MaybeUninitSlice, MaybeUninitSliceExt, SplitInitBuffer};
use bstr::BStr;
use num_enum::{IntoPrimitive, TryFromPrimitive};
use std::ffi::{c_int, c_void};
use std::fmt::{Debug, Formatter};
use std::mem::MaybeUninit;
use utf8proc_sys::utf8proc_custom_func;
/// Out-of-band marker values that can sit next to real characters in a codepoint buffer.
///
/// The discriminant of each variant is the raw `i32` that a [`MaybeMarkerCodepoint`]
/// stores for it. The only variant declared here is negative, so it cannot collide
/// with the value of any `char`.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, TryFromPrimitive, IntoPrimitive)]
#[non_exhaustive]
#[repr(i32)]
pub enum SpecialMarker {
/// Marks a grapheme boundary.
///
/// NOTE(review): presumably this is the `-1` utf8proc emits when boundary marking
/// is requested — confirm against the utf8proc documentation.
GraphemeBoundary = -1,
}
impl SpecialMarker {
    /// Returns the byte sequence that represents this marker in encoded output.
    ///
    /// The grapheme boundary marker is the single byte `0xFF`, which never occurs in
    /// well-formed UTF-8 and therefore cannot be mistaken for encoded text.
    #[inline]
    pub fn utf8_marker(&self) -> &'static BStr {
        const GRAPHEME_BOUNDARY_BYTES: &[u8] = &[0xFF];
        match *self {
            SpecialMarker::GraphemeBoundary => BStr::new(GRAPHEME_BOUNDARY_BYTES),
        }
    }

    /// Checks whether `x` begins with the encoded form of a marker.
    ///
    /// Returns the marker whose encoding starts `x`, or `None` when `x` is empty or
    /// starts with any other byte. The receiver is not consulted.
    #[inline]
    pub fn detect_special_marker_starting(&self, x: &BStr) -> Option<SpecialMarker> {
        let leading_byte = x.first().copied()?;
        (leading_byte == 0xFF).then_some(SpecialMarker::GraphemeBoundary)
    }

    /// Wraps this marker in a [`MaybeMarkerCodepoint`] carrying its discriminant.
    #[inline]
    pub fn codepoint_marker(&self) -> MaybeMarkerCodepoint {
        let raw: i32 = (*self).into();
        // SAFETY: `raw` is the discriminant of a real `SpecialMarker`, which is one of
        // the two kinds of value `MaybeMarkerCodepoint::from_u32` accepts.
        unsafe { MaybeMarkerCodepoint::from_u32_unchecked(raw.cast_unsigned()) }
    }
}
/// A 32-bit value that is either a Unicode scalar value (`char`) or the discriminant
/// of a [`SpecialMarker`].
///
/// The type is `repr(transparent)` over `i32`; the functions in this file rely on that
/// when they hand a buffer of these values to utf8proc as a pointer to `i32`.
#[derive(Copy, Clone, Eq, PartialEq)]
#[repr(transparent)]
pub struct MaybeMarkerCodepoint(i32);
impl From<char> for MaybeMarkerCodepoint {
#[inline]
fn from(value: char) -> Self {
Self::from_char(value)
}
}
impl From<SpecialMarker> for MaybeMarkerCodepoint {
#[inline]
fn from(value: SpecialMarker) -> Self {
value.codepoint_marker()
}
}
impl MaybeMarkerCodepoint {
#[inline]
pub fn from_u32(x: u32) -> Option<Self> {
if char::from_u32(x).is_some() || SpecialMarker::try_from_primitive(x.cast_signed()).is_ok() {
Some(MaybeMarkerCodepoint(x.cast_signed()))
} else {
None }
}
#[inline]
pub fn from_char(c: char) -> Self {
unsafe { Self::from_u32_unchecked(c as u32) }
}
#[inline]
pub unsafe fn from_u32_unchecked(u: u32) -> Self {
debug_assert!(Self::from_u32(u).is_some(), "invalid codepoint");
MaybeMarkerCodepoint(u.cast_signed())
}
#[inline]
pub fn to_char(&self) -> Result<char, SpecialMarker> {
if let Some(x) = char::from_u32(self.0.cast_unsigned()) {
Ok(x)
} else {
let maybe_marker = SpecialMarker::try_from_primitive(self.0);
Err(unsafe { maybe_marker.unwrap_unchecked() })
}
}
#[inline]
pub fn to_marker(&self) -> Result<SpecialMarker, char> {
match self.to_char() {
Err(marker) => Ok(marker),
Ok(char) => Err(char),
}
}
pub const MAX_LEN_UTF8: usize = 4;
#[inline]
pub fn encode_utf8(&self, output: &mut [u8]) -> usize {
match self.to_char() {
Ok(c) => c.encode_utf8(output).len(),
Err(marker) => {
let marker_str = marker.utf8_marker();
debug_assert_eq!(marker_str.len(), 1); assert!(marker_str.len() <= output.len(), "insufficient length");
output[..marker_str.len()].copy_from_slice(marker_str);
marker_str.len()
}
}
}
}
/// Formats as the `Debug` output of the contained character, or of the contained
/// marker when the value is not a character.
impl Debug for MaybeMarkerCodepoint {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self.to_char() {
            Err(marker) => f.write_fmt(format_args!("{marker:?}")),
            Ok(scalar) => f.write_fmt(format_args!("{scalar:?}")),
        }
    }
}
impl PartialEq<char> for MaybeMarkerCodepoint {
#[inline]
fn eq(&self, other: &char) -> bool {
self.to_char() == Ok(*other)
}
}
/// State carried across successive [`decompose_char`] calls.
///
/// A pointer to `last_bound_class` is handed to utf8proc, which may read and update it.
#[derive(Default)]
pub struct BoundaryState {
    /// Value that utf8proc reads and writes through the state pointer; starts at zero.
    ///
    /// NOTE(review): presumably the grapheme bound class of the previously processed
    /// codepoint — confirm against the utf8proc documentation.
    pub last_bound_class: isize,
}

impl BoundaryState {
    /// Creates the initial state; equivalent to [`BoundaryState::default`].
    #[inline]
    pub fn new() -> BoundaryState {
        Default::default()
    }
}
/// Error returned by the functions in this file that write into a caller-supplied buffer.
#[derive(Clone, Debug, thiserror::Error)]
pub enum TransformBufferError {
/// The destination buffer has fewer elements than the result needs; the payload
/// records both the needed and the actual size.
#[error(transparent)]
InsufficientSpace(#[from] InsufficientSpaceError),
/// Any other failure, carried as the crate-wide error type.
#[error(transparent)]
Other(#[from] crate::Error),
}
#[allow(clippy::needless_lifetimes)]
#[inline]
unsafe fn handle_decompose_buffer_result(
res_code: isize,
dest: &mut MaybeUninitSlice<MaybeMarkerCodepoint>,
) -> Result<SplitInitBuffer<'_, MaybeMarkerCodepoint>, TransformBufferError> {
if res_code < 0 {
Err(crate::Error::from_code(res_code).into())
} else {
let res_length = res_code.cast_unsigned();
let original_len = dest.len();
match dest.split_at_mut_checked(res_length) {
None => Err(TransformBufferError::InsufficientSpace(InsufficientSpaceError {
needed_space: res_length,
actual_space: original_len,
})),
Some((initialized_part, uninit_part)) => {
Ok((
unsafe { MaybeUninitSliceExt::assume_init_mut(initialized_part) },
uninit_part,
))
}
}
}
}
/// NOTE(review): presumably an upper bound on the number of entries [`decompose_char`]
/// can produce for a single input character; the constant is not referenced in this
/// part of the file — confirm the bound holds for every option combination.
pub const MAX_DECOMPOSE_CHAR_LENGTH: usize = 4;
/// Decomposes a single character into `dest` by calling `utf8proc_decompose_char`.
///
/// On success returns the initialized front of `dest` together with the unused,
/// still-uninitialized tail. When `boundary_state` is supplied, a pointer to its
/// `last_bound_class` is passed to utf8proc; otherwise a null pointer is passed.
///
/// # Errors
///
/// * [`TransformBufferError::InsufficientSpace`] when utf8proc reports a length
///   larger than `dest.len()`.
/// * [`TransformBufferError::Other`] when utf8proc reports a negative result code.
///
/// # Panics
///
/// Panics if `isize` is smaller or less strictly aligned than the C `int` type.
#[cfg_attr(feature = "inline-more", inline)]
pub fn decompose_char<'a>(
    codepoint: char,
    dest: &'a mut MaybeUninitSlice<MaybeMarkerCodepoint>,
    options: &TransformOptions,
    boundary_state: Option<&mut BoundaryState>,
) -> Result<SplitInitBuffer<'a, MaybeMarkerCodepoint>, TransformBufferError> {
    // NOTE(review): the safety contract of `to_ffi` is not visible in this file.
    let ffi_options = unsafe { options.to_ffi() };
    let raw_state: *mut isize = boundary_state.map_or(std::ptr::null_mut(), |state| {
        std::ptr::from_mut(&mut state.last_bound_class)
    });
    // utf8proc accesses the state as a C `int`; make sure one fits inside the `isize`
    // field and that the field's alignment is sufficient before reinterpreting it.
    assert!(size_of::<isize>() >= size_of::<c_int>());
    assert!(align_of::<isize>() >= align_of::<c_int>());
    let raw_state = raw_state.cast::<c_int>();
    let dest_capacity = dest.len().cast_signed();
    // SAFETY: the destination pointer and length describe the same live buffer, and
    // `MaybeMarkerCodepoint` is `repr(transparent)` over `i32`; the state pointer is
    // either null or points at a live field that was checked above to be large enough.
    let res_code = unsafe {
        utf8proc_sys::utf8proc_decompose_char(
            codepoint as i32,
            dest.as_mut_ptr().cast::<i32>(),
            dest_capacity,
            ffi_options,
            raw_state,
        )
    };
    // SAFETY: relies on utf8proc having written `res_code` entries whenever that count
    // fits into `dest`.
    unsafe { handle_decompose_buffer_result(res_code, dest) }
}
/// Decomposes the bytes of `text` into `dest` by calling `utf8proc_decompose_custom`.
///
/// `func`, when present, is handed to utf8proc as the custom mapping callback. On
/// success returns the initialized front of `dest` together with the unused,
/// still-uninitialized tail.
///
/// # Errors
///
/// * [`TransformBufferError::InsufficientSpace`] when utf8proc reports a length
///   larger than `dest.len()`.
/// * [`TransformBufferError::Other`] when utf8proc reports a negative result code.
#[cfg_attr(feature = "inline-more", inline)]
pub fn decompose_buffer<'a>(
    text: &BStr,
    dest: &'a mut MaybeUninitSlice<MaybeMarkerCodepoint>,
    options: &TransformOptions,
    mut func: Option<TransformCallback>,
) -> Result<SplitInitBuffer<'a, MaybeMarkerCodepoint>, TransformBufferError> {
    // NOTE(review): the safety contract of `to_ffi` is not visible in this file.
    let ffi_options = unsafe { options.to_ffi() };
    // SAFETY: `func` is a local that stays alive and is not moved until utf8proc has
    // returned, so the data pointer derived from it remains valid for the whole call.
    let (trampoline, trampoline_data) = unsafe { convert_callback(&mut func) };
    let text_len = text.len().cast_signed();
    let dest_capacity = dest.len().cast_signed();
    // SAFETY: each pointer/length pair describes a live buffer, and
    // `MaybeMarkerCodepoint` is `repr(transparent)` over `i32`.
    let res_code = unsafe {
        utf8proc_sys::utf8proc_decompose_custom(
            text.as_ptr(),
            text_len,
            dest.as_mut_ptr().cast::<i32>(),
            dest_capacity,
            ffi_options,
            trampoline,
            trampoline_data,
        )
    };
    // SAFETY: relies on utf8proc having written `res_code` entries whenever that count
    // fits into `dest`.
    unsafe { handle_decompose_buffer_result(res_code, dest) }
}
/// Transforms `text` with utf8proc and appends the encoded result to `dest`.
///
/// All work happens inside the spare capacity of `dest`, in three stages:
///
/// 1. [`decompose_buffer`] writes decomposed codepoints into the suitably aligned part
///    of the spare capacity. If that is too small, `dest` is grown by the amount
///    utf8proc asked for and the decomposition is run a second time.
/// 2. `utf8proc_normalize_utf32` normalizes those codepoints in place.
/// 3. Every remaining entry is encoded with [`MaybeMarkerCodepoint::encode_utf8`],
///    front to back, starting directly after the existing contents of `dest`. An
///    entry occupies four bytes and encodes to at most four, so the write position
///    never overtakes the read position.
///
/// Existing contents of `dest` are left untouched; on success its length grows by the
/// number of bytes written. `func`, when present, is used as the custom mapping
/// callback for the decomposition (it is invoked again if the decomposition has to be
/// repeated).
///
/// # Errors
///
/// Returns the [`crate::Error`] derived from the result code whenever utf8proc reports
/// a failure.
///
/// # Panics
///
/// Panics if the required buffer size overflows `usize`, or if one of the internal
/// consistency checks on buffer sizes and pointer positions fails.
pub fn map_into(
    text: &BStr,
    dest: &mut Vec<u8>,
    options: &TransformOptions,
    mut func: Option<TransformCallback>,
) -> Result<(), crate::Error> {
    /// Views the unused capacity behind `vec.len()` as a slice of uninitialized bytes.
    #[inline]
    fn buffer_from_uninit_vec(vec: &mut Vec<u8>) -> &mut MaybeUninitSlice<u8> {
        // SAFETY: the range `len..capacity` lies inside the vector's allocation, and
        // `MaybeUninit<u8>` places no requirement on the contents of those bytes.
        unsafe {
            std::slice::from_raw_parts_mut(
                vec.as_mut_ptr().add(vec.len()).cast::<MaybeUninit<u8>>(),
                vec.capacity() - vec.len(),
            )
        }
    }
    /// Produces a short-lived callback that forwards to `func`, so the caller's
    /// callback can be handed out more than once.
    #[inline]
    fn callback_add_indirection<'a>(func: &'a mut Option<TransformCallback>) -> Option<TransformCallback<'a>> {
        match *func {
            None => None,
            Some(ref mut callback) => Some(callback as TransformCallback<'a>),
        }
    }
    // First attempt: decompose into whatever spare capacity is already there. The
    // result is kept as a raw pointer so that the borrow of `dest` ends with this block.
    let decomposed_codepoints: Result<*mut [MaybeMarkerCodepoint], InsufficientSpaceError> = {
        let (_, codepoint_buffer, _) =
            unsafe { buffer_from_uninit_vec(dest).align_to_mut::<MaybeUninit<MaybeMarkerCodepoint>>() };
        let func = callback_add_indirection(&mut func);
        match decompose_buffer(text, codepoint_buffer, options, func) {
            Ok((valid_codepoints, _)) => Ok(std::ptr::from_mut(valid_codepoints)),
            Err(TransformBufferError::InsufficientSpace(space_error)) => Err(space_error),
            Err(TransformBufferError::Other(cause)) => return Err(cause),
        }
    };
    let decomposed_codepoints = match decomposed_codepoints {
        Ok(buffer_ptr) => buffer_ptr,
        Err(InsufficientSpaceError {
            needed_space: needed_elements,
            actual_space: _,
        }) => {
            // Up to `align - 1` bytes can be lost to alignment, plus one byte of slack.
            const WORST_CASE_OVERHEAD_BYTES: usize = (align_of::<MaybeMarkerCodepoint>() - 1) + 1;
            let needed_bytes = needed_elements
                .checked_mul(size_of::<MaybeMarkerCodepoint>())
                .and_then(|bytes| bytes.checked_add(WORST_CASE_OVERHEAD_BYTES))
                .expect("needed size overflowed as usize");
            dest.reserve(needed_bytes);
            let (_prefix_bytes, codepoint_buffer, _suffix_bytes) =
                unsafe { buffer_from_uninit_vec(dest).align_to_mut::<MaybeUninit<MaybeMarkerCodepoint>>() };
            assert!(codepoint_buffer.len() >= needed_elements, "allocated less than needed");
            let func = callback_add_indirection(&mut func);
            match decompose_buffer(text, codepoint_buffer, options, func) {
                Ok((valid_codepoints, _)) => valid_codepoints as *mut [_],
                Err(TransformBufferError::InsufficientSpace(space_error)) => {
                    unreachable!("insufficient space after allocating {needed_elements}: {space_error}")
                }
                Err(TransformBufferError::Other(cause)) => return Err(cause),
            }
        }
    };
    {
        // SAFETY: `decomposed_codepoints` points at initialized entries inside the
        // spare capacity of `dest`, which has not been reallocated since, and
        // `MaybeMarkerCodepoint` is `repr(transparent)` over `i32`.
        let res_code = unsafe {
            utf8proc_sys::utf8proc_normalize_utf32(
                decomposed_codepoints.cast::<i32>(),
                decomposed_codepoints.len().cast_signed(),
                options.to_ffi(),
            )
        };
        if res_code < 0 {
            return Err(crate::Error::from_code(res_code));
        }
        let normalized_codepoints_len = res_code.cast_unsigned();
        assert!(
            normalized_codepoints_len <= decomposed_codepoints.len(),
            "normalized length can shrink but not grow"
        );
        {
            let src_start = decomposed_codepoints.cast::<MaybeMarkerCodepoint>().cast_const();
            // SAFETY: `normalized_codepoints_len` was just checked against the buffer length.
            let src_end = unsafe { src_start.add(normalized_codepoints_len) };
            let dest_base = dest.as_mut_ptr();
            // SAFETY: `len <= capacity`, so this stays inside the allocation.
            let dest_start = unsafe { dest_base.add(dest.len()) };
            // The end of the writable region is the end of the allocation, measured
            // from the base pointer. Measuring `capacity` from `dest_start` would
            // overshoot the allocation by `len` bytes whenever `dest` is non-empty.
            // SAFETY: one past the end of the allocation is a valid pointer to form.
            let dest_end = unsafe { dest_base.add(dest.capacity()) };
            assert!(dest_start <= dest_end);
            assert!(dest_start.cast_const() <= src_start.cast::<u8>());
            let mut src_current = src_start;
            let mut dest_current = dest_start;
            while src_current < src_end {
                // Read the entry and advance before writing: the bytes written below
                // may overlap the entry that was just consumed.
                let src_entry = unsafe { src_current.read() };
                src_current = unsafe { src_current.add(1) };
                let dest_remaining_len = unsafe { dest_end.offset_from_unsigned(dest_current) };
                assert!(
                    dest_remaining_len >= MaybeMarkerCodepoint::MAX_LEN_UTF8,
                    "not enough space left to write entry"
                );
                {
                    // SAFETY: the assertion above guarantees that `MAX_LEN_UTF8` bytes
                    // starting at `dest_current` lie inside the allocation.
                    let buffer =
                        unsafe { std::slice::from_raw_parts_mut(dest_current, MaybeMarkerCodepoint::MAX_LEN_UTF8) };
                    let written_len = src_entry.encode_utf8(buffer);
                    assert!(written_len <= MaybeMarkerCodepoint::MAX_LEN_UTF8);
                    unsafe { dest_current = dest_current.add(written_len) };
                    // The writer must never overtake the reader.
                    assert!(dest_current.cast_const() <= src_current.cast::<u8>());
                }
            }
            let written_len = unsafe { dest_current.offset_from_unsigned(dest_start) };
            // SAFETY: exactly `written_len` bytes behind the old length were
            // initialized above, all of them within the capacity.
            unsafe {
                dest.set_len(dest.len().unchecked_add(written_len));
            }
            Ok(())
        }
    }
}
/// Converts an optional Rust callback into the function/data pair utf8proc expects.
///
/// Returns `(None, null)` when there is no callback. Otherwise returns a trampoline
/// function together with a pointer to the callback reference stored inside `func`.
///
/// # Safety
///
/// The returned data pointer points into `*func`. It may only be used with the
/// returned trampoline, and only while `*func` is alive, has not been moved, and is
/// not accessed through any other path.
pub(crate) unsafe fn convert_callback(func: &mut Option<TransformCallback>) -> (utf8proc_custom_func, *mut c_void) {
    type TrampolineCallbackData<'a> = &'a mut dyn FnMut(char) -> char;
    /// C entry point: recovers the Rust callback from `data` and applies it to `orig`.
    unsafe extern "C" fn callback_trampoline(orig: i32, data: *mut c_void) -> i32 {
        // SAFETY: `data` is the pointer produced below, i.e. the address of a live
        // callback reference.
        let callback = unsafe { data.cast::<TrampolineCallbackData<'static>>().read() };
        // SAFETY: relies on utf8proc passing only valid Unicode scalar values.
        let input = unsafe { char::from_u32_unchecked(orig.cast_unsigned()) };
        callback(input) as i32
    }
    match func.as_mut() {
        Some(func_ptr) => (
            Some(callback_trampoline),
            std::ptr::from_mut::<TrampolineCallbackData<'_>>(func_ptr).cast(),
        ),
        None => (None, std::ptr::null_mut()),
    }
}
#[cfg(test)]
mod test {
    use crate::transform::advanced::{MaybeMarkerCodepoint, SpecialMarker};

    /// Checks the upper bound of the accepted scalar range and the marker round trip.
    #[test]
    fn maybe_marker_codepoint_conversions() {
        let largest_scalar = char::MAX as u32;
        // One past the largest scalar value is neither a character nor a marker.
        assert_eq!(MaybeMarkerCodepoint::from_u32(largest_scalar + 1), None);
        assert_eq!(
            MaybeMarkerCodepoint::from_u32(largest_scalar),
            Some(MaybeMarkerCodepoint::from(char::MAX))
        );
        let boundary = MaybeMarkerCodepoint::from(SpecialMarker::GraphemeBoundary);
        assert_eq!(boundary.to_marker(), Ok(SpecialMarker::GraphemeBoundary));
    }
}