utf8proc 0.1.2

Rust bindings to the utf8proc library
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
//! The "advanced" interface to transformations,
//! producing potentially non-UTF8 data.

use super::{InsufficientSpaceError, TransformCallback, TransformOptions};
#[allow(unused_imports, reason = "used by docs")]
use crate::ErrorKind;
use crate::transform::buffer::{MaybeUninitSlice, MaybeUninitSliceExt, SplitInitBuffer};
use bstr::BStr;
use num_enum::{IntoPrimitive, TryFromPrimitive};
use std::ffi::{c_int, c_void};
use std::fmt::{Debug, Formatter};
use std::mem::MaybeUninit;
use utf8proc_sys::utf8proc_custom_func;

/// A special marker value used in the advanced interface.
///
/// Markers share the `i32` representation used for codepoints (`#[repr(i32)]`),
/// and convert to and from their raw value through the derived
/// `TryFromPrimitive`/`IntoPrimitive` implementations.
///
/// The enum is `#[non_exhaustive]`, so additional markers may be added later.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, TryFromPrimitive, IntoPrimitive)]
#[non_exhaustive]
#[repr(i32)]
pub enum SpecialMarker {
    /// Used to mark grapheme boundaries when using the
    /// [`TransformOptions::grapheme_boundary_markers`] option.
    ///
    /// Has the raw value `-1`, which is not a valid Unicode codepoint.
    GraphemeBoundary = -1,
}
impl SpecialMarker {
    /// The byte-level encoding of this marker, as a UTF8-like [`BStr`].
    ///
    /// The encoding is intentionally *not* valid UTF8 (the byte `0xFF` never occurs
    /// in well-formed UTF8), which is what makes it usable as a marker.
    #[inline]
    pub fn utf8_marker(&self) -> &'static BStr {
        const GRAPHEME_BOUNDARY_BYTES: &[u8] = &[0xFF];
        let encoded = match *self {
            SpecialMarker::GraphemeBoundary => GRAPHEME_BOUNDARY_BYTES,
        };
        BStr::new(encoded)
    }

    /// Check whether the given string begins with a special marker,
    /// returning that marker if so.
    ///
    /// Only the first byte of `x` is inspected; the receiver is not consulted.
    ///
    /// If the [`BStr`] holds data that was not produced by [`MaybeMarkerCodepoint::encode_utf8`],
    /// the result may be a false positive:
    /// arbitrary garbage can coincidentally look like a marker.
    #[inline]
    pub fn detect_special_marker_starting(&self, x: &BStr) -> Option<SpecialMarker> {
        let leading_byte = x.first().copied();
        (leading_byte == Some(0xFF)).then_some(SpecialMarker::GraphemeBoundary)
    }

    /// Wrap this special marker in a [`MaybeMarkerCodepoint`].
    #[inline]
    pub fn codepoint_marker(&self) -> MaybeMarkerCodepoint {
        let raw: i32 = (*self).into();
        let bits = raw.cast_unsigned();
        // SAFETY: The raw value of every marker is accepted by `MaybeMarkerCodepoint`
        unsafe { MaybeMarkerCodepoint::from_u32_unchecked(bits) }
    }
}

/// A value which is either [`char`] or a [special marker](SpecialMarker).
///
/// Stored as a raw `i32` with a transparent representation.
/// The type maintains the invariant that the wrapped value is always
/// either a valid [`char`] or the raw value of a [`SpecialMarker`];
/// the unsafe code in this module relies on that.
#[derive(Copy, Clone, Eq, PartialEq)]
#[repr(transparent)]
pub struct MaybeMarkerCodepoint(i32);
impl From<char> for MaybeMarkerCodepoint {
    #[inline]
    fn from(value: char) -> Self {
        Self::from_char(value)
    }
}
impl From<SpecialMarker> for MaybeMarkerCodepoint {
    #[inline]
    fn from(value: SpecialMarker) -> Self {
        value.codepoint_marker()
    }
}
impl MaybeMarkerCodepoint {
    /// Create this value from a `u32`,
    /// returning `None` if invalid.
    #[inline]
    pub fn from_u32(x: u32) -> Option<Self> {
        if char::from_u32(x).is_some() || SpecialMarker::try_from_primitive(x.cast_signed()).is_ok() {
            // NOTE: Don't use from_u32_unchecked as that will cause infinite recursion
            Some(MaybeMarkerCodepoint(x.cast_signed()))
        } else {
            None // invalid
        }
    }

    /// Convert from a standard character.
    ///
    /// Cannot fail.
    #[inline]
    pub fn from_char(c: char) -> Self {
        // SAFETY: A valid character
        unsafe { Self::from_u32_unchecked(c as u32) }
    }

    /// Create this value from a raw `u32`,
    /// without checking for validity.
    ///
    /// ## Safety
    /// Undefined behavior if neither a valid character nor a marker.
    #[inline]
    pub unsafe fn from_u32_unchecked(u: u32) -> Self {
        debug_assert!(Self::from_u32(u).is_some(), "invalid codepoint");
        MaybeMarkerCodepoint(u.cast_signed())
    }

    /// Convert this value into a [`char`],
    /// returning the corresponding [marker value](SpecialMarker) otherwise.
    #[inline]
    pub fn to_char(&self) -> Result<char, SpecialMarker> {
        if let Some(x) = char::from_u32(self.0.cast_unsigned()) {
            Ok(x)
        } else {
            let maybe_marker = SpecialMarker::try_from_primitive(self.0);
            // SAFETY: Either marker or char by type invariant
            Err(unsafe { maybe_marker.unwrap_unchecked() })
        }
    }

    /// Convert this value into a [`SpecialMarker`] value,
    /// returning a [`char`] otherwise.
    #[inline]
    pub fn to_marker(&self) -> Result<SpecialMarker, char> {
        match self.to_char() {
            Err(marker) => Ok(marker),
            Ok(char) => Err(char),
        }
    }

    /// The maximum length of this value, when encoded as UTF8.
    ///
    /// Currently 4 bytes, the same as [`char::MAX_LEN_UTF8`].
    pub const MAX_LEN_UTF8: usize = 4;

    /// UTF8 encode this character into a buffer,
    /// returning the number of characters which were written.
    ///
    /// Differs from [`char::encode_utf8`],
    /// because marker characters are encoded specially
    /// as the corresponding [`SpecialMarker::utf8_marker`].
    ///
    /// ## Panics
    /// Panics if there is insufficient space to encode this value.
    /// Using [`Self::MAX_LEN_UTF8`] bytes of space is always sufficient.
    #[inline]
    pub fn encode_utf8(&self, output: &mut [u8]) -> usize {
        match self.to_char() {
            Ok(c) => c.encode_utf8(output).len(),
            Err(marker) => {
                let marker_str = marker.utf8_marker();
                debug_assert_eq!(marker_str.len(), 1); // currently always one byte
                assert!(marker_str.len() <= output.len(), "insufficient length");
                output[..marker_str.len()].copy_from_slice(marker_str);
                marker_str.len()
            }
        }
    }
}
/// Formats as the debug representation of whichever value is held:
/// the character itself, or the marker's variant name.
impl Debug for MaybeMarkerCodepoint {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self.to_marker() {
            Ok(marker) => write!(f, "{marker:?}"),
            Err(c) => write!(f, "{c:?}"),
        }
    }
}
impl PartialEq<char> for MaybeMarkerCodepoint {
    #[inline]
    fn eq(&self, other: &char) -> bool {
        self.to_char() == Ok(*other)
    }
}

/// State used for determining grapheme boundaries when using the [`TransformOptions::grapheme_boundary_markers`] option.
///
/// If the string is being processed in order, this can be initialized with [`BoundaryState::new`]
/// at the beginning of the string, and is thereafter updated automatically.
///
/// The state is plain data: it can be cloned to snapshot a position in the string
/// and inspected through its [`Debug`] output.
#[derive(Clone, Debug, Default)]
pub struct BoundaryState {
    /// The previous codepoint's `(boundclass + indic_conjunct_break << 1)`.
    ///
    /// A value of zero is the initial state, as produced by [`BoundaryState::new`].
    ///
    /// May be manually set if you know what you are doing.
    pub last_bound_class: isize,
}
impl BoundaryState {
    /// Initialize the boundary state for the processing of a new string.
    ///
    /// Equivalent to [`BoundaryState::default`].
    #[inline]
    pub fn new() -> BoundaryState {
        BoundaryState::default()
    }
}

/// An error that occurs when decomposing directly into a buffer.
///
/// Both variants are transparent wrappers: the display message and source
/// are those of the wrapped error.
#[derive(Clone, Debug, thiserror::Error)]
pub enum TransformBufferError {
    /// Indicates there is insufficient space in the provided buffer.
    ///
    /// The wrapped error reports how much space was needed and how much was available.
    #[error(transparent)]
    InsufficientSpace(#[from] InsufficientSpaceError),
    /// An error that occurs while applying decomposition.
    #[error(transparent)]
    Other(#[from] crate::Error),
}

/// Handle the result of either [`decompose_char`] or [`decompose_buffer`].
///
/// ## Safety
/// Result code must accurately indicate either initialized length,
/// insufficient length, or an error,
/// in accordance with the convention of the utf8proc library.
#[allow(clippy::needless_lifetimes)]
#[inline]
unsafe fn handle_decompose_buffer_result(
    res_code: isize,
    dest: &mut MaybeUninitSlice<MaybeMarkerCodepoint>,
) -> Result<SplitInitBuffer<'_, MaybeMarkerCodepoint>, TransformBufferError> {
    if res_code < 0 {
        Err(crate::Error::from_code(res_code).into())
    } else {
        let res_length = res_code.cast_unsigned();
        let original_len = dest.len();
        match dest.split_at_mut_checked(res_length) {
            None => Err(TransformBufferError::InsufficientSpace(InsufficientSpaceError {
                needed_space: res_length,
                actual_space: original_len,
            })),
            Some((initialized_part, uninit_part)) => {
                Ok((
                    // SAFETY: Guaranteed to be initialized
                    unsafe { MaybeUninitSliceExt::assume_init_mut(initialized_part) },
                    uninit_part,
                ))
            }
        }
    }
}

/// The maximum length of the result from the [`decompose_char`] function.
///
/// This length is implicitly used by the simple interface ([`super::decompose_char`]),
/// which returns an iterator using a fixed-length buffer.
///
/// The space actually required may be larger than this constant
/// if certain options are enabled.
/// See [`decompose_char`] for details.
pub const MAX_DECOMPOSE_CHAR_LENGTH: usize = 4;

/// Decompose a codepoint into an array of codepoints.
///
/// Since this is the advanced interface,
/// the result may not be valid Unicode,
/// and could contain a [`SpecialMarker`].
///
/// On success, returns the initialized prefix of `dest` together with the unused remainder.
///
/// The [`BoundaryState`] is only used for the [`TransformOptions::grapheme_boundary_markers`],
/// and can be `None` otherwise.
/// When provided, it is updated with whatever state utf8proc leaves behind.
///
/// A buffer of length [`MAX_DECOMPOSE_CHAR_LENGTH`] length should never fail
/// unless the [`TransformOptions::grapheme_boundary_markers`] is enabled.
/// In that case double this space is needed.
///
/// ## Errors
/// Returns [`TransformBufferError::InsufficientSpace`] if `dest` is too short,
/// or [`TransformBufferError::Other`] if utf8proc reports an error.
#[cfg_attr(feature = "inline-more", inline)] // thin ffi wrapper - potential for constant folding
pub fn decompose_char<'a>(
    codepoint: char,
    dest: &'a mut MaybeUninitSlice<MaybeMarkerCodepoint>,
    options: &TransformOptions,
    boundary_state: Option<&mut BoundaryState>,
) -> Result<SplitInitBuffer<'a, MaybeMarkerCodepoint>, TransformBufferError> {
    // SAFETY: Don't care about UTF8 validity
    let options = unsafe { options.to_ffi() };
    // utf8proc tracks the boundary state in a C `int`, whereas `BoundaryState` stores an `isize`.
    // Copy the value through a local `c_int` instead of reinterpreting the `isize` in place:
    // a `*mut c_int` aimed at the wider field would address its high-order bytes
    // on 64-bit big-endian targets, corrupting the publicly visible value.
    #[allow(
        clippy::cast_possible_truncation,
        reason = "valid states originate from utf8proc as a C int"
    )]
    let mut raw_state: c_int = boundary_state
        .as_deref()
        .map_or(0, |state| state.last_bound_class as c_int);
    let state_ptr: *mut c_int = if boundary_state.is_some() {
        std::ptr::from_mut(&mut raw_state)
    } else {
        std::ptr::null_mut()
    };
    // SAFETY: Passed valid pointer/length, state pointer is either null or a live local
    let res_code = unsafe {
        utf8proc_sys::utf8proc_decompose_char(
            codepoint as i32,
            dest.as_mut_ptr().cast::<i32>(),
            dest.len().cast_signed(),
            options,
            state_ptr, // okay if null
        )
    };
    // Publish the state back to the caller, regardless of whether the call succeeded
    if let Some(state) = boundary_state {
        #[allow(
            clippy::cast_possible_truncation,
            reason = "a C int fits in isize on every supported target"
        )]
        let updated_state = raw_state as isize;
        state.last_bound_class = updated_state;
    }
    // SAFETY: Result correctly indicates length or error
    unsafe { handle_decompose_buffer_result(res_code, dest) }
}

/// Decompose a string into a fixed-size buffer.
///
/// On success, returns the initialized prefix of `dest` together with the unused remainder.
///
/// Since this is the advanced interface,
/// the result may not be valid Unicode.
/// The input is not statically required to be valid UTF8 either,
/// and invalid UTF8 will return a [`ErrorKind::InvalidUtf8`].
///
/// May apply a user-specified transformation to each codepoint,
/// before utf8proc does its own transformations.
///
/// ## Errors
/// Returns [`TransformBufferError::InsufficientSpace`] if `dest` is too short,
/// or [`TransformBufferError::Other`] if utf8proc reports an error.
#[cfg_attr(feature = "inline-more", inline)] // thin ffi wrapper - potential for constant folding
pub fn decompose_buffer<'a>(
    text: &BStr,
    dest: &'a mut MaybeUninitSlice<MaybeMarkerCodepoint>,
    options: &TransformOptions,
    mut func: Option<TransformCallback>,
) -> Result<SplitInitBuffer<'a, MaybeMarkerCodepoint>, TransformBufferError> {
    // SAFETY: callback trusted to be used correctly
    let (trampoline, trampoline_data) = unsafe { convert_callback(&mut func) };
    // SAFETY: No assumption about UTF8 validity
    let ffi_options = unsafe { options.to_ffi() };
    let (input_ptr, input_len) = (text.as_ptr(), text.len().cast_signed());
    let (output_ptr, output_capacity) = (dest.as_mut_ptr().cast::<i32>(), dest.len().cast_signed());
    // SAFETY: Passed valid pointer/length and callback, result is either UTF32 or marker
    let res_code = unsafe {
        utf8proc_sys::utf8proc_decompose_custom(
            input_ptr,
            input_len,
            output_ptr,
            output_capacity,
            ffi_options,
            trampoline,
            trampoline_data,
        )
    };
    // SAFETY: Result correctly indicates length or error
    unsafe { handle_decompose_buffer_result(res_code, dest) }
}

/// Apply a transformation to a string, indicated by the [`TransformOptions`],
/// writing the result into the specified destination byte-buffer.
///
/// Since this is the advanced interface,
/// the result may not be valid Unicode.
/// The input is not statically required to be valid UTF8 either,
/// and invalid UTF8 will return a [`ErrorKind::InvalidUtf8`].
///
/// May apply a user-specified transformation to each codepoint,
/// before utf8proc does its own transformations.
/// The callback is expected to be deterministic.
/// If not, it could trigger unexpected panics (but not undefined behavior).
///
/// Implicitly allocates necessary space, so a [`InsufficientSpaceError`] is impossible.
///
/// ## Implementation
/// This method is behaviorally equivalent to the [`utf8proc_map_custom`] function in the C library,
/// but is reimplemented to have a couple major advantages:
/// - Reuses a destination buffer instead of freshly allocating each time
/// - Avoids calling [`decompose_buffer`] twice if buffer is already of sufficient length
/// - Does not require the input buffer to be word-aligned
/// - Uses rust allocator instead of C allocator
/// - Does not implicitly add null terminator
///
/// [`utf8proc_map_custom`]: utf8proc_sys::utf8proc_map_custom
/*
 * Not marked as #[inline], because there are multiple FFI calls involved.
 * Allocation and copying will likely dwarf smaller costs.
 *
 * TODO: This implementation has a lot of unsafe code.
 * How can we reduce usage of unsafe code?
 * Almost all of it comes from trying to do everything in-place at the end.
 * Is that really worth the safety cost?
 * Honestly, it doesn't seem that much worse than what C does all the time.
 * It's just annoying we need to sprinkle unsafe everywhere.
 */
pub fn map_into(
    text: &BStr,
    dest: &mut Vec<u8>,
    options: &TransformOptions,
    mut func: Option<TransformCallback>,
) -> Result<(), crate::Error> {
    #[inline]
    fn buffer_from_uninit_vec(vec: &mut Vec<u8>) -> &mut MaybeUninitSlice<u8> {
        // SAFETY: Safe to access the uninitialized elements of a vec,
        // the lifetime guarantees temporal validity
        unsafe {
            std::slice::from_raw_parts_mut(
                vec.as_mut_ptr().add(vec.len()).cast::<MaybeUninit<u8>>(),
                vec.capacity() - vec.len(),
            )
        }
    }
    /// I am running into borrow-checker issues having issues with the callback
    /// being used twice (its a mutable reference).
    ///
    /// Since I can't solve these issues, I've come up with the next best thing:
    /// Another layer of indirection.
    #[inline]
    fn callback_add_indirection<'a>(func: &'a mut Option<TransformCallback>) -> Option<TransformCallback<'a>> {
        match *func {
            None => None,
            Some(ref mut callback) => Some(callback as TransformCallback<'a>),
        }
    }
    // either points to a buffer filled with the decoded codepoints,
    // or an error that indicates more space is needed(
    //
    // This is its own block for lifetime purposes.
    // A Vec::reserve call could invalidate the old &mut [...] buffer
    let decomposed_codepoints: Result<*mut [MaybeMarkerCodepoint], InsufficientSpaceError> = {
        // SAFETY: valid to cast from u8 -> u32, subject to alignment
        let (_, codepoint_buffer, _) =
            unsafe { buffer_from_uninit_vec(dest).align_to_mut::<MaybeUninit<MaybeMarkerCodepoint>>() };
        let func = callback_add_indirection(&mut func);
        match decompose_buffer(text, codepoint_buffer, options, func) {
            Ok((valid_codepoints, _)) => Ok(std::ptr::from_mut(valid_codepoints)),
            Err(TransformBufferError::InsufficientSpace(space_error)) => Err(space_error),
            Err(TransformBufferError::Other(cause)) => return Err(cause),
        }
    };
    // a buffer filled with the decoded components
    //
    // can not run out of space, because
    let decomposed_codepoints = match decomposed_codepoints {
        Ok(buffer_ptr) => buffer_ptr, // nothing needed
        Err(InsufficientSpaceError {
            needed_space: needed_elements,
            actual_space: _,
        }) => {
            // need alignment - 1 potential adding bytes,
            // do not need null terminator unlike the C code
            const WORST_CASE_OVERHEAD_BYTES: usize = (align_of::<MaybeMarkerCodepoint>() - 1) + 1;
            let needed_bytes = needed_elements
                .checked_mul(size_of::<MaybeMarkerCodepoint>())
                .and_then(|bytes| bytes.checked_add(WORST_CASE_OVERHEAD_BYTES))
                .expect("needed size overflowed as usize");
            dest.reserve(needed_bytes);
            // SAFETY: valid to cast from u8 -> u32, subject to alignment
            let (_prefix_bytes, codepoint_buffer, _suffix_bytes) =
                unsafe { buffer_from_uninit_vec(dest).align_to_mut::<MaybeUninit<MaybeMarkerCodepoint>>() };
            // possible if there is a bug on our end, or a non-deterministic callback
            assert!(codepoint_buffer.len() >= needed_elements, "allocated less than needed");
            let func = callback_add_indirection(&mut func);
            match decompose_buffer(text, codepoint_buffer, options, func) {
                Ok((valid_codepoints, _)) => valid_codepoints as *mut [_],
                Err(TransformBufferError::InsufficientSpace(space_error)) => {
                    unreachable!("insufficient space after allocating {needed_elements}: {space_error}")
                }
                Err(TransformBufferError::Other(cause)) => return Err(cause),
            }
        }
    };
    // Now normalize decoded codepoints in-place
    {
        // SAFETY: Looking at source, input appears to accept markers in practice
        // TODO: Open an issue upstream to get this behavior documented,
        // then we can switch to using a safe wrapper
        let res_code = unsafe {
            utf8proc_sys::utf8proc_normalize_utf32(
                decomposed_codepoints.cast::<i32>(),
                decomposed_codepoints.len().cast_signed(),
                // SAFETY: Don't care about UTF8 validity here, markers are acceptable
                options.to_ffi(),
            )
        };
        if res_code < 0 {
            return Err(crate::Error::from_code(res_code));
        }
        let normalized_codepoints_len = res_code.cast_unsigned();
        assert!(
            normalized_codepoints_len <= decomposed_codepoints.len(),
            "normalized length can shrink but not grow"
        );
        // now convert from codepoints to UTF8 in-place
        // Since we are using the same buffer, we have to be really careful
        // and can't use &mut slices because that requires exclusive access
        {
            let src_start = decomposed_codepoints.cast::<MaybeMarkerCodepoint>().cast_const();
            // SAFETY: Length is in bounds
            let src_end = unsafe { src_start.add(normalized_codepoints_len) };
            // SAFETY: Pointer in bounds
            let dest_start = unsafe { dest.as_mut_ptr().add(dest.len()) };
            // SAFETY: Capacity is in bounds, and indicates end of allocated data
            let dest_end = unsafe { dest_start.add(dest.capacity()) };
            assert!(dest_start <= dest_end);
            // The destination potentially points before the source due to alignment,
            // but never points later
            assert!(dest_start.cast_const() <= src_start.cast::<u8>());
            let mut src_current = src_start;
            let mut dest_current = dest_start;
            while src_current < src_end {
                // SAFETY: Checked in loop condition the pointer is in bounds
                let src_entry = unsafe { src_current.read() };
                // SAFETY: Checked in loop condition the pointer is in bounds
                src_current = unsafe { src_current.add(1) };
                // SAFETY: current pointer is always less than end pointer
                let dest_remaining_len = unsafe { dest_end.offset_from_unsigned(dest_current) };
                // This should never happen, but prefer assert to UB
                assert!(
                    dest_remaining_len >= MaybeMarkerCodepoint::MAX_LEN_UTF8,
                    "not enough space left to write entry"
                );
                // creating a &mut slice for a block scope is fine,
                // as long as we are not reading while the reference is live
                {
                    // SAFETY: Checked length is sufficient, dest pointer is valid
                    let buffer =
                        unsafe { std::slice::from_raw_parts_mut(dest_current, MaybeMarkerCodepoint::MAX_LEN_UTF8) };
                    let written_len = src_entry.encode_utf8(buffer);
                    assert!(written_len <= MaybeMarkerCodepoint::MAX_LEN_UTF8);
                    // SAFETY: Verified length is in-bounds
                    unsafe { dest_current = dest_current.add(written_len) };
                    assert!(dest_current.cast_const() <= src_current.cast::<u8>());
                }
            }
            // SAFETY: The dest_current pointer is in-bounds
            let written_len = unsafe { dest_current.offset_from_unsigned(dest_start) };
            // add this length to the value of the
            // SAFETY: All within the allocated buffer,
            // and we just initialized it
            unsafe {
                dest.set_len(dest.len().unchecked_add(written_len));
            }
            Ok(())
        }
    }
}

/// Convert a rust-style [`TransformCallback`] into a C-style [`utf8proc_sys::utf8proc_custom_func`].
///
/// Returns the C function pointer together with the opaque data pointer
/// that must accompany it. Both are null/`None` when there is no callback.
///
/// A trait-object reference is a fat pointer, which does not fit in a `void*`.
/// The data pointer therefore points *at the reference itself* (double indirection).
///
/// ## Safety
/// While invoking this function is technically safe, the returned callback is highly unsafe.
///
/// Caller must guarantee that the lifetime of the `&mut Option<&mut dyn FnMut(...)>`
/// will be live whenever the callback is invoked.
/// This includes both pointers, the outer one and the inner one.
/// The callback must only be passed valid Unicode codepoints,
/// that can be represented as a rust [`char`].
/// Whenever the callback is invoked from C, the data pointer must be preserved as-is,
/// and not tampered with or changed.
pub(crate) unsafe fn convert_callback(func: &mut Option<TransformCallback>) -> (utf8proc_custom_func, *mut c_void) {
    type TrampolineCallbackData<'a> = &'a mut dyn FnMut(char) -> char;
    /// The C-compatible entry point, which forwards to the rust closure behind `data`.
    unsafe extern "C" fn callback_trampoline(orig: i32, data: *mut c_void) -> i32 {
        let callback_slot = data.cast::<TrampolineCallbackData<'static>>();
        // SAFETY: Caller is trusted to preserve the `data` pointer as-is.
        let callback = unsafe { callback_slot.read() };
        // SAFETY: Caller guarantees that codepoint is valid
        let input = unsafe { char::from_u32_unchecked(orig.cast_unsigned()) };
        let output: char = callback(input);
        output as i32
    }
    let Some(callback) = func.as_mut() else {
        // no callback: utf8proc accepts a null function with null data
        return (None, std::ptr::null_mut());
    };
    let data = std::ptr::from_mut::<TrampolineCallbackData>(callback).cast::<c_void>();
    (Some(callback_trampoline), data)
}

#[cfg(test)]
mod test {
    use crate::transform::advanced::{MaybeMarkerCodepoint, SpecialMarker};

    /// Checks the boundaries of the raw conversion,
    /// and that a marker survives a round trip through [`MaybeMarkerCodepoint`].
    #[test]
    fn maybe_marker_codepoint_conversions() {
        let largest_char = char::MAX as u32;
        // one past the largest char is neither a char nor a marker
        assert_eq!(MaybeMarkerCodepoint::from_u32(largest_char + 1), None);
        let expected_largest = MaybeMarkerCodepoint::from(char::MAX);
        assert_eq!(MaybeMarkerCodepoint::from_u32(largest_char), Some(expected_largest));
        let boundary = MaybeMarkerCodepoint::from(SpecialMarker::GraphemeBoundary);
        assert_eq!(boundary.to_marker(), Ok(SpecialMarker::GraphemeBoundary));
    }
}