utf8proc 0.1.2

Rust bindings to the utf8proc library
Documentation
//! Defines [`TransformOptions`] and related types.

#[allow(unused_imports, reason = "used by docs")]
use super::advanced;
#[allow(unused_imports, reason = "used by docs")]
use super::{UnicodeNormalizationForm, decompose_buffer, decompose_char, map};
#[allow(unused_imports, reason = "used by docs")]
use crate::ErrorKind;
#[allow(unused_imports, reason = "used by docs")]
use utf8proc_sys::utf8proc_option_t;

/// Options for the [`map`], [`decompose_buffer`], and [`decompose_char`] functions.
///
/// Used to flexibly support multiple transformations
/// through a single interface.
///
/// Some options are specific to composition/decomposition,
/// and are stored in [`CompositionOptions`].
///
/// ## Limitation
/// Certain options are only supported in the [advanced] interface,
/// because they have the potential to produce invalid UTF8.
///
/// This currently includes the [`grapheme_boundary_markers`](Self::grapheme_boundary_markers) option,
/// and [`unassigned_codepoint_handling`](Self::unassigned_codepoint_handling) set to [`UnassignedCodepointHandling::Allow`].
#[derive(Clone, Debug, Default)]
#[non_exhaustive]
#[must_use]
pub struct TransformOptions {
    /// Specify how to handle unassigned codepoints.
    ///
    /// By default, this is set to [`UnassignedCodepointHandling::Forbid`].
    pub unassigned_codepoint_handling: UnassignedCodepointHandling,
    /// Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE.
    ///
    /// This is equivalent to the [`UTF8PROC_IGNORE`] option in the C library.
    ///
    /// [`UTF8PROC_IGNORE`]: utf8proc_option_t::UTF8PROC_IGNORE
    pub ignore: bool,
    /// Apply Unicode case-folding,
    /// to be able to do a case-insensitive
    /// string comparison.
    ///
    /// This is equivalent to the [`UTF8PROC_CASEFOLD`] option in the C library.
    ///
    /// [`UTF8PROC_CASEFOLD`]: utf8proc_option_t::UTF8PROC_CASEFOLD
    pub case_fold: bool,
    /// Inserts marker values at the beginning of each sequence which is representing
    /// a single grapheme cluster (see UAX#29).
    ///
    /// This is only usable in the [`advanced`] interface,
    /// because it produces invalid UTF8 or codepoints.
    /// Using this option in the simple interface *will panic*.
    ///
    /// The same functionality is also available through the [`crate::grapheme`] module.
    ///
    /// This is equivalent to the [`UTF8PROC_CHARBOUND`] option in the C library.
    ///
    /// [`UTF8PROC_CHARBOUND`]: utf8proc_option_t::UTF8PROC_CHARBOUND
    pub grapheme_boundary_markers: bool,
    /// Replace certain characters with their compatibility decomposition.
    ///
    /// This is used to implement [NFKD] and [NFKC] Unicode normalization.
    ///
    /// This is equivalent to the [`UTF8PROC_COMPAT`] option in the C library.
    ///
    /// [`UTF8PROC_COMPAT`]: utf8proc_option_t::UTF8PROC_COMPAT
    /// [NFKD]: UnicodeNormalizationForm::NFKD
    /// [NFKC]: UnicodeNormalizationForm::NFKC
    pub compat: bool,
    /// If not `None`, enables Unicode composition or decomposition of characters.
    ///
    /// Use [`CompositionOptions::compose`] and [`CompositionOptions::decompose`]
    /// for default compose/decompose options.
    ///
    /// Equivalent to either [`UTF8PROC_COMPOSE`] or [`UTF8PROC_DECOMPOSE`] in the C library,
    /// depending on the [`CompositionDirection`].
    ///
    /// [`UTF8PROC_COMPOSE`]: utf8proc_option_t::UTF8PROC_COMPOSE
    /// [`UTF8PROC_DECOMPOSE`]: utf8proc_option_t::UTF8PROC_DECOMPOSE
    pub composition: Option<CompositionOptions>,
    /// Lump certain characters together.
    ///
    /// For example, HYPHEN U+2010 and MINUS U+2212 are converted to ASCII "-".
    /// Documented in [`lump.md`] in the utf8proc repository (link valid as of version v2.10.0).
    ///
    /// If the [`nlf_conversion`](Self::nlf_conversion) option is set,
    /// this includes a transformation of paragraph and
    /// line separators to ASCII line-feed (LF).
    ///
    /// This is equivalent to the [`UTF8PROC_LUMP`] option in the C library.
    ///
    /// [`UTF8PROC_LUMP`]: utf8proc_option_t::UTF8PROC_LUMP
    /// [`lump.md`]: https://github.com/JuliaStrings/utf8proc/blob/v2.10.0/lump.md
    pub lump: bool,
    /// Customize the conversion of NLF-sequences (LF, CRLF, CR, NEL).
    ///
    /// If this is `None`, no conversions are applied.
    /// Can be used to customize the [`strip_control_codes`](Self::strip_control_codes) option.
    pub nlf_conversion: Option<NlfConversionMode>,
    /// Strips and/or converts control characters.
    ///
    /// NLF-sequences are transformed into spaces, unless the
    /// [`nlf_conversion`](Self::nlf_conversion) option is specified.
    /// `HorizontalTab` (HT) and `FormFeed` (FF)
    /// are treated as a NLF-sequence in this case.
    /// All other control characters are simply removed.
    ///
    /// This is equivalent to the [`UTF8PROC_STRIPCC`] option in the C library.
    ///
    /// [`UTF8PROC_STRIPCC`]: utf8proc_option_t::UTF8PROC_STRIPCC
    pub strip_control_codes: bool,
    /// Prohibit combining characters that would violate [Unicode versioning stability].
    ///
    /// This is equivalent to the [`UTF8PROC_STABLE`] option in the C library.
    ///
    /// [`UTF8PROC_STABLE`]: utf8proc_option_t::UTF8PROC_STABLE
    /// [Unicode versioning stability]: https://www.unicode.org/policies/stability_policy.html
    pub stable: bool,
}
impl TransformOptions {
    /// Panic if options are used that could produce non-UTF8 data.
    ///
    /// These are only allowed in the [advanced] interface.
    #[track_caller]
    #[inline] // potential to be constant-folded
    pub(crate) fn validate_utf8(&self) -> &Self {
        assert!(
            !self.grapheme_boundary_markers,
            "Enabling `grapheme_boundary_markers` is forbidden in the simple interface",
        );
        match self.unassigned_codepoint_handling {
            UnassignedCodepointHandling::Forbid | UnassignedCodepointHandling::Strip => { /* acceptable */ }
            UnassignedCodepointHandling::Allow => {
                panic!("Setting `unassigned_codepoint_handling=Allow` is forbidden in the simple interface")
            }
        }
        self
    }
    /// Convert this into a FFI option.
    ///
    /// The returned option should be semantically valid,
    /// and will not trigger a [`ErrorKind::InvalidOptions`] error.
    ///
    /// ## Safety
    /// Certain options have the potential to produce non-UTF8 data,
    /// which will trigger undefined behavior if passed to [`std::str::from_utf8_unchecked`].
    ///
    /// Call [`Self::validate_utf8`] to make sure these options are not present,
    /// and if you allow them don't later convert to UTF8.
    ///
    /// This function itself can not trigger undefined behavior,
    /// but may invalidate future assumptions (see above)
    #[track_caller]
    #[inline]
    #[deny(unused_variables)]
    pub(crate) unsafe fn to_ffi(&self) -> utf8proc_option_t {
        let TransformOptions {
            unassigned_codepoint_handling,
            ignore,
            case_fold,
            grapheme_boundary_markers,
            compat,
            ref composition,
            lump,
            nlf_conversion,
            strip_control_codes,
            stable,
        } = *self;
        let mut res = utf8proc_option_t::NONE;
        res |= match unassigned_codepoint_handling {
            UnassignedCodepointHandling::Forbid => utf8proc_option_t::UTF8PROC_REJECTNA,
            UnassignedCodepointHandling::Strip => utf8proc_option_t::UTF8PROC_STRIPNA,
            UnassignedCodepointHandling::Allow => utf8proc_option_t::NONE,
        };
        if ignore {
            res |= utf8proc_option_t::UTF8PROC_IGNORE;
        }
        if case_fold {
            res |= utf8proc_option_t::UTF8PROC_CASEFOLD;
        }
        if grapheme_boundary_markers {
            res |= utf8proc_option_t::UTF8PROC_CHARBOUND;
        }
        if compat {
            res |= utf8proc_option_t::UTF8PROC_COMPAT;
        }
        if let Some(composition) = composition {
            res |= match composition.direction {
                CompositionDirection::Compose => utf8proc_option_t::UTF8PROC_COMPOSE,
                CompositionDirection::Decompose => utf8proc_option_t::UTF8PROC_DECOMPOSE,
            }
        }
        if lump {
            res |= utf8proc_option_t::UTF8PROC_LUMP;
        }
        res |= match nlf_conversion {
            None => utf8proc_option_t::NONE,
            Some(NlfConversionMode::LineSeparation) => utf8proc_option_t::UTF8PROC_NLF2LS,
            Some(NlfConversionMode::ParagraphSeparator) => utf8proc_option_t::UTF8PROC_NLF2PS,
            Some(NlfConversionMode::Unknown) => utf8proc_option_t::UTF8PROC_NLF2LF,
        };
        if strip_control_codes {
            res |= utf8proc_option_t::UTF8PROC_STRIPCC;
        }
        if stable {
            res |= utf8proc_option_t::UTF8PROC_STABLE;
        }
        res
    }
}

/// Indicates how to handle unassigned codepoints.
///
/// The default is [`Forbid`](Self::Forbid).
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, Default)]
#[must_use]
pub enum UnassignedCodepointHandling {
    /// Return an [`ErrorKind::NotAssigned`] error if an unassigned codepoint is encountered.
    ///
    /// This corresponds to the [`UTF8PROC_REJECTNA`] option in the C library.
    ///
    /// [`UTF8PROC_REJECTNA`]: utf8proc_option_t::UTF8PROC_REJECTNA
    #[default]
    Forbid,
    /// Remove unassigned codepoints.
    ///
    /// This corresponds to the [`UTF8PROC_STRIPNA`] option in the C library.
    ///
    /// [`UTF8PROC_STRIPNA`]: utf8proc_option_t::UTF8PROC_STRIPNA
    Strip,
    /// Allow unassigned codepoints, without returning an error or ignoring them.
    ///
    /// This corresponds to setting neither [`UTF8PROC_REJECTNA`] nor [`UTF8PROC_STRIPNA`].
    ///
    /// This option can only be set using the [advanced] interface,
    /// as unassigned codepoints have the potential to produce invalid UTF8.
    ///
    /// [`UTF8PROC_REJECTNA`]: utf8proc_option_t::UTF8PROC_REJECTNA
    /// [`UTF8PROC_STRIPNA`]: utf8proc_option_t::UTF8PROC_STRIPNA
    Allow,
}

/// Controls Unicode composition and decomposition.
///
/// There is no type-wide default, because you must choose a direction.
/// Use [`Self::compose`] or [`Self::decompose`] instead.
#[derive(Clone, Debug)]
#[non_exhaustive]
#[must_use]
pub struct CompositionOptions {
    /// Whether composition or decomposition should be performed.
    pub direction: CompositionDirection,
    /// Strips all character markings.
    ///
    /// This includes non-spacing, spacing and enclosing (i.e. accents).
    ///
    /// This is equivalent to the [`UTF8PROC_STRIPMARK`] option in the C library.
    ///
    /// [`UTF8PROC_STRIPMARK`]: utf8proc_option_t::UTF8PROC_STRIPMARK
    pub strip_marks: bool,
}
impl CompositionOptions {
    /// Enable composition, with no additional options.
    #[inline]
    pub const fn compose() -> CompositionOptions {
        CompositionOptions {
            direction: CompositionDirection::Compose,
            strip_marks: false,
        }
    }

    /// Enable decomposition, with no additional options.
    #[inline]
    pub const fn decompose() -> CompositionOptions {
        CompositionOptions {
            direction: CompositionDirection::Decompose,
            ..Self::compose()
        }
    }
}
/// Controls whether composition or decomposition is being performed.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
#[must_use]
pub enum CompositionDirection {
    /// Enable composition, recomposing characters by canonical equivalence.
    ///
    /// This is equivalent to the [`UTF8PROC_COMPOSE`] option in the C library.
    ///
    /// [`UTF8PROC_COMPOSE`]: utf8proc_option_t::UTF8PROC_COMPOSE
    Compose,
    /// Enable decomposition, decomposing characters by canonical equivalence.
    ///
    /// This is equivalent to the [`UTF8PROC_DECOMPOSE`] option in the C library.
    ///
    /// [`UTF8PROC_DECOMPOSE`]: utf8proc_option_t::UTF8PROC_DECOMPOSE
    Decompose,
}

/// Indicates how NLF-sequences (LF, CRLF, CR, NEL) should be converted.
///
/// Selected through [`TransformOptions::nlf_conversion`];
/// leaving that field as `None` disables NLF conversion entirely.
#[derive(Copy, Clone, Debug)]
#[non_exhaustive]
#[must_use]
pub enum NlfConversionMode {
    /// Indicates that NLF-sequences are representing a
    /// line break, and should be converted to the codepoint for line
    /// separation (LS).
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2LS`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2LS`]: utf8proc_option_t::UTF8PROC_NLF2LS
    LineSeparation,
    /// Indicates that NLF-sequences are representing a paragraph break, and
    /// should be converted to the codepoint for paragraph separation (PS).
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2PS`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2PS`]: utf8proc_option_t::UTF8PROC_NLF2PS
    ParagraphSeparator,
    /// Indicates that the meaning of NLF-sequences is unknown.
    ///
    /// Note that this option is distinct from disabling NLF conversion.
    ///
    /// This is equivalent to the [`UTF8PROC_NLF2LF`] option in the C library.
    ///
    /// [`UTF8PROC_NLF2LF`]: utf8proc_option_t::UTF8PROC_NLF2LF
    Unknown,
}