utf8proc 0.1.2

Rust bindings to the utf8proc library
Documentation
//! Defines the [`CharProperties`] type,
//! which contains full information on a Unicode codepoint.

use num_enum::TryFromPrimitive;
use std::fmt::{Display, Formatter, Write};
use utf8proc_sys::utf8proc_category_t;

/// Properties for a Unicode codepoint.
///
/// This is a thin handle around a `'static` reference to the property record
/// provided by `utf8proc_sys`, so it is cheap to copy and pass by value.
#[derive(Clone, Copy)]
pub struct CharProperties {
    /// The FFI property record that every accessor reads from.
    info: &'static utf8proc_sys::utf8proc_property_t,
}
impl CharProperties {
    /// Return a reference to the underlying FFI property.
    ///
    /// Can be used for any properties the Rust interface doesn't currently support.
    #[inline]
    pub fn as_ffi_property(&self) -> &'static utf8proc_sys::utf8proc_property_t {
        self.info
    }

    /// Return the properties for the specified character.
    #[inline]
    pub fn for_char(c: char) -> CharProperties {
        // A `char` is at most U+10FFFF, so the conversion to `i32` is lossless.
        let codepoint = c as i32;
        // SAFETY: The lookup takes the codepoint by value and has no other
        // preconditions, so calling it cannot violate memory safety.
        let ptr = unsafe { utf8proc_sys::utf8proc_get_property(codepoint) };
        debug_assert!(
            !ptr.is_null(),
            "utf8proc_get_property returned a null pointer"
        );
        // SAFETY: We trust the library to return a valid, non-null pointer to
        // a property record that lives for the rest of the program
        // (the utf8proc documentation describes it as a constant struct).
        let info = unsafe { &*ptr };
        CharProperties { info }
    }

    /// Return the Unicode category for the codepoint.
    ///
    /// Roughly corresponds to the `utf8proc_category` function.
    ///
    /// # Panics
    ///
    /// Panics if the category stored in the property record does not
    /// correspond to any [`GeneralCategory`] variant.
    /// That would indicate a mismatch between these bindings and the library.
    #[inline]
    pub fn general_category(&self) -> GeneralCategory {
        let raw = self.info.category.cast_unsigned() as u32;
        GeneralCategory::try_from_primitive(raw)
            .expect("utf8proc reported a category with no matching `GeneralCategory` variant")
    }

    /// Return the major category corresponding to the general category.
    #[inline]
    pub fn major_category(&self) -> MajorCategory {
        self.general_category().major_category()
    }

    /// Return the displayed width of the character,
    /// or `None` if the library reports a width of zero
    /// (which it does for characters that are not printable).
    ///
    /// Prefer [`Self::general_category`] to check if a character is printable.
    ///
    /// The underlying value is analogous to the result of the C [wcwidth] function.
    /// This functionality is not always equivalent to that of the [`unicode-width`] crate.
    /// They currently differ in around 7% of cases.
    /// Unicode is hard.
    ///
    /// [wcwidth]: https://man.archlinux.org/man/wcwidth.3
    /// [`unicode-width`]: https://docs.rs/unicode-width/latest/unicode_width/
    #[inline]
    pub fn char_width(&self) -> Option<usize> {
        match self.info.charwidth() {
            // A zero width is surfaced as "no width" rather than `Some(0)`.
            0 => None,
            width => Some(width as usize),
        }
    }
}

// Declares a public enum whose discriminants are taken from FFI values.
//
// The invocation must start with two attributes, in this order:
// - `#[category(ffi = F, prefix = P)]`: for each variant `V`, the discriminant
//   is read from the `.0` field of the item `F::P_V`.
// - `#[repr(T)]`: the primitive representation, forwarded to the enum.
//
// Any further attributes (including doc comments) are forwarded unchanged.
//
// Besides the enum itself (which always derives `TryFromPrimitive`),
// the macro generates:
// - a public `ALL` constant listing every variant in declaration order;
// - a private `declared_name` method returning the variant identifier as text.
macro_rules! define_property_enum {
    (
        #[category(ffi = $ffi:ident, prefix = $prefix:ident)]
        #[repr($repr_type:ty)]
        $(#[$outer_attr:meta])*
        pub enum $name:ident {
            $(
                $(#[$variant_attr:meta])*
                $variant:ident,
            )*
        }
    ) => {
        // `paste` is needed to glue `$prefix`, `_` and `$variant` into one identifier.
        paste::paste! {
            #[derive(TryFromPrimitive)]
            #[repr($repr_type)]
            $(#[$outer_attr])*
            pub enum $name {
                $(
                    $(#[$variant_attr])*
                    // The discriminant mirrors the FFI value, so raw values
                    // coming from the library convert directly into variants.
                    $variant = $ffi :: [< $prefix _ $variant>].0,
                )*
            }
            impl $name {
                /// All possible values.
                pub const ALL: &'static [Self] = &[
                    $(Self::$variant,)*
                ];
                // The variant identifier exactly as written in the declaration.
                #[inline]
                fn declared_name(&self) -> &'static str {
                    match *self {
                        $(Self::$variant => stringify!($variant),)*
                    }
                }
            }
        }
    };
}
define_property_enum! {
    #[category(ffi = utf8proc_category_t, prefix = UTF8PROC_CATEGORY)]
    #[repr(u32)]
    /// The general category of a Unicode codepoint.
    ///
    /// Each variant is named after the two-letter abbreviation of the category,
    /// where the first letter identifies the [`MajorCategory`].
    #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
    pub enum GeneralCategory {
        /// Other, not assigned
        CN,
        /// Letter, uppercase
        LU,
        /// Letter, lowercase
        LL,
        /// Letter, titlecase
        LT,
        /// Letter, modifier
        LM,
        /// Letter, other
        LO,
        /// Mark, non-spacing
        MN,
        /// Mark, spacing combining
        MC,
        /// Mark, enclosing
        ME,
        /// Number, decimal-digit
        ND,
        /// Number, letter
        NL,
        /// Number, other
        NO,
        /// Punctuation, connector
        PC,
        /// Punctuation, dash
        PD,
        /// Punctuation, open
        PS,
        /// Punctuation, close
        PE,
        /// Punctuation, initial quote
        PI,
        /// Punctuation, final quote
        PF,
        /// Punctuation, other
        PO,
        /// Symbol, math
        SM,
        /// Symbol, currency
        SC,
        /// Symbol, modifier
        SK,
        /// Symbol, other
        SO,
        /// Separator, space
        ZS,
        /// Separator, line
        ZL,
        /// Separator, paragraph
        ZP,
        /// Other, control
        CC,
        /// Other, format
        CF,
        /// Other, surrogate
        CS,
        /// Other, private use
        CO,
    }
}
impl GeneralCategory {
    /// The unicode-assigned name of the category.
    #[inline]
    pub fn name(&self) -> &'static str {
        self.declared_name()
    }

    /// The major category that corresponds to this category.
    #[inline]
    pub fn major_category(&self) -> MajorCategory {
        match self {
            GeneralCategory::LU
            | GeneralCategory::LL
            | GeneralCategory::LT
            | GeneralCategory::LM
            | GeneralCategory::LO => MajorCategory::Letter,
            GeneralCategory::MN | GeneralCategory::MC | GeneralCategory::ME => MajorCategory::Mark,
            GeneralCategory::ND | GeneralCategory::NL | GeneralCategory::NO => MajorCategory::Number,
            GeneralCategory::PC
            | GeneralCategory::PD
            | GeneralCategory::PS
            | GeneralCategory::PE
            | GeneralCategory::PI
            | GeneralCategory::PF
            | GeneralCategory::PO => MajorCategory::Punctuation,
            GeneralCategory::SM | GeneralCategory::SC | GeneralCategory::SK | GeneralCategory::SO => {
                MajorCategory::Symbol
            }
            GeneralCategory::ZS | GeneralCategory::ZL | GeneralCategory::ZP => MajorCategory::Separator,
            GeneralCategory::CN
            | GeneralCategory::CC
            | GeneralCategory::CF
            | GeneralCategory::CS
            | GeneralCategory::CO => MajorCategory::Other,
        }
    }
}
/// Formats the category as its [name](GeneralCategory::name).
impl Display for GeneralCategory {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let label: &'static str = self.name();
        f.write_str(label)
    }
}

/// The coarse "major category" that a [`GeneralCategory`] belongs to.
///
/// Every "cased letter" is folded into plain [`MajorCategory::Letter`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MajorCategory {
    /// A letter of any kind, including a "cased letter".
    Letter,
    /// A mark, such as [`GeneralCategory::MN`].
    Mark,
    /// A number.
    Number,
    /// Punctuation.
    Punctuation,
    /// A symbol.
    Symbol,
    /// A separator.
    Separator,
    /// Anything that does not belong to one of the other classes.
    Other,
}
impl MajorCategory {
    /// Return the single letter that identifies this major category.
    ///
    /// Note that [`MajorCategory::Separator`] maps to `'Z'`
    /// and [`MajorCategory::Other`] maps to `'C'`.
    #[inline]
    pub fn letter(&self) -> char {
        match *self {
            Self::Other => 'C',
            Self::Letter => 'L',
            Self::Mark => 'M',
            Self::Number => 'N',
            Self::Punctuation => 'P',
            Self::Symbol => 'S',
            Self::Separator => 'Z',
        }
    }
}
impl From<GeneralCategory> for MajorCategory {
    #[inline]
    fn from(value: GeneralCategory) -> Self {
        value.major_category()
    }
}
/// Formats the category as its single identifying [letter](MajorCategory::letter).
impl Display for MajorCategory {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        let symbol = self.letter();
        Write::write_char(f, symbol)
    }
}

#[cfg(test)]
mod test {
    use crate::properties::GeneralCategory;

    /// Every category name must begin with the letter of its major category.
    #[test]
    fn major_categories_match() {
        for category in GeneralCategory::ALL.iter().copied() {
            let expected_prefix = category.major_category().letter();
            let name = category.name();
            assert!(name.starts_with(expected_prefix), "{category}");
        }
    }
}