// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
//! The functions in this module return a [`CodePointMapData`] representing, for
//! each code point in the entire range of code points, the property values
//! for a particular Unicode property.
//!
//! The descriptions of most properties are taken from [`TR44`], the documentation for the
//! Unicode Character Database.
//!
//! [`TR44`]: https://www.unicode.org/reports/tr44
use crate::error::PropertiesError;
use crate::provider::*;
use crate::sets::CodePointSetData;
#[cfg(doc)]
use crate::*;
use core::marker::PhantomData;
use icu_collections::codepointtrie::{CodePointMapRange, CodePointTrie, TrieValue};
use icu_provider::prelude::*;
use zerovec::ZeroVecError;
/// A wrapper around code point map data.
///
/// It is returned by APIs that return Unicode property data in a map-like form,
/// e.g. enumerated property value data keyed by code point. Access its data via
/// the borrowed version, [`CodePointMapDataBorrowed`], which is obtained from
/// [`CodePointMapData::as_borrowed()`].
pub struct CodePointMapData<T: TrieValue> {
// The payload is stored under the private `ErasedMaplikeMarker<T>` so that a
// single wrapper type can hold the data of any property with value type `T`.
data: DataPayload<ErasedMaplikeMarker<T>>,
}
/// Private marker type for [`CodePointMapData`]
/// to work for all same-value map properties at once.
///
/// The `PhantomData<T>` field stores no data; it only ties the marker to the
/// value type `T` of the map.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct ErasedMaplikeMarker<T>(PhantomData<T>);
// The marker's payload is a `PropertyCodePointMapV1` over `T`, which is the same
// `Yokeable` that `CodePointMapData::from_data()` requires of the markers it accepts.
impl<T: TrieValue> DataMarker for ErasedMaplikeMarker<T> {
type Yokeable = PropertyCodePointMapV1<'static, T>;
}
impl<T: TrieValue> CodePointMapData<T> {
    /// Borrow this map as a [`CodePointMapDataBorrowed`] that can be queried.
    ///
    /// Doing this once up front consolidates a potential small underlying cost
    /// that would otherwise be paid on each API call (like `get()`).
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::{maps, GeneralCategory};
    ///
    /// let data = maps::load_general_category(&icu_testdata::unstable())
    ///     .expect("The data should be valid");
    ///
    /// let gc = data.as_borrowed();
    ///
    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728
    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
    /// ```
    #[inline]
    pub fn as_borrowed(&self) -> CodePointMapDataBorrowed<'_, T> {
        let map = self.data.get();
        CodePointMapDataBorrowed { map }
    }

    /// Convert this map into a map around another value type `P`.
    ///
    /// Typically useful for type-erasing maps into maps around integers.
    ///
    /// # Errors
    ///
    /// Returns the [`ZeroVecError`] produced by the underlying data conversion
    /// if that conversion fails.
    ///
    /// # Panics
    ///
    /// Will panic if `T` and `P` are different sizes.
    ///
    /// # Examples
    ///
    /// ```
    /// use icu::properties::{maps, GeneralCategory};
    ///
    /// let data = maps::load_general_category(&icu_testdata::unstable())
    ///     .expect("The data should be valid");
    ///
    /// let gc = data.try_into_converted::<u8>().unwrap();
    /// let gc = gc.as_borrowed();
    ///
    /// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter as u8); // U+6728
    /// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol as u8); // U+1F383 JACK-O-LANTERN
    /// ```
    pub fn try_into_converted<P>(self) -> Result<CodePointMapData<P>, ZeroVecError>
    where
        P: TrieValue,
    {
        // Project the payload onto the marker for `P`, converting the map in place.
        let converted = self
            .data
            .try_map_project::<ErasedMaplikeMarker<P>, _, _>(move |inner, _| {
                inner.try_into_converted()
            })?;
        Ok(CodePointMapData::from_data(converted))
    }

    /// Wrap an already-loaded data payload.
    ///
    /// Any marker `M` whose payload is a [`PropertyCodePointMapV1`] over `T` is
    /// accepted; the payload is cast to this type's private marker.
    ///
    /// Typically it is preferable to use getters like [`load_general_category()`] instead.
    pub fn from_data<M>(data: DataPayload<M>) -> Self
    where
        M: DataMarker<Yokeable = PropertyCodePointMapV1<'static, T>>,
    {
        let data = data.cast();
        Self { data }
    }

    /// Construct a new map from an owned [`CodePointTrie`].
    pub fn from_code_point_trie(trie: CodePointTrie<'static, T>) -> Self {
        let map = PropertyCodePointMapV1::from_code_point_trie(trie);
        let payload = DataPayload::<ErasedMaplikeMarker<T>>::from_owned(map);
        Self::from_data(payload)
    }

    /// View this map as a borrowed [`CodePointTrie`], if possible.
    ///
    /// The data backing this is extensible and supports multiple implementations.
    /// Currently it is always [`CodePointTrie`]; however in the future more backends
    /// may be added, and users may select which at data generation time.
    ///
    /// This method returns an `Option` in order to return `None` when the backing
    /// data provider cannot return a [`CodePointTrie`], or cannot do so within the
    /// expected constant time constraint.
    pub fn as_code_point_trie(&self) -> Option<&CodePointTrie<'_, T>> {
        let map = self.data.get();
        map.as_code_point_trie()
    }

    /// Produce a [`CodePointTrie`] for this map, borrowing if possible and
    /// otherwise allocating a new [`CodePointTrie`].
    ///
    /// The data backing this is extensible and supports multiple implementations.
    /// Currently it is always [`CodePointTrie`]; however in the future more backends
    /// may be added, and users may select which at data generation time.
    ///
    /// The performance of the conversion to this specific return type will vary
    /// depending on the data structure that is backing `self`.
    pub fn to_code_point_trie(&self) -> CodePointTrie<'_, T> {
        let map = self.data.get();
        map.to_code_point_trie()
    }
}
/// A borrowed wrapper around code point map data, returned by
/// [`CodePointMapData::as_borrowed()`]. More efficient to query.
#[derive(Clone, Copy)]
pub struct CodePointMapDataBorrowed<'a, T: TrieValue> {
// Reference to the map data held by the `CodePointMapData` this was borrowed from.
map: &'a PropertyCodePointMapV1<'a, T>,
}
impl<'a, T: TrieValue> CodePointMapDataBorrowed<'a, T> {
/// Get the value this map has associated with code point `ch`
///
/// # Example
///
/// ```
/// use icu::properties::{maps, GeneralCategory};
/// use icu_collections::codepointtrie::CodePointTrie;
///
/// let data =
/// maps::load_general_category(&icu_testdata::unstable())
/// .expect("The data should be valid");
/// let gc = data.as_borrowed();
///
/// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728
/// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get(self, ch: char) -> T {
self.map.get32(ch as u32)
}
/// Get the value this map has associated with code point `ch`
///
/// # Example
///
/// ```
/// use icu::properties::{maps, GeneralCategory};
/// use icu_collections::codepointtrie::CodePointTrie;
///
/// let data =
/// maps::load_general_category(&icu_testdata::unstable())
/// .expect("The data should be valid");
/// let gc = data.as_borrowed();
///
/// assert_eq!(gc.get32(0x6728), GeneralCategory::OtherLetter); // U+6728 (木)
/// assert_eq!(gc.get32(0x1F383), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get32(self, ch: u32) -> T {
self.map.get32(ch)
}
/// Get a [`CodePointSetData`] for all elements corresponding to a particular value
///
/// # Example
///
/// ```
/// use icu::properties::{maps, GeneralCategory};
/// use icu_collections::codepointtrie::CodePointTrie;
///
/// let data = maps::load_general_category(&icu_testdata::unstable())
/// .expect("The data should be valid");
/// let gc = data.as_borrowed();
///
/// let other_letter_set_data =
/// gc.get_set_for_value(GeneralCategory::OtherLetter);
/// let other_letter_set = other_letter_set_data.as_borrowed();
///
/// assert!(other_letter_set.contains('木')); // U+6728
/// assert!(!other_letter_set.contains('🎃')); // U+1F383 JACK-O-LANTERN
/// ```
pub fn get_set_for_value(self, value: T) -> CodePointSetData {
let set = self.map.get_set_for_value(value);
CodePointSetData::from_code_point_inversion_list(set)
}
/// Yields an [`Iterator`] returning ranges of consecutive code points that
/// share the same value in the [`CodePointMapData`].
///
/// # Examples
///
/// ```
/// use core::ops::RangeInclusive;
/// use icu::properties::maps::CodePointMapData;
/// use icu_collections::codepointtrie::planes;
/// use icu_collections::codepointtrie::CodePointMapRange;
///
/// let planes_trie = planes::get_planes_trie();
/// let cp_map_data = CodePointMapData::from_code_point_trie(planes_trie);
/// let cp_map = cp_map_data.as_borrowed();
///
/// let mut ranges = cp_map.iter_ranges();
///
/// for plane in 0..=16 {
/// let exp_start = plane * 0x1_0000;
/// let exp_end = exp_start + 0xffff;
/// assert_eq!(
/// ranges.next(),
/// Some(CodePointMapRange {
/// range: RangeInclusive::new(exp_start, exp_end),
/// value: plane as u8
/// })
/// );
/// }
///
/// // Hitting the end of the iterator returns `None`, as will subsequent
/// // calls to .next().
/// assert_eq!(ranges.next(), None);
/// assert_eq!(ranges.next(), None);
/// ```
pub fn iter_ranges(self) -> impl Iterator<Item = CodePointMapRange<T>> + 'a {
self.map.iter_ranges()
}
}
// Generates a loader function for one map-like Unicode property.
//
// Inputs:
// - `property` and `marker`: accepted by the matcher but currently unused.
// - `value`: the value type `T` of the returned `CodePointMapData<T>`.
// - `keyed_data_marker`: the data marker the provider must implement `DataProvider` for.
// - `func`: the attributes (including docs), visibility and name of the generated function.
macro_rules! make_map_property {
    (
        // currently unused
        property: $prop_name:expr;
        // currently unused
        marker: $marker_name:ident;
        value: $value_ty:path;
        keyed_data_marker: $keyed_data_marker:ty;
        func:
        $(#[$attr:meta])*
        $vis:vis fn $name:ident();
    ) => {
        $(#[$attr])*
        $vis fn $name(
            provider: &(impl DataProvider<$keyed_data_marker> + ?Sized)
        ) -> Result<CodePointMapData<$value_ty>, PropertiesError> {
            // Load with a default request, take the payload out of the response,
            // and let `?` convert any provider error into a `PropertiesError`.
            let payload = provider
                .load(Default::default())
                .and_then(DataResponse::take_payload)?;
            Ok(CodePointMapData::from_data(payload))
        }
    }
}
make_map_property! {
property: "General_Category";
marker: GeneralCategoryProperty;
value: crate::GeneralCategory;
keyed_data_marker: GeneralCategoryV1Marker;
func:
/// Return a [`CodePointMapData`] for the General_Category Unicode enumerated property. See [`GeneralCategory`].
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, GeneralCategory};
///
/// let data = maps::load_general_category(&icu_testdata::unstable())
///     .expect("The data should be valid");
/// let gc = data.as_borrowed();
/// assert_eq!(gc.get('木'), GeneralCategory::OtherLetter); // U+6728
/// assert_eq!(gc.get('🎃'), GeneralCategory::OtherSymbol); // U+1F383 JACK-O-LANTERN
/// ```
pub fn load_general_category();
}
make_map_property! {
property: "Bidi_Class";
marker: BidiClassProperty;
value: crate::BidiClass;
keyed_data_marker: BidiClassV1Marker;
func:
/// Return a [`CodePointMapData`] for the Bidi_Class Unicode enumerated property. See [`BidiClass`].
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, BidiClass};
///
/// let data = maps::load_bidi_class(&icu_testdata::unstable())
///     .expect("The data should be valid");
/// let bc = data.as_borrowed();
/// assert_eq!(bc.get('y'), BidiClass::LeftToRight); // U+0079
/// assert_eq!(bc.get('ع'), BidiClass::ArabicLetter); // U+0639
/// ```
pub fn load_bidi_class();
}
make_map_property! {
property: "Script";
marker: ScriptProperty;
value: crate::Script;
keyed_data_marker: ScriptV1Marker;
func:
/// Return a [`CodePointMapData`] for the Script Unicode enumerated property. See [`Script`].
///
/// **Note:** Some code points are associated with multiple scripts. If you are trying to
/// determine whether a code point belongs to a certain script, you should use
/// [`load_script_with_extensions_unstable`] and [`ScriptWithExtensionsBorrowed::has_script`]
/// instead of this function.
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, Script};
///
/// let data = maps::load_script(&icu_testdata::unstable())
///     .expect("The data should be valid");
/// let script = data.as_borrowed();
/// assert_eq!(script.get('木'), Script::Han); // U+6728
/// assert_eq!(script.get('🎃'), Script::Common); // U+1F383 JACK-O-LANTERN
/// ```
///
/// [`load_script_with_extensions_unstable`]: crate::script::load_script_with_extensions_unstable
/// [`ScriptWithExtensionsBorrowed::has_script`]: crate::script::ScriptWithExtensionsBorrowed::has_script
pub fn load_script();
}
make_map_property! {
property: "East_Asian_Width";
marker: EastAsianWidthProperty;
value: crate::EastAsianWidth;
keyed_data_marker: EastAsianWidthV1Marker;
func:
/// Return a [`CodePointMapData`] for the East_Asian_Width Unicode enumerated
/// property. See [`EastAsianWidth`].
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, EastAsianWidth};
///
/// let data = maps::load_east_asian_width(&icu_testdata::unstable()).expect("The data should be valid!");
/// let eaw = data.as_borrowed();
///
/// // The halfwidth form is written as an escape so it cannot be confused with
/// // (or normalized into) the regular katakana letter below.
/// assert_eq!(eaw.get('\u{FF71}'), EastAsianWidth::Halfwidth); // U+FF71: Halfwidth Katakana Letter A
/// assert_eq!(eaw.get('ア'), EastAsianWidth::Wide); // U+30A2: Katakana Letter A
/// ```
pub fn load_east_asian_width();
}
make_map_property! {
property: "Line_Break";
marker: LineBreakProperty;
value: crate::LineBreak;
keyed_data_marker: LineBreakV1Marker;
func:
/// Return a [`CodePointMapData`] for the Line_Break Unicode enumerated
/// property. See [`LineBreak`].
///
/// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, LineBreak};
///
/// let data = maps::load_line_break(&icu_testdata::unstable()).expect("The data should be valid!");
/// let lb = data.as_borrowed();
///
/// assert_eq!(lb.get(')'), LineBreak::CloseParenthesis); // U+0029: Right Parenthesis
/// assert_eq!(lb.get('ぁ'), LineBreak::ConditionalJapaneseStarter); // U+3041: Hiragana Letter Small A
/// ```
pub fn load_line_break();
}
make_map_property! {
property: "Grapheme_Cluster_Break";
marker: GraphemeClusterBreakProperty;
value: crate::GraphemeClusterBreak;
keyed_data_marker: GraphemeClusterBreakV1Marker;
func:
/// Return a [`CodePointMapData`] for the Grapheme_Cluster_Break Unicode enumerated
/// property. See [`GraphemeClusterBreak`].
///
/// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, GraphemeClusterBreak};
///
/// let data = maps::load_grapheme_cluster_break(&icu_testdata::unstable()).expect("The data should be valid!");
/// let gcb = data.as_borrowed();
///
/// assert_eq!(gcb.get('🇦'), GraphemeClusterBreak::RegionalIndicator); // U+1F1E6: Regional Indicator Symbol Letter A
/// // Written as an escape so the literal is unambiguously the single code point U+0E33.
/// assert_eq!(gcb.get('\u{0E33}'), GraphemeClusterBreak::SpacingMark); // U+0E33: Thai Character Sara Am
/// ```
pub fn load_grapheme_cluster_break();
}
make_map_property! {
property: "Word_Break";
marker: WordBreakProperty;
value: crate::WordBreak;
keyed_data_marker: WordBreakV1Marker;
func:
/// Return a [`CodePointMapData`] for the Word_Break Unicode enumerated
/// property. See [`WordBreak`].
///
/// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, WordBreak};
///
/// let data = maps::load_word_break(&icu_testdata::unstable()).expect("The data should be valid!");
/// let wb = data.as_borrowed();
///
/// assert_eq!(wb.get('.'), WordBreak::MidNumLet); // U+002E: Full Stop
/// // Written as an escape so the literal really is the fullwidth comma, not U+002C.
/// assert_eq!(wb.get('\u{FF0C}'), WordBreak::MidNum); // U+FF0C: Fullwidth Comma
/// ```
pub fn load_word_break();
}
make_map_property! {
property: "Sentence_Break";
marker: SentenceBreakProperty;
value: crate::SentenceBreak;
keyed_data_marker: SentenceBreakV1Marker;
func:
/// Return a [`CodePointMapData`] for the Sentence_Break Unicode enumerated
/// property. See [`SentenceBreak`].
///
/// **Note:** Use `icu::segmenter` for an all-in-one break iterator implementation.
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, SentenceBreak};
///
/// let data = maps::load_sentence_break(&icu_testdata::unstable()).expect("The data should be valid!");
/// let sb = data.as_borrowed();
///
/// // Written as an escape so the literal really is the fullwidth digit, not U+0039.
/// assert_eq!(sb.get('\u{FF19}'), SentenceBreak::Numeric); // U+FF19: Fullwidth Digit Nine
/// assert_eq!(sb.get(','), SentenceBreak::SContinue); // U+002C: Comma
/// ```
pub fn load_sentence_break();
}
make_map_property! {
property: "Canonical_Combining_Class";
marker: CanonicalCombiningClassProperty;
value: crate::CanonicalCombiningClass;
keyed_data_marker: CanonicalCombiningClassV1Marker;
func:
/// Return a [`CodePointMapData`] for the Canonical_Combining_Class Unicode property. See
/// [`CanonicalCombiningClass`].
///
/// **Note:** See `icu_normalizer::CanonicalCombiningClassMap` for the preferred API
/// to look up the Canonical_Combining_Class property by scalar value.
///
/// # Errors
///
/// Returns a [`PropertiesError`] if loading from `provider` fails or the payload
/// cannot be taken from the response.
///
/// # Examples
///
/// ```
/// use icu::properties::{maps, CanonicalCombiningClass};
///
/// let data = maps::load_canonical_combining_class(&icu_testdata::unstable()).expect("The data should be valid!");
/// let ccc = data.as_borrowed();
///
/// assert_eq!(ccc.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
/// assert_eq!(ccc.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
/// ```
pub fn load_canonical_combining_class();
}