// icu_provider_source 2.2.0
//
// A data provider based on CLDR and ICU data.
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::SourceDataProvider;
use icu::collections::codepointtrie::CodePointTrie;
use icu::properties::props::Script;
use icu::properties::provider::{PropertyScriptWithExtensionsV1, ScriptWithExtensionsProperty};
use icu::properties::script::ScriptWithExt;
use icu_provider::prelude::*;
use std::collections::HashSet;
use std::convert::TryFrom;
use zerovec::{VarZeroVec, ZeroSlice, ZeroVec};

// implement data provider
impl DataProvider<PropertyScriptWithExtensionsV1> for SourceDataProvider {
    fn load(
        &self,
        req: DataRequest,
    ) -> Result<DataResponse<PropertyScriptWithExtensionsV1>, DataError> {
        self.check_req::<PropertyScriptWithExtensionsV1>(req)?;
        let scx_data = self
            .icuexport()?
            .read_and_parse_toml::<super::uprops_serde::script_extensions::Main>(&format!(
                "uprops/{}/scx.toml",
                self.trie_type(),
            ))?
            .script_extensions
            .first()
            .ok_or_else(|| DataError::custom("Could not parse Script_Extensions data from TOML"))?;

        if scx_data.long_name != "Script_Extensions" || scx_data.short_name != "scx" {
            return Err(DataError::custom("Property name mismatch")
                .with_marker(PropertyScriptWithExtensionsV1::INFO));
        }

        let cpt_data = &scx_data.code_point_trie;
        let scx_array_data = &scx_data.script_code_array;

        let trie = CodePointTrie::<ScriptWithExt>::try_from(cpt_data).map_err(|e| {
            DataError::custom("Could not parse CodePointTrie TOML").with_display_context(&e)
        })?;

        // Convert the input from Vec<Vec<u16>> to Vec<ZeroVec<Script>> so that
        // we can go through the VarZeroVec construction process for a desired result
        // type of VZV<ZeroSlice<Script>>
        let ule_scx_array_data: Vec<ZeroVec<Script>> = scx_array_data
            .iter()
            .map(|v| {
                v.iter()
                    .copied()
                    .map(Script::from_icu4c_value)
                    .collect::<ZeroVec<Script>>()
            })
            .collect::<Vec<ZeroVec<Script>>>();
        let scx_vzv: VarZeroVec<ZeroSlice<Script>> =
            VarZeroVec::from(ule_scx_array_data.as_slice());

        let data_struct = ScriptWithExtensionsProperty {
            trie,
            extensions: scx_vzv,
        };

        Ok(DataResponse {
            metadata: Default::default(),
            payload: DataPayload::from_owned(data_struct),
        })
    }
}

impl crate::IterableDataProviderCached<PropertyScriptWithExtensionsV1> for SourceDataProvider {
    /// Returns the set of data identifiers for this marker: exactly one entry,
    /// the default identifier.
    fn iter_ids_cached(&self) -> Result<HashSet<DataIdentifierCow<'static>>, DataError> {
        let mut ids: HashSet<DataIdentifierCow<'static>> = HashSet::with_capacity(1);
        ids.insert(Default::default());
        Ok(ids)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks the single `Script` value returned by `get_script_val` /
    /// `get_script_val32` for a handful of code points, including ones whose
    /// Script_Extensions list contains more than one script.
    #[test]
    fn test_script_val_from_script_extensions() {
        let provider = SourceDataProvider::new_testing();

        let swe =
            icu::properties::script::ScriptWithExtensions::try_new_unstable(&provider).unwrap();
        let swe = swe.as_borrowed();

        assert_eq!(swe.get_script_val('𐓐'), Script::Osage); // U+104D0 OSAGE CAPITAL LETTER KHA
        assert_eq!(swe.get_script_val('🥳'), Script::Common); // U+1F973 FACE WITH PARTY HORN AND PARTY HAT
        assert_eq!(swe.get_script_val32(0x200D), Script::Inherited); // ZERO WIDTH JOINER
        // Written as an escape so the character cannot be lost to text re-encoding.
        assert_eq!(swe.get_script_val('\u{0BEB}'), Script::Tamil); // U+0BEB TAMIL DIGIT FIVE
        assert_eq!(swe.get_script_val32(0x11303), Script::Grantha); // GRANTHA SIGN VISARGA
        assert_eq!(swe.get_script_val32(0x30A0), Script::Common); // U+30A0 KATAKANA-HIRAGANA DOUBLE HYPHEN
    }

    /// Checks the full Script_Extensions list returned by
    /// `get_script_extensions_val` / `get_script_extensions_val32`, both by
    /// collecting it and through `iter().next()` and `contains`, and checks the
    /// result for an out-of-range code point.
    #[test]
    fn test_scx_array_from_script_extensions() {
        let provider = SourceDataProvider::new_testing();

        let swe =
            icu::properties::script::ScriptWithExtensions::try_new_unstable(&provider).unwrap();
        let swe = swe.as_borrowed();

        assert_eq!(
            swe.get_script_extensions_val('𐓐') /* U+104D0 OSAGE CAPITAL LETTER KHA */
                .iter()
                .collect::<Vec<_>>(),
            [Script::Osage]
        );
        assert_eq!(
            swe.get_script_extensions_val('🥳') /* U+1F973 FACE WITH PARTY HORN AND PARTY HAT */
                .iter()
                .collect::<Vec<_>>(),
            [Script::Common]
        );
        assert_eq!(
            swe.get_script_extensions_val32(0x200D) // ZERO WIDTH JOINER
                .iter()
                .collect::<Vec<_>>(),
            [Script::Inherited]
        );
        // Written as an escape so the character cannot be lost to text re-encoding.
        assert_eq!(
            swe.get_script_extensions_val('\u{0BEB}') // U+0BEB TAMIL DIGIT FIVE
                .iter()
                .collect::<Vec<_>>(),
            [Script::Tamil, Script::Grantha]
        );
        assert_eq!(
            swe.get_script_extensions_val32(0x11303) // GRANTHA SIGN VISARGA
                .iter()
                .collect::<Vec<_>>(),
            [Script::Tamil, Script::Grantha]
        );
        assert_eq!(
            swe.get_script_extensions_val32(0x30A0) // KATAKANA-HIRAGANA DOUBLE HYPHEN
                .iter()
                .collect::<Vec<_>>(),
            [Script::Hiragana, Script::Katakana]
        );

        assert_eq!(
            swe.get_script_extensions_val32(0x200D) // ZERO WIDTH JOINER
                .iter()
                .next(),
            Some(Script::Inherited)
        );

        assert!(swe
            .get_script_extensions_val32(0x11303) // GRANTHA SIGN VISARGA
            .contains(&Script::Grantha));

        assert!(!swe
            .get_script_extensions_val32(0x11303) // GRANTHA SIGN VISARGA
            .contains(&Script::Common));

        // Invalid code point
        assert_eq!(
            swe.get_script_extensions_val32(0x11_0000) // CODE_POINT_MAX + 1 is invalid
                .iter()
                .collect::<Vec<_>>(),
            [Script::Unknown]
        );
    }

    /// Checks `has_script` / `has_script32` membership answers for a sample of
    /// code points, including the `TestHasScript()` cases taken from ICU4J and
    /// a lookup with a script value built from an arbitrary raw code.
    #[test]
    fn test_has_script() {
        let provider = SourceDataProvider::new_testing();

        let swe =
            icu::properties::script::ScriptWithExtensions::try_new_unstable(&provider).unwrap();
        let swe = swe.as_borrowed();

        assert!(swe.has_script('𐓐', Script::Osage));
        assert!(!swe.has_script('𐓐', Script::Common));
        assert!(!swe.has_script('𐓐', Script::Inherited));

        assert!(swe.has_script('🥳', Script::Common));
        assert!(!swe.has_script('🥳', Script::Inherited));

        assert!(!swe.has_script32(0x200D, Script::Common));
        assert!(swe.has_script32(0x200D, Script::Inherited));

        // U+0BEB TAMIL DIGIT FIVE, written as an escape so the character cannot
        // be lost to text re-encoding.
        assert!(swe.has_script('\u{0BEB}', Script::Tamil));
        assert!(swe.has_script('\u{0BEB}', Script::Grantha));
        assert!(!swe.has_script('\u{0BEB}', Script::Common));
        assert!(!swe.has_script('\u{0BEB}', Script::Inherited));

        assert!(swe.has_script32(0x11303, Script::Tamil));
        assert!(swe.has_script32(0x11303, Script::Grantha));
        assert!(!swe.has_script32(0x11303, Script::Common));
        assert!(!swe.has_script32(0x11303, Script::Inherited));

        assert!(swe.has_script32(0x30A0, Script::Hiragana));
        assert!(swe.has_script32(0x30A0, Script::Katakana));
        assert!(!swe.has_script32(0x30A0, Script::Common));
        assert!(!swe.has_script32(0x30A0, Script::Inherited));

        // U+0964 DEVANAGARI DANDA
        assert!(!swe.has_script32(0x0964, Script::Common));
        assert!(swe.has_script32(0x0964, Script::Devanagari));
        assert!(swe.has_script32(0x0964, Script::Bengali));

        // TestHasScript() test cases from ICU4J

        // U+063F ARABIC LETTER FARSI YEH WITH THREE DOTS ABOVE
        assert!(!swe.has_script32(0x063F, Script::Common));
        assert!(swe.has_script32(0x063F, Script::Arabic)); // main Script value
        assert!(!swe.has_script32(0x063F, Script::Syriac));
        assert!(!swe.has_script32(0x063F, Script::Thaana));

        // U+0640 ARABIC TATWEEL
        assert!(!swe.has_script32(0x0640, Script::Common)); // main Script value
        assert!(swe.has_script32(0x0640, Script::Arabic));
        assert!(swe.has_script32(0x0640, Script::Syriac));
        assert!(!swe.has_script32(0x0640, Script::Thaana));

        // U+0650 ARABIC KASRA
        assert!(!swe.has_script32(0x0650, Script::Inherited)); // main Script value
        assert!(swe.has_script32(0x0650, Script::Arabic));
        assert!(swe.has_script32(0x0650, Script::Syriac));
        assert!(!swe.has_script32(0x0650, Script::Thaana));

        // U+0660 ARABIC-INDIC DIGIT ZERO
        assert!(!swe.has_script32(0x0660, Script::Common));
        assert!(swe.has_script32(0x0660, Script::Arabic)); // main Script value
        assert!(!swe.has_script32(0x0660, Script::Syriac));
        assert!(swe.has_script32(0x0660, Script::Thaana));

        // U+FDF2 ARABIC LIGATURE ALLAH ISOLATED FORM
        assert!(!swe.has_script32(0xFDF2, Script::Common));
        assert!(swe.has_script32(0xFDF2, Script::Arabic)); // main Script value
        assert!(!swe.has_script32(0xFDF2, Script::Syriac));
        assert!(swe.has_script32(0xFDF2, Script::Thaana));

        // The ICU4J comment for this test says:
        // An unguarded implementation might go into an infinite loop.
        assert!(!swe.has_script32(0x0640, Script::from_icu4c_value(0xAFFE)));
    }

    /// Checks membership in the code point sets returned by
    /// `get_script_extensions_set` for several scripts. Scripts that are
    /// compared over the same code points are checked together from one table.
    #[test]
    fn test_get_script_extensions_set() {
        let provider = SourceDataProvider::new_testing();

        let swe =
            icu::properties::script::ScriptWithExtensions::try_new_unstable(&provider).unwrap();
        let swe = swe.as_borrowed();

        // Columns: (code point, expected in Grantha set, expected in Tamil set)
        let grantha = swe.get_script_extensions_set(Script::Grantha);
        let tamil = swe.get_script_extensions_set(Script::Tamil);
        let grantha_tamil_cases = [
            (0x0BE5, false, false), // unknown with unknown script in Tamil block
            (0x0BE6, true, true),   // TAMIL DIGIT ZERO
            (0x0BEB, true, true),   // TAMIL DIGIT FIVE
            (0x0BEF, true, true),   // TAMIL DIGIT NINE
            (0x0BF2, true, true),   // TAMIL NUMBER ONE THOUSAND
            (0x0BF3, true, true),   // TAMIL DAY SIGN
            (0x0BF4, false, true),  // TAMIL MONTH SIGN
            (0x11300, true, false), // GRANTHA SIGN COMBINING ANUSVARA ABOVE
            (0x11301, true, true),  // GRANTHA SIGN CANDRABINDU
            (0x11302, true, false), // GRANTHA SIGN ANUSVARA
            (0x11303, true, true),  // GRANTHA SIGN VISARGA
            (0x11304, false, false), // unknown with unknown script in Grantha block
            (0x11305, true, false), // GRANTHA LETTER A
        ];
        for (cp, in_grantha, in_tamil) in grantha_tamil_cases {
            assert_eq!(grantha.contains32(cp), in_grantha, "Grantha, U+{cp:04X}");
            assert_eq!(tamil.contains32(cp), in_tamil, "Tamil, U+{cp:04X}");
        }

        // Columns: (code point, expected in Hiragana set, expected in Katakana set)
        let hiragana = swe.get_script_extensions_set(Script::Hiragana);
        let katakana = swe.get_script_extensions_set(Script::Katakana);
        let kana_cases = [
            (0x3046, true, false), // HIRAGANA LETTER U
            (0x309F, true, false), // HIRAGANA DIGRAPH YORI
            (0x30A0, true, true),  // KATAKANA-HIRAGANA DOUBLE HYPHEN
            (0x30A1, false, true), // KATAKANA LETTER SMALL A
            (0x30FB, true, true),  // KATAKANA MIDDLE DOT
            (0x30FC, true, true),  // KATAKANA-HIRAGANA PROLONGED SOUND MARK
            (0x30FD, false, true), // KATAKANA ITERATION MARK
        ];
        for (cp, in_hiragana, in_katakana) in kana_cases {
            assert_eq!(hiragana.contains32(cp), in_hiragana, "Hiragana, U+{cp:04X}");
            assert_eq!(katakana.contains32(cp), in_katakana, "Katakana, U+{cp:04X}");
        }

        let common = swe.get_script_extensions_set(Script::Common);
        let inherited = swe.get_script_extensions_set(Script::Inherited);
        assert!(common.contains('🥳'));
        assert!(!inherited.contains('🥳'));
        assert!(!common.contains32(0x200D));
        assert!(inherited.contains32(0x200D));
        assert!(!common.contains32(0x30A0));
        assert!(!inherited.contains32(0x30A0));

        // inspired by https://github.com/unicode-org/unicodetools/issues/192

        // Columns: (code point, expected in Bengali set, expected in Devanagari set)
        let bangla = swe.get_script_extensions_set(Script::Bengali);
        let devanagari = swe.get_script_extensions_set(Script::Devanagari);
        let danda_cases = [
            (0x09E7, true, false), // BENGALI DIGIT ONE
            (0x0963, false, true), // DEVANAGARI VOWEL SIGN VOCALIC LL
            (0x0964, true, true),  // DEVANAGARI DANDA
            (0x0965, true, true),  // DEVANAGARI DOUBLE DANDA
            (0x0966, false, true), // DEVANAGARI DIGIT ZERO
        ];
        for (cp, in_bangla, in_devanagari) in danda_cases {
            assert_eq!(bangla.contains32(cp), in_bangla, "Bengali, U+{cp:04X}");
            assert_eq!(devanagari.contains32(cp), in_devanagari, "Devanagari, U+{cp:04X}");
        }

        // The dandas carry explicit script extensions, so they are not in Common.
        for cp in [0x0964, 0x0965] {
            assert!(!common.contains32(cp), "Common, U+{cp:04X}");
        }
    }
}