// icu_datagen 1.1.1
//
// Generate data for ICU4X DataProvider.
// Documentation
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::transform::cldr::cldr_serde::numbers::DecimalFormat;
use icu_compactdecimal::provider::CompactDecimalPatternDataV1;
use icu_compactdecimal::provider::*;
use itertools::Itertools;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::HashMap;
use zerovec::ule::encode_varule_to_box;

/// A [`ParsedPattern`] represents a compact decimal pattern, which consists of
/// literal text with an optional placeholder.  The literal text is unescaped,
/// and the information about the number of 0s in the placeholder is stored
/// separately.
#[derive(Debug, PartialEq, Eq, Clone)]
struct ParsedPattern {
    /// The unescaped literal text, e.g., " mille" for the pattern "00 mille",
    /// "mille" for the pattern "mille".
    pub literal_text: Cow<'static, str>,
    /// The placeholder; None for patterns such as "mille".
    pub placeholder: Option<ParsedPlaceholder>,
}

/// Represents the placeholder in a compact decimal pattern as its position in
/// the associated text and the number of 0s (which, together with the type
/// associated with the pattern, determines the power of ten being abbreviated).
#[derive(Debug, PartialEq, Eq, Clone)]
struct ParsedPlaceholder {
    /// The position in the literal text where the placeholder is to be inserted;
    /// in particular, this is 0 for insertion at the beginning, which is the
    /// most frequent case, as in "00 mille".
    pub index: usize,
    /// The number of consecutive 0s making up the placeholder, e.g., 2 for
    /// "00 mille"; always at least 1.
    pub number_of_0s: i8,
}

/// Parses a compact decimal pattern string, performing any validation that can
/// be done without the context of the associated type and count.
///
/// Returns `Ok(None)` for the pattern "0" (no abbreviation), and
/// `Ok(Some(_))` with the unescaped literal text and the optional placeholder
/// otherwise.
///
/// # Errors
///
/// Returns a message describing the problem if the pattern
/// - contains, outside of quotes, a number pattern symbol other than `0`
///   (including the rounding digits `1` through `9`);
/// - contains more than one run of 0s;
/// - has a run of 0s whose length does not fit in an `i8`.
///
/// Note that quoted runs are found by splitting on APOSTROPHE, so a doubled
/// apostrophe is only recognized as a literal apostrophe in unquoted text; a
/// doubled apostrophe inside a quoted run is dropped.
fn parse(pattern: &str) -> Result<Option<ParsedPattern>, Cow<'static, str>> {
    // Known defects in the CLDR data, keyed by the single-0 form of the
    // pattern; the two- and three-0 forms are derived below.
    let cldr_overrides: HashMap<String, String> = [
        // Unescaped - in yrl (Nheengatu).
        ("0 millón-ita", "0 millón'-'ita"),
        ("0 billón-ita", "0 billón'-'ita"),
        ("0 tirillón-ita", "0 tirillón'-'ita"),
        ("0 miliãu-ita", "0 miliãu'-'ita"),
        ("0 biliãu-ita", "0 biliãu'-'ita"),
        ("0 tiriliãu-ita", "0 tiriliãu'-'ita"),
        // All compact decimal patterns for sw (Swahili) are split by sign;
        // the sign ends up where it would be as part of the significand, so
        // this special handling is unneeded.  Depending on the region subtag,
        // the space may be breaking or nonbreaking.
        ("elfu 0;elfu -0", "elfu 0"),
        ("milioni 0;milioni -0", "milioni 0"),
        ("bilioni 0;bilioni -0", "bilioni 0"),
        ("trilioni 0;trilioni -0", "trilioni 0"),
        ("elfu\u{A0}0;elfu\u{A0}-0", "elfu\u{A0}0"),
        ("milioni\u{A0}0;milioni\u{A0}-0", "milioni\u{A0}0"),
        ("bilioni\u{A0}0;bilioni\u{A0}-0", "bilioni\u{A0}0"),
        ("trilioni\u{A0}0;trilioni\u{A0}-0", "trilioni\u{A0}0"),
        ("0M;-0M", "0M"),
        ("0B;-0B", "0B"),
        ("0T;-0T", "0T"),
        // Unescaped E in hu (Hungarian).
        ("0\u{A0}E", "0\u{A0}'E'"),
    ]
    .iter()
    .flat_map(|(key, value)| {
        [
            (key.to_string(), value.to_string()),
            (key.replace('0', "00"), value.replace('0', "00")),
            (key.replace('0', "000"), value.replace('0', "000")),
        ]
    })
    .collect();
    let pattern = cldr_overrides
        .get(pattern)
        .map(|s| s.as_str())
        .unwrap_or(pattern);

    if pattern == "0" {
        // The noncompact pattern: nothing is abbreviated.
        return Ok(None);
    }

    let mut placeholder: Option<ParsedPlaceholder> = None;
    let mut literal_text = String::with_capacity(pattern.len());
    // CLDR patterns use quoting for escaping, thus '.' for a literal FULL
    // STOP, as opposed to . for the decimal separator.  A doubled
    // APOSTROPHE ('') represents a single one.
    // See https://www.unicode.org/reports/tr35/tr35-numbers.html#Special_Pattern_Characters.
    // We process the pattern in chunks delimited by ', which are
    // alternatingly unescaped and escaped.
    for (i, chunk) in pattern.split('\'').enumerate() {
        let escaped = i % 2 == 1;
        if escaped {
            if chunk.is_empty() {
                // '' means '.
                literal_text.push('\'');
            } else {
                // Anything else wrapped in apostrophes is literal text.
                literal_text.push_str(chunk);
            }
            continue;
        }
        // We are in unquoted text, so we need to check for the
        // symbols defined in https://www.unicode.org/reports/tr35/tr35-numbers.html#Number_Pattern_Character_Definitions.
        // The digits 1 through 9 (inclusive) indicate rounding there, so all
        // of them are rejected.
        if chunk
            .chars()
            .any(|c| ('1'..='9').contains(&c) || "@#.-,E+%‰,¤*'".contains(c))
        {
            return Err(format!(
                "Unsupported symbol in compact decimal pattern {}",
                pattern
            )
            .into());
        }
        // Given the chunk "me0000w", the prefix is "me", and
        // additional_0s_and_suffix is "000w"; given the chunk
        // "me0w", these are "me" and "w" respectively.
        let (prefix, additional_0s_and_suffix) = match chunk.split_once('0') {
            Some(parts) => parts,
            None => {
                // No symbols, all literal text.
                literal_text.push_str(chunk);
                continue;
            }
        };
        if placeholder.is_some() {
            return Err(format!(
                "Multiple placeholders in compact decimal pattern {}",
                pattern
            )
            .into());
        }
        // The prefix goes into the literal text, and the position
        // of the placeholder is then at the end of the accumulated
        // text, at literal_text.len().
        literal_text.push_str(prefix);
        let index = literal_text.len();
        let (number_of_0s, suffix) = match additional_0s_and_suffix.rsplit_once('0') {
            Some((middle_0s, suffix)) => {
                // More than one 0; in the "me0000w" example, middle_0s
                // is "00", suffix is "w".  Anything other than 0s between
                // the first and last 0 means there are two separate runs.
                if !middle_0s.chars().all(|c| c == '0') {
                    return Err(format!(
                        "Multiple placeholders in compact decimal pattern {}",
                        pattern
                    )
                    .into());
                }
                // The first and last 0s are not part of middle_0s.
                let number_of_0s = i8::try_from(middle_0s.len() + 2)
                    .map_err(|_| format!("Too many 0s in pattern {}", pattern))?;
                (number_of_0s, suffix)
            }
            // Only one 0, we are in the "me0w" case.
            None => (1, additional_0s_and_suffix),
        };
        placeholder = Some(ParsedPlaceholder {
            index,
            number_of_0s,
        });
        literal_text.push_str(suffix);
    }
    Ok(Some(ParsedPattern {
        literal_text: Cow::Owned(literal_text),
        placeholder,
    }))
}

impl TryFrom<&DecimalFormat> for CompactDecimalPatternDataV1<'static> {
    type Error = Cow<'static, str>;

    /// Converts the CLDR compact decimal patterns in `other` into the
    /// deduplicated ICU4X representation.
    ///
    /// The conversion proceeds in stages:
    /// 1. every pattern is parsed and stored by (log10 of type, plural case);
    /// 2. the mapping is made dense from 10^0 up to the largest type, missing
    ///    entries being the noncompact pattern;
    /// 3. exponents are computed and plural cases identical to count=other
    ///    are dropped;
    /// 4. the count=other exponents are checked to be nondecreasing;
    /// 5. consecutive types with equivalent plural maps are merged, keeping
    ///    the lowest type.
    ///
    /// # Errors
    ///
    /// Returns a message if a type is not a power of ten written as 1 followed
    /// by 0s, a count is unknown, a pattern fails to parse, a plural case is
    /// duplicated, count=other is missing or lacks a placeholder, placeholders
    /// are inconsistent within a type, an exponent would be nonpositive or
    /// exponents decrease, or a placeholder index does not fit.
    fn try_from(other: &DecimalFormat) -> Result<Self, Self::Error> {
        let mut parsed_patterns: BTreeMap<i8, BTreeMap<Count, Option<ParsedPattern>>> =
            BTreeMap::new();
        // Stage 1: ingest the CLDR mapping.
        for entry in other.patterns.iter() {
            let type_digits = &entry.compact_decimal_type;
            // A well-formed type is a 1 followed by any number of 0s.
            let mut remaining_bytes = type_digits.bytes();
            let is_power_of_ten =
                remaining_bytes.next() == Some(b'1') && remaining_bytes.all(|b| b == b'0');
            if !is_power_of_ten {
                return Err(format!("Ill-formed type {}", type_digits).into());
            }
            // The number of 0s after the leading 1 is the decimal logarithm.
            let log10_type = i8::try_from(type_digits.len() - 1)
                .map_err(|_| format!("Too many digits in type {}", type_digits))?;
            let count = match &*entry.compact_decimal_count {
                "zero" => Count::Zero,
                "one" => Count::One,
                "two" => Count::Two,
                "few" => Count::Few,
                "many" => Count::Many,
                "other" => Count::Other,
                "1" => Count::Explicit1,
                _ => {
                    return Err(format!(
                        "Invalid count {} in type {}",
                        entry.compact_decimal_count, type_digits
                    )
                    .into())
                }
            };
            let parsed = parse(&entry.pattern)?;
            let previous = parsed_patterns
                .entry(log10_type)
                .or_default()
                .insert(count, parsed);
            // TODO(egg): This should be try_insert.
            if previous.is_some() {
                return Err(format!(
                    "Plural case {:?} is duplicated for type 10^{}",
                    count, log10_type
                )
                .into());
            }
        }

        // Stage 2: figure out which plural cases are used, and make the map
        // dense by filling out the implicit fallbacks to the 0 (noncompact)
        // pattern.  The explicit =1 case is never filled in.
        let plural_cases: BTreeSet<Count> = parsed_patterns
            .values()
            .flat_map(|plural_map| plural_map.keys().copied())
            .filter(|count| *count != Count::Explicit1)
            .collect();
        let highest_log10_type = parsed_patterns.keys().next_back().copied().unwrap_or(0);
        for log10_type in 0..=highest_log10_type {
            for plural_case in &plural_cases {
                // The outer entry is only created when there is a plural case
                // to fill in.
                parsed_patterns
                    .entry(log10_type)
                    .or_default()
                    .entry(*plural_case)
                    .or_insert(None);
            }
        }

        // Stage 3: compute the exponents based on the numbers of 0s in the
        // placeholders and the type values: the exponent is 3 for type=1000,
        // "0K", as well as for type=10000, "00K", etc.
        // Duplicates of the count=other case are removed in the same pass.
        let mut patterns: BTreeMap<i8, BTreeMap<Count, Pattern>> = BTreeMap::new();
        for (log10_type, parsed_plural_map) in parsed_patterns {
            let plural_map = patterns.entry(log10_type).or_default();
            let other_pattern = parsed_plural_map
                .get(&Count::Other)
                .ok_or_else(|| format!("Missing other case for type 10^{}", log10_type))?
                .clone();
            let exponent: i8 = match &other_pattern {
                None => {
                    // If count=other is noncompact, everything else must be.
                    if parsed_plural_map.values().any(|p| p.is_some()) {
                        return Err(format!(
                            "Non-0 pattern for type 10^{} whose pattern for count=other is 0",
                            log10_type,
                        )
                        .into());
                    }
                    0
                }
                Some(other_pattern) => {
                    let other_placeholder =
                        other_pattern.placeholder.as_ref().ok_or_else(|| {
                            format!(
                                "Missing placeholder in other case of type 10^{}",
                                log10_type
                            )
                        })?;
                    // Every placeholder within this type must have the same
                    // number of 0s as the one for count=other.
                    for (count, pattern) in &parsed_plural_map {
                        let number_of_0s = pattern
                            .as_ref()
                            .and_then(|p| p.placeholder.as_ref())
                            .map(|placeholder| placeholder.number_of_0s);
                        if let Some(number_of_0s) = number_of_0s {
                            if number_of_0s != other_placeholder.number_of_0s {
                                return Err(format!(
                                    "Inconsistent placeholders within type 10^{}: {} 0s for other, {} 0s for {:?}",
                                    log10_type,
                                    other_placeholder.number_of_0s,
                                    number_of_0s,
                                    count
                                )
                                .into());
                            }
                        }
                    }
                    let exponent = log10_type - other_placeholder.number_of_0s + 1;
                    if exponent < 1 {
                        return Err(format!(
                            "Too many 0s in type 10^{}, ({}, implying nonpositive exponent c={})",
                            log10_type, other_placeholder.number_of_0s, exponent
                        )
                        .into());
                    }
                    exponent
                }
            };
            for (count, optional_pattern) in parsed_plural_map {
                // Omit duplicates of the other case.
                if count != Count::Other && optional_pattern == other_pattern {
                    continue;
                }
                let pattern = match optional_pattern {
                    None => Pattern {
                        exponent: 0,
                        literal_text: std::borrow::Cow::Borrowed(""),
                        index: 0,
                    },
                    Some(parsed) => {
                        let index = match parsed.placeholder {
                            // u8::MAX is reserved for "no placeholder"...
                            None => u8::MAX,
                            // ...so an actual index has to be strictly less.
                            Some(placeholder) => u8::try_from(placeholder.index)
                                .ok()
                                .filter(|&i| i < u8::MAX)
                                .ok_or_else(|| {
                                    format!(
                                        "Placeholder index is too large in type=10^{}, count={:?}",
                                        log10_type, count
                                    )
                                })?,
                        };
                        Pattern {
                            exponent,
                            literal_text: parsed.literal_text,
                            index,
                        }
                    }
                };
                plural_map.insert(count, pattern);
            }
        }

        // Stage 4: the exponents of the count=other case, in increasing order
        // of type, must never decrease.
        let other_exponents: Vec<_> = patterns
            .values()
            .map(|plural_map| plural_map.get(&Count::Other).map(|p| p.exponent))
            .collect();
        if other_exponents.windows(2).any(|pair| pair[0] > pair[1]) {
            return Err(format!(
                "Compact decimal exponents should be nondecreasing: {:?}",
                other_exponents,
            )
            .into());
        }

        // Stage 5: deduplicate sequences of types that have the same plural
        // map (up to =1), keeping the lowest type.
        // The pattern 0 for type 1 is implicit.
        let deduplicated_patterns = patterns
            .iter()
            .coalesce(|low, high| {
                let low_plural_map = low.1;
                let high_plural_map = high.1;
                let mergeable = low_plural_map == high_plural_map
                    || (low_plural_map.contains_key(&Count::Explicit1)
                        && low_plural_map
                            .iter()
                            .filter(|(count, _)| **count != Count::Explicit1)
                            .all(|(k, v)| high_plural_map.get(k) == Some(v))
                        && high_plural_map
                            .iter()
                            .all(|(k, v)| low_plural_map.get(k) == Some(v)));
                if mergeable {
                    Ok(low)
                } else {
                    Err((low, high))
                }
            })
            .filter(|(log10_type, plural_map)| {
                **log10_type != 0 || plural_map.values().any(|pattern| pattern.exponent != 0)
            });

        // Turn the BTreeMap of BTreeMaps into a ZeroMap2d.
        Ok(CompactDecimalPatternDataV1 {
            patterns: deduplicated_patterns
                .flat_map(|(log10_type, plural_map)| {
                    let log10_type = *log10_type;
                    plural_map.iter().map(move |(count, pattern)| {
                        (log10_type, *count, encode_varule_to_box(pattern))
                    })
                })
                .collect(),
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use icu_provider::zerofrom::ZeroFrom;
    use zerovec::ule::AsULE;

    /// Deserializes `json` as CLDR decimal format data (panicking if it is not
    /// valid) and attempts the conversion under test.
    fn convert(json: &str) -> Result<CompactDecimalPatternDataV1<'static>, Cow<'static, str>> {
        let decimal_format = serde_json::from_str::<DecimalFormat>(json).unwrap();
        CompactDecimalPatternDataV1::try_from(&decimal_format)
    }

    /// Lists the contents of the two-dimensional map in `data` as
    /// (log10 of type, count, pattern) triples, in iteration order.
    fn flatten<'a>(data: &'a CompactDecimalPatternDataV1<'_>) -> Box<[(i8, Count, Pattern<'a>)]> {
        data.patterns
            .iter0()
            .flat_map(|kkv| {
                let key0 = *kkv.key0();
                kkv.into_iter1()
                    .map(move |(k, v)| (key0, Count::from_unaligned(*k), Pattern::zero_from(v)))
            })
            .collect()
    }

    #[test]
    fn test_french_compressibility() {
        // French compact-long thousands as of CLDR 42.
        // The type=1000, count=one case is incorrect (it is inconsistent with the
        // plural rules), but it is interesting because it forces a distinction
        // between 1000 and 10000 to be made in the ICU4X data.
        let cldr_42_long_french_data = convert(
            r#"
                {
                    "1000-count-1": "mille",
                    "1000-count-one": "0 millier",
                    "1000-count-other": "0 mille",
                    "10000-count-one": "00 mille",
                    "10000-count-other": "00 mille",
                    "100000-count-one": "000 mille",
                    "100000-count-other": "000 mille"
                }
            "#,
        )
        .unwrap();
        let cldr_42_long_french = flatten(&cldr_42_long_french_data);
        assert_eq!(
            cldr_42_long_french.as_ref(),
            [
                (
                    3,
                    Count::One,
                    Pattern {
                        index: 0,
                        exponent: 3,
                        literal_text: Cow::Borrowed(" millier")
                    }
                ),
                (
                    3,
                    Count::Other,
                    Pattern {
                        index: 0,
                        exponent: 3,
                        literal_text: Cow::Borrowed(" mille")
                    }
                ),
                (
                    3,
                    Count::Explicit1,
                    Pattern {
                        index: 255,
                        exponent: 3,
                        literal_text: Cow::Borrowed("mille")
                    }
                ),
                (
                    4,
                    Count::Other,
                    Pattern {
                        index: 0,
                        exponent: 3,
                        literal_text: Cow::Borrowed(" mille")
                    }
                ),
            ]
        );
        // French compact-long thousands, with the anomalous « millier » removed.
        // This allows 10000 and 1000 to be collapsed.
        let compressible_long_french_data = convert(
            r#"
                {
                    "1000-count-1": "mille",
                    "1000-count-one": "0 mille",
                    "1000-count-other": "0 mille",
                    "10000-count-one": "00 mille",
                    "10000-count-other": "00 mille",
                    "100000-count-one": "000 mille",
                    "100000-count-other": "000 mille"
                }
            "#,
        )
        .unwrap();
        let compressible_long_french = flatten(&compressible_long_french_data);
        assert_eq!(
            compressible_long_french.as_ref(),
            [
                (
                    3,
                    Count::Other,
                    Pattern {
                        index: 0,
                        exponent: 3,
                        literal_text: Cow::Borrowed(" mille")
                    }
                ),
                (
                    3,
                    Count::Explicit1,
                    Pattern {
                        index: 255,
                        exponent: 3,
                        literal_text: Cow::Borrowed("mille")
                    }
                ),
            ]
        );
    }

    #[test]
    fn test_holes() {
        // Spanish compact-short data as of CLDR 42, up to 10¹¹.
        // Note that the abbreviation for 10⁹ is used only starting with 10¹⁰.
        let spanish_data = convert(
            r#"
                {
                    "1000-count-one": "0 mil",
                    "1000-count-other": "0 mil",
                    "10000-count-one": "00 mil",
                    "10000-count-other": "00 mil",
                    "100000-count-one": "000 mil",
                    "100000-count-other": "000 mil",
                    "1000000-count-one": "0 M",
                    "1000000-count-other": "0 M",
                    "10000000-count-one": "00 M",
                    "10000000-count-other": "00 M",
                    "100000000-count-one": "000 M",
                    "100000000-count-other": "000 M",
                    "1000000000-count-one": "0000 M",
                    "1000000000-count-other": "0000 M",
                    "10000000000-count-one": "00 mil M",
                    "10000000000-count-other": "00 mil M",
                    "100000000000-count-one": "000 mil M",
                    "100000000000-count-other": "000 mil M"
                }
            "#,
        )
        .unwrap();
        let spanish = flatten(&spanish_data);
        assert_eq!(
            spanish.as_ref(),
            [
                (
                    3,
                    Count::Other,
                    Pattern {
                        index: 0,
                        exponent: 3,
                        literal_text: Cow::Borrowed(" mil")
                    }
                ),
                (
                    6,
                    Count::Other,
                    Pattern {
                        index: 0,
                        exponent: 6,
                        literal_text: Cow::Borrowed(" M")
                    }
                ),
                (
                    10,
                    Count::Other,
                    Pattern {
                        index: 0,
                        exponent: 9,
                        literal_text: Cow::Borrowed(" mil M")
                    }
                ),
            ]
        );
    }

    #[test]
    fn test_pattern_syntax_errors() {
        // An unquoted FULL STOP is a pattern symbol; a quoted one is literal.
        assert_eq!(
            parse("M.").err().unwrap(),
            "Unsupported symbol in compact decimal pattern M."
        );
        assert_eq!(parse("M'.'").unwrap().unwrap().literal_text, "M.");
        // Likewise for a second run of 0s.
        assert_eq!(
            parse("0 0").err().unwrap(),
            "Multiple placeholders in compact decimal pattern 0 0"
        );
        assert_eq!(parse("0 '0'").unwrap().unwrap().literal_text, " 0");
        // 127 0s is the longest accepted placeholder.
        let zeros = str::repeat("0", 256);
        let too_many_zeros = &zeros[..128];
        let just_enough_zeros = &zeros[..127];
        assert_eq!(
            parse(too_many_zeros).err().unwrap(),
            String::from("Too many 0s in pattern ") + too_many_zeros
        );
        assert_eq!(parse(just_enough_zeros).unwrap().unwrap().literal_text, "");
    }

    #[test]
    fn test_inter_pattern_errors() {
        assert_eq!(
            convert(r#"{ "1000-count-other": "0k", "1000-count-other": "0K" }"#)
                .err()
                .unwrap(),
            "Plural case Other is duplicated for type 10^3"
        );

        assert_eq!(
            convert(r#"{ "1-count-one": "0" }"#).err().unwrap(),
            "Missing other case for type 10^0"
        );

        assert_eq!(
            convert(r#"{ "1000-count-one": "0k", "1000-count-other": "0" }"#)
                .err()
                .unwrap(),
            "Non-0 pattern for type 10^3 whose pattern for count=other is 0"
        );

        assert_eq!(
            convert(r#"{ "1000-count-other": "k" }"#).err().unwrap(),
            "Missing placeholder in other case of type 10^3"
        );

        // Given this data, it is ambiguous whether the 10 000 should be formatted as 10 thousand or 1 myriad.
        assert_eq!(
            convert(
                r#"
                        {
                            "10000-count-other": "00 thousand",
                            "10000-count-one": "0 myriad"
                        }
                    "#,
            )
            .err()
            .unwrap(),
            "Inconsistent placeholders within type 10^4: 2 0s for other, 1 0s for One"
        );

        assert_eq!(
            convert(r#"{ "1000-count-other": "00000 tenths" }"#)
                .err()
                .unwrap(),
            "Too many 0s in type 10^3, (5, implying nonpositive exponent c=-1)"
        );

        // The placeholder of long_pattern sits at index 254, the largest
        // accepted value; that of overlong_pattern is one further.
        let long_pattern = format!("thous{}nds (0)", str::repeat("a", 244));
        let overlong_pattern = format!("thous{}nds (0)", str::repeat("a", 245));

        assert_eq!(
            convert(&format!(
                r#"{{ "1000-count-other": "{}" }}"#,
                overlong_pattern
            ))
            .err()
            .unwrap(),
            "Placeholder index is too large in type=10^3, count=Other"
        );

        assert_eq!(
            convert(&format!(r#"{{ "1000-count-other": "{}" }}"#, long_pattern))
                .unwrap()
                .patterns
                .get0(&3)
                .and_then(|plural_map| plural_map.get1(&Count::Other))
                .unwrap()
                .index,
            254
        );
    }
}