zerotrie 0.2.4

A data structure that efficiently maps strings to integers
Documentation
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use itertools::Either;
use itertools::Itertools;
use rand::Rng;
use rand::SeedableRng;
use std::collections::BTreeMap;
use zerotrie::dense::ZeroAsciiDenseSparse2dTrieBorrowed;
use zerotrie::dense::ZeroAsciiDenseSparse2dTrieOwned;
use zerotrie::dense::ZeroAsciiDenseSparse2dTrieVarULE;
use zerotrie::ZeroTrieSimpleAscii;
use zerovec::VarZeroCow;

mod testdata {
    include!("data/data.rs");
}

const BASIC_DATA: &[&str] = &[
    "ar/FR", "ar/IR", "ar/SA", "ar/UK", "ar/US", "en/AU", "en/FR", "en/UK", "en/US", "fr/FR",
    "fr/SA", "fr/UK", "fr/US", "it/IT",
];

const NON_EXISTENT_BASIC_DATA: &[&str] = &["ar/IT", "en/ZA", "it/FR", "zh/UK"];

fn strings_to_2d_btreemap<'a>(
    strings: &[&'a str],
    delimiter: &str,
) -> BTreeMap<&'a str, BTreeMap<&'a str, usize>> {
    let mut map = BTreeMap::<&str, BTreeMap<&str, usize>>::default();
    for (value, (prefix, suffix)) in strings
        .iter()
        .flat_map(|s| s.split_once(delimiter))
        .enumerate()
    {
        map.entry(prefix).or_default().insert(suffix, value);
    }
    map
}

fn check_data(
    data: &BTreeMap<&str, BTreeMap<&str, usize>>,
    trie: ZeroAsciiDenseSparse2dTrieBorrowed,
    polarity: bool,
) {
    for (prefix, values) in data.iter() {
        for (suffix, value) in values.iter() {
            if polarity {
                assert_eq!(trie.get(prefix, suffix), Some(*value), "{prefix}/{suffix}");
            } else {
                assert_eq!(trie.get(prefix, suffix), None, "{prefix}/{suffix}");
            }
        }
    }
}

#[must_use]
fn check_encoding(trie: ZeroAsciiDenseSparse2dTrieBorrowed) -> usize {
    let ule =
        VarZeroCow::<ZeroAsciiDenseSparse2dTrieVarULE>::from_encodeable(&trie.as_encodeable());
    let decoded = ZeroAsciiDenseSparse2dTrieBorrowed::from(&*ule);
    assert_eq!(decoded, trie);
    ule.as_bytes().len()
}

#[must_use]
fn make_simple_ascii_trie(
    data: &BTreeMap<&str, BTreeMap<&str, usize>>,
) -> ZeroTrieSimpleAscii<Vec<u8>> {
    let mut data_for_simple_ascii_trie = BTreeMap::new();
    for (prefix, values) in data.iter() {
        for (suffix, value) in values.iter() {
            data_for_simple_ascii_trie.insert(format!("{prefix}/{suffix}"), *value);
        }
    }
    ZeroTrieSimpleAscii::try_from_btree_map_str(&data_for_simple_ascii_trie).unwrap()
}

#[test]
fn test_basic() {
    let data_as_map = strings_to_2d_btreemap(BASIC_DATA, "/");
    let dense =
        ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data_as_map, b'/').unwrap();
    check_data(&data_as_map, dense.as_borrowed(), true);
    let non_existent_data_as_map = strings_to_2d_btreemap(NON_EXISTENT_BASIC_DATA, "/");
    check_data(&non_existent_data_as_map, dense.as_borrowed(), false);
    let byte_len = check_encoding(dense.as_borrowed());
    assert_eq!(byte_len, 106);
    let simple_trie = make_simple_ascii_trie(&data_as_map);
    assert_eq!(simple_trie.byte_len(), 71);
}

#[test]
fn test_locales_with_aux() {
    let data_as_map = strings_to_2d_btreemap(testdata::locales_with_aux::STRINGS, "-x-");
    let dense =
        ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data_as_map, b'/').unwrap();
    check_data(&data_as_map, dense.as_borrowed(), true);
    let byte_len = check_encoding(dense.as_borrowed());
    assert_eq!(byte_len, 4045);
    let simple_trie = make_simple_ascii_trie(&data_as_map);
    assert_eq!(simple_trie.byte_len(), 4614);
}

#[test]
fn test_short_subtags() {
    let (prefixes, suffixes): (Vec<&str>, Vec<&str>) = testdata::short_subtags::STRINGS
        .iter()
        .copied()
        .partition_map(|s| match s.strip_prefix("und-") {
            Some(suffix) => Either::Right(suffix),
            None => Either::Left(s),
        });
    assert_eq!(prefixes.len(), 1427);
    assert_eq!(suffixes.len(), 450);
    let mut counter = 0;
    let mut rng = rand_pcg::Lcg64Xsh32::seed_from_u64(2025);
    let mut data: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();
    for prefix in prefixes.iter() {
        // Two types of keys: big and small. Pick at random with a certain chance of big.
        let is_big = rng.random::<f64>() < 0.3;
        for suffix in suffixes.iter() {
            // Include the suffix with a probability dependent on is_big.
            let include_value = rng.random::<f64>() < if is_big { 0.8 } else { 0.05 };
            if include_value {
                data.entry(prefix).or_default().insert(suffix, counter);
                counter += 1;
            }
        }
    }
    assert_eq!(data.values().map(|m| m.len()).sum::<usize>(), 175306);
    let dense = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();
    check_data(&data, dense.as_borrowed(), true);
    let byte_len = check_encoding(dense.as_borrowed());
    assert_eq!(byte_len, 555240);
    let simple_trie = make_simple_ascii_trie(&data);
    assert_eq!(simple_trie.byte_len(), 1099634);
}

#[test]
fn test_dense_sparse_window_selection() {
    //Make initial Btree (not using enumerate)
    let row_width = usize::from(u16::MAX); // Densetype max
    let far_low = 0;
    let cluster_start = 50;
    let far_high = cluster_start + row_width + 100;

    let mut inner = BTreeMap::new();
    inner.insert("low", far_low);
    inner.insert("a", 50); // cluster_start
    inner.insert("c", 53); // cluster_start + 3
    inner.insert("d", cluster_start + row_width - 3);
    inner.insert("e", cluster_start + row_width - 2);
    inner.insert("f", cluster_start + row_width - 1);
    inner.insert("c2", 53); // cluster_start + 3
    inner.insert("g", cluster_start + row_width);
    inner.insert("h", cluster_start + row_width + 1);
    inner.insert("b", 52); // cluster_start + 2
    inner.insert("high", far_high);
    inner.insert("c3", 53); // cluster_start + 3
    inner.insert("low2", far_low);

    let mut data = BTreeMap::new();
    data.insert("p", inner);

    // Build the 2d trie.
    let dense = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();
    let trie = dense.as_borrowed();

    assert_eq!(
        format!("{:?}", trie),
        "ZeroAsciiDenseSparse2dTrieBorrowed { primary: ZeroTrieSimpleAscii { store: [112, 128, 47, 144, 36, 195, 97, 104, 108, 2, 8, 144, 34, 105, 103, 104, 147, 241, 5, 111, 119, 128, 50, 128] }, suffixes: ZeroTrieSimpleAscii { store: [201, 97, 98, 99, 100, 101, 102, 103, 104, 108, 1, 2, 9, 10, 11, 12, 13, 18, 128, 129, 130, 194, 50, 51, 1, 131, 132, 133, 134, 135, 136, 137, 105, 103, 104, 138, 111, 119, 139, 50, 140] }, dense: ZeroVec([65535, 0, 1, 1, 1, 65530, 65531, 65532, 65533, 65534, 65535, 65535, 65535]), suffix_count: 13, delimiter: 47 }"
    );

    check_data(&data, trie, true);

    let byte_len = check_encoding(dense.as_borrowed());
    assert_eq!(byte_len, 102);
    let simple_trie = make_simple_ascii_trie(&data);
    assert_eq!(simple_trie.byte_len(), 60);
}

#[test]
fn test_dense_suffix_count_with_low_frequency_suffixes() {
    // Create a dataset where many suffixes appear in only one prefix.
    // This should result in fewer dense suffixes than if we included all.

    let mut data: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();

    // Common suffixes that appear in multiple prefixes
    data.entry("en").or_default().insert("US", 1);
    data.entry("en").or_default().insert("GB", 2);
    data.entry("en").or_default().insert("CA", 3);

    data.entry("fr").or_default().insert("FR", 4);
    data.entry("fr").or_default().insert("CA", 5);
    data.entry("fr").or_default().insert("BE", 6);

    data.entry("de").or_default().insert("DE", 7);
    data.entry("de").or_default().insert("AT", 8);
    data.entry("de").or_default().insert("CH", 9);

    // Low-frequency suffixes (appear in only one prefix each)
    data.entry("en").or_default().insert("rare1", 100);
    data.entry("fr").or_default().insert("rare2", 101);
    data.entry("de").or_default().insert("rare3", 102);
    data.entry("es").or_default().insert("rare4", 103);
    data.entry("es").or_default().insert("ES", 104);

    let trie = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();

    assert_eq!(
        format!("{:?}", trie.as_borrowed()),
        "ZeroAsciiDenseSparse2dTrieBorrowed { primary: ZeroTrieSimpleAscii { store: [195, 100, 101, 102, 22, 61, 101, 47, 196, 65, 67, 68, 114, 2, 4, 6, 84, 136, 72, 137, 69, 135, 97, 114, 101, 51, 144, 86, 194, 110, 115, 21, 47, 196, 67, 71, 85, 114, 2, 4, 6, 65, 131, 66, 130, 83, 129, 97, 114, 101, 49, 144, 84, 47, 194, 69, 114, 3, 83, 144, 88, 97, 114, 101, 52, 144, 87, 114, 47, 196, 66, 67, 70, 114, 2, 4, 6, 69, 134, 65, 133, 82, 132, 97, 114, 101, 50, 144, 85] }, suffixes: ZeroTrieSimpleAscii { store: [201, 65, 66, 67, 68, 69, 70, 71, 85, 114, 2, 4, 10, 12, 14, 16, 18, 20, 84, 128, 69, 129, 194, 65, 72, 1, 130, 131, 69, 132, 83, 133, 82, 134, 66, 135, 83, 136, 97, 114, 101, 196, 49, 50, 51, 52, 1, 2, 3, 137, 138, 139, 140] }, dense: ZeroVec([]), suffix_count: 13, delimiter: 47 }"
    );

    check_data(&data, trie.as_borrowed(), true);

    // Non-existent lookups
    let mut non_existent: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();
    non_existent.entry("en").or_default().insert("FR", 0);
    non_existent
        .entry("nonexistent")
        .or_default()
        .insert("US", 0);
    check_data(&non_existent, trie.as_borrowed(), false);
}

#[test]
fn test_empty_dense_when_no_suffix_meets_threshold() {
    // Create a dataset where NO suffix appears in enough prefixes to meet threshold.
    // Dense matrix should be empty, all lookups use sparse path.

    let mut data: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();

    // Each suffix appears in only one prefix (0% overlap)
    data.entry("a").or_default().insert("s1", 1);
    data.entry("b").or_default().insert("s2", 2);
    data.entry("c").or_default().insert("s3", 3);

    let trie = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();

    assert_eq!(
        format!("{:?}", trie.as_borrowed()),
        "ZeroAsciiDenseSparse2dTrieBorrowed { primary: ZeroTrieSimpleAscii { store: [195, 97, 98, 99, 4, 8, 47, 115, 49, 129, 47, 115, 50, 130, 47, 115, 51, 131] }, suffixes: ZeroTrieSimpleAscii { store: [115, 195, 49, 50, 51, 1, 2, 128, 129, 130] }, dense: ZeroVec([]), suffix_count: 3, delimiter: 47 }"
    );

    check_data(&data, trie.as_borrowed(), true);

    // Non-existent
    let mut non_existent: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();
    non_existent.entry("a").or_default().insert("s2", 0);
    non_existent.entry("b").or_default().insert("s3", 0);
    check_data(&non_existent, trie.as_borrowed(), false);
}

#[test]
fn test_empty_input() {
    let data: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();

    let trie = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();

    check_data(&data, trie.as_borrowed(), true);
}

#[test]
fn test_lookup_semantics_remain_correct() {
    // Verify that both dense and sparse suffixes return correct values

    let mut data: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();

    // High-frequency suffix (will be dense)
    for i in 0..10 {
        let prefix = format!("p{}", i);
        data.entry(prefix.leak() as &str)
            .or_default()
            .insert("common", 100 + i);
    }

    // Low-frequency suffix (will be sparse)
    data.entry("p0").or_default().insert("rare", 999);

    let trie = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/').unwrap();

    assert_eq!(
        format!("{:?}", trie.as_borrowed()),
        "ZeroAsciiDenseSparse2dTrieBorrowed { primary: ZeroTrieSimpleAscii { store: [112, 202, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 4, 8, 12, 16, 20, 24, 28, 32, 36, 128, 47, 144, 84, 129, 47, 144, 85, 130, 47, 144, 86, 131, 47, 144, 87, 132, 47, 144, 88, 133, 47, 144, 89, 134, 47, 144, 90, 135, 47, 144, 91, 136, 47, 144, 92, 137, 47, 144, 93] }, suffixes: ZeroTrieSimpleAscii { store: [194, 99, 114, 6, 111, 109, 109, 111, 110, 128, 97, 114, 101, 129] }, dense: ZeroVec([0, 899, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535, 0, 65535]), suffix_count: 2, delimiter: 47 }"
    );

    check_data(&data, trie.as_borrowed(), true);

    // Non-existent combinations
    let mut non_existent: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();
    non_existent.entry("p1").or_default().insert("rare", 0);
    non_existent.entry("p99").or_default().insert("common", 0);
    check_data(&non_existent, trie.as_borrowed(), false);
}

#[test]
fn test_delimiter_validation() {
    let mut data: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();

    // Try to use delimiter in suffix
    data.entry("test").or_default().insert("bad/suffix", 1);

    let result = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data, b'/');
    assert!(result.is_err());

    // Try to use delimiter in prefix
    let mut data2: BTreeMap<&str, BTreeMap<&str, usize>> = BTreeMap::new();
    data2.entry("bad/prefix").or_default().insert("ok", 1);

    let result2 = ZeroAsciiDenseSparse2dTrieOwned::try_from_btree_map_str(&data2, b'/');
    assert!(result2.is_err());
}