zerotrie 0.2.4

A data structure that efficiently maps strings to integers
Documentation
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use super::super::branch_meta::BranchMeta;
use super::super::slice_indices::{self, ByteSliceWithIndices};
use super::store::ConstArrayBuilder;
use super::store::ConstLengthsStack;
use super::store::ConstSlice;
use crate::varint;

/// A low-level builder for [`ZeroTrieSimpleAscii`]. Works in const contexts.
///
/// All methods that grow the trie will panic if the capacity N is not enough.
pub(crate) struct ZeroTrieBuilderConst<const N: usize> {
    data: ConstArrayBuilder<N, u8>,
}

impl<const N: usize> ZeroTrieBuilderConst<N> {
    /// Non-const function that returns the current trie data as a slice.
    #[cfg(feature = "litemap")]
    pub fn as_bytes(&self) -> &[u8] {
        self.data.as_const_slice().as_slice()
    }

    /// Returns the trie data, panicking if the buffer is the wrong size.
    pub const fn build_or_panic(self) -> [u8; N] {
        self.data.const_build_or_panic()
    }

    /// Creates a new empty builder.
    pub const fn new() -> Self {
        Self {
            data: ConstArrayBuilder::new_empty([0; N], N),
        }
    }

    /// Prepends an ASCII node to the front of the builder. Returns the new builder
    /// and the delta in length, which is always 1.
    pub const fn prepend_ascii(&mut self, ascii: u8) -> usize {
        if ascii >= 128 {
            panic!("Non-ASCII not supported in ZeroTrieSimpleAscii");
        }
        self.data.const_push_front_or_panic(ascii);
        1
    }

    /// Prepends a value node to the front of the builder. Returns the new builder
    /// and the delta in length, which depends on the size of the varint.
    pub const fn prepend_value(&mut self, value: usize) -> usize {
        let varint_array = varint::write_varint_meta3(value);
        // Can panic (as documented in class docs):
        self.data
            .const_extend_front_or_panic(varint_array.as_const_slice());
        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
        self.data.const_bitor_assign_or_panic(0, 0b10000000);
        varint_array.len()
    }

    /// Prepends a branch node to the front of the builder. Returns the new builder
    /// and the delta in length, which depends on the size of the varint.
    pub const fn prepend_branch(&mut self, value: usize) -> usize {
        let varint_array = varint::write_varint_meta2(value);
        // Can panic (as documented in type-level docs):
        self.data
            .const_extend_front_or_panic(varint_array.as_const_slice());
        // Shouldn't panic: index 0 is always a valid index, and the array is nonempty now
        self.data.const_bitor_assign_or_panic(0, 0b11000000);
        varint_array.len()
    }

    /// Prepends multiple arbitrary bytes to the front of the builder. Returns the new builder
    /// and the delta in length, which is the length of the slice.
    pub const fn prepend_slice(&mut self, s: ConstSlice<u8>) -> usize {
        let mut i = s.len();
        while i > 0 {
            // Can panic (as documented in type-level docs):
            self.data.const_push_front_or_panic(*s.get_or_panic(i - 1));
            i -= 1;
        }
        s.len()
    }

    /// Prepends multiple zeros to the front of the builder. Returns the new builder.
    pub const fn prepend_n_zeros(&mut self, n: usize) {
        let mut i = 0;
        while i < n {
            // Can panic (as documented in type-level docs):
            self.data.const_push_front_or_panic(0);
            i += 1;
        }
    }

    /// Performs the operation `self[index] |= bits`
    pub const fn bitor_assign_at_or_panic(&mut self, index: usize, bits: u8) {
        self.data.const_bitor_assign_or_panic(index, bits);
    }

    /// Creates a new builder containing the elements in the given slice of key/value pairs.
    ///
    /// `K` is the stack size of the lengths stack. If you get an error such as
    /// `AsciiTrie Builder: Need more stack`, try increasing `K`.
    ///
    /// # Panics
    ///
    /// Panics if the items are not sorted
    pub const fn from_tuple_slice<'a, const K: usize>(items: ByteSliceWithIndices<'a>) -> Self {
        assert!(
            items.is_sorted(),
            "Strings in ZeroTrie constructor are not sorted"
        );
        Self::from_slice_with_indices::<K>(items)
    }

    /// Creates a new builder containing the elements in the given slice of key/value pairs.
    ///
    /// Assumes that the items are sorted. If they are not, unexpected behavior may occur.
    ///
    /// `K` is the stack size of the lengths stack. If you get an error such as
    /// `AsciiTrie Builder: Need more stack`, try increasing `K`.
    pub const fn from_slice_with_indices<const K: usize>(items: ByteSliceWithIndices) -> Self {
        let mut result = Self::new();
        let total_size = result.create_or_panic::<K>(items);
        debug_assert!(total_size == result.data.len());
        result
    }

    /// The actual builder algorithm. For an explanation, see [`crate::builder`].
    #[must_use]
    #[expect(
        clippy::indexing_slicing,
        reason = "all call sites are well-commented and will not panic"
    )]
    const fn create_or_panic<const K: usize>(&mut self, all_items: ByteSliceWithIndices) -> usize {
        let mut prefix_len = match all_items.last() {
            Some(x) => x.0.len(),
            // Empty slice:
            None => return 0,
        };
        // Initialize the main loop to point at the last string.
        let mut lengths_stack = ConstLengthsStack::<K>::new();
        let mut i = all_items.len() - 1;
        let mut j = all_items.len();
        let mut current_len = 0;
        // Start the main loop.
        loop {
            let item_i = all_items.get_or_panic(i);
            let item_j = all_items.get_or_panic(j - 1);
            debug_assert!(slice_indices::prefix_eq_or_panic(
                item_i.0, item_j.0, prefix_len
            ));
            // Check if we need to add a value node here.
            if item_i.0.len() == prefix_len {
                current_len += self.prepend_value(item_i.1);
            }
            if prefix_len == 0 {
                // All done! Leave the main loop.
                break;
            }
            // Reduce the prefix length by 1 and recalculate i and j.
            prefix_len -= 1;
            let mut new_i = i;
            let mut new_j = j;
            // `prefix_len` is guaranteed to be in bounds for `item_i` since it is
            // instantiated and updated based on the length of `item_i`, and then only decremented.
            // This is a bit harder to prove for `item_j`, but it is a loop invariant
            // TODO(sffc): Consider proving this or perhaps using a checked get.
            let mut ascii_i = item_i.0[prefix_len];
            let mut ascii_j = item_j.0[prefix_len];
            debug_assert!(ascii_i == ascii_j);
            let key_ascii = ascii_i;
            loop {
                if new_i == 0 {
                    break;
                }
                let candidate = all_items.get_or_panic(new_i - 1).0;
                if candidate.len() < prefix_len {
                    // Too short
                    break;
                }
                if slice_indices::prefix_eq_or_panic(item_i.0, candidate, prefix_len) {
                    new_i -= 1;
                } else {
                    break;
                }

                if candidate.len() == prefix_len {
                    // A string that equals the prefix does not take part in the branch node.
                    break;
                }
                let candidate = candidate[prefix_len];
                if candidate != ascii_i {
                    ascii_i = candidate;
                }
            }
            loop {
                if new_j == all_items.len() {
                    break;
                }
                let candidate = all_items.get_or_panic(new_j).0;
                if candidate.len() < prefix_len {
                    // Too short
                    break;
                }
                if slice_indices::prefix_eq_or_panic(item_j.0, candidate, prefix_len) {
                    new_j += 1;
                } else {
                    break;
                }
                if candidate.len() == prefix_len {
                    panic!("A shorter string should be earlier in the sequence");
                }
                let candidate = candidate[prefix_len];
                if candidate != ascii_j {
                    ascii_j = candidate;
                }
            }
            // If there are no different bytes at this prefix level, we can add an ASCII or Span
            // node and then continue to the next iteration of the main loop.
            if ascii_i == key_ascii && ascii_j == key_ascii {
                current_len += self.prepend_ascii(ascii_i);
                debug_assert!(i == new_i || i == new_i + 1);
                i = new_i;
                debug_assert!(j == new_j);
                continue;
            }
            // If i and j changed, we are a target of a branch node.
            if ascii_j == key_ascii {
                // We are the _last_ target of a branch node.
                lengths_stack.push_or_panic(BranchMeta {
                    ascii: key_ascii,
                    cumulative_length: current_len,
                    local_length: current_len,
                    count: 1,
                });
            } else {
                // We are the _not the last_ target of a branch node.
                let BranchMeta {
                    cumulative_length,
                    count,
                    ..
                } = lengths_stack.peek_or_panic();
                lengths_stack.push_or_panic(BranchMeta {
                    ascii: key_ascii,
                    cumulative_length: cumulative_length + current_len,
                    local_length: current_len,
                    count: count + 1,
                });
            }
            if ascii_i != key_ascii {
                // We are _not the first_ target of a branch node.
                // Set the cursor to the previous string and continue the loop.
                j = i;
                i -= 1;
                prefix_len = all_items.get_or_panic(i).0.len();
                current_len = 0;
                continue;
            }
            // Branch (first)
            let (total_length, total_count) = {
                let BranchMeta {
                    cumulative_length,
                    count,
                    ..
                } = lengths_stack.peek_or_panic();
                (cumulative_length, count)
            };
            let branch_metas = lengths_stack.pop_many_or_panic(total_count);
            let original_keys = branch_metas.map_to_ascii_bytes();
            // Write out the offset table
            current_len = total_length;
            let w = (usize::BITS as usize - (total_length.leading_zeros() as usize) - 1) / 8;
            if w > 3 {
                panic!("ZeroTrie capacity exceeded");
            }
            let mut k = 0;
            while k <= w {
                self.prepend_n_zeros(total_count - 1);
                current_len += total_count - 1;
                let mut l = 0;
                let mut length_to_write = 0;
                while l < total_count {
                    let BranchMeta { local_length, .. } = *branch_metas
                        .as_const_slice()
                        .get_or_panic(total_count - l - 1);
                    let mut adjusted_length = length_to_write;
                    let mut m = 0;
                    while m < k {
                        adjusted_length >>= 8;
                        m += 1;
                    }
                    if l > 0 {
                        self.bitor_assign_at_or_panic(l - 1, adjusted_length as u8);
                    }
                    l += 1;
                    length_to_write += local_length;
                }
                k += 1;
            }
            // Write out the lookup table
            assert!(0 < total_count && total_count <= 256);
            let branch_value = (w << 8) + (total_count & 0xff);
            current_len += self.prepend_slice(original_keys.as_const_slice());
            current_len += self.prepend_branch(branch_value);
            i = new_i;
            j = new_j;
        }
        assert!(lengths_stack.is_empty());
        current_len
    }
}