simd-normalizer 0.1.1

SIMD-accelerated Unicode normalization (NFC, NFD, NFKC, NFKD)
Documentation
//! Two-level CodePointTrie optimized for BMP fast-path and sequential access.
//!
//! BMP lookup: `data[bmp_index[cp >> 5] + (cp & 0x1F)]` -- 2 array accesses, no branching.
//! Supplementary lookup: 3-level index hierarchy, cold path.

/// Shift for BMP index blocks: each `bmp_index` entry covers `1 << 5` = 32
/// code points, so a block index is `cp >> BMP_SHIFT`.
const BMP_SHIFT: u32 = 5;
/// Mask for the offset within a BMP block (`cp & BMP_MASK` selects the slot).
const BMP_MASK: u32 = (1 << BMP_SHIFT) - 1; // 0x1F
/// Number of data-array entries per BMP index block (32).
/// Currently unused at runtime; kept for table generators / future callers.
#[allow(dead_code)]
pub(crate) const BMP_BLOCK_SIZE: usize = 1 << BMP_SHIFT;

/// Shift for supplementary index level 1 (top-level partitioning):
/// each `supp_index1` entry covers `1 << 11` = 2048 code points.
const SUPP_SHIFT_1: u32 = 11;
/// Shift for supplementary index level 2 (mid-level partitioning):
/// each `supp_index2` entry covers `1 << 5` = 32 code points.
const SUPP_SHIFT_2: u32 = 5;
/// Mask for the supplementary level-2 offset within a level-1 span
/// (64 level-2 entries per level-1 entry).
const SUPP_MASK_2: u32 = (1 << (SUPP_SHIFT_1 - SUPP_SHIFT_2)) - 1; // 0x3F
/// Mask for the offset within a supplementary data block (32 entries).
const SUPP_MASK_DATA: u32 = (1 << SUPP_SHIFT_2) - 1; // 0x1F

/// A two-level code point trie with BMP fast-path and supplementary 3-level index.
///
/// All data is `&'static` -- the trie is zero-cost to construct and copy.
///
/// Invariants (upheld by the table generator, relied on by the `_unchecked`
/// lookup paths): `bmp_index` has 2048 entries covering all of U+0000..U+FFFF,
/// and every stored block pointer plus a 32-entry offset stays inside `data`.
#[derive(Clone, Copy)]
pub(crate) struct CodePointTrie {
    /// Block pointers for BMP (U+0000..U+FFFF).
    /// 2048 entries (65536 >> 5), each an absolute offset into `data`.
    pub(crate) bmp_index: &'static [u16],
    /// Trie values -- both BMP and supplementary data blocks live here.
    pub(crate) data: &'static [u32],
    /// Level-1 index for supplementary code points (U+10000..U+10FFFF);
    /// entries are offsets into `supp_index2`.
    pub(crate) supp_index1: &'static [u16],
    /// Level-2 index for supplementary code points; entries are absolute
    /// offsets into `data`.
    pub(crate) supp_index2: &'static [u16],
    /// Default value returned for unmapped / out-of-range code points.
    pub(crate) default_value: u32,
}

impl CodePointTrie {
    /// Look up the trie value for a code point.
    ///
    /// Dispatches to the branch-free BMP fast path for `cp < 0x10000`, the
    /// cold supplementary path for the rest of the Unicode range, and falls
    /// back to `default_value` for anything beyond U+10FFFF.
    #[inline]
    pub(crate) fn get(&self, cp: u32) -> u32 {
        match cp {
            // SAFETY: cp < 0x10000, and our generated bmp_index has exactly
            // 2048 entries covering the full BMP range. The table generator
            // guarantees all block pointers produce valid data indices.
            0..=0xFFFF => unsafe { self.get_bmp_unchecked(cp) },
            0x10000..=0x10FFFF => self.get_supplementary(cp),
            _ => self.default_value,
        }
    }

    /// BMP-only lookup path. Two array accesses, no branching.
    #[allow(dead_code)]
    #[inline(always)]
    fn get_bmp(&self, cp: u32) -> u32 {
        debug_assert!(cp < 0x10000);
        let base = self.bmp_index[(cp >> BMP_SHIFT) as usize] as usize;
        self.data[base + (cp & BMP_MASK) as usize]
    }

    /// BMP-only lookup without bounds checks. Use when `cp` is known < 0x10000
    /// and the trie is guaranteed well-formed (generated tables).
    #[inline(always)]
    pub(crate) unsafe fn get_bmp_unchecked(&self, cp: u32) -> u32 {
        debug_assert!(cp < 0x10000);
        let block = (cp >> BMP_SHIFT) as usize;
        let slot = (cp & BMP_MASK) as usize;
        // SAFETY: caller guarantees cp < 0x10000, so block < 2048 is within
        // bmp_index; the generated block pointer plus a 32-entry slot stays
        // inside data.
        let base = unsafe { *self.bmp_index.get_unchecked(block) } as usize;
        unsafe { *self.data.get_unchecked(base + slot) }
    }

    /// Pipelined 2-codepoint BMP lookup: stage-1 load of cp1 is issued while
    /// stage-2 of cp0 is in flight, letting the out-of-order engine hide one
    /// L2 miss per pair. Bit-identical to two serial `get_bmp_unchecked` calls.
    ///
    /// # Safety
    /// Both `cp0` and `cp1` must be `< 0x10000`, and the trie tables must be
    /// well-formed (guaranteed for generated tables).
    #[allow(dead_code)]
    // retained as a primitive for future pipelining work
    #[inline(always)]
    pub(crate) unsafe fn get_two_bmp_pipelined_unchecked(&self, cp0: u32, cp1: u32) -> [u32; 2] {
        debug_assert!(cp0 < 0x10000 && cp1 < 0x10000);
        let blocks = [(cp0 >> BMP_SHIFT) as usize, (cp1 >> BMP_SHIFT) as usize];
        let slots = [(cp0 & BMP_MASK) as usize, (cp1 & BMP_MASK) as usize];
        // Issue both stage-1 loads back-to-back so they overlap.
        // SAFETY: caller guarantees both code points are BMP; table invariants
        // make every derived index in-bounds (same argument as
        // `get_bmp_unchecked`).
        let bases = unsafe {
            [
                *self.bmp_index.get_unchecked(blocks[0]) as usize,
                *self.bmp_index.get_unchecked(blocks[1]) as usize,
            ]
        };
        // bmp_index stores absolute data offsets, so final index is base + offset
        // (NOT base * BMP_BLOCK_SIZE + offset). Matches `get_bmp_unchecked`.
        unsafe {
            [
                *self.data.get_unchecked(bases[0] + slots[0]),
                *self.data.get_unchecked(bases[1] + slots[1]),
            ]
        }
    }

    /// Supplementary lookup path (U+10000..U+10FFFF).
    ///
    /// Fully bounds-checked: any out-of-range index at any of the three
    /// levels yields `default_value` instead of panicking.
    #[cold]
    #[inline(never)]
    fn get_supplementary(&self, cp: u32) -> u32 {
        debug_assert!((0x10000..=0x10FFFF).contains(&cp));
        let rel = cp - 0x10000;
        // Walk the three levels; `None` at any step means "unmapped".
        let walk = || -> Option<u32> {
            let l1 = *self.supp_index1.get((rel >> SUPP_SHIFT_1) as usize)? as usize;
            let i2 = ((rel >> SUPP_SHIFT_2) & SUPP_MASK_2) as usize;
            let l2 = *self.supp_index2.get(l1 + i2)? as usize;
            self.data.get(l2 + (rel & SUPP_MASK_DATA) as usize).copied()
        };
        walk().unwrap_or(self.default_value)
    }

    /// Fast supplementary lookup without bounds checks.
    ///
    /// # Safety
    /// `cp` must be in U+10000..U+10FFFF, and the trie tables must be well-formed
    /// (guaranteed for generated tables).
    #[inline(always)]
    pub(crate) unsafe fn get_supplementary_unchecked(&self, cp: u32) -> u32 {
        debug_assert!((0x10000..=0x10FFFF).contains(&cp));
        let rel = cp - 0x10000;
        // SAFETY: caller guarantees 0x10000 <= cp <= 0x10FFFF; for generated
        // tables every index derived below is in-bounds at each level.
        unsafe {
            let l1 = *self.supp_index1.get_unchecked((rel >> SUPP_SHIFT_1) as usize) as usize;
            let i2 = ((rel >> SUPP_SHIFT_2) & SUPP_MASK_2) as usize;
            let l2 = *self.supp_index2.get_unchecked(l1 + i2) as usize;
            *self.data.get_unchecked(l2 + (rel & SUPP_MASK_DATA) as usize)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// BMP index: four identity blocks for ASCII, one CJK block at 0x270,
    /// everything else pointing at the shared default block (data[128]).
    static TEST_BMP_INDEX: [u16; 2048] = {
        let mut idx = [128u16; 2048]; // default block at data[128]
        // U+0000..U+007F -> data[0], data[32], data[64], data[96].
        let mut b = 0usize;
        while b < 4 {
            idx[b] = (b * 32) as u16;
            b += 1;
        }
        idx[0x270] = 160; // U+4E00..U+4E1F -> data[160]
        idx
    };

    /// Data array: identity values for ASCII, zeros for the default block,
    /// distinctive markers for the CJK and supplementary blocks.
    static TEST_DATA: [u32; 224] = {
        let mut d = [0u32; 224];
        // data[0..128]: identity mapping (covers the four ASCII blocks).
        let mut n = 0usize;
        while n < 128 {
            d[n] = n as u32;
            n += 1;
        }
        // CJK block at data[160..192], supplementary block at data[192..224].
        let mut n = 0usize;
        while n < 32 {
            d[160 + n] = 0xC000 + n as u32;
            d[192 + n] = 0xE000 + n as u32;
            n += 1;
        }
        d
    };

    static TEST_SUPP_INDEX1: [u16; 528] = {
        let mut idx = [64u16; 528]; // null block in supp_index2
        idx[0] = 0; // first L1 entry -> supp_index2[0]
        idx
    };

    static TEST_SUPP_INDEX2: [u16; 128] = {
        let mut idx = [128u16; 128]; // default data block
        idx[0] = 192; // -> data[192] (supplementary test block)
        idx
    };

    /// Build a trie over the static test tables with default value 0.
    fn test_trie() -> CodePointTrie {
        CodePointTrie {
            bmp_index: &TEST_BMP_INDEX,
            data: &TEST_DATA,
            supp_index1: &TEST_SUPP_INDEX1,
            supp_index2: &TEST_SUPP_INDEX2,
            default_value: 0,
        }
    }

    #[test]
    fn test_ascii_lookup() {
        let trie = test_trie();
        // ASCII maps to itself: NUL, 'A', 'a', DEL.
        for cp in [0x00u32, 0x41, 0x61, 0x7F] {
            assert_eq!(trie.get(cp), cp);
        }
    }

    #[test]
    fn test_bmp_cjk_lookup() {
        let trie = test_trie();
        for (cp, want) in [(0x4E00u32, 0xC000u32), (0x4E01, 0xC001), (0x4E1F, 0xC01F)] {
            assert_eq!(trie.get(cp), want);
        }
    }

    #[test]
    fn test_supplementary_lookup() {
        let trie = test_trie();
        for (cp, want) in [(0x10000u32, 0xE000u32), (0x10001, 0xE001), (0x1001F, 0xE01F)] {
            assert_eq!(trie.get(cp), want);
        }
    }

    #[test]
    fn test_unmapped_returns_default() {
        let trie = test_trie();
        // Unmapped BMP, unmapped supplementary, and past-Unicode inputs
        // all hit the default value.
        for cp in [0x0100u32, 0x100000, 0x110000, 0xFFFFFFFF] {
            assert_eq!(trie.get(cp), 0);
        }
    }

    #[test]
    fn test_bmp_boundary() {
        let trie = test_trie();
        // Last BMP code point is unmapped; first supplementary is mapped.
        assert_eq!(trie.get(0xFFFF), 0);
        assert_eq!(trie.get(0x10000), 0xE000);
    }

    #[test]
    fn test_all_ascii_round_trip() {
        let trie = test_trie();
        for cp in 0u32..=0x7F {
            assert_eq!(trie.get(cp), cp, "mismatch at U+{cp:04X}");
        }
    }

    #[test]
    fn test_supplementary_end_of_range() {
        let trie = test_trie();
        assert_eq!(trie.get(0x10FFFF), 0);
    }

    #[test]
    fn test_get_is_consistent_with_get_bmp() {
        let trie = test_trie();
        // Prime stride samples the BMP without exhaustively walking it.
        for cp in (0u32..0x10000).step_by(997) {
            assert_eq!(trie.get(cp), trie.get_bmp(cp), "mismatch at U+{cp:04X}");
        }
    }
}