pretokie 0.0.1

Fast, zero-allocation pretokenizers for BPE tokenizers
Documentation
//! Shared byte-level utilities.

/// Returns `true` when `b` is an ASCII letter (`A`–`Z` or `a`–`z`).
///
/// Every other byte value, including all bytes `>= 0x80`, yields `false`.
#[inline(always)]
pub fn is_ascii_letter(b: u8) -> bool {
    b.is_ascii_alphabetic()
}

/// Returns `true` when `b` is an ASCII decimal digit (`0`–`9`).
///
/// Every other byte value yields `false`.
#[inline(always)]
pub fn is_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9')
}

/// Decodes the UTF-8 sequence at the start of `bytes`.
///
/// Returns the decoded character together with the number of bytes the
/// sequence occupies (1 to 4). The length is chosen from the lead byte alone:
/// `< 0x80` is 1 byte, `< 0xE0` is 2, `< 0xF0` is 3, anything else is 4.
///
/// Continuation bytes are masked with `0x3F` but not otherwise validated, and
/// overlong encodings are not rejected, so the input is expected to be valid
/// UTF-8. If the assembled value is not a Unicode scalar value (a surrogate,
/// or a value above U+10FFFF), `char::REPLACEMENT_CHARACTER` is returned with
/// the length implied by the lead byte, rather than constructing an invalid
/// `char`.
///
/// # Panics
///
/// Panics if `bytes` is empty or shorter than the length implied by its
/// first byte.
#[inline(always)]
pub fn decode_utf8(bytes: &[u8]) -> (char, usize) {
    let b0 = bytes[0];
    if b0 < 0x80 {
        // ASCII fast path: the byte is the code point.
        return (b0 as char, 1);
    }

    let (cp, len) = if b0 < 0xE0 {
        let cp = ((b0 as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
        (cp, 2)
    } else if b0 < 0xF0 {
        let cp = ((b0 as u32 & 0x0F) << 12)
            | ((bytes[1] as u32 & 0x3F) << 6)
            | (bytes[2] as u32 & 0x3F);
        (cp, 3)
    } else {
        let cp = ((b0 as u32 & 0x07) << 18)
            | ((bytes[1] as u32 & 0x3F) << 12)
            | ((bytes[2] as u32 & 0x3F) << 6)
            | (bytes[3] as u32 & 0x3F);
        (cp, 4)
    };

    // Checked conversion: a 3-byte sequence can assemble a surrogate and a
    // 4-byte sequence can exceed U+10FFFF; neither may become a `char`.
    (
        char::from_u32(cp).unwrap_or(char::REPLACEMENT_CHARACTER),
        len,
    )
}

/// Check if a Unicode character is a combining/spacing mark (M category).
///
/// The test is a hand-written range table over the code point, not a full
/// Unicode property lookup: it covers the blocks listed below only, so marks
/// outside these ranges are reported as `false`.
///
/// Code points that are letters rather than marks are deliberately excluded:
/// U+093D (Devanagari avagraha) and the Hangul leading consonants
/// U+1100..=U+115F all have general category `Lo`.
#[inline]
pub fn is_unicode_mark(c: char) -> bool {
    let cp = c as u32;
    matches!(cp,
        0x0300..=0x036F |   // Combining Diacritical Marks
        0x0483..=0x0489 |   // Cyrillic combining marks
        // Hebrew points and accents
        0x0591..=0x05BD | 0x05BF | 0x05C1..=0x05C2 | 0x05C4..=0x05C5 | 0x05C7 |
        // Arabic
        0x0610..=0x061A | 0x064B..=0x065F | 0x0670 |
        0x06D6..=0x06DC | 0x06DF..=0x06E4 | 0x06E7..=0x06E8 | 0x06EA..=0x06ED |
        // Syriac, Thaana
        0x0711 | 0x0730..=0x074A | 0x07A6..=0x07B0 |
        // Devanagari (U+093D avagraha is a letter, so the range is split around it)
        0x0901..=0x0903 | 0x093A..=0x093C | 0x093E..=0x094F |
        0x0951..=0x0957 | 0x0962..=0x0963 |
        // Bengali
        0x0981..=0x0983 | 0x09BC | 0x09BE..=0x09C4 | 0x09C7..=0x09C8 |
        0x09CB..=0x09CD | 0x09D7 | 0x09E2..=0x09E3 |
        // Gurmukhi
        0x0A01..=0x0A03 | 0x0A3C | 0x0A3E..=0x0A42 | 0x0A47..=0x0A48 |
        0x0A4B..=0x0A4D | 0x0A51 | 0x0A70..=0x0A71 | 0x0A75 |
        // Oriya
        0x0B01..=0x0B03 | 0x0B3C | 0x0B3E..=0x0B44 | 0x0B47..=0x0B48 |
        0x0B4B..=0x0B4D | 0x0B56..=0x0B57 | 0x0B62..=0x0B63 |
        // Tamil
        0x0B82 | 0x0BBE..=0x0BC2 | 0x0BC6..=0x0BC8 | 0x0BCA..=0x0BCD | 0x0BD7 |
        // Telugu
        0x0C01..=0x0C03 | 0x0C3E..=0x0C44 | 0x0C46..=0x0C48 | 0x0C4A..=0x0C4D |
        0x0C55..=0x0C56 | 0x0C62..=0x0C63 |
        // Malayalam
        0x0D02..=0x0D03 | 0x0D3E..=0x0D44 | 0x0D46..=0x0D48 | 0x0D4A..=0x0D4D | 0x0D57 |
        0x0E31 | 0x0E34..=0x0E3A | 0x0E47..=0x0E4E |  // Thai
        0x0EB1 | 0x0EB4..=0x0EB9 | 0x0EBB..=0x0EBC | 0x0EC8..=0x0ECD |  // Lao
        0x1DC0..=0x1DFF |   // Combining marks supplement
        0x20D0..=0x20FF |   // Combining marks for symbols
        0xFE00..=0xFE0F |   // Variation selectors
        0xFE20..=0xFE2F     // Combining half marks
    )
}