pretokie 0.0.4

Fast, zero-allocation pretokenizers for BPE tokenizers
Documentation
//! Shared byte-level utilities.

/// Returns `true` when `b` is an ASCII letter (`a`–`z` or `A`–`Z`).
#[inline(always)]
pub fn is_ascii_letter(b: u8) -> bool {
    b.is_ascii_alphabetic()
}

/// Returns `true` when `b` is an ASCII lowercase letter (`a`–`z`).
#[inline(always)]
pub fn is_lower(b: u8) -> bool {
    b.is_ascii_lowercase()
}

/// Returns `true` when `b` is an ASCII uppercase letter (`A`–`Z`).
#[inline(always)]
pub fn is_upper(b: u8) -> bool {
    b.is_ascii_uppercase()
}

/// Returns `true` when `b` is an ASCII decimal digit (`0`–`9`).
#[inline(always)]
pub fn is_digit(b: u8) -> bool {
    b.is_ascii_digit()
}

/// Decode the UTF-8 sequence at the start of `bytes`.
///
/// Returns the decoded character together with the number of bytes it
/// occupies (1–4). The length is chosen from the lead byte alone and the
/// continuation bytes are masked rather than validated, so the input is
/// expected to be well-formed UTF-8.
///
/// If the assembled value is not a Unicode scalar value (a surrogate, or
/// anything above U+10FFFF), U+FFFD REPLACEMENT CHARACTER is returned with the
/// length implied by the lead byte. This keeps the function sound for
/// arbitrary byte input instead of constructing an invalid `char`.
///
/// # Panics
///
/// Panics if `bytes` is empty or shorter than the length implied by its
/// lead byte.
#[inline(always)]
pub fn decode_utf8(bytes: &[u8]) -> (char, usize) {
    let b0 = bytes[0];
    if b0 < 0x80 {
        return (b0 as char, 1);
    }

    // Payload (low six bits) of the continuation byte at index `i`.
    let cont = |i: usize| u32::from(bytes[i] & 0x3F);
    let lead = u32::from(b0);

    let (code, len) = if b0 < 0xE0 {
        (((lead & 0x1F) << 6) | cont(1), 2)
    } else if b0 < 0xF0 {
        (((lead & 0x0F) << 12) | (cont(1) << 6) | cont(2), 3)
    } else {
        (
            ((lead & 0x07) << 18) | (cont(1) << 12) | (cont(2) << 6) | cont(3),
            4,
        )
    };

    // Checked conversion: malformed input can assemble a surrogate or a value
    // beyond U+10FFFF, neither of which may be stored in a `char`.
    (
        char::from_u32(code).unwrap_or(char::REPLACEMENT_CHARACTER),
        len,
    )
}

/// Letter test intended for pretokenizer patterns that use `\p{L}`.
///
/// A character is accepted when `char::is_alphabetic()` holds and
/// `is_unicode_mark` does not. Filtering out marks matters because the Unicode
/// `Alphabetic` property also covers some combining marks (Mn/Mc), which
/// `\p{L}` excludes.
///
/// NOTE(review): `Alphabetic` also includes some code points that are neither
/// letters nor marks (e.g. letter-numbers); confirm whether those need to be
/// excluded for an exact `\p{L}` match.
#[inline(always)]
pub fn is_unicode_letter(c: char) -> bool {
    if !c.is_alphabetic() {
        return false;
    }
    !is_unicode_mark(c)
}

/// Check if a Unicode character is a combining/spacing mark (M category).
///
/// The test is a hand-maintained table of code point ranges, grouped by script
/// or block. Only the ranges listed below are recognised; the table is not
/// exhaustive (for example, it has no entries for the Gujarati block), so a
/// `false` result does not guarantee that `c` is not a mark.
///
/// Code points that are letters (category Lo) are deliberately kept out of the
/// table so that callers subtracting marks from alphabetic characters do not
/// lose them: the Hangul Jamo leading consonants (U+1100..=U+115F) and the
/// Devanagari avagraha (U+093D) are not treated as marks.
#[inline]
pub fn is_unicode_mark(c: char) -> bool {
    let cp = u32::from(c);
    matches!(cp,
        0x0300..=0x036F |   // Combining Diacritical Marks
        0x0483..=0x0489 |   // Cyrillic combining marks
        // Hebrew
        0x0591..=0x05BD | 0x05BF | 0x05C1..=0x05C2 | 0x05C4..=0x05C5 | 0x05C7 |
        // Arabic
        0x0610..=0x061A | 0x064B..=0x065F | 0x0670 |
        0x06D6..=0x06DC | 0x06DF..=0x06E4 | 0x06E7..=0x06E8 | 0x06EA..=0x06ED |
        // Syriac / Thaana
        0x0711 | 0x0730..=0x074A | 0x07A6..=0x07B0 |
        // Devanagari (U+093D avagraha is a letter, so the range is split around it)
        0x0901..=0x0903 | 0x093A..=0x093C | 0x093E..=0x094F | 0x0951..=0x0957 |
        0x0962..=0x0963 |
        // Bengali
        0x0981..=0x0983 | 0x09BC | 0x09BE..=0x09C4 | 0x09C7..=0x09C8 |
        0x09CB..=0x09CD | 0x09D7 | 0x09E2..=0x09E3 |
        // Gurmukhi
        0x0A01..=0x0A03 | 0x0A3C | 0x0A3E..=0x0A42 | 0x0A47..=0x0A48 |
        0x0A4B..=0x0A4D | 0x0A51 | 0x0A70..=0x0A71 | 0x0A75 |
        // Oriya
        0x0B01..=0x0B03 | 0x0B3C | 0x0B3E..=0x0B44 | 0x0B47..=0x0B48 |
        0x0B4B..=0x0B4D | 0x0B56..=0x0B57 | 0x0B62..=0x0B63 |
        // Tamil
        0x0B82 | 0x0BBE..=0x0BC2 | 0x0BC6..=0x0BC8 | 0x0BCA..=0x0BCD | 0x0BD7 |
        // Telugu
        0x0C01..=0x0C04 | 0x0C3C | 0x0C3E..=0x0C44 | 0x0C46..=0x0C48 | 0x0C4A..=0x0C4D |
        0x0C55..=0x0C56 | 0x0C62..=0x0C63 |
        0x0C81..=0x0C83 | 0x0CBC | 0x0CBE..=0x0CC4 | 0x0CC6..=0x0CC8 |  // Kannada
        0x0CCA..=0x0CCD | 0x0CD5..=0x0CD6 | 0x0CE2..=0x0CE3 |           // Kannada
        0x0D00..=0x0D03 | 0x0D3B..=0x0D3C | 0x0D3E..=0x0D44 | 0x0D46..=0x0D48 |
        0x0D4A..=0x0D4D | 0x0D57 | 0x0D62..=0x0D63 |  // Malayalam
        0x0D81..=0x0D83 | 0x0DCA | 0x0DCF..=0x0DD4 | 0x0DD6 |  // Sinhala
        0x0DD8..=0x0DDF | 0x0DF2..=0x0DF3 |                     // Sinhala
        0x0E31 | 0x0E34..=0x0E3A | 0x0E47..=0x0E4E |  // Thai
        0x0EB1 | 0x0EB4..=0x0EB9 | 0x0EBB..=0x0EBC | 0x0EC8..=0x0ECD |  // Lao
        0x0F18..=0x0F19 | 0x0F35 | 0x0F37 | 0x0F39 | 0x0F3E..=0x0F3F |  // Tibetan
        0x0F71..=0x0F84 | 0x0F86..=0x0F87 | 0x0F8D..=0x0FBC | 0x0FC6 |  // Tibetan
        0x102B..=0x103E | 0x1056..=0x1059 | 0x105E..=0x1060 |  // Myanmar
        0x1062..=0x1064 | 0x1067..=0x106D | 0x1071..=0x1074 |  // Myanmar
        0x1082..=0x108D | 0x108F | 0x109A..=0x109D |            // Myanmar
        0x135D..=0x135F |   // Ethiopic combining marks
        0x1712..=0x1715 | 0x1732..=0x1734 | 0x1752..=0x1753 |  // Philippine
        0x1772..=0x1773 |
        0x17B4..=0x17D3 | 0x17DD |  // Khmer
        0x1920..=0x192B | 0x1930..=0x193B |  // Limbu/Buginese
        0x1A17..=0x1A1B | 0x1A55..=0x1A5E | 0x1A60..=0x1A7C | 0x1A7F |  // Tai Tham
        0x1AB0..=0x1ACE |   // Combining marks extended
        0x1B00..=0x1B04 | 0x1B34..=0x1B44 | 0x1B6B..=0x1B73 |  // Balinese
        0x1B80..=0x1B82 | 0x1BA1..=0x1BAD |  // Sundanese
        0x1BE6..=0x1BF3 |   // Batak
        0x1C24..=0x1C37 |   // Lepcha
        0x1CD0..=0x1CF9 |   // Vedic extensions
        0x1DC0..=0x1DFF |   // Combining marks supplement
        0x20D0..=0x20FF |   // Combining marks for symbols
        0xA802 | 0xA806 | 0xA80B | 0xA823..=0xA827 |  // Syloti Nagri
        0xA880..=0xA881 | 0xA8B4..=0xA8C5 |  // Saurashtra
        0xA8E0..=0xA8F1 |   // Devanagari extended
        0xA926..=0xA92D | 0xA947..=0xA953 |  // Kayah Li/Rejang
        0xA980..=0xA983 | 0xA9B3..=0xA9C0 |  // Javanese
        0xAA29..=0xAA36 | 0xAA43 | 0xAA4C..=0xAA4D |  // Cham
        0xAAB0 | 0xAAB2..=0xAAB4 | 0xAAB7..=0xAAB8 | 0xAABE..=0xAABF | 0xAAC1 |  // Tai Viet
        0xAAEB..=0xAAEF | 0xAAF5..=0xAAF6 |  // Meetei Mayek
        0xFE00..=0xFE0F |   // Variation selectors
        0xFE20..=0xFE2F |   // Combining half marks
        0x10A01..=0x10A03 | 0x10A05..=0x10A06 | 0x10A0C..=0x10A0F |  // Kharoshthi
        0x11000..=0x11002 | 0x11038..=0x11046 |  // Brahmi
        0x11080..=0x11082 | 0x110B0..=0x110BA |  // Kaithi
        0x1D165..=0x1D169 | 0x1D16D..=0x1D172 | 0x1D17B..=0x1D182 | 0x1D185..=0x1D18B |  // Musical symbols
        0x1D1AA..=0x1D1AD |
        0xE0100..=0xE01EF  // Variation selectors supplement
    )
}