iri-string 0.5.6

IRI as string types
Documentation
//! Characters.

use crate::spec::Spec;

/// A mask to test whether the character is continue character of `scheme`.
// `ALPHA / DIGIT / "+" / "-" / "."`
const MASK_SCHEME_CONTINUE: u8 = 1 << 0;

/// A mask to test whether the character matches `unreserved`.
// `unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"`
const MASK_UNRESERVED: u8 = 1 << 1;

///// A mask to test whether the character matches `gen-delims`.
//// `gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"`
//const MASK_GEN_DELIMS: u8 = 1 << 2;

/// A mask to test whether the character matches `sub-delims`.
// `sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="`
const MASK_SUB_DELIMS: u8 = 1 << 3;

/// A mask to test whether the character matches `pchar` (modulo percent-encoded bytes).
// `pchar = unreserved / pct-encoded / sub-delims / ":" / "@"`
const MASK_PCHAR: u8 = 1 << 4;

/// A mask to test whether the character can appear in `query` and `fragment`.
// `query = *( pchar / "/" / "?" )`
// `fragment = *( pchar / "/" / "?" )`
const MASK_FRAG_QUERY: u8 = 1 << 5;

/// A mask to test whether the character can appear in `userinfo` and address of `IPvFuture`.
// `userinfo = *( unreserved / pct-encoded / sub-delims / ":" )`
const MASK_USERINFO_IPVFUTUREADDR: u8 = 1 << 6;

/// A mask to test whether the character matches `pchar` (modulo percent-encoded bytes) or slash.
const MASK_PCHAR_SLASH: u8 = 1 << 7;

/// ASCII characters' properties.
const TABLE: [u8; 128] = [
    0b_0000_0000, // NUL
    0b_0000_0000, // SOH
    0b_0000_0000, // STX
    0b_0000_0000, // ETX
    0b_0000_0000, // EOT
    0b_0000_0000, // ENQ
    0b_0000_0000, // ACK
    0b_0000_0000, // BEL
    0b_0000_0000, // BS
    0b_0000_0000, // HT
    0b_0000_0000, // LF
    0b_0000_0000, // VT
    0b_0000_0000, // FF
    0b_0000_0000, // CR
    0b_0000_0000, // SO
    0b_0000_0000, // SI
    0b_0000_0000, // DLE
    0b_0000_0000, // DC1
    0b_0000_0000, // DC2
    0b_0000_0000, // DC3
    0b_0000_0000, // DC4
    0b_0000_0000, // NAK
    0b_0000_0000, // SYN
    0b_0000_0000, // ETB
    0b_0000_0000, // CAN
    0b_0000_0000, // EM
    0b_0000_0000, // SUB
    0b_0000_0000, // ESC
    0b_0000_0000, // FS
    0b_0000_0000, // GS
    0b_0000_0000, // RS
    0b_0000_0000, // US
    0b_0000_0000, // SPACE
    0b_1111_1000, // !
    0b_0000_0000, // "
    0b_0000_0100, // #
    0b_1111_1000, // $
    0b_0000_0000, // %
    0b_1111_1000, // &
    0b_1111_1000, // '
    0b_1111_1000, // (
    0b_1111_1000, // )
    0b_1111_1000, // *
    0b_1111_1001, // +
    0b_1111_1000, // ,
    0b_1111_0011, // -
    0b_1111_0011, // .
    0b_1010_0100, // /
    0b_1111_0011, // 0
    0b_1111_0011, // 1
    0b_1111_0011, // 2
    0b_1111_0011, // 3
    0b_1111_0011, // 4
    0b_1111_0011, // 5
    0b_1111_0011, // 6
    0b_1111_0011, // 7
    0b_1111_0011, // 8
    0b_1111_0011, // 9
    0b_1111_0100, // :
    0b_1111_1000, // ;
    0b_0000_0000, // <
    0b_1111_1000, // =
    0b_0000_0000, // >
    0b_0010_0100, // ?
    0b_1011_0100, // @
    0b_1111_0011, // A
    0b_1111_0011, // B
    0b_1111_0011, // C
    0b_1111_0011, // D
    0b_1111_0011, // E
    0b_1111_0011, // F
    0b_1111_0011, // G
    0b_1111_0011, // H
    0b_1111_0011, // I
    0b_1111_0011, // J
    0b_1111_0011, // K
    0b_1111_0011, // L
    0b_1111_0011, // M
    0b_1111_0011, // N
    0b_1111_0011, // O
    0b_1111_0011, // P
    0b_1111_0011, // Q
    0b_1111_0011, // R
    0b_1111_0011, // S
    0b_1111_0011, // T
    0b_1111_0011, // U
    0b_1111_0011, // V
    0b_1111_0011, // W
    0b_1111_0011, // X
    0b_1111_0011, // Y
    0b_1111_0011, // Z
    0b_0000_0100, // [
    0b_0000_0000, // \
    0b_0000_0100, // ]
    0b_0000_0000, // ^
    0b_1111_0010, // _
    0b_0000_0000, // `
    0b_1111_0011, // a
    0b_1111_0011, // b
    0b_1111_0011, // c
    0b_1111_0011, // d
    0b_1111_0011, // e
    0b_1111_0011, // f
    0b_1111_0011, // g
    0b_1111_0011, // h
    0b_1111_0011, // i
    0b_1111_0011, // j
    0b_1111_0011, // k
    0b_1111_0011, // l
    0b_1111_0011, // m
    0b_1111_0011, // n
    0b_1111_0011, // o
    0b_1111_0011, // p
    0b_1111_0011, // q
    0b_1111_0011, // r
    0b_1111_0011, // s
    0b_1111_0011, // t
    0b_1111_0011, // u
    0b_1111_0011, // v
    0b_1111_0011, // w
    0b_1111_0011, // x
    0b_1111_0011, // y
    0b_1111_0011, // z
    0b_0000_0000, // {
    0b_0000_0000, // |
    0b_0000_0000, // }
    0b_1111_0010, // ~
    0b_0000_0000, // DEL
];

/// Returns `true` if the given ASCII character is allowed as continue character of `scheme` part.
#[inline]
#[must_use]
pub(crate) const fn is_ascii_scheme_continue(c: u8) -> bool {
    (TABLE[c as usize] & MASK_SCHEME_CONTINUE) != 0
}

/// Returns `true` if the given ASCII character matches `unreserved`.
#[inline]
#[must_use]
pub(crate) const fn is_ascii_unreserved(c: u8) -> bool {
    (TABLE[c as usize] & MASK_UNRESERVED) != 0
}

/// Returns true if the character is unreserved.
#[inline]
#[must_use]
pub(crate) fn is_unreserved<S: Spec>(c: char) -> bool {
    if c.is_ascii() {
        is_ascii_unreserved(c as u8)
    } else {
        S::is_nonascii_char_unreserved(c)
    }
}

///// Returns `true` if the given ASCII character matches `gen-delims`.
//#[inline]
//#[must_use]
//pub(crate) const fn is_ascii_gen_delims(c: u8) -> bool {
//    (TABLE[c as usize] & MASK_GEN_DELIMS) != 0
//}

///// Returns `true` if the given ASCII character matches `sub-delims`.
//#[inline]
//#[must_use]
//pub(crate) const fn is_ascii_sub_delims(c: u8) -> bool {
//    (TABLE[c as usize] & MASK_SUB_DELIMS) != 0
//}

///// Returns `true` if the given ASCII character matches `reserved`.
//#[inline]
//#[must_use]
//pub(crate) const fn is_ascii_reserved(c: u8) -> bool {
//    (TABLE[c as usize] & (MASK_GEN_DELIMS | MASK_SUB_DELIMS)) != 0
//}

/// Returns `true` if the given ASCII character matches `pchar` modulo `pct-encoded`.
#[inline]
#[must_use]
pub(crate) const fn is_ascii_pchar(c: u8) -> bool {
    (TABLE[c as usize] & MASK_PCHAR) != 0
}

/// Returns `true` if the given ASCII character is allowed to appear in `query` and `fragment`.
#[inline]
#[must_use]
pub(crate) const fn is_ascii_frag_query(c: u8) -> bool {
    (TABLE[c as usize] & MASK_FRAG_QUERY) != 0
}

/// Returns `true` if the given non-ASCII character is allowed to appear in `iquery`.
#[inline]
#[must_use]
pub(crate) fn is_nonascii_query<S: Spec>(c: char) -> bool {
    S::is_nonascii_char_unreserved(c) || S::is_nonascii_char_private(c)
}

/// Returns `true` if the given non-ASCII character is allowed to appear in `ifragment`.
#[inline]
#[must_use]
pub(crate) fn is_nonascii_fragment<S: Spec>(c: char) -> bool {
    S::is_nonascii_char_unreserved(c)
}

/// Returns `true` if the given ASCII character is allowed to appear in `userinfo`
#[inline]
#[must_use]
pub(crate) const fn is_ascii_userinfo_ipvfutureaddr(c: u8) -> bool {
    (TABLE[c as usize] & MASK_USERINFO_IPVFUTUREADDR) != 0
}

/// Returns `true` if the given non-ASCII character is allowed to appear in `iuserinfo`.
#[inline]
#[must_use]
pub(crate) fn is_nonascii_userinfo<S: Spec>(c: char) -> bool {
    S::is_nonascii_char_unreserved(c)
}

/// Returns `true` if the given ASCII character is allowed to appear in `reg-name`
#[inline]
#[must_use]
pub(crate) const fn is_ascii_regname(c: u8) -> bool {
    (TABLE[c as usize] & (MASK_UNRESERVED | MASK_SUB_DELIMS)) != 0
}

/// Returns `true` if the given non-ASCII character is allowed to appear in `ireg-name`.
#[inline]
#[must_use]
pub(crate) fn is_nonascii_regname<S: Spec>(c: char) -> bool {
    S::is_nonascii_char_unreserved(c)
}

/// Returns `true` if the given ASCII character matches `pchar` modulo `pct-encoded` or a slash.
#[inline]
#[must_use]
pub(crate) const fn is_ascii_pchar_slash(c: u8) -> bool {
    (TABLE[c as usize] & MASK_PCHAR_SLASH) != 0
}

/// Checks if the given character matches `ucschar` rule.
#[must_use]
pub(crate) fn is_ucschar(c: char) -> bool {
    matches!(
        u32::from(c),
        0xA0..=0xD7FF |
        0xF900..=0xFDCF |
        0xFDF0..=0xFFEF |
        0x1_0000..=0x1_FFFD |
        0x2_0000..=0x2_FFFD |
        0x3_0000..=0x3_FFFD |
        0x4_0000..=0x4_FFFD |
        0x5_0000..=0x5_FFFD |
        0x6_0000..=0x6_FFFD |
        0x7_0000..=0x7_FFFD |
        0x8_0000..=0x8_FFFD |
        0x9_0000..=0x9_FFFD |
        0xA_0000..=0xA_FFFD |
        0xB_0000..=0xB_FFFD |
        0xC_0000..=0xC_FFFD |
        0xD_0000..=0xD_FFFD |
        0xE_1000..=0xE_FFFD
    )
}

/// Returns true if the given value is a continue byte of UTF-8.
#[inline(always)]
#[must_use]
pub(crate) fn is_utf8_byte_continue(byte: u8) -> bool {
    // `0x80..=0xbf` (i.e. `0b_1000_0000..=0b_1011_1111`) is not the first byte,
    // and `0xc0..=0xc1` (i.e. `0b_1100_0000..=0b_1100_0001` shouldn't appear
    // anywhere in UTF-8 byte sequence.
    // `0x80 as i8` is -128, and `0xc0 as i8` is -96.
    //
    // The first byte of the UTF-8 character is not `0b10xx_xxxx`, and
    // the continue bytes is `0b10xx_xxxx`.
    // `0b1011_1111 as i8` is -65, and `0b1000_0000 as i8` is -128.
    (byte as i8) < -64
}