// djr 0.0.1 - Docs.rs

//! This module is here to abstract away some Unicode character checking.
//!
//! CommonMark has clear definitions for the different types of characters. I'm assuming the same
//! definitions for Djot.
//!
//! Many of these definitions are in terms of Unicode categories. I'm handling all characters as
//! single bytes, which is sufficient, since all the characters that _do_ something in a Djot
//! document are ASCII. However, some of the u8 utility functions included in std are not entirely
//! compliant with the definitions in CommonMark. This module exists to override these methods,
//! allowing me to make them spec compliant (and change them down the road, if Djot ends up with
//! different definitions.)

/// Returns `true` when `c` is a line feed (`\n`, 0x0A) or a carriage return (`\r`, 0x0D). Both
/// bytes are defined as a "line ending".
#[inline]
pub(crate) fn is_line_ending(c: u8) -> bool {
    c == b'\n' || c == b'\r'
}

/// Byte-level whitespace check: everything the std `is_ascii_whitespace` accepts (space, tab,
/// line feed, form feed, carriage return) plus the no-break space value 0xA0 (U+00A0), which
/// belongs to the Unicode Zs category. The extra value is needed to follow the CommonMark
/// definition of a Unicode whitespace character, which is assumed to hold for Djot as well:
/// <https://spec.commonmark.org/0.30/#unicode-whitespace-character>
///
/// NOTE(review): in UTF-8, U+00A0 is encoded as the two bytes 0xC2 0xA0, and 0xA0 also appears
/// as a continuation byte of other characters (e.g. the second byte of `à`). Confirm that
/// callers are fine with a lone 0xA0 byte being reported as whitespace.
#[inline]
pub(crate) fn _is_whitespace(c: u8) -> bool {
    c.is_ascii_whitespace() || c == 0xA0
}

/// Returns `true` for the two bytes that may make up a "blank" line: a space (0x20) or a
/// horizontal tab (0x09).
#[inline]
pub(crate) fn is_blank(c: u8) -> bool {
    c == b' ' || c == b'\t'
}

/// Returns `true` when `c` is an ASCII decimal digit (`0`-`9`).
///
/// Kept as a standalone function so the definition can be changed later, and so it offers the
/// same interface as the other byte checks in this module.
#[inline]
pub(crate) fn is_digit(c: u8) -> bool {
    matches!(c, b'0'..=b'9')
}

/// Returns `true` when `c` is an ASCII letter (`A`-`Z` or `a`-`z`).
///
/// Kept as a standalone function so the definition can be changed later, and so it offers the
/// same interface as the other byte checks in this module.
#[inline]
pub(crate) fn is_alphabetic(c: u8) -> bool {
    matches!(c, b'A'..=b'Z' | b'a'..=b'z')
}

/// Returns `true` when `c` is an ASCII letter (`A`-`Z`, `a`-`z`) or an ASCII decimal digit
/// (`0`-`9`).
///
/// Kept as a standalone function so the definition can be changed later, and so it offers the
/// same interface as the other byte checks in this module.
#[inline]
pub(crate) fn is_alphanumeric(c: u8) -> bool {
    c.is_ascii_digit() || c.is_ascii_alphabetic()
}

/// Returns `true` when `c` is an ASCII punctuation character, i.e. a printable, non-space ASCII
/// byte (0x21-0x7E) that is neither a letter nor a digit.
///
/// Kept as a standalone function so the definition can be changed later, and so it offers the
/// same interface as the other byte checks in this module.
#[inline]
pub(crate) fn _is_punctuation(c: u8) -> bool {
    // The ASCII graphic range minus the alphanumerics is exactly the ASCII punctuation set.
    c.is_ascii_graphic() && !c.is_ascii_alphanumeric()
}

/// Returns `true` when `c` is an ASCII control character: any byte below 0x20, or DEL (0x7F).
///
/// Kept as a standalone function so the definition can be changed later, and so it offers the
/// same interface as the other byte checks in this module.
#[inline]
pub(crate) fn _is_control(c: u8) -> bool {
    c < 0x20 || c == 0x7F
}

/// Check if a character would be valid in a URI.
///
/// Uses the syntax definition found
/// [here](https://en.wikipedia.org/wiki/Uniform_Resource_Identifier#Syntax) to define the valid
/// characters: the unreserved set (ASCII letters, digits, `-`, `.`, `_`, `~`), the reserved
/// delimiters (`:/?#[]@` and `!$&'()*+,;=`), and `%` as the percent-encoding introducer.
///
/// Returns `false` for every other byte, including the double quote (`"`), whitespace, control
/// characters and all non-ASCII bytes.
#[inline]
pub(crate) fn is_uri_friendly(c: u8) -> bool {
    // The classes are spelled out individually rather than as one numeric range so that no
    // character outside the URI grammar (such as `"`, 0x22) is accepted by accident.
    matches!(
        c,
        // Unreserved characters.
        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'.' | b'_' | b'~'
        // General delimiters.
        | b':' | b'/' | b'?' | b'#' | b'[' | b']' | b'@'
        // Sub-delimiters.
        | b'!' | b'$' | b'&' | b'\'' | b'(' | b')' | b'*' | b'+' | b',' | b';' | b'='
        // Percent-encoding introducer.
        | b'%'
    )
}