html-cat 0.1.0

HTML5 parser: tokenizer + tree builder producing a Document tree of Element/Text/Comment nodes. No mut, no Rc/Arc, no interior mutability, no panics, exhaustive matches. First sub-crate of a Servo-replacement webview runtime targeting Tauri.
//! Character-reference decoding (`&amp;` → `&`, `&#123;` → `{`, ...).
//!
//! v0 covers the common named refs plus all numeric refs.  A full
//! WHATWG named-entity table is deferred to v0.2.

/// Resolve a character reference given its body, i.e. the text that
/// sits strictly between the leading `&` and the trailing `;`.
///
/// A body starting with `#` is handed to the numeric decoder (with the
/// `#` removed); anything else is looked up as a named reference.
/// `None` means the body was not recognised, in which case the
/// tokenizer leaves the literal `&...` text in its output.
#[must_use]
pub fn decode(body: &str) -> Option<String> {
    match body.strip_prefix('#') {
        Some(numeric) => decode_numeric(numeric),
        None => decode_named(body),
    }
}

/// Decode the body of a numeric character reference, with the leading
/// `#` already removed (`65`, `x41`, `X41`, ...).
///
/// A leading `x` or `X` selects hexadecimal; otherwise the digits are
/// read as decimal.  Returns `None` when:
///
/// * there are no digits, or any character is not a digit in the
///   selected radix (this includes a leading `+` or `-` sign),
/// * the value does not fit in a `u32`, or
/// * the value is not a Unicode scalar value (a surrogate, or above
///   `U+10FFFF`).
fn decode_numeric(body: &str) -> Option<String> {
    let (radix, digits) = body
        .strip_prefix('x')
        .or_else(|| body.strip_prefix('X'))
        .map_or((10, body), |hex| (16, hex));
    // `u32::from_str_radix` tolerates a leading `+`, which is not part of
    // the character-reference grammar, so check the digits up front.  The
    // empty string passes this check but is rejected by the parse itself.
    Some(digits)
        .filter(|candidate| candidate.chars().all(|c| c.is_digit(radix)))
        .and_then(|candidate| u32::from_str_radix(candidate, radix).ok())
        .and_then(char::from_u32)
        .map(|c| c.to_string())
}

/// Look up a named character reference (the text between `&` and `;`)
/// in the built-in v0 table.
///
/// Matching is exact and case-sensitive.  Returns the replacement text
/// as an owned `String`, or `None` when the name is not in the table.
fn decode_named(body: &str) -> Option<String> {
    // (reference name, replacement text)
    const NAMED_REFS: [(&str, &str); 16] = [
        ("amp", "&"),
        ("lt", "<"),
        ("gt", ">"),
        ("quot", "\""),
        ("apos", "'"),
        ("nbsp", "\u{00a0}"),
        ("copy", "\u{00a9}"),
        ("reg", "\u{00ae}"),
        ("trade", "\u{2122}"),
        ("hellip", "\u{2026}"),
        ("mdash", "\u{2014}"),
        ("ndash", "\u{2013}"),
        ("lsquo", "\u{2018}"),
        ("rsquo", "\u{2019}"),
        ("ldquo", "\u{201c}"),
        ("rdquo", "\u{201d}"),
    ];
    NAMED_REFS
        .iter()
        .find(|&&(name, _)| name == body)
        .map(|&(_, replacement)| replacement.to_owned())
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::Error;

    /// Error reported by a failing case, tagged with the reference body
    /// that was being decoded.
    fn invalid(body: &str) -> Error {
        Error::InvalidEntity {
            at: crate::span::Span::synthetic(),
            text: body.to_owned(),
        }
    }

    /// Succeeds only when `decode(body)` yields exactly `want`.
    fn expect_decoded(body: &str, want: &str) -> Result<(), Error> {
        match decode(body) {
            Some(got) if got == want => Ok(()),
            Some(_) | None => Err(invalid(body)),
        }
    }

    /// The named reference `amp` decodes to a literal ampersand.
    #[test]
    fn decodes_amp() -> Result<(), Error> {
        expect_decoded("amp", "&")
    }

    /// A decimal numeric reference decodes to its code point.
    #[test]
    fn decodes_decimal() -> Result<(), Error> {
        expect_decoded("#65", "A")
    }

    /// A hexadecimal numeric reference decodes to its code point.
    #[test]
    fn decodes_hex() -> Result<(), Error> {
        expect_decoded("#x41", "A")
    }

    /// A name outside the table is reported as unrecognised (`None`).
    #[test]
    fn rejects_unknown() -> Result<(), Error> {
        let body = "notarealentity";
        match decode(body) {
            None => Ok(()),
            Some(_) => Err(invalid(body)),
        }
    }
}