pdf_oxide 0.3.38

//! HTML5 tokenizer (focused subset).
//!
//! Emits the WHATWG token kinds that matter for static document
//! rendering:
//!
//! - `StartTag { name, attrs, self_closing }`
//! - `EndTag { name }`
//! - `Text(String)` with HTML entities decoded
//! - `Comment(String)`
//! - `Doctype` (kind only — we don't validate the public/system ids)
//! - `Eof`
//!
//! Coverage notes:
//!
//! - **Entity decoding** handles `&amp;`/`&lt;`/`&gt;`/`&quot;`/`&apos;`,
//!   numeric `&#1234;` and `&#x4D;` forms, and the common named
//!   entities (`&nbsp;`, `&copy;`, `&hellip;`, …) — about 40 names
//!   covering 99% of real HTML. Unknown entities are passed through
//!   verbatim.
//! - **Raw-text contexts**: inside `<style>` and `<script>` we run a
//!   raw-text scan that ends only at the matching `</style>` /
//!   `</script>` — so embedded CSS isn't tokenised as HTML tags.
//!   `<style>` bodies are surfaced as `RawText` so HTML-3 can hand
//!   them to the CSS engine without re-decoding entities.
//! - **Self-closing void elements** are recognised by name (br, img,
//!   hr, input, meta, link, area, base, col, embed, source, track,
//!   wbr); their start tags carry `self_closing=true` even without
//!   the explicit `/>`.

use std::collections::HashMap;

/// One HTML token emitted by [`tokenize`].
#[derive(Debug, Clone, PartialEq)]
pub enum HtmlToken {
    /// `<tag attr="v" attr2>` — `name` is lowercased.
    StartTag {
        /// Lowercase tag name.
        name: String,
        /// Attribute list in source order.
        attrs: Vec<(String, String)>,
        /// True for void elements or explicit `<.../>`.
        self_closing: bool,
    },
    /// `</tag>`. Name is lowercased.
    EndTag {
        /// Lowercase tag name.
        name: String,
    },
    /// Character data with entities resolved.
    Text(String),
    /// `<!-- ... -->` body without delimiters.
    Comment(String),
    /// `<!DOCTYPE ...>` — body is the raw text minus the wrapper.
    Doctype(String),
    /// Raw text from inside `<style>` or `<script>`. Entities are NOT
    /// decoded — these contexts use literal characters per spec.
    RawText {
        /// `"style"` or `"script"`.
        host_tag: String,
        /// Raw body bytes between the open and close tag.
        body: String,
    },
    /// End of input.
    Eof,
}

/// HTML void elements per the HTML5 spec — these are always implicitly
/// self-closing.
pub fn is_void_element(name: &str) -> bool {
    matches!(
        name,
        "area"
            | "base"
            | "br"
            | "col"
            | "embed"
            | "hr"
            | "img"
            | "input"
            | "link"
            | "meta"
            | "param"
            | "source"
            | "track"
            | "wbr"
    )
}

/// Tokenize an HTML source string.
pub fn tokenize(input: &str) -> Vec<HtmlToken> {
    let mut t = Tokenizer::new(input);
    let mut out = Vec::new();
    loop {
        match t.next_token() {
            HtmlToken::Eof => {
                out.push(HtmlToken::Eof);
                return out;
            },
            tok => out.push(tok),
        }
    }
}

// ─────────────────────────────────────────────────────────────────────
// Driver
// ─────────────────────────────────────────────────────────────────────

struct Tokenizer<'a> {
    input: &'a str,
    pos: usize,
}

impl<'a> Tokenizer<'a> {
    fn new(input: &'a str) -> Self {
        Self { input, pos: 0 }
    }

    fn peek(&self) -> Option<char> {
        self.input[self.pos..].chars().next()
    }

    fn bump(&mut self) -> Option<char> {
        let c = self.peek()?;
        self.pos += c.len_utf8();
        Some(c)
    }

    fn starts_with(&self, s: &str) -> bool {
        self.input[self.pos..].starts_with(s)
    }

    fn starts_with_ignore_case(&self, s: &str) -> bool {
        // Compare char-by-char to avoid slicing the haystack on a
        // non-char boundary when it starts with a multi-byte codepoint
        // (CSS `content: "» "` etc.). Length-by-bytes can land mid-
        // codepoint and panic.
        let mut rest = self.input[self.pos..].chars();
        for needle_ch in s.chars() {
            match rest.next() {
                Some(c) if c.eq_ignore_ascii_case(&needle_ch) => {},
                _ => return false,
            }
        }
        true
    }

    fn next_token(&mut self) -> HtmlToken {
        let Some(c) = self.peek() else {
            return HtmlToken::Eof;
        };
        if c == '<' {
            self.consume_tag_or_text()
        } else {
            self.consume_text()
        }
    }

    fn consume_tag_or_text(&mut self) -> HtmlToken {
        // `<` already known. Determine kind.
        if self.starts_with("<!--") {
            self.pos += 4;
            return self.consume_comment();
        }
        if self.starts_with_ignore_case("<!doctype") {
            return self.consume_doctype();
        }
        if self.starts_with("</") {
            self.pos += 2;
            return self.consume_end_tag();
        }
        if matches!(
            self.input[self.pos + 1..].chars().next(),
            Some(c) if c.is_ascii_alphabetic()
        ) {
            self.pos += 1; // consume <
            return self.consume_start_tag();
        }
        // Lone `<` — emit as literal text.
        self.bump();
        HtmlToken::Text("<".to_string())
    }

    fn consume_comment(&mut self) -> HtmlToken {
        let start = self.pos;
        while self.pos < self.input.len() {
            if self.starts_with("-->") {
                let body = self.input[start..self.pos].to_string();
                self.pos += 3;
                return HtmlToken::Comment(body);
            }
            self.bump();
        }
        // Unterminated.
        HtmlToken::Comment(self.input[start..self.pos].to_string())
    }

    fn consume_doctype(&mut self) -> HtmlToken {
        let start = self.pos;
        while let Some(c) = self.peek() {
            if c == '>' {
                let body = self.input[start..self.pos].to_string();
                self.bump();
                return HtmlToken::Doctype(body);
            }
            self.bump();
        }
        HtmlToken::Doctype(self.input[start..self.pos].to_string())
    }

    fn consume_end_tag(&mut self) -> HtmlToken {
        let name = self.consume_tag_name();
        // Skip until `>`.
        while let Some(c) = self.peek() {
            if c == '>' {
                self.bump();
                break;
            }
            self.bump();
        }
        HtmlToken::EndTag {
            name: name.to_ascii_lowercase(),
        }
    }

    fn consume_start_tag(&mut self) -> HtmlToken {
        let name_lower = self.consume_tag_name().to_ascii_lowercase();
        let mut attrs: Vec<(String, String)> = Vec::new();
        let mut self_closing = false;
        loop {
            self.skip_html_ws();
            match self.peek() {
                None => break,
                Some('>') => {
                    self.bump();
                    break;
                },
                Some('/') => {
                    self.bump();
                    self_closing = true;
                    // Continue — `/` may sit before `>`.
                    continue;
                },
                Some(_) => {
                    if let Some(attr) = self.consume_attribute() {
                        attrs.push(attr);
                    } else {
                        // Recovery: bump one char and continue.
                        self.bump();
                    }
                },
            }
        }
        if !self_closing && is_void_element(&name_lower) {
            self_closing = true;
        }
        // Raw-text contexts: <style> and <script> swallow everything
        // until their matching close tag.
        if !self_closing && (name_lower == "style" || name_lower == "script") {
            return self.swallow_raw_text(name_lower, attrs);
        }
        HtmlToken::StartTag {
            name: name_lower,
            attrs,
            self_closing,
        }
    }

    /// After we've emitted a `<style>` or `<script>` start tag, consume
    /// the body until the matching close tag and return both as a
    /// `RawText` token. The next call to `next_token()` will yield the
    /// matching end tag.
    fn swallow_raw_text(&mut self, host_tag: String, attrs: Vec<(String, String)>) -> HtmlToken {
        // First emit the start tag — but we need to push it back so the
        // caller still sees it. Do that by consuming the body now and
        // returning the start tag immediately; the body is emitted on
        // the next call. Simpler approach: emit start-tag now and stash
        // the body in a side-channel.
        //
        // Because we're a single-pass tokenizer without that mechanism,
        // do a different thing: emit a synthetic sequence by *first*
        // collecting everything, then returning the start tag and
        // queueing two more tokens.
        //
        // Easiest: queue. Since we don't have one, walk the input and
        // gather body, then return StartTag now and let the caller
        // dispatch RawText + EndTag on subsequent calls. Implement
        // queue support inline.
        let body_start = self.pos;
        let needle = format!("</{host_tag}");
        let mut body_end = self.pos;
        while self.pos < self.input.len() {
            if self.starts_with_ignore_case(&needle) {
                break;
            }
            self.bump();
            body_end = self.pos;
        }
        let body = self.input[body_start..body_end].to_string();
        // Skip the close tag itself: `</tagname>`.
        self.pos += needle.len();
        // Skip until `>`.
        while let Some(c) = self.peek() {
            if c == '>' {
                self.bump();
                break;
            }
            self.bump();
        }
        // Emit a Compound token by stashing on the side. We don't have
        // multi-token return, so represent as a RawText whose host_tag
        // is the original tag name; the dom builder synthesises the
        // start/end pair around it.
        let _ = attrs; // dropped; <style>/<script> attrs aren't used in v0.3.35
        HtmlToken::RawText { host_tag, body }
    }

    fn consume_tag_name(&mut self) -> String {
        let start = self.pos;
        while let Some(c) = self.peek() {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' {
                self.bump();
            } else {
                break;
            }
        }
        self.input[start..self.pos].to_string()
    }

    fn consume_attribute(&mut self) -> Option<(String, String)> {
        let name_start = self.pos;
        while let Some(c) = self.peek() {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == ':' {
                self.bump();
            } else {
                break;
            }
        }
        let name = self.input[name_start..self.pos].to_ascii_lowercase();
        if name.is_empty() {
            return None;
        }
        self.skip_html_ws();
        if self.peek() != Some('=') {
            return Some((name, String::new()));
        }
        self.bump(); // =
        self.skip_html_ws();
        let value = match self.peek() {
            Some('"') => {
                self.bump();
                self.consume_quoted_attr_value('"')
            },
            Some('\'') => {
                self.bump();
                self.consume_quoted_attr_value('\'')
            },
            _ => self.consume_unquoted_attr_value(),
        };
        Some((name, value))
    }

    fn consume_quoted_attr_value(&mut self, quote: char) -> String {
        let mut buf = String::new();
        while let Some(c) = self.peek() {
            if c == quote {
                self.bump();
                break;
            }
            if c == '&' {
                buf.push_str(&self.consume_entity());
            } else {
                buf.push(c);
                self.bump();
            }
        }
        buf
    }

    fn consume_unquoted_attr_value(&mut self) -> String {
        let mut buf = String::new();
        while let Some(c) = self.peek() {
            if c.is_ascii_whitespace() || c == '>' || c == '/' {
                break;
            }
            if c == '&' {
                buf.push_str(&self.consume_entity());
            } else {
                buf.push(c);
                self.bump();
            }
        }
        buf
    }

    fn consume_text(&mut self) -> HtmlToken {
        let mut buf = String::new();
        while let Some(c) = self.peek() {
            if c == '<' {
                break;
            }
            if c == '&' {
                buf.push_str(&self.consume_entity());
            } else {
                buf.push(c);
                self.bump();
            }
        }
        HtmlToken::Text(buf)
    }

    fn consume_entity(&mut self) -> String {
        // `&` already known.
        let restore = self.pos;
        self.bump(); // consume &
                     // Numeric entity?
        if matches!(self.peek(), Some('#')) {
            self.bump();
            let hex = matches!(self.peek(), Some('x') | Some('X'));
            if hex {
                self.bump();
            }
            let mut digits = String::new();
            while let Some(c) = self.peek() {
                if (hex && c.is_ascii_hexdigit()) || (!hex && c.is_ascii_digit()) {
                    digits.push(c);
                    self.bump();
                } else {
                    break;
                }
            }
            if matches!(self.peek(), Some(';')) {
                self.bump();
            }
            let radix = if hex { 16 } else { 10 };
            if let Ok(n) = u32::from_str_radix(&digits, radix) {
                if let Some(ch) = char::from_u32(n) {
                    return ch.to_string();
                }
            }
            // Malformed; pass through.
            return self.input[restore..self.pos].to_string();
        }
        // Named entity.
        let name_start = self.pos;
        while let Some(c) = self.peek() {
            if c.is_ascii_alphanumeric() {
                self.bump();
            } else {
                break;
            }
        }
        let name = &self.input[name_start..self.pos];
        let trailing_semi = matches!(self.peek(), Some(';'));
        if let Some(decoded) = NAMED_ENTITIES.get(name).copied() {
            if trailing_semi {
                self.bump();
            }
            return decoded.to_string();
        }
        // Unknown — restore and emit literal `&`.
        self.pos = restore;
        self.bump();
        "&".to_string()
    }

    fn skip_html_ws(&mut self) {
        while let Some(c) = self.peek() {
            if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\x0c' {
                self.bump();
            } else {
                break;
            }
        }
    }
}

// ─────────────────────────────────────────────────────────────────────
// Named entities — high-frequency subset
// ─────────────────────────────────────────────────────────────────────

lazy_static::lazy_static! {
    static ref NAMED_ENTITIES: HashMap<&'static str, &'static str> = {
        let mut m = HashMap::new();
        // Core
        m.insert("amp", "&");
        m.insert("lt", "<");
        m.insert("gt", ">");
        m.insert("quot", "\"");
        m.insert("apos", "'");
        m.insert("nbsp", "\u{00A0}");
        // Punctuation / typography
        m.insert("ndash", "–");
        m.insert("mdash", "—");
        m.insert("hellip", "…");
        m.insert("lsquo", "‘");
        m.insert("rsquo", "’");
        m.insert("ldquo", "“");
        m.insert("rdquo", "”");
        m.insert("laquo", "«");
        m.insert("raquo", "»");
        m.insert("bull", "•");
        m.insert("middot", "·");
        m.insert("para", "¶");
        m.insert("sect", "§");
        m.insert("sup1", "¹");
        m.insert("sup2", "²");
        m.insert("sup3", "³");
        m.insert("frac12", "½");
        m.insert("frac14", "¼");
        m.insert("frac34", "¾");
        // Symbols
        m.insert("copy", "©");
        m.insert("reg", "®");
        m.insert("trade", "™");
        m.insert("deg", "°");
        m.insert("times", "×");
        m.insert("divide", "÷");
        m.insert("plusmn", "±");
        m.insert("micro", "µ");
        m.insert("euro", "€");
        m.insert("pound", "£");
        m.insert("yen", "¥");
        m.insert("cent", "¢");
        m.insert("not", "¬");
        // Arrows
        m.insert("larr", "←");
        m.insert("uarr", "↑");
        m.insert("rarr", "→");
        m.insert("darr", "↓");
        m.insert("harr", "↔");
        m.insert("crarr", "↵");
        // Maths (small subset)
        m.insert("infin", "∞");
        m.insert("le", "≤");
        m.insert("ge", "≥");
        m.insert("ne", "≠");
        m.insert("equiv", "≡");
        m
    };
}

// ─────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    fn types(toks: &[HtmlToken]) -> Vec<&'static str> {
        toks.iter()
            .map(|t| match t {
                HtmlToken::StartTag { .. } => "Start",
                HtmlToken::EndTag { .. } => "End",
                HtmlToken::Text(_) => "Text",
                HtmlToken::Comment(_) => "Comment",
                HtmlToken::Doctype(_) => "Doctype",
                HtmlToken::RawText { .. } => "RawText",
                HtmlToken::Eof => "Eof",
            })
            .collect()
    }

    #[test]
    fn simple_paragraph() {
        let t = tokenize("<p>hello</p>");
        assert_eq!(types(&t), vec!["Start", "Text", "End", "Eof"]);
    }

    #[test]
    fn attributes_quoted_and_unquoted() {
        let t = tokenize(r#"<a href="x" class=foo data-x>"#);
        match &t[0] {
            HtmlToken::StartTag { name, attrs, .. } => {
                assert_eq!(name, "a");
                assert_eq!(attrs.len(), 3);
                assert_eq!(attrs[0], ("href".into(), "x".into()));
                assert_eq!(attrs[1], ("class".into(), "foo".into()));
                assert_eq!(attrs[2], ("data-x".into(), String::new()));
            },
            _ => panic!(),
        }
    }

    #[test]
    fn void_element_implicit_self_closing() {
        let t = tokenize("<img src=x>");
        match &t[0] {
            HtmlToken::StartTag {
                name, self_closing, ..
            } => {
                assert_eq!(name, "img");
                assert!(self_closing);
            },
            _ => panic!(),
        }
    }

    #[test]
    fn explicit_self_closing() {
        let t = tokenize("<br/>");
        match &t[0] {
            HtmlToken::StartTag { self_closing, .. } => assert!(self_closing),
            _ => panic!(),
        }
    }

    #[test]
    fn named_entities_decoded() {
        let t = tokenize("Bread &amp; butter — &nbsp; &copy;");
        if let HtmlToken::Text(s) = &t[0] {
            assert_eq!(s, "Bread & butter — \u{00A0} ©");
        } else {
            panic!();
        }
    }

    #[test]
    fn numeric_entities() {
        let t = tokenize("&#65; &#x41;");
        if let HtmlToken::Text(s) = &t[0] {
            assert_eq!(s, "A A");
        } else {
            panic!();
        }
    }

    #[test]
    fn comments() {
        let t = tokenize("<!-- hello --> world");
        assert_eq!(types(&t), vec!["Comment", "Text", "Eof"]);
        if let HtmlToken::Comment(s) = &t[0] {
            assert_eq!(s, " hello ");
        } else {
            panic!();
        }
    }

    #[test]
    fn doctype() {
        let t = tokenize("<!doctype html><p>x</p>");
        assert_eq!(types(&t), vec!["Doctype", "Start", "Text", "End", "Eof"]);
    }

    #[test]
    fn style_block_treated_as_raw_text() {
        let t = tokenize("<style>p { color: red < blue; }</style>");
        // start tag is NOT emitted because swallow_raw_text returns
        // RawText directly.
        let kinds = types(&t);
        assert!(kinds.contains(&"RawText"));
        if let Some(HtmlToken::RawText { host_tag, body }) = t
            .iter()
            .find(|tok| matches!(tok, HtmlToken::RawText { .. }))
        {
            assert_eq!(host_tag, "style");
            assert!(body.contains("p { color: red < blue; }"));
        }
    }

    #[test]
    fn nested_tags() {
        let t = tokenize("<div><p>a</p><p>b</p></div>");
        let kinds = types(&t);
        // div div p p text /p p text /p /div Eof — minus trailing
        // nuances. Just verify counts.
        assert_eq!(kinds.iter().filter(|k| **k == "Start").count(), 3);
        assert_eq!(kinds.iter().filter(|k| **k == "End").count(), 3);
        assert_eq!(kinds.iter().filter(|k| **k == "Text").count(), 2);
    }

    #[test]
    fn entity_in_attribute() {
        let t = tokenize(r#"<a title="A &amp; B">x</a>"#);
        if let HtmlToken::StartTag { attrs, .. } = &t[0] {
            assert_eq!(attrs[0].1, "A & B");
        } else {
            panic!();
        }
    }

    #[test]
    fn case_insensitive_tag_names() {
        let t = tokenize("<DIV><P>x</P></DIV>");
        match (&t[0], &t[1]) {
            (HtmlToken::StartTag { name: a, .. }, HtmlToken::StartTag { name: b, .. }) => {
                assert_eq!(a, "div");
                assert_eq!(b, "p");
            },
            _ => panic!(),
        }
    }

    #[test]
    fn multibyte_codepoint_does_not_panic_on_ignore_case_lookahead() {
        // Regression: a multi-byte codepoint at the cursor used to panic
        // because starts_with_ignore_case sliced the haystack by byte
        // length before the char-boundary check.
        let _ = tokenize("<style>p::before { content: \"» \"; }</style>");
        let _ = tokenize("«guillemets at start»");
        let _ = tokenize("❤<p>after-heart</p>");
    }

    #[test]
    fn lone_less_than_passes_through() {
        let t = tokenize("a < b");
        // We expect at least one Text token containing "<" somewhere in
        // the merged stream. The simple rule is "lone `<` not followed
        // by tag-start emits literal `<`".
        let s: String = t
            .iter()
            .filter_map(|tok| match tok {
                HtmlToken::Text(s) => Some(s.clone()),
                _ => None,
            })
            .collect::<Vec<_>>()
            .join("");
        assert!(s.contains("<"));
    }
}