// html-cat 0.1.0 - Docs.rs

//! HTML5 tokenizer.
//!
//! Single-pass scan that emits a `Vec<Token>`.  The implementation is
//! recursive: each call to a per-token helper consumes one logical
//! token (text run, tag, comment, or doctype) and returns the new
//! position.  Recursion depth tracks token count, not character count.

use crate::attr::{Attribute, Attributes};
use crate::entity;
use crate::node::{is_escapable_raw_text_element, is_raw_text_element};
use crate::span::{Position, Span};
use crate::token::Token;

/// Scans `source` from its first byte and returns every token found, in
/// document order, followed by a closing [`Token::Eof`].
///
/// Scanning starts at line 1, column 1, byte offset 0.
#[must_use]
pub fn tokenize(source: &str) -> Vec<Token> {
    let origin = Position::new(1, 1, 0);
    let tokens = Vec::new();
    drain_tokens(source.as_bytes(), source, 0, origin, tokens)
}

/// Tokenizes everything from byte offset `pos` to the end of `bytes`,
/// appending the tokens to `acc` and finishing with [`Token::Eof`].
///
/// * `bytes` / `source` - the same input viewed as bytes and as `&str`.
/// * `pos` - byte offset at which scanning starts.
/// * `position` - line/column/offset position corresponding to `pos`.
/// * `acc` - tokens produced so far; returned with the new tokens added.
///
/// The scan is a plain loop, so stack usage does not depend on how many
/// tokens the input contains, and the accumulator is moved from step to
/// step instead of being cloned.
fn drain_tokens(
    bytes: &[u8],
    source: &str,
    pos: usize,
    position: Position,
    acc: Vec<Token>,
) -> Vec<Token> {
    let mut pos = pos;
    let mut position = position;
    let mut acc = acc;

    while pos < bytes.len() {
        if at_byte(bytes, pos, b'<') {
            let (token, next_pos, next_position) = scan_tag_like(bytes, source, pos, position);
            // Decide before the token is moved into the accumulator whether
            // its content must be read as raw text.
            let raw_text_name = raw_text_target(&token);
            acc = append(acc, token);
            pos = next_pos;
            position = next_position;

            if let Some((name, escapable)) = raw_text_name {
                let (text_token, after_text_pos, after_text_position) =
                    scan_raw_text(bytes, source, pos, position, &name, escapable);
                if let Some(text) = text_token {
                    acc = append(acc, text);
                }
                pos = after_text_pos;
                position = after_text_position;
            }
        } else {
            let (text_token, next_pos, next_position) = scan_text(bytes, source, pos, position);
            if let Some(text) = text_token {
                acc = append(acc, text);
            }
            pos = next_pos;
            position = next_position;
        }
    }

    // The end-of-file token has an empty span at the final position.
    append(
        acc,
        Token::Eof {
            span: Span::new(position, position),
        },
    )
}

/// Reports whether `token` opens an element whose content is raw text.
///
/// Returns `Some((name, escapable))` for a start tag that is not
/// self-closing and whose name satisfies `is_raw_text_element` or
/// `is_escapable_raw_text_element`; `escapable` is the result of the latter
/// check.  Every other token yields `None`.
fn raw_text_target(token: &Token) -> Option<(String, bool)> {
    let Token::StartTag {
        name, self_closing, ..
    } = token
    else {
        return None;
    };
    if *self_closing {
        return None;
    }
    let escapable = is_escapable_raw_text_element(name);
    if escapable || is_raw_text_element(name) {
        Some((name.clone(), escapable))
    } else {
        None
    }
}

/// Returns `acc` with `token` added at the end.
///
/// The vector is taken by value and extended in place with `push`, so no
/// existing element is copied or moved into a new allocation per call.
fn append(acc: Vec<Token>, token: Token) -> Vec<Token> {
    let mut tokens = acc;
    tokens.push(token);
    tokens
}

/// Consumes a run of character data starting at byte `start` and ending just
/// before the next `<` (or at the end of input).
///
/// Returns the text token, the byte offset where the run stopped, and the
/// position at that offset.  The token is `None` when the entity-decoded
/// content is empty.
fn scan_text(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Option<Token>, usize, Position) {
    let stop = scan_until(bytes, start, |byte| byte == b'<');
    let slice = source.get(start..stop).unwrap_or("");
    let stop_position = advance_position(position, slice);
    let content = decode_entities(slice);
    let token = (!content.is_empty()).then(|| Token::Text {
        content,
        span: Span::new(position, stop_position),
    });
    (token, stop, stop_position)
}

/// Consumes the content of a raw-text element named `name`, starting at byte
/// `start` and ending where `find_raw_text_end` reports the closing tag (or
/// the end of input).
///
/// When `escapable` is true the content is entity-decoded; otherwise it is
/// copied verbatim.  Returns the text token (`None` for empty content), the
/// byte offset where the content stopped, and the position at that offset.
fn scan_raw_text(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
    name: &str,
    escapable: bool,
) -> (Option<Token>, usize, Position) {
    let stop = find_raw_text_end(bytes, source, start, name);
    let slice = source.get(start..stop).unwrap_or("");
    let stop_position = advance_position(position, slice);
    let content = if escapable {
        decode_entities(slice)
    } else {
        String::from(slice)
    };
    if content.is_empty() {
        return (None, stop, stop_position);
    }
    let token = Token::Text {
        content,
        span: Span::new(position, stop_position),
    };
    (Some(token), stop, stop_position)
}

/// Locates the byte offset at which the raw text of element `name` ends.
///
/// `name` is lowercased (ASCII only) and the search itself is delegated to
/// `step_to_close_tag`.
fn find_raw_text_end(bytes: &[u8], source: &str, start: usize, name: &str) -> usize {
    step_to_close_tag(bytes, source, start, name.to_ascii_lowercase().as_str())
}

/// Finds the first offset at or after `start` where a closing tag for
/// `lower_name` begins.
///
/// A match is `</`, followed by `lower_name` compared ASCII
/// case-insensitively, followed by a byte accepted by
/// `is_tag_name_terminator`.  Returns `bytes.len()` when there is no match.
///
/// The search is a loop over candidate offsets rather than one recursive
/// call per byte, so long raw-text bodies cannot exhaust the stack.
fn step_to_close_tag(bytes: &[u8], source: &str, start: usize, lower_name: &str) -> usize {
    let name_len = lower_name.len();
    (start..bytes.len())
        .find(|&candidate| {
            at_byte(bytes, candidate, b'<')
                && at_byte(bytes, candidate + 1, b'/')
                && matches_tag_name_ci(source, candidate + 2, lower_name)
                && is_tag_name_terminator(bytes, candidate + 2 + name_len)
        })
        .unwrap_or(bytes.len())
}

/// Tests whether `source` contains `lower_name` at byte offset `start`,
/// ignoring ASCII case.
///
/// Returns `false` when the range runs past the end of `source` or does not
/// fall on character boundaries.
fn matches_tag_name_ci(source: &str, start: usize, lower_name: &str) -> bool {
    let end = start + lower_name.len();
    match source.get(start..end) {
        Some(candidate) => candidate.eq_ignore_ascii_case(lower_name),
        None => false,
    }
}

/// Tests whether the byte at `pos` can follow a tag name: space, tab, line
/// feed, carriage return, `/` or `>`.
///
/// Returns `false` when `pos` is out of bounds.
fn is_tag_name_terminator(bytes: &[u8], pos: usize) -> bool {
    const TERMINATORS: &[u8] = b" \t\n\r/>";
    bytes
        .get(pos)
        .is_some_and(|byte| TERMINATORS.contains(byte))
}

/// Dispatches on the byte that follows the `<` at `start`.
///
/// `!` goes to `scan_markup_declaration`, `/` to `scan_end_tag`, and an
/// ASCII letter to `scan_start_tag`.  Anything else (including end of input)
/// turns the lone `<` into a one-character text token.
///
/// Returns the token, the byte offset after it, and the position there.
fn scan_tag_like(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let lookahead = bytes.get(start + 1).copied();
    match lookahead {
        Some(b'!') => scan_markup_declaration(bytes, source, start, position),
        Some(b'/') => scan_end_tag(bytes, source, start, position),
        _ if is_ascii_letter(lookahead) => scan_start_tag(bytes, source, start, position),
        _ => {
            // Not the start of any markup: keep the `<` as literal text.
            let after = advance_position(position, "<");
            let literal = Token::Text {
                content: String::from("<"),
                span: Span::new(position, after),
            };
            (literal, start + 1, after)
        }
    }
}

/// Handles input beginning with `<!` at `start`.
///
/// `<!--` is scanned as a comment, `<!doctype` (ASCII case-insensitive) as a
/// doctype, and anything else as a bogus declaration.
fn scan_markup_declaration(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let tail = source.get(start..).unwrap_or("");
    let is_comment = tail.starts_with("<!--");
    let is_doctype = tail
        .get(..9)
        .is_some_and(|head| head.eq_ignore_ascii_case("<!doctype"));

    if is_comment {
        scan_comment(bytes, source, start, position)
    } else if is_doctype {
        scan_doctype(bytes, source, start, position)
    } else {
        // Neither form matched: treated as a bogus declaration.
        scan_bogus_declaration(bytes, source, start, position)
    }
}

/// Scans a comment whose `<!--` opener begins at `start`.
///
/// The comment text is everything between the opener and the offset reported
/// by `find_comment_end`.  The returned offset is three bytes past that point
/// (the length of `-->`), clamped to the end of input.
fn scan_comment(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    // `<!--` is four bytes long.
    let body_start = start + 4;
    let body_end = find_comment_end(bytes, body_start);
    let resume = bytes.len().min(body_end + 3);
    let end_position = advance_position(position, source.get(start..resume).unwrap_or(""));
    let comment = Token::Comment {
        text: source
            .get(body_start..body_end)
            .map(str::to_owned)
            .unwrap_or_default(),
        span: Span::new(position, end_position),
    };
    (comment, resume, end_position)
}

/// Returns the offset of the first `-->` at or after `start`, or
/// `bytes.len()` when no terminator is present.
///
/// The search slides a three-byte window over the input instead of making
/// one recursive call per byte, so the length of a comment body has no
/// effect on stack usage.
fn find_comment_end(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|tail| tail.windows(3).position(|window| window == b"-->"))
        .map_or(bytes.len(), |offset| start + offset)
}

/// Scans a doctype declaration whose `<!doctype` opener begins at `start`.
///
/// The body runs from just after the opener to the next `>` (or end of
/// input); it is trimmed and handed to `parse_doctype_body`.  The returned
/// offset is one past the `>`, clamped to the end of input.
fn scan_doctype(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    // Byte length of `<!doctype`.
    const OPENER_LEN: usize = 9;

    let body_start = start + OPENER_LEN;
    let close = scan_until(bytes, body_start, |byte| byte == b'>');
    let resume = bytes.len().min(close + 1);
    let end_position = advance_position(position, source.get(start..resume).unwrap_or(""));
    let trimmed = source.get(body_start..close).map_or("", str::trim);
    let (name, public_id, system_id) = parse_doctype_body(trimmed);
    let token = Token::Doctype {
        name,
        public_id,
        system_id,
        span: Span::new(position, end_position),
    };
    (token, resume, end_position)
}

/// Splits the text between `<!doctype` and `>` into
/// `(name, public_id, system_id)`.
///
/// * `name` is the first whitespace-delimited word, ASCII-lowercased (empty
///   when the body is empty).
/// * After the keyword `PUBLIC` (ASCII case-insensitive) the next identifier
///   is the public id and the one after it, if present, the system id.
/// * After the keyword `SYSTEM` the next identifier is the system id.
/// * Any other keyword, or none, yields `None` for both ids.
///
/// Quoted identifiers are read up to their matching quote, so identifiers
/// containing spaces such as `"-//W3C//DTD HTML 4.01//EN"` are kept whole.
fn parse_doctype_body(body: &str) -> (String, Option<String>, Option<String>) {
    let (raw_name, after_name) = split_doctype_word(body);
    let name = raw_name.to_ascii_lowercase();
    let (keyword, after_keyword) = split_doctype_word(after_name);

    if keyword.eq_ignore_ascii_case("PUBLIC") {
        let (public_id, after_public) = take_doctype_identifier(after_keyword);
        // A system id can only follow a public id that is actually present.
        let system_id = public_id
            .as_ref()
            .and_then(|_| take_doctype_identifier(after_public).0);
        (name, public_id, system_id)
    } else if keyword.eq_ignore_ascii_case("SYSTEM") {
        let (system_id, _) = take_doctype_identifier(after_keyword);
        (name, None, system_id)
    } else {
        (name, None, None)
    }
}

/// Skips leading ASCII whitespace, then splits `input` at the next ASCII
/// whitespace character into `(word, remainder)`.
fn split_doctype_word(input: &str) -> (&str, &str) {
    let trimmed = input.trim_start_matches(|c: char| c.is_ascii_whitespace());
    let end = trimmed
        .find(|c: char| c.is_ascii_whitespace())
        .unwrap_or(trimmed.len());
    trimmed.split_at(end)
}

/// Reads one doctype identifier from the front of `input`, returning it with
/// the unread remainder.
///
/// A leading `"` or `'` starts a quoted identifier that ends at the matching
/// quote (or at end of input when unterminated).  Otherwise the identifier is
/// the next whitespace-delimited word with surrounding quote characters
/// stripped.  Returns `None` when nothing but whitespace is left.
fn take_doctype_identifier(input: &str) -> (Option<String>, &str) {
    let trimmed = input.trim_start_matches(|c: char| c.is_ascii_whitespace());
    let mut chars = trimmed.chars();
    match chars.next() {
        None => (None, trimmed),
        Some(quote @ ('"' | '\'')) => {
            let inner = chars.as_str();
            match inner.split_once(quote) {
                Some((identifier, rest)) => (Some(identifier.to_owned()), rest),
                None => (Some(inner.to_owned()), ""),
            }
        }
        Some(_) => {
            let (word, rest) = split_doctype_word(trimmed);
            let identifier = word.trim_matches('"').trim_matches('\'').to_owned();
            (Some(identifier), rest)
        }
    }
}

/// Scans an unrecognised `<!...` declaration beginning at `start` and emits
/// it as a comment.
///
/// The comment text is everything after the leading `<!` up to the next `>`
/// (or end of input).  The returned offset is one past the `>`, clamped to
/// the end of input.
fn scan_bogus_declaration(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let close = scan_until(bytes, start, |byte| byte == b'>');
    let resume = bytes.len().min(close + 1);
    let end_position = advance_position(position, source.get(start..resume).unwrap_or(""));
    // Skip the two-byte `<!` prefix when extracting the text.
    let text = source
        .get(start + 2..close)
        .map(str::to_owned)
        .unwrap_or_default();
    let comment = Token::Comment {
        text,
        span: Span::new(position, end_position),
    };
    (comment, resume, end_position)
}

/// Scans an end tag whose `</` begins at `start`.
///
/// The name is the run of bytes accepted by `is_ascii_alphanumeric_byte`
/// after `</`, ASCII-lowercased.  Everything up to the next `>` is then
/// skipped; the returned offset is one past it, clamped to the end of input.
fn scan_end_tag(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    // Step over the two-byte `</` prefix.
    let name_start = start + 2;
    let name_end = scan_while(bytes, name_start, is_ascii_alphanumeric_byte);
    let close = scan_until(bytes, name_end, |byte| byte == b'>');
    let resume = bytes.len().min(close + 1);
    let end_position = advance_position(position, source.get(start..resume).unwrap_or(""));
    let name = source
        .get(name_start..name_end)
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();
    let token = Token::EndTag {
        name,
        span: Span::new(position, end_position),
    };
    (token, resume, end_position)
}

/// Scans a start tag whose `<` begins at `start`.
///
/// The name is the run of bytes accepted by `is_ascii_alphanumeric_byte`
/// after `<`, ASCII-lowercased; attributes are read by `scan_attributes`.
/// The token's `self_closing` flag is set when the tag ends in `/>` or when
/// `crate::node::is_void_element` accepts the name.
fn scan_start_tag(
    bytes: &[u8],
    source: &str,
    start: usize,
    position: Position,
) -> (Token, usize, Position) {
    let name_start = start + 1;
    let name_end = scan_while(bytes, name_start, is_ascii_alphanumeric_byte);
    let name = source
        .get(name_start..name_end)
        .map(str::to_ascii_lowercase)
        .unwrap_or_default();

    let (attributes, close, explicit_self_close) =
        scan_attributes(bytes, source, name_end, Attributes::new());
    // `close` is the offset reported by `scan_attributes`; resume one past it.
    let resume = bytes.len().min(close + 1);
    let end_position = advance_position(position, source.get(start..resume).unwrap_or(""));

    let self_closing = explicit_self_close || crate::node::is_void_element(&name);
    let token = Token::StartTag {
        name,
        attributes,
        self_closing,
        span: Span::new(position, end_position),
    };
    (token, resume, end_position)
}

/// Reads attributes starting at byte `start` until the tag closes.
///
/// Returns `(attributes, offset, self_closing)` where `attributes` is `acc`
/// plus every attribute found, and `offset` is the index of the closing `>`
/// (or `bytes.len()` when the input ends first).  `self_closing` is true
/// only when the tag ends in `/>`.
///
/// Attributes are consumed in a loop, so a tag with many attributes or stray
/// bytes does not deepen the call stack.
fn scan_attributes(
    bytes: &[u8],
    source: &str,
    start: usize,
    acc: Attributes,
) -> (Attributes, usize, bool) {
    let mut attributes = acc;
    let mut cursor = start;
    loop {
        let skipped = skip_whitespace(bytes, cursor);
        if at_byte(bytes, skipped, b'>') {
            return (attributes, skipped, false);
        }
        if at_byte(bytes, skipped, b'/') && at_byte(bytes, skipped + 1, b'>') {
            return (attributes, skipped + 1, true);
        }
        if skipped >= bytes.len() {
            return (attributes, skipped, false);
        }
        // `scan_one_attribute` always returns an offset past `skipped`, so
        // the loop makes progress on every pass.
        let (attr, next_pos) = scan_one_attribute(bytes, source, skipped);
        if let Some(found) = attr {
            attributes = attributes.with(found);
        }
        cursor = next_pos;
    }
}

/// Reads a single attribute starting at byte `start`.
///
/// Returns the attribute (name ASCII-lowercased) and the offset at which
/// scanning should continue.  An attribute without `=` gets an empty value.
/// When no attribute-name byte is found at `start`, returns `None` and an
/// offset one byte further on.
fn scan_one_attribute(bytes: &[u8], source: &str, start: usize) -> (Option<Attribute>, usize) {
    let name_end = scan_while(bytes, start, is_attribute_name_byte);
    if name_end == start {
        // Stray byte: step over it so the caller always makes progress.
        return (None, start + 1);
    }

    let name = source
        .get(start..name_end)
        .unwrap_or("")
        .to_ascii_lowercase();
    let equals_at = skip_whitespace(bytes, name_end);
    if !at_byte(bytes, equals_at, b'=') {
        return (Some(Attribute::new(name, "")), name_end);
    }

    let value_start = skip_whitespace(bytes, equals_at + 1);
    let (value, resume) = scan_attribute_value(bytes, source, value_start);
    (Some(Attribute::new(name, value)), resume)
}

/// Reads an attribute value starting at byte `start` and returns it,
/// entity-decoded, together with the offset just past it.
///
/// A value opening with `"` or `'` runs to the matching quote (or end of
/// input) and the returned offset is past the closing quote.  Any other
/// value runs while `is_unquoted_value_byte` accepts the bytes.
fn scan_attribute_value(bytes: &[u8], source: &str, start: usize) -> (String, usize) {
    let quote = bytes
        .get(start)
        .copied()
        .filter(|byte| matches!(byte, b'"' | b'\''));
    match quote {
        Some(delimiter) => {
            let open = start + 1;
            let close = scan_until(bytes, open, |byte| byte == delimiter);
            let raw = source.get(open..close).unwrap_or("");
            (decode_entities(raw), bytes.len().min(close + 1))
        }
        None => {
            let end = scan_while(bytes, start, is_unquoted_value_byte);
            let raw = source.get(start..end).unwrap_or("");
            (decode_entities(raw), end)
        }
    }
}

/// Returns `raw` with its character references decoded, starting from the
/// beginning of the string with an empty accumulator.
fn decode_entities(raw: &str) -> String {
    let seed = String::new();
    decode_entities_recursive(raw, 0, seed)
}

// `acc` is threaded by value as an accumulator (format! borrows it for
// each step rather than moving), so clippy's pass-by-value lint reads
// false-positive here.
#[allow(clippy::needless_pass_by_value)]
fn decode_entities_recursive(raw: &str, start: usize, acc: String) -> String {
    let amp_rel = raw.get(start..).and_then(|tail| tail.find('&'));
    if let Some(rel) = amp_rel {
        let amp_pos = start + rel;
        let prefix = raw.get(start..amp_pos).unwrap_or("");
        let body_end = raw
            .get(amp_pos + 1..)
            .and_then(|tail| tail.find(|c: char| c == ';' || c.is_whitespace() || c == '<'))
            .map_or(raw.len(), |p| amp_pos + 1 + p);
        let semi = raw.get(body_end..body_end + 1) == Some(";");
        if semi {
            let body = raw.get(amp_pos + 1..body_end).unwrap_or("");
            let next_start = body_end + 1;
            let decoded = entity::decode(body).unwrap_or_else(|| format!("&{body};"));
            decode_entities_recursive(raw, next_start, format!("{acc}{prefix}{decoded}"))
        } else {
            let next_start = amp_pos + 1;
            decode_entities_recursive(raw, next_start, format!("{acc}{prefix}&"))
        }
    } else {
        let tail = raw.get(start..).unwrap_or("");
        format!("{acc}{tail}")
    }
}

/// Tests whether `bytes[pos]` exists and equals `byte`.
fn at_byte(bytes: &[u8], pos: usize, byte: u8) -> bool {
    matches!(bytes.get(pos), Some(&found) if found == byte)
}

/// Tests whether `byte` is present and is an ASCII letter (`a`-`z` or
/// `A`-`Z`).
fn is_ascii_letter(byte: Option<u8>) -> bool {
    matches!(byte, Some(b'a'..=b'z' | b'A'..=b'Z'))
}

/// Tests whether `byte` is an ASCII letter, an ASCII digit, or `-`.
fn is_ascii_alphanumeric_byte(byte: u8) -> bool {
    matches!(byte, b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z')
}

/// Tests whether `byte` may appear in an attribute name.
///
/// Every byte is accepted except space, tab, line feed, carriage return,
/// `/`, `>`, `=`, `"`, `'` and `<`.
fn is_attribute_name_byte(byte: u8) -> bool {
    const NAME_STOPPERS: &[u8] = b" \t\n\r/>=\"'<";
    !NAME_STOPPERS.contains(&byte)
}

/// Tests whether `byte` continues an unquoted attribute value.
///
/// Only space, tab, line feed, carriage return and `>` end the value.  The
/// characters `"`, `'`, `<`, `=` and `` ` `` are parse errors in the HTML
/// tokenizer's unquoted-value state but are still appended to the value, so
/// they are accepted here; this keeps values such as `/search?q=foo` whole.
fn is_unquoted_value_byte(byte: u8) -> bool {
    !matches!(byte, b' ' | b'\t' | b'\n' | b'\r' | b'>')
}

/// Returns the index of the first byte at or after `start` for which `stop`
/// returns true, or `bytes.len()` when there is none (including when `start`
/// is past the end).
fn scan_until(bytes: &[u8], start: usize, stop: impl Fn(u8) -> bool) -> usize {
    let hit = bytes
        .get(start..)
        .and_then(|tail| tail.iter().position(|&byte| stop(byte)));
    match hit {
        Some(offset) => start + offset,
        None => bytes.len(),
    }
}

/// Returns the index of the first byte at or after `start` for which `take`
/// returns false, or `bytes.len()` when every remaining byte is accepted
/// (including when `start` is past the end).
fn scan_while(bytes: &[u8], start: usize, take: impl Fn(u8) -> bool) -> usize {
    let rejected = bytes
        .get(start..)
        .and_then(|tail| tail.iter().position(|&byte| !take(byte)));
    match rejected {
        Some(offset) => start + offset,
        None => bytes.len(),
    }
}

/// Returns the index of the first byte at or after `start` that is not a
/// space, tab, line feed or carriage return.
fn skip_whitespace(bytes: &[u8], start: usize) -> usize {
    const SPACES: &[u8] = b" \t\n\r";
    scan_while(bytes, start, |byte| SPACES.contains(&byte))
}

/// Returns the position reached after moving from `start` across `text`.
///
/// Each `\n` increments the line and resets the column to 1; every other
/// character increments the column.  The offset grows by each character's
/// UTF-8 length as reported by `char_len`.
fn advance_position(start: Position, text: &str) -> Position {
    let mut cursor = start;
    for c in text.chars() {
        let offset = cursor.offset() + char_len(c);
        cursor = match c {
            '\n' => Position::new(cursor.line() + 1, 1, offset),
            _ => Position::new(cursor.line(), cursor.column() + 1, offset),
        };
    }
    cursor
}

/// Returns the UTF-8 encoded length of `c` in bytes as a `u32`, falling back
/// to 1 if the length does not fit.
fn char_len(c: char) -> u32 {
    match u32::try_from(c.len_utf8()) {
        Ok(width) => width,
        Err(_) => 1,
    }
}