// browser_tester 1.5.0
//
// Deterministic lightweight browser runtime for Rust tests.
// Documentation
use super::*;

#[path = "html/html_optional_tag_rules.rs"]
mod html_optional_tag_rules;
#[path = "html/html_raw_end_tag_scanner.rs"]
mod html_raw_end_tag_scanner;
#[path = "html/html_script_helpers.rs"]
mod html_script_helpers;
#[path = "html/html_tag_parsing.rs"]
mod html_tag_parsing;
#[path = "html/html_text_entities.rs"]
mod html_text_entities;

use html_optional_tag_rules::*;
use html_raw_end_tag_scanner::*;
use html_script_helpers::*;
use html_tag_parsing::*;
use html_text_entities::*;

/// Resolves backslash escape sequences in `src` and returns the resulting text.
///
/// Recognized forms:
/// - `\n`, `\r`, `\t` become newline, carriage return and tab.
/// - `\uXXXX` (exactly four hex digits) is converted through
///   `crate::js_regex::internalize_utf16_code_unit`.
/// - `\xXX` (exactly two hex digits) becomes the character with that code.
/// - A backslash followed by any other character yields that character
///   unchanged, so `\\`, `\'`, `\"`, `` \` `` and `\$` simply drop the
///   backslash. The same applies to a `u` or `x` that is not followed by the
///   required number of hex digits.
/// - A lone backslash at the very end of the input is kept as-is.
pub(super) fn unescape_string(src: &str) -> String {
    /// Returns the `width` characters starting at `start` as a string, but only
    /// when they all exist and every one of them is an ASCII hex digit.
    fn hex_run(chars: &[char], start: usize, width: usize) -> Option<String> {
        chars
            .get(start..start + width)
            .filter(|run| run.iter().all(char::is_ascii_hexdigit))
            .map(|run| run.iter().collect())
    }

    let chars: Vec<char> = src.chars().collect();
    let mut out = String::new();
    let mut pos = 0usize;

    while pos < chars.len() {
        let current = chars[pos];
        // Ordinary characters, and a backslash with nothing after it, are
        // copied through verbatim.
        if current != '\\' || pos + 1 >= chars.len() {
            out.push(current);
            pos += 1;
            continue;
        }

        // Each escape resolves to the character to emit plus the number of
        // input characters (backslash included) that it consumed.
        let (replacement, consumed) = match chars[pos + 1] {
            'n' => ('\n', 2),
            'r' => ('\r', 2),
            't' => ('\t', 2),
            'u' => hex_run(&chars, pos + 2, 4)
                .and_then(|digits| u16::from_str_radix(&digits, 16).ok())
                .map_or(('u', 2), |unit| {
                    (crate::js_regex::internalize_utf16_code_unit(unit), 6)
                }),
            'x' => hex_run(&chars, pos + 2, 2)
                .and_then(|digits| u32::from_str_radix(&digits, 16).ok())
                .and_then(char::from_u32)
                .map_or(('x', 2), |ch| (ch, 4)),
            // Quotes, backslash, backtick, `$` and every unrecognized escape:
            // emit the escaped character itself.
            other => (other, 2),
        };
        out.push(replacement);
        pos += consumed;
    }

    out
}

/// Parses `html` into a DOM and collects the scripts found along the way.
///
/// The input is scanned in a single forward pass over its bytes. `stack`
/// holds the chain of currently open elements (it starts as `[dom.root]`);
/// new elements and text nodes are attached to whatever is on top of it.
///
/// Handling per construct:
/// - `<!-- ... -->` comments are skipped; no node is created for them.
/// - `</tag>` pops open elements until one with a matching name
///   (ASCII case-insensitive) has been popped or only the root remains.
/// - `<!...>` declarations are skipped via `parse_declaration_tag`.
/// - `<script>`, `<noscript>` and `<title>` contents are taken as raw text up
///   to their end tag instead of being parsed as markup.
/// - Any other start tag creates an element, which is pushed on the stack
///   unless it is self-closing or a void tag.
/// - Text between tags is decoded with `decode_html_character_references`.
///
/// After scanning, several `Dom` normalization passes run before the result
/// is returned.
///
/// # Errors
///
/// Returns `Error::HtmlParse` for an unclosed comment or an unclosed
/// `<script>`, `<noscript>` or `<title>`. Errors from the tag parsers,
/// `decode_data_script_source` and the DOM normalization passes are
/// propagated unchanged.
pub(super) fn parse_html(html: &str) -> Result<ParseOutput> {
    let mut dom = Dom::new();
    let mut scripts = Vec::new();

    // Open-element stack; the document root is the initial parent.
    let mut stack = vec![dom.root];
    let bytes = html.as_bytes();
    // Byte offset of the scan cursor into `html`.
    let mut i = 0usize;

    while i < bytes.len() {
        // Comments are checked first so that "<!--" is not treated as a
        // declaration by the "<!" branch below.
        if starts_with_at(bytes, i, b"<!--") {
            if let Some(end) = find_subslice(bytes, i + 4, b"-->") {
                // Resume right after the closing "-->".
                i = end + 3;
            } else {
                return Err(Error::HtmlParse("unclosed HTML comment".into()));
            }
            continue;
        }

        if bytes[i] == b'<' {
            if starts_with_at(bytes, i, b"</") {
                let (tag, next) = parse_end_tag(html, i)?;
                i = next;

                // Pop open elements until the matching one has been removed.
                // The root (index 0) is never popped. Note that an end tag
                // with no matching open element pops everything above the
                // root, because each element is popped before it is compared.
                while stack.len() > 1 {
                    let top = *stack
                        .last()
                        .ok_or_else(|| Error::HtmlParse("invalid stack state".into()))?;
                    let top_tag = dom.tag_name(top).unwrap_or("");
                    stack.pop();
                    if top_tag.eq_ignore_ascii_case(&tag) {
                        break;
                    }
                }
                continue;
            }

            if starts_with_at(bytes, i, b"<!") {
                i = parse_declaration_tag(html, i)?;
                continue;
            }

            let (tag, attrs, self_closing, next) = parse_start_tag(html, i)?;
            i = next;
            // True when any currently open element is a <template>; scripts
            // nested in a template are never queued for execution.
            let inside_template = stack.iter().any(|node| {
                dom.tag_name(*node)
                    .is_some_and(|open_tag| open_tag.eq_ignore_ascii_case("template"))
            });
            // Cloned up front because `attrs` is moved into the new element below.
            let script_src_attr = attrs.get("src").cloned();
            let executable_script = tag.eq_ignore_ascii_case("script")
                && !inside_template
                && is_executable_script_type(attrs.get("type").map(String::as_str));
            let module_script = tag.eq_ignore_ascii_case("script")
                && !inside_template
                && is_module_script_type(attrs.get("type").map(String::as_str));
            // Give each optional-end-tag rule a chance to adjust the stack for
            // the incoming tag before its parent is chosen.
            // NOTE(review): the call order below may be significant — confirm
            // against the helpers before reordering.
            close_optional_description_item_start_tag(&dom, &mut stack, &tag);
            close_optional_list_item_start_tag(&dom, &mut stack, &tag);
            close_optional_option_start_tag(&dom, &mut stack, &tag);
            close_optional_optgroup_start_tag(&dom, &mut stack, &tag);
            close_optional_ruby_text_start_tag(&dom, &mut stack, &tag);
            close_optional_ruby_fallback_parenthesis_start_tag(&dom, &mut stack, &tag);
            close_optional_paragraph_start_tag(&dom, &mut stack, &tag);

            let parent = *stack
                .last()
                .ok_or_else(|| Error::HtmlParse("missing parent element".into()))?;
            let node = dom.create_element(parent, tag.clone(), attrs);

            if tag.eq_ignore_ascii_case("script") {
                // Locate the closing </script>. Executable scripts try
                // `find_case_insensitive_end_tag` first and fall back to the
                // raw scanner; all other scripts use the raw scanner only.
                let close = if executable_script {
                    find_case_insensitive_end_tag(bytes, i, b"script")
                        .or_else(|| find_case_insensitive_raw_end_tag(bytes, i, b"script"))
                } else {
                    find_case_insensitive_raw_end_tag(bytes, i, b"script")
                }
                .ok_or_else(|| Error::HtmlParse("unclosed <script>".into()))?;
                if let Some(script_body) = html.get(i..close) {
                    // The body is kept verbatim (no entity decoding) as a text child.
                    if !script_body.is_empty() {
                        dom.create_text(node, script_body.to_string());
                    }
                    if executable_script {
                        if let Some(src) = script_src_attr.as_deref() {
                            // With a `src` attribute the inline body is not
                            // queued; only a source that
                            // `decode_data_script_source` yields is.
                            if let Some(source) = decode_data_script_source(src)? {
                                scripts.push(ScriptSource {
                                    code: source,
                                    is_module: module_script,
                                });
                            }
                        } else if !script_body.is_empty() {
                            scripts.push(ScriptSource {
                                code: script_body.to_string(),
                                is_module: module_script,
                            });
                        }
                    }
                }
                // Consume the end tag; the script element is never pushed on the stack.
                i = close;
                let (_, after_end) = parse_end_tag(html, i)?;
                i = after_end;
                continue;
            }

            if tag.eq_ignore_ascii_case("noscript") && !self_closing {
                let close = find_case_insensitive_raw_end_tag(bytes, i, b"noscript")
                    .ok_or_else(|| Error::HtmlParse("unclosed <noscript>".into()))?;
                // <noscript> content is stored as one raw text node, without
                // entity decoding or markup parsing.
                if let Some(noscript_body) = html.get(i..close) {
                    if !noscript_body.is_empty() {
                        dom.create_text(node, noscript_body.to_string());
                    }
                }
                i = close;
                let (_, after_end) = parse_end_tag(html, i)?;
                i = after_end;
                continue;
            }

            if tag.eq_ignore_ascii_case("title") && !self_closing {
                let close = find_case_insensitive_raw_end_tag(bytes, i, b"title")
                    .ok_or_else(|| Error::HtmlParse("unclosed <title>".into()))?;
                // <title> content is not parsed as markup, but character
                // references in it are decoded.
                if let Some(title_body) = html.get(i..close) {
                    if !title_body.is_empty() {
                        let decoded = decode_html_character_references(title_body);
                        if !decoded.is_empty() {
                            dom.create_text(node, decoded);
                        }
                    }
                }
                i = close;
                let (_, after_end) = parse_end_tag(html, i)?;
                i = after_end;
                continue;
            }

            // Only elements that can have children become the new parent.
            if !self_closing && !is_void_tag(&tag) {
                stack.push(node);
            }
            continue;
        }

        // Plain text: consume everything up to the next '<' (or end of input).
        let text_start = i;
        while i < bytes.len() && bytes[i] != b'<' {
            i += 1;
        }

        if let Some(text) = html.get(text_start..i) {
            if !text.is_empty() {
                let parent = *stack
                    .last()
                    .ok_or_else(|| Error::HtmlParse("missing parent element".into()))?;
                let mut decoded = decode_html_character_references(text);
                if should_strip_initial_pre_newline(&dom, parent) {
                    decoded = strip_initial_pre_newline(&decoded);
                }
                // Text that ends up empty after decoding/stripping creates no node.
                if !decoded.is_empty() {
                    dom.create_text(parent, decoded);
                }
            }
        }
    }

    // Whole-document fix-ups applied once the tree is built.
    dom.initialize_form_control_values()?;
    dom.normalize_radio_groups()?;
    dom.normalize_named_details_groups()?;
    dom.normalize_single_head_element()?;
    dom.normalize_single_body_element()?;
    dom.normalize_implied_table_bodies()?;
    Ok(ParseOutput { dom, scripts })
}

/// Reports whether `tag` names an HTML void element, i.e. one that never has
/// children or an end tag (`area`, `base`, `br`, `col`, `embed`, `hr`, `img`,
/// `input`, `link`, `meta`, `param`, `source`, `track`, `wbr`).
///
/// The comparison is ASCII case-insensitive, matching how every other tag
/// name comparison in this module is done, so `"BR"` and `"br"` are treated
/// alike. Any other string, including the empty string, returns `false`.
pub(super) fn is_void_tag(tag: &str) -> bool {
    const VOID_TAGS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];
    VOID_TAGS
        .iter()
        .any(|void_tag| tag.eq_ignore_ascii_case(void_tag))
}

/// Decides whether a `/` may begin a regex literal, judging by the byte that
/// precedes it.
///
/// `previous` is `None` when nothing precedes the slash, in which case a
/// regex literal is allowed. Otherwise the answer is `true` only when the
/// preceding byte is one of the punctuation bytes in `REGEX_PRECEDING_BYTES`
/// (opening brackets, separators and operator characters).
pub(super) fn can_start_regex_literal(previous: Option<u8>) -> bool {
    /// Bytes after which a `/` is taken as the start of a regex literal.
    const REGEX_PRECEDING_BYTES: &[u8] = b"([{,;:=!?&|^~<>+-*%/";

    previous.map_or(true, |byte| REGEX_PRECEDING_BYTES.contains(&byte))
}