fast_h2m 0.4.2

High-performance HTML to Markdown converter
Documentation
//! Single-pass byte scanner that cleans HTML and emits signals
//! consumed by the tier-1/tier-2 router (added in M2).

use std::borrow::Cow;
use std::ops::Range;
use std::str;

use memchr::memchr;

/// Signals captured during the single prescan pass.
///
/// Every field starts at its `Default` value (`None` / `false`) and is only
/// ever switched on by the scanner; a flag that is still `false` afterwards
/// means the corresponding construct was never encountered.
#[derive(Debug, Default, Clone)]
pub struct PrescanReport {
    /// Byte range of the contents of `<head>…</head>` (between the tags) in the
    /// **cleaned** buffer, or `None` when no `<head>` open tag was seen outside
    /// SVG. If `<head>` is opened but never closed, the range extends to the
    /// end of the cleaned buffer.
    pub head_range: Option<Range<usize>>,
    /// An open tag outside SVG had a name containing `-` (custom-elements
    /// heuristic). Close tags are not inspected.
    pub had_custom_elements: bool,
    /// The scanner encountered `<![CDATA[`.
    pub had_cdata: bool,
    /// At least one `<` did not look like the start of a tag and was rewritten
    /// to `&lt;` in the cleaned buffer.
    pub had_unescaped_lt: bool,
    /// A `<script>` or `<style>` open tag was found outside SVG (and its
    /// content was stripped).
    pub has_script_or_style: bool,
    /// An `<svg …>` open tag was seen, i.e. SVG depth exceeded zero at some point.
    pub has_svg: bool,
}

// Tags that are stripped of their content by the prescan: the open tag is
// kept and everything up to and including the matching close tag is replaced
// by a bare close tag. Names are lowercase ASCII; matching is case-insensitive.
const STRIP_CONTENT_TAGS: [&[u8]; 2] = [b"script", b"style"];

// Tag name used to track SVG nesting depth.
const SVG_TAG: &[u8] = b"svg";
// Tag name used to locate the `<head>` contents range.
const HEAD_TAG: &[u8] = b"head";
// Literal prefix that raises the CDATA signal (matched case-sensitively).
const CDATA_START: &[u8] = b"<![CDATA[";
// Keyword looked for after `<!` (compared ASCII-case-insensitively) when
// stripping doctype declarations.
const DOCTYPE: &[u8] = b"doctype";
// Exact byte sequence rewritten to `<!-- -->`.
const EMPTY_COMMENT: &[u8] = b"<!---->";
// Exact (case-sensitive, attribute-free) self-closing void tags and the
// replacement text emitted for each.
const SELF_CLOSING: [(&[u8], &str); 3] =
    [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];

/// Run the prescan over `html`, returning the cleaned buffer and signals.
///
/// `Cow::Borrowed` is returned when no transformation was needed.
///
/// # Panics
///
/// Panics if a tag-name byte sequence encountered during script/style stripping
/// is not valid UTF-8 (this cannot happen in practice because it is always a
/// sub-slice of the valid UTF-8 input `html`).
pub fn run(html: &str) -> (Cow<'_, str>, PrescanReport) {
    let bytes = html.as_bytes();
    let len = bytes.len();

    if len == 0 {
        return (Cow::Borrowed(html), PrescanReport::default());
    }

    let mut report = PrescanReport::default();

    let mut idx = 0usize;
    let mut last = 0usize;
    let mut output: Option<String> = None;

    let mut svg_depth = 0usize;

    // Head-range tracking: byte index in the *output* buffer after `<head…>` closes.
    let mut head_open_end: Option<usize> = None;

    while idx < len {
        if bytes[idx] != b'<' {
            match memchr(b'<', &bytes[idx + 1..]) {
                Some(next) => {
                    idx += next + 1;
                }
                None => break,
            }
        }

        // ── `<![CDATA[` detection (signal only; cleaning falls through) ─────────
        // The `<` in `<![CDATA[` will be processed by the is_valid_tag check below
        // (it is NOT a valid tag: `!` followed by `[` fails the validity test), so
        // it gets escaped to `&lt;` — exactly what the original preprocess_html did.
        // We only set the signal here without `continue`.
        if bytes[idx..].starts_with(CDATA_START) {
            report.had_cdata = true;
            // Fall through to is_valid_tag / escape logic below.
        }

        // ── Empty-comment normalisation: `<!---->` → `<!-- -->` ───────────────
        if bytes[idx..].starts_with(EMPTY_COMMENT) {
            let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
            // flush output position accounting for bytes emitted into `output`
            out.push_str(&html[last..idx]);
            out.push_str("<!-- -->");
            idx += EMPTY_COMMENT.len();
            last = idx;
            continue;
        }

        // ── Self-closing normalisation: `<br/>` → `<br>` etc. ────────────────
        {
            let mut replaced = false;
            for (pattern, replacement) in &SELF_CLOSING {
                if bytes[idx..].starts_with(pattern) {
                    let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
                    out.push_str(&html[last..idx]);
                    out.push_str(replacement);
                    idx += pattern.len();
                    last = idx;
                    replaced = true;
                    break;
                }
            }
            if replaced {
                continue;
            }
        }

        // ── SVG open / close ──────────────────────────────────────────────────
        if matches_tag_start(bytes, idx + 1, SVG_TAG) {
            if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG_TAG.len()) {
                svg_depth += 1;
                report.has_svg = true;
                idx = open_end;
                continue;
            }
        } else if matches_end_tag_start(bytes, idx + 1, SVG_TAG)
            && let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG_TAG.len())
        {
            if svg_depth > 0 {
                svg_depth = svg_depth.saturating_sub(1);
            }
            idx = close_end;
            continue;
        }

        // ── Operations only outside SVG ───────────────────────────────────────
        if svg_depth == 0 {
            // ── `<script>` / `<style>` content stripping ──────────────────────
            let mut handled = false;
            for tag in &STRIP_CONTENT_TAGS {
                if matches_tag_start(bytes, idx + 1, tag)
                    && let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len())
                {
                    report.has_script_or_style = true;
                    let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
                    let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
                    out.push_str(&html[last..idx]);
                    out.push_str(&html[idx..open_end]);
                    out.push_str("</");
                    out.push_str(str::from_utf8(tag).unwrap());
                    out.push('>');
                    last = remove_end;
                    idx = remove_end;
                    handled = true;
                    break;
                }
            }

            if handled {
                continue;
            }

            // ── DOCTYPE stripping ─────────────────────────────────────────────
            if idx + 2 < len && bytes[idx + 1] == b'!' {
                let mut cursor = idx + 2;
                while cursor < len && bytes[cursor].is_ascii_whitespace() {
                    cursor += 1;
                }
                if cursor + DOCTYPE.len() <= len
                    && bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
                    && let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len())
                {
                    let out = output.get_or_insert_with(|| String::with_capacity(html.len()));
                    out.push_str(&html[last..idx]);
                    last = end;
                    idx = end;
                    continue;
                }
            }

            // ── Signal: `<head>` / `</head>` ─────────────────────────────────
            if matches_tag_start(bytes, idx + 1, HEAD_TAG) {
                if let Some(open_end) = find_tag_end(bytes, idx + 1 + HEAD_TAG.len()) {
                    // Record output position after the `<head…>` close-bracket.
                    // We need to compute the offset in the *output* buffer.
                    let flushed_so_far = if let Some(ref out) = output {
                        out.len() + (open_end - last)
                    } else {
                        open_end
                    };
                    head_open_end = Some(flushed_so_far);
                    idx = open_end;
                    continue;
                }
            } else if matches_end_tag_start(bytes, idx + 1, HEAD_TAG)
                && let Some(close_end) = find_tag_end(bytes, idx + 2 + HEAD_TAG.len())
            {
                if let Some(start) = head_open_end.take() {
                    // The `</head>` tag itself starts at the current output position.
                    let flushed_so_far = if let Some(ref out) = output {
                        out.len() + (idx - last)
                    } else {
                        idx
                    };
                    report.head_range = Some(start..flushed_so_far);
                }
                idx = close_end;
                continue;
            }

            // ── Signal: custom elements (tag name contains `-`) ───────────────
            // Only fires for open tags, not close tags.
            {
                let tag_start = idx + 1;
                if tag_start < len && (bytes[tag_start].is_ascii_alphabetic()) {
                    // Find the end of the tag name.
                    let name_end = {
                        let mut e = tag_start;
                        while e < len
                            && (bytes[e].is_ascii_alphanumeric()
                                || bytes[e] == b'-'
                                || bytes[e] == b'_')
                        {
                            e += 1;
                        }
                        e
                    };
                    let tag_name = &bytes[tag_start..name_end];
                    if tag_name.contains(&b'-') {
                        report.had_custom_elements = true;
                    }
                }
            }
        }

        // ── Validity check (applies at all depths) ────────────────────────────
        let is_valid_tag = if idx + 1 < len {
            match bytes[idx + 1] {
                b'!' => {
                    idx + 2 < len
                        && (bytes[idx + 2] == b'-'
                            || bytes[idx + 2].is_ascii_alphabetic()
                            || bytes[idx + 2].is_ascii_uppercase())
                }
                b'/' => {
                    idx + 2 < len
                        && (bytes[idx + 2].is_ascii_alphabetic()
                            || bytes[idx + 2].is_ascii_uppercase())
                }
                b'?' => true,
                c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
                _ => false,
            }
        } else {
            false
        };

        if !is_valid_tag {
            report.had_unescaped_lt = true;
            let out = output.get_or_insert_with(|| String::with_capacity(html.len() + 4));
            out.push_str(&html[last..idx]);
            out.push_str("&lt;");
            idx += 1;
            last = idx;
            continue;
        }

        idx += 1;
    }

    // If `<head>` was opened but `</head>` was never seen, record to EOF.
    if let Some(start) = head_open_end.take() {
        let end = if let Some(ref out) = output {
            out.len() + (len - last)
        } else {
            len
        };
        report.head_range = Some(start..end);
    }

    let cow = if let Some(mut out) = output {
        if last < len {
            out.push_str(&html[last..]);
        }
        Cow::Owned(out)
    } else {
        Cow::Borrowed(html)
    };

    (cow, report)
}

// ── Private helpers (mirrors of the ones in converter.rs) ──────────────────

/// Returns `true` when `bytes[start..]` begins with the tag name `tag`
/// (compared ASCII-case-insensitively) and the name is terminated by `>`,
/// `/`, a space, tab, newline, carriage return, or the end of input.
///
/// `start` is the index of the first byte of the name, i.e. just past `<`
/// (or past `</` for end tags). An out-of-range `start` yields `false`.
fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }

    let name_end = start + tag.len();
    let Some(candidate) = bytes.get(start..name_end) else {
        // Not enough input left to hold the whole name.
        return false;
    };
    if !candidate.eq_ignore_ascii_case(tag) {
        return false;
    }

    // The name must end here; otherwise `<header>` would match `head`.
    match bytes.get(name_end) {
        None => true,
        Some(&terminator) => matches!(terminator, b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r'),
    }
}

/// Returns `true` when `bytes[start]` is `/` and the bytes after it form the
/// start of tag `tag` as judged by `matches_tag_start`.
///
/// `start` is the index just past `<`; an out-of-range `start` yields `false`.
fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    match bytes.get(start) {
        Some(b'/') => matches_tag_start(bytes, start + 1, tag),
        _ => false,
    }
}

/// Scans forward from `idx` for the `>` that closes a tag, ignoring any `>`
/// found inside a single- or double-quoted attribute value.
///
/// Returns the index one past that `>`, or `None` when the input ends first
/// (including when a quote is left open).
fn find_tag_end(bytes: &[u8], mut idx: usize) -> Option<usize> {
    // The quote character currently open, if any.
    let mut open_quote: Option<u8> = None;

    while idx < bytes.len() {
        // Only quotes and `>` are interesting; running out of them means the
        // tag never closes.
        let hit = find_quote_or_close(bytes, idx)?;
        let byte = bytes[hit];

        match (byte, open_quote) {
            (b'>', None) => return Some(hit + 1),
            // A quote outside any quoted value opens one.
            (b'"' | b'\'', None) => open_quote = Some(byte),
            // Only the same quote character closes the value.
            (b'"' | b'\'', Some(opener)) if opener == byte => open_quote = None,
            // The other quote character, or `>`, inside a quoted value.
            _ => {}
        }

        idx = hit + 1;
    }

    None
}

fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
    let len = bytes.len();
    let mut depth = 1usize;
    while idx < len {
        let Some(next_lt) = find_lt(bytes, idx) else {
            break;
        };
        idx = next_lt;
        if matches_tag_start(bytes, idx + 1, tag) {
            if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
                depth += 1;
                idx = next;
                continue;
            }
        } else if matches_end_tag_start(bytes, idx + 1, tag)
            && let Some(close) = find_tag_end(bytes, idx + 2 + tag.len())
        {
            depth -= 1;
            if depth == 0 {
                return Some(close);
            }
            idx = close;
            continue;
        }
        idx += 1;
    }
    None
}

/// Absolute index of the first `"`, `'` or `>` at or after `start`, if any.
#[inline]
fn find_quote_or_close(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = &bytes[start..];
    let offset = memchr::memchr3(b'"', b'\'', b'>', tail)?;
    Some(start + offset)
}

/// Absolute index of the first `<` at or after `start`, if any.
#[inline]
fn find_lt(bytes: &[u8], start: usize) -> Option<usize> {
    let tail = &bytes[start..];
    let offset = memchr(b'<', tail)?;
    Some(start + offset)
}