index-dom 1.0.0

//! HTML parsing boundary for Index.
//!
//! Milestone 1 uses `scraper` to parse hostile, malformed static HTML into a
//! small semantic representation. Terminal rendering concerns stay out of this
//! crate.

use std::fmt::{Display, Formatter};

use scraper::{ElementRef, Html, Selector};
use serde::Deserialize;
use url::Url;

// Upper bound for layout spacer lines.
// NOTE(review): not referenced in this part of the file; presumably caps
// `HtmlNode::Spacer::lines` — confirm against the spacer helpers.
const MAX_LAYOUT_SPACER_LINES: u8 = 3;
// The only manifest `version` string accepted by `parse_index_manifest`.
const INDEX_MANIFEST_VERSION: &str = "index.idx/v1";
// Maximum manifest body size in bytes (32 KiB), checked before any parsing.
const MAX_MANIFEST_BYTES: usize = 32 * 1024;
// Maximum number of entries allowed in each hint category
// (regions, fields, forms, dates).
const MAX_MANIFEST_HINTS: usize = 64;
// Maximum byte length of a trimmed hint string or scope value.
const MAX_MANIFEST_STRING_LEN: usize = 256;
// Maximum number of selector "complexity" characters
// (`>`, `+`, `~`, `[`, `]`, `:`, `*`, `#`) allowed in a manifest selector.
const MAX_MANIFEST_SELECTOR_COMPLEXITY: usize = 16;

/// Parsed and validated `index.idx` manifest.
///
/// Values of this type are produced by [`parse_index_manifest`], which has
/// already enforced the size, origin, scope, and per-hint limits.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexManifest {
    /// Protocol version. Always the single supported version string when
    /// produced by [`parse_index_manifest`].
    pub version: String,
    /// Manifest source URL, stored exactly as supplied by the caller.
    pub source_url: String,
    /// Scope path prefix. Defaults to `/` when the manifest omits it.
    pub scope: String,
    /// Content presentation hints.
    pub content: IndexContentHint,
    /// Region hints, in manifest order.
    pub regions: Vec<IndexRegionHint>,
    /// Field hints, in manifest order.
    pub fields: Vec<IndexFieldHint>,
    /// Form hints, in manifest order.
    pub forms: Vec<IndexFormHint>,
    /// Date hints, in manifest order.
    pub dates: Vec<IndexDateHint>,
}

/// Content-level hints.
///
/// The `Default` value carries no hints at all.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct IndexContentHint {
    /// Optional preferred main-content CSS selector. `None` when the manifest
    /// omits it or supplies only whitespace.
    pub main_selector: Option<String>,
}

/// Region hint entry.
///
/// Both strings are stored trimmed and non-empty when produced by
/// [`parse_index_manifest`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexRegionHint {
    /// Stable role label (for example `main`, `navigation`, `related`).
    pub role: String,
    /// CSS selector for the region; validated to parse as a selector.
    pub selector: String,
    /// Whether the region should be initially collapsed. Defaults to `false`
    /// when the manifest omits it.
    pub collapsed: bool,
}

/// Field hint entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexFieldHint {
    /// Stable field name (trimmed, non-empty).
    pub name: String,
    /// Optional field label. `None` when omitted or blank in the manifest.
    pub label: Option<String>,
}

/// Form hint entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexFormHint {
    /// Stable form name (trimmed, non-empty).
    pub name: String,
    /// Optional CSS selector. `None` when omitted or blank in the manifest.
    pub selector: Option<String>,
    /// Optional short note. `None` when omitted or blank in the manifest.
    pub note: Option<String>,
}

/// Presentation style requested by a date hint.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndexDateStyle {
    /// Render only the calendar date.
    Date,
    /// Render both the date and the time of day.
    DateTime,
}

impl IndexDateStyle {
    /// Maps a manifest style string onto a variant.
    ///
    /// Surrounding whitespace is ignored and matching is ASCII
    /// case-insensitive. `date` selects [`Self::Date`]; `datetime` and
    /// `date-time` select [`Self::DateTime`]; anything else yields `None`.
    fn parse(input: &str) -> Option<Self> {
        let candidate = input.trim();
        if candidate.eq_ignore_ascii_case("date") {
            return Some(Self::Date);
        }
        let is_date_time = ["datetime", "date-time"]
            .iter()
            .any(|alias| candidate.eq_ignore_ascii_case(alias));
        is_date_time.then_some(Self::DateTime)
    }
}

/// Date hint entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndexDateHint {
    /// Field name this hint applies to (trimmed, non-empty).
    pub field: String,
    /// Requested presentation style.
    pub style: IndexDateStyle,
}

/// Manifest parsing and validation errors.
///
/// Returned by [`parse_index_manifest`]; the `Display` implementation renders
/// each variant as a single human-readable line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IndexManifestError {
    /// Manifest body exceeds the size limit.
    TooLarge {
        /// Maximum allowed manifest bytes.
        max_bytes: usize,
        /// Actual manifest bytes.
        actual_bytes: usize,
    },
    /// Manifest JSON is invalid. Carries the deserializer's error message.
    InvalidJson(String),
    /// Manifest version is unsupported. Carries the version string found.
    UnsupportedVersion(String),
    /// Manifest source URL is invalid. Carries the URL as supplied.
    InvalidSourceUrl(String),
    /// Page URL is invalid. Carries the URL as supplied.
    InvalidPageUrl(String),
    /// Manifest source is not same-origin with page URL.
    CrossOrigin {
        /// Manifest URL.
        source_url: String,
        /// Page URL.
        page_url: String,
    },
    /// Scope value is invalid (empty, not starting with `/`, or too long).
    /// Carries the trimmed scope value.
    InvalidScope(String),
    /// Scope does not include page URL.
    OutOfScope {
        /// Manifest scope.
        scope: String,
        /// Requested page path.
        page_path: String,
    },
    /// Hint count exceeded limits.
    TooManyHints {
        /// Hint category (`region`, `field`, `form`, or `date`).
        kind: &'static str,
        /// Maximum allowed entries for the category.
        max: usize,
    },
    /// Hint value failed validation.
    InvalidHint {
        /// Dotted hint location, for example `region.selector`.
        kind: &'static str,
        /// Validation failure reason.
        reason: String,
    },
}

impl Display for IndexManifestError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            // Variants carrying a single string payload.
            Self::InvalidJson(error) => {
                f.write_fmt(format_args!("manifest JSON is invalid: {error}"))
            }
            Self::UnsupportedVersion(version) => {
                f.write_fmt(format_args!("unsupported manifest version: {version}"))
            }
            Self::InvalidSourceUrl(url) => {
                f.write_fmt(format_args!("manifest source URL is invalid: {url}"))
            }
            Self::InvalidPageUrl(url) => f.write_fmt(format_args!("page URL is invalid: {url}")),
            Self::InvalidScope(scope) => {
                f.write_fmt(format_args!("manifest scope is invalid: {scope}"))
            }
            // Variants carrying named fields.
            Self::TooLarge {
                max_bytes,
                actual_bytes,
            } => f.write_fmt(format_args!(
                "manifest exceeds limit: {actual_bytes} bytes (max {max_bytes})"
            )),
            Self::CrossOrigin {
                source_url,
                page_url,
            } => f.write_fmt(format_args!(
                "manifest source must be same-origin: {source_url} vs {page_url}"
            )),
            Self::OutOfScope { scope, page_path } => f.write_fmt(format_args!(
                "page path {page_path} is outside manifest scope {scope}"
            )),
            Self::TooManyHints { kind, max } => f.write_fmt(format_args!(
                "manifest has too many {kind} hints (max {max})"
            )),
            Self::InvalidHint { kind, reason } => f.write_fmt(format_args!(
                "manifest {kind} hint is invalid: {reason}"
            )),
        }
    }
}

// Empty impl: relies on the trait's default methods together with the
// `Debug` derive and the `Display` impl above.
impl std::error::Error for IndexManifestError {}

/// Wire-format manifest as deserialized from JSON, before validation.
///
/// Every field except `version` falls back to its `Default` when absent.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
struct RawIndexManifest {
    /// Declared protocol version; no serde default, so it must be present.
    version: String,
    /// Optional scope path prefix.
    #[serde(default)]
    scope: Option<String>,
    /// Content-level hints.
    #[serde(default)]
    content: RawIndexContentHint,
    /// Unvalidated region hints.
    #[serde(default)]
    regions: Vec<RawIndexRegionHint>,
    /// Unvalidated field hints.
    #[serde(default)]
    fields: Vec<RawIndexFieldHint>,
    /// Unvalidated form hints.
    #[serde(default)]
    forms: Vec<RawIndexFormHint>,
    /// Unvalidated date hints.
    #[serde(default)]
    dates: Vec<RawIndexDateHint>,
}

/// Wire-format content hints, before validation.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Default)]
struct RawIndexContentHint {
    /// Optional, unvalidated main-content selector.
    #[serde(default)]
    main_selector: Option<String>,
}

/// Wire-format region hint, before validation.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexRegionHint {
    /// Role label (required).
    role: String,
    /// CSS selector (required).
    selector: String,
    /// Initial collapsed state; `false` when absent.
    #[serde(default)]
    collapsed: bool,
}

/// Wire-format field hint, before validation.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexFieldHint {
    /// Field name (required).
    name: String,
    /// Optional label.
    #[serde(default)]
    label: Option<String>,
}

/// Wire-format form hint, before validation.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexFormHint {
    /// Form name (required).
    name: String,
    /// Optional CSS selector.
    #[serde(default)]
    selector: Option<String>,
    /// Optional short note.
    #[serde(default)]
    note: Option<String>,
}

/// Wire-format date hint, before validation.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
struct RawIndexDateHint {
    /// Field name the hint applies to (required).
    field: String,
    /// Style name (required); interpreted later by `IndexDateStyle::parse`.
    style: String,
}

/// Returns the canonical same-origin `/.well-known/index.idx` candidate URL.
///
/// The scheme, host, and port of `page_url` are kept; the path is replaced
/// and any query string or fragment is dropped.
///
/// Returns `None` when `page_url` does not parse, or when it is a
/// cannot-be-a-base URL (for example `mailto:` or `data:`), which has no
/// hierarchical path under which a well-known location could exist.
#[must_use]
pub fn well_known_index_manifest_url(page_url: &str) -> Option<String> {
    let mut url = Url::parse(page_url).ok()?;
    // Setting a path on a cannot-be-a-base URL would percent-encode the
    // leading slash into the opaque path and yield a meaningless candidate.
    if url.cannot_be_a_base() {
        return None;
    }
    url.set_path("/.well-known/index.idx");
    url.set_query(None);
    url.set_fragment(None);
    Some(url.to_string())
}

/// Discovers a manifest URL from HTML `<link rel="index-manifest">`.
///
/// `rel` is split on whitespace and commas, and tokens are compared ASCII
/// case-insensitively. The first matching link whose `href` can be resolved
/// against `page_url` wins; matching links with an unresolvable `href` are
/// skipped so that one malformed element cannot hide a later valid one.
///
/// Returns `None` when `page_url` does not parse or no usable link exists.
#[must_use]
pub fn discover_index_manifest_link_from_html(html: &str, page_url: &str) -> Option<String> {
    let base = Url::parse(page_url).ok()?;
    let document = Html::parse_document(html);
    let link_selector = selector("link[rel][href]")?;
    document
        .select(&link_selector)
        .filter(|link| {
            link.value()
                .attr("rel")
                .unwrap_or_default()
                .split(|c: char| c.is_whitespace() || c == ',')
                .any(|token| token.eq_ignore_ascii_case("index-manifest"))
        })
        .filter_map(|link| link.value().attr("href"))
        .find_map(|href| base.join(href).ok())
        .map(|joined| joined.to_string())
}

/// Discovers a manifest URL from an HTTP `Link` header value.
///
/// The header is split on `,` into entries of the form `<target>; params`.
/// An entry matches when one of its `rel` parameters (name compared ASCII
/// case-insensitively, optional surrounding double quotes removed) contains
/// the `index-manifest` token. The first matching entry whose target can be
/// resolved against `page_url` wins.
///
/// Malformed entries and unresolvable targets are skipped rather than
/// aborting the search, so one bad entry cannot hide a later valid one.
///
/// Returns `None` when `page_url` does not parse or no usable entry exists.
#[must_use]
pub fn discover_index_manifest_link_from_http_link_header(
    header_value: &str,
    page_url: &str,
) -> Option<String> {
    let base = Url::parse(page_url).ok()?;
    for chunk in header_value.split(',') {
        let Some((target_part, params_part)) = chunk.trim().split_once('>') else {
            continue;
        };
        let Some(target) = target_part.trim().strip_prefix('<') else {
            continue;
        };
        let is_manifest = params_part
            .split(';')
            .filter_map(|param| {
                let (name, value) = param.split_once('=')?;
                name.trim()
                    .eq_ignore_ascii_case("rel")
                    .then(|| value.trim().trim_matches('"'))
            })
            .any(|rel| {
                rel.split_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("index-manifest"))
            });
        if !is_manifest {
            continue;
        }
        if let Ok(joined) = base.join(target) {
            return Some(joined.to_string());
        }
    }
    None
}

/// Parses and validates an `index.idx` manifest body.
pub fn parse_index_manifest(
    input: &str,
    source_url: &str,
    page_url: &str,
) -> Result<IndexManifest, IndexManifestError> {
    let actual_bytes = input.len();
    if actual_bytes > MAX_MANIFEST_BYTES {
        return Err(IndexManifestError::TooLarge {
            max_bytes: MAX_MANIFEST_BYTES,
            actual_bytes,
        });
    }
    let source = Url::parse(source_url)
        .map_err(|_| IndexManifestError::InvalidSourceUrl(source_url.to_owned()))?;
    let page = Url::parse(page_url)
        .map_err(|_| IndexManifestError::InvalidPageUrl(page_url.to_owned()))?;
    if !same_origin(&source, &page) {
        return Err(IndexManifestError::CrossOrigin {
            source_url: source_url.to_owned(),
            page_url: page_url.to_owned(),
        });
    }

    let raw = serde_json::from_str::<RawIndexManifest>(input)
        .map_err(|error| IndexManifestError::InvalidJson(error.to_string()))?;
    if raw.version != INDEX_MANIFEST_VERSION {
        return Err(IndexManifestError::UnsupportedVersion(raw.version));
    }

    let scope = normalize_scope(raw.scope.as_deref(), page.path())?;
    if !page.path().starts_with(&scope) {
        return Err(IndexManifestError::OutOfScope {
            scope,
            page_path: page.path().to_owned(),
        });
    }

    if raw.regions.len() > MAX_MANIFEST_HINTS {
        return Err(IndexManifestError::TooManyHints {
            kind: "region",
            max: MAX_MANIFEST_HINTS,
        });
    }
    if raw.fields.len() > MAX_MANIFEST_HINTS {
        return Err(IndexManifestError::TooManyHints {
            kind: "field",
            max: MAX_MANIFEST_HINTS,
        });
    }
    if raw.forms.len() > MAX_MANIFEST_HINTS {
        return Err(IndexManifestError::TooManyHints {
            kind: "form",
            max: MAX_MANIFEST_HINTS,
        });
    }
    if raw.dates.len() > MAX_MANIFEST_HINTS {
        return Err(IndexManifestError::TooManyHints {
            kind: "date",
            max: MAX_MANIFEST_HINTS,
        });
    }

    let main_selector = raw
        .content
        .main_selector
        .as_deref()
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(|value| validate_selector("content.main_selector", value))
        .transpose()?;

    let regions = raw
        .regions
        .into_iter()
        .map(|raw| {
            let role = validate_hint_text("region.role", &raw.role)?;
            let selector = validate_selector("region.selector", &raw.selector)?;
            Ok(IndexRegionHint {
                role,
                selector,
                collapsed: raw.collapsed,
            })
        })
        .collect::<Result<Vec<_>, IndexManifestError>>()?;

    let fields = raw
        .fields
        .into_iter()
        .map(|raw| {
            let name = validate_hint_text("field.name", &raw.name)?;
            let label = raw
                .label
                .as_deref()
                .map(str::trim)
                .filter(|value| !value.is_empty())
                .map(|value| validate_hint_text("field.label", value))
                .transpose()?;
            Ok(IndexFieldHint { name, label })
        })
        .collect::<Result<Vec<_>, IndexManifestError>>()?;

    let forms = raw
        .forms
        .into_iter()
        .map(|raw| {
            let name = validate_hint_text("form.name", &raw.name)?;
            let selector = raw
                .selector
                .as_deref()
                .map(str::trim)
                .filter(|value| !value.is_empty())
                .map(|value| validate_selector("form.selector", value))
                .transpose()?;
            let note = raw
                .note
                .as_deref()
                .map(str::trim)
                .filter(|value| !value.is_empty())
                .map(|value| validate_hint_text("form.note", value))
                .transpose()?;
            Ok(IndexFormHint {
                name,
                selector,
                note,
            })
        })
        .collect::<Result<Vec<_>, IndexManifestError>>()?;

    let dates = raw
        .dates
        .into_iter()
        .map(|raw| {
            let field = validate_hint_text("date.field", &raw.field)?;
            let style = IndexDateStyle::parse(&raw.style).ok_or_else(|| {
                IndexManifestError::InvalidHint {
                    kind: "date.style",
                    reason: format!("unsupported style: {}", raw.style),
                }
            })?;
            Ok(IndexDateHint { field, style })
        })
        .collect::<Result<Vec<_>, IndexManifestError>>()?;

    Ok(IndexManifest {
        version: INDEX_MANIFEST_VERSION.to_owned(),
        source_url: source_url.to_owned(),
        scope,
        content: IndexContentHint { main_selector },
        regions,
        fields,
        forms,
        dates,
    })
}

/// Reports whether two URLs share scheme, host, and effective port.
///
/// The port comparison uses the scheme's known default when a URL does not
/// spell one out.
fn same_origin(left: &Url, right: &Url) -> bool {
    /// Collects the URL components that take part in the comparison.
    fn origin_parts(url: &Url) -> (&str, Option<&str>, Option<u16>) {
        (url.scheme(), url.host_str(), url.port_or_known_default())
    }

    origin_parts(left) == origin_parts(right)
}

/// Validates a manifest scope and returns it trimmed.
///
/// A missing scope defaults to `/`. The trimmed value must start with `/`
/// (which also rules out an empty value) and must not exceed
/// `MAX_MANIFEST_STRING_LEN` bytes.
///
/// # Errors
///
/// Returns [`IndexManifestError::InvalidScope`] carrying the trimmed value
/// when either rule is violated.
fn normalize_scope(scope: Option<&str>, page_path: &str) -> Result<String, IndexManifestError> {
    // The page path never influenced the result (both former branches
    // returned the same value); the parameter stays for signature
    // compatibility with existing callers.
    let _ = page_path;

    let normalized = scope.unwrap_or("/").trim();
    if normalized.starts_with('/') && normalized.len() <= MAX_MANIFEST_STRING_LEN {
        Ok(normalized.to_owned())
    } else {
        Err(IndexManifestError::InvalidScope(normalized.to_owned()))
    }
}

/// Trims a hint string and checks its length.
///
/// The trimmed value must be non-empty and at most
/// `MAX_MANIFEST_STRING_LEN` bytes long.
///
/// # Errors
///
/// Returns [`IndexManifestError::InvalidHint`] tagged with `kind` otherwise.
fn validate_hint_text(kind: &'static str, text: &str) -> Result<String, IndexManifestError> {
    let trimmed = text.trim();
    match trimmed.len() {
        1..=MAX_MANIFEST_STRING_LEN => Ok(trimmed.to_owned()),
        _ => Err(IndexManifestError::InvalidHint {
            kind,
            reason: "text length is out of bounds".to_owned(),
        }),
    }
}

/// Validates a manifest-supplied CSS selector and returns it trimmed.
///
/// The selector must pass [`validate_hint_text`], contain no more than
/// `MAX_MANIFEST_SELECTOR_COMPLEXITY` combinator/attribute/pseudo characters,
/// and parse successfully as a selector.
///
/// # Errors
///
/// Returns [`IndexManifestError::InvalidHint`] tagged with `kind` on the
/// first failed check.
fn validate_selector(kind: &'static str, selector: &str) -> Result<String, IndexManifestError> {
    // Characters that each count once towards the complexity budget.
    const COMPLEXITY_CHARS: [char; 8] = ['>', '+', '~', '[', ']', ':', '*', '#'];

    let value = validate_hint_text(kind, selector)?;

    let complexity = value
        .chars()
        .filter(|ch| COMPLEXITY_CHARS.contains(ch))
        .count();
    if complexity > MAX_MANIFEST_SELECTOR_COMPLEXITY {
        return Err(IndexManifestError::InvalidHint {
            kind,
            reason: "selector complexity exceeds limit".to_owned(),
        });
    }

    if Selector::parse(&value).is_err() {
        return Err(IndexManifestError::InvalidHint {
            kind,
            reason: "selector syntax is invalid".to_owned(),
        });
    }
    Ok(value)
}

/// Parsed HTML document, as produced by [`parse_html`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlDocument {
    /// Original input, kept verbatim.
    pub raw: String,
    /// Best extracted title: the `<title>` text, else the OpenGraph title,
    /// else the first extracted heading.
    pub title: Option<String>,
    /// Extracted headings in document order. Only top-level heading nodes of
    /// `nodes` are listed; headings nested inside sections are not.
    pub headings: Vec<HtmlHeading>,
    /// Extracted links from the main content region.
    pub links: Vec<HtmlLink>,
    /// Forms extracted from the main content region (top-level form nodes of
    /// `nodes`).
    pub forms: Vec<HtmlForm>,
    /// Semantic nodes extracted from the main content region, followed by
    /// secondary sections when the main region was matched explicitly.
    pub nodes: Vec<HtmlNode>,
    /// Extracted document metadata.
    pub metadata: HtmlMetadata,
    /// Text content from extracted user-visible nodes.
    pub body_text: String,
}

/// Metadata extracted from the HTML head.
///
/// Every field is `None` when the corresponding element or attribute is
/// missing; the `Default` value has all fields unset.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct HtmlMetadata {
    /// Canonical URL when known (from `link[rel~="canonical"]`).
    pub canonical_url: Option<String>,
    /// Declared document language when known (from `html[lang]`).
    pub language: Option<String>,
    /// Standard description metadata (from `meta[name="description"]`).
    pub description: Option<String>,
    /// OpenGraph title metadata (from `meta[property="og:title"]`).
    pub open_graph_title: Option<String>,
    /// OpenGraph description metadata (from
    /// `meta[property="og:description"]`).
    pub open_graph_description: Option<String>,
}

/// Heading extracted from HTML (`h1` through `h6`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlHeading {
    /// One-based heading level.
    pub level: u8,
    /// Heading text; never empty for headings produced by [`parse_html`].
    pub text: String,
}

/// Link extracted from HTML.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlLink {
    /// Link text.
    pub text: String,
    /// Link target.
    // NOTE(review): presumably resolved against the document base URL by the
    // link extractor — confirm in `extract_links`.
    pub href: String,
}

/// Semantic HTML region role, attached to [`HtmlNode::Section`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HtmlSectionRole {
    /// Primary content region.
    Main,
    /// Navigation region (`<nav>` or `role="navigation"`).
    Navigation,
    /// Sidebar or complementary content.
    Aside,
    /// Footer or content information (`<footer>` or `role="contentinfo"`).
    Footer,
    /// Comments or discussion region.
    Comments,
    /// Related links or related content.
    Related,
    /// Unknown secondary region.
    Unknown,
}

/// Form extracted from HTML.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlForm {
    /// Form name or inferred label.
    pub name: String,
    /// Submission method.
    pub method: String,
    /// Action target.
    pub action: String,
    /// Input fields.
    pub inputs: Vec<HtmlInput>,
    /// Button actions.
    pub buttons: Vec<HtmlButton>,
}

/// Input extracted from a form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlInput {
    /// Input name.
    pub name: String,
    /// Input kind.
    pub kind: String,
    /// Input value, when one is present.
    pub value: Option<String>,
    /// Whether the field is required.
    pub required: bool,
}

/// Button extracted from a form.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlButton {
    /// Optional button name.
    pub name: Option<String>,
    /// Optional button value.
    pub value: Option<String>,
    /// Button label shown to the user.
    pub label: String,
}

/// Semantic HTML content node.
///
/// Nodes are produced in source order; `Section` nests further nodes.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum HtmlNode {
    /// Heading with one-based level.
    Heading {
        /// One-based heading level.
        level: u8,
        /// Heading text.
        text: String,
    },
    /// Paragraph text. Block quotes are also emitted as paragraphs, prefixed
    /// with `> `.
    Paragraph(String),
    /// Link node used inside semantic sections.
    Link(HtmlLink),
    /// Ordered or unordered list.
    List {
        /// Whether the list is ordered (`<ol>`).
        ordered: bool,
        /// List item text in source order.
        items: Vec<String>,
    },
    /// Code block (from `<pre>` or `<code>`).
    CodeBlock {
        /// Optional declared language.
        language: Option<String>,
        /// Code text.
        code: String,
    },
    /// Table rows.
    Table {
        /// Rows in source order, each a list of cell texts.
        rows: Vec<Vec<String>>,
    },
    /// Bounded vertical spacing hint.
    Spacer {
        /// Extra terminal lines suggested by semantic block rhythm.
        lines: u8,
    },
    /// Semantic page region.
    Section {
        /// Inferred region role.
        role: HtmlSectionRole,
        /// Optional region title.
        title: Option<String>,
        /// Whether renderers should initially summarize this region.
        collapsed: bool,
        /// Region contents.
        nodes: Vec<HtmlNode>,
    },
    /// Image proxy metadata.
    Image {
        /// Alternate text or fallback label.
        alt: String,
        /// Optional normalized source URL; `None` when the element has no
        /// `src` attribute.
        src: Option<String>,
    },
    /// Web form.
    Form(HtmlForm),
}

/// Parses HTML into a semantic document representation.
///
/// The main content root is located first. Its nodes and links are then
/// extracted, and when that root was matched by an explicit content selector
/// the secondary page regions are appended as sections. Headings and forms
/// are collected from the top-level nodes only. The input is kept verbatim in
/// [`HtmlDocument::raw`].
#[must_use]
pub fn parse_html(input: impl Into<String>) -> HtmlDocument {
    let raw = input.into();
    let html = Html::parse_document(&raw);
    let base_url = extract_base_url(&html);
    let base = base_url.as_ref();
    let metadata = extract_metadata(&html, base);
    let layout_rules = extract_layout_rules(&html);

    let mut nodes = Vec::new();
    let mut links = Vec::new();
    if let Some(root) = main_content_root(&html) {
        nodes = extract_nodes(&root.element, base, &layout_rules);
        if root.explicit {
            nodes.extend(extract_secondary_sections(
                &html,
                &root.element,
                base,
                &layout_rules,
            ));
        }
        links = extract_links(&root.element, base);
    }

    // Single pass over the top-level nodes to gather headings and forms.
    let mut headings = Vec::new();
    let mut forms = Vec::new();
    for node in &nodes {
        match node {
            HtmlNode::Heading { level, text } => headings.push(HtmlHeading {
                level: *level,
                text: text.clone(),
            }),
            HtmlNode::Form(form) => forms.push(form.clone()),
            _ => {}
        }
    }

    let title = extract_title(&html, &metadata, &headings);
    let body_text = body_text_from_nodes(&nodes);

    HtmlDocument {
        raw,
        title,
        headings,
        links,
        forms,
        nodes,
        metadata,
        body_text,
    }
}

/// Compiles a CSS selector, returning `None` when `query` does not parse.
fn selector(query: &str) -> Option<Selector> {
    match Selector::parse(query) {
        Ok(compiled) => Some(compiled),
        Err(_) => None,
    }
}

/// Element chosen as the root of the main content, plus how it was found.
struct MainContentRoot<'a> {
    /// The chosen root element.
    element: ElementRef<'a>,
    /// `true` when the element matched one of the explicit main-content
    /// selectors; `false` for heuristic or fallback choices.
    explicit: bool,
}

fn main_content_root(html: &Html) -> Option<MainContentRoot<'_>> {
    let mut candidates = Vec::new();
    for query in [
        "main",
        "article",
        "[role=\"main\"]",
        "[itemprop=\"articleBody\"]",
        "#content",
        ".content",
        ".article",
        ".post",
        ".entry-content",
        ".markdown-body",
    ] {
        if let Some(selector) = selector(query) {
            for element in html.select(&selector) {
                candidates.push(MainContentRoot {
                    element,
                    explicit: true,
                });
            }
        }
    }

    if let Some(best) = best_main_root(candidates) {
        return Some(best);
    }

    let body = selector("body").and_then(|selector| html.select(&selector).next());
    if let Some(body) = body {
        if let Some(dense_region) = densest_body_region(&body) {
            return Some(MainContentRoot {
                element: dense_region,
                explicit: false,
            });
        }
        return Some(MainContentRoot {
            element: body,
            explicit: false,
        });
    }

    html.root_element()
        .first_child()
        .and_then(ElementRef::wrap)
        .map(|element| MainContentRoot {
            element,
            explicit: false,
        })
}

/// Returns the candidate with the highest content score.
///
/// When several candidates share the top score, the one that appears last in
/// `roots` is returned. An empty list yields `None`.
fn best_main_root<'a>(roots: Vec<MainContentRoot<'a>>) -> Option<MainContentRoot<'a>> {
    let mut best: Option<(i32, MainContentRoot<'a>)> = None;
    for candidate in roots {
        let score = main_root_score(&candidate.element);
        // `>=` lets a later candidate replace an earlier one on a tie.
        let replaces_best = best.as_ref().map_or(true, |(top, _)| score >= *top);
        if replaces_best {
            best = Some((score, candidate));
        }
    }
    best.map(|(_, winner)| winner)
}

/// Finds the highest-scoring content container inside `body`.
///
/// Considers `main`, `article`, `section`, and `div` descendants that are not
/// boilerplate containers and whose content score is at least 6. Returns
/// `None` when no candidate reaches that score.
fn densest_body_region<'a>(body: &ElementRef<'a>) -> Option<ElementRef<'a>> {
    const MIN_REGION_SCORE: i32 = 6;

    let container_selector = selector("main, article, section, div")?;
    let (winner, _score) = body
        .select(&container_selector)
        .filter_map(|candidate| {
            if is_boilerplate_container(&candidate) {
                return None;
            }
            let score = main_root_score(&candidate);
            (score >= MIN_REGION_SCORE).then_some((candidate, score))
        })
        .max_by_key(|&(_, score)| score)?;
    Some(winner)
}

/// Scores how likely `root` is to contain the page's main content.
///
/// Headings, paragraphs, lists, code blocks, tables, and dense text raise the
/// score; boilerplate containers and an excess of links over paragraphs lower
/// it. Higher is better and the result may be negative.
///
/// Text lengths are accumulated with saturating arithmetic. Nested `<pre>` /
/// `<code>` elements each count their full text, so hostile, deeply nested
/// input could otherwise overflow the `i32` accumulator.
fn main_root_score(root: &ElementRef<'_>) -> i32 {
    /// Character count clamped into `i32` range.
    fn char_count(text: &str) -> i32 {
        i32::try_from(text.chars().count()).unwrap_or(i32::MAX)
    }

    let mut heading_count = 0i32;
    let mut paragraph_count = 0i32;
    let mut list_count = 0i32;
    let mut code_count = 0i32;
    let mut table_count = 0i32;
    let mut link_count = 0i32;
    let mut text_chars = 0i32;
    let mut boilerplate_penalty = 0i32;

    for node in root.descendants() {
        let Some(element) = ElementRef::wrap(node) else {
            continue;
        };
        let tag = element.value().name();
        if is_boilerplate_container(&element) {
            boilerplate_penalty += 3;
        }
        match tag {
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => heading_count += 1,
            "p" => {
                paragraph_count += 1;
                text_chars = text_chars.saturating_add(char_count(&element_text(&element)));
            }
            "ul" | "ol" => list_count += 1,
            "pre" | "code" => {
                code_count += 1;
                text_chars = text_chars.saturating_add(char_count(&code_text(&element)));
            }
            "table" => table_count += 1,
            "a" => link_count += 1,
            _ => {}
        }
    }

    // One point per 120 characters of paragraph/code text.
    let dense_text_score = text_chars / 120;
    // Links only count against the region beyond five per paragraph, and the
    // penalty is capped at 14.
    let link_penalty = (link_count - paragraph_count * 5).clamp(0, 14);

    heading_count * 4
        + paragraph_count * 5
        + list_count * 2
        + code_count * 5
        + table_count * 3
        + dense_text_score
        - link_penalty
        - boilerplate_penalty
}

/// Chooses the best available document title.
///
/// Preference order: the `<title>` element text, then the OpenGraph title,
/// then the text of the first extracted heading.
fn extract_title(
    html: &Html,
    metadata: &HtmlMetadata,
    headings: &[HtmlHeading],
) -> Option<String> {
    if let Some(title) = first_text(html, "title") {
        return Some(title);
    }
    if let Some(open_graph_title) = &metadata.open_graph_title {
        return Some(open_graph_title.clone());
    }
    headings.first().map(|heading| heading.text.clone())
}

/// Reads the document base URL from the first `<base href>` element.
///
/// Returns `None` when there is no such element or its `href` does not parse
/// as a URL on its own.
fn extract_base_url(html: &Html) -> Option<Url> {
    let base_selector = selector("base[href]")?;
    let first_base = html.select(&base_selector).next()?;
    let href = first_base.value().attr("href")?;
    Url::parse(href).ok()
}

/// Collects head metadata: canonical URL, language, description, and the
/// OpenGraph title and description.
fn extract_metadata(html: &Html, base_url: Option<&Url>) -> HtmlMetadata {
    let meta = |query: &str| extract_meta_content(html, query);

    let canonical_url = extract_link_href(html, "link[rel~=\"canonical\"]", base_url);
    let language = extract_language(html);
    let description = meta("meta[name=\"description\"]");
    let open_graph_title = meta("meta[property=\"og:title\"]");
    let open_graph_description = meta("meta[property=\"og:description\"]");

    HtmlMetadata {
        canonical_url,
        language,
        description,
        open_graph_title,
        open_graph_description,
    }
}

/// Returns the trimmed `lang` attribute of the first `<html lang>` element,
/// or `None` when it is missing or blank.
fn extract_language(html: &Html) -> Option<String> {
    let lang_selector = selector("html[lang]")?;
    let root = html.select(&lang_selector).next()?;
    let declared = root.value().attr("lang")?.trim();
    (!declared.is_empty()).then(|| declared.to_owned())
}

/// Resolves the `href` of the first element matching `query`.
///
/// Returns `None` when the selector is invalid, nothing matches, or the first
/// match has no `href` attribute.
fn extract_link_href(html: &Html, query: &str, base_url: Option<&Url>) -> Option<String> {
    let link_selector = selector(query)?;
    let first_match = html.select(&link_selector).next()?;
    let href = first_match.value().attr("href")?;
    Some(resolve_url(href, base_url))
}

fn extract_meta_content(html: &Html, query: &str) -> Option<String> {
    let selector = selector(query)?;
    html.select(&selector)
        .next()
        .and_then(|node| node.value().attr("content"))
        .map(clean_text)
        .filter(|text| !text.is_empty())
}

/// Returns the text of the first element matching `query`, or `None` when
/// nothing matches or the text is empty.
fn first_text(html: &Html, query: &str) -> Option<String> {
    let text_selector = selector(query)?;
    let first_match = html.select(&text_selector).next()?;
    let text = element_text(&first_match);
    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}

/// Extracts the semantic nodes beneath `root`, then trims layout spacers
/// from the resulting list.
fn extract_nodes(
    root: &ElementRef<'_>,
    base_url: Option<&Url>,
    layout_rules: &LayoutRules,
) -> Vec<HtmlNode> {
    let mut collected = Vec::new();
    extract_element_nodes(root, base_url, layout_rules, &mut collected);
    trim_layout_spacers(&mut collected);
    collected
}

/// Recursively appends the semantic nodes found at or below `element`.
///
/// * `<br>` contributes a one-line spacer.
/// * Boilerplate containers that are not primary content containers are
///   skipped together with their subtree.
/// * Content elements become a single node, wrapped in their layout spacing,
///   and are not descended into.
/// * Any other element is descended into; layout-boundary elements also
///   contribute their spacing before and after their children.
fn extract_element_nodes(
    element: &ElementRef<'_>,
    base_url: Option<&Url>,
    layout_rules: &LayoutRules,
    nodes: &mut Vec<HtmlNode>,
) {
    let tag = element.value().name();
    if tag == "br" {
        push_spacer(nodes, 1);
        return;
    }

    let skipped_boilerplate =
        is_boilerplate_container(element) && !is_primary_content_container(element);
    if skipped_boilerplate {
        return;
    }

    if is_content_element(tag) {
        let spacing = layout_spacing(element, layout_rules);
        push_spacer(nodes, spacing.before);
        nodes.extend(html_node_from_element(element, base_url));
        push_spacer(nodes, spacing.after);
        return;
    }

    // Structural element: recurse, bracketing the children with spacing when
    // the tag marks a layout boundary.
    let boundary_spacing = if is_layout_boundary(tag) {
        Some(layout_spacing(element, layout_rules))
    } else {
        None
    };
    if let Some(spacing) = boundary_spacing {
        push_spacer(nodes, spacing.before);
    }

    for child in element.children().filter_map(ElementRef::wrap) {
        extract_element_nodes(&child, base_url, layout_rules, nodes);
    }

    if let Some(spacing) = boundary_spacing {
        push_spacer(nodes, spacing.after);
    }
}

/// Converts a single content element into its semantic node.
///
/// Returns `None` for unsupported tags and for elements that carry no usable
/// content: empty headings, paragraphs, block quotes, lists, code blocks, and
/// tables, as well as anchor-only and boilerplate paragraphs.
fn html_node_from_element(element: &ElementRef<'_>, base_url: Option<&Url>) -> Option<HtmlNode> {
    let tag = element.value().name();
    match tag {
        "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
            let text = element_text(element);
            (!text.is_empty()).then(|| HtmlNode::Heading {
                level: heading_level(tag),
                text,
            })
        }
        "p" => {
            let text = element_text(element);
            if text.is_empty()
                || is_anchor_only_paragraph(element, &text)
                || is_boilerplate_paragraph(&text)
            {
                None
            } else {
                Some(HtmlNode::Paragraph(text))
            }
        }
        "blockquote" => {
            let text = element_text(element);
            if text.is_empty() {
                None
            } else {
                Some(HtmlNode::Paragraph(format!("> {text}")))
            }
        }
        "ul" | "ol" => {
            let items = list_items(element);
            if items.is_empty() {
                None
            } else {
                Some(HtmlNode::List {
                    ordered: tag == "ol",
                    items,
                })
            }
        }
        "pre" | "code" => {
            let code = code_text(element);
            let has_code = !code.trim().is_empty();
            let language = code_language(element);
            if has_code {
                Some(HtmlNode::CodeBlock { language, code })
            } else {
                None
            }
        }
        "table" => {
            let rows = table_rows(element);
            if rows.is_empty() {
                None
            } else {
                Some(HtmlNode::Table { rows })
            }
        }
        "img" => {
            let alt = image_alt(element);
            let src = element
                .value()
                .attr("src")
                .map(|src| resolve_url(src, base_url));
            Some(HtmlNode::Image { alt, src })
        }
        "form" => Some(HtmlNode::Form(extract_form(element, base_url))),
        _ => None,
    }
}

/// Collects secondary page regions (navigation, asides, footers, related
/// content, comments) that lie outside the main content root.
///
/// A region is ignored when it is the main root itself, sits inside it, or
/// contains it. Regions that yield no nodes are dropped.
fn extract_secondary_sections(
    html: &Html,
    main_root: &ElementRef<'_>,
    base_url: Option<&Url>,
    layout_rules: &LayoutRules,
) -> Vec<HtmlNode> {
    let Some(region_selector) = selector(
        "nav, aside, footer, [role=\"navigation\"], [role=\"contentinfo\"], .sidebar, .related, #related, .comments, #comments",
    ) else {
        return Vec::new();
    };

    let mut sections = Vec::new();
    for candidate in html.select(&region_selector) {
        let overlaps_main = is_descendant_of(&candidate, main_root)
            || is_descendant_of(main_root, &candidate)
            || candidate.id() == main_root.id();
        if overlaps_main {
            continue;
        }
        if let Some(section) = secondary_section_from_element(&candidate, base_url, layout_rules)
        {
            sections.push(section);
        }
    }
    sections
}

/// Builds a collapsed `HtmlNode::Section` from one secondary element.
///
/// The element's regular nodes are used when there are any; otherwise its
/// links are used as a fallback. Leading and trailing spacers are trimmed, and
/// `None` is returned when nothing is left.
fn secondary_section_from_element(
    element: &ElementRef<'_>,
    base_url: Option<&Url>,
    layout_rules: &LayoutRules,
) -> Option<HtmlNode> {
    let extracted = extract_nodes(element, base_url, layout_rules);
    let mut nodes: Vec<HtmlNode> = if extracted.is_empty() {
        // Link-only regions (typical for navigation) still deserve a section.
        extract_links(element, base_url)
            .into_iter()
            .map(HtmlNode::Link)
            .collect()
    } else {
        extracted
    };
    trim_layout_spacers(&mut nodes);

    if nodes.is_empty() {
        return None;
    }
    Some(HtmlNode::Section {
        role: section_role(element),
        title: section_title(element),
        collapsed: true,
        nodes,
    })
}

fn is_descendant_of(element: &ElementRef<'_>, ancestor: &ElementRef<'_>) -> bool {
    let mut parent = element.parent();
    while let Some(node) = parent {
        if node.id() == ancestor.id() {
            return true;
        }
        parent = node.parent();
    }
    false
}

/// Classifies a secondary element into an `HtmlSectionRole`.
///
/// Precedence, highest first:
/// 1. an id/class token containing `comment` or `related`,
/// 2. the tag name (`nav`, `aside`, `footer`),
/// 3. the `role` attribute (`navigation`, `contentinfo`),
/// 4. an id/class token containing `side` or `rail`,
/// 5. otherwise `Unknown`.
fn section_role(element: &ElementRef<'_>) -> HtmlSectionRole {
    let node = element.value();
    let names: Vec<String> = node
        .attr("id")
        .into_iter()
        .chain(node.attr("class"))
        .flat_map(str::split_whitespace)
        .map(str::to_ascii_lowercase)
        .collect();
    let any_name_contains = |needle: &str| names.iter().any(|name| name.contains(needle));

    if any_name_contains("comment") {
        return HtmlSectionRole::Comments;
    }
    if any_name_contains("related") {
        return HtmlSectionRole::Related;
    }

    match (node.name(), node.attr("role")) {
        ("nav", _) => HtmlSectionRole::Navigation,
        ("aside", _) => HtmlSectionRole::Aside,
        ("footer", _) => HtmlSectionRole::Footer,
        (_, Some("navigation")) => HtmlSectionRole::Navigation,
        (_, Some("contentinfo")) => HtmlSectionRole::Footer,
        _ if any_name_contains("side") || any_name_contains("rail") => HtmlSectionRole::Aside,
        _ => HtmlSectionRole::Unknown,
    }
}

fn section_title(element: &ElementRef<'_>) -> Option<String> {
    element
        .value()
        .attr("aria-label")
        .or_else(|| element.value().attr("title"))
        .map(clean_text)
        .filter(|title| !title.is_empty())
        .or_else(|| {
            let selector = selector("h1, h2, h3, h4, h5, h6")?;
            element
                .select(&selector)
                .next()
                .map(|heading| element_text(&heading))
                .filter(|title| !title.is_empty())
        })
}

/// Vertical spacing requested around an element, measured in spacer lines.
///
/// The default value is zero lines on both sides.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
struct LayoutSpacing {
    /// Spacer lines before the element (fed by `margin-top`/`padding-top`).
    before: u8,
    /// Spacer lines after the element (fed by `margin-bottom`/`padding-bottom`/gaps).
    after: u8,
}

/// Spacing rules harvested from a document's `<style>` elements.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
struct LayoutRules {
    /// Rules in the order they were parsed; every matching rule contributes.
    rules: Vec<LayoutRule>,
}

/// A single simple selector paired with the spacing its declarations imply.
#[derive(Debug, Clone, PartialEq, Eq)]
struct LayoutRule {
    /// Which elements the rule applies to.
    selector: LayoutSelector,
    /// Spacing derived from the rule's declarations.
    spacing: LayoutSpacing,
}

/// The simple selector forms understood by the layout heuristics.
#[derive(Debug, Clone, PartialEq, Eq)]
enum LayoutSelector {
    /// Matches by element tag name.
    Tag(String),
    /// Matches one whitespace-separated token of the `class` attribute.
    Class(String),
    /// Matches the exact `id` attribute value.
    Id(String),
}

/// Returns `true` for tags that are turned into content nodes directly
/// (headings, paragraphs, quotes, lists, code, tables, images and forms).
///
/// The comparison is exact, so callers must pass a lowercase tag name.
fn is_content_element(tag: &str) -> bool {
    const CONTENT_TAGS: &[&str] = &[
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "p",
        "blockquote",
        "ul",
        "ol",
        "pre",
        "code",
        "table",
        "img",
        "form",
    ];
    CONTENT_TAGS.contains(&tag)
}

/// Returns `true` for container tags that act as layout boundaries.
///
/// The comparison is exact, so callers must pass a lowercase tag name.
fn is_layout_boundary(tag: &str) -> bool {
    const BOUNDARY_TAGS: &[&str] = &["article", "section", "header", "footer", "aside", "div"];
    BOUNDARY_TAGS.contains(&tag)
}

fn layout_spacing(element: &ElementRef<'_>, rules: &LayoutRules) -> LayoutSpacing {
    let mut spacing = default_layout_spacing(element.value().name());

    if has_layout_hint_name(element) {
        spacing.before = spacing.before.max(1);
        spacing.after = spacing.after.max(1);
    }

    for rule in rules.rules_for(element) {
        spacing.before = spacing.before.max(rule.spacing.before);
        spacing.after = spacing.after.max(rule.spacing.after);
    }

    if let Some(style) = element.value().attr("style") {
        let inline_spacing = spacing_from_declarations(style);
        spacing.before = spacing.before.max(inline_spacing.before);
        spacing.after = spacing.after.max(inline_spacing.after);
    }

    LayoutSpacing {
        before: spacing.before.min(MAX_LAYOUT_SPACER_LINES),
        after: spacing.after.min(MAX_LAYOUT_SPACER_LINES),
    }
}

/// Baseline spacing by tag: sectioning elements get one line on each side,
/// every other tag gets none.
fn default_layout_spacing(tag: &str) -> LayoutSpacing {
    let is_sectioning = matches!(tag, "article" | "section" | "header" | "footer" | "aside");
    let lines = u8::from(is_sectioning);
    LayoutSpacing {
        before: lines,
        after: lines,
    }
}

/// Returns `true` when any id/class token of `element` contains (ASCII
/// case-insensitively) a word that suggests a visually separated block.
fn has_layout_hint_name(element: &ElementRef<'_>) -> bool {
    const HINTS: [&str; 8] = [
        "section", "hero", "intro", "outro", "spacer", "block", "panel", "card",
    ];

    let node = element.value();
    for attribute in node.attr("id").into_iter().chain(node.attr("class")) {
        for token in attribute.split_whitespace() {
            let token = token.to_ascii_lowercase();
            if HINTS.iter().any(|hint| token.contains(hint)) {
                return true;
            }
        }
    }
    false
}

/// Returns `true` when `element` looks like a primary content container:
/// a `main`/`article` tag, `role="main"`, or an id/class token containing
/// (ASCII case-insensitively) a content-like word.
fn is_primary_content_container(element: &ElementRef<'_>) -> bool {
    const CONTENT_WORDS: [&str; 5] = ["content", "article", "post", "entry", "markdown"];

    let node = element.value();
    if matches!(node.name(), "main" | "article") || node.attr("role") == Some("main") {
        return true;
    }

    for attribute in node.attr("id").into_iter().chain(node.attr("class")) {
        for token in attribute.split_whitespace() {
            let token = token.to_ascii_lowercase();
            if CONTENT_WORDS.iter().any(|word| token.contains(word)) {
                return true;
            }
        }
    }
    false
}

/// Returns `true` when `element` looks like page chrome rather than content:
/// a `nav`/`footer` tag, or an id/class token containing (ASCII
/// case-insensitively) one of the known boilerplate words.
fn is_boilerplate_container(element: &ElementRef<'_>) -> bool {
    const BOILERPLATE_WORDS: [&str; 14] = [
        "nav",
        "menu",
        "footer",
        "header",
        "sidebar",
        "related",
        "breadcrumb",
        "cookie",
        "newsletter",
        "subscribe",
        "promo",
        "advert",
        "social",
        "share",
    ];

    let node = element.value();
    if matches!(node.name(), "nav" | "footer") {
        return true;
    }

    for attribute in node.attr("id").into_iter().chain(node.attr("class")) {
        for token in attribute.split_whitespace() {
            let token = token.to_ascii_lowercase();
            if BOILERPLATE_WORDS.iter().any(|word| token.contains(word)) {
                return true;
            }
        }
    }
    false
}

/// Returns `true` for short paragraphs (at most 180 characters) that contain
/// a typical boilerplate phrase, compared ASCII case-insensitively.
fn is_boilerplate_paragraph(text: &str) -> bool {
    const MAX_BOILERPLATE_CHARS: usize = 180;
    const PHRASES: [&str; 7] = [
        "sign up for",
        "subscribe",
        "cookie policy",
        "all rights reserved",
        "follow us",
        "share this",
        "advertisement",
    ];

    // Long paragraphs are treated as real content even if a phrase appears.
    if text.chars().count() > MAX_BOILERPLATE_CHARS {
        return false;
    }

    let lowered = text.to_ascii_lowercase();
    PHRASES.iter().any(|phrase| lowered.contains(phrase))
}

impl LayoutRules {
    /// Lazily yields every stored rule whose selector matches `element`, in
    /// stored order.
    fn rules_for<'a>(
        &'a self,
        element: &'a ElementRef<'_>,
    ) -> impl Iterator<Item = &'a LayoutRule> {
        self.rules
            .iter()
            .filter_map(move |rule| rule.selector.matches(element).then_some(rule))
    }
}

impl LayoutSelector {
    /// Tests whether this simple selector applies to `element`.
    ///
    /// Tag and id comparisons are exact; a class selector matches when any
    /// whitespace-separated token of the `class` attribute equals it.
    fn matches(&self, element: &ElementRef<'_>) -> bool {
        let node = element.value();
        match self {
            Self::Tag(tag) => node.name() == tag.as_str(),
            Self::Class(class) => node.attr("class").is_some_and(|classes| {
                classes
                    .split_whitespace()
                    .any(|candidate| candidate == class.as_str())
            }),
            Self::Id(id) => node.attr("id") == Some(id.as_str()),
        }
    }
}

fn extract_layout_rules(html: &Html) -> LayoutRules {
    let Some(selector) = selector("style") else {
        return LayoutRules::default();
    };

    let mut rules = Vec::new();
    for style in html.select(&selector) {
        rules.extend(parse_stylesheet_rules(
            &style.text().collect::<Vec<_>>().join(" "),
        ));
    }

    LayoutRules { rules }
}

/// Extracts spacing rules from raw stylesheet text.
///
/// The text is split on `}`; each chunk is split once on `{` into a selector
/// list and its declarations. Chunks without `{`, and chunks whose
/// declarations imply no spacing, are skipped. Each comma-separated selector
/// that `parse_layout_selector` accepts yields one rule with the shared spacing.
fn parse_stylesheet_rules(stylesheet: &str) -> Vec<LayoutRule> {
    let mut rules = Vec::new();
    for chunk in stylesheet.split('}') {
        let Some((selectors, declarations)) = chunk.split_once('{') else {
            continue;
        };

        let spacing = spacing_from_declarations(declarations);
        if spacing.before == 0 && spacing.after == 0 {
            continue;
        }

        for selector in selectors.split(',').filter_map(parse_layout_selector) {
            rules.push(LayoutRule { selector, spacing });
        }
    }
    rules
}

/// Parses one simple CSS selector (`tag`, `.class` or `#id`).
///
/// Returns `None` for empty input and for anything containing whitespace,
/// combinators, attribute brackets or pseudo-class colons. Tag names must be
/// ASCII alphanumerics or `-` and are lowercased; class and id names are kept
/// verbatim.
fn parse_layout_selector(selector: &str) -> Option<LayoutSelector> {
    const UNSUPPORTED: [char; 9] = [' ', '\t', '\n', '\r', '>', '+', '~', '[', ':'];

    let selector = selector.trim();
    if selector.is_empty() || selector.contains(&UNSUPPORTED[..]) {
        return None;
    }

    if let Some(class) = selector.strip_prefix('.') {
        return if class.is_empty() {
            None
        } else {
            Some(LayoutSelector::Class(class.to_owned()))
        };
    }
    if let Some(id) = selector.strip_prefix('#') {
        return if id.is_empty() {
            None
        } else {
            Some(LayoutSelector::Id(id.to_owned()))
        };
    }

    let is_tag_name = selector
        .chars()
        .all(|ch| ch.is_ascii_alphanumeric() || ch == '-');
    if is_tag_name {
        Some(LayoutSelector::Tag(selector.to_ascii_lowercase()))
    } else {
        None
    }
}

/// Derives spacing from a `;`-separated CSS declaration list.
///
/// Top margins/paddings feed `before`; bottom margins/paddings and gaps feed
/// `after`; the `margin`/`padding` shorthands feed both. Each side keeps the
/// largest value seen. Declarations without `:` or with a zero-line value are
/// ignored, as are unrelated properties.
fn spacing_from_declarations(declarations: &str) -> LayoutSpacing {
    declarations
        .split(';')
        .filter_map(|declaration| declaration.split_once(':'))
        .fold(LayoutSpacing::default(), |mut spacing, (property, value)| {
            let lines = css_length_to_lines(value.trim());
            if lines > 0 {
                match property.trim().to_ascii_lowercase().as_str() {
                    "margin-top" | "padding-top" => {
                        spacing.before = spacing.before.max(lines);
                    }
                    "margin-bottom" | "padding-bottom" | "gap" | "row-gap" => {
                        spacing.after = spacing.after.max(lines);
                    }
                    "margin" | "padding" => {
                        spacing.before = spacing.before.max(lines);
                        spacing.after = spacing.after.max(lines);
                    }
                    _ => {}
                }
            }
            spacing
        })
}

/// Converts a CSS length value into a bounded number of spacer lines.
///
/// Only the first whitespace-separated component that carries a number is
/// considered, and its unit is read from that same component. This keeps a
/// multi-value shorthand such as `10px 2em` from being scored as `10em`.
///
/// Returns `0` when no number can be parsed or the number is not positive.
/// Otherwise the number is bucketed per unit:
/// - `em`/`rem`: `>= 4` gives 3, `>= 2` gives 2, else 1
/// - `px`: `>= 48` gives 3, `>= 28` gives 2, `>= 12` gives 1, else 0
/// - `vh`/`vw`: `>= 14` gives 3, `>= 8` gives 2, else 1
/// - any other or missing unit: 1
///
/// The result never exceeds `MAX_LAYOUT_SPACER_LINES`.
fn css_length_to_lines(value: &str) -> u8 {
    let value = value.trim().to_ascii_lowercase();

    // Pick the component holding the first number so that number and unit
    // always come from the same place.
    let Some(component) = value
        .split_whitespace()
        .find(|part| part.contains(|ch: char| ch.is_ascii_digit() || ch == '.'))
    else {
        return 0;
    };
    let Some(number) = first_css_number(component) else {
        return 0;
    };
    // Covers explicit zeros such as `0`, `0px` and `0rem` as well.
    if number <= 0.0 {
        return 0;
    }

    // `rem` contains `em`, so a single check handles both units.
    let lines = if component.contains("em") {
        if number >= 4.0 {
            3
        } else if number >= 2.0 {
            2
        } else {
            1
        }
    } else if component.contains("px") {
        if number >= 48.0 {
            3
        } else if number >= 28.0 {
            2
        } else if number >= 12.0 {
            1
        } else {
            0
        }
    } else if component.contains("vh") || component.contains("vw") {
        if number >= 14.0 {
            3
        } else if number >= 8.0 {
            2
        } else {
            1
        }
    } else {
        1
    };

    lines.min(MAX_LAYOUT_SPACER_LINES)
}

/// Parses the first run of ASCII digits and dots in `value` as a number.
///
/// A `-` directly in front of the run makes the result negative, unless the
/// `-` itself follows an identifier character (ASCII alphanumeric, `-` or
/// `_`). So `-20px` is negative, while the `4` in `var(--space-4)` is not.
///
/// Returns `None` when there is no such run or the run is not a valid number
/// (for example `1.2.3` or a lone `.`).
fn first_css_number(value: &str) -> Option<f32> {
    fn is_numeric(ch: char) -> bool {
        ch.is_ascii_digit() || ch == '.'
    }

    let start = value.find(is_numeric)?;
    let rest = &value[start..];
    let len = rest.find(|ch: char| !is_numeric(ch)).unwrap_or(rest.len());
    let magnitude: f32 = rest[..len].parse().ok()?;

    // Treat `-` as a sign only when it is not the tail of an identifier.
    let negative = value[..start].strip_suffix('-').is_some_and(|before| {
        !before.ends_with(|ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_'))
    });

    Some(if negative { -magnitude } else { magnitude })
}

/// Appends a spacer of `lines` lines, capped at `MAX_LAYOUT_SPACER_LINES`.
///
/// A zero-line request is ignored. When the last node is already a spacer, it
/// is widened to the larger of the two values (still capped) instead of
/// pushing a second spacer.
fn push_spacer(nodes: &mut Vec<HtmlNode>, lines: u8) {
    if lines == 0 {
        return;
    }

    let capped = lines.min(MAX_LAYOUT_SPACER_LINES);
    match nodes.last_mut() {
        Some(HtmlNode::Spacer { lines: current }) => {
            *current = MAX_LAYOUT_SPACER_LINES.min(capped.max(*current));
        }
        _ => nodes.push(HtmlNode::Spacer { lines: capped }),
    }
}

/// Removes every leading and trailing `HtmlNode::Spacer` from `nodes`.
///
/// Spacers between other nodes are kept. A list made only of spacers becomes
/// empty. Each end is trimmed with a single operation, so the remaining
/// elements are shifted at most once.
fn trim_layout_spacers(nodes: &mut Vec<HtmlNode>) {
    let is_spacer = |node: &HtmlNode| matches!(node, HtmlNode::Spacer { .. });

    // Drop the trailing run first so the front drain has less to move.
    let kept_len = nodes
        .iter()
        .rposition(|node| !is_spacer(node))
        .map_or(0, |last_content| last_content + 1);
    nodes.truncate(kept_len);

    let leading = nodes
        .iter()
        .position(|node| !is_spacer(node))
        .unwrap_or(nodes.len());
    nodes.drain(..leading);
}

/// Builds an `HtmlForm` from a `<form>` element.
///
/// - `method`: the cleaned `method` attribute in upper case, or `GET` when it
///   is missing or blank.
/// - `action`: the `action` attribute resolved against `base_url`; without the
///   attribute the base URL itself, or an empty string when there is no base.
/// - `name`: the cleaned `name` attribute, or `id` when `name` is absent;
///   `form` when the chosen attribute is missing or blank.
fn extract_form(form: &ElementRef<'_>, base_url: Option<&Url>) -> HtmlForm {
    let node = form.value();

    let method = match node.attr("method").map(clean_text) {
        Some(method) if !method.is_empty() => method.to_ascii_uppercase(),
        _ => "GET".to_owned(),
    };

    let action = match (node.attr("action"), base_url) {
        (Some(action), _) => resolve_url(action, base_url),
        (None, Some(base)) => base.to_string(),
        (None, None) => String::new(),
    };

    let name = match node.attr("name").or(node.attr("id")).map(clean_text) {
        Some(name) if !name.is_empty() => name,
        _ => "form".to_owned(),
    };

    HtmlForm {
        name,
        method,
        action,
        inputs: extract_inputs(form),
        buttons: extract_buttons(form),
    }
}

fn extract_inputs(form: &ElementRef<'_>) -> Vec<HtmlInput> {
    let Some(selector) = selector("input[name], textarea[name], select[name]") else {
        return Vec::new();
    };

    form.select(&selector)
        .filter_map(|input| {
            let name = input.value().attr("name").map(clean_text)?;
            (!name.is_empty()).then_some(HtmlInput {
                name,
                kind: input_kind(&input),
                value: input_value(&input),
                required: input.value().attr("required").is_some(),
            })
        })
        .collect()
}

/// Collects the buttons of a form: `<button>` elements plus `<input>`
/// elements of type `submit` or `button`.
///
/// A button is kept when it has a name or a non-empty label.
fn extract_buttons(form: &ElementRef<'_>) -> Vec<HtmlButton> {
    let Some(controls) = selector("button, input[type=\"submit\"], input[type=\"button\"]") else {
        return Vec::new();
    };

    let mut buttons = Vec::new();
    for control in form.select(&controls) {
        let node = control.value();
        let button = HtmlButton {
            name: node.attr("name").map(clean_text),
            value: node.attr("value").map(clean_text),
            label: button_label(&control),
        };
        // NOTE(review): `button_label` falls back to "submit", so the label
        // looks never empty and this check may always pass; confirm before
        // relying on it to drop anything.
        if button.name.is_some() || !button.label.is_empty() {
            buttons.push(button);
        }
    }
    buttons
}

/// Names the kind of a form control.
///
/// `textarea` and `select` elements report their tag name. Any other element
/// reports its cleaned `type` attribute, or `text` when that is missing or
/// blank.
fn input_kind(input: &ElementRef<'_>) -> String {
    let node = input.value();
    let tag = node.name();
    if matches!(tag, "textarea" | "select") {
        return tag.to_owned();
    }

    match node.attr("type").map(clean_text) {
        Some(kind) if !kind.is_empty() => kind,
        _ => "text".to_owned(),
    }
}

/// Returns the current value of a form control, if it has one.
///
/// - `textarea`: its cleaned text content, or `None` when that is empty.
/// - `select`: the value of the first option marked `selected`. Only when no
///   option is marked does the first option act as the default. An empty value
///   on the chosen option gives `None`; it never falls through to a different
///   option, because that would report a value the user did not select.
/// - anything else: the cleaned `value` attribute, which may be an empty string.
fn input_value(input: &ElementRef<'_>) -> Option<String> {
    match input.value().name() {
        "textarea" => Some(element_text(input)).filter(|value| !value.is_empty()),
        "select" => {
            // Decide which option is chosen before reading its value.
            let chosen = selector("option[selected]")
                .and_then(|selector| input.select(&selector).next())
                .or_else(|| {
                    selector("option").and_then(|selector| input.select(&selector).next())
                });
            chosen.and_then(|option| option_value(&option))
        }
        _ => input.value().attr("value").map(clean_text),
    }
}

fn option_value(option: &ElementRef<'_>) -> Option<String> {
    option
        .value()
        .attr("value")
        .map(clean_text)
        .or_else(|| Some(element_text(option)))
        .filter(|value| !value.is_empty())
}

/// Picks the label of a button: its text content, then its cleaned `value`
/// attribute, then the literal `submit`. Empty candidates are skipped.
fn button_label(button: &ElementRef<'_>) -> String {
    let text = element_text(button);
    if text.is_empty() {
        match button.value().attr("value").map(clean_text) {
            Some(value) if !value.is_empty() => value,
            _ => "submit".to_owned(),
        }
    } else {
        text
    }
}

/// Returns `true` when the paragraph's text consists solely of its link
/// texts, i.e. the non-empty texts of all `a[href]` descendants joined by
/// single spaces equal `paragraph_text`.
fn is_anchor_only_paragraph(element: &ElementRef<'_>, paragraph_text: &str) -> bool {
    let Some(anchors) = selector("a[href]") else {
        return false;
    };

    let mut link_text = String::new();
    for anchor in element.select(&anchors) {
        let text = element_text(&anchor);
        if text.is_empty() {
            continue;
        }
        if !link_text.is_empty() {
            link_text.push(' ');
        }
        link_text.push_str(&text);
    }

    !link_text.is_empty() && link_text == paragraph_text
}

/// Maps a heading tag (`h1` to `h6`) to its numeric level.
///
/// Anything that is not a lowercase `h` followed by a number in `1..=6`
/// yields level 1.
fn heading_level(tag: &str) -> u8 {
    match tag.strip_prefix('h').map(str::parse::<u8>) {
        Some(Ok(level @ 1..=6)) => level,
        _ => 1,
    }
}

/// Collects every `a[href]` under `root` that has non-empty text, resolving
/// each `href` against `base_url`.
fn extract_links(root: &ElementRef<'_>, base_url: Option<&Url>) -> Vec<HtmlLink> {
    let Some(anchors) = selector("a[href]") else {
        return Vec::new();
    };

    let mut links = Vec::new();
    for anchor in root.select(&anchors) {
        let Some(href) = anchor.value().attr("href") else {
            continue;
        };
        let text = element_text(&anchor);
        if text.is_empty() {
            continue;
        }
        links.push(HtmlLink {
            text,
            href: resolve_url(href, base_url),
        });
    }
    links
}

/// Extracts the text of a table as rows of cells.
///
/// Every `tr` under `table` contributes the non-empty texts of its `th`/`td`
/// descendants; rows left without any cell text are dropped.
fn table_rows(table: &ElementRef<'_>) -> Vec<Vec<String>> {
    let (Some(row_selector), Some(cell_selector)) = (selector("tr"), selector("th, td")) else {
        return Vec::new();
    };

    let mut rows = Vec::new();
    for row in table.select(&row_selector) {
        // NOTE(review): empty cells are skipped rather than kept as blanks, so
        // later cells shift left relative to other rows; confirm that is wanted.
        let mut cells = Vec::new();
        for cell in row.select(&cell_selector) {
            let text = element_text(&cell);
            if !text.is_empty() {
                cells.push(text);
            }
        }
        if !cells.is_empty() {
            rows.push(cells);
        }
    }
    rows
}

/// Returns the non-empty texts of the `li` elements that are direct children
/// of `list`. Items of nested lists are not listed separately.
fn list_items(list: &ElementRef<'_>) -> Vec<String> {
    let Some(item_selector) = selector("li") else {
        return Vec::new();
    };

    let mut items = Vec::new();
    for item in list.select(&item_selector) {
        let is_direct_child = match item.parent().and_then(ElementRef::wrap) {
            Some(parent) => parent.id() == list.id(),
            None => false,
        };
        if !is_direct_child {
            continue;
        }

        let text = element_text(&item);
        if !text.is_empty() {
            items.push(text);
        }
    }
    items
}

/// Returns the alternative text of an image: the cleaned `alt` attribute, or
/// `title` when `alt` is absent. Falls back to the literal `image` when the
/// chosen attribute is missing or blank.
fn image_alt(image: &ElementRef<'_>) -> String {
    let node = image.value();
    match node.attr("alt").or(node.attr("title")).map(clean_text) {
        Some(text) if !text.is_empty() => text,
        _ => "image".to_owned(),
    }
}

/// Detects the language of a code block from `language-*` / `lang-*` classes.
///
/// The element's own classes are checked first, then the classes of its
/// `code` descendants in document order.
fn code_language(element: &ElementRef<'_>) -> Option<String> {
    /// Finds the first language-bearing token in a `class` attribute value.
    fn language_in(classes: Option<&str>) -> Option<String> {
        classes?.split_whitespace().find_map(language_from_class)
    }

    let own = language_in(element.value().attr("class"));
    if own.is_some() {
        return own;
    }

    let code_selector = selector("code")?;
    element
        .select(&code_selector)
        .find_map(|code| language_in(code.value().attr("class")))
}

/// Extracts the language name from a single class token of the form
/// `language-<name>` or `lang-<name>`. Returns `None` for other tokens and
/// for an empty name.
fn language_from_class(class: &str) -> Option<String> {
    const PREFIXES: [&str; 2] = ["language-", "lang-"];

    let language = PREFIXES
        .iter()
        .find_map(|&prefix| class.strip_prefix(prefix))?;
    if language.is_empty() {
        None
    } else {
        Some(language.to_owned())
    }
}

/// Resolves `input` to a URL string.
///
/// The trimmed input is returned in serialized form when it parses as a URL
/// on its own. Otherwise it is joined onto `base_url` when one is given and
/// the join succeeds. As a last resort the trimmed input is returned as-is.
fn resolve_url(input: &str, base_url: Option<&Url>) -> String {
    let trimmed = input.trim();

    let resolved = match Url::parse(trimmed) {
        Ok(absolute) => Some(absolute),
        Err(_) => base_url.and_then(|base| base.join(trimmed).ok()),
    };

    match resolved {
        Some(url) => url.to_string(),
        None => trimmed.to_owned(),
    }
}

/// Flattens nodes into whitespace-normalized plain text.
///
/// Headings, paragraphs, code blocks, list items, link texts and nested
/// sections contribute text, in order; every other node kind (forms and
/// spacers included) contributes nothing.
fn body_text_from_nodes(nodes: &[HtmlNode]) -> String {
    let mut parts: Vec<String> = Vec::new();
    for node in nodes {
        let part = match node {
            HtmlNode::Heading { text, .. }
            | HtmlNode::Paragraph(text)
            | HtmlNode::CodeBlock { code: text, .. } => {
                parts.push(text.clone());
                continue;
            }
            HtmlNode::List { items, .. } => items.join(" "),
            HtmlNode::Link(link) => link.text.clone(),
            HtmlNode::Section { nodes, .. } => body_text_from_nodes(nodes),
            _ => continue,
        };
        if !part.is_empty() {
            parts.push(part);
        }
    }

    clean_text(parts.join(" "))
}

/// Returns the text of `element` and its descendants: text fragments are
/// joined with single spaces and the result is whitespace-normalized.
fn element_text(element: &ElementRef<'_>) -> String {
    let fragments: Vec<&str> = element.text().collect();
    clean_text(fragments.join(" "))
}

/// Returns the text of `element` with its text fragments concatenated as-is,
/// so the original whitespace of code is preserved.
fn code_text(element: &ElementRef<'_>) -> String {
    let mut code = String::new();
    for fragment in element.text() {
        code.push_str(fragment);
    }
    code
}

/// Normalizes whitespace: trims both ends and collapses every run of
/// whitespace into a single space.
fn clean_text(input: impl AsRef<str>) -> String {
    let mut cleaned = String::new();
    for word in input.as_ref().split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}

#[cfg(test)]
mod tests {
    use super::{
        HtmlNode, HtmlSectionRole, IndexDateStyle, discover_index_manifest_link_from_html,
        discover_index_manifest_link_from_http_link_header, parse_html, parse_index_manifest,
        well_known_index_manifest_url,
    };

    #[test]
    fn extracts_title() {
        let doc = parse_html("<html><title>Hello</title><body></body></html>");
        assert_eq!(doc.title.as_deref(), Some("Hello"));
    }

    #[test]
    fn extracts_headings_with_levels() {
        let doc = parse_html("<main><h1>Main</h1><h2>Sub</h2></main>");
        assert_eq!(doc.headings.len(), 2);
        assert_eq!(doc.headings[0].level, 1);
        assert_eq!(doc.headings[0].text, "Main");
        assert_eq!(doc.headings[1].level, 2);
        assert_eq!(doc.headings[1].text, "Sub");
    }

    #[test]
    fn extracts_links_from_main_content() {
        let doc = parse_html(
            r#"<nav><a href="/noise">Noise</a></nav><main><a href="https://example.com">Example</a></main>"#,
        );
        assert_eq!(doc.links.len(), 1);
        assert_eq!(doc.links[0].text, "Example");
        assert_eq!(doc.links[0].href, "https://example.com/");
    }

    #[test]
    fn prioritizes_explicit_main_and_collapses_secondary_regions() {
        let doc = parse_html(
            r#"
            <body>
              <nav aria-label="Site"><a href="/docs">Docs</a><a href="/about">About</a></nav>
              <main><h1>Main Article</h1><p>Readable body.</p></main>
              <aside class="related"><h2>Related</h2><a href="/next">Next</a></aside>
              <footer><a href="/license">License</a></footer>
            </body>
            "#,
        );

        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::Heading { text, .. }) if text == "Main Article"
        ));
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Section {
                role: HtmlSectionRole::Navigation,
                title: Some(title),
                collapsed: true,
                nodes
            } if title == "Site" && nodes.len() == 2
        )));
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Section {
                role: HtmlSectionRole::Related,
                collapsed: true,
                ..
            }
        )));
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Section {
                role: HtmlSectionRole::Footer,
                collapsed: true,
                ..
            }
        )));
    }

    #[test]
    fn main_root_scoring_prefers_dense_article_over_chrome_like_main() {
        let doc = parse_html(
            r#"
            <body>
              <main class="top-nav">
                <a href="/home">Home</a>
                <a href="/pricing">Pricing</a>
                <a href="/docs">Docs</a>
              </main>
              <article id="story">
                <h1>Deep Story</h1>
                <p>This paragraph carries the actual article payload for readers.</p>
                <p>Second paragraph keeps the dense main-content region obvious.</p>
              </article>
            </body>
            "#,
        );

        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::Heading { text, .. }) if text == "Deep Story"
        ));
        assert!(!doc.body_text.contains("Pricing"));
    }

    #[test]
    fn suppresses_boilerplate_containers_and_paragraphs_inside_main() {
        let doc = parse_html(
            r#"
            <main>
              <p>Primary body remains.</p>
              <div class="newsletter signup">
                <p>Sign up for updates and subscribe.</p>
              </div>
              <div class="related">
                <a href="/related">Related link</a>
              </div>
            </main>
            "#,
        );

        assert!(doc.nodes.iter().any(
            |node| matches!(node, HtmlNode::Paragraph(text) if text == "Primary body remains.")
        ));
        assert!(!doc.body_text.contains("Sign up for updates"));
        assert!(!doc.body_text.contains("Related link"));
    }

    #[test]
    fn preserves_br_boundaries_as_spacing_hints() {
        let doc = parse_html("<main><p>First line.</p><br><p>Second line.</p></main>");
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Spacer { lines: 1 }))
        );
    }

    #[test]
    fn extracts_structured_nodes() {
        let doc = parse_html(
            r#"
            <main>
              <pre><code class="language-rust">fn main() {}</code></pre>
              <ul><li>Read</li><li>Search</li></ul>
              <table><tr><th>Name</th></tr><tr><td>Index</td></tr></table>
              <img src="/logo.png" alt="Index logo">
            </main>
            "#,
        );

        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::CodeBlock {
                language: Some(language),
                ..
            }) if language == "rust"
        ));
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::List { ordered: false, items } if items == &vec!["Read".to_owned(), "Search".to_owned()]))
        );
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Table { rows } if rows.len() == 2))
        );
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Image { alt, .. } if alt == "Index logo"))
        );
    }

    #[test]
    fn preserves_pre_and_code_text_whitespace() {
        let doc = parse_html(
            "<main><pre><code class=\"language-rust\">fn main() {\n    println!(\"hi\");\n}</code></pre><code>  raw\n  block</code></main>",
        );

        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::CodeBlock { language: Some(language), code }
                if language == "rust" && code == "fn main() {\n    println!(\"hi\");\n}"
        )));
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::CodeBlock { language: None, code } if code == "  raw\n  block"
        )));
    }

    #[test]
    fn preserves_blockquote_as_quoted_paragraph() {
        let doc = parse_html("<main><blockquote>Quoted reply text.</blockquote></main>");
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Paragraph(text) if text == "> Quoted reply text."
        )));
    }

    #[test]
    fn extracts_bounded_layout_spacing_from_css_and_block_boundaries() {
        let doc = parse_html(
            r#"
            <html lang="en-US">
              <head>
                <style>
                  .hero { margin-bottom: 60px; }
                  .chapter { padding-top: 2rem; padding-bottom: 6rem; }
                  p { margin-bottom: 1rem; }
                </style>
              </head>
              <main>
                <section class="hero"><h1>Landing</h1><p>Intro.</p></section>
                <section class="chapter"><p>Chapter body.</p></section>
              </main>
            </html>
            "#,
        );

        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Spacer { lines } if (1..=3).contains(lines)))
        );
        assert!(
            doc.nodes
                .iter()
                .any(|node| matches!(node, HtmlNode::Spacer { lines: 3 }))
        );
        assert!(matches!(
            doc.nodes.first(),
            Some(HtmlNode::Heading { text, .. }) if text == "Landing"
        ));
        assert!(matches!(
            doc.nodes.last(),
            Some(HtmlNode::Paragraph(text)) if text == "Chapter body."
        ));
    }

    #[test]
    fn extracts_metadata_and_resolves_relative_urls_against_base() {
        let doc = parse_html(
            r#"
            <html lang="en-US">
              <head>
                <base href="https://example.com/docs/">
                <link rel="canonical" href="guide.html">
                <meta name="description" content="A calm reader">
                <meta property="og:title" content="OG Title">
                <meta property="og:description" content="OG Description">
              </head>
              <main><a href="chapter.html">Chapter</a><img src="img.png" alt="Image"></main>
            </html>
            "#,
        );

        assert_eq!(
            doc.metadata.canonical_url.as_deref(),
            Some("https://example.com/docs/guide.html")
        );
        assert_eq!(doc.metadata.language.as_deref(), Some("en-US"));
        assert_eq!(doc.metadata.description.as_deref(), Some("A calm reader"));
        assert_eq!(doc.metadata.open_graph_title.as_deref(), Some("OG Title"));
        assert_eq!(
            doc.metadata.open_graph_description.as_deref(),
            Some("OG Description")
        );
        assert_eq!(
            doc.links.first().map(|link| link.href.as_str()),
            Some("https://example.com/docs/chapter.html")
        );
        assert!(doc.nodes.iter().any(|node| matches!(
            node,
            HtmlNode::Image { src: Some(src), .. } if src == "https://example.com/docs/img.png"
        )));
    }

    /// A form under `<main>` is expected to show up both in `doc.forms` and as
    /// an `HtmlNode::Form` node, with its relative `action` resolved against
    /// the document's `<base href>`.
    #[test]
    fn extracts_forms_inputs_buttons_and_resolves_actions() {
        let markup = r#"
            <html>
              <head><base href="https://example.com/docs/"></head>
              <main>
                <form id="search" method="get" action="../search">
                  <input type="search" name="q" required>
                  <input type="hidden" name="source" value="docs">
                  <button type="submit" name="go" value="1">Search</button>
                </form>
              </main>
            </html>
            "#;
        let parsed = parse_html(markup);

        assert_eq!(parsed.forms.len(), 1);
        let search_form = &parsed.forms[0];

        // The fixture only sets `id="search"`, a lower-case method, and a
        // relative action; the assertions pin the normalized values.
        assert_eq!(search_form.name, "search");
        assert_eq!(search_form.method, "GET");
        assert_eq!(search_form.action, "https://example.com/search");

        // Two `<input>` elements are expected; the `<button>` is checked
        // through `buttons` instead.
        assert_eq!(search_form.inputs.len(), 2);
        let has_required_query = search_form
            .inputs
            .iter()
            .any(|field| field.name == "q" && field.required);
        assert!(has_required_query);

        let has_submit_button = search_form
            .buttons
            .iter()
            .any(|btn| btn.name.as_deref() == Some("go") && btn.label == "Search");
        assert!(has_submit_button);

        // The same form must also be present in the flat node list.
        let has_form_node = parsed
            .nodes
            .iter()
            .any(|node| matches!(node, HtmlNode::Form(entry) if entry.name == "search"));
        assert!(has_form_node);
    }

    /// `<select>` elements are expected to contribute form inputs carrying a
    /// value: the `selected` option's `value` for `sort`, and the text of the
    /// first (value-less) option for `view` in this fixture.
    #[test]
    fn extracts_select_values_for_form_inputs() {
        let markup = r#"
            <main>
              <form id="filters" action="/search">
                <select name="sort">
                  <option value="relevance">Relevance</option>
                  <option value="recent" selected>Most recent</option>
                </select>
                <select name="view">
                  <option>compact</option>
                  <option value="expanded">expanded</option>
                </select>
              </form>
            </main>
            "#;
        let parsed = parse_html(markup);
        let filters = &parsed.forms[0];

        // True when any extracted input has the given name/value pair.
        let has_input_value = |name: &str, expected: &str| {
            filters
                .inputs
                .iter()
                .any(|field| field.name == name && field.value.as_deref() == Some(expected))
        };

        assert!(has_input_value("sort", "recent"));
        assert!(has_input_value("view", "compact"));
    }

    /// A paragraph whose only content is a link must not produce a
    /// `Paragraph` node with the link text, while the link itself is still
    /// collected.
    #[test]
    fn drops_anchor_only_paragraph_nodes_but_keeps_links() {
        let markup = r#"<main><p><a href="https://example.com">Read more</a></p></main>"#;
        let parsed = parse_html(markup);

        let has_link_text_paragraph = parsed
            .nodes
            .iter()
            .any(|node| matches!(node, HtmlNode::Paragraph(body) if body == "Read more"));
        assert!(!has_link_text_paragraph);

        assert_eq!(parsed.links.len(), 1);
    }

    /// Unclosed tags and a dangling attribute-bearing anchor must still parse:
    /// the test pins the recovered title and requires non-empty body text.
    #[test]
    fn malformed_html_does_not_panic() {
        // None of the opened elements are ever closed.
        let broken_markup = r#"<main><h1>Broken<p>Still readable<a href="/x">link"#;
        let parsed = parse_html(broken_markup);

        assert_eq!(
            parsed.title.as_deref(),
            Some("Broken Still readable link")
        );
        assert!(!parsed.body_text.is_empty());
    }

    /// Exercises the three manifest discovery paths for one page URL: the
    /// well-known location, an HTML `<link rel>` element, and an HTTP `Link`
    /// header. Relative targets are expected to resolve against the page URL.
    #[test]
    fn index_manifest_discovery_supports_well_known_html_and_header_links() {
        let page_url = "https://example.org/docs/guide";

        // Well-known location at the page's origin.
        let well_known = well_known_index_manifest_url(page_url);
        assert_eq!(
            well_known.as_deref(),
            Some("https://example.org/.well-known/index.idx")
        );

        // `<link>` whose `rel` lists `index-manifest` alongside another token.
        let from_html = discover_index_manifest_link_from_html(
            r#"<html><head><link rel="index-manifest preload" href="/manifests/site.idx"></head></html>"#,
            page_url,
        );
        assert_eq!(
            from_html.as_deref(),
            Some("https://example.org/manifests/site.idx")
        );

        // HTTP `Link` header value with extra parameters after `rel`.
        let from_header = discover_index_manifest_link_from_http_link_header(
            r#"</meta/index.idx>; rel="index-manifest"; type="application/json""#,
            page_url,
        );
        assert_eq!(
            from_header.as_deref(),
            Some("https://example.org/meta/index.idx")
        );
    }

    /// A same-origin manifest whose scope covers the page URL parses
    /// successfully; every hint section is populated and the `"date"` style
    /// maps to `IndexDateStyle::Date`.
    #[test]
    fn parse_index_manifest_validates_same_origin_scope_and_date_styles()
    -> Result<(), Box<dyn std::error::Error>> {
        let manifest_json = r#"{
                "version": "index.idx/v1",
                "scope": "/docs",
                "content": { "main_selector": "main article" },
                "regions": [{ "role": "related", "selector": "aside.related", "collapsed": true }],
                "fields": [{ "name": "updated", "label": "Updated" }],
                "forms": [{ "name": "search", "selector": "form.search", "note": "Public search" }],
                "dates": [{ "field": "updated", "style": "date" }]
            }"#;
        let manifest_url = "https://example.org/.well-known/index.idx";
        let page_url = "https://example.org/docs/guide";

        // Any parse or validation error fails the test through `?`.
        let parsed = parse_index_manifest(manifest_json, manifest_url, page_url)?;

        assert_eq!(parsed.version, "index.idx/v1");
        assert_eq!(parsed.scope, "/docs");
        assert_eq!(
            parsed.content.main_selector.as_deref(),
            Some("main article")
        );

        // One entry was supplied for each hint list.
        assert_eq!(parsed.regions.len(), 1);
        assert_eq!(parsed.fields.len(), 1);
        assert_eq!(parsed.forms.len(), 1);
        assert_eq!(parsed.dates.len(), 1);
        assert_eq!(parsed.dates[0].style, IndexDateStyle::Date);

        Ok(())
    }

    /// Covers three rejection paths of `parse_index_manifest`: a manifest
    /// served from another origin, a scope without a leading slash, and a
    /// valid scope that does not cover the page URL.
    #[test]
    fn parse_index_manifest_rejects_cross_origin_and_invalid_scope() {
        let page_url = "https://example.org/docs/guide";
        let same_origin_manifest_url = "https://example.org/index.idx";

        // Manifest hosted on a different host than the page.
        let cross_origin = parse_index_manifest(
            r#"{"version":"index.idx/v1"}"#,
            "https://cdn.example.org/index.idx",
            page_url,
        );
        let rejected_as_cross_origin = matches!(
            cross_origin,
            Err(super::IndexManifestError::CrossOrigin { .. })
        );
        assert!(rejected_as_cross_origin);

        // Scope `docs` lacks the leading `/`.
        let invalid_scope = parse_index_manifest(
            r#"{"version":"index.idx/v1","scope":"docs"}"#,
            same_origin_manifest_url,
            page_url,
        );
        let rejected_as_invalid_scope = matches!(
            invalid_scope,
            Err(super::IndexManifestError::InvalidScope(_))
        );
        assert!(rejected_as_invalid_scope);

        // Scope `/blog` is well-formed but the page lives under `/docs`.
        let out_of_scope = parse_index_manifest(
            r#"{"version":"index.idx/v1","scope":"/blog"}"#,
            same_origin_manifest_url,
            page_url,
        );
        let rejected_as_out_of_scope = matches!(
            out_of_scope,
            Err(super::IndexManifestError::OutOfScope { .. })
        );
        assert!(rejected_as_out_of_scope);
    }
}