index-readability 1.0.0

Main-content extraction prototype for Index.
Documentation
//! Main-content extraction for static HTML.

use index_dom::{HtmlDocument, HtmlForm, HtmlLink, HtmlNode, HtmlSectionRole};

/// A readable page extracted from parsed HTML.
///
/// Instances are normally built with [`ReadablePage::from_html`]. Every field
/// holds owned data cloned from the source document, so the page does not
/// borrow from it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReadablePage {
    /// Best-known page title.
    ///
    /// `from_html` uses the document title when present, otherwise the text of
    /// the first heading, otherwise the literal `"Untitled"`.
    pub title: String,
    /// Paragraph-like text chunks.
    ///
    /// `from_html` fills this with the text of the top-level
    /// [`ReadableNode::Paragraph`] entries of `nodes`, in order; paragraphs
    /// nested inside a [`ReadableNode::Section`] are not collected here.
    pub paragraphs: Vec<String>,
    /// Structured readable nodes.
    ///
    /// When no document node converts and the document body text is non-empty,
    /// `from_html` stores that body text as a single paragraph node instead.
    pub nodes: Vec<ReadableNode>,
    /// Page links, cloned from the document's link list.
    pub links: Vec<HtmlLink>,
    /// Page forms, cloned from the document's form list.
    pub forms: Vec<HtmlForm>,
    /// Page metadata.
    pub metadata: ReadableMetadata,
}

/// Readable semantic node.
///
/// The notes below about when a variant is emitted describe the converter in
/// this file; values built by hand are not subject to those checks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ReadableNode {
    /// Heading with one-based level.
    ///
    /// Emitted by the converter only when the heading text is non-empty.
    Heading {
        /// One-based heading level.
        level: u8,
        /// Heading text.
        text: String,
    },
    /// Paragraph text.
    ///
    /// The converter skips empty paragraphs and paragraphs whose text is
    /// exactly equal to the text of one of the page's links.
    Paragraph(String),
    /// Link node used inside semantic sections.
    ///
    /// Emitted by the converter only when the link text is non-empty.
    Link(HtmlLink),
    /// Ordered or unordered list.
    ///
    /// Emitted by the converter only when there is at least one item.
    List {
        /// Whether numbering is semantic.
        ordered: bool,
        /// List item text in source order.
        items: Vec<String>,
    },
    /// Code block.
    ///
    /// Emitted by the converter only when the code text is non-empty.
    CodeBlock {
        /// Optional declared language.
        language: Option<String>,
        /// Code text.
        code: String,
    },
    /// Table rows.
    ///
    /// Emitted by the converter only when there is at least one row.
    Table {
        /// Rows in source order.
        rows: Vec<Vec<String>>,
    },
    /// Bounded vertical spacing hint.
    ///
    /// Emitted by the converter only when `lines` is greater than zero.
    Spacer {
        /// Extra terminal lines suggested by upstream layout rhythm.
        lines: u8,
    },
    /// Semantic page region.
    ///
    /// Emitted by the converter only when at least one child node survives
    /// conversion.
    Section {
        /// Inferred region role.
        role: ReadableSectionRole,
        /// Optional region title.
        title: Option<String>,
        /// Whether renderers should initially summarize this region.
        collapsed: bool,
        /// Region contents; children may themselves be sections.
        nodes: Vec<ReadableNode>,
    },
    /// Image proxy metadata.
    ///
    /// Emitted by the converter when the alternate text is non-empty or a
    /// source URL is present.
    Image {
        /// Alternate text or fallback label.
        alt: String,
        /// Optional normalized source URL.
        src: Option<String>,
    },
    /// Web form.
    ///
    /// The converter forwards every form node without filtering.
    Form(HtmlForm),
}

/// Readable semantic region role.
///
/// Each variant corresponds to the same-named `HtmlSectionRole` variant; the
/// conversion in this file maps them one-to-one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReadableSectionRole {
    /// Primary content region.
    Main,
    /// Navigation region.
    Navigation,
    /// Sidebar or complementary content.
    Aside,
    /// Footer or content information.
    Footer,
    /// Comments or discussion region.
    Comments,
    /// Related links or related content.
    Related,
    /// Unknown secondary region.
    Unknown,
}

/// Readable page metadata.
///
/// `ReadablePage::from_html` clones each field from the same-named field of
/// the source document's metadata. The derived `Default` leaves every field
/// as `None`.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ReadableMetadata {
    /// Canonical URL when known.
    pub canonical_url: Option<String>,
    /// Declared document language when known.
    pub language: Option<String>,
    /// Description metadata.
    pub description: Option<String>,
    /// OpenGraph title metadata.
    pub open_graph_title: Option<String>,
    /// OpenGraph description metadata.
    pub open_graph_description: Option<String>,
}

impl ReadablePage {
    /// Extracts readable content from an HTML document.
    #[must_use]
    pub fn from_html(doc: &HtmlDocument) -> Self {
        let title = doc
            .title
            .clone()
            .or_else(|| doc.headings.first().map(|heading| heading.text.clone()))
            .unwrap_or_else(|| "Untitled".to_owned());

        let link_texts = doc
            .links
            .iter()
            .map(|link| link.text.as_str())
            .collect::<Vec<_>>();

        let mut nodes = doc
            .nodes
            .iter()
            .filter_map(|node| readable_node_from_html(node, &link_texts))
            .collect::<Vec<_>>();

        if nodes.is_empty() && !doc.body_text.is_empty() {
            nodes.push(ReadableNode::Paragraph(doc.body_text.clone()));
        }

        let paragraphs = nodes
            .iter()
            .filter_map(|node| match node {
                ReadableNode::Paragraph(text) => Some(text.clone()),
                _ => None,
            })
            .collect::<Vec<_>>();

        Self {
            title,
            paragraphs,
            nodes,
            links: doc.links.clone(),
            forms: doc.forms.clone(),
            metadata: ReadableMetadata {
                canonical_url: doc.metadata.canonical_url.clone(),
                language: doc.metadata.language.clone(),
                description: doc.metadata.description.clone(),
                open_graph_title: doc.metadata.open_graph_title.clone(),
                open_graph_description: doc.metadata.open_graph_description.clone(),
            },
        }
    }

    /// Returns whether extraction produced meaningful text or semantic content.
    #[must_use]
    pub fn has_body(&self) -> bool {
        !self.nodes.is_empty()
    }
}

/// Converts one parsed HTML node into its readable counterpart.
///
/// Returns `None` for nodes that carry no content: empty headings, links,
/// lists, code blocks and tables; zero-line spacers; images with neither
/// alternate text nor a source; sections whose children all fail to convert;
/// and any node kind not handled here. A paragraph is also dropped when its
/// text is exactly equal to an entry of `link_texts`. Forms always convert.
fn readable_node_from_html(node: &HtmlNode, link_texts: &[&str]) -> Option<ReadableNode> {
    let converted = match node {
        HtmlNode::Heading { level, text } => {
            if text.is_empty() {
                return None;
            }
            ReadableNode::Heading {
                level: *level,
                text: text.clone(),
            }
        }
        HtmlNode::Paragraph(text) => {
            let repeats_link = link_texts.iter().any(|anchor| *anchor == text.as_str());
            if text.is_empty() || repeats_link {
                return None;
            }
            ReadableNode::Paragraph(text.clone())
        }
        HtmlNode::Link(link) => {
            if link.text.is_empty() {
                return None;
            }
            ReadableNode::Link(link.clone())
        }
        HtmlNode::List { ordered, items } => {
            if items.is_empty() {
                return None;
            }
            ReadableNode::List {
                ordered: *ordered,
                items: items.clone(),
            }
        }
        HtmlNode::CodeBlock { language, code } => {
            if code.is_empty() {
                return None;
            }
            ReadableNode::CodeBlock {
                language: language.clone(),
                code: code.clone(),
            }
        }
        HtmlNode::Table { rows } => {
            if rows.is_empty() {
                return None;
            }
            ReadableNode::Table { rows: rows.clone() }
        }
        HtmlNode::Spacer { lines } => {
            if *lines == 0 {
                return None;
            }
            ReadableNode::Spacer { lines: *lines }
        }
        HtmlNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            // Convert children recursively; a section with no surviving
            // children is dropped entirely.
            let mut children = Vec::new();
            for child in nodes.iter() {
                if let Some(readable) = readable_node_from_html(child, link_texts) {
                    children.push(readable);
                }
            }
            if children.is_empty() {
                return None;
            }
            ReadableNode::Section {
                role: readable_section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes: children,
            }
        }
        HtmlNode::Image { alt, src } => {
            if alt.is_empty() && src.is_none() {
                return None;
            }
            ReadableNode::Image {
                alt: alt.clone(),
                src: src.clone(),
            }
        }
        HtmlNode::Form(form) => ReadableNode::Form(form.clone()),
        _ => return None,
    };
    Some(converted)
}

fn readable_section_role(role: HtmlSectionRole) -> ReadableSectionRole {
    match role {
        HtmlSectionRole::Main => ReadableSectionRole::Main,
        HtmlSectionRole::Navigation => ReadableSectionRole::Navigation,
        HtmlSectionRole::Aside => ReadableSectionRole::Aside,
        HtmlSectionRole::Footer => ReadableSectionRole::Footer,
        HtmlSectionRole::Comments => ReadableSectionRole::Comments,
        HtmlSectionRole::Related => ReadableSectionRole::Related,
        HtmlSectionRole::Unknown => ReadableSectionRole::Unknown,
    }
}

#[cfg(test)]
mod tests {
    use index_dom::parse_html;

    use super::{ReadableNode, ReadablePage, ReadableSectionRole};

    /// Parses `markup` and runs readable extraction on the resulting document.
    fn extract(markup: &str) -> ReadablePage {
        let document = parse_html(markup);
        ReadablePage::from_html(&document)
    }

    #[test]
    fn uses_title_when_available() {
        let page = extract("<title>Doc</title><main><p>Hello world.</p></main>");
        assert_eq!(page.title, "Doc");
    }

    #[test]
    fn falls_back_to_heading() {
        let page = extract("<main><h1>Heading</h1><p>Hello world.</p></main>");
        assert_eq!(page.title, "Heading");
    }

    #[test]
    fn extracts_body_paragraphs() {
        let page = extract("<main><p>Hello world. This is readable.</p></main>");
        let expected = vec!["Hello world. This is readable.".to_owned()];
        assert!(page.has_body());
        assert_eq!(page.paragraphs, expected);
    }

    #[test]
    fn preserves_structured_reader_nodes() {
        let page = extract(
            r#"
            <main>
              <h2>Install</h2>
              <ol><li>Install Rust</li><li>Run Index</li></ol>
              <pre><code class="language-sh">cargo install index</code></pre>
              <table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
              <img src="/logo.png" alt="Logo">
              <form id="search" action="/search"><input name="q" required></form>
            </main>
            "#,
        );

        let has_heading = page.nodes.iter().any(|node| match node {
            ReadableNode::Heading { level: 2, text } => text == "Install",
            _ => false,
        });
        assert!(has_heading);

        let has_ordered_list = page.nodes.iter().any(|node| match node {
            ReadableNode::List {
                ordered: true,
                items,
            } => *items == ["Install Rust", "Run Index"],
            _ => false,
        });
        assert!(has_ordered_list);

        let has_shell_code = page.nodes.iter().any(|node| match node {
            ReadableNode::CodeBlock {
                language: Some(language),
                ..
            } => language == "sh",
            _ => false,
        });
        assert!(has_shell_code);

        let has_two_row_table = page.nodes.iter().any(|node| match node {
            ReadableNode::Table { rows } => rows.len() == 2,
            _ => false,
        });
        assert!(has_two_row_table);

        let has_logo_image = page.nodes.iter().any(|node| match node {
            ReadableNode::Image { alt, .. } => alt == "Logo",
            _ => false,
        });
        assert!(has_logo_image);

        let has_search_form = page.nodes.iter().any(|node| match node {
            ReadableNode::Form(form) => form.name == "search",
            _ => false,
        });
        assert!(has_search_form);
    }

    #[test]
    fn preserves_layout_spacers() {
        let page = extract(
            r#"
            <head><style>.section { margin-bottom: 40px; }</style></head>
            <main><section class="section"><p>First.</p></section><p>Second.</p></main>
            "#,
        );

        let has_spacer = page.nodes.iter().any(|node| match node {
            ReadableNode::Spacer { lines } => *lines >= 1,
            _ => false,
        });
        assert!(has_spacer);
    }

    #[test]
    fn preserves_collapsed_secondary_sections() {
        let page = extract(
            r#"
            <nav aria-label="Site"><a href="/docs">Docs</a></nav>
            <main><p>Main body.</p></main>
            "#,
        );

        let has_collapsed_nav = page.nodes.iter().any(|node| match node {
            ReadableNode::Section {
                role: ReadableSectionRole::Navigation,
                title: Some(title),
                collapsed: true,
                nodes: children,
            } => {
                let starts_with_docs_link = match children.first() {
                    Some(ReadableNode::Link(link)) => link.text == "Docs",
                    _ => false,
                };
                title == "Site" && starts_with_docs_link
            }
            _ => false,
        });
        assert!(has_collapsed_nav);
    }

    #[test]
    fn carries_metadata_forward() {
        let page = extract(
            r#"
            <html lang="en">
            <head>
              <meta name="description" content="Readable docs">
              <meta property="og:title" content="Index">
            </head>
            <main><p>Body.</p></main>
            </html>
            "#,
        );

        let meta = &page.metadata;
        assert_eq!(meta.language.as_deref(), Some("en"));
        assert_eq!(meta.description.as_deref(), Some("Readable docs"));
        assert_eq!(meta.open_graph_title.as_deref(), Some("Index"));
    }

    #[test]
    fn drops_anchor_only_paragraphs_but_keeps_link() {
        let page = extract(r#"<main><p><a href="https://example.com">Read more</a></p></main>"#);
        assert!(page.paragraphs.is_empty());
        assert_eq!(page.links.len(), 1);
    }
}