Skip to main content

index_readability/
lib.rs

1//! Main-content extraction for static HTML.
2
3use index_dom::{HtmlDocument, HtmlForm, HtmlLink, HtmlNode, HtmlSectionRole};
4
5/// A readable page extracted from parsed HTML.
6#[derive(Debug, Clone, PartialEq, Eq)]
7pub struct ReadablePage {
8    /// Best-known page title.
9    pub title: String,
10    /// Paragraph-like text chunks.
11    pub paragraphs: Vec<String>,
12    /// Structured readable nodes.
13    pub nodes: Vec<ReadableNode>,
14    /// Page links.
15    pub links: Vec<HtmlLink>,
16    /// Page forms.
17    pub forms: Vec<HtmlForm>,
18    /// Page metadata.
19    pub metadata: ReadableMetadata,
20}
21
22/// Readable semantic node.
23#[derive(Debug, Clone, PartialEq, Eq)]
24pub enum ReadableNode {
25    /// Heading with one-based level.
26    Heading {
27        /// One-based heading level.
28        level: u8,
29        /// Heading text.
30        text: String,
31    },
32    /// Paragraph text.
33    Paragraph(String),
34    /// Link node used inside semantic sections.
35    Link(HtmlLink),
36    /// Ordered or unordered list.
37    List {
38        /// Whether numbering is semantic.
39        ordered: bool,
40        /// List item text in source order.
41        items: Vec<String>,
42    },
43    /// Code block.
44    CodeBlock {
45        /// Optional declared language.
46        language: Option<String>,
47        /// Code text.
48        code: String,
49    },
50    /// Table rows.
51    Table {
52        /// Rows in source order.
53        rows: Vec<Vec<String>>,
54    },
55    /// Bounded vertical spacing hint.
56    Spacer {
57        /// Extra terminal lines suggested by upstream layout rhythm.
58        lines: u8,
59    },
60    /// Semantic page region.
61    Section {
62        /// Inferred region role.
63        role: ReadableSectionRole,
64        /// Optional region title.
65        title: Option<String>,
66        /// Whether renderers should initially summarize this region.
67        collapsed: bool,
68        /// Region contents.
69        nodes: Vec<ReadableNode>,
70    },
71    /// Image proxy metadata.
72    Image {
73        /// Alternate text or fallback label.
74        alt: String,
75        /// Optional normalized source URL.
76        src: Option<String>,
77    },
78    /// Web form.
79    Form(HtmlForm),
80}
81
/// Role assigned to a readable page region.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ReadableSectionRole {
    /// The primary content of the page.
    Main,
    /// A navigation region.
    Navigation,
    /// A sidebar or other complementary content.
    Aside,
    /// A footer or content-information region.
    Footer,
    /// A comments or discussion region.
    Comments,
    /// Related links or related content.
    Related,
    /// A secondary region whose role is not known.
    Unknown,
}
100
/// Document-level metadata attached to a readable page.
///
/// Every field is optional; the `Default` value has all of them unset.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ReadableMetadata {
    /// Canonical URL, when one is known.
    pub canonical_url: Option<String>,
    /// Declared document language, when one is known.
    pub language: Option<String>,
    /// Description metadata.
    pub description: Option<String>,
    /// OpenGraph title.
    pub open_graph_title: Option<String>,
    /// OpenGraph description.
    pub open_graph_description: Option<String>,
}
115
116impl ReadablePage {
117    /// Extracts readable content from an HTML document.
118    #[must_use]
119    pub fn from_html(doc: &HtmlDocument) -> Self {
120        let title = doc
121            .title
122            .clone()
123            .or_else(|| doc.headings.first().map(|heading| heading.text.clone()))
124            .unwrap_or_else(|| "Untitled".to_owned());
125
126        let link_texts = doc
127            .links
128            .iter()
129            .map(|link| link.text.as_str())
130            .collect::<Vec<_>>();
131
132        let mut nodes = doc
133            .nodes
134            .iter()
135            .filter_map(|node| readable_node_from_html(node, &link_texts))
136            .collect::<Vec<_>>();
137
138        if nodes.is_empty() && !doc.body_text.is_empty() {
139            nodes.push(ReadableNode::Paragraph(doc.body_text.clone()));
140        }
141
142        let paragraphs = nodes
143            .iter()
144            .filter_map(|node| match node {
145                ReadableNode::Paragraph(text) => Some(text.clone()),
146                _ => None,
147            })
148            .collect::<Vec<_>>();
149
150        Self {
151            title,
152            paragraphs,
153            nodes,
154            links: doc.links.clone(),
155            forms: doc.forms.clone(),
156            metadata: ReadableMetadata {
157                canonical_url: doc.metadata.canonical_url.clone(),
158                language: doc.metadata.language.clone(),
159                description: doc.metadata.description.clone(),
160                open_graph_title: doc.metadata.open_graph_title.clone(),
161                open_graph_description: doc.metadata.open_graph_description.clone(),
162            },
163        }
164    }
165
166    /// Returns whether extraction produced meaningful text or semantic content.
167    #[must_use]
168    pub fn has_body(&self) -> bool {
169        !self.nodes.is_empty()
170    }
171}
172
173fn readable_node_from_html(node: &HtmlNode, link_texts: &[&str]) -> Option<ReadableNode> {
174    match node {
175        HtmlNode::Heading { level, text } if !text.is_empty() => Some(ReadableNode::Heading {
176            level: *level,
177            text: text.clone(),
178        }),
179        HtmlNode::Paragraph(text) if !text.is_empty() && !link_texts.contains(&text.as_str()) => {
180            Some(ReadableNode::Paragraph(text.clone()))
181        }
182        HtmlNode::Link(link) if !link.text.is_empty() => Some(ReadableNode::Link(link.clone())),
183        HtmlNode::List { ordered, items } if !items.is_empty() => Some(ReadableNode::List {
184            ordered: *ordered,
185            items: items.clone(),
186        }),
187        HtmlNode::CodeBlock { language, code } if !code.is_empty() => {
188            Some(ReadableNode::CodeBlock {
189                language: language.clone(),
190                code: code.clone(),
191            })
192        }
193        HtmlNode::Table { rows } if !rows.is_empty() => {
194            Some(ReadableNode::Table { rows: rows.clone() })
195        }
196        HtmlNode::Spacer { lines } if *lines > 0 => Some(ReadableNode::Spacer { lines: *lines }),
197        HtmlNode::Section {
198            role,
199            title,
200            collapsed,
201            nodes,
202        } => {
203            let section_nodes = nodes
204                .iter()
205                .filter_map(|node| readable_node_from_html(node, link_texts))
206                .collect::<Vec<_>>();
207            (!section_nodes.is_empty()).then(|| ReadableNode::Section {
208                role: readable_section_role(*role),
209                title: title.clone(),
210                collapsed: *collapsed,
211                nodes: section_nodes,
212            })
213        }
214        HtmlNode::Image { alt, src } if !alt.is_empty() || src.is_some() => {
215            Some(ReadableNode::Image {
216                alt: alt.clone(),
217                src: src.clone(),
218            })
219        }
220        HtmlNode::Form(form) => Some(ReadableNode::Form(form.clone())),
221        _ => None,
222    }
223}
224
225fn readable_section_role(role: HtmlSectionRole) -> ReadableSectionRole {
226    match role {
227        HtmlSectionRole::Main => ReadableSectionRole::Main,
228        HtmlSectionRole::Navigation => ReadableSectionRole::Navigation,
229        HtmlSectionRole::Aside => ReadableSectionRole::Aside,
230        HtmlSectionRole::Footer => ReadableSectionRole::Footer,
231        HtmlSectionRole::Comments => ReadableSectionRole::Comments,
232        HtmlSectionRole::Related => ReadableSectionRole::Related,
233        HtmlSectionRole::Unknown => ReadableSectionRole::Unknown,
234    }
235}
236
#[cfg(test)]
mod tests {
    use index_dom::parse_html;

    use super::{ReadableNode, ReadablePage, ReadableSectionRole};

    /// Parses `markup` and extracts a readable page from the result.
    fn extract(markup: &str) -> ReadablePage {
        ReadablePage::from_html(&parse_html(markup))
    }

    /// Reports whether any top-level node of `page` satisfies `wanted`.
    fn has_node(page: &ReadablePage, wanted: impl Fn(&ReadableNode) -> bool) -> bool {
        page.nodes.iter().any(wanted)
    }

    #[test]
    fn uses_title_when_available() {
        let page = extract("<title>Doc</title><main><p>Hello world.</p></main>");
        assert_eq!(page.title, "Doc");
    }

    #[test]
    fn falls_back_to_heading() {
        let page = extract("<main><h1>Heading</h1><p>Hello world.</p></main>");
        assert_eq!(page.title, "Heading");
    }

    #[test]
    fn extracts_body_paragraphs() {
        let page = extract("<main><p>Hello world. This is readable.</p></main>");
        assert!(page.has_body());
        assert_eq!(
            page.paragraphs,
            vec!["Hello world. This is readable.".to_owned()]
        );
    }

    #[test]
    fn preserves_structured_reader_nodes() {
        let page = extract(
            r#"
            <main>
              <h2>Install</h2>
              <ol><li>Install Rust</li><li>Run Index</li></ol>
              <pre><code class="language-sh">cargo install index</code></pre>
              <table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
              <img src="/logo.png" alt="Logo">
              <form id="search" action="/search"><input name="q" required></form>
            </main>
            "#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Heading { level: 2, text } if text == "Install"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::List { ordered: true, items }
                if *items == ["Install Rust", "Run Index"]
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::CodeBlock { language: Some(language), .. } if language == "sh"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Table { rows } if rows.len() == 2
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Image { alt, .. } if alt == "Logo"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Form(form) if form.name == "search"
        )));
    }

    #[test]
    fn preserves_layout_spacers() {
        let page = extract(
            r#"
            <head><style>.section { margin-bottom: 40px; }</style></head>
            <main><section class="section"><p>First.</p></section><p>Second.</p></main>
            "#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Spacer { lines } if *lines >= 1
        )));
    }

    #[test]
    fn preserves_collapsed_secondary_sections() {
        let page = extract(
            r#"
            <nav aria-label="Site"><a href="/docs">Docs</a></nav>
            <main><p>Main body.</p></main>
            "#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Section {
                role: ReadableSectionRole::Navigation,
                title: Some(title),
                collapsed: true,
                nodes,
            } if title == "Site"
                && matches!(nodes.first(), Some(ReadableNode::Link(link)) if link.text == "Docs")
        )));
    }

    #[test]
    fn carries_metadata_forward() {
        let page = extract(
            r#"
            <html lang="en">
            <head>
              <meta name="description" content="Readable docs">
              <meta property="og:title" content="Index">
            </head>
            <main><p>Body.</p></main>
            </html>
            "#,
        );

        let metadata = &page.metadata;
        assert_eq!(metadata.language.as_deref(), Some("en"));
        assert_eq!(metadata.description.as_deref(), Some("Readable docs"));
        assert_eq!(metadata.open_graph_title.as_deref(), Some("Index"));
    }

    #[test]
    fn drops_anchor_only_paragraphs_but_keeps_link() {
        let page = extract(r#"<main><p><a href="https://example.com">Read more</a></p></main>"#);
        assert!(page.paragraphs.is_empty());
        assert_eq!(page.links.len(), 1);
    }
}