1use index_dom::{HtmlDocument, HtmlForm, HtmlLink, HtmlNode, HtmlSectionRole};
4
5#[derive(Debug, Clone, PartialEq, Eq)]
7pub struct ReadablePage {
8 pub title: String,
10 pub paragraphs: Vec<String>,
12 pub nodes: Vec<ReadableNode>,
14 pub links: Vec<HtmlLink>,
16 pub forms: Vec<HtmlForm>,
18 pub metadata: ReadableMetadata,
20}
21
22#[derive(Debug, Clone, PartialEq, Eq)]
24pub enum ReadableNode {
25 Heading {
27 level: u8,
29 text: String,
31 },
32 Paragraph(String),
34 Link(HtmlLink),
36 List {
38 ordered: bool,
40 items: Vec<String>,
42 },
43 CodeBlock {
45 language: Option<String>,
47 code: String,
49 },
50 Table {
52 rows: Vec<Vec<String>>,
54 },
55 Spacer {
57 lines: u8,
59 },
60 Section {
62 role: ReadableSectionRole,
64 title: Option<String>,
66 collapsed: bool,
68 nodes: Vec<ReadableNode>,
70 },
71 Image {
73 alt: String,
75 src: Option<String>,
77 },
78 Form(HtmlForm),
80}
81
/// Role of a [`ReadableNode::Section`] within the page.
///
/// The variants correspond one-to-one with those of `HtmlSectionRole`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ReadableSectionRole {
    /// Main content region.
    Main,
    /// Navigation region.
    Navigation,
    /// Aside region.
    Aside,
    /// Footer region.
    Footer,
    /// Comments region.
    Comments,
    /// Related-content region.
    Related,
    /// Region whose role is not known.
    Unknown,
}
100
/// Document-level metadata attached to a [`ReadablePage`].
///
/// Every field is optional; the `Default` value has all fields set to `None`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ReadableMetadata {
    /// Canonical URL of the document, when declared.
    pub canonical_url: Option<String>,
    /// Document language (e.g. `"en"`), when declared.
    pub language: Option<String>,
    /// Document description, when declared.
    pub description: Option<String>,
    /// Open Graph title, when declared.
    pub open_graph_title: Option<String>,
    /// Open Graph description, when declared.
    pub open_graph_description: Option<String>,
}
115
116impl ReadablePage {
117 #[must_use]
119 pub fn from_html(doc: &HtmlDocument) -> Self {
120 let title = doc
121 .title
122 .clone()
123 .or_else(|| doc.headings.first().map(|heading| heading.text.clone()))
124 .unwrap_or_else(|| "Untitled".to_owned());
125
126 let link_texts = doc
127 .links
128 .iter()
129 .map(|link| link.text.as_str())
130 .collect::<Vec<_>>();
131
132 let mut nodes = doc
133 .nodes
134 .iter()
135 .filter_map(|node| readable_node_from_html(node, &link_texts))
136 .collect::<Vec<_>>();
137
138 if nodes.is_empty() && !doc.body_text.is_empty() {
139 nodes.push(ReadableNode::Paragraph(doc.body_text.clone()));
140 }
141
142 let paragraphs = nodes
143 .iter()
144 .filter_map(|node| match node {
145 ReadableNode::Paragraph(text) => Some(text.clone()),
146 _ => None,
147 })
148 .collect::<Vec<_>>();
149
150 Self {
151 title,
152 paragraphs,
153 nodes,
154 links: doc.links.clone(),
155 forms: doc.forms.clone(),
156 metadata: ReadableMetadata {
157 canonical_url: doc.metadata.canonical_url.clone(),
158 language: doc.metadata.language.clone(),
159 description: doc.metadata.description.clone(),
160 open_graph_title: doc.metadata.open_graph_title.clone(),
161 open_graph_description: doc.metadata.open_graph_description.clone(),
162 },
163 }
164 }
165
166 #[must_use]
168 pub fn has_body(&self) -> bool {
169 !self.nodes.is_empty()
170 }
171}
172
173fn readable_node_from_html(node: &HtmlNode, link_texts: &[&str]) -> Option<ReadableNode> {
174 match node {
175 HtmlNode::Heading { level, text } if !text.is_empty() => Some(ReadableNode::Heading {
176 level: *level,
177 text: text.clone(),
178 }),
179 HtmlNode::Paragraph(text) if !text.is_empty() && !link_texts.contains(&text.as_str()) => {
180 Some(ReadableNode::Paragraph(text.clone()))
181 }
182 HtmlNode::Link(link) if !link.text.is_empty() => Some(ReadableNode::Link(link.clone())),
183 HtmlNode::List { ordered, items } if !items.is_empty() => Some(ReadableNode::List {
184 ordered: *ordered,
185 items: items.clone(),
186 }),
187 HtmlNode::CodeBlock { language, code } if !code.is_empty() => {
188 Some(ReadableNode::CodeBlock {
189 language: language.clone(),
190 code: code.clone(),
191 })
192 }
193 HtmlNode::Table { rows } if !rows.is_empty() => {
194 Some(ReadableNode::Table { rows: rows.clone() })
195 }
196 HtmlNode::Spacer { lines } if *lines > 0 => Some(ReadableNode::Spacer { lines: *lines }),
197 HtmlNode::Section {
198 role,
199 title,
200 collapsed,
201 nodes,
202 } => {
203 let section_nodes = nodes
204 .iter()
205 .filter_map(|node| readable_node_from_html(node, link_texts))
206 .collect::<Vec<_>>();
207 (!section_nodes.is_empty()).then(|| ReadableNode::Section {
208 role: readable_section_role(*role),
209 title: title.clone(),
210 collapsed: *collapsed,
211 nodes: section_nodes,
212 })
213 }
214 HtmlNode::Image { alt, src } if !alt.is_empty() || src.is_some() => {
215 Some(ReadableNode::Image {
216 alt: alt.clone(),
217 src: src.clone(),
218 })
219 }
220 HtmlNode::Form(form) => Some(ReadableNode::Form(form.clone())),
221 _ => None,
222 }
223}
224
225fn readable_section_role(role: HtmlSectionRole) -> ReadableSectionRole {
226 match role {
227 HtmlSectionRole::Main => ReadableSectionRole::Main,
228 HtmlSectionRole::Navigation => ReadableSectionRole::Navigation,
229 HtmlSectionRole::Aside => ReadableSectionRole::Aside,
230 HtmlSectionRole::Footer => ReadableSectionRole::Footer,
231 HtmlSectionRole::Comments => ReadableSectionRole::Comments,
232 HtmlSectionRole::Related => ReadableSectionRole::Related,
233 HtmlSectionRole::Unknown => ReadableSectionRole::Unknown,
234 }
235}
236
#[cfg(test)]
mod tests {
    use index_dom::parse_html;

    use super::{ReadableNode, ReadablePage, ReadableSectionRole};

    /// Parses `html` and converts the result into a readable page.
    fn read(html: &str) -> ReadablePage {
        ReadablePage::from_html(&parse_html(html))
    }

    /// Reports whether any top-level node of `page` satisfies `pred`.
    fn has_node(page: &ReadablePage, pred: impl Fn(&ReadableNode) -> bool) -> bool {
        page.nodes.iter().any(pred)
    }

    #[test]
    fn uses_title_when_available() {
        let page = read("<title>Doc</title><main><p>Hello world.</p></main>");
        assert_eq!(page.title, "Doc");
    }

    #[test]
    fn falls_back_to_heading() {
        let page = read("<main><h1>Heading</h1><p>Hello world.</p></main>");
        assert_eq!(page.title, "Heading");
    }

    #[test]
    fn extracts_body_paragraphs() {
        let page = read("<main><p>Hello world. This is readable.</p></main>");
        assert!(page.has_body());
        assert_eq!(
            page.paragraphs,
            vec!["Hello world. This is readable.".to_owned()]
        );
    }

    #[test]
    fn preserves_structured_reader_nodes() {
        let page = read(
            r#"
            <main>
              <h2>Install</h2>
              <ol><li>Install Rust</li><li>Run Index</li></ol>
              <pre><code class="language-sh">cargo install index</code></pre>
              <table><tr><th>Command</th></tr><tr><td>index</td></tr></table>
              <img src="/logo.png" alt="Logo">
              <form id="search" action="/search"><input name="q" required></form>
            </main>
            "#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Heading { level: 2, text } if text == "Install"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::List { ordered: true, items }
                if *items == ["Install Rust", "Run Index"]
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::CodeBlock { language: Some(language), .. } if language == "sh"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Table { rows } if rows.len() == 2
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Image { alt, .. } if alt == "Logo"
        )));
        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Form(form) if form.name == "search"
        )));
    }

    #[test]
    fn preserves_layout_spacers() {
        let page = read(
            r#"
            <head><style>.section { margin-bottom: 40px; }</style></head>
            <main><section class="section"><p>First.</p></section><p>Second.</p></main>
            "#,
        );

        assert!(has_node(&page, |node| matches!(
            node,
            ReadableNode::Spacer { lines } if *lines >= 1
        )));
    }

    #[test]
    fn preserves_collapsed_secondary_sections() {
        let page = read(
            r#"
            <nav aria-label="Site"><a href="/docs">Docs</a></nav>
            <main><p>Main body.</p></main>
            "#,
        );

        assert!(has_node(&page, |node| match node {
            ReadableNode::Section {
                role: ReadableSectionRole::Navigation,
                title: Some(title),
                collapsed: true,
                nodes,
            } => {
                title == "Site"
                    && matches!(
                        nodes.first(),
                        Some(ReadableNode::Link(link)) if link.text == "Docs"
                    )
            }
            _ => false,
        }));
    }

    #[test]
    fn carries_metadata_forward() {
        let page = read(
            r#"
            <html lang="en">
              <head>
                <meta name="description" content="Readable docs">
                <meta property="og:title" content="Index">
              </head>
              <main><p>Body.</p></main>
            </html>
            "#,
        );
        let metadata = &page.metadata;
        assert_eq!(metadata.language.as_deref(), Some("en"));
        assert_eq!(metadata.description.as_deref(), Some("Readable docs"));
        assert_eq!(metadata.open_graph_title.as_deref(), Some("Index"));
    }

    #[test]
    fn drops_anchor_only_paragraphs_but_keeps_link() {
        let page = read(r#"<main><p><a href="https://example.com">Read more</a></p></main>"#);
        assert!(page.paragraphs.is_empty());
        assert_eq!(page.links.len(), 1);
    }
}