verso/reader/
plaintext.rs1use scraper::{Html, Selector};
2
3pub fn from_html(html: &str) -> String {
6 let doc = Html::parse_document(html);
8 let body_sel = Selector::parse("body, html").unwrap();
9
10 let mut out = String::new();
11 if let Some(root) = doc.select(&body_sel).next() {
13 walk(root, &mut out);
14 }
15 if out.is_empty() {
16 walk(doc.root_element(), &mut out);
18 }
19 normalise_whitespace(&out)
20}
21
22fn walk(node: scraper::ElementRef, out: &mut String) {
23 use scraper::Node;
24 for child in node.children() {
25 match child.value() {
26 Node::Text(t) => out.push_str(&collapse_spaces(t)),
27 Node::Element(el) => {
28 let name = el.name();
29 if matches!(name, "script" | "style" | "iframe" | "object" | "embed") {
30 continue;
31 }
32 let is_block = matches!(
33 name,
34 "p" | "div"
35 | "br"
36 | "h1"
37 | "h2"
38 | "h3"
39 | "h4"
40 | "h5"
41 | "h6"
42 | "li"
43 | "blockquote"
44 | "pre"
45 | "tr"
46 | "hr"
47 | "figure"
48 | "figcaption"
49 );
50 if is_block && !out.ends_with('\n') {
51 out.push('\n');
52 }
53 if let Some(er) = scraper::ElementRef::wrap(child) {
54 walk(er, out);
55 }
56 if matches!(
57 name,
58 "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "blockquote" | "pre" | "figure"
59 ) && !out.ends_with("\n\n")
60 {
61 out.push('\n');
62 }
63 }
64 _ => {}
65 }
66 }
67}
68
/// Tidies line-oriented text for display.
///
/// * Every line is trimmed on both sides and its internal whitespace runs are
///   collapsed to a single space.
/// * Any run of blank (empty or whitespace-only) lines between two non-blank
///   lines becomes exactly one blank line.
/// * Blank lines at the start and end of the input are removed.
///
/// Returns an empty string when `s` contains no non-whitespace characters.
fn normalise_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    // Set when at least one blank line has been seen since the last text line.
    // The separator is written lazily, just before the next text line, so no
    // trailing newlines ever need to be stripped.
    let mut pending_blank = false;

    for line in s.split('\n') {
        // `split_whitespace` both trims the line and splits it on whitespace
        // runs, so re-joining the words with ' ' collapses the spacing.
        let mut words = line.split_whitespace();
        let first = match words.next() {
            Some(word) => word,
            None => {
                // Blank lines before any text are dropped entirely.
                pending_blank = !out.is_empty();
                continue;
            }
        };

        if !out.is_empty() {
            out.push_str(if pending_blank { "\n\n" } else { "\n" });
        }
        pending_blank = false;

        out.push_str(first);
        for word in words {
            out.push(' ');
            out.push_str(word);
        }
    }
    out
}
89
/// Returns a copy of `s` in which each run of consecutive whitespace
/// characters is replaced by one ASCII space.
///
/// Runs at the beginning or end of `s` are collapsed too, not removed, so the
/// result may start and/or end with a single space.
fn collapse_spaces(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    let mut in_run = false;
    for c in s.chars() {
        let is_ws = c.is_whitespace();
        match (is_ws, in_run) {
            // Already inside a whitespace run: the space was emitted earlier.
            (true, true) => {}
            // First whitespace character of a run.
            (true, false) => collapsed.push(' '),
            // Ordinary character: copy through unchanged.
            (false, _) => collapsed.push(c),
        }
        in_run = is_ws;
    }
    collapsed
}