Skip to main content

verso/reader/
plaintext.rs

1use scraper::{Html, Selector};
2
3/// Produce the canonical plain-text extraction used for the location model.
4/// Must be deterministic and stable — `char_offset` semantics depend on it.
5pub fn from_html(html: &str) -> String {
6    // Parse once; traverse in document order.
7    let doc = Html::parse_document(html);
8    let body_sel = Selector::parse("body, html").unwrap();
9
10    let mut out = String::new();
11    // First body element only.
12    if let Some(root) = doc.select(&body_sel).next() {
13        walk(root, &mut out);
14    }
15    if out.is_empty() {
16        // Fallback: some EPUB chapters are fragments without a body.
17        walk(doc.root_element(), &mut out);
18    }
19    normalise_whitespace(&out)
20}
21
22fn walk(node: scraper::ElementRef, out: &mut String) {
23    use scraper::Node;
24    for child in node.children() {
25        match child.value() {
26            Node::Text(t) => out.push_str(&collapse_spaces(t)),
27            Node::Element(el) => {
28                let name = el.name();
29                if matches!(name, "script" | "style" | "iframe" | "object" | "embed") {
30                    continue;
31                }
32                let is_block = matches!(
33                    name,
34                    "p" | "div"
35                        | "br"
36                        | "h1"
37                        | "h2"
38                        | "h3"
39                        | "h4"
40                        | "h5"
41                        | "h6"
42                        | "li"
43                        | "blockquote"
44                        | "pre"
45                        | "tr"
46                        | "hr"
47                        | "figure"
48                        | "figcaption"
49                );
50                if is_block && !out.ends_with('\n') {
51                    out.push('\n');
52                }
53                if let Some(er) = scraper::ElementRef::wrap(child) {
54                    walk(er, out);
55                }
56                if matches!(
57                    name,
58                    "p" | "h1" | "h2" | "h3" | "h4" | "h5" | "h6" | "blockquote" | "pre" | "figure"
59                ) && !out.ends_with("\n\n")
60                {
61                    out.push('\n');
62                }
63            }
64            _ => {}
65        }
66    }
67}
68
69fn normalise_whitespace(s: &str) -> String {
70    let mut out = String::with_capacity(s.len());
71    let mut last_blank = false;
72    for line in s.split('\n') {
73        let trimmed = collapse_spaces(line.trim_end());
74        if trimmed.is_empty() {
75            if !last_blank && !out.is_empty() {
76                out.push_str("\n\n");
77                last_blank = true;
78            }
79        } else {
80            if !out.is_empty() && !out.ends_with("\n\n") && !out.ends_with('\n') {
81                out.push('\n');
82            }
83            out.push_str(&trimmed);
84            last_blank = false;
85        }
86    }
87    out.trim().to_string()
88}
89
/// Replace every run of whitespace characters in `s` with one ASCII space.
///
/// Leading and trailing runs are collapsed too, not removed, so the result
/// may still begin or end with a single space.
fn collapse_spaces(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    // Whether the previous character belonged to a whitespace run.
    let mut in_run = false;

    for c in s.chars() {
        let is_ws = c.is_whitespace();
        if !is_ws {
            collapsed.push(c);
        } else if !in_run {
            // Only the first character of a run produces output.
            collapsed.push(' ');
        }
        in_run = is_ws;
    }

    collapsed
}