1use scraper::{Html, Node};
2
/// Inline formatting that applies to a run of text.
///
/// The default value has every flag cleared and no heading level, i.e. plain
/// body text.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct Style {
    /// Text sits inside a `<strong>` or `<b>` element.
    pub bold: bool,
    /// Text sits inside an `<em>` or `<i>` element.
    pub italic: bool,
    /// Text sits inside a `<code>`, `<kbd>` or `<samp>` element.
    pub code: bool,
    /// Text sits inside an `<a>` element.
    pub link: bool,
    /// Level (1 through 6) of the nearest enclosing `<h1>`..`<h6>` element,
    /// or `None` when the text is not inside a heading.
    pub heading: Option<u8>,
}
11
12#[derive(Debug, Clone, PartialEq, Eq)]
13pub struct Span {
14 pub text: String,
15 pub style: Style,
16 pub char_offset: usize,
18}
19
20pub fn to_spans(html: &str) -> Vec<Span> {
21 let doc = Html::parse_document(html);
22 let mut offset = 0usize;
23 let mut out = Vec::new();
24 walk(doc.root_element(), Style::default(), &mut offset, &mut out);
25 out
26}
27
28fn walk(node: scraper::ElementRef, style: Style, offset: &mut usize, out: &mut Vec<Span>) {
29 for child in node.children() {
30 match child.value() {
31 Node::Text(t) => {
32 let text = t.to_string();
33 if text.is_empty() {
34 continue;
35 }
36 let len = text.chars().count();
37 out.push(Span {
38 text,
39 style: style.clone(),
40 char_offset: *offset,
41 });
42 *offset += len;
43 }
44 Node::Element(el) => {
45 let name = el.name();
46 if matches!(name, "script" | "style" | "iframe" | "object" | "embed") {
47 continue;
48 }
49 let mut s = style.clone();
50 match name {
51 "em" | "i" => s.italic = true,
52 "strong" | "b" => s.bold = true,
53 "code" | "kbd" | "samp" => s.code = true,
54 "a" => s.link = true,
55 "h1" => s.heading = Some(1),
56 "h2" => s.heading = Some(2),
57 "h3" => s.heading = Some(3),
58 "h4" => s.heading = Some(4),
59 "h5" => s.heading = Some(5),
60 "h6" => s.heading = Some(6),
61 _ => {}
62 }
63 if let Some(er) = scraper::ElementRef::wrap(child) {
64 walk(er, s, offset, out);
65 }
66 }
67 _ => {}
68 }
69 }
70}