node_html_parser/dom/element/
text_ops.rs1use super::main::HTMLElement;
2use crate::dom::node::Node;
3
4#[cfg(feature = "parallel")]
6use rayon::prelude::*; impl HTMLElement {
9 pub fn structured_text(&self) -> String {
10 use std::collections::HashSet;
11 use std::sync::OnceLock;
12 static BLOCK: OnceLock<HashSet<&'static str>> = OnceLock::new();
13 let block = BLOCK.get_or_init(|| {
14 [
15 "h1",
16 "h2",
17 "h3",
18 "h4",
19 "h5",
20 "h6",
21 "header",
22 "hgroup",
23 "details",
24 "dialog",
25 "dd",
26 "div",
27 "dt",
28 "fieldset",
29 "figcaption",
30 "figure",
31 "footer",
32 "form",
33 "table",
34 "td",
35 "tr",
36 "address",
37 "article",
38 "aside",
39 "blockquote",
40 "br",
41 "hr",
42 "li",
43 "main",
44 "nav",
45 "ol",
46 "p",
47 "pre",
48 "section",
49 "ul",
50 ]
51 .into_iter()
52 .collect()
53 });
54 #[derive(Default)]
56 struct LineBlock {
57 parts: Vec<String>,
58 prepend_ws: bool,
59 }
60 let mut blocks: Vec<LineBlock> = vec![LineBlock::default()];
61 fn dfs(
63 cur: &HTMLElement,
64 block: &std::collections::HashSet<&'static str>,
65 ) -> Vec<LineBlock> {
66 let tag = cur.name();
67 let is_block =
68 !cur.is_root() && (block.contains(tag) || block.contains(&tag.to_lowercase()[..]));
69 let children = &cur.children;
70 let mut acc: Vec<LineBlock> = Vec::new();
72 let mut current = LineBlock::default();
73 for child in children {
75 match child {
76 Node::Element(e) => {
77 let cname = e.name();
78 let child_block =
79 block.contains(cname) || block.contains(&cname.to_lowercase()[..]);
80 if child_block && !current.parts.is_empty() {
81 acc.push(current);
82 current = LineBlock::default();
83 }
84 let sub_blocks = dfs(e, block); for (i, sb) in sub_blocks.into_iter().enumerate() {
86 if i == 0 {
87 if current.prepend_ws && !sb.parts.is_empty() {
89 current.parts.push(format!(" {}", sb.parts.join("")));
90 current.prepend_ws = false;
91 } else {
92 current.parts.extend(sb.parts);
93 }
94 if sb.prepend_ws {
95 current.prepend_ws = true;
96 }
97 } else {
98 acc.push(current);
99 current = sb; }
101 }
102 if child_block && !current.parts.is_empty() {
103 acc.push(current);
104 current = LineBlock::default();
105 }
106 }
107 Node::Text(t0) => {
108 if t0.is_whitespace() {
109 current.prepend_ws = true;
110 continue;
111 }
112 let mut tc = t0.clone();
113 let txt = tc.trimmed_text().to_string();
114 if current.prepend_ws {
115 current.parts.push(format!(" {}", txt));
116 current.prepend_ws = false;
117 } else {
118 current.parts.push(txt);
119 }
120 }
121 Node::Comment(_) => {}
122 }
123 }
124 if !current.parts.is_empty() {
125 acc.push(current);
126 }
127 if is_block {
128 acc.push(LineBlock::default());
129 }
130 acc
131 }
132 let mut collected = dfs(self, block);
133 blocks.append(&mut collected);
134 blocks
135 .into_iter()
136 .filter(|b| !b.parts.is_empty())
137 .map(|b| {
138 let joined = b.parts.join("");
139 regex::Regex::new(r"\s{2,}")
140 .unwrap()
141 .replace_all(&joined, " ")
142 .to_string()
143 })
144 .collect::<Vec<_>>()
145 .join("\n")
146 .trim_end()
147 .to_string()
148 }
149
150 fn collect_structured_text(&self, buf: &mut String, is_root: bool) {
151 for (i, child) in self.children.iter().enumerate() {
152 match child {
153 Node::Text(t) => {
154 let txt = html_escape::decode_html_entities(&t.raw);
155 if !txt.trim().is_empty() {
156 buf.push_str(txt.trim());
157 if i + 1 < self.children.len() {
158 buf.push('\n');
159 }
160 }
161 }
162 Node::Element(e) => {
163 e.collect_structured_text(buf, false);
164 }
165 Node::Comment(_) => {}
166 }
167 }
168 }
169}