node_html_parser/dom/element/
text_ops.rs

1use super::main::HTMLElement;
2use crate::dom::node::Node;
3
// (Placeholder) If child nodes ever need a parallel map/reduce, rayon::ParallelIterator can be brought in when the `parallel` feature is enabled.
5#[cfg(feature = "parallel")]
6use rayon::prelude::*; // 预留:未来可将同级 Element 子树收集并行化
7
8impl HTMLElement {
9	pub fn structured_text(&self) -> String {
10		use std::collections::HashSet;
11		use std::sync::OnceLock;
12		static BLOCK: OnceLock<HashSet<&'static str>> = OnceLock::new();
13		let block = BLOCK.get_or_init(|| {
14			[
15				"h1",
16				"h2",
17				"h3",
18				"h4",
19				"h5",
20				"h6",
21				"header",
22				"hgroup",
23				"details",
24				"dialog",
25				"dd",
26				"div",
27				"dt",
28				"fieldset",
29				"figcaption",
30				"figure",
31				"footer",
32				"form",
33				"table",
34				"td",
35				"tr",
36				"address",
37				"article",
38				"aside",
39				"blockquote",
40				"br",
41				"hr",
42				"li",
43				"main",
44				"nav",
45				"ol",
46				"p",
47				"pre",
48				"section",
49				"ul",
50			]
51			.into_iter()
52			.collect()
53		});
54		// Each block: collected text fragments plus optional postponed whitespace flag
55		#[derive(Default)]
56		struct LineBlock {
57			parts: Vec<String>,
58			prepend_ws: bool,
59		}
60		let mut blocks: Vec<LineBlock> = vec![LineBlock::default()];
61		// 并行化策略:对同级子节点(Element/Text)收集片段时并行映射,再按顺序合并。
62		fn dfs(
63			cur: &HTMLElement,
64			block: &std::collections::HashSet<&'static str>,
65		) -> Vec<LineBlock> {
66			let tag = cur.name();
67			let is_block =
68				!cur.is_root() && (block.contains(tag) || block.contains(&tag.to_lowercase()[..]));
69			let children = &cur.children;
70			// 收集子节点处理结果
71			let mut acc: Vec<LineBlock> = Vec::new();
72			let mut current = LineBlock::default();
73			// 为保持顺序:仍按序遍历,但对 Element 内部递归结果使用 maybe_par_iter 预先收集(子树内部可再并行)。
74			for child in children {
75				match child {
76					Node::Element(e) => {
77						let cname = e.name();
78						let child_block =
79							block.contains(cname) || block.contains(&cname.to_lowercase()[..]);
80						if child_block && !current.parts.is_empty() {
81							acc.push(current);
82							current = LineBlock::default();
83						}
84						let sub_blocks = dfs(e, block); // 递归(内部自会并行展开)
85						for (i, sb) in sub_blocks.into_iter().enumerate() {
86							if i == 0 {
87								// 第一块并入 current
88								if current.prepend_ws && !sb.parts.is_empty() {
89									current.parts.push(format!(" {}", sb.parts.join("")));
90									current.prepend_ws = false;
91								} else {
92									current.parts.extend(sb.parts);
93								}
94								if sb.prepend_ws {
95									current.prepend_ws = true;
96								}
97							} else {
98								acc.push(current);
99								current = sb; // 将之前 current 推入,接手子块
100							}
101						}
102						if child_block && !current.parts.is_empty() {
103							acc.push(current);
104							current = LineBlock::default();
105						}
106					}
107					Node::Text(t0) => {
108						if t0.is_whitespace() {
109							current.prepend_ws = true;
110							continue;
111						}
112						let mut tc = t0.clone();
113						let txt = tc.trimmed_text().to_string();
114						if current.prepend_ws {
115							current.parts.push(format!(" {}", txt));
116							current.prepend_ws = false;
117						} else {
118							current.parts.push(txt);
119						}
120					}
121					Node::Comment(_) => {}
122				}
123			}
124			if !current.parts.is_empty() {
125				acc.push(current);
126			}
127			if is_block {
128				acc.push(LineBlock::default());
129			}
130			acc
131		}
132		let mut collected = dfs(self, block);
133		blocks.append(&mut collected);
134		blocks
135			.into_iter()
136			.filter(|b| !b.parts.is_empty())
137			.map(|b| {
138				let joined = b.parts.join("");
139				regex::Regex::new(r"\s{2,}")
140					.unwrap()
141					.replace_all(&joined, " ")
142					.to_string()
143			})
144			.collect::<Vec<_>>()
145			.join("\n")
146			.trim_end()
147			.to_string()
148	}
149
150	fn collect_structured_text(&self, buf: &mut String, is_root: bool) {
151		for (i, child) in self.children.iter().enumerate() {
152			match child {
153				Node::Text(t) => {
154					let txt = html_escape::decode_html_entities(&t.raw);
155					if !txt.trim().is_empty() {
156						buf.push_str(txt.trim());
157						if i + 1 < self.children.len() {
158							buf.push('\n');
159						}
160					}
161				}
162				Node::Element(e) => {
163					e.collect_structured_text(buf, false);
164				}
165				Node::Comment(_) => {}
166			}
167		}
168	}
169}