Skip to main content

node_html_parser/dom/
text.rs

1use std::sync::OnceLock;
2
// Cached regular expression for whitespace detection. It is compiled on the
// first call to `TextNode::is_whitespace` and reused by every later call.
static WHITESPACE_REGEX: OnceLock<regex::Regex> = OnceLock::new();
5
/// A text node holding its raw (entity-encoded) text plus lazily computed,
/// memoized trimmed views of that text.
#[derive(Debug, Clone)]
pub struct TextNode {
	/// The text exactly as stored; HTML entities are NOT decoded here
	/// (decoding happens in `text()` / `trimmed_text()`).
	///
	/// NOTE(review): this field is public, but the caches below are only
	/// cleared by `set_raw`. Assigning to `raw` directly leaves any cached
	/// trimmed value stale — prefer `set_raw`.
	pub raw: String,
	/// Optional `(start, end)` pair supplied via `with_range`.
	/// Presumably offsets of this node in the parsed source — TODO confirm
	/// against the parser that constructs it.
	pub range: Option<(usize, usize)>,
	/// Memoized result of trimming `raw`; `None` until first requested.
	trimmed_raw_cache: Option<String>,
	/// Memoized result of trimming the entity-decoded `raw`; `None` until
	/// first requested.
	trimmed_txt_cache: Option<String>,
}
13
14impl TextNode {
15	pub fn new(raw: String) -> Self {
16		Self {
17			raw,
18			range: None,
19			trimmed_raw_cache: None,
20			trimmed_txt_cache: None,
21		}
22	}
23	pub fn with_range(raw: String, start: usize, end: usize) -> Self {
24		Self {
25			raw,
26			range: Some((start, end)),
27			trimmed_raw_cache: None,
28			trimmed_txt_cache: None,
29		}
30	}
31	pub fn range(&self) -> Option<(usize, usize)> {
32		self.range
33	}
34	fn invalidate(&mut self) {
35		self.trimmed_raw_cache = None;
36		self.trimmed_txt_cache = None;
37	}
38	pub fn set_raw(&mut self, v: String) {
39		self.raw = v;
40		self.invalidate();
41	}
42	fn trim_alg(text: &str) -> String {
43		if text.is_empty() {
44			return String::new();
45		}
46		let bytes = text.as_bytes();
47		let mut start = 0usize;
48		let mut end = bytes.len() - 1;
49		while start < bytes.len() {
50			if !bytes[start].is_ascii_whitespace() {
51				break;
52			}
53			start += 1;
54		}
55		while end > start {
56			if !bytes[end].is_ascii_whitespace() {
57				break;
58			}
59			end -= 1;
60		}
61		let has_leading = start > 0;
62		let has_trailing = end < bytes.len() - 1;
63		format!(
64			"{}{}{}",
65			if has_leading { " " } else { "" },
66			&text[start..=end],
67			if has_trailing { " " } else { "" }
68		)
69	}
70	pub fn trimmed_raw_text(&mut self) -> &str {
71		if self.trimmed_raw_cache.is_none() {
72			self.trimmed_raw_cache = Some(Self::trim_alg(&self.raw));
73		}
74		self.trimmed_raw_cache.as_ref().unwrap()
75	}
76	pub fn trimmed_text(&mut self) -> &str {
77		if self.trimmed_txt_cache.is_none() {
78			let dec = html_escape::decode_html_entities(&self.raw).to_string();
79			self.trimmed_txt_cache = Some(Self::trim_alg(&dec));
80		}
81		self.trimmed_txt_cache.as_ref().unwrap()
82	}
83	pub fn is_whitespace(&self) -> bool {
84		WHITESPACE_REGEX
85			.get_or_init(|| regex::Regex::new(r"^(?:\s|&nbsp;)*$").unwrap())
86			.is_match(&self.raw)
87	}
88	pub fn text(&self) -> String {
89		html_escape::decode_html_entities(&self.raw).to_string()
90	}
91	pub fn raw_text(&self) -> &str {
92		&self.raw
93	}
94	pub fn decoded_text(&self) -> String {
95		self.text()
96	}
97}