1use scraper::{ElementRef, Element};
4use url::Url;
5use std::collections::HashSet;
6
/// Upper-case tag names of the elements treated as phrasing (inline) content.
///
/// Lookups against this table are exact string comparisons, so every entry
/// must stay in upper case; [`is_phrasing_content`] upper-cases its argument
/// before searching here.
pub const PHRASING_ELEMS: &[&str] = &[
    // A - C
    "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE",
    // D - I
    "DATA", "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT",
    // K - O
    "KBD", "LABEL", "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT",
    // P - S
    "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB", "SUP",
    // T - W
    "TEXTAREA", "TIME", "VAR", "WBR",
];
15
/// Lower-case names of the purely presentational attributes.
///
/// [`should_clean_attribute`] lower-cases its argument and looks it up here,
/// so every entry must stay in lower case.
pub const PRESENTATIONAL_ATTRIBUTES: &[&str] = &[
    "align",
    "background",
    "bgcolor",
    "border",
    "cellpadding",
    "cellspacing",
    "frame",
    "hspace",
    "rules",
    "style",
    "valign",
    "vspace",
];
24
25pub fn to_absolute_uri(uri: &str, base_uri: &str) -> String {
27 if uri.starts_with('#') {
29 return uri.to_string();
30 }
31
32 match Url::parse(base_uri) {
34 Ok(base) => {
35 match base.join(uri) {
36 Ok(absolute_url) => absolute_url.to_string(),
37 Err(_) => uri.to_string(), }
39 }
40 Err(_) => uri.to_string(), }
42}
43
44pub fn is_url(text: &str) -> bool {
46 Url::parse(text).is_ok()
47}
48
49pub fn get_inner_text(element: &ElementRef, normalize_spaces: bool) -> String {
51 let text = element.text().collect::<Vec<_>>().join(" ");
52 if normalize_spaces {
53 normalize_whitespace(&text)
54 } else {
55 text
56 }
57}
58
/// Collapses every run of whitespace in `text` into a single ASCII space and
/// strips leading and trailing whitespace.
///
/// Whitespace-only or empty input yields an empty string.
pub fn normalize_whitespace(text: &str) -> String {
    // `split_whitespace` drops leading/trailing whitespace and never yields
    // empty pieces, so re-joining with one space gives the collapsed form.
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}
79
/// Counts characters in `text`.
///
/// With `Some(separator)` the result is the number of occurrences of that
/// separator character; with `None` it is the total number of `char`s in
/// `text`.
pub fn get_char_count(text: &str, separator: Option<char>) -> usize {
    match separator {
        Some(sep) => text.matches(sep).count(),
        None => text.chars().count(),
    }
}
88
89pub fn is_phrasing_content(tag_name: &str) -> bool {
91 PHRASING_ELEMS.contains(&tag_name.to_uppercase().as_str())
92}
93
94pub fn is_single_image(element: &ElementRef) -> bool {
96 let tag_name = element.value().name().to_uppercase();
97 if tag_name == "IMG" {
98 return true;
99 }
100
101 let children: Vec<_> = element.children().collect();
103 if children.len() == 1 {
104 if let Some(child_element) = children[0].value().as_element() {
105 return child_element.name().to_uppercase() == "IMG";
106 }
107 }
108
109 false
110}
111
112pub fn is_node_visible(element: &ElementRef) -> bool {
114 let style = element.value().attr("style").unwrap_or("");
115
116 if style.contains("display:none") || style.contains("display: none") {
118 return false;
119 }
120
121 if style.contains("visibility:hidden") || style.contains("visibility: hidden") {
123 return false;
124 }
125
126 if element.value().attr("hidden").is_some() {
128 return false;
129 }
130
131 if element.value().attr("aria-hidden") == Some("true") {
133 return false;
134 }
135
136 true
137}
138
139pub fn has_ancestor_tag(
141 element: &ElementRef,
142 tag_name: &str,
143 max_depth: Option<usize>,
144 filter_fn: Option<fn(&ElementRef) -> bool>
145) -> bool {
146 let target_tag = tag_name.to_uppercase();
147 let mut current = element.parent_element();
148 let mut depth = 0;
149
150 while let Some(parent) = current {
151 if let Some(max) = max_depth {
152 if depth >= max {
153 break;
154 }
155 }
156
157 if parent.value().name().to_uppercase() == target_tag {
158 if let Some(filter) = filter_fn {
159 if filter(&parent) {
160 return true;
161 }
162 } else {
163 return true;
164 }
165 }
166
167 current = parent.parent_element();
168 depth += 1;
169 }
170
171 false
172}
173
174pub fn get_node_ancestors<'a>(element: &'a ElementRef<'a>, max_depth: usize) -> Vec<ElementRef<'a>> {
176 let mut ancestors = Vec::new();
177 let mut current = element.parent();
178 let mut depth = 0;
179
180 while let Some(parent) = current {
181 if depth >= max_depth {
182 break;
183 }
184
185 if let Some(parent_element) = ElementRef::wrap(parent) {
186 ancestors.push(parent_element);
187 current = parent.parent();
188 depth += 1;
189 } else {
190 break;
191 }
192 }
193
194 ancestors
195}
196
197pub fn is_element_without_content(element: &ElementRef) -> bool {
201 let tag_name = element.value().name().to_uppercase();
202
203 match tag_name.as_str() {
204 "IMG" | "VIDEO" | "AUDIO" | "EMBED" | "OBJECT" | "IFRAME" => false,
205 _ => {
206 let text_content = get_inner_text(element, true);
207 text_content.is_empty()
208 }
209 }
210}
211
212pub fn has_single_tag_inside_element(element: &ElementRef, tag: &str) -> bool {
214 let children: Vec<_> = element.children()
215 .filter_map(|child| child.value().as_element())
216 .collect();
217
218 children.len() == 1 &&
219 children[0].name().eq_ignore_ascii_case(tag)
220}
221
222pub fn has_child_block_element(element: &ElementRef) -> bool {
224 for child in element.children() {
225 if let Some(child_element) = child.value().as_element() {
226 let tag_name = child_element.name().to_uppercase();
227 if !is_phrasing_content(&tag_name) {
228 return true;
229 }
230 }
231 }
232 false
233}
234
235pub fn should_clean_attribute(attr_name: &str) -> bool {
237 PRESENTATIONAL_ATTRIBUTES.contains(&attr_name.to_lowercase().as_str())
238}
239
240pub fn extract_text_content(element: &ElementRef) -> String {
242 element.text().collect::<Vec<_>>().join(" ")
243}
244
/// Counts the whitespace-separated words in `text`.
///
/// Leading, trailing and repeated whitespace never produces empty words.
pub fn word_count(text: &str) -> usize {
    text.split(char::is_whitespace)
        .filter(|piece| !piece.is_empty())
        .count()
}
249
250pub fn is_title_candidate(text: &str, current_title: Option<&str>) -> bool {
252 let word_count = word_count(text);
253
254 if word_count < 2 || word_count > 10 || text.len() > 80 {
256 return false;
257 }
258
259 if let Some(title) = current_title {
261 let similarity = text_similarity(text, title);
262 similarity > 0.3 } else {
264 true
265 }
266}
267
/// Computes the Jaccard similarity of the word sets of two texts.
///
/// Each text is split on whitespace into a set of distinct words (the
/// comparison is case-sensitive). The result is `|A ∩ B| / |A ∪ B|`, with two
/// special cases: two empty word sets give `1.0`, and exactly one empty word
/// set gives `0.0`.
pub fn text_similarity(text_a: &str, text_b: &str) -> f64 {
    let left: HashSet<&str> = text_a.split_whitespace().collect();
    let right: HashSet<&str> = text_b.split_whitespace().collect();

    match (left.is_empty(), right.is_empty()) {
        (true, true) => 1.0,
        (true, false) | (false, true) => 0.0,
        (false, false) => {
            let shared = left.intersection(&right).count();
            // |A ∪ B| = |A| + |B| - |A ∩ B|
            let combined = left.len() + right.len() - shared;
            shared as f64 / combined as f64
        }
    }
}
286
/// Decodes a small, fixed set of HTML character references in `text`.
///
/// Recognised references: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#x27;`,
/// `&#39;` and `&apos;`. Anything else, including unknown references and bare
/// ampersands, is copied through unchanged.
///
/// The input is decoded in a single left-to-right pass, so text produced by
/// one replacement is never decoded a second time: `&amp;lt;` becomes `&lt;`,
/// not `<`.
pub fn unescape_html_entities(text: &str) -> String {
    const ENTITIES: [(&str, char); 7] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#x27;", '\''),
        ("&#39;", '\''),
        ("&apos;", '\''),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(pos) = rest.find('&') {
        decoded.push_str(&rest[..pos]);
        rest = &rest[pos..];

        match ENTITIES.iter().find(|(name, _)| rest.starts_with(*name)) {
            Some((name, replacement)) => {
                decoded.push(*replacement);
                rest = &rest[name.len()..];
            }
            None => {
                // Not a reference we know: keep the ampersand literally and
                // continue scanning right after it ('&' is one byte long).
                decoded.push('&');
                rest = &rest[1..];
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
300
301pub fn clean_text(text: &str) -> String {
303 let unescaped = unescape_html_entities(text);
304 normalize_whitespace(&unescaped)
305}
306
307pub fn get_link_density(element: &ElementRef) -> f64 {
309 let total_text_length = get_inner_text(element, false).len();
310 if total_text_length == 0 {
311 return 0.0;
312 }
313
314 let mut link_text_length = 0;
316 for descendant in element.descendants() {
317 if let Some(descendant_element) = descendant.value().as_element() {
318 if descendant_element.name().eq_ignore_ascii_case("a") {
319 let link_element = ElementRef::wrap(descendant).unwrap();
320 link_text_length += get_inner_text(&link_element, false).len();
321 }
322 }
323 }
324
325 link_text_length as f64 / total_text_length as f64
326}
327
/// Unit tests for the string- and URL-based helpers of this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_whitespace() {
        assert_eq!(normalize_whitespace("hello world\n\ntest"), "hello world test");
        assert_eq!(normalize_whitespace(" \n\t "), "");
        assert_eq!(normalize_whitespace("single"), "single");
    }

    #[test]
    fn test_word_count() {
        assert_eq!(word_count("hello world"), 2);
        assert_eq!(word_count(" hello world test "), 3);
        assert_eq!(word_count(""), 0);
    }

    #[test]
    fn test_text_similarity() {
        assert_eq!(text_similarity("hello world", "hello world"), 1.0);
        assert!(text_similarity("hello world", "hello there") > 0.0);
        assert!(text_similarity("hello world", "hello there") < 1.0);
        assert_eq!(text_similarity("hello", "world"), 0.0);
        assert_eq!(text_similarity("", ""), 1.0);
    }

    #[test]
    fn test_is_url() {
        assert!(is_url("https://example.com"));
        assert!(is_url("http://example.com"));
        assert!(!is_url("not a url"));
        assert!(!is_url(""));
    }

    #[test]
    fn test_to_absolute_uri() {
        let base = "https://example.com/path/";
        assert_eq!(to_absolute_uri("#anchor", base), "#anchor");
        assert_eq!(to_absolute_uri("/absolute", base), "https://example.com/absolute");
        assert_eq!(to_absolute_uri("relative", base), "https://example.com/path/relative");
    }

    #[test]
    fn test_is_phrasing_content() {
        assert!(is_phrasing_content("span"));
        assert!(is_phrasing_content("STRONG"));
        assert!(!is_phrasing_content("div"));
        assert!(!is_phrasing_content("section"));
    }

    #[test]
    fn test_unescape_html_entities() {
        assert_eq!(unescape_html_entities("&lt;div&gt;"), "<div>");
        assert_eq!(unescape_html_entities("&quot;hello&quot;"), "\"hello\"");
        // Only the escaped ampersand is decoded here; the remaining
        // `&nbsp;` is expected to come back verbatim.
        assert_eq!(unescape_html_entities("&amp;nbsp;"), "&nbsp;");
    }

    #[test]
    fn test_is_title_candidate() {
        assert!(is_title_candidate("A Great Article Title", None));
        // A single word is below the minimum word count.
        assert!(!is_title_candidate("A", None));
        // Thirteen words exceed the maximum word count.
        assert!(!is_title_candidate(
            "This is way too long to be a reasonable title for an article",
            None
        ));
    }

    #[test]
    fn test_get_char_count() {
        assert_eq!(get_char_count("hello,world,test", Some(',')), 2);
        assert_eq!(get_char_count("hello world", None), 11);
    }
}