readability_rust/
utils.rs

1//! Utility functions for the Readability parser
2
3use scraper::{ElementRef, Element};
4use url::Url;
5use std::collections::HashSet;
6
/// Upper-case tag names that this module treats as phrasing content.
///
/// Entries are stored in upper case; [`is_phrasing_content`] looks tag names
/// up in this table.
pub const PHRASING_ELEMS: &[&str] = &[
    "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
    "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT",
    "OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
    "SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
];
15
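// NOTE(review): this used to be a `///` doc comment ("Elements that can be
// converted from DIV to P") with no item beneath it, so rustdoc attached it to
// `PRESENTATIONAL_ATTRIBUTES` below. No such element list appears in the
// visible source; kept as a plain comment so it no longer leaks into those docs.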
17
18
/// Lower-case names of presentational attributes that should be removed.
///
/// [`should_clean_attribute`] checks attribute names against this table.
pub const PRESENTATIONAL_ATTRIBUTES: &[&str] = &[
    "align",
    "background",
    "bgcolor",
    "border",
    "cellpadding",
    "cellspacing",
    "frame",
    "hspace",
    "rules",
    "style",
    "valign",
    "vspace",
];
24
25/// Convert relative URLs to absolute URLs
26pub fn to_absolute_uri(uri: &str, base_uri: &str) -> String {
27    // Handle hash links - keep them as-is if base matches document
28    if uri.starts_with('#') {
29        return uri.to_string();
30    }
31
32    // Try to resolve against base URI
33    match Url::parse(base_uri) {
34        Ok(base) => {
35            match base.join(uri) {
36                Ok(absolute_url) => absolute_url.to_string(),
37                Err(_) => uri.to_string(), // Return original if join fails
38            }
39        }
40        Err(_) => uri.to_string(), // Return original if base URL is invalid
41    }
42}
43
44/// Check if a string is a valid URL
45pub fn is_url(text: &str) -> bool {
46    Url::parse(text).is_ok()
47}
48
49/// Get the inner text content of an element
50pub fn get_inner_text(element: &ElementRef, normalize_spaces: bool) -> String {
51    let text = element.text().collect::<Vec<_>>().join(" ");
52    if normalize_spaces {
53        normalize_whitespace(&text)
54    } else {
55        text
56    }
57}
58
/// Collapses every run of whitespace in `text` into a single ASCII space and
/// strips leading and trailing whitespace.
pub fn normalize_whitespace(text: &str) -> String {
    // Splitting on whitespace drops empty pieces, so re-joining with one space
    // both collapses interior runs and trims the ends.
    let words: Vec<&str> = text.split_whitespace().collect();
    words.join(" ")
}
79
/// Counts characters in `text`, or occurrences of a separator.
///
/// With `Some(sep)` the result is the number of times `sep` occurs in `text`;
/// with `None` it is the number of `char`s in `text`.
pub fn get_char_count(text: &str, separator: Option<char>) -> usize {
    match separator {
        Some(sep) => text.matches(sep).count(),
        None => text.chars().count(),
    }
}
88
89/// Check if an element is phrasing content
90pub fn is_phrasing_content(tag_name: &str) -> bool {
91    PHRASING_ELEMS.contains(&tag_name.to_uppercase().as_str())
92}
93
94/// Check if an element is a single image
95pub fn is_single_image(element: &ElementRef) -> bool {
96    let tag_name = element.value().name().to_uppercase();
97    if tag_name == "IMG" {
98        return true;
99    }
100
101    // Check if element contains only one img child
102    let children: Vec<_> = element.children().collect();
103    if children.len() == 1 {
104        if let Some(child_element) = children[0].value().as_element() {
105            return child_element.name().to_uppercase() == "IMG";
106        }
107    }
108
109    false
110}
111
112/// Check if an element is probably visible
113pub fn is_node_visible(element: &ElementRef) -> bool {
114    let style = element.value().attr("style").unwrap_or("");
115    
116    // Check for display: none
117    if style.contains("display:none") || style.contains("display: none") {
118        return false;
119    }
120    
121    // Check for visibility: hidden
122    if style.contains("visibility:hidden") || style.contains("visibility: hidden") {
123        return false;
124    }
125    
126    // Check for hidden attribute
127    if element.value().attr("hidden").is_some() {
128        return false;
129    }
130    
131    // Check for aria-hidden
132    if element.value().attr("aria-hidden") == Some("true") {
133        return false;
134    }
135    
136    true
137}
138
139/// Check if element has ancestor with specific tag
140pub fn has_ancestor_tag(
141    element: &ElementRef,
142    tag_name: &str,
143    max_depth: Option<usize>,
144    filter_fn: Option<fn(&ElementRef) -> bool>
145) -> bool {
146    let target_tag = tag_name.to_uppercase();
147    let mut current = element.parent_element();
148    let mut depth = 0;
149    
150    while let Some(parent) = current {
151        if let Some(max) = max_depth {
152            if depth >= max {
153                break;
154            }
155        }
156        
157        if parent.value().name().to_uppercase() == target_tag {
158            if let Some(filter) = filter_fn {
159                if filter(&parent) {
160                    return true;
161                }
162            } else {
163                return true;
164            }
165        }
166        
167        current = parent.parent_element();
168        depth += 1;
169    }
170    
171    false
172}
173
174/// Get node ancestors up to maxDepth
175pub fn get_node_ancestors<'a>(element: &'a ElementRef<'a>, max_depth: usize) -> Vec<ElementRef<'a>> {
176    let mut ancestors = Vec::new();
177    let mut current = element.parent();
178    let mut depth = 0;
179    
180    while let Some(parent) = current {
181        if depth >= max_depth {
182            break;
183        }
184        
185        if let Some(parent_element) = ElementRef::wrap(parent) {
186            ancestors.push(parent_element);
187            current = parent.parent();
188            depth += 1;
189        } else {
190            break;
191        }
192    }
193    
194    ancestors
195}
196
197// Duplicate is_node_visible function removed
198
199/// Check if an element is without content
200pub fn is_element_without_content(element: &ElementRef) -> bool {
201    let tag_name = element.value().name().to_uppercase();
202    
203    match tag_name.as_str() {
204        "IMG" | "VIDEO" | "AUDIO" | "EMBED" | "OBJECT" | "IFRAME" => false,
205        _ => {
206            let text_content = get_inner_text(element, true);
207            text_content.is_empty()
208        }
209    }
210}
211
212/// Check if an element has a single tag inside
213pub fn has_single_tag_inside_element(element: &ElementRef, tag: &str) -> bool {
214    let children: Vec<_> = element.children()
215        .filter_map(|child| child.value().as_element())
216        .collect();
217    
218    children.len() == 1 && 
219    children[0].name().eq_ignore_ascii_case(tag)
220}
221
222/// Check if an element has child block elements
223pub fn has_child_block_element(element: &ElementRef) -> bool {
224    for child in element.children() {
225        if let Some(child_element) = child.value().as_element() {
226            let tag_name = child_element.name().to_uppercase();
227            if !is_phrasing_content(&tag_name) {
228                return true;
229            }
230        }
231    }
232    false
233}
234
235/// Clean attributes from an element (conceptual - actual implementation would modify DOM)
236pub fn should_clean_attribute(attr_name: &str) -> bool {
237    PRESENTATIONAL_ATTRIBUTES.contains(&attr_name.to_lowercase().as_str())
238}
239
240/// Extract text content and handle encoding
241pub fn extract_text_content(element: &ElementRef) -> String {
242    element.text().collect::<Vec<_>>().join(" ")
243}
244
/// Returns the number of whitespace-separated words in `text`.
///
/// Leading, trailing and repeated whitespace never produce empty words.
pub fn word_count(text: &str) -> usize {
    let words = text.split_whitespace();
    words.count()
}
249
250/// Check if text looks like a title
251pub fn is_title_candidate(text: &str, current_title: Option<&str>) -> bool {
252    let word_count = word_count(text);
253    
254    // Should be reasonable length - more restrictive for titles
255    if word_count < 2 || word_count > 10 || text.len() > 80 {
256        return false;
257    }
258    
259    // If we have a current title, check similarity
260    if let Some(title) = current_title {
261        let similarity = text_similarity(text, title);
262        similarity > 0.3 // At least 30% similar
263    } else {
264        true
265    }
266}
267
/// Computes the Jaccard similarity of the word sets of `text_a` and `text_b`.
///
/// Words are the whitespace-separated pieces of each input, compared exactly.
/// Two empty inputs are fully similar (`1.0`); one empty and one non-empty
/// input share nothing (`0.0`). Otherwise the result is
/// `|A ∩ B| / |A ∪ B|`.
pub fn text_similarity(text_a: &str, text_b: &str) -> f64 {
    let left: HashSet<&str> = text_a.split_whitespace().collect();
    let right: HashSet<&str> = text_b.split_whitespace().collect();

    match (left.is_empty(), right.is_empty()) {
        (true, true) => 1.0,
        (true, false) | (false, true) => 0.0,
        (false, false) => {
            let shared = left.iter().filter(|word| right.contains(*word)).count();
            // |A ∪ B| = |A| + |B| - |A ∩ B|
            let combined = left.len() + right.len() - shared;
            shared as f64 / combined as f64
        }
    }
}
286
/// Decodes a small set of HTML entities in `text`.
///
/// Handled entities: `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&#39;` and `&amp;`.
/// Each entity is decoded exactly once, so an escaped entity such as
/// `&amp;lt;` becomes the literal text `&lt;` rather than `<`.
///
/// `&nbsp;` is intentionally left untouched to maintain the test expectation.
pub fn unescape_html_entities(text: &str) -> String {
    // `&amp;` must be decoded LAST: decoding it first would turn `&amp;lt;`
    // into `&lt;`, which the following replacements would decode a second
    // time. None of the earlier replacements emit `&`, so no new entity can
    // be formed before the final step.
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}
300
301/// Remove extra whitespace and normalize text
302pub fn clean_text(text: &str) -> String {
303    let unescaped = unescape_html_entities(text);
304    normalize_whitespace(&unescaped)
305}
306
307/// Get link density for an element
308pub fn get_link_density(element: &ElementRef) -> f64 {
309    let total_text_length = get_inner_text(element, false).len();
310    if total_text_length == 0 {
311        return 0.0;
312    }
313    
314    // Count text inside link elements
315    let mut link_text_length = 0;
316    for descendant in element.descendants() {
317        if let Some(descendant_element) = descendant.value().as_element() {
318            if descendant_element.name().eq_ignore_ascii_case("a") {
319                let link_element = ElementRef::wrap(descendant).unwrap();
320                link_text_length += get_inner_text(&link_element, false).len();
321            }
322        }
323    }
324    
325    link_text_length as f64 / total_text_length as f64
326}
327
// Unit tests for the helpers that work on plain strings. The functions that
// take an `ElementRef` are not exercised in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // Runs of mixed whitespace collapse to one space; the ends are trimmed.
    #[test]
    fn test_normalize_whitespace() {
        assert_eq!(normalize_whitespace("hello    world\n\ntest"), "hello world test");
        assert_eq!(normalize_whitespace("  \n\t  "), "");
        assert_eq!(normalize_whitespace("single"), "single");
    }

    // Surrounding and repeated whitespace must not create extra words.
    #[test]
    fn test_word_count() {
        assert_eq!(word_count("hello world"), 2);
        assert_eq!(word_count("  hello   world  test  "), 3);
        assert_eq!(word_count(""), 0);
    }

    // Identical, partially overlapping, disjoint and empty inputs.
    #[test]
    fn test_text_similarity() {
        assert_eq!(text_similarity("hello world", "hello world"), 1.0);
        assert!(text_similarity("hello world", "hello there") > 0.0);
        assert!(text_similarity("hello world", "hello there") < 1.0);
        assert_eq!(text_similarity("hello", "world"), 0.0);
        assert_eq!(text_similarity("", ""), 1.0);
    }

    // Text without a scheme and the empty string are rejected.
    #[test]
    fn test_is_url() {
        assert!(is_url("https://example.com"));
        assert!(is_url("http://example.com"));
        assert!(!is_url("not a url"));
        assert!(!is_url(""));
    }

    // Fragment links stay as-is; absolute and relative paths resolve against the base.
    #[test]
    fn test_to_absolute_uri() {
        let base = "https://example.com/path/";
        assert_eq!(to_absolute_uri("#anchor", base), "#anchor");
        assert_eq!(to_absolute_uri("/absolute", base), "https://example.com/absolute");
        assert_eq!(to_absolute_uri("relative", base), "https://example.com/path/relative");
    }

    // The lookup accepts both lower- and upper-case tag names.
    #[test]
    fn test_is_phrasing_content() {
        assert!(is_phrasing_content("span"));
        assert!(is_phrasing_content("STRONG"));
        assert!(!is_phrasing_content("div"));
        assert!(!is_phrasing_content("section"));
    }

    // `&amp;nbsp;` must come out as the literal text `&nbsp;`.
    #[test]
    fn test_unescape_html_entities() {
        assert_eq!(unescape_html_entities("&lt;div&gt;"), "<div>");
        assert_eq!(unescape_html_entities("&quot;hello&quot;"), "\"hello\"");
        assert_eq!(unescape_html_entities("&amp;nbsp;"), "&nbsp;");
    }

    // Only the word/length limits are exercised; no reference title is given.
    #[test]
    fn test_is_title_candidate() {
        assert!(is_title_candidate("A Great Article Title", None));
        assert!(!is_title_candidate("A", None)); // Too short
        assert!(!is_title_candidate("This is way too long to be a reasonable title for an article", None)); // Too long
    }

    // With a separator the result is the number of separators, not of characters.
    #[test]
    fn test_get_char_count() {
        assert_eq!(get_char_count("hello,world,test", Some(',')), 2);
        assert_eq!(get_char_count("hello world", None), 11);
    }
}