scrape_core/
utils.rs

1//! Shared utility functions for HTML processing.
2//!
3//! This module provides common utilities used across the library and bindings
4//! for HTML text escaping, attribute escaping, and void element detection.
5
6use std::borrow::Cow;
7
/// Escapes `&`, `<`, and `>` so a string can be emitted as HTML text content.
///
/// Input that contains none of these three characters is handed back as
/// [`Cow::Borrowed`] without allocating. Otherwise a new `String` is built
/// and returned as [`Cow::Owned`], with each special character replaced by
/// `&amp;`, `&lt;`, or `&gt;` respectively. All other characters, including
/// quotes, are copied through unchanged.
///
/// # Examples
///
/// ```rust
/// use std::borrow::Cow;
///
/// use scrape_core::utils::escape_text;
///
/// // Nothing to escape: the original slice is returned as-is.
/// let plain = escape_text("Hello World");
/// assert!(matches!(plain, Cow::Borrowed(_)));
/// assert_eq!(plain, "Hello World");
///
/// // Markup characters force an owned, escaped copy.
/// let escaped = escape_text("<script>alert('xss')</script>");
/// assert!(matches!(escaped, Cow::Owned(_)));
/// assert_eq!(escaped, "&lt;script&gt;alert('xss')&lt;/script&gt;");
/// ```
#[must_use]
pub fn escape_text(s: &str) -> Cow<'_, str> {
    // Fast path: locate the first character that needs an entity; if there
    // is none, the input can be returned untouched.
    let first_special = match s.find(['&', '<', '>']) {
        Some(idx) => idx,
        None => return Cow::Borrowed(s),
    };

    // Everything before the first special character is copied verbatim in
    // one go; only the tail has to be inspected character by character.
    let (clean_prefix, tail) = s.split_at(first_special);
    let mut escaped = String::with_capacity(s.len());
    escaped.push_str(clean_prefix);

    for ch in tail.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            other => {
                escaped.push(other);
                continue;
            }
        };
        escaped.push_str(entity);
    }

    Cow::Owned(escaped)
}
53
/// Escapes `&`, `"`, `<`, and `>` so a string can be emitted as an HTML
/// attribute value.
///
/// Input that contains none of these four characters is handed back as
/// [`Cow::Borrowed`] without allocating. Otherwise a new `String` is built
/// and returned as [`Cow::Owned`], with each special character replaced by
/// `&amp;`, `&quot;`, `&lt;`, or `&gt;` respectively.
///
/// Note that `'` is not escaped; callers emitting single-quoted attributes
/// need additional handling.
///
/// # Examples
///
/// ```rust
/// use std::borrow::Cow;
///
/// use scrape_core::utils::escape_attr;
///
/// // Nothing to escape: the original slice is returned as-is.
/// let plain = escape_attr("simple-value");
/// assert!(matches!(plain, Cow::Borrowed(_)));
///
/// // Double quotes are replaced with `&quot;`.
/// let quoted = escape_attr("value with \"quotes\"");
/// assert_eq!(quoted, "value with &quot;quotes&quot;");
/// ```
#[must_use]
pub fn escape_attr(s: &str) -> Cow<'_, str> {
    // Fast path: no special characters means no allocation.
    let needs_escaping = s.contains(|c: char| matches!(c, '&' | '"' | '<' | '>'));
    if !needs_escaping {
        return Cow::Borrowed(s);
    }

    let mut escaped = String::with_capacity(s.len());
    // Byte offset up to which `s` has already been written to `escaped`.
    let mut copied_upto = 0;

    for (idx, ch) in s.char_indices() {
        let entity = match ch {
            '&' => "&amp;",
            '"' => "&quot;",
            '<' => "&lt;",
            '>' => "&gt;",
            _ => continue,
        };
        // Flush the untouched run preceding this character, then the entity.
        escaped.push_str(&s[copied_upto..idx]);
        escaped.push_str(entity);
        // Every escaped character is single-byte ASCII, so the next run
        // starts exactly one byte later.
        copied_upto = idx + 1;
    }
    // Flush whatever follows the last special character.
    escaped.push_str(&s[copied_upto..]);

    Cow::Owned(escaped)
}
97
/// Reports whether `name` is one of the HTML void element names.
///
/// Void elements cannot have content and are written without a closing
/// tag. The names recognised here are:
///
/// - `area`, `base`, `br`, `col`, `embed`, `hr`, `img`, `input`
/// - `link`, `meta`, `param`, `source`, `track`, `wbr`
///
/// The comparison is an exact string match, so it is case-sensitive:
/// `"br"` is recognised, `"BR"` is not.
///
/// # Examples
///
/// ```rust
/// use scrape_core::utils::is_void_element;
///
/// assert!(is_void_element("br"));
/// assert!(is_void_element("img"));
/// assert!(is_void_element("input"));
///
/// assert!(!is_void_element("div"));
/// assert!(!is_void_element("span"));
/// assert!(!is_void_element("p"));
/// ```
#[must_use]
pub fn is_void_element(name: &str) -> bool {
    /// Lowercase tag names treated as void elements.
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    VOID_ELEMENTS.iter().any(|&void| void == name)
}
139
#[cfg(test)]
mod tests {
    use super::*;

    /// Tag names that `is_void_element` must accept.
    const VOID_TAGS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    /// A sample of ordinary tag names that `is_void_element` must reject.
    const NON_VOID_TAGS: [&str; 10] =
        ["div", "span", "p", "a", "ul", "li", "table", "form", "script", "style"];

    // ---- escape_text ----

    /// Plain text comes back borrowed and unchanged.
    #[test]
    fn test_escape_text_no_special_chars() {
        let escaped = escape_text("Hello World");
        assert!(matches!(escaped, Cow::Borrowed(_)));
        assert_eq!(escaped, "Hello World");
    }

    /// An ampersand forces an owned copy with `&amp;`.
    #[test]
    fn test_escape_text_with_ampersand() {
        let escaped = escape_text("Tom & Jerry");
        assert!(matches!(escaped, Cow::Owned(_)));
        assert_eq!(escaped, "Tom &amp; Jerry");
    }

    /// Angle brackets become `&lt;` / `&gt;`.
    #[test]
    fn test_escape_text_with_angle_brackets() {
        assert_eq!(escape_text("<tag>"), "&lt;tag&gt;");
    }

    /// All three special characters in one input.
    #[test]
    fn test_escape_text_mixed() {
        assert_eq!(escape_text("1 < 2 & 2 > 1"), "1 &lt; 2 &amp; 2 &gt; 1");
    }

    /// The empty string takes the borrowed fast path.
    #[test]
    fn test_escape_text_empty() {
        let escaped = escape_text("");
        assert!(matches!(escaped, Cow::Borrowed(_)));
        assert_eq!(escaped, "");
    }

    // ---- escape_attr ----

    /// A plain attribute value comes back borrowed and unchanged.
    #[test]
    fn test_escape_attr_no_special_chars() {
        let escaped = escape_attr("simple-value");
        assert!(matches!(escaped, Cow::Borrowed(_)));
        assert_eq!(escaped, "simple-value");
    }

    /// Double quotes become `&quot;`.
    #[test]
    fn test_escape_attr_with_quotes() {
        assert_eq!(escape_attr("say \"hello\""), "say &quot;hello&quot;");
    }

    /// All four special characters in one input.
    #[test]
    fn test_escape_attr_mixed() {
        let escaped = escape_attr("<a href=\"&\">link</a>");
        assert_eq!(escaped, "&lt;a href=&quot;&amp;&quot;&gt;link&lt;/a&gt;");
    }

    // ---- is_void_element ----

    /// Every listed void element name is recognised.
    #[test]
    fn test_is_void_element_true() {
        for tag in VOID_TAGS {
            assert!(is_void_element(tag), "{tag} should be a void element");
        }
    }

    /// Ordinary element names are rejected.
    #[test]
    fn test_is_void_element_false() {
        for tag in NON_VOID_TAGS {
            assert!(!is_void_element(tag), "{tag} should not be a void element");
        }
    }
}