scrape_core/utils.rs
1//! Shared utility functions for HTML processing.
2//!
3//! This module provides common utilities used across the library and bindings
4//! for HTML text escaping, attribute escaping, and void element detection.
5
6use std::borrow::Cow;
7
/// Escapes special characters for HTML text content.
///
/// Replaces `&` with `&amp;`, `<` with `&lt;`, and `>` with `&gt;`. Every
/// other character (including quotes) is copied through unchanged.
///
/// Returns the input borrowed when it contains none of those characters,
/// so the common case performs no allocation.
///
/// # Performance
///
/// A single scan decides whether any escaping is required; only when it is
/// does the function allocate and build a new string.
///
/// # Examples
///
/// ```rust
/// use std::borrow::Cow;
///
/// use scrape_core::utils::escape_text;
///
/// // No escaping needed - returns borrowed reference
/// let result = escape_text("Hello World");
/// assert!(matches!(result, Cow::Borrowed(_)));
/// assert_eq!(result, "Hello World");
///
/// // Escaping needed - returns owned string
/// let result = escape_text("<script>alert('xss')</script>");
/// assert!(matches!(result, Cow::Owned(_)));
/// assert_eq!(result, "&lt;script&gt;alert('xss')&lt;/script&gt;");
/// ```
#[must_use]
pub fn escape_text(s: &str) -> Cow<'_, str> {
    // Fast path: nothing to escape, so hand the input back untouched.
    if !s.contains(['&', '<', '>']) {
        return Cow::Borrowed(s);
    }

    // `s.len()` is a lower bound: every escaped character expands to several
    // bytes, so the buffer may still grow, but never from zero.
    let mut result = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => result.push_str("&amp;"),
            '<' => result.push_str("&lt;"),
            '>' => result.push_str("&gt;"),
            _ => result.push(c),
        }
    }
    Cow::Owned(result)
}
53
/// Escapes special characters for HTML attribute values.
///
/// Replaces `&` with `&amp;`, `"` with `&quot;`, `<` with `&lt;`, and `>`
/// with `&gt;`. Every other character (including `'`) is copied through
/// unchanged, so the result is only safe inside a double-quoted attribute.
///
/// Returns the input borrowed when it contains none of those characters,
/// so the common case performs no allocation.
///
/// # Performance
///
/// Similar to [`escape_text`], uses a fast-path check to avoid allocation
/// for attribute values without special characters.
///
/// # Examples
///
/// ```rust
/// use std::borrow::Cow;
///
/// use scrape_core::utils::escape_attr;
///
/// // No escaping needed
/// let result = escape_attr("simple-value");
/// assert!(matches!(result, Cow::Borrowed(_)));
///
/// // Escaping needed for quotes
/// let result = escape_attr("value with \"quotes\"");
/// assert_eq!(result, "value with &quot;quotes&quot;");
/// ```
#[must_use]
pub fn escape_attr(s: &str) -> Cow<'_, str> {
    // Fast path: nothing to escape, so hand the input back untouched.
    if !s.contains(['&', '"', '<', '>']) {
        return Cow::Borrowed(s);
    }

    // `s.len()` is a lower bound: every escaped character expands to several
    // bytes, so the buffer may still grow, but never from zero.
    let mut result = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => result.push_str("&amp;"),
            '"' => result.push_str("&quot;"),
            '<' => result.push_str("&lt;"),
            '>' => result.push_str("&gt;"),
            _ => result.push(c),
        }
    }
    Cow::Owned(result)
}
97
/// Reports whether `name` is an HTML void element.
///
/// A void element cannot have content and must not have a closing tag.
/// The names recognised by this function are:
///
/// - `area`, `base`, `br`, `col`, `embed`, `hr`, `img`, `input`
/// - `link`, `meta`, `param`, `source`, `track`, `wbr`
///
/// The comparison is an exact string match, so a name in a different case
/// (for example `"BR"`) is not recognised.
///
/// # Examples
///
/// ```rust
/// use scrape_core::utils::is_void_element;
///
/// assert!(is_void_element("br"));
/// assert!(is_void_element("img"));
/// assert!(is_void_element("input"));
///
/// assert!(!is_void_element("div"));
/// assert!(!is_void_element("span"));
/// assert!(!is_void_element("p"));
/// ```
#[must_use]
pub fn is_void_element(name: &str) -> bool {
    // Lookup table of every tag name treated as void.
    const VOID_ELEMENTS: [&str; 14] = [
        "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
        "source", "track", "wbr",
    ];

    VOID_ELEMENTS.contains(&name)
}
139
/// Unit tests for the escaping helpers and void-element detection.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_escape_text_no_special_chars() {
        let input = "Hello World";
        let result = escape_text(input);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, "Hello World");
    }

    #[test]
    fn test_escape_text_with_ampersand() {
        let result = escape_text("Tom & Jerry");
        assert!(matches!(result, Cow::Owned(_)));
        assert_eq!(result, "Tom &amp; Jerry");
    }

    #[test]
    fn test_escape_text_with_angle_brackets() {
        let result = escape_text("<tag>");
        assert_eq!(result, "&lt;tag&gt;");
    }

    #[test]
    fn test_escape_text_mixed() {
        let result = escape_text("1 < 2 & 2 > 1");
        assert_eq!(result, "1 &lt; 2 &amp; 2 &gt; 1");
    }

    #[test]
    fn test_escape_text_empty() {
        let result = escape_text("");
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, "");
    }

    #[test]
    fn test_escape_text_non_ascii_passthrough() {
        // Multi-byte characters must be copied intact around escaped ones.
        let result = escape_text("caf\u{e9} < th\u{e9}");
        assert_eq!(result, "caf\u{e9} &lt; th\u{e9}");
    }

    #[test]
    fn test_escape_attr_no_special_chars() {
        let input = "simple-value";
        let result = escape_attr(input);
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, "simple-value");
    }

    #[test]
    fn test_escape_attr_with_quotes() {
        let result = escape_attr("say \"hello\"");
        assert_eq!(result, "say &quot;hello&quot;");
    }

    #[test]
    fn test_escape_attr_mixed() {
        let result = escape_attr("<a href=\"&\">link</a>");
        assert_eq!(result, "&lt;a href=&quot;&amp;&quot;&gt;link&lt;/a&gt;");
    }

    #[test]
    fn test_escape_attr_single_quote_untouched() {
        // Only double quotes are escaped; single quotes pass through.
        let result = escape_attr("it's");
        assert!(matches!(result, Cow::Borrowed(_)));
        assert_eq!(result, "it's");
    }

    #[test]
    fn test_is_void_element_true() {
        for tag in [
            "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param",
            "source", "track", "wbr",
        ] {
            assert!(is_void_element(tag), "{tag} should be a void element");
        }
    }

    #[test]
    fn test_is_void_element_false() {
        for tag in ["div", "span", "p", "a", "ul", "li", "table", "form", "script", "style"] {
            assert!(!is_void_element(tag), "{tag} should not be a void element");
        }
    }
}