html_helpers/selector/
select.rs

1use crate::{Elem, Error, Result};
2use scraper::{Html, Selector};
3
4/// Selects HTML elements based on a list of CSS selectors and returns them as a list of `Elem`.
5/// The selectors are combined with a comma, effectively performing an "OR" match.
6/// Elements are returned in document order.
7///
8/// # Arguments
9///
10/// * `html_content` - A string slice containing the HTML content to parse.
11/// * `selectors` - An iterator of string-like items, each representing a CSS selector.
12///
13/// # Returns
14///
15/// A `Result` containing:
16/// - `Ok(Vec<Elem>)`: A vector of `Elem` objects representing the selected elements.
17/// - `Err(Error)`: An error if parsing the HTML or the combined selector fails.
18pub fn select<S>(html_content: &str, selectors: S) -> Result<Vec<Elem>>
19where
20	S: IntoIterator,
21	S::Item: AsRef<str>,
22{
23	// -- Build the selectors_str
24	let mut selectors_str = String::new();
25	for s_ref in selectors {
26		let s = s_ref.as_ref().trim();
27		if s.is_empty() {
28			continue;
29		}
30		if !selectors_str.is_empty() {
31			selectors_str.push(',');
32		}
33		selectors_str.push_str(s);
34	}
35	// if empty, just return empty vector
36	if selectors_str.is_empty() {
37		return Ok(Vec::new());
38	}
39	// build the scraper seletor
40	let css_selector = Selector::parse(&selectors_str).map_err(|err| Error::SelectorParse {
41		selector: selectors_str.clone(),
42		cause: err.to_string(),
43	})?;
44
45	// -- Parse and select
46	let html = Html::parse_document(html_content);
47
48	let mut els = Vec::new();
49	for element_ref in html.select(&css_selector) {
50		els.push(Elem::from_element_ref(element_ref));
51	}
52
53	Ok(els)
54}
55
56// region:    --- Tests
57
#[cfg(test)]
mod tests {
	use super::*;
	// General test functions use this local `Result<T>` for `Box<dyn Error>`.
	// It intentionally shadows the crate `Result` brought in by `use super::*`.
	type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	/// A single tag selector and a single descendant selector return the expected
	/// elements with their tag, attributes, text, and inner HTML.
	#[test]
	fn test_selector_select_simple_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
			<!DOCTYPE html>
			<html>
			<head><title>Test</title></head>
			<body>
				<div id="main" class="container">
					<h1>Title</h1>
					<p>First paragraph.</p>
					<p class="highlight">Second paragraph with <span>span text</span>.</p>
					<ul>
						<li>Item 1</li>
						<li>Item 2</li>
					</ul>
				</div>
			</body>
			</html>
		"#;

		// -- Exec
		let els_p = select(html_content, vec!["p"])?;

		// -- Check
		assert_eq!(els_p.len(), 2);

		assert_eq!(els_p[0].tag, "p");
		assert!(els_p[0].attrs.is_empty());
		assert_eq!(els_p[0].text.as_deref(), Some("First paragraph."));
		assert_eq!(els_p[0].inner_html.as_deref(), Some("First paragraph."));

		assert_eq!(els_p[1].tag, "p");
		assert_eq!(els_p[1].attrs.get("class").map(|s| s.as_str()), Some("highlight"));
		assert_eq!(els_p[1].text.as_deref(), Some("Second paragraph with span text."));
		assert_eq!(
			els_p[1].inner_html.as_deref(),
			Some("Second paragraph with <span>span text</span>.")
		);

		// -- Exec & Check - Span inside p.highlight
		let els_span_in_p = select(html_content, ["p.highlight span"])?;
		assert_eq!(els_span_in_p.len(), 1);
		assert_eq!(els_span_in_p[0].tag, "span");
		assert_eq!(els_span_in_p[0].text.as_deref(), Some("span text"));
		assert_eq!(els_span_in_p[0].inner_html.as_deref(), Some("span text"));

		Ok(())
	}

	/// Several selectors behave as an "OR" match, and results follow document order
	/// rather than the order in which the selectors were passed.
	#[test]
	fn test_selector_select_multiple_selectors_or_logic() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
            <h1>Title 1</h1>
            <p>Paragraph 1</p>
            <h2>Title 2</h2>
            <div>Div content</div>
            <p>Paragraph 2</p>
        "#;

		// -- Exec
		// Selects elements matching h1 OR p OR h3. (h3 does not exist)
		// Order should be document order.
		let els = select(html_content, ["h1", "p", "h3"])?;

		// -- Check
		assert_eq!(els.len(), 3, "Should find one h1 and two p tags");
		assert_eq!(els[0].tag, "h1");
		assert_eq!(els[0].text.as_deref(), Some("Title 1"));
		assert_eq!(els[1].tag, "p");
		assert_eq!(els[1].text.as_deref(), Some("Paragraph 1"));
		assert_eq!(els[2].tag, "p");
		assert_eq!(els[2].text.as_deref(), Some("Paragraph 2"));

		// -- Exec & Check - Different order of selectors, same result order
		let els_reordered_selectors = select(html_content, ["p", "h1"])?;
		assert_eq!(els_reordered_selectors.len(), 3);
		assert_eq!(
			els_reordered_selectors[0].tag, "h1",
			"Order is document order, not selector order"
		);
		assert_eq!(els_reordered_selectors[1].tag, "p");
		assert_eq!(els_reordered_selectors[2].tag, "p");

		// -- Exec & Check - Select div and h2 (h2 precedes div in the document)
		let els_div_h2 = select(html_content, ["div", "h2"])?;
		assert_eq!(els_div_h2.len(), 2);
		assert_eq!(els_div_h2[0].tag, "h2");
		assert_eq!(els_div_h2[0].text.as_deref(), Some("Title 2"));
		assert_eq!(els_div_h2[1].tag, "div");
		assert_eq!(els_div_h2[1].text.as_deref(), Some("Div content"));

		Ok(())
	}

	/// `#id` and `.class` selectors match the expected elements.
	#[test]
	fn test_selector_select_by_id_and_class_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
			<div id="unique">ID Content</div>
			<div class="group">Class Content 1</div>
			<span class="group">Class Content 2</span>
		"#;

		// -- Exec & Check - By ID
		let els_id = select(html_content, ["#unique"])?;
		assert_eq!(els_id.len(), 1);
		assert_eq!(els_id[0].tag, "div");
		assert_eq!(els_id[0].attrs.get("id").map(|s| s.as_str()), Some("unique"));
		assert_eq!(els_id[0].text.as_deref(), Some("ID Content"));

		// -- Exec & Check - By Class
		let els_class = select(html_content, [".group"])?;
		assert_eq!(els_class.len(), 2);
		assert_eq!(els_class[0].tag, "div");
		assert_eq!(els_class[0].text.as_deref(), Some("Class Content 1"));
		assert_eq!(els_class[1].tag, "span");
		assert_eq!(els_class[1].text.as_deref(), Some("Class Content 2"));

		Ok(())
	}

	/// Valid selectors that match nothing yield an empty result (not an error).
	#[test]
	fn test_selector_select_empty_selector_single() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>No divs here</p>";

		// -- Exec & Check - Non-existent tag
		let els_div = select(html_content, ["div"])?;
		assert!(els_div.is_empty());

		// -- Exec & Check - Non-existent class
		let els_class = select(html_content, [".missing"])?;
		assert!(els_class.is_empty());

		// -- Exec & Check - Multiple non-existent selectors
		let els_multiple_missing = select(html_content, ["div.foo", ".bar", "main"])?;
		assert!(els_multiple_missing.is_empty());

		Ok(())
	}

	/// When every selector is an empty string, the result is an empty vector.
	#[test]
	fn test_selector_select_empty_selector_multiple() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let res = select(html_content, ["", ""])?;

		// -- Check
		assert!(res.is_empty(), "Elem vector should be empty");

		Ok(())
	}

	/// Empty selectors mixed with a valid one are skipped; only the valid one applies.
	#[test]
	fn test_selector_select_empty_selector_mixed() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p><span> other content</span>";

		// -- Exec
		let res = select(html_content, ["", "p", ""])?;

		// -- Check
		assert_eq!(res.len(), 1);
		let el = res.first().ok_or("Should have one item")?;
		assert_eq!(&el.tag, "p");

		Ok(())
	}

	/// One malformed selector in the list makes the whole call fail with
	/// `Error::SelectorParse`.
	#[test]
	fn test_selector_select_invalid_selector_syntax() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let res = select(html_content, ["p", "h1[", "div"]); // "h1[" is an unterminated attribute selector

		// -- Check
		let Err(err) = res else {
			panic!("Should have been an error for invalid selector syntax")
		};
		assert!(
			matches!(err, Error::SelectorParse { .. }),
			"Should be the SelectorParse variant"
		);
		let err_string = err.to_string();
		// NOTE(review): "is invalid" is presumably part of the crate `Error::SelectorParse`
		// display message (defined outside this file), not scraper's own error text.
		assert!(err_string.contains("is invalid"));

		Ok(())
	}

	/// An iterator with no selectors at all is not an error: it yields an empty vector.
	#[test]
	fn test_selector_select_empty_iterator_returns_empty() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let res = select(html_content, Vec::<&str>::new())?; // Empty iterator

		// -- Check
		assert!(res.is_empty(), "Elem vector should be empty");

		Ok(())
	}

	/// All attributes are captured, `text` flattens nested markup, and `inner_html` keeps it.
	#[test]
	fn test_selector_select_attributes_and_inner_html_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content =
			r#"<a href="https://example.com" title="Test Link" class="external link">Click <b>here</b></a>"#;

		// -- Exec
		let snodes = select(html_content, ["a.link"])?;

		// -- Check
		assert_eq!(snodes.len(), 1);
		let node = &snodes[0];
		assert_eq!(node.tag, "a");
		assert_eq!(node.attrs.len(), 3);
		assert_eq!(node.attrs.get("href").map(|s| s.as_str()), Some("https://example.com"));
		assert_eq!(node.attrs.get("title").map(|s| s.as_str()), Some("Test Link"));
		assert_eq!(node.attrs.get("class").map(|s| s.as_str()), Some("external link"));

		assert_eq!(node.text.as_deref(), Some("Click here"));
		assert_eq!(node.inner_html.as_deref(), Some("Click <b>here</b>"));

		Ok(())
	}

	/// `text` and `inner_html` are trimmed at their outer edges, and become `None`
	/// when only whitespace remains.
	#[test]
	fn test_selector_select_text_and_inner_html_trimming_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
            <p>  Trimmed text here  </p>
            <div>  <span>  Inner  </span>  </div>
            <pre>
            Untrimmed  
            </pre>
            <button>  </button>
        "#;

		// -- Exec & Check - Paragraph text
		let p_nodes = select(html_content, ["p"])?;
		assert_eq!(p_nodes.len(), 1);
		assert_eq!(p_nodes[0].text.as_deref(), Some("Trimmed text here"));
		assert_eq!(p_nodes[0].inner_html.as_deref(), Some("Trimmed text here"));

		// -- Exec & Check - Div with span (inner whitespace of the span is preserved in inner_html)
		let div_nodes = select(html_content, ["div"])?;
		assert_eq!(div_nodes.len(), 1);
		assert_eq!(div_nodes[0].text.as_deref(), Some("Inner"));
		assert_eq!(div_nodes[0].inner_html.as_deref(), Some("<span>  Inner  </span>"));

		// -- Exec & Check - Pre (still trimmed at the outer edges)
		let pre_nodes = select(html_content, ["pre"])?;
		assert_eq!(pre_nodes.len(), 1);
		assert_eq!(pre_nodes[0].text.as_deref(), Some("Untrimmed"));
		assert_eq!(pre_nodes[0].inner_html.as_deref(), Some("Untrimmed"));

		// -- Exec & Check - Whitespace-only button
		let button_nodes = select(html_content, ["button"])?;
		assert_eq!(button_nodes.len(), 1);
		assert_eq!(button_nodes[0].text, None);
		assert_eq!(button_nodes[0].inner_html, None);

		Ok(())
	}
}
333
334// endregion: --- Tests