html_helpers/selector/
select.rs

1use crate::{Elem, Error, Result};
2use scraper::{Html, Selector};
3
4/// Selects HTML elements based on a list of CSS selectors and returns them as a list of `Elem`.
5/// The selectors are combined with a comma, effectively performing an "OR" match.
6/// Elements are returned in document order.
7///
8/// # Arguments
9///
10/// * `html_content` - A string slice containing the HTML content to parse.
11/// * `selectors` - An iterator of string-like items, each representing a CSS selector.
12///
13/// # Returns
14///
15/// A `Result` containing:
16/// - `Ok(Vec<Elem>)`: A vector of `Elem` objects representing the selected elements.
17/// - `Err(Error)`: An error if parsing the HTML or the combined selector fails.
18pub fn select<S>(html_content: &str, selectors: S) -> Result<Vec<Elem>>
19where
20	S: IntoIterator,
21	S::Item: AsRef<str>,
22{
23	// -- Build the selectors_str
24	let mut selectors_str = String::new();
25	for s_ref in selectors {
26		let s = s_ref.as_ref().trim();
27		if s.is_empty() {
28			continue;
29		}
30		if !selectors_str.is_empty() {
31			selectors_str.push(',');
32		}
33		selectors_str.push_str(s);
34	}
35	// if empty, just return empty vector
36	if selectors_str.is_empty() {
37		return Ok(Vec::new());
38	}
39	// build the scraper seletor
40	let css_selector = Selector::parse(&selectors_str).map_err(|err| Error::SelectorParse {
41		selector: selectors_str.clone(),
42		cause: err.to_string(),
43	})?;
44
45	// -- Parse and select
46	let html = Html::parse_document(html_content);
47
48	let mut els = Vec::new();
49	for element_ref in html.select(&css_selector) {
50		els.push(Elem::from_element_ref(element_ref));
51	}
52
53	Ok(els)
54}
55
56// region:    --- Tests
57
#[cfg(test)]
mod tests {
	use super::*;
	// General test functions use this local `Result<T>` for `Box<dyn Error>`.
	// (It shadows the crate `Result` brought in by the glob import above.)
	type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	/// A single tag selector and a descendant selector against a full document:
	/// checks tag, attrs, text, and inner_html of each match.
	#[test]
	fn test_selector_select_simple_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
			<!DOCTYPE html>
			<html>
			<head><title>Test</title></head>
			<body>
				<div id="main" class="container">
					<h1>Title</h1>
					<p>First paragraph.</p>
					<p class="highlight">Second paragraph with <span>span text</span>.</p>
					<ul>
						<li>Item 1</li>
						<li>Item 2</li>
					</ul>
				</div>
			</body>
			</html>
		"#;

		// -- Exec
		let els_p = select(html_content, vec!["p"])?;

		// -- Check
		assert_eq!(els_p.len(), 2);

		// First <p> has no attributes, so `attrs` is expected to be `None`.
		assert_eq!(els_p[0].tag, "p");
		assert!(els_p[0].attrs.is_none());
		assert_eq!(els_p[0].text.as_deref(), Some("First paragraph."));
		assert_eq!(els_p[0].inner_html.as_deref(), Some("First paragraph."));

		// Second <p> carries a class; its text flattens the nested <span>.
		assert_eq!(els_p[1].tag, "p");
		assert_eq!(
			els_p[1]
				.attrs
				.as_ref()
				.ok_or("Should have attrs")?
				.get("class")
				.map(|s| s.as_str()),
			Some("highlight")
		);
		assert_eq!(els_p[1].text.as_deref(), Some("Second paragraph with span text."));
		assert_eq!(
			els_p[1].inner_html.as_deref(),
			Some("Second paragraph with <span>span text</span>.")
		);

		// -- Exec & Check - Span inside p.highlight
		let els_span_in_p = select(html_content, ["p.highlight span"])?;
		assert_eq!(els_span_in_p.len(), 1);
		assert_eq!(els_span_in_p[0].tag, "span");
		assert_eq!(els_span_in_p[0].text.as_deref(), Some("span text"));
		assert_eq!(els_span_in_p[0].inner_html.as_deref(), Some("span text"));

		Ok(())
	}

	/// Several selectors are OR-ed together, and results follow document order
	/// regardless of the order in which the selectors were given.
	#[test]
	fn test_selector_select_multiple_selectors_or_logic() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
            <h1>Title 1</h1>
            <p>Paragraph 1</p>
            <h2>Title 2</h2>
            <div>Div content</div>
            <p>Paragraph 2</p>
        "#;

		// -- Exec
		// Selects elements matching h1 OR p OR h3. (h3 does not exist)
		// Order should be document order.
		let els = select(html_content, ["h1", "p", "h3"])?;

		// -- Check
		assert_eq!(els.len(), 3, "Should find one h1 and two p tags");
		assert_eq!(els[0].tag, "h1");
		assert_eq!(els[0].text.as_deref(), Some("Title 1"));
		assert_eq!(els[1].tag, "p");
		assert_eq!(els[1].text.as_deref(), Some("Paragraph 1"));
		assert_eq!(els[2].tag, "p");
		assert_eq!(els[2].text.as_deref(), Some("Paragraph 2"));

		// -- Exec & Check - Different order of selectors, same result order
		let els_reordered_selectors = select(html_content, ["p", "h1"])?;
		assert_eq!(els_reordered_selectors.len(), 3);
		assert_eq!(
			els_reordered_selectors[0].tag, "h1",
			"Order is document order, not selector order"
		);
		assert_eq!(els_reordered_selectors[1].tag, "p");
		assert_eq!(els_reordered_selectors[2].tag, "p");

		// -- Exec & Check - Select div and h2
		// h2 precedes div in the document, so it comes first even though it is listed second.
		let els_div_h2 = select(html_content, ["div", "h2"])?;
		assert_eq!(els_div_h2.len(), 2);
		assert_eq!(els_div_h2[0].tag, "h2");
		assert_eq!(els_div_h2[0].text.as_deref(), Some("Title 2"));
		assert_eq!(els_div_h2[1].tag, "div");
		assert_eq!(els_div_h2[1].text.as_deref(), Some("Div content"));

		Ok(())
	}

	/// `#id` and `.class` selectors each resolve to the expected elements.
	#[test]
	fn test_selector_select_by_id_and_class_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
			<div id="unique">ID Content</div>
			<div class="group">Class Content 1</div>
			<span class="group">Class Content 2</span>
		"#;

		// -- Exec & Check - By ID
		let els_id = select(html_content, ["#unique"])?;
		assert_eq!(els_id.len(), 1);
		assert_eq!(els_id[0].tag, "div");
		assert_eq!(
			els_id[0]
				.attrs
				.as_ref()
				.ok_or("Should have attrs")?
				.get("id")
				.map(|s| s.as_str()),
			Some("unique")
		);
		assert_eq!(els_id[0].text.as_deref(), Some("ID Content"));

		// -- Exec & Check - By Class
		let els_class = select(html_content, [".group"])?;
		assert_eq!(els_class.len(), 2);
		assert_eq!(els_class[0].tag, "div");
		assert_eq!(els_class[0].text.as_deref(), Some("Class Content 1"));
		assert_eq!(els_class[1].tag, "span");
		assert_eq!(els_class[1].text.as_deref(), Some("Class Content 2"));

		Ok(())
	}

	/// Valid selectors that match nothing yield an empty result (not an error).
	#[test]
	fn test_selector_select_empty_selector_single() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>No divs here</p>";

		// -- Exec & Check - Non-existent tag
		let els_div = select(html_content, ["div"])?;
		assert!(els_div.is_empty());

		// -- Exec & Check - Non-existent class
		let els_class = select(html_content, [".missing"])?;
		assert!(els_class.is_empty());

		// -- Exec & Check - Multiple non-existent selectors
		let els_multiple_missing = select(html_content, ["div.foo", ".bar", "main"])?;
		assert!(els_multiple_missing.is_empty());

		Ok(())
	}

	/// When every supplied selector is an empty string, the result is an empty vector.
	#[test]
	fn test_selector_select_empty_selector_multiple() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let res = select(html_content, ["", ""])?;

		// -- Check
		assert!(res.is_empty(), "Elem vector should be empty");

		Ok(())
	}

	/// Empty selectors mixed with a real one are skipped; only the real one is applied.
	#[test]
	fn test_selector_select_empty_selector_mixed() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p><span> other content<span>";

		// -- Exec
		let res = select(html_content, ["", "p", ""])?;

		// -- Check
		assert_eq!(res.len(), 1);
		let el = res.first().ok_or("Should have one item")?;
		assert_eq!(el.tag, "p");

		Ok(())
	}

	/// One malformed selector in the list makes the whole call fail.
	#[test]
	fn test_selector_select_invalid_selector_syntax() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		// `h1[` is malformed, so the combined selector `p,h1[,div` cannot be parsed.
		let res = select(html_content, ["p", "h1[", "div"]); // Invalid selector syntax

		// -- Check
		let Err(err) = res else {
			panic!("Should have been an error for invalid selector syntax")
		};
		let err_string = err.to_string();
		// Relies on the crate error's display text (defined outside this file)
		// containing "is invalid" for selector parse failures.
		assert!(err_string.contains("is invalid"));

		Ok(())
	}

	/// An iterator with no selectors at all is not an error: it yields an empty vector.
	#[test]
	fn test_selector_select_empty_iterator_returns_empty() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let res = select(html_content, Vec::<&str>::new())?; // Empty iterator

		// -- Check
		assert!(res.is_empty(), "Elem vector should be empty");

		Ok(())
	}

	/// All attributes of a matched element are captured, and `text` / `inner_html`
	/// differ as expected when the element has nested markup.
	#[test]
	fn test_selector_select_attributes_and_inner_html_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content =
			r#"<a href="https://example.com" title="Test Link" class="external link">Click <b>here</b></a>"#;

		// -- Exec
		let snodes = select(html_content, ["a.link"])?;

		// -- Check
		assert_eq!(snodes.len(), 1);
		let node = &snodes[0];
		assert_eq!(node.tag, "a");
		let attrs = node.attrs.as_ref().ok_or("should have attrs")?;
		assert_eq!(attrs.len(), 3);
		assert_eq!(attrs.get("href").map(|s| s.as_str()), Some("https://example.com"));
		assert_eq!(attrs.get("title").map(|s| s.as_str()), Some("Test Link"));
		assert_eq!(attrs.get("class").map(|s| s.as_str()), Some("external link"));

		assert_eq!(node.text.as_deref(), Some("Click here"));
		assert_eq!(node.inner_html.as_deref(), Some("Click <b>here</b>"));

		Ok(())
	}

	// NOTE: Now, the lib does not trim anymore.
	//       Despite the "trimming" in its name, this test asserts that leading/trailing
	//       whitespace is preserved in both `text` and `inner_html`, and that
	//       whitespace-only content is reported as `None`.
	#[test]
	fn test_selector_select_text_and_inner_html_trimming_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
            <p>  Trimmed text here  </p>
            <div>  <span>  Inner  </span>  </div>
            <pre>
            Untrimmed  
            </pre>
            <button>  </button>
        "#;

		// -- Exec & Check - Paragraph text
		let p_nodes = select(html_content, ["p"])?;
		assert_eq!(p_nodes.len(), 1);
		assert_eq!(p_nodes[0].text.as_deref(), Some("  Trimmed text here  "));
		assert_eq!(p_nodes[0].inner_html.as_deref(), Some("  Trimmed text here  "));

		// -- Exec & Check - Div with span
		let div_nodes = select(html_content, ["div"])?;
		assert_eq!(div_nodes.len(), 1);
		assert_eq!(div_nodes[0].text.as_deref(), Some("    Inner    "));
		assert_eq!(div_nodes[0].inner_html.as_deref(), Some("  <span>  Inner  </span>  "));

		// -- Exec & Check - Pre
		// The expected value has no newline right after `<pre>`, but keeps the rest verbatim.
		let pre_nodes = select(html_content, ["pre"])?;
		assert_eq!(pre_nodes.len(), 1);
		assert_eq!(
			pre_nodes[0].text.as_deref(),
			Some("            Untrimmed  \n            ")
		);
		assert_eq!(
			pre_nodes[0].inner_html.as_deref(),
			Some("            Untrimmed  \n            ")
		);

		// -- Exec & Check - Empty button
		// Whitespace-only content is expected to surface as `None`, not as a blank string.
		let button_nodes = select(html_content, ["button"])?;
		assert_eq!(button_nodes.len(), 1);
		assert_eq!(button_nodes[0].text, None);
		assert_eq!(button_nodes[0].inner_html, None);

		Ok(())
	}
}
357
358// endregion: --- Tests