1use crate::{Elem, Error, Result};
2use scraper::{Html, Selector};
3
4pub fn select<S>(html_content: &str, selectors: S) -> Result<Vec<Elem>>
19where
20	S: IntoIterator,
21	S::Item: AsRef<str>,
22{
23	let mut selectors_str = String::new();
25	for s_ref in selectors {
26		let s = s_ref.as_ref().trim();
27		if s.is_empty() {
28			continue;
29		}
30		if !selectors_str.is_empty() {
31			selectors_str.push(',');
32		}
33		selectors_str.push_str(s);
34	}
35	if selectors_str.is_empty() {
37		return Ok(Vec::new());
38	}
39	let css_selector = Selector::parse(&selectors_str).map_err(|err| Error::SelectorParse {
41		selector: selectors_str.clone(),
42		cause: err.to_string(),
43	})?;
44
45	let html = Html::parse_document(html_content);
47
48	let mut els = Vec::new();
49	for element_ref in html.select(&css_selector) {
50		els.push(Elem::from_element_ref(element_ref));
51	}
52
53	Ok(els)
54}
55
/// Unit tests for [`select`], driven by small inline HTML fixtures.
#[cfg(test)]
mod tests {
	use super::*;

	/// Test-local result type so that `?` accepts both crate errors and `&str` messages.
	type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;

	/// A tag selector returns each matching element with its tag, attributes, text and
	/// inner HTML; a descendant selector narrows the match to the nested element.
	#[test]
	fn test_selector_select_simple_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
			<!DOCTYPE html>
			<html>
			<head><title>Test</title></head>
			<body>
				<div id="main" class="container">
					<h1>Title</h1>
					<p>First paragraph.</p>
					<p class="highlight">Second paragraph with <span>span text</span>.</p>
					<ul>
						<li>Item 1</li>
						<li>Item 2</li>
					</ul>
				</div>
			</body>
			</html>
		"#;

		// -- Exec
		let paragraphs = select(html_content, vec!["p"])?;

		// -- Check
		assert_eq!(paragraphs.len(), 2);

		let plain = &paragraphs[0];
		assert_eq!(plain.tag, "p");
		assert!(plain.attrs.is_empty());
		assert_eq!(plain.text.as_deref(), Some("First paragraph."));
		assert_eq!(plain.inner_html.as_deref(), Some("First paragraph."));

		let highlighted = &paragraphs[1];
		assert_eq!(highlighted.tag, "p");
		assert_eq!(
			highlighted.attrs.get("class").map(|value| value.as_str()),
			Some("highlight")
		);
		assert_eq!(highlighted.text.as_deref(), Some("Second paragraph with span text."));
		assert_eq!(
			highlighted.inner_html.as_deref(),
			Some("Second paragraph with <span>span text</span>.")
		);

		// -- Exec & Check: descendant selector
		let nested_spans = select(html_content, ["p.highlight span"])?;
		assert_eq!(nested_spans.len(), 1);

		let span = &nested_spans[0];
		assert_eq!(span.tag, "span");
		assert_eq!(span.text.as_deref(), Some("span text"));
		assert_eq!(span.inner_html.as_deref(), Some("span text"));

		Ok(())
	}

	/// Several selectors act as an OR; results follow the document, not the selector order.
	#[test]
	fn test_selector_select_multiple_selectors_or_logic() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
            <h1>Title 1</h1>
            <p>Paragraph 1</p>
            <h2>Title 2</h2>
            <div>Div content</div>
            <p>Paragraph 2</p>
        "#;

		// -- Exec & Check: `h3` matches nothing and must not affect the result
		let els = select(html_content, ["h1", "p", "h3"])?;
		assert_eq!(els.len(), 3, "Should find one h1 and two p tags");

		let expected = [("h1", "Title 1"), ("p", "Paragraph 1"), ("p", "Paragraph 2")];
		for (el, (tag, text)) in els.iter().zip(expected) {
			assert_eq!(el.tag, tag);
			assert_eq!(el.text.as_deref(), Some(text));
		}

		// -- Exec & Check: swapping the selectors does not reorder the matches
		let swapped = select(html_content, ["p", "h1"])?;
		assert_eq!(swapped.len(), 3);
		assert_eq!(swapped[0].tag, "h1", "Order is document order, not selector order");
		for el in &swapped[1..] {
			assert_eq!(el.tag, "p");
		}

		// -- Exec & Check: `h2` precedes `div` in the fixture
		let div_or_h2 = select(html_content, ["div", "h2"])?;
		assert_eq!(div_or_h2.len(), 2);

		let expected = [("h2", "Title 2"), ("div", "Div content")];
		for (el, (tag, text)) in div_or_h2.iter().zip(expected) {
			assert_eq!(el.tag, tag);
			assert_eq!(el.text.as_deref(), Some(text));
		}

		Ok(())
	}

	/// `#id` and `.class` selectors each work as a single selector.
	#[test]
	fn test_selector_select_by_id_and_class_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
			<div id="unique">ID Content</div>
			<div class="group">Class Content 1</div>
			<span class="group">Class Content 2</span>
		"#;

		// -- Exec & Check: by id
		let by_id = select(html_content, ["#unique"])?;
		assert_eq!(by_id.len(), 1);

		let unique = &by_id[0];
		assert_eq!(unique.tag, "div");
		assert_eq!(unique.attrs.get("id").map(|value| value.as_str()), Some("unique"));
		assert_eq!(unique.text.as_deref(), Some("ID Content"));

		// -- Exec & Check: by class
		let by_class = select(html_content, [".group"])?;
		assert_eq!(by_class.len(), 2);

		let expected = [("div", "Class Content 1"), ("span", "Class Content 2")];
		for (el, (tag, text)) in by_class.iter().zip(expected) {
			assert_eq!(el.tag, tag);
			assert_eq!(el.text.as_deref(), Some(text));
		}

		Ok(())
	}

	/// Valid selectors that match nothing give an empty result rather than an error.
	#[test]
	fn test_selector_select_empty_selector_single() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>No divs here</p>";
		let non_matching: [&[&str]; 3] = [&["div"], &[".missing"], &["div.foo", ".bar", "main"]];

		// -- Exec & Check
		for selectors in non_matching {
			let matched = select(html_content, selectors)?;
			assert!(matched.is_empty());
		}

		Ok(())
	}

	/// A selector list made only of empty strings gives an empty result.
	#[test]
	fn test_selector_select_empty_selector_multiple() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let matched = select(html_content, ["", ""])?;

		// -- Check
		assert!(matched.is_empty(), "Elem vector should be empty");

		Ok(())
	}

	/// Empty strings mixed with a real selector are skipped; the real selector still applies.
	#[test]
	fn test_selector_select_empty_selector_mixed() -> Result<()> {
		// -- Setup & Fixtures
		// NOTE(review): the fixture ends with an opening `<span>` where `</span>` was
		// presumably intended; kept as-is, since only `p` is selected here.
		let html_content = "<p>Some content</p><span> other content<span>";

		// -- Exec
		let matched = select(html_content, ["", "p", ""])?;

		// -- Check
		assert_eq!(matched.len(), 1);
		let only = matched.first().ok_or("Should have one item")?;
		assert_eq!(only.tag, "p");

		Ok(())
	}

	/// One malformed selector among valid ones makes the whole call fail.
	#[test]
	fn test_selector_select_invalid_selector_syntax() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let err = match select(html_content, ["p", "h1[", "div"]) {
			Err(err) => err,
			Ok(_) => panic!("Should have been an error for invalid selector syntax"),
		};

		// -- Check
		assert!(err.to_string().contains("is invalid"));

		Ok(())
	}

	/// An empty selector iterator gives `Ok` with no elements.
	///
	/// NOTE(review): the `_is_error` suffix of the test name contradicts what is asserted
	/// below (a successful, empty result); consider renaming the test.
	#[test]
	fn test_selector_select_empty_iterator_is_error() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = "<p>Some content</p>";

		// -- Exec
		let matched = select(html_content, Vec::<&str>::new())?;

		// -- Check
		assert!(matched.is_empty(), "Elem vector should be empty");

		Ok(())
	}

	/// All attributes are captured; `text` drops nested markup while `inner_html` keeps it.
	#[test]
	fn test_selector_select_attributes_and_inner_html_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content =
			r#"<a href="https://example.com" title="Test Link" class="external link">Click <b>here</b></a>"#;

		// -- Exec
		let links = select(html_content, ["a.link"])?;

		// -- Check
		assert_eq!(links.len(), 1);
		let link = &links[0];
		assert_eq!(link.tag, "a");

		assert_eq!(link.attrs.len(), 3);
		let expected_attrs = [
			("href", "https://example.com"),
			("title", "Test Link"),
			("class", "external link"),
		];
		for (name, value) in expected_attrs {
			assert_eq!(link.attrs.get(name).map(|found| found.as_str()), Some(value));
		}

		assert_eq!(link.text.as_deref(), Some("Click here"));
		assert_eq!(link.inner_html.as_deref(), Some("Click <b>here</b>"));

		Ok(())
	}

	/// `text` and `inner_html` are expected without surrounding whitespace, and as `None`
	/// when the element holds only whitespace.
	#[test]
	fn test_selector_select_text_and_inner_html_trimming_single_selector() -> Result<()> {
		// -- Setup & Fixtures
		let html_content = r#"
            <p>  Trimmed text here  </p>
            <div>  <span>  Inner  </span>  </div>
            <pre>
            Untrimmed  
            </pre>
            <button>  </button>
        "#;

		// (selector, expected text, expected inner HTML)
		let cases = [
			("p", Some("Trimmed text here"), Some("Trimmed text here")),
			("div", Some("Inner"), Some("<span>  Inner  </span>")),
			("pre", Some("Untrimmed"), Some("Untrimmed")),
			("button", None, None),
		];

		// -- Exec & Check
		for (selector, text, inner_html) in cases {
			let nodes = select(html_content, [selector])?;
			assert_eq!(nodes.len(), 1);
			assert_eq!(nodes[0].text.as_deref(), text);
			assert_eq!(nodes[0].inner_html.as_deref(), inner_html);
		}

		Ok(())
	}
}
333
334