1use crate::{Elem, Error, Result};
2use scraper::{Html, Selector};
3
4pub fn select<S>(html_content: &str, selectors: S) -> Result<Vec<Elem>>
19where
20 S: IntoIterator,
21 S::Item: AsRef<str>,
22{
23 let mut selectors_str = String::new();
25 for s_ref in selectors {
26 let s = s_ref.as_ref().trim();
27 if s.is_empty() {
28 continue;
29 }
30 if !selectors_str.is_empty() {
31 selectors_str.push(',');
32 }
33 selectors_str.push_str(s);
34 }
35 if selectors_str.is_empty() {
37 return Ok(Vec::new());
38 }
39 let css_selector = Selector::parse(&selectors_str).map_err(|err| Error::SelectorParse {
41 selector: selectors_str.clone(),
42 cause: err.to_string(),
43 })?;
44
45 let html = Html::parse_document(html_content);
47
48 let mut els = Vec::new();
49 for element_ref in html.select(&css_selector) {
50 els.push(Elem::from_element_ref(element_ref));
51 }
52
53 Ok(els)
54}
55
#[cfg(test)]
mod tests {
    use super::*;

    /// Test-local result alias: lets `?` propagate both the crate error and the `&str`
    /// messages produced by `ok_or(..)` in the assertions below.
    type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;

    /// A single tag selector and a single descendant selector against a full document.
    #[test]
    fn test_selector_select_simple_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
 <!DOCTYPE html>
 <html>
 <head><title>Test</title></head>
 <body>
 <div id="main" class="container">
 <h1>Title</h1>
 <p>First paragraph.</p>
 <p class="highlight">Second paragraph with <span>span text</span>.</p>
 <ul>
 <li>Item 1</li>
 <li>Item 2</li>
 </ul>
 </div>
 </body>
 </html>
 "#;

        // -- Exec
        let els_p = select(html_content, vec!["p"])?;

        // -- Check
        assert_eq!(els_p.len(), 2);

        // First <p>: no attributes, plain text.
        assert_eq!(els_p[0].tag, "p");
        assert!(els_p[0].attrs.is_none());
        assert_eq!(els_p[0].text.as_deref(), Some("First paragraph."));
        assert_eq!(els_p[0].inner_html.as_deref(), Some("First paragraph."));

        // Second <p>: has a class attribute and nested markup.
        assert_eq!(els_p[1].tag, "p");
        assert_eq!(
            els_p[1]
                .attrs
                .as_ref()
                .ok_or("Should have attrs")?
                .get("class")
                .map(|s| s.as_str()),
            Some("highlight")
        );
        assert_eq!(els_p[1].text.as_deref(), Some("Second paragraph with span text."));
        assert_eq!(
            els_p[1].inner_html.as_deref(),
            Some("Second paragraph with <span>span text</span>.")
        );

        // -- Exec & Check: descendant selector
        let els_span_in_p = select(html_content, ["p.highlight span"])?;
        assert_eq!(els_span_in_p.len(), 1);
        assert_eq!(els_span_in_p[0].tag, "span");
        assert_eq!(els_span_in_p[0].text.as_deref(), Some("span text"));
        assert_eq!(els_span_in_p[0].inner_html.as_deref(), Some("span text"));

        Ok(())
    }

    /// Several selectors act as an OR; results follow document order, not selector order.
    #[test]
    fn test_selector_select_multiple_selectors_or_logic() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
 <h1>Title 1</h1>
 <p>Paragraph 1</p>
 <h2>Title 2</h2>
 <div>Div content</div>
 <p>Paragraph 2</p>
 "#;

        // -- Exec: "h3" matches nothing and must not affect the result.
        let els = select(html_content, ["h1", "p", "h3"])?;

        // -- Check
        assert_eq!(els.len(), 3, "Should find one h1 and two p tags");
        assert_eq!(els[0].tag, "h1");
        assert_eq!(els[0].text.as_deref(), Some("Title 1"));
        assert_eq!(els[1].tag, "p");
        assert_eq!(els[1].text.as_deref(), Some("Paragraph 1"));
        assert_eq!(els[2].tag, "p");
        assert_eq!(els[2].text.as_deref(), Some("Paragraph 2"));

        // -- Exec & Check: swapping the selector order does not change the result order.
        let els_reordered_selectors = select(html_content, ["p", "h1"])?;
        assert_eq!(els_reordered_selectors.len(), 3);
        assert_eq!(
            els_reordered_selectors[0].tag, "h1",
            "Order is document order, not selector order"
        );
        assert_eq!(els_reordered_selectors[1].tag, "p");
        assert_eq!(els_reordered_selectors[2].tag, "p");

        // -- Exec & Check: <h2> precedes <div> in the document.
        let els_div_h2 = select(html_content, ["div", "h2"])?;
        assert_eq!(els_div_h2.len(), 2);
        assert_eq!(els_div_h2[0].tag, "h2");
        assert_eq!(els_div_h2[0].text.as_deref(), Some("Title 2"));
        assert_eq!(els_div_h2[1].tag, "div");
        assert_eq!(els_div_h2[1].text.as_deref(), Some("Div content"));

        Ok(())
    }

    /// `#id` and `.class` selectors, each passed as a single selector.
    #[test]
    fn test_selector_select_by_id_and_class_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
 <div id="unique">ID Content</div>
 <div class="group">Class Content 1</div>
 <span class="group">Class Content 2</span>
 "#;

        // -- Exec & Check: id selector
        let els_id = select(html_content, ["#unique"])?;
        assert_eq!(els_id.len(), 1);
        assert_eq!(els_id[0].tag, "div");
        assert_eq!(
            els_id[0]
                .attrs
                .as_ref()
                .ok_or("Should have attrs")?
                .get("id")
                .map(|s| s.as_str()),
            Some("unique")
        );
        assert_eq!(els_id[0].text.as_deref(), Some("ID Content"));

        // -- Exec & Check: class selector matches across different tags
        let els_class = select(html_content, [".group"])?;
        assert_eq!(els_class.len(), 2);
        assert_eq!(els_class[0].tag, "div");
        assert_eq!(els_class[0].text.as_deref(), Some("Class Content 1"));
        assert_eq!(els_class[1].tag, "span");
        assert_eq!(els_class[1].text.as_deref(), Some("Class Content 2"));

        Ok(())
    }

    /// Valid selectors that match nothing yield an empty vector rather than an error.
    #[test]
    fn test_selector_select_empty_selector_single() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>No divs here</p>";

        // -- Exec & Check
        let els_div = select(html_content, ["div"])?;
        assert!(els_div.is_empty());

        let els_class = select(html_content, [".missing"])?;
        assert!(els_class.is_empty());

        let els_multiple_missing = select(html_content, ["div.foo", ".bar", "main"])?;
        assert!(els_multiple_missing.is_empty());

        Ok(())
    }

    /// Only blank selectors: nothing is selected and no error is raised.
    #[test]
    fn test_selector_select_empty_selector_multiple() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p>";

        // -- Exec
        let res = select(html_content, ["", ""])?;

        // -- Check
        assert!(res.is_empty(), "Elem vector should be empty");

        Ok(())
    }

    /// Blank selectors mixed with a real one: the blanks are ignored.
    #[test]
    fn test_selector_select_empty_selector_mixed() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p><span> other content<span>";

        // -- Exec
        let res = select(html_content, ["", "p", ""])?;

        // -- Check
        assert_eq!(res.len(), 1);
        let el = res.first().ok_or("Should have one item")?;
        assert_eq!(&el.tag, "p");

        Ok(())
    }

    /// One malformed selector in the list makes the whole call fail.
    #[test]
    fn test_selector_select_invalid_selector_syntax() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p>";

        // -- Exec
        let res = select(html_content, ["p", "h1[", "div"]);

        // -- Check
        let Err(err) = res else {
            panic!("Should have been an error for invalid selector syntax")
        };
        let err_string = err.to_string();
        assert!(err_string.contains("is invalid"));

        Ok(())
    }

    /// An empty selector iterator yields an empty vector.
    ///
    /// Previously named `..._is_error`, which contradicted the assertion below: the call
    /// succeeds with no elements.
    #[test]
    fn test_selector_select_empty_iterator_returns_empty() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p>";

        // -- Exec
        let res = select(html_content, Vec::<&str>::new())?;

        // -- Check
        assert!(res.is_empty(), "Elem vector should be empty");

        Ok(())
    }

    /// Attributes, text and inner HTML of a single matched element.
    #[test]
    fn test_selector_select_attributes_and_inner_html_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content =
            r#"<a href="https://example.com" title="Test Link" class="external link">Click <b>here</b></a>"#;

        // -- Exec
        let snodes = select(html_content, ["a.link"])?;

        // -- Check
        assert_eq!(snodes.len(), 1);
        let node = &snodes[0];
        assert_eq!(node.tag, "a");
        let attrs = node.attrs.as_ref().ok_or("should have attrs")?;
        assert_eq!(attrs.len(), 3);
        assert_eq!(attrs.get("href").map(|s| s.as_str()), Some("https://example.com"));
        assert_eq!(attrs.get("title").map(|s| s.as_str()), Some("Test Link"));
        assert_eq!(attrs.get("class").map(|s| s.as_str()), Some("external link"));

        assert_eq!(node.text.as_deref(), Some("Click here"));
        assert_eq!(node.inner_html.as_deref(), Some("Click <b>here</b>"));

        Ok(())
    }

    /// Whitespace handling of `text` / `inner_html`, including a whitespace-only element.
    ///
    /// NOTE(review): the fixture and the expected strings are whitespace-sensitive; keep
    /// them in sync character for character when editing.
    #[test]
    fn test_selector_select_text_and_inner_html_trimming_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
 <p> Trimmed text here </p>
 <div> <span> Inner </span> </div>
 <pre>
 Untrimmed
 </pre>
 <button> </button>
 "#;

        // -- Exec & Check: <p>
        let p_nodes = select(html_content, ["p"])?;
        assert_eq!(p_nodes.len(), 1);
        assert_eq!(p_nodes[0].text.as_deref(), Some(" Trimmed text here "));
        assert_eq!(p_nodes[0].inner_html.as_deref(), Some(" Trimmed text here "));

        // -- Exec & Check: <div> with a nested <span>
        let div_nodes = select(html_content, ["div"])?;
        assert_eq!(div_nodes.len(), 1);
        assert_eq!(div_nodes[0].text.as_deref(), Some(" Inner "));
        assert_eq!(div_nodes[0].inner_html.as_deref(), Some(" <span> Inner </span> "));

        // -- Exec & Check: <pre>
        let pre_nodes = select(html_content, ["pre"])?;
        assert_eq!(pre_nodes.len(), 1);
        assert_eq!(
            pre_nodes[0].text.as_deref(),
            Some(" Untrimmed \n ")
        );
        assert_eq!(
            pre_nodes[0].inner_html.as_deref(),
            Some(" Untrimmed \n ")
        );

        // -- Exec & Check: whitespace-only content is reported as `None`
        let button_nodes = select(html_content, ["button"])?;
        assert_eq!(button_nodes.len(), 1);
        assert_eq!(button_nodes[0].text, None);
        assert_eq!(button_nodes[0].inner_html, None);

        Ok(())
    }
}
357
358