1use crate::{Elem, Error, Result};
2use scraper::{Html, Selector};
3
4pub fn select<S>(html_content: &str, selectors: S) -> Result<Vec<Elem>>
19where
20 S: IntoIterator,
21 S::Item: AsRef<str>,
22{
23 let mut selectors_str = String::new();
25 for s_ref in selectors {
26 let s = s_ref.as_ref().trim();
27 if s.is_empty() {
28 continue;
29 }
30 if !selectors_str.is_empty() {
31 selectors_str.push(',');
32 }
33 selectors_str.push_str(s);
34 }
35 if selectors_str.is_empty() {
37 return Ok(Vec::new());
38 }
39 let css_selector = Selector::parse(&selectors_str).map_err(|err| Error::SelectorParse {
41 selector: selectors_str.clone(),
42 cause: err.to_string(),
43 })?;
44
45 let html = Html::parse_document(html_content);
47
48 let mut els = Vec::new();
49 for element_ref in html.select(&css_selector) {
50 els.push(Elem::from_element_ref(element_ref));
51 }
52
53 Ok(els)
54}
55
#[cfg(test)]
mod tests {
    use super::*;

    // Shadows the glob-imported crate `Result` so tests can use `?` with any error type,
    // including plain `&str` messages.
    type Result<T> = core::result::Result<T, Box<dyn std::error::Error>>;

    /// A single tag selector and a single descendant selector return the expected elements
    /// with their tag, attributes, text, and inner HTML.
    #[test]
    fn test_selector_select_simple_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
            <!DOCTYPE html>
            <html>
            <head><title>Test</title></head>
            <body>
                <div id="main" class="container">
                    <h1>Title</h1>
                    <p>First paragraph.</p>
                    <p class="highlight">Second paragraph with <span>span text</span>.</p>
                    <ul>
                        <li>Item 1</li>
                        <li>Item 2</li>
                    </ul>
                </div>
            </body>
            </html>
        "#;

        // -- Exec
        let els_p = select(html_content, vec!["p"])?;

        // -- Check
        assert_eq!(els_p.len(), 2);

        assert_eq!(els_p[0].tag, "p");
        assert!(els_p[0].attrs.is_empty());
        assert_eq!(els_p[0].text.as_deref(), Some("First paragraph."));
        assert_eq!(els_p[0].inner_html.as_deref(), Some("First paragraph."));

        assert_eq!(els_p[1].tag, "p");
        assert_eq!(els_p[1].attrs.get("class").map(|s| s.as_str()), Some("highlight"));
        assert_eq!(els_p[1].text.as_deref(), Some("Second paragraph with span text."));
        assert_eq!(
            els_p[1].inner_html.as_deref(),
            Some("Second paragraph with <span>span text</span>.")
        );

        // -- Exec & Check (descendant selector)
        let els_span_in_p = select(html_content, ["p.highlight span"])?;
        assert_eq!(els_span_in_p.len(), 1);
        assert_eq!(els_span_in_p[0].tag, "span");
        assert_eq!(els_span_in_p[0].text.as_deref(), Some("span text"));
        assert_eq!(els_span_in_p[0].inner_html.as_deref(), Some("span text"));

        Ok(())
    }

    /// Several selectors are combined with OR semantics, and matches come back in document
    /// order regardless of the order in which the selectors were given.
    #[test]
    fn test_selector_select_multiple_selectors_or_logic() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
            <h1>Title 1</h1>
            <p>Paragraph 1</p>
            <h2>Title 2</h2>
            <div>Div content</div>
            <p>Paragraph 2</p>
        "#;

        // -- Exec ("h3" matches nothing and must not affect the result)
        let els = select(html_content, ["h1", "p", "h3"])?;

        // -- Check
        assert_eq!(els.len(), 3, "Should find one h1 and two p tags");
        assert_eq!(els[0].tag, "h1");
        assert_eq!(els[0].text.as_deref(), Some("Title 1"));
        assert_eq!(els[1].tag, "p");
        assert_eq!(els[1].text.as_deref(), Some("Paragraph 1"));
        assert_eq!(els[2].tag, "p");
        assert_eq!(els[2].text.as_deref(), Some("Paragraph 2"));

        // -- Exec & Check (selector order swapped)
        let els_reordered_selectors = select(html_content, ["p", "h1"])?;
        assert_eq!(els_reordered_selectors.len(), 3);
        assert_eq!(
            els_reordered_selectors[0].tag, "h1",
            "Order is document order, not selector order"
        );
        assert_eq!(els_reordered_selectors[1].tag, "p");
        assert_eq!(els_reordered_selectors[2].tag, "p");

        // -- Exec & Check (h2 precedes div in the document)
        let els_div_h2 = select(html_content, ["div", "h2"])?;
        assert_eq!(els_div_h2.len(), 2);
        assert_eq!(els_div_h2[0].tag, "h2");
        assert_eq!(els_div_h2[0].text.as_deref(), Some("Title 2"));
        assert_eq!(els_div_h2[1].tag, "div");
        assert_eq!(els_div_h2[1].text.as_deref(), Some("Div content"));

        Ok(())
    }

    /// `#id` and `.class` selectors each work as a single selector.
    #[test]
    fn test_selector_select_by_id_and_class_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
            <div id="unique">ID Content</div>
            <div class="group">Class Content 1</div>
            <span class="group">Class Content 2</span>
        "#;

        // -- Exec & Check (id)
        let els_id = select(html_content, ["#unique"])?;
        assert_eq!(els_id.len(), 1);
        assert_eq!(els_id[0].tag, "div");
        assert_eq!(els_id[0].attrs.get("id").map(|s| s.as_str()), Some("unique"));
        assert_eq!(els_id[0].text.as_deref(), Some("ID Content"));

        // -- Exec & Check (class)
        let els_class = select(html_content, [".group"])?;
        assert_eq!(els_class.len(), 2);
        assert_eq!(els_class[0].tag, "div");
        assert_eq!(els_class[0].text.as_deref(), Some("Class Content 1"));
        assert_eq!(els_class[1].tag, "span");
        assert_eq!(els_class[1].text.as_deref(), Some("Class Content 2"));

        Ok(())
    }

    /// Valid selectors that match nothing in the document yield an empty result, not an error.
    #[test]
    fn test_selector_select_empty_selector_single() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>No divs here</p>";

        // -- Exec & Check
        let els_div = select(html_content, ["div"])?;
        assert!(els_div.is_empty());

        let els_class = select(html_content, [".missing"])?;
        assert!(els_class.is_empty());

        let els_multiple_missing = select(html_content, ["div.foo", ".bar", "main"])?;
        assert!(els_multiple_missing.is_empty());

        Ok(())
    }

    /// When every supplied selector is an empty string, the result is an empty vector.
    #[test]
    fn test_selector_select_empty_selector_multiple() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p>";

        // -- Exec
        let res = select(html_content, ["", ""])?;

        // -- Check
        assert!(res.is_empty(), "Elem vector should be empty");

        Ok(())
    }

    /// Empty selectors mixed in with a real one are ignored; only the real one is applied.
    #[test]
    fn test_selector_select_empty_selector_mixed() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p><span> other content<span>";

        // -- Exec
        let res = select(html_content, ["", "p", ""])?;

        // -- Check
        assert_eq!(res.len(), 1);
        let el = res.first().ok_or("Should have one item")?;
        assert_eq!(&el.tag, "p");

        Ok(())
    }

    /// A syntactically invalid selector among valid ones makes the whole call fail, and the
    /// error message mentions that the selector is invalid.
    #[test]
    fn test_selector_select_invalid_selector_syntax() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p>";

        // -- Exec
        let res = select(html_content, ["p", "h1[", "div"]);

        // -- Check
        let Err(err) = res else {
            panic!("Should have been an error for invalid selector syntax")
        };
        let err_string = err.to_string();
        assert!(err_string.contains("is invalid"));

        Ok(())
    }

    /// An empty selector iterator is not an error: the call succeeds with no elements.
    #[test]
    fn test_selector_select_empty_iterator_returns_empty() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = "<p>Some content</p>";

        // -- Exec
        let res = select(html_content, Vec::<&str>::new())?;

        // -- Check
        assert!(res.is_empty(), "Elem vector should be empty");

        Ok(())
    }

    /// All attributes of a matched element are captured, `text` concatenates nested text,
    /// and `inner_html` keeps the nested markup.
    #[test]
    fn test_selector_select_attributes_and_inner_html_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content =
            r#"<a href="https://example.com" title="Test Link" class="external link">Click <b>here</b></a>"#;

        // -- Exec
        let snodes = select(html_content, ["a.link"])?;

        // -- Check
        assert_eq!(snodes.len(), 1);
        let node = &snodes[0];
        assert_eq!(node.tag, "a");
        assert_eq!(node.attrs.len(), 3);
        assert_eq!(node.attrs.get("href").map(|s| s.as_str()), Some("https://example.com"));
        assert_eq!(node.attrs.get("title").map(|s| s.as_str()), Some("Test Link"));
        assert_eq!(node.attrs.get("class").map(|s| s.as_str()), Some("external link"));

        assert_eq!(node.text.as_deref(), Some("Click here"));
        assert_eq!(node.inner_html.as_deref(), Some("Click <b>here</b>"));

        Ok(())
    }

    /// `text` and `inner_html` are trimmed at their outer edges, and whitespace-only content
    /// is reported as `None`.
    #[test]
    fn test_selector_select_text_and_inner_html_trimming_single_selector() -> Result<()> {
        // -- Setup & Fixtures
        let html_content = r#"
            <p> Trimmed text here </p>
            <div> <span> Inner </span> </div>
            <pre>
                Untrimmed
            </pre>
            <button> </button>
        "#;

        // -- Exec & Check (surrounding whitespace removed)
        let p_nodes = select(html_content, ["p"])?;
        assert_eq!(p_nodes.len(), 1);
        assert_eq!(p_nodes[0].text.as_deref(), Some("Trimmed text here"));
        assert_eq!(p_nodes[0].inner_html.as_deref(), Some("Trimmed text here"));

        // -- Exec & Check (only the outer edges of inner_html are trimmed)
        let div_nodes = select(html_content, ["div"])?;
        assert_eq!(div_nodes.len(), 1);
        assert_eq!(div_nodes[0].text.as_deref(), Some("Inner"));
        assert_eq!(div_nodes[0].inner_html.as_deref(), Some("<span> Inner </span>"));

        // -- Exec & Check (`pre` content is trimmed as well)
        let pre_nodes = select(html_content, ["pre"])?;
        assert_eq!(pre_nodes.len(), 1);
        assert_eq!(pre_nodes[0].text.as_deref(), Some("Untrimmed"));
        assert_eq!(pre_nodes[0].inner_html.as_deref(), Some("Untrimmed"));

        // -- Exec & Check (whitespace-only content becomes None)
        let button_nodes = select(html_content, ["button"])?;
        assert_eq!(button_nodes.len(), 1);
        assert_eq!(button_nodes[0].text, None);
        assert_eq!(button_nodes[0].inner_html, None);

        Ok(())
    }
}
333
334