1use crate::{
6 Result, Tag,
7 dom::{Document, NodeId, NodeKind},
8 parser::{Html5everParser, ParseConfig, Parser},
9 query::{QueryResult, find, find_all},
10};
11
/// Options that control how [`Soup`] parses a document.
///
/// Use [`SoupConfig::default`] for the baseline settings, or
/// [`SoupConfig::builder`] to override individual options.
///
/// The type is comparable (`PartialEq`/`Eq`) so callers and tests can check
/// two configurations for equality directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SoupConfig {
    /// Nesting-depth limit; forwarded unchanged to the parser configuration.
    pub max_depth: usize,
    /// Strict-parsing flag.
    ///
    /// NOTE(review): nothing in this file reads this flag — confirm where (or
    /// whether) it is consumed.
    pub strict_mode: bool,
    /// Whitespace-preservation flag; forwarded unchanged to the parser
    /// configuration.
    pub preserve_whitespace: bool,
    /// Comment-retention flag; forwarded unchanged to the parser
    /// configuration.
    pub include_comments: bool,
}
32
33impl Default for SoupConfig {
34 fn default() -> Self {
35 Self {
36 max_depth: 512,
37 strict_mode: false,
38 preserve_whitespace: false,
39 include_comments: false,
40 }
41 }
42}
43
44impl SoupConfig {
45 #[must_use]
47 pub fn builder() -> SoupConfigBuilder {
48 SoupConfigBuilder::default()
49 }
50}
51
/// Fluent builder for [`SoupConfig`].
///
/// Every field starts as `None` and is filled in by the setter of the same
/// name; [`SoupConfigBuilder::build`] substitutes a fallback for anything
/// left unset.
///
/// The builder is `Clone`, so a partially configured instance can be kept as
/// a template and finished several different ways.
#[derive(Debug, Clone, Default)]
pub struct SoupConfigBuilder {
    /// Override for `SoupConfig::max_depth`, if one was supplied.
    max_depth: Option<usize>,
    /// Override for `SoupConfig::strict_mode`, if one was supplied.
    strict_mode: Option<bool>,
    /// Override for `SoupConfig::preserve_whitespace`, if one was supplied.
    preserve_whitespace: Option<bool>,
    /// Override for `SoupConfig::include_comments`, if one was supplied.
    include_comments: Option<bool>,
}
60
61impl SoupConfigBuilder {
62 #[must_use]
64 pub fn max_depth(mut self, depth: usize) -> Self {
65 self.max_depth = Some(depth);
66 self
67 }
68
69 #[must_use]
71 pub fn strict_mode(mut self, strict: bool) -> Self {
72 self.strict_mode = Some(strict);
73 self
74 }
75
76 #[must_use]
78 pub fn preserve_whitespace(mut self, preserve: bool) -> Self {
79 self.preserve_whitespace = Some(preserve);
80 self
81 }
82
83 #[must_use]
85 pub fn include_comments(mut self, include: bool) -> Self {
86 self.include_comments = Some(include);
87 self
88 }
89
90 #[must_use]
92 pub fn build(self) -> SoupConfig {
93 SoupConfig {
94 max_depth: self.max_depth.unwrap_or(512),
95 strict_mode: self.strict_mode.unwrap_or(false),
96 preserve_whitespace: self.preserve_whitespace.unwrap_or(false),
97 include_comments: self.include_comments.unwrap_or(false),
98 }
99 }
100}
101
102#[derive(Debug)]
139pub struct Soup {
140 document: Document,
141 #[allow(dead_code)]
142 config: SoupConfig,
143}
144
145impl Soup {
146 #[must_use]
159 pub fn parse(html: &str) -> Self {
160 Self::parse_with_config(html, SoupConfig::default())
161 }
162
163 #[must_use]
174 pub fn parse_with_config(html: &str, config: SoupConfig) -> Self {
175 let parser = Html5everParser;
176 let parse_config = ParseConfig {
177 max_depth: config.max_depth,
178 preserve_whitespace: config.preserve_whitespace,
179 include_comments: config.include_comments,
180 };
181
182 let document =
183 parser.parse_with_config(html, &parse_config).unwrap_or_else(|_| Document::new());
184
185 Self { document, config }
186 }
187
188 #[must_use]
190 pub fn document(&self) -> &Document {
191 &self.document
192 }
193
194 pub fn from_file(path: &std::path::Path) -> Result<Self> {
210 let html = std::fs::read_to_string(path)?;
211 Ok(Self::parse(&html))
212 }
213
214 pub fn find(&self, selector: &str) -> QueryResult<Option<Tag<'_>>> {
232 find(&self.document, selector).map(|opt| opt.map(|id| Tag::new(&self.document, id)))
233 }
234
235 pub fn find_all(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
251 find_all(&self.document, selector)
252 .map(|ids| ids.into_iter().map(|id| Tag::new(&self.document, id)).collect())
253 }
254
255 pub fn select(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
274 self.find_all(selector)
275 }
276
277 #[must_use]
294 pub fn root(&self) -> Option<Tag<'_>> {
295 self.document.root().map(|id| Tag::new(&self.document, id))
296 }
297
298 #[must_use]
309 pub fn title(&self) -> Option<String> {
310 self.find("title").ok()?.map(|tag| tag.text())
311 }
312
313 #[must_use]
326 pub fn text(&self) -> String {
327 let Some(root) = self.document.root() else {
328 return String::new();
329 };
330 let mut result = String::new();
331 collect_text(&self.document, root, &mut result);
332 result
333 }
334
335 #[must_use]
348 pub fn to_html(&self) -> String {
349 self.root().map(|tag| tag.outer_html()).unwrap_or_default()
350 }
351}
352
353fn collect_text(doc: &Document, id: NodeId, buf: &mut String) {
355 let Some(node) = doc.get(id) else { return };
356
357 match &node.kind {
358 NodeKind::Text { content } => buf.push_str(content),
359 NodeKind::Element { .. } => {
360 for child_id in doc.children(id) {
361 collect_text(doc, child_id, buf);
362 }
363 }
364 NodeKind::Comment { .. } => {}
365 }
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    // ---- SoupConfig / SoupConfigBuilder ----

    #[test]
    fn test_soup_config_default() {
        let config = SoupConfig::default();
        assert_eq!(config.max_depth, 512);
        assert!(!config.strict_mode);
        assert!(!config.preserve_whitespace);
        assert!(!config.include_comments);
    }

    #[test]
    fn test_soup_config_builder() {
        let config = SoupConfig::builder()
            .max_depth(128)
            .strict_mode(true)
            .preserve_whitespace(true)
            .include_comments(true)
            .build();
        assert_eq!(config.max_depth, 128);
        assert!(config.strict_mode);
        assert!(config.preserve_whitespace);
        assert!(config.include_comments);
    }

    // A builder with nothing set must agree with `Default`, field by field.
    #[test]
    fn test_soup_config_builder_unset_matches_default() {
        let built = SoupConfig::builder().build();
        let defaults = SoupConfig::default();
        assert_eq!(built.max_depth, defaults.max_depth);
        assert_eq!(built.strict_mode, defaults.strict_mode);
        assert_eq!(built.preserve_whitespace, defaults.preserve_whitespace);
        assert_eq!(built.include_comments, defaults.include_comments);
    }

    // Setting one option must leave the others at their defaults.
    #[test]
    fn test_soup_config_builder_partial_override() {
        let config = SoupConfig::builder().include_comments(true).build();
        let defaults = SoupConfig::default();
        assert!(config.include_comments);
        assert_eq!(config.max_depth, defaults.max_depth);
        assert_eq!(config.strict_mode, defaults.strict_mode);
        assert_eq!(config.preserve_whitespace, defaults.preserve_whitespace);
    }

    // ---- parsing ----

    #[test]
    fn test_soup_parse_creates_document() {
        let soup = Soup::parse("<html><body>Hello</body></html>");
        assert!(soup.document().root().is_some());
    }

    #[test]
    fn test_soup_parse_empty_creates_empty_document() {
        let soup = Soup::parse("");
        assert!(soup.document().is_empty());
    }

    #[test]
    fn test_soup_parse_with_config() {
        let config = SoupConfig::builder().max_depth(256).build();
        let soup = Soup::parse_with_config("<div>Test</div>", config);
        assert!(soup.document().root().is_some());
    }

    // Reading a path that does not exist must surface as an error, not a panic.
    #[test]
    fn test_soup_from_file_missing_path() {
        let missing = std::env::temp_dir().join("soup-tests-no-such-file-7f3a9c.html");
        assert!(Soup::from_file(&missing).is_err());
    }

    // ---- querying ----

    #[test]
    fn test_soup_find() {
        let soup = Soup::parse("<div><span class=\"item\">text</span></div>");
        let tag = soup.find("span.item").unwrap().expect("span.item should match");
        assert_eq!(tag.name(), Some("span"));
    }

    #[test]
    fn test_soup_find_returns_none() {
        let soup = Soup::parse("<div>text</div>");
        let result = soup.find("span").unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_soup_find_invalid_selector() {
        let soup = Soup::parse("<div>text</div>");
        let result = soup.find("[");
        assert!(result.is_err());
    }

    #[test]
    fn test_soup_find_all() {
        let soup = Soup::parse("<ul><li>A</li><li>B</li><li>C</li></ul>");
        let items = soup.find_all("li").unwrap();
        assert_eq!(items.len(), 3);
    }

    #[test]
    fn test_soup_select() {
        let soup = Soup::parse("<div class=\"a\"><span class=\"b\">text</span></div>");
        let results = soup.select("div.a > span.b").unwrap();
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_soup_root() {
        let soup = Soup::parse("<html><body>text</body></html>");
        let root = soup.root().expect("parsed document should have a root");
        assert_eq!(root.name(), Some("html"));
    }

    // ---- document-level accessors ----

    #[test]
    fn test_soup_title() {
        let soup = Soup::parse("<html><head><title>Test Title</title></head></html>");
        assert_eq!(soup.title(), Some("Test Title".to_string()));
    }

    #[test]
    fn test_soup_title_missing() {
        let soup = Soup::parse("<html><body>no title</body></html>");
        assert_eq!(soup.title(), None);
    }

    #[test]
    fn test_soup_text() {
        let soup = Soup::parse("<div>Hello <b>World</b>!</div>");
        let text = soup.text();
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(text.contains('!'));
    }

    // An empty document has nothing to collect text from.
    #[test]
    fn test_soup_empty_text() {
        let soup = Soup::parse("");
        assert!(soup.text().is_empty());
    }

    #[test]
    fn test_soup_to_html() {
        let soup = Soup::parse("<div><span>text</span></div>");
        let html = soup.to_html();
        assert!(html.contains("<div>"));
        assert!(html.contains("<span>text</span>"));
        assert!(html.contains("</div>"));
    }

    #[test]
    fn test_soup_empty_to_html() {
        let soup = Soup::parse("");
        let html = soup.to_html();
        assert!(html.is_empty());
    }

    // ---- selector flavours ----

    #[test]
    fn test_soup_find_by_class() {
        let soup = Soup::parse("<div class=\"foo bar\">text</div>");
        let result = soup.find(".foo").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_by_id() {
        let soup = Soup::parse("<div id=\"main\">text</div>");
        let result = soup.find("#main").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_compound_selector() {
        let soup =
            Soup::parse("<div class=\"foo\" id=\"bar\">text</div><div class=\"foo\">other</div>");
        let result = soup.find("div.foo#bar").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_descendant() {
        let soup = Soup::parse("<div><ul><li>item</li></ul></div>");
        let tag = soup.find("div li").unwrap().expect("descendant selector should match");
        assert_eq!(tag.name(), Some("li"));
    }

    #[test]
    fn test_soup_find_child_combinator() {
        let soup =
            Soup::parse("<div><span>direct</span></div><div><ul><span>nested</span></ul></div>");
        let results = soup.select("div > span").unwrap();
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_soup_find_with_attribute() {
        let soup = Soup::parse("<input type=\"text\"><input type=\"password\">");
        let result = soup.find("input[type=\"text\"]").unwrap();
        assert!(result.is_some());
    }
}