1use crate::{
6 Result, Tag,
7 dom::{Document, NodeId, NodeKind},
8 parser::{Html5everParser, ParseConfig},
9 query::{
10 CompiledSelector, QueryResult, find, find_all, find_all_compiled, find_compiled,
11 select_attr, select_text,
12 },
13};
14
/// Options that control how HTML input is turned into a [`Soup`].
///
/// Obtain a value from [`SoupConfig::default`], from a struct literal, or
/// through the fluent [`SoupConfig::builder`]. Two configurations compare
/// equal when all four fields are equal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SoupConfig {
    /// Depth limit handed to the parser as `ParseConfig::max_depth`.
    /// Defaults to 512.
    pub max_depth: usize,
    /// Strict-mode flag. Defaults to `false`.
    ///
    /// NOTE(review): no parsing entry point visible in this file reads this
    /// field — confirm where (or whether) it takes effect.
    pub strict_mode: bool,
    /// Handed to the parser as `ParseConfig::preserve_whitespace`.
    /// Defaults to `false`.
    pub preserve_whitespace: bool,
    /// Handed to the parser as `ParseConfig::include_comments`.
    /// Defaults to `false`.
    pub include_comments: bool,
}

impl Default for SoupConfig {
    /// Returns the stock configuration: `max_depth` of 512 and every flag off.
    fn default() -> Self {
        Self {
            max_depth: 512,
            strict_mode: false,
            preserve_whitespace: false,
            include_comments: false,
        }
    }
}

impl SoupConfig {
    /// Starts a [`SoupConfigBuilder`] with no option set yet.
    #[must_use]
    pub fn builder() -> SoupConfigBuilder {
        SoupConfigBuilder::default()
    }
}

/// Fluent builder for [`SoupConfig`].
///
/// Every option starts out unset; options left unset when
/// [`SoupConfigBuilder::build`] is called take the value found in
/// [`SoupConfig::default`].
#[derive(Debug, Default)]
pub struct SoupConfigBuilder {
    max_depth: Option<usize>,
    strict_mode: Option<bool>,
    preserve_whitespace: Option<bool>,
    include_comments: Option<bool>,
}

impl SoupConfigBuilder {
    /// Overrides [`SoupConfig::max_depth`].
    #[must_use]
    pub fn max_depth(mut self, depth: usize) -> Self {
        self.max_depth = Some(depth);
        self
    }

    /// Overrides [`SoupConfig::strict_mode`].
    #[must_use]
    pub fn strict_mode(mut self, strict: bool) -> Self {
        self.strict_mode = Some(strict);
        self
    }

    /// Overrides [`SoupConfig::preserve_whitespace`].
    #[must_use]
    pub fn preserve_whitespace(mut self, preserve: bool) -> Self {
        self.preserve_whitespace = Some(preserve);
        self
    }

    /// Overrides [`SoupConfig::include_comments`].
    #[must_use]
    pub fn include_comments(mut self, include: bool) -> Self {
        self.include_comments = Some(include);
        self
    }

    /// Consumes the builder and produces the finished [`SoupConfig`].
    #[must_use]
    pub fn build(self) -> SoupConfig {
        // Fallbacks come from the `Default` impl rather than repeated
        // literals, so the builder and `SoupConfig::default()` cannot drift.
        let defaults = SoupConfig::default();
        SoupConfig {
            max_depth: self.max_depth.unwrap_or(defaults.max_depth),
            strict_mode: self.strict_mode.unwrap_or(defaults.strict_mode),
            preserve_whitespace: self
                .preserve_whitespace
                .unwrap_or(defaults.preserve_whitespace),
            include_comments: self.include_comments.unwrap_or(defaults.include_comments),
        }
    }
}
104
105#[derive(Debug)]
142pub struct Soup {
143 document: Document,
144 #[allow(dead_code)]
145 config: SoupConfig,
146}
147
148impl Soup {
149 #[must_use]
162 pub fn parse(html: &str) -> Self {
163 Self::parse_with_config(html, SoupConfig::default())
164 }
165
166 #[must_use]
177 pub fn parse_with_config(html: &str, config: SoupConfig) -> Self {
178 let parser = Html5everParser;
179 let parse_config = ParseConfig {
180 max_depth: config.max_depth,
181 preserve_whitespace: config.preserve_whitespace,
182 include_comments: config.include_comments,
183 };
184
185 let estimated_nodes = estimate_node_count(html.len());
186 let document = parser
187 .parse_with_config_and_capacity(html, &parse_config, estimated_nodes)
188 .unwrap_or_default();
189
190 Self { document, config }
191 }
192
193 #[must_use]
195 pub fn document(&self) -> &Document {
196 &self.document
197 }
198
199 pub fn from_file(path: &std::path::Path) -> Result<Self> {
215 let html = std::fs::read_to_string(path)?;
216 Ok(Self::parse(&html))
217 }
218
219 #[must_use]
234 pub fn parse_fragment(html: &str) -> Self {
235 Self::parse_fragment_with_context(html, "body")
236 }
237
238 #[must_use]
255 pub fn parse_fragment_with_context(html: &str, context: &str) -> Self {
256 Self::parse_fragment_with_config(html, context, SoupConfig::default())
257 }
258
259 #[must_use]
261 pub fn parse_fragment_with_config(html: &str, context: &str, config: SoupConfig) -> Self {
262 let parse_config = ParseConfig {
263 max_depth: config.max_depth,
264 preserve_whitespace: config.preserve_whitespace,
265 include_comments: config.include_comments,
266 };
267
268 let document = crate::parser::fragment::parse_fragment_impl(html, context, &parse_config)
269 .unwrap_or_default();
270
271 Self { document, config }
272 }
273
274 pub fn find(&self, selector: &str) -> QueryResult<Option<Tag<'_>>> {
292 find(&self.document, selector).map(|opt| opt.map(|id| Tag::new(&self.document, id)))
293 }
294
295 pub fn find_all(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
311 find_all(&self.document, selector)
312 .map(|ids| ids.into_iter().map(|id| Tag::new(&self.document, id)).collect())
313 }
314
315 pub fn select(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
334 self.find_all(selector)
335 }
336
337 #[must_use]
350 pub fn find_compiled(&self, selector: &CompiledSelector) -> Option<Tag<'_>> {
351 find_compiled(&self.document, selector).map(|id| Tag::new(&self.document, id))
352 }
353
354 #[must_use]
367 pub fn select_compiled(&self, selector: &CompiledSelector) -> Vec<Tag<'_>> {
368 find_all_compiled(&self.document, selector)
369 .into_iter()
370 .map(|id| Tag::new(&self.document, id))
371 .collect()
372 }
373
374 pub fn select_text(&self, selector: &str) -> QueryResult<Vec<String>> {
392 select_text(&self.document, selector)
393 }
394
395 pub fn select_attr(&self, selector: &str, attr: &str) -> QueryResult<Vec<Option<String>>> {
413 select_attr(&self.document, selector, attr)
414 }
415
416 #[must_use]
433 pub fn root(&self) -> Option<Tag<'_>> {
434 self.document.root().map(|id| Tag::new(&self.document, id))
435 }
436
437 #[must_use]
448 pub fn title(&self) -> Option<String> {
449 self.find("title").ok()?.map(|tag| tag.text())
450 }
451
452 #[must_use]
465 pub fn text(&self) -> String {
466 let Some(root) = self.document.root() else {
467 return String::new();
468 };
469 let mut result = String::new();
470 collect_text(&self.document, root, &mut result);
471 result
472 }
473
474 #[must_use]
487 pub fn to_html(&self) -> String {
488 self.root().map(|tag| tag.outer_html()).unwrap_or_default()
489 }
490}
491
492fn collect_text(doc: &Document, id: NodeId, buf: &mut String) {
494 let Some(node) = doc.get(id) else { return };
495
496 match &node.kind {
497 NodeKind::Text { content } => buf.push_str(content),
498 NodeKind::Element { .. } => {
499 for child_id in doc.children(id) {
500 collect_text(doc, child_id, buf);
501 }
502 }
503 NodeKind::Comment { .. } => {}
504 }
505}
506
/// Derives a node-capacity hint from the byte length of the HTML input.
///
/// Assumes one node per 50 bytes of input and never returns less than 256.
#[inline]
fn estimate_node_count(html_len: usize) -> usize {
    const BYTES_PER_NODE: usize = 50;
    const MIN_NODES: usize = 256;

    std::cmp::max(html_len / BYTES_PER_NODE, MIN_NODES)
}
515
#[cfg(test)]
mod tests {
    use super::*;

    // ---- configuration -------------------------------------------------

    /// The default configuration is depth 512 with every flag off.
    #[test]
    fn test_soup_config_default() {
        let SoupConfig {
            max_depth,
            strict_mode,
            preserve_whitespace,
            include_comments,
        } = SoupConfig::default();

        assert_eq!(max_depth, 512);
        assert!(!strict_mode);
        assert!(!preserve_whitespace);
        assert!(!include_comments);
    }

    /// Every builder setter is reflected in the built configuration.
    #[test]
    fn test_soup_config_builder() {
        let built = SoupConfig::builder()
            .max_depth(128)
            .strict_mode(true)
            .preserve_whitespace(true)
            .include_comments(true)
            .build();

        assert_eq!(built.max_depth, 128);
        assert!(built.strict_mode);
        assert!(built.preserve_whitespace);
        assert!(built.include_comments);
    }

    // ---- parsing -------------------------------------------------------

    #[test]
    fn test_soup_parse_creates_document() {
        assert!(
            Soup::parse("<html><body>Hello</body></html>")
                .document()
                .root()
                .is_some()
        );
    }

    #[test]
    fn test_soup_parse_empty_creates_empty_document() {
        assert!(Soup::parse("").document().is_empty());
    }

    #[test]
    fn test_soup_parse_with_config() {
        let soup = Soup::parse_with_config(
            "<div>Test</div>",
            SoupConfig::builder().max_depth(256).build(),
        );
        assert!(soup.document().root().is_some());
    }

    // ---- string selectors ----------------------------------------------

    #[test]
    fn test_soup_find() {
        let soup = Soup::parse(r#"<div><span class="item">text</span></div>"#);
        let tag = soup
            .find("span.item")
            .unwrap()
            .expect("span.item should match");
        assert_eq!(tag.name(), Some("span"));
    }

    #[test]
    fn test_soup_find_returns_none() {
        let soup = Soup::parse("<div>text</div>");
        assert!(soup.find("span").unwrap().is_none());
    }

    #[test]
    fn test_soup_find_invalid_selector() {
        let soup = Soup::parse("<div>text</div>");
        assert!(soup.find("[").is_err());
    }

    #[test]
    fn test_soup_find_all() {
        let soup = Soup::parse("<ul><li>A</li><li>B</li><li>C</li></ul>");
        assert_eq!(soup.find_all("li").unwrap().len(), 3);
    }

    #[test]
    fn test_soup_select() {
        let soup = Soup::parse(r#"<div class="a"><span class="b">text</span></div>"#);
        assert_eq!(soup.select("div.a > span.b").unwrap().len(), 1);
    }

    // ---- document-level accessors --------------------------------------

    #[test]
    fn test_soup_root() {
        let soup = Soup::parse("<html><body>text</body></html>");
        let root = soup.root().expect("parsed document should have a root");
        assert_eq!(root.name(), Some("html"));
    }

    #[test]
    fn test_soup_title() {
        let soup = Soup::parse("<html><head><title>Test Title</title></head></html>");
        assert_eq!(soup.title().as_deref(), Some("Test Title"));
    }

    #[test]
    fn test_soup_title_missing() {
        let soup = Soup::parse("<html><body>no title</body></html>");
        assert!(soup.title().is_none());
    }

    #[test]
    fn test_soup_text() {
        let text = Soup::parse("<div>Hello <b>World</b>!</div>").text();
        for needle in ["Hello", "World", "!"] {
            assert!(text.contains(needle));
        }
    }

    #[test]
    fn test_soup_to_html() {
        let html = Soup::parse("<div><span>text</span></div>").to_html();
        for needle in ["<div>", "<span>text</span>", "</div>"] {
            assert!(html.contains(needle));
        }
    }

    #[test]
    fn test_soup_empty_to_html() {
        assert!(Soup::parse("").to_html().is_empty());
    }

    // ---- selector flavours ---------------------------------------------

    #[test]
    fn test_soup_find_by_class() {
        let soup = Soup::parse(r#"<div class="foo bar">text</div>"#);
        assert!(soup.find(".foo").unwrap().is_some());
    }

    #[test]
    fn test_soup_find_by_id() {
        let soup = Soup::parse(r#"<div id="main">text</div>"#);
        assert!(soup.find("#main").unwrap().is_some());
    }

    #[test]
    fn test_soup_find_compound_selector() {
        let soup =
            Soup::parse(r#"<div class="foo" id="bar">text</div><div class="foo">other</div>"#);
        assert!(soup.find("div.foo#bar").unwrap().is_some());
    }

    #[test]
    fn test_soup_find_descendant() {
        let soup = Soup::parse("<div><ul><li>item</li></ul></div>");
        let tag = soup.find("div li").unwrap().expect("div li should match");
        assert_eq!(tag.name(), Some("li"));
    }

    #[test]
    fn test_soup_find_child_combinator() {
        let soup =
            Soup::parse("<div><span>direct</span></div><div><ul><span>nested</span></ul></div>");
        assert_eq!(soup.select("div > span").unwrap().len(), 1);
    }

    #[test]
    fn test_soup_find_with_attribute() {
        let soup = Soup::parse(r#"<input type="text"><input type="password">"#);
        assert!(soup.find(r#"input[type="text"]"#).unwrap().is_some());
    }

    // ---- compiled selectors --------------------------------------------

    #[test]
    fn test_soup_find_compiled() {
        use crate::query::CompiledSelector;

        let compiled = CompiledSelector::compile("div.item").unwrap();
        let soup = Soup::parse(r#"<div class="item">Text</div>"#);
        let tag = soup
            .find_compiled(&compiled)
            .expect("div.item should match");
        assert_eq!(tag.text(), "Text");
    }

    #[test]
    fn test_soup_select_compiled() {
        use crate::query::CompiledSelector;

        let compiled = CompiledSelector::compile("li").unwrap();
        let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
        assert_eq!(soup.select_compiled(&compiled).len(), 2);
    }

    /// One compiled selector can be applied to several documents.
    #[test]
    fn test_compiled_selector_reuse() {
        use crate::query::CompiledSelector;

        let compiled = CompiledSelector::compile("li").unwrap();

        let one_item = Soup::parse("<ul><li>A</li></ul>");
        let two_items = Soup::parse("<ul><li>X</li><li>Y</li></ul>");

        assert_eq!(one_item.select_compiled(&compiled).len(), 1);
        assert_eq!(two_items.select_compiled(&compiled).len(), 2);
    }

    // ---- capacity estimation -------------------------------------------

    #[test]
    fn test_estimate_node_count_minimum() {
        for len in [0, 10, 100, 256 * 50 - 1] {
            assert_eq!(estimate_node_count(len), 256);
        }
    }

    #[test]
    fn test_estimate_node_count_small() {
        for len in [1000, 5000] {
            assert_eq!(estimate_node_count(len), 256);
        }
    }

    #[test]
    fn test_estimate_node_count_medium() {
        for (len, expected) in [(15_000, 300), (25_000, 500), (50_000, 1000)] {
            assert_eq!(estimate_node_count(len), expected);
        }
    }

    #[test]
    fn test_estimate_node_count_large() {
        for (len, expected) in [(100_000, 2000), (500_000, 10_000), (1_000_000, 20_000)] {
            assert_eq!(estimate_node_count(len), expected);
        }
    }

    #[test]
    fn test_estimate_node_count_huge() {
        assert_eq!(estimate_node_count(10_000_000), 200_000);
    }
}