//! Main document container type.
//!
//! The [`Soup`] struct is the primary entry point for parsing and querying HTML documents.
5use crate::{
6    Result, Tag,
7    dom::{Document, NodeId, NodeKind},
8    parser::{Html5everParser, ParseConfig},
9    query::{
10        CompiledSelector, QueryResult, find, find_all, find_all_compiled, find_compiled,
11        select_attr, select_text,
12    },
13};
14
/// Configuration options for HTML parsing.
///
/// Construct via [`SoupConfig::builder`] or [`SoupConfig::default`].
///
/// # Examples
///
/// ```rust
/// use scrape_core::SoupConfig;
///
/// let config = SoupConfig::builder().max_depth(256).strict_mode(false).build();
/// ```
#[derive(Debug, Clone)]
pub struct SoupConfig {
    /// Maximum nesting depth for DOM tree (default: 512).
    pub max_depth: usize,
    /// Enable strict parsing mode (fail on malformed HTML).
    ///
    /// NOTE(review): this flag is not currently forwarded to the parser
    /// (`ParseConfig` has no equivalent field) — confirm intended behavior.
    pub strict_mode: bool,
    /// Whether to preserve whitespace-only text nodes (default: false).
    pub preserve_whitespace: bool,
    /// Whether to include comment nodes (default: false).
    pub include_comments: bool,
}
35
36impl Default for SoupConfig {
37    fn default() -> Self {
38        Self {
39            max_depth: 512,
40            strict_mode: false,
41            preserve_whitespace: false,
42            include_comments: false,
43        }
44    }
45}
46
47impl SoupConfig {
48    /// Creates a new configuration builder.
49    #[must_use]
50    pub fn builder() -> SoupConfigBuilder {
51        SoupConfigBuilder::default()
52    }
53}
54
/// Builder for [`SoupConfig`].
///
/// Each field is `None` until explicitly set; [`SoupConfigBuilder::build`]
/// substitutes the default value for any field left unset.
#[derive(Debug, Default)]
pub struct SoupConfigBuilder {
    // `None` means "use the default" when `build` is called.
    max_depth: Option<usize>,
    strict_mode: Option<bool>,
    preserve_whitespace: Option<bool>,
    include_comments: Option<bool>,
}
63
64impl SoupConfigBuilder {
65    /// Sets the maximum nesting depth.
66    #[must_use]
67    pub fn max_depth(mut self, depth: usize) -> Self {
68        self.max_depth = Some(depth);
69        self
70    }
71
72    /// Enables or disables strict parsing mode.
73    #[must_use]
74    pub fn strict_mode(mut self, strict: bool) -> Self {
75        self.strict_mode = Some(strict);
76        self
77    }
78
79    /// Enables or disables whitespace preservation.
80    #[must_use]
81    pub fn preserve_whitespace(mut self, preserve: bool) -> Self {
82        self.preserve_whitespace = Some(preserve);
83        self
84    }
85
86    /// Enables or disables comment inclusion.
87    #[must_use]
88    pub fn include_comments(mut self, include: bool) -> Self {
89        self.include_comments = Some(include);
90        self
91    }
92
93    /// Builds the configuration.
94    #[must_use]
95    pub fn build(self) -> SoupConfig {
96        SoupConfig {
97            max_depth: self.max_depth.unwrap_or(512),
98            strict_mode: self.strict_mode.unwrap_or(false),
99            preserve_whitespace: self.preserve_whitespace.unwrap_or(false),
100            include_comments: self.include_comments.unwrap_or(false),
101        }
102    }
103}
104
/// A parsed HTML document.
///
/// `Soup` is the main entry point for parsing and querying HTML documents.
/// It provides methods for finding elements by CSS selector or tag name.
///
/// # Examples
///
/// ## Basic Parsing
///
/// ```rust
/// use scrape_core::Soup;
///
/// let html = "<html><body><h1>Hello, World!</h1></body></html>";
/// let soup = Soup::parse(html);
///
/// if let Ok(Some(h1)) = soup.find("h1") {
///     assert_eq!(h1.text(), "Hello, World!");
/// }
/// ```
///
/// ## CSS Selectors
///
/// ```rust
/// use scrape_core::Soup;
///
/// let html = r#"
///     <div class="container">
///         <span class="item">One</span>
///         <span class="item">Two</span>
///     </div>
/// "#;
/// let soup = Soup::parse(html);
///
/// let items = soup.select("div.container > span.item").unwrap();
/// assert_eq!(items.len(), 2);
/// ```
#[derive(Debug)]
pub struct Soup {
    // Arena-style DOM produced by the parser; all queries read from this.
    document: Document,
    // Retained for introspection/future use; not read after construction
    // (hence the `dead_code` allow).
    #[allow(dead_code)]
    config: SoupConfig,
}
147
148impl Soup {
149    /// Parses an HTML string into a `Soup` document.
150    ///
151    /// This uses the default configuration. For custom configuration,
152    /// use [`Soup::parse_with_config`].
153    ///
154    /// # Examples
155    ///
156    /// ```rust
157    /// use scrape_core::Soup;
158    ///
159    /// let soup = Soup::parse("<html><body>Hello</body></html>");
160    /// ```
161    #[must_use]
162    pub fn parse(html: &str) -> Self {
163        Self::parse_with_config(html, SoupConfig::default())
164    }
165
166    /// Parses an HTML string with custom configuration.
167    ///
168    /// # Examples
169    ///
170    /// ```rust
171    /// use scrape_core::{Soup, SoupConfig};
172    ///
173    /// let config = SoupConfig::builder().max_depth(128).build();
174    /// let soup = Soup::parse_with_config("<html>...</html>", config);
175    /// ```
176    #[must_use]
177    pub fn parse_with_config(html: &str, config: SoupConfig) -> Self {
178        let parser = Html5everParser;
179        let parse_config = ParseConfig {
180            max_depth: config.max_depth,
181            preserve_whitespace: config.preserve_whitespace,
182            include_comments: config.include_comments,
183        };
184
185        let estimated_nodes = estimate_node_count(html.len());
186        let document = parser
187            .parse_with_config_and_capacity(html, &parse_config, estimated_nodes)
188            .unwrap_or_default();
189
190        Self { document, config }
191    }
192
193    /// Returns a reference to the underlying document.
194    #[must_use]
195    pub fn document(&self) -> &Document {
196        &self.document
197    }
198
199    /// Parses HTML from a file.
200    ///
201    /// # Errors
202    ///
203    /// Returns an error if the file cannot be read.
204    ///
205    /// # Examples
206    ///
207    /// ```rust,no_run
208    /// use std::path::Path;
209    ///
210    /// use scrape_core::Soup;
211    ///
212    /// let soup = Soup::from_file(Path::new("index.html")).unwrap();
213    /// ```
214    pub fn from_file(path: &std::path::Path) -> Result<Self> {
215        let html = std::fs::read_to_string(path)?;
216        Ok(Self::parse(&html))
217    }
218
219    /// Parses an HTML fragment without wrapping in html/body tags.
220    ///
221    /// Unlike [`Soup::parse`], this does not wrap content in `<html><body>` structure.
222    /// The fragment is parsed as if it appeared inside a `<body>` element.
223    ///
224    /// # Examples
225    ///
226    /// ```rust
227    /// use scrape_core::Soup;
228    ///
229    /// let soup = Soup::parse_fragment("<span>A</span><span>B</span>");
230    /// let spans = soup.find_all("span").unwrap();
231    /// assert_eq!(spans.len(), 2);
232    /// ```
233    #[must_use]
234    pub fn parse_fragment(html: &str) -> Self {
235        Self::parse_fragment_with_context(html, "body")
236    }
237
238    /// Parses an HTML fragment with a custom context element.
239    ///
240    /// The context element determines parsing behavior:
241    /// - `"body"`: Standard HTML elements (default)
242    /// - `"table"`: Allows tr/td without explicit tbody
243    /// - `"tbody"`: Allows tr directly
244    ///
245    /// # Examples
246    ///
247    /// ```rust
248    /// use scrape_core::Soup;
249    ///
250    /// let soup = Soup::parse_fragment_with_context("<tr><td>A</td></tr>", "tbody");
251    /// let tr = soup.find("tr").unwrap();
252    /// assert!(tr.is_some());
253    /// ```
254    #[must_use]
255    pub fn parse_fragment_with_context(html: &str, context: &str) -> Self {
256        Self::parse_fragment_with_config(html, context, SoupConfig::default())
257    }
258
259    /// Parses an HTML fragment with custom context and configuration.
260    #[must_use]
261    pub fn parse_fragment_with_config(html: &str, context: &str, config: SoupConfig) -> Self {
262        let parse_config = ParseConfig {
263            max_depth: config.max_depth,
264            preserve_whitespace: config.preserve_whitespace,
265            include_comments: config.include_comments,
266        };
267
268        let document = crate::parser::fragment::parse_fragment_impl(html, context, &parse_config)
269            .unwrap_or_default();
270
271        Self { document, config }
272    }
273
274    // ==================== Query Methods ====================
275
276    /// Finds the first element matching the given CSS selector.
277    ///
278    /// # Errors
279    ///
280    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
281    ///
282    /// # Examples
283    ///
284    /// ```rust
285    /// use scrape_core::Soup;
286    ///
287    /// let soup = Soup::parse("<div><span class=\"item\">Hello</span></div>");
288    /// let span = soup.find("span.item").unwrap().unwrap();
289    /// assert_eq!(span.text(), "Hello");
290    /// ```
291    pub fn find(&self, selector: &str) -> QueryResult<Option<Tag<'_>>> {
292        find(&self.document, selector).map(|opt| opt.map(|id| Tag::new(&self.document, id)))
293    }
294
295    /// Finds all elements matching the given CSS selector.
296    ///
297    /// # Errors
298    ///
299    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
300    ///
301    /// # Examples
302    ///
303    /// ```rust
304    /// use scrape_core::Soup;
305    ///
306    /// let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
307    /// let items = soup.find_all("li").unwrap();
308    /// assert_eq!(items.len(), 2);
309    /// ```
310    pub fn find_all(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
311        find_all(&self.document, selector)
312            .map(|ids| ids.into_iter().map(|id| Tag::new(&self.document, id)).collect())
313    }
314
315    /// Selects elements using a CSS selector.
316    ///
317    /// This is an alias for [`Soup::find_all`] for users familiar with
318    /// the CSS selector API.
319    ///
320    /// # Errors
321    ///
322    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
323    ///
324    /// # Examples
325    ///
326    /// ```rust
327    /// use scrape_core::Soup;
328    ///
329    /// let soup = Soup::parse("<div class=\"a\"><span class=\"b\">Text</span></div>");
330    /// let results = soup.select("div.a > span.b").unwrap();
331    /// assert_eq!(results.len(), 1);
332    /// ```
333    pub fn select(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
334        self.find_all(selector)
335    }
336
337    /// Finds the first element using a pre-compiled selector.
338    ///
339    /// # Examples
340    ///
341    /// ```rust
342    /// use scrape_core::{Soup, query::CompiledSelector};
343    ///
344    /// let selector = CompiledSelector::compile("div.item").unwrap();
345    /// let soup = Soup::parse("<div class=\"item\">Text</div>");
346    /// let result = soup.find_compiled(&selector);
347    /// assert!(result.is_some());
348    /// ```
349    #[must_use]
350    pub fn find_compiled(&self, selector: &CompiledSelector) -> Option<Tag<'_>> {
351        find_compiled(&self.document, selector).map(|id| Tag::new(&self.document, id))
352    }
353
354    /// Finds all elements using a pre-compiled selector.
355    ///
356    /// # Examples
357    ///
358    /// ```rust
359    /// use scrape_core::{Soup, query::CompiledSelector};
360    ///
361    /// let selector = CompiledSelector::compile("li").unwrap();
362    /// let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
363    /// let items = soup.select_compiled(&selector);
364    /// assert_eq!(items.len(), 2);
365    /// ```
366    #[must_use]
367    pub fn select_compiled(&self, selector: &CompiledSelector) -> Vec<Tag<'_>> {
368        find_all_compiled(&self.document, selector)
369            .into_iter()
370            .map(|id| Tag::new(&self.document, id))
371            .collect()
372    }
373
374    /// Extracts text content from all elements matching a CSS selector.
375    ///
376    /// Returns the concatenated text content of each matching element.
377    ///
378    /// # Errors
379    ///
380    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
381    ///
382    /// # Examples
383    ///
384    /// ```rust
385    /// use scrape_core::Soup;
386    ///
387    /// let soup = Soup::parse("<ul><li>First</li><li>Second</li></ul>");
388    /// let texts = soup.select_text("li").unwrap();
389    /// assert_eq!(texts, vec!["First", "Second"]);
390    /// ```
391    pub fn select_text(&self, selector: &str) -> QueryResult<Vec<String>> {
392        select_text(&self.document, selector)
393    }
394
395    /// Extracts attribute values from all elements matching a CSS selector.
396    ///
397    /// Returns `Some(value)` if the attribute exists, `None` if it doesn't.
398    ///
399    /// # Errors
400    ///
401    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
402    ///
403    /// # Examples
404    ///
405    /// ```rust
406    /// use scrape_core::Soup;
407    ///
408    /// let soup = Soup::parse("<a href='/a'>A</a><a>B</a>");
409    /// let hrefs = soup.select_attr("a", "href").unwrap();
410    /// assert_eq!(hrefs, vec![Some("/a".to_string()), None]);
411    /// ```
412    pub fn select_attr(&self, selector: &str, attr: &str) -> QueryResult<Vec<Option<String>>> {
413        select_attr(&self.document, selector, attr)
414    }
415
416    // ==================== Document Methods ====================
417
418    /// Returns the root element of the document.
419    ///
420    /// This is typically the `<html>` element.
421    ///
422    /// # Examples
423    ///
424    /// ```rust
425    /// use scrape_core::Soup;
426    ///
427    /// let soup = Soup::parse("<html><body>text</body></html>");
428    /// if let Some(root) = soup.root() {
429    ///     assert_eq!(root.name(), Some("html"));
430    /// }
431    /// ```
432    #[must_use]
433    pub fn root(&self) -> Option<Tag<'_>> {
434        self.document.root().map(|id| Tag::new(&self.document, id))
435    }
436
437    /// Returns the document's title, if present.
438    ///
439    /// # Examples
440    ///
441    /// ```rust
442    /// use scrape_core::Soup;
443    ///
444    /// let soup = Soup::parse("<html><head><title>My Page</title></head></html>");
445    /// assert_eq!(soup.title(), Some("My Page".to_string()));
446    /// ```
447    #[must_use]
448    pub fn title(&self) -> Option<String> {
449        self.find("title").ok()?.map(|tag| tag.text())
450    }
451
452    /// Returns the document's text content with tags stripped.
453    ///
454    /// # Examples
455    ///
456    /// ```rust
457    /// use scrape_core::Soup;
458    ///
459    /// let soup = Soup::parse("<div>Hello <b>World</b></div>");
460    /// let text = soup.text();
461    /// assert!(text.contains("Hello"));
462    /// assert!(text.contains("World"));
463    /// ```
464    #[must_use]
465    pub fn text(&self) -> String {
466        let Some(root) = self.document.root() else {
467            return String::new();
468        };
469        let mut result = String::new();
470        collect_text(&self.document, root, &mut result);
471        result
472    }
473
474    /// Returns the document as an HTML string.
475    ///
476    /// # Examples
477    ///
478    /// ```rust
479    /// use scrape_core::Soup;
480    ///
481    /// let soup = Soup::parse("<div><span>text</span></div>");
482    /// let html = soup.to_html();
483    /// assert!(html.contains("<div>"));
484    /// assert!(html.contains("<span>"));
485    /// ```
486    #[must_use]
487    pub fn to_html(&self) -> String {
488        self.root().map(|tag| tag.outer_html()).unwrap_or_default()
489    }
490}
491
492/// Recursively collects text content from a subtree.
493fn collect_text(doc: &Document, id: NodeId, buf: &mut String) {
494    let Some(node) = doc.get(id) else { return };
495
496    match &node.kind {
497        NodeKind::Text { content } => buf.push_str(content),
498        NodeKind::Element { .. } => {
499            for child_id in doc.children(id) {
500                collect_text(doc, child_id, buf);
501            }
502        }
503        NodeKind::Comment { .. } => {}
504    }
505}
506
/// Estimates the number of nodes in the document based on HTML size.
///
/// Heuristic: roughly one node per 50 bytes of HTML, clamped to a floor of
/// 256 nodes so small documents still get a reasonable initial capacity.
#[inline]
fn estimate_node_count(html_len: usize) -> usize {
    const BYTES_PER_NODE: usize = 50;
    const MIN_NODES: usize = 256;
    std::cmp::max(html_len / BYTES_PER_NODE, MIN_NODES)
}
515
#[cfg(test)]
mod tests {
    // Unit tests cover configuration, parsing, querying (string and
    // compiled selectors), document accessors, and the capacity heuristic.
    use super::*;

    // ---- configuration ----

    #[test]
    fn test_soup_config_default() {
        let config = SoupConfig::default();
        assert_eq!(config.max_depth, 512);
        assert!(!config.strict_mode);
        assert!(!config.preserve_whitespace);
        assert!(!config.include_comments);
    }

    #[test]
    fn test_soup_config_builder() {
        let config = SoupConfig::builder()
            .max_depth(128)
            .strict_mode(true)
            .preserve_whitespace(true)
            .include_comments(true)
            .build();
        assert_eq!(config.max_depth, 128);
        assert!(config.strict_mode);
        assert!(config.preserve_whitespace);
        assert!(config.include_comments);
    }

    // ---- parsing ----

    #[test]
    fn test_soup_parse_creates_document() {
        let soup = Soup::parse("<html><body>Hello</body></html>");
        assert!(soup.document().root().is_some());
    }

    #[test]
    fn test_soup_parse_empty_creates_empty_document() {
        let soup = Soup::parse("");
        assert!(soup.document().is_empty());
    }

    #[test]
    fn test_soup_parse_with_config() {
        let config = SoupConfig::builder().max_depth(256).build();
        let soup = Soup::parse_with_config("<div>Test</div>", config);
        assert!(soup.document().root().is_some());
    }

    // ---- string-selector queries ----

    #[test]
    fn test_soup_find() {
        let soup = Soup::parse("<div><span class=\"item\">text</span></div>");
        let result = soup.find("span.item").unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap().name(), Some("span"));
    }

    #[test]
    fn test_soup_find_returns_none() {
        let soup = Soup::parse("<div>text</div>");
        let result = soup.find("span").unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_soup_find_invalid_selector() {
        // "[" is syntactically invalid CSS and must surface as an Err.
        let soup = Soup::parse("<div>text</div>");
        let result = soup.find("[");
        assert!(result.is_err());
    }

    #[test]
    fn test_soup_find_all() {
        let soup = Soup::parse("<ul><li>A</li><li>B</li><li>C</li></ul>");
        let items = soup.find_all("li").unwrap();
        assert_eq!(items.len(), 3);
    }

    #[test]
    fn test_soup_select() {
        let soup = Soup::parse("<div class=\"a\"><span class=\"b\">text</span></div>");
        let results = soup.select("div.a > span.b").unwrap();
        assert_eq!(results.len(), 1);
    }

    // ---- document accessors ----

    #[test]
    fn test_soup_root() {
        let soup = Soup::parse("<html><body>text</body></html>");
        let root = soup.root();
        assert!(root.is_some());
        assert_eq!(root.unwrap().name(), Some("html"));
    }

    #[test]
    fn test_soup_title() {
        let soup = Soup::parse("<html><head><title>Test Title</title></head></html>");
        assert_eq!(soup.title(), Some("Test Title".to_string()));
    }

    #[test]
    fn test_soup_title_missing() {
        let soup = Soup::parse("<html><body>no title</body></html>");
        assert_eq!(soup.title(), None);
    }

    #[test]
    fn test_soup_text() {
        let soup = Soup::parse("<div>Hello <b>World</b>!</div>");
        let text = soup.text();
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(text.contains('!'));
    }

    #[test]
    fn test_soup_to_html() {
        let soup = Soup::parse("<div><span>text</span></div>");
        let html = soup.to_html();
        assert!(html.contains("<div>"));
        assert!(html.contains("<span>text</span>"));
        assert!(html.contains("</div>"));
    }

    #[test]
    fn test_soup_empty_to_html() {
        let soup = Soup::parse("");
        let html = soup.to_html();
        assert!(html.is_empty());
    }

    // ---- selector variants ----

    #[test]
    fn test_soup_find_by_class() {
        let soup = Soup::parse("<div class=\"foo bar\">text</div>");
        let result = soup.find(".foo").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_by_id() {
        let soup = Soup::parse("<div id=\"main\">text</div>");
        let result = soup.find("#main").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_compound_selector() {
        let soup =
            Soup::parse("<div class=\"foo\" id=\"bar\">text</div><div class=\"foo\">other</div>");
        let result = soup.find("div.foo#bar").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_descendant() {
        let soup = Soup::parse("<div><ul><li>item</li></ul></div>");
        let result = soup.find("div li").unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap().name(), Some("li"));
    }

    #[test]
    fn test_soup_find_child_combinator() {
        // Only the direct-child span should match `div > span`.
        let soup =
            Soup::parse("<div><span>direct</span></div><div><ul><span>nested</span></ul></div>");
        let results = soup.select("div > span").unwrap();
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_soup_find_with_attribute() {
        let soup = Soup::parse("<input type=\"text\"><input type=\"password\">");
        let result = soup.find("input[type=\"text\"]").unwrap();
        assert!(result.is_some());
    }

    // ---- compiled selectors ----

    #[test]
    fn test_soup_find_compiled() {
        use crate::query::CompiledSelector;

        let selector = CompiledSelector::compile("div.item").unwrap();
        let soup = Soup::parse("<div class=\"item\">Text</div>");
        let result = soup.find_compiled(&selector);
        assert!(result.is_some());
        assert_eq!(result.unwrap().text(), "Text");
    }

    #[test]
    fn test_soup_select_compiled() {
        use crate::query::CompiledSelector;

        let selector = CompiledSelector::compile("li").unwrap();
        let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
        let items = soup.select_compiled(&selector);
        assert_eq!(items.len(), 2);
    }

    #[test]
    fn test_compiled_selector_reuse() {
        use crate::query::CompiledSelector;

        // A compiled selector must be usable against multiple documents.
        let selector = CompiledSelector::compile("li").unwrap();

        let soup1 = Soup::parse("<ul><li>A</li></ul>");
        let soup2 = Soup::parse("<ul><li>X</li><li>Y</li></ul>");

        assert_eq!(soup1.select_compiled(&selector).len(), 1);
        assert_eq!(soup2.select_compiled(&selector).len(), 2);
    }

    // ---- capacity heuristic ----

    #[test]
    fn test_estimate_node_count_minimum() {
        assert_eq!(estimate_node_count(0), 256);
        assert_eq!(estimate_node_count(10), 256);
        assert_eq!(estimate_node_count(100), 256);
        assert_eq!(estimate_node_count(256 * 50 - 1), 256);
    }

    #[test]
    fn test_estimate_node_count_small() {
        assert_eq!(estimate_node_count(1000), 256);
        assert_eq!(estimate_node_count(5000), 256);
    }

    #[test]
    fn test_estimate_node_count_medium() {
        assert_eq!(estimate_node_count(15_000), 300);
        assert_eq!(estimate_node_count(25_000), 500);
        assert_eq!(estimate_node_count(50_000), 1000);
    }

    #[test]
    fn test_estimate_node_count_large() {
        assert_eq!(estimate_node_count(100_000), 2000);
        assert_eq!(estimate_node_count(500_000), 10_000);
        assert_eq!(estimate_node_count(1_000_000), 20_000);
    }

    #[test]
    fn test_estimate_node_count_huge() {
        assert_eq!(estimate_node_count(10_000_000), 200_000);
    }
}