scrape_core/soup.rs

//! Main document container type.
//!
//! The [`Soup`] struct is the primary entry point for parsing and querying HTML documents.
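//!
//! A minimal round trip, as a sketch that sticks to the APIs documented in this
//! module ([`Soup::parse`], [`Soup::find`], and `Tag::text`):
//!
//! ```rust
//! use scrape_core::Soup;
//!
//! let soup = Soup::parse("<html><body><p class=\"greeting\">Hi</p></body></html>");
//! if let Ok(Some(p)) = soup.find("p.greeting") {
//!     assert_eq!(p.text(), "Hi");
//! }
//! ```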

use crate::{
    Result, Tag,
    dom::{Document, NodeId, NodeKind},
    parser::{Html5everParser, ParseConfig, Parser},
    query::{QueryResult, find, find_all},
};

/// Configuration options for HTML parsing.
///
/// # Examples
///
/// ```rust
/// use scrape_core::SoupConfig;
///
/// let config = SoupConfig::builder().max_depth(256).strict_mode(false).build();
/// ```
#[derive(Debug, Clone)]
pub struct SoupConfig {
    /// Maximum nesting depth for the DOM tree.
    pub max_depth: usize,
    /// Whether to enable strict parsing mode (fail on malformed HTML).
    pub strict_mode: bool,
    /// Whether to preserve whitespace-only text nodes.
    pub preserve_whitespace: bool,
    /// Whether to include comment nodes.
    pub include_comments: bool,
}

impl Default for SoupConfig {
    fn default() -> Self {
        Self {
            max_depth: 512,
            strict_mode: false,
            preserve_whitespace: false,
            include_comments: false,
        }
    }
}

impl SoupConfig {
    /// Creates a new configuration builder.
    #[must_use]
    pub fn builder() -> SoupConfigBuilder {
        SoupConfigBuilder::default()
    }
}

/// Builder for [`SoupConfig`].
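///
/// Options left unset fall back to the same values as `SoupConfig::default()`;
/// a short sketch:
///
/// ```rust
/// use scrape_core::SoupConfig;
///
/// // Only strict_mode is overridden here; the other fields keep their defaults.
/// let config = SoupConfig::builder().strict_mode(true).build();
/// assert_eq!(config.max_depth, 512);
/// assert!(config.strict_mode);
/// assert!(!config.preserve_whitespace);
/// ```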
#[derive(Debug, Default)]
pub struct SoupConfigBuilder {
    max_depth: Option<usize>,
    strict_mode: Option<bool>,
    preserve_whitespace: Option<bool>,
    include_comments: Option<bool>,
}

impl SoupConfigBuilder {
    /// Sets the maximum nesting depth.
    #[must_use]
    pub fn max_depth(mut self, depth: usize) -> Self {
        self.max_depth = Some(depth);
        self
    }

    /// Enables or disables strict parsing mode.
    #[must_use]
    pub fn strict_mode(mut self, strict: bool) -> Self {
        self.strict_mode = Some(strict);
        self
    }

    /// Enables or disables whitespace preservation.
    #[must_use]
    pub fn preserve_whitespace(mut self, preserve: bool) -> Self {
        self.preserve_whitespace = Some(preserve);
        self
    }

    /// Enables or disables comment inclusion.
    #[must_use]
    pub fn include_comments(mut self, include: bool) -> Self {
        self.include_comments = Some(include);
        self
    }

    /// Builds the configuration.
    #[must_use]
    pub fn build(self) -> SoupConfig {
        SoupConfig {
            max_depth: self.max_depth.unwrap_or(512),
            strict_mode: self.strict_mode.unwrap_or(false),
            preserve_whitespace: self.preserve_whitespace.unwrap_or(false),
            include_comments: self.include_comments.unwrap_or(false),
        }
    }
}

/// A parsed HTML document.
///
/// `Soup` is the main entry point for parsing and querying HTML documents.
/// It provides methods for finding elements by CSS selector or tag name.
///
/// # Examples
///
/// ## Basic Parsing
///
/// ```rust
/// use scrape_core::Soup;
///
/// let html = "<html><body><h1>Hello, World!</h1></body></html>";
/// let soup = Soup::parse(html);
///
/// if let Ok(Some(h1)) = soup.find("h1") {
///     assert_eq!(h1.text(), "Hello, World!");
/// }
/// ```
///
/// ## CSS Selectors
///
/// ```rust
/// use scrape_core::Soup;
///
/// let html = r#"
///     <div class="container">
///         <span class="item">One</span>
///         <span class="item">Two</span>
///     </div>
/// "#;
/// let soup = Soup::parse(html);
///
/// let items = soup.select("div.container > span.item").unwrap();
/// assert_eq!(items.len(), 2);
/// ```
#[derive(Debug)]
pub struct Soup {
    document: Document,
    #[allow(dead_code)]
    config: SoupConfig,
}

impl Soup {
    /// Parses an HTML string into a `Soup` document.
    ///
    /// This uses the default configuration. For custom configuration,
    /// use [`Soup::parse_with_config`].
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<html><body>Hello</body></html>");
    /// ```
    #[must_use]
    pub fn parse(html: &str) -> Self {
        Self::parse_with_config(html, SoupConfig::default())
    }

    /// Parses an HTML string with custom configuration.
    ///
    /// If the underlying parser fails, the resulting `Soup` wraps an empty
    /// document instead of returning an error.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::{Soup, SoupConfig};
    ///
    /// let config = SoupConfig::builder().max_depth(128).build();
    /// let soup = Soup::parse_with_config("<html>...</html>", config);
    /// ```
    #[must_use]
    pub fn parse_with_config(html: &str, config: SoupConfig) -> Self {
        let parser = Html5everParser;
        let parse_config = ParseConfig {
            max_depth: config.max_depth,
            preserve_whitespace: config.preserve_whitespace,
            include_comments: config.include_comments,
        };

        let document =
            parser.parse_with_config(html, &parse_config).unwrap_or_else(|_| Document::new());

        Self { document, config }
    }

    /// Returns a reference to the underlying document.
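    ///
    /// A small sketch; it only relies on `Document::root`, which this module
    /// already uses elsewhere:
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<p>hi</p>");
    /// assert!(soup.document().root().is_some());
    /// ```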
    #[must_use]
    pub fn document(&self) -> &Document {
        &self.document
    }

    /// Parses HTML from a file.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read.
    ///
    /// # Examples
    ///
    /// ```rust,no_run
    /// use std::path::Path;
    ///
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::from_file(Path::new("index.html")).unwrap();
    /// ```
    pub fn from_file(path: &std::path::Path) -> Result<Self> {
        let html = std::fs::read_to_string(path)?;
        Ok(Self::parse(&html))
    }

    // ==================== Query Methods ====================

    /// Finds the first element matching the given CSS selector.
    ///
    /// # Errors
    ///
    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<div><span class=\"item\">Hello</span></div>");
    /// let span = soup.find("span.item").unwrap().unwrap();
    /// assert_eq!(span.text(), "Hello");
    /// ```
    pub fn find(&self, selector: &str) -> QueryResult<Option<Tag<'_>>> {
        find(&self.document, selector).map(|opt| opt.map(|id| Tag::new(&self.document, id)))
    }

    /// Finds all elements matching the given CSS selector.
    ///
    /// # Errors
    ///
    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
    /// let items = soup.find_all("li").unwrap();
    /// assert_eq!(items.len(), 2);
    /// ```
    pub fn find_all(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
        find_all(&self.document, selector)
            .map(|ids| ids.into_iter().map(|id| Tag::new(&self.document, id)).collect())
    }

    /// Selects elements using a CSS selector.
    ///
    /// This is an alias for [`Soup::find_all`] for users familiar with
    /// the CSS selector API.
    ///
    /// # Errors
    ///
    /// Returns [`QueryError::InvalidSelector`] if the selector syntax is invalid.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<div class=\"a\"><span class=\"b\">Text</span></div>");
    /// let results = soup.select("div.a > span.b").unwrap();
    /// assert_eq!(results.len(), 1);
    /// ```
    pub fn select(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>> {
        self.find_all(selector)
    }

    // ==================== Document Methods ====================

    /// Returns the root element of the document.
    ///
    /// This is typically the `<html>` element.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<html><body>text</body></html>");
    /// if let Some(root) = soup.root() {
    ///     assert_eq!(root.name(), Some("html"));
    /// }
    /// ```
    #[must_use]
    pub fn root(&self) -> Option<Tag<'_>> {
        self.document.root().map(|id| Tag::new(&self.document, id))
    }

    /// Returns the document's title, if present.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<html><head><title>My Page</title></head></html>");
    /// assert_eq!(soup.title(), Some("My Page".to_string()));
    /// ```
    #[must_use]
    pub fn title(&self) -> Option<String> {
        self.find("title").ok()?.map(|tag| tag.text())
    }

    /// Returns the document's text content with tags stripped.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<div>Hello <b>World</b></div>");
    /// let text = soup.text();
    /// assert!(text.contains("Hello"));
    /// assert!(text.contains("World"));
    /// ```
    #[must_use]
    pub fn text(&self) -> String {
        let Some(root) = self.document.root() else {
            return String::new();
        };
        let mut result = String::new();
        collect_text(&self.document, root, &mut result);
        result
    }

    /// Returns the document as an HTML string.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use scrape_core::Soup;
    ///
    /// let soup = Soup::parse("<div><span>text</span></div>");
    /// let html = soup.to_html();
    /// assert!(html.contains("<div>"));
    /// assert!(html.contains("<span>"));
    /// ```
    #[must_use]
    pub fn to_html(&self) -> String {
        self.root().map(|tag| tag.outer_html()).unwrap_or_default()
    }
}

/// Recursively collects text content from a subtree.
fn collect_text(doc: &Document, id: NodeId, buf: &mut String) {
    let Some(node) = doc.get(id) else { return };

    match &node.kind {
        NodeKind::Text { content } => buf.push_str(content),
        NodeKind::Element { .. } => {
            for child_id in doc.children(id) {
                collect_text(doc, child_id, buf);
            }
        }
        NodeKind::Comment { .. } => {}
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_soup_config_default() {
        let config = SoupConfig::default();
        assert_eq!(config.max_depth, 512);
        assert!(!config.strict_mode);
        assert!(!config.preserve_whitespace);
        assert!(!config.include_comments);
    }

    #[test]
    fn test_soup_config_builder() {
        let config = SoupConfig::builder()
            .max_depth(128)
            .strict_mode(true)
            .preserve_whitespace(true)
            .include_comments(true)
            .build();
        assert_eq!(config.max_depth, 128);
        assert!(config.strict_mode);
        assert!(config.preserve_whitespace);
        assert!(config.include_comments);
    }

    #[test]
    fn test_soup_parse_creates_document() {
        let soup = Soup::parse("<html><body>Hello</body></html>");
        assert!(soup.document().root().is_some());
    }

    #[test]
    fn test_soup_parse_empty_creates_empty_document() {
        let soup = Soup::parse("");
        assert!(soup.document().is_empty());
    }

    #[test]
    fn test_soup_parse_with_config() {
        let config = SoupConfig::builder().max_depth(256).build();
        let soup = Soup::parse_with_config("<div>Test</div>", config);
        assert!(soup.document().root().is_some());
    }

    #[test]
    fn test_soup_find() {
        let soup = Soup::parse("<div><span class=\"item\">text</span></div>");
        let result = soup.find("span.item").unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap().name(), Some("span"));
    }

    #[test]
    fn test_soup_find_returns_none() {
        let soup = Soup::parse("<div>text</div>");
        let result = soup.find("span").unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_soup_find_invalid_selector() {
        let soup = Soup::parse("<div>text</div>");
        let result = soup.find("[");
        assert!(result.is_err());
    }

    #[test]
    fn test_soup_find_all() {
        let soup = Soup::parse("<ul><li>A</li><li>B</li><li>C</li></ul>");
        let items = soup.find_all("li").unwrap();
        assert_eq!(items.len(), 3);
    }

    #[test]
    fn test_soup_select() {
        let soup = Soup::parse("<div class=\"a\"><span class=\"b\">text</span></div>");
        let results = soup.select("div.a > span.b").unwrap();
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_soup_root() {
        let soup = Soup::parse("<html><body>text</body></html>");
        let root = soup.root();
        assert!(root.is_some());
        assert_eq!(root.unwrap().name(), Some("html"));
    }

    #[test]
    fn test_soup_title() {
        let soup = Soup::parse("<html><head><title>Test Title</title></head></html>");
        assert_eq!(soup.title(), Some("Test Title".to_string()));
    }

    #[test]
    fn test_soup_title_missing() {
        let soup = Soup::parse("<html><body>no title</body></html>");
        assert_eq!(soup.title(), None);
    }

    #[test]
    fn test_soup_text() {
        let soup = Soup::parse("<div>Hello <b>World</b>!</div>");
        let text = soup.text();
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
        assert!(text.contains('!'));
    }

    #[test]
    fn test_soup_to_html() {
        let soup = Soup::parse("<div><span>text</span></div>");
        let html = soup.to_html();
        assert!(html.contains("<div>"));
        assert!(html.contains("<span>text</span>"));
        assert!(html.contains("</div>"));
    }

    #[test]
    fn test_soup_empty_to_html() {
        let soup = Soup::parse("");
        let html = soup.to_html();
        assert!(html.is_empty());
    }

    #[test]
    fn test_soup_find_by_class() {
        let soup = Soup::parse("<div class=\"foo bar\">text</div>");
        let result = soup.find(".foo").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_by_id() {
        let soup = Soup::parse("<div id=\"main\">text</div>");
        let result = soup.find("#main").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_compound_selector() {
        let soup =
            Soup::parse("<div class=\"foo\" id=\"bar\">text</div><div class=\"foo\">other</div>");
        let result = soup.find("div.foo#bar").unwrap();
        assert!(result.is_some());
    }

    #[test]
    fn test_soup_find_descendant() {
        let soup = Soup::parse("<div><ul><li>item</li></ul></div>");
        let result = soup.find("div li").unwrap();
        assert!(result.is_some());
        assert_eq!(result.unwrap().name(), Some("li"));
    }

    #[test]
    fn test_soup_find_child_combinator() {
        let soup =
            Soup::parse("<div><span>direct</span></div><div><ul><span>nested</span></ul></div>");
        let results = soup.select("div > span").unwrap();
        assert_eq!(results.len(), 1);
    }

    #[test]
    fn test_soup_find_with_attribute() {
        let soup = Soup::parse("<input type=\"text\"><input type=\"password\">");
        let result = soup.find("input[type=\"text\"]").unwrap();
        assert!(result.is_some());
    }
}