Struct Soup

Source

pub struct Soup { /* private fields */ }

Expand description

A parsed HTML document.

Soup is the main entry point for parsing and querying HTML documents. It provides methods for finding elements by CSS selector or tag name.

§Examples

§Basic Parsing

use scrape_core::Soup;

let html = "<html><body><h1>Hello, World!</h1></body></html>";
let soup = Soup::parse(html);

if let Ok(Some(h1)) = soup.find("h1") {
    assert_eq!(h1.text(), "Hello, World!");
}

§CSS Selectors

use scrape_core::Soup;

let html = r#"
    <div class="container">
        <span class="item">One</span>
        <span class="item">Two</span>
    </div>
"#;
let soup = Soup::parse(html);

let items = soup.select("div.container > span.item").unwrap();
assert_eq!(items.len(), 2);

Implementations§

Source §

impl Soup

Source

pub fn parse(html: &str) -> Self

Parses an HTML string into a Soup document.

This uses the default configuration. For custom configuration, use Soup::parse_with_config.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<html><body>Hello</body></html>");

Source

pub fn parse_with_config(html: &str, config: SoupConfig) -> Self

Parses an HTML string with custom configuration.

§Examples

use scrape_core::{Soup, SoupConfig};

let config = SoupConfig::builder().max_depth(128).build();
let soup = Soup::parse_with_config("<html>...</html>", config);

Source

pub fn document(&self) -> &Document

Returns a reference to the underlying document.

Source

pub fn from_file(path: &Path) -> Result<Self>

Parses HTML from a file.

§Errors

Returns an error if the file cannot be read.

§Examples

use std::path::Path;

use scrape_core::Soup;

let soup = Soup::from_file(Path::new("index.html")).unwrap();

Source

pub fn parse_fragment(html: &str) -> Self

Parses an HTML fragment without wrapping it in html/body tags.

Unlike Soup::parse, this does not wrap the content in an <html><body> structure. The fragment is parsed as if it appeared inside a <body> element.

§Examples

use scrape_core::Soup;

let soup = Soup::parse_fragment("<span>A</span><span>B</span>");
let spans = soup.find_all("span").unwrap();
assert_eq!(spans.len(), 2);

Source

pub fn parse_fragment_with_context(html: &str, context: &str) -> Self

Parses an HTML fragment with a custom context element.

The context element determines parsing behavior:

- "body": Standard HTML elements (default)
- "table": Allows tr/td without explicit tbody
- "tbody": Allows tr directly

§Examples

use scrape_core::Soup;

let soup = Soup::parse_fragment_with_context("<tr><td>A</td></tr>", "tbody");
let tr = soup.find("tr").unwrap();
assert!(tr.is_some());

Source

pub fn parse_fragment_with_config( html: &str, context: &str, config: SoupConfig, ) -> Self

Parses an HTML fragment with custom context and configuration.

Source

pub fn find(&self, selector: &str) -> QueryResult<Option<Tag<'_>>>

Finds the first element matching the given CSS selector.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div><span class=\"item\">Hello</span></div>");
let span = soup.find("span.item").unwrap().unwrap();
assert_eq!(span.text(), "Hello");

Source

pub fn find_all(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>>

Finds all elements matching the given CSS selector.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
let items = soup.find_all("li").unwrap();
assert_eq!(items.len(), 2);

Source

pub fn select(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>>

Selects elements using a CSS selector.

This is an alias for Soup::find_all for users familiar with the CSS selector API.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div class=\"a\"><span class=\"b\">Text</span></div>");
let results = soup.select("div.a > span.b").unwrap();
assert_eq!(results.len(), 1);

Source

pub fn find_compiled(&self, selector: &CompiledSelector) -> Option<Tag<'_>>

Finds the first element using a pre-compiled selector.

§Examples

use scrape_core::{Soup, query::CompiledSelector};

let selector = CompiledSelector::compile("div.item").unwrap();
let soup = Soup::parse("<div class=\"item\">Text</div>");
let result = soup.find_compiled(&selector);
assert!(result.is_some());

Source

pub fn select_compiled(&self, selector: &CompiledSelector) -> Vec<Tag<'_>>

Finds all elements using a pre-compiled selector.

§Examples

use scrape_core::{Soup, query::CompiledSelector};

let selector = CompiledSelector::compile("li").unwrap();
let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
let items = soup.select_compiled(&selector);
assert_eq!(items.len(), 2);

Source

pub fn select_text(&self, selector: &str) -> QueryResult<Vec<String>>

Extracts text content from all elements matching a CSS selector.

Returns the concatenated text content of each matching element.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<ul><li>First</li><li>Second</li></ul>");
let texts = soup.select_text("li").unwrap();
assert_eq!(texts, vec!["First", "Second"]);

Source

pub fn select_attr( &self, selector: &str, attr: &str, ) -> QueryResult<Vec<Option<String>>>

Extracts attribute values from all elements matching a CSS selector.

Each entry in the returned vector corresponds to one matching element: Some(value) if that element has the attribute, None if it doesn’t.

§Errors

Returns [QueryError::InvalidSelector] if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<a href='/a'>A</a><a>B</a>");
let hrefs = soup.select_attr("a", "href").unwrap();
assert_eq!(hrefs, vec![Some("/a".to_string()), None]);

Source

pub fn root(&self) -> Option<Tag<'_>>

Returns the root element of the document.

This is typically the <html> element.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<html><body>text</body></html>");
if let Some(root) = soup.root() {
    assert_eq!(root.name(), Some("html"));
}

Source

pub fn title(&self) -> Option<String>

Returns the document’s title, if present.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<html><head><title>My Page</title></head></html>");
assert_eq!(soup.title(), Some("My Page".to_string()));

Source

pub fn text(&self) -> String

Returns the document’s text content with tags stripped.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div>Hello <b>World</b></div>");
let text = soup.text();
assert!(text.contains("Hello"));
assert!(text.contains("World"));

Source

pub fn to_html(&self) -> String

Returns the document as an HTML string.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div><span>text</span></div>");
let html = soup.to_html();
assert!(html.contains("<div>"));
assert!(html.contains("<span>"));