Struct Soup

Source

pub struct Soup { /* private fields */ }

Expand description

A parsed HTML document.

Soup is the main entry point for parsing and querying HTML documents. It provides methods for finding elements by CSS selector or tag name.

§Examples

§Basic Parsing

use scrape_core::Soup;

let html = "<html><body><h1>Hello, World!</h1></body></html>";
let soup = Soup::parse(html);

if let Ok(Some(h1)) = soup.find("h1") {
    assert_eq!(h1.text(), "Hello, World!");
}

§CSS Selectors

use scrape_core::Soup;

let html = r#"
    <div class="container">
        <span class="item">One</span>
        <span class="item">Two</span>
    </div>
"#;
let soup = Soup::parse(html);

let items = soup.select("div.container > span.item").unwrap();
assert_eq!(items.len(), 2);

Implementations§

Source §

impl Soup

Source

pub fn parse(html: &str) -> Self

Parses an HTML string into a Soup document.

This uses the default configuration. For custom configuration, use Soup::parse_with_config.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<html><body>Hello</body></html>");

Source

pub fn parse_with_config(html: &str, config: SoupConfig) -> Self

Parses an HTML string with custom configuration.

§Examples

use scrape_core::{Soup, SoupConfig};

let config = SoupConfig::builder().max_depth(128).build();
let soup = Soup::parse_with_config("<html>...</html>", config);

Source

pub fn document(&self) -> &Document

Returns a reference to the underlying document.

Source

pub fn from_file(path: &Path) -> Result<Self>

Parses HTML from a file.

§Errors

Returns an error if the file cannot be read.

§Examples

use std::path::Path;

use scrape_core::Soup;

let soup = Soup::from_file(Path::new("index.html")).unwrap();

Source

pub fn find(&self, selector: &str) -> QueryResult<Option<Tag<'_>>>

Finds the first element matching the given CSS selector.

§Errors

Returns `QueryError::InvalidSelector` if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div><span class=\"item\">Hello</span></div>");
let span = soup.find("span.item").unwrap().unwrap();
assert_eq!(span.text(), "Hello");

Source

pub fn find_all(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>>

Finds all elements matching the given CSS selector.

§Errors

Returns `QueryError::InvalidSelector` if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
let items = soup.find_all("li").unwrap();
assert_eq!(items.len(), 2);

Source

pub fn select(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>>

Selects elements using a CSS selector.

This is an alias for Soup::find_all, provided for users familiar with the CSS selector API.

§Errors

Returns `QueryError::InvalidSelector` if the selector syntax is invalid.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div class=\"a\"><span class=\"b\">Text</span></div>");
let results = soup.select("div.a > span.b").unwrap();
assert_eq!(results.len(), 1);

Source

pub fn root(&self) -> Option<Tag<'_>>

Returns the root element of the document.

This is typically the <html> element.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<html><body>text</body></html>");
if let Some(root) = soup.root() {
    assert_eq!(root.name(), Some("html"));
}

Source

pub fn title(&self) -> Option<String>

Returns the document’s title, if present.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<html><head><title>My Page</title></head></html>");
assert_eq!(soup.title(), Some("My Page".to_string()));

Source

pub fn text(&self) -> String

Returns the document’s text content with tags stripped.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div>Hello <b>World</b></div>");
let text = soup.text();
assert!(text.contains("Hello"));
assert!(text.contains("World"));

Source

pub fn to_html(&self) -> String

Returns the document as an HTML string.

§Examples

use scrape_core::Soup;

let soup = Soup::parse("<div><span>text</span></div>");
let html = soup.to_html();
assert!(html.contains("<div>"));
assert!(html.contains("<span>"));