Skip to main content

Soup

Struct Soup 

Source
pub struct Soup { /* private fields */ }
Expand description

A parsed HTML document.

Soup is the main entry point for parsing and querying HTML documents. It provides methods for finding elements by CSS selector or tag name.

§Examples

§Basic Parsing

use scrape_core::Soup;

let html = "<html><body><h1>Hello, World!</h1></body></html>";
let soup = Soup::parse(html);

if let Ok(Some(h1)) = soup.find("h1") {
    assert_eq!(h1.text(), "Hello, World!");
}

§CSS Selectors

use scrape_core::Soup;

let html = r#"
    <div class="container">
        <span class="item">One</span>
        <span class="item">Two</span>
    </div>
"#;
let soup = Soup::parse(html);

let items = soup.select("div.container > span.item").unwrap();
assert_eq!(items.len(), 2);

Implementations§

Source§

impl Soup

Source

pub fn parse(html: &str) -> Self

Parses an HTML string into a Soup document.

This uses the default configuration. For custom configuration, use Soup::parse_with_config.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<html><body>Hello</body></html>");
Source

pub fn parse_with_config(html: &str, config: SoupConfig) -> Self

Parses an HTML string with custom configuration.

§Examples
use scrape_core::{Soup, SoupConfig};

let config = SoupConfig::builder().max_depth(128).build();
let soup = Soup::parse_with_config("<html>...</html>", config);
Source

pub fn document(&self) -> &Document

Returns a reference to the underlying document.

Source

pub fn from_file(path: &Path) -> Result<Self>

Parses HTML from a file.

§Errors

Returns an error if the file cannot be read.

§Examples
use std::path::Path;

use scrape_core::Soup;

let soup = Soup::from_file(Path::new("index.html")).unwrap();
Source

pub fn parse_fragment(html: &str) -> Self

Parses an HTML fragment without wrapping in html/body tags.

Unlike Soup::parse, this does not wrap the content in an <html><body> structure. The fragment is parsed as if it appeared inside a <body> element.

§Examples
use scrape_core::Soup;

let soup = Soup::parse_fragment("<span>A</span><span>B</span>");
let spans = soup.find_all("span").unwrap();
assert_eq!(spans.len(), 2);
Source

pub fn parse_fragment_with_context(html: &str, context: &str) -> Self

Parses an HTML fragment with a custom context element.

The context element determines parsing behavior:

  • "body": Standard HTML elements (default)
  • "table": Allows tr/td without explicit tbody
  • "tbody": Allows tr directly
§Examples
use scrape_core::Soup;

let soup = Soup::parse_fragment_with_context("<tr><td>A</td></tr>", "tbody");
let tr = soup.find("tr").unwrap();
assert!(tr.is_some());
Source

pub fn parse_fragment_with_config( html: &str, context: &str, config: SoupConfig, ) -> Self

Parses an HTML fragment with custom context and configuration.

Source

pub fn find(&self, selector: &str) -> QueryResult<Option<Tag<'_>>>

Finds the first element matching the given CSS selector.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<div><span class=\"item\">Hello</span></div>");
let span = soup.find("span.item").unwrap().unwrap();
assert_eq!(span.text(), "Hello");
Source

pub fn find_all(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>>

Finds all elements matching the given CSS selector.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
let items = soup.find_all("li").unwrap();
assert_eq!(items.len(), 2);
Source

pub fn select(&self, selector: &str) -> QueryResult<Vec<Tag<'_>>>

Selects elements using a CSS selector.

This is an alias for Soup::find_all for users familiar with the CSS selector API.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<div class=\"a\"><span class=\"b\">Text</span></div>");
let results = soup.select("div.a > span.b").unwrap();
assert_eq!(results.len(), 1);
Source

pub fn find_compiled(&self, selector: &CompiledSelector) -> Option<Tag<'_>>

Finds the first element using a pre-compiled selector.

§Examples
use scrape_core::{Soup, query::CompiledSelector};

let selector = CompiledSelector::compile("div.item").unwrap();
let soup = Soup::parse("<div class=\"item\">Text</div>");
let result = soup.find_compiled(&selector);
assert!(result.is_some());
Source

pub fn select_compiled(&self, selector: &CompiledSelector) -> Vec<Tag<'_>>

Finds all elements using a pre-compiled selector.

§Examples
use scrape_core::{Soup, query::CompiledSelector};

let selector = CompiledSelector::compile("li").unwrap();
let soup = Soup::parse("<ul><li>A</li><li>B</li></ul>");
let items = soup.select_compiled(&selector);
assert_eq!(items.len(), 2);
Source

pub fn select_text(&self, selector: &str) -> QueryResult<Vec<String>>

Extracts text content from all elements matching a CSS selector.

Returns one string per matching element, each containing that element's concatenated text content.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<ul><li>First</li><li>Second</li></ul>");
let texts = soup.select_text("li").unwrap();
assert_eq!(texts, vec!["First", "Second"]);
Source

pub fn select_attr( &self, selector: &str, attr: &str, ) -> QueryResult<Vec<Option<String>>>

Extracts attribute values from all elements matching a CSS selector.

Returns one entry per matching element: Some(value) if the element has the attribute, None if it doesn’t.

§Errors

Returns QueryError::InvalidSelector if the selector syntax is invalid.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<a href='/a'>A</a><a>B</a>");
let hrefs = soup.select_attr("a", "href").unwrap();
assert_eq!(hrefs, vec![Some("/a".to_string()), None]);
Source

pub fn root(&self) -> Option<Tag<'_>>

Returns the root element of the document.

This is typically the <html> element.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<html><body>text</body></html>");
if let Some(root) = soup.root() {
    assert_eq!(root.name(), Some("html"));
}
Source

pub fn title(&self) -> Option<String>

Returns the document’s title, if present.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<html><head><title>My Page</title></head></html>");
assert_eq!(soup.title(), Some("My Page".to_string()));
Source

pub fn text(&self) -> String

Returns the document’s text content with tags stripped.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<div>Hello <b>World</b></div>");
let text = soup.text();
assert!(text.contains("Hello"));
assert!(text.contains("World"));
Source

pub fn to_html(&self) -> String

Returns the document as an HTML string.

§Examples
use scrape_core::Soup;

let soup = Soup::parse("<div><span>text</span></div>");
let html = soup.to_html();
assert!(html.contains("<div>"));
assert!(html.contains("<span>"));

Trait Implementations§

Source§

impl Debug for Soup

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Auto Trait Implementations§

§

impl Freeze for Soup

§

impl RefUnwindSafe for Soup

§

impl Send for Soup

§

impl Sync for Soup

§

impl Unpin for Soup

§

impl UnwindSafe for Soup

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.