rust-pickaxe 0.5.5

HTML data extraction library
Documentation
use std::collections::HashMap;
use pyo3::{
    prelude::*,
    exceptions::{PyValueError, PyRuntimeError},
    types::PyType,
};

use crate::document::{HtmlDocument, HtmlNode, XPathResult};
use crate::markdown::{html_to_markdown_with_converter, HtmlToMarkdown};
use crate::errors::{Result, PackageError};


impl From<PackageError> for PyErr {
    fn from(error: PackageError) -> Self {
        match error {
            PackageError::HTMLParseError(e) => PyValueError::new_err(e.to_string()),
            PackageError::SelectorParseError(e) => PyValueError::new_err(e.to_string()),
            PackageError::UnknownError(e) => PyRuntimeError::new_err(e.to_string()),
        }
    }
}


/// An XPath result object.
///
/// An XPath query can yield either an element or a plain string, so the
/// result is modelled as a two-variant enum. `IntoPyObject` is derived so
/// that values of this type can be returned from Python-facing methods.
#[derive(IntoPyObject)]
pub enum PyXPathResult {
    /// The result is an element, wrapped as a Python-facing `HtmlNode`.
    Node(PyHtmlNode),
    /// The result is a string value.
    String(String),
}

/// Lift a core `XPathResult` into its Python-facing counterpart.
///
/// Node results are wrapped in `PyHtmlNode`; string results are passed
/// through unchanged.
impl From<XPathResult> for PyXPathResult {
    fn from(result: XPathResult) -> Self {
        match result {
            XPathResult::Node(inner) => Self::Node(PyHtmlNode { inner }),
            XPathResult::String(text) => Self::String(text),
        }
    }
}

/// An HTML document object.
///
/// A document is built from a string of HTML and can then be queried with
/// CSS selectors or XPath expressions. It is exposed to Python under the
/// name `HtmlDocument`.
///
/// # Examples
///
/// ```python
/// from pickaxe import HtmlDocument
///
/// doc = HtmlDocument.from_str("<html><body><h1>Hello, world!</h1></body></html>")
/// h1 = doc.find("h1")
/// print(h1.inner_text)
/// ```
#[pyclass(name = "HtmlDocument")]
#[derive(Clone)]
pub struct PyHtmlDocument {
    // The wrapped Rust document; the Python-facing methods delegate to it.
    inner: HtmlDocument,
}

#[pymethods]
impl PyHtmlDocument {
    // The Python repr is the `Debug` rendering of the wrapped document.
    fn __repr__(&self) -> String {
        format!("{:?}", self.inner)
    }

    /// Build an `HtmlDocument` from a string of HTML.
    ///
    /// The GIL is released while the document is being constructed.
    ///
    /// * `raw` - The raw HTML string.
    #[classmethod]
    // `cls` is required by `#[classmethod]` but is not needed to build the document.
    #[allow(unused_variables)]
    pub fn from_str(cls: &Bound<'_, PyType>, py: Python<'_>, raw: String) -> Self {
        py.allow_threads(move || {
            let inner = HtmlDocument::from_str(raw);
            Self { inner }
        })
    }

    /// The raw HTML string of the document.
    #[getter]
    pub fn raw(&self) -> &str {
        self.inner.raw()
    }

    /// The immediate children of the root node.
    #[getter]
    pub fn children(&self) -> Vec<PyHtmlNode> {
        let children = self.inner.children();
        children
            .into_iter()
            .map(|inner| PyHtmlNode { inner })
            .collect()
    }

    /// The root node of the document.
    #[getter]
    pub fn root(&self) -> PyHtmlNode {
        let inner = self.inner.root();
        PyHtmlNode { inner }
    }

    /// Return every element in the document that matches a CSS selector.
    ///
    /// Errors reported by the underlying query are propagated to the caller.
    ///
    /// * `selector` - The CSS selector to use.
    pub fn find_all(&self, selector: &str) -> Result<Vec<PyHtmlNode>> {
        let matches = self.inner.find_all(selector)?;
        Ok(matches
            .into_iter()
            .map(|inner| PyHtmlNode { inner })
            .collect())
    }

    /// Return every result in the document that matches an XPath expression.
    ///
    /// Errors reported by the underlying query are propagated to the caller.
    ///
    /// * `xpath` - The XPath expression to use.
    pub fn find_all_xpath(&self, xpath: &str) -> Result<Vec<PyXPathResult>> {
        let matches = self.inner.find_all_xpath(xpath)?;
        Ok(matches.into_iter().map(PyXPathResult::from).collect())
    }

    /// Return the first element in the document that matches a CSS selector,
    /// if there is one.
    ///
    /// Errors reported by the underlying query are propagated to the caller.
    ///
    /// * `selector` - The CSS selector to use.
    pub fn find(&self, selector: &str) -> Result<Option<PyHtmlNode>> {
        let first = self.inner.find(selector)?;
        Ok(first.map(|inner| PyHtmlNode { inner }))
    }

    /// Return the first result in the document that matches an XPath
    /// expression, if there is one.
    ///
    /// Errors reported by the underlying query are propagated to the caller.
    ///
    /// * `xpath` - The XPath expression to use.
    pub fn find_xpath(&self, xpath: &str) -> Result<Option<PyXPathResult>> {
        let first = self.inner.find_xpath(xpath)?;
        Ok(first.map(PyXPathResult::from))
    }

    /// Return the `n`th element that matches a CSS selector, if there is one.
    ///
    /// Errors reported by the underlying query are propagated to the caller.
    ///
    /// * `selector` - The CSS selector to use.
    /// * `n` - The index of the element to query.
    pub fn find_nth(&self, selector: &str, n: usize) -> Result<Option<PyHtmlNode>> {
        let nth = self.inner.find_nth(selector, n)?;
        Ok(nth.map(|inner| PyHtmlNode { inner }))
    }

    /// Return the `n`th result that matches an XPath expression, if there is
    /// one.
    ///
    /// Errors reported by the underlying query are propagated to the caller.
    ///
    /// * `xpath` - The XPath expression to use.
    /// * `n` - The index of the element to query.
    pub fn find_nth_xpath(&self, xpath: &str, n: usize) -> Result<Option<PyXPathResult>> {
        let nth = self.inner.find_nth_xpath(xpath, n)?;
        Ok(nth.map(PyXPathResult::from))
    }
}


/// An HTML node object.
///
/// This object represents an HTML element in a document. It is exposed to
/// Python under the name `HtmlNode`.
///
/// # Examples
///
/// ```python
/// from pickaxe import HtmlDocument
///
/// doc = HtmlDocument.from_str("<html><body><h1>Hello, world!</h1></body></html>")
/// h1 = doc.find("h1")
/// print(h1.inner_text)
/// ```
#[pyclass(name = "HtmlNode")]
#[derive(Clone)]
pub struct PyHtmlNode {
    // The wrapped Rust node; the Python-facing methods delegate to it.
    inner: HtmlNode,
}

#[pymethods]
impl PyHtmlNode {
    fn __repr__(&self) -> String {
        format!("{:?}", self.inner)
    }

    /// Get the text of this node.
    #[getter]
    fn text(&self) -> String {
        self.inner.text()
    }

    /// Get the inner text of the node and its children.
    #[getter]
    fn inner_text(&self) -> String {
        self.inner.inner_text()
    }

    /// Get the inner HTML of the node and its children.
    #[getter]
    fn inner_html(&self) -> String {
        self.inner.inner_html()
    }

    /// Get the outer HTML of the node.
    #[getter]
    fn outer_html(&self) -> String {
        self.inner.outer_html()
    }

    /// Get the tag name of the node.
    #[getter]
    fn tag_name(&self) -> String {
        self.inner
            .tag_name()
            .to_string()
    }

    /// Get the attributes of the node.
    #[getter]
    fn attributes(&self) -> HashMap<&str, Option<&str>> {
        self.inner.attributes()
    }

    /// Get the children of the node.
    #[getter]
    fn children(&self) -> Vec<PyHtmlNode> {
        self.inner
            .children()
            .into_iter()
            .map(|node| PyHtmlNode { inner: node })
            .collect()
    }

    /// Query the node for matching elements using a CSS selector.
    /// 
    /// * `selector` - The CSS selector to use.
    pub fn find_all(&self, selector: &str) -> Result<Vec<PyHtmlNode>> {
        Ok(
            self.inner
                .find_all(selector)?
                .into_iter()
                .map(|node| PyHtmlNode { inner: node })
                .collect()
        )
    }

    /// Query the node for matching elements using an XPath expression.
    /// 
    /// * `xpath` - The XPath expression to use.
    pub fn find_all_xpath(&self, xpath: &str) -> Result<Vec<PyXPathResult>> {
        Ok(
            self.inner
                .find_all_xpath(xpath)?
                .into_iter()
                .map(|result| result.into())
                .collect()
        )
    }

    /// Query the node for the first matching element using a CSS selector.
    /// 
    /// * `selector` - The CSS selector to use.
    pub fn find(&self, selector: &str) -> Result<Option<PyHtmlNode>> {
        Ok(
            self.inner
                .find(selector)?
                .map(|node| PyHtmlNode { inner: node })
        )
    }

    /// Query the node for the first matching element using an XPath expression.
    /// 
    /// * `xpath` - The XPath expression to use.
    pub fn find_xpath(&self, xpath: &str) -> Result<Option<PyXPathResult>> {
        Ok(
            self.inner
                .find_xpath(xpath)?
                .map(|result| result.into())
        )
    }

    /// Query the `n`th element using a CSS selector.
    /// 
    /// * `selector` - The CSS selector to use.
    /// * `n` - The index of the element to query.
    pub fn find_nth(&self, selector: &str, n: usize) -> Result<Option<PyHtmlNode>> {
        Ok(
            self.inner
                .find_nth(selector, n)?
                .map(|node| PyHtmlNode { inner: node })
        )
    }

    /// Query the `n`th element using an XPath expression.
    /// 
    /// * `xpath` - The XPath expression to use.
    /// * `n` - The index of the element to query.
    pub fn find_nth_xpath(&self, xpath: &str, n: usize) -> Result<Option<PyXPathResult>> {
        Ok(
            self.inner
                .find_nth_xpath(xpath, n)?
                .map(|result| result.into())
        )
    }

    /// Get the value of an attribute.
    fn get_attribute(&self, name: &str) -> Option<&str> {
        self.inner.get_attribute(name)
    }
}


/// Convert an HTML string to markdown.
///
/// * `html` - The HTML string to convert.
/// * `skip_tags` - The tags to skip when converting to markdown. Defaults to
///   `["script", "style"]`.
///
/// Raises `ValueError` if the conversion fails; the exception message is the
/// text of the underlying error.
#[pyfunction]
#[pyo3(signature = (html, skip_tags = vec!["script".to_string(), "style".to_string()]))]
fn html_to_markdown(html: String, skip_tags: Vec<String>) -> PyResult<String> {
    // The mapped `Result` already has the function's return type, so it is
    // returned as-is rather than being unwrapped with `?` and re-wrapped in `Ok`.
    html_to_markdown_with_converter(
        html,
        HtmlToMarkdown::builder()
            // The builder is handed borrowed tag names, so lend out the owned strings.
            .skip_tags(skip_tags.iter().map(String::as_str).collect())
            .build(),
    )
    .map_err(|e| PyValueError::new_err(e.to_string()))
}


// Initializer for the native `_pickaxe` extension module.
//
// Plain `//` comments are used here on purpose: a `///` doc comment on a
// `#[pymodule]` function would become the Python module's `__doc__`.
#[pymodule]
#[pyo3(name = "_pickaxe")]
fn pickaxe(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
    // Classes exposed to Python.
    m.add_class::<PyHtmlDocument>()?;
    m.add_class::<PyHtmlNode>()?;

    // Free functions exposed to Python.
    let markdown_fn = wrap_pyfunction!(html_to_markdown, py)?;
    m.add_function(markdown_fn)?;

    // Publish the crate version, captured at compile time, as `__version__`.
    let version = env!("CARGO_PKG_VERSION");
    m.add("__version__", version)
}