article_scraper 2.3.1

Scrape article contents from the web. Powered by fivefilters full-text feed configurations & Mozilla Readability.
Documentation
//! # article scraper
//!
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
//! It contains two ways of locating the desired content
//!
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
//!
//! This makes use of website-specific extraction rules, which have the advantage of fast & accurate results.
//! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
//!
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
//! Please consider contributing new rules or updates to it.
//!
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
//!
//! ## 2. Mozilla Readability
//!
//! In case the ftr-config based extraction fails the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
//! This re-implementation tries to mimic the original as closely as possible.
//!
//! # Example
//!
//! ```
//! use article_scraper::ArticleScraper;
//! use url::Url;
//! use reqwest::Client;
//!
//! async fn demo() {
//!     let scraper = ArticleScraper::new(None).await;
//!     let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
//!     let client = Client::new();
//!     let article = scraper.parse(&url, &client).await.unwrap();
//! }
//! ```

mod article;
pub mod clean;
mod constants;
mod download_progress;
mod error;
mod full_text_parser;
mod image_object;
#[doc(hidden)]
#[cfg(feature = "image-downloader")]
pub mod images;
mod util;
mod video_object;

pub use crate::download_progress::DownloadProgress;
pub use article::Article;
pub use error::{ConfigError, FullTextParserError, ImageDownloadError, ScraperError};

#[doc(hidden)]
pub use full_text_parser::FullTextParser;
pub use full_text_parser::Readability;
#[doc(hidden)]
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
#[cfg(feature = "image-downloader")]
use futures::channel::mpsc::Sender;
#[cfg(feature = "image-downloader")]
use images::ImageDownloader;
use reqwest::Client;
use std::path::Path;

/// Download & extract meaningful content from websites
///
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
/// of mozilla Readability.
///
/// For detailed information about extraction rules and how to contribute new rules please see
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
pub struct ArticleScraper {
    // Primary extraction engine: ftr site-config based parsing with a
    // Mozilla Readability fallback (see `full_text_parser` module docs).
    full_text_parser: FullTextParser,
    // Downloads and embeds images into the extracted HTML when `parse` is
    // called with `download_images = true`; only compiled in with the
    // "image-downloader" feature.
    #[cfg(feature = "image-downloader")]
    image_downloader: ImageDownloader,
}

impl ArticleScraper {
    /// Create a new [`ArticleScraper`]
    ///
    /// # Arguments
    ///
    /// * `user_configs` - optional path to a folder containing additional ftr config files
    ///
    pub async fn new(user_configs: Option<&Path>) -> Self {
        let full_text_parser = FullTextParser::new(user_configs).await;
        Self {
            full_text_parser,
            // Downloaded images are scaled down to fit within 2048x2048.
            #[cfg(feature = "image-downloader")]
            image_downloader: ImageDownloader::new((2048, 2048)),
        }
    }

    /// Download & extract content of a website
    ///
    /// # Arguments
    ///
    /// * `url` - Url to an article
    /// * `download_images` - if images should be downloaded & embedded into the HTML
    /// * `client` - reqwest HTTP client to use
    /// * `progress` - optional progress notifications (only for image downloads)
    ///
    /// # Examples
    ///
    /// ```
    /// use article_scraper::ArticleScraper;
    /// use url::Url;
    /// use reqwest::Client;
    ///
    /// async fn demo() {
    ///     let scraper = ArticleScraper::new(None).await;
    ///     let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
    ///     let client = Client::new();
    ///     let article = scraper.parse(&url, &client).await.unwrap();
    /// }
    /// ```
    pub async fn parse(
        &self,
        url: &url::Url,
        client: &Client,
        #[cfg(feature = "image-downloader")] download_images: bool,
        #[cfg(feature = "image-downloader")] progress: Option<Sender<DownloadProgress>>,
    ) -> Result<Article, ScraperError> {
        // Run the ftr/Readability extraction pipeline first; errors here
        // propagate to the caller unchanged.
        let article = self.full_text_parser.parse(url, client).await?;

        #[cfg(feature = "image-downloader")]
        if download_images {
            if let Some(html) = article.html.as_deref() {
                // Image embedding is best-effort: if it fails we fall
                // through and return the article with its original HTML.
                if let Ok(html_with_images) = self
                    .image_downloader
                    .download_images_from_string(html, client, progress)
                    .await
                {
                    return Ok(Article {
                        title: article.title,
                        author: article.author,
                        url: article.url,
                        date: article.date,
                        thumbnail_url: article.thumbnail_url,
                        html: Some(html_with_images),
                    });
                }
            }
        }

        Ok(article)
    }
}