article_scraper/
lib.rs

1//! # article scraper
2//!
3//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
4//! It contains two ways of locating the desired content
5//!
6//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
7//!
8//! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
9//! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
10//!
11//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
12//! Please consider contributing new rules or updates to it.
13//!
14//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
15//!
16//! ## 2. Mozilla Readability
17//!
18//! In case the ftr-config based extraction fails the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
19//! This re-implementation tries to mimic the original as closely as possible.
20//!
21//! # Example
22//!
23//! ```
24//! use article_scraper::ArticleScraper;
25//! use url::Url;
26//! use reqwest::Client;
27//!
28//! async fn demo() {
29//!     let scraper = ArticleScraper::new(None).await;
30//!     let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
31//!     let client = Client::new();
32//!     let article = scraper.parse(&url, false, &client, None).await.unwrap();
33//! }
34//! ```
35
36mod article;
37pub mod clean;
38mod constants;
39mod error;
40mod full_text_parser;
41mod image_object;
42#[doc(hidden)]
43pub mod images;
44mod util;
45mod video_object;
46
47use crate::images::Progress;
48pub use article::Article;
49use error::ScraperError;
50#[doc(hidden)]
51pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
52#[doc(hidden)]
53pub use full_text_parser::FullTextParser;
54pub use full_text_parser::Readability;
55use images::ImageDownloader;
56use reqwest::Client;
57use std::path::Path;
58use tokio::sync::mpsc::Sender;
59
/// Download & extract meaningful content from websites
///
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
/// of mozilla Readability.
///
/// For detailed information about extraction rules and how to contribute new rules please see
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
pub struct ArticleScraper {
    // Primary extraction engine: ftr site-config rules with a Readability fallback.
    full_text_parser: FullTextParser,
    // Used to fetch & embed images when `parse` is called with `download_images == true`.
    image_downloader: ImageDownloader,
}
71
72impl ArticleScraper {
73    /// Crate a new ArticleScraper
74    ///
75    /// # Arguments
76    ///
77    /// * `user_configs` - optional path to a folder containing additional ftr config files
78    ///
79    pub async fn new(user_configs: Option<&Path>) -> Self {
80        Self {
81            full_text_parser: FullTextParser::new(user_configs).await,
82            image_downloader: ImageDownloader::new((2048, 2048)),
83        }
84    }
85
86    /// Download & extract content of a website
87    ///
88    /// # Arguments
89    ///
90    /// * `url` - Url to an article
91    /// * `download_images` - if images should be downloaded & embedded into the HTML
92    /// * `client` - reqwest HTTP client to use
93    /// * `progress` - optional progress notifications (only for image downloads)
94    ///
95    /// # Examples
96    ///
97    /// ```
98    /// use article_scraper::ArticleScraper;
99    /// use url::Url;
100    /// use reqwest::Client;
101    ///
102    /// async fn demo() {
103    ///     let scraper = ArticleScraper::new(None).await;
104    ///     let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
105    ///     let client = Client::new();
106    ///     let article = scraper.parse(&url, false, &client, None).await.unwrap();
107    /// }
108    /// ```
109    pub async fn parse(
110        &self,
111        url: &url::Url,
112        download_images: bool,
113        client: &Client,
114        progress: Option<Sender<Progress>>,
115    ) -> Result<Article, ScraperError> {
116        let mut res = self.full_text_parser.parse(url, client).await?;
117
118        if download_images {
119            if let Some(html) = res.html.as_deref() {
120                if let Ok(downloaded_html) = self
121                    .image_downloader
122                    .download_images_from_string(html, client, progress)
123                    .await
124                {
125                    res.html.replace(downloaded_html);
126                }
127            }
128        }
129
130        Ok(res)
131    }
132}