// article_scraper/src/lib.rs
//! # article scraper
//!
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
//! It contains two ways of locating the desired content
//!
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
//!
//! This makes use of website specific extraction rules. Which has the advantage of fast & accurate results.
//! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
//!
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
//! Please consider contributing new rules or updates to it.
//!
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
//!
//! ## 2. Mozilla Readability
//!
//! In case the ftr-config based extraction fails the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
//! This re-implementation tries to mimic the original as closely as possible.
//!
//! # Example
//!
//! ```
//! use article_scraper::ArticleScraper;
//! use url::Url;
//! use reqwest::Client;
//!
//! async fn demo() {
//!     let scraper = ArticleScraper::new(None).await;
//!     let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
//!     let client = Client::new();
//!     let article = scraper.parse(&url, &client).await.unwrap();
//! }
//! ```

36mod article;
37pub mod clean;
38mod constants;
39mod download_progress;
40mod error;
41mod full_text_parser;
42mod image_object;
43#[doc(hidden)]
44#[cfg(feature = "image-downloader")]
45pub mod images;
46mod util;
47mod video_object;
48
49pub use crate::download_progress::DownloadProgress;
50pub use article::Article;
51pub use error::{ConfigError, FullTextParserError, ImageDownloadError, ScraperError};
52
53#[doc(hidden)]
54pub use full_text_parser::FullTextParser;
55pub use full_text_parser::Readability;
56#[doc(hidden)]
57pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
58#[cfg(feature = "image-downloader")]
59use futures::channel::mpsc::Sender;
60#[cfg(feature = "image-downloader")]
61use images::ImageDownloader;
62use reqwest::Client;
63use std::path::Path;
64
65/// Download & extract meaningful content from websites
66///
67/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
68/// of mozilla Readability.
69///
70/// For detailed information about extraction rules and how to contribute new rules please see
71/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
72pub struct ArticleScraper {
73    full_text_parser: FullTextParser,
74    #[cfg(feature = "image-downloader")]
75    image_downloader: ImageDownloader,
76}
77
78impl ArticleScraper {
79    /// Crate a new ArticleScraper
80    ///
81    /// # Arguments
82    ///
83    /// * `user_configs` - optional path to a folder containing additional ftr config files
84    ///
85    pub async fn new(user_configs: Option<&Path>) -> Self {
86        Self {
87            full_text_parser: FullTextParser::new(user_configs).await,
88            #[cfg(feature = "image-downloader")]
89            image_downloader: ImageDownloader::new((2048, 2048)),
90        }
91    }
92
93    /// Download & extract content of a website
94    ///
95    /// # Arguments
96    ///
97    /// * `url` - Url to an article
98    /// * `download_images` - if images should be downloaded & embedded into the HTML
99    /// * `client` - reqwest HTTP client to use
100    /// * `progress` - optional progress notifications (only for image downloads)
101    ///
102    /// # Examples
103    ///
104    /// ```
105    /// use article_scraper::ArticleScraper;
106    /// use url::Url;
107    /// use reqwest::Client;
108    ///
109    /// async fn demo() {
110    ///     let scraper = ArticleScraper::new(None).await;
111    ///     let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
112    ///     let client = Client::new();
113    ///     let article = scraper.parse(&url, &client).await.unwrap();
114    /// }
115    /// ```
116    pub async fn parse(
117        &self,
118        url: &url::Url,
119        client: &Client,
120        #[cfg(feature = "image-downloader")] download_images: bool,
121        #[cfg(feature = "image-downloader")] progress: Option<Sender<DownloadProgress>>,
122    ) -> Result<Article, ScraperError> {
123        let res = self.full_text_parser.parse(url, client).await?;
124
125        #[cfg(feature = "image-downloader")]
126        if download_images
127            && let Some(html) = res.html.as_deref()
128            && let Ok(downloaded_html) = self
129                .image_downloader
130                .download_images_from_string(html, client, progress)
131                .await
132        {
133            return Ok(Article {
134                title: res.title,
135                author: res.author,
136                url: res.url,
137                date: res.date,
138                thumbnail_url: res.thumbnail_url,
139                html: Some(downloaded_html),
140            });
141        }
142
143        Ok(res)
144    }
145}