article_scraper/lib.rs
1//! # article scraper
2//!
3//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
4//! It contains two ways of locating the desired content
5//!
6//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
7//!
//! This makes use of website-specific extraction rules, which has the advantage of fast and accurate results.
//! The disadvantages, however, are that the config needs to be updated as the website changes, and a new extraction rule is needed for every website.
10//!
11//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
12//! Please consider contributing new rules or updates to it.
13//!
14//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
15//!
16//! ## 2. Mozilla Readability
17//!
//! In case the ftr-config-based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fallback.
//! This re-implementation tries to mimic the original as closely as possible.
20//!
21//! # Example
22//!
23//! ```
24//! use article_scraper::ArticleScraper;
25//! use url::Url;
26//! use reqwest::Client;
27//!
28//! async fn demo() {
29//! let scraper = ArticleScraper::new(None).await;
30//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
31//! let client = Client::new();
32//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
33//! }
34//! ```
35
36mod article;
37pub mod clean;
38mod constants;
39mod error;
40mod full_text_parser;
41mod image_object;
42#[doc(hidden)]
43pub mod images;
44mod util;
45mod video_object;
46
47use crate::images::Progress;
48pub use article::Article;
49use error::ScraperError;
50#[doc(hidden)]
51pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
52#[doc(hidden)]
53pub use full_text_parser::FullTextParser;
54pub use full_text_parser::Readability;
55use images::ImageDownloader;
56use reqwest::Client;
57use std::path::Path;
58use tokio::sync::mpsc::Sender;
59
60/// Download & extract meaningful content from websites
61///
62/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
63/// of mozilla Readability.
64///
65/// For detailed information about extraction rules and how to contribute new rules please see
66/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
pub struct ArticleScraper {
    /// Config-driven (ftr-site-config) content extraction, with a Readability fallback.
    full_text_parser: FullTextParser,
    /// Downloads images referenced in the extracted HTML and embeds them
    /// (constructed with (2048, 2048) — presumably a max size; confirm in `ImageDownloader`).
    image_downloader: ImageDownloader,
}
71
72impl ArticleScraper {
73 /// Crate a new ArticleScraper
74 ///
75 /// # Arguments
76 ///
77 /// * `user_configs` - optional path to a folder containing additional ftr config files
78 ///
79 pub async fn new(user_configs: Option<&Path>) -> Self {
80 Self {
81 full_text_parser: FullTextParser::new(user_configs).await,
82 image_downloader: ImageDownloader::new((2048, 2048)),
83 }
84 }
85
86 /// Download & extract content of a website
87 ///
88 /// # Arguments
89 ///
90 /// * `url` - Url to an article
91 /// * `download_images` - if images should be downloaded & embedded into the HTML
92 /// * `client` - reqwest HTTP client to use
93 /// * `progress` - optional progress notifications (only for image downloads)
94 ///
95 /// # Examples
96 ///
97 /// ```
98 /// use article_scraper::ArticleScraper;
99 /// use url::Url;
100 /// use reqwest::Client;
101 ///
102 /// async fn demo() {
103 /// let scraper = ArticleScraper::new(None).await;
104 /// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
105 /// let client = Client::new();
106 /// let article = scraper.parse(&url, false, &client, None).await.unwrap();
107 /// }
108 /// ```
109 pub async fn parse(
110 &self,
111 url: &url::Url,
112 download_images: bool,
113 client: &Client,
114 progress: Option<Sender<Progress>>,
115 ) -> Result<Article, ScraperError> {
116 let mut res = self.full_text_parser.parse(url, client).await?;
117
118 if download_images {
119 if let Some(html) = res.html.as_deref() {
120 if let Ok(downloaded_html) = self
121 .image_downloader
122 .download_images_from_string(html, client, progress)
123 .await
124 {
125 res.html.replace(downloaded_html);
126 }
127 }
128 }
129
130 Ok(res)
131 }
132}