article_scraper/lib.rs
1//! # article scraper
2//!
3//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
//! It contains two ways of locating the desired content:
5//!
6//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
7//!
//! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
//! The disadvantages, however, are that the config needs to be updated as the website changes and that a new extraction rule is needed for every website.
10//!
11//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
12//! Please consider contributing new rules or updates to it.
13//!
14//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
15//!
16//! ## 2. Mozilla Readability
17//!
//! In case the ftr-config based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fallback.
19//! This re-implementation tries to mimic the original as closely as possible.
20//!
21//! # Example
22//!
23//! ```
24//! use article_scraper::ArticleScraper;
25//! use url::Url;
26//! use reqwest::Client;
27//!
28//! async fn demo() {
29//! let scraper = ArticleScraper::new(None).await;
30//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
31//! let client = Client::new();
32//! let article = scraper.parse(&url, &client).await.unwrap();
33//! }
34//! ```
35
36mod article;
37pub mod clean;
38mod constants;
39mod download_progress;
40mod error;
41mod full_text_parser;
42mod image_object;
43#[doc(hidden)]
44#[cfg(feature = "image-downloader")]
45pub mod images;
46mod util;
47mod video_object;
48
49pub use crate::download_progress::DownloadProgress;
50pub use article::Article;
51pub use error::{ConfigError, FullTextParserError, ImageDownloadError, ScraperError};
52
53#[doc(hidden)]
54pub use full_text_parser::FullTextParser;
55pub use full_text_parser::Readability;
56#[doc(hidden)]
57pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
58#[cfg(feature = "image-downloader")]
59use futures::channel::mpsc::Sender;
60#[cfg(feature = "image-downloader")]
61use images::ImageDownloader;
62use reqwest::Client;
63use std::path::Path;
64
65/// Download & extract meaningful content from websites
66///
67/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
68/// of mozilla Readability.
69///
70/// For detailed information about extraction rules and how to contribute new rules please see
71/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
72pub struct ArticleScraper {
73 full_text_parser: FullTextParser,
74 #[cfg(feature = "image-downloader")]
75 image_downloader: ImageDownloader,
76}
77
78impl ArticleScraper {
79 /// Crate a new ArticleScraper
80 ///
81 /// # Arguments
82 ///
83 /// * `user_configs` - optional path to a folder containing additional ftr config files
84 ///
85 pub async fn new(user_configs: Option<&Path>) -> Self {
86 Self {
87 full_text_parser: FullTextParser::new(user_configs).await,
88 #[cfg(feature = "image-downloader")]
89 image_downloader: ImageDownloader::new((2048, 2048)),
90 }
91 }
92
93 /// Download & extract content of a website
94 ///
95 /// # Arguments
96 ///
97 /// * `url` - Url to an article
98 /// * `download_images` - if images should be downloaded & embedded into the HTML
99 /// * `client` - reqwest HTTP client to use
100 /// * `progress` - optional progress notifications (only for image downloads)
101 ///
102 /// # Examples
103 ///
104 /// ```
105 /// use article_scraper::ArticleScraper;
106 /// use url::Url;
107 /// use reqwest::Client;
108 ///
109 /// async fn demo() {
110 /// let scraper = ArticleScraper::new(None).await;
111 /// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
112 /// let client = Client::new();
113 /// let article = scraper.parse(&url, &client).await.unwrap();
114 /// }
115 /// ```
116 pub async fn parse(
117 &self,
118 url: &url::Url,
119 client: &Client,
120 #[cfg(feature = "image-downloader")] download_images: bool,
121 #[cfg(feature = "image-downloader")] progress: Option<Sender<DownloadProgress>>,
122 ) -> Result<Article, ScraperError> {
123 let res = self.full_text_parser.parse(url, client).await?;
124
125 #[cfg(feature = "image-downloader")]
126 if download_images
127 && let Some(html) = res.html.as_deref()
128 && let Ok(downloaded_html) = self
129 .image_downloader
130 .download_images_from_string(html, client, progress)
131 .await
132 {
133 return Ok(Article {
134 title: res.title,
135 author: res.author,
136 url: res.url,
137 date: res.date,
138 thumbnail_url: res.thumbnail_url,
139 html: Some(downloaded_html),
140 });
141 }
142
143 Ok(res)
144 }
145}