mod article;
pub mod clean;
mod constants;
mod download_progress;
mod error;
mod full_text_parser;
mod image_object;
#[doc(hidden)]
#[cfg(feature = "image-downloader")]
pub mod images;
mod util;
mod video_object;
pub use crate::download_progress::DownloadProgress;
pub use article::Article;
pub use error::{ConfigError, FullTextParserError, ImageDownloadError, ScraperError};
#[doc(hidden)]
pub use full_text_parser::FullTextParser;
pub use full_text_parser::Readability;
#[doc(hidden)]
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
#[cfg(feature = "image-downloader")]
use futures::channel::mpsc::Sender;
#[cfg(feature = "image-downloader")]
use images::ImageDownloader;
use reqwest::Client;
use std::path::Path;
/// Article scraper that bundles a `FullTextParser` with, when the
/// `image-downloader` feature is enabled, an `ImageDownloader`.
pub struct ArticleScraper {
/// Parser that `ArticleScraper::parse` delegates to for producing the `Article`.
full_text_parser: FullTextParser,
/// Downloader that `ArticleScraper::parse` can run over the parsed HTML on
/// request; only compiled in with the `image-downloader` feature.
#[cfg(feature = "image-downloader")]
image_downloader: ImageDownloader,
}
impl ArticleScraper {
    /// Builds a scraper around a freshly constructed [`FullTextParser`].
    ///
    /// `user_configs` is forwarded unchanged to `FullTextParser::new`
    /// (presumably the location of user-supplied site configurations — confirm
    /// against that constructor). With the `image-downloader` feature enabled an
    /// `ImageDownloader` is created as well, using the fixed `(2048, 2048)`
    /// argument (NOTE(review): looks like a maximum image size — confirm).
    pub async fn new(user_configs: Option<&Path>) -> Self {
        let full_text_parser = FullTextParser::new(user_configs).await;
        #[cfg(feature = "image-downloader")]
        let image_downloader = ImageDownloader::new((2048, 2048));

        Self {
            full_text_parser,
            #[cfg(feature = "image-downloader")]
            image_downloader,
        }
    }

    /// Parses the page at `url` into an [`Article`] using `client`.
    ///
    /// With the `image-downloader` feature enabled two extra parameters exist:
    /// when `download_images` is `true` and the parsed article carries HTML,
    /// that HTML is handed to the image downloader together with `progress`.
    /// If the download step succeeds, the returned article contains the HTML
    /// produced by the downloader; if it fails, the failure is discarded and
    /// the article is returned exactly as parsed.
    ///
    /// # Errors
    ///
    /// Returns a [`ScraperError`] when the full-text parser fails.
    pub async fn parse(
        &self,
        url: &url::Url,
        client: &Client,
        #[cfg(feature = "image-downloader")] download_images: bool,
        #[cfg(feature = "image-downloader")] progress: Option<Sender<DownloadProgress>>,
    ) -> Result<Article, ScraperError> {
        let article = self.full_text_parser.parse(url, client).await?;

        #[cfg(feature = "image-downloader")]
        {
            // Only consider the parsed HTML when the caller asked for images.
            let source_html = if download_images {
                article.html.as_deref()
            } else {
                None
            };

            // Best effort: a failed download simply yields `None` here.
            let localized_html = match source_html {
                Some(markup) => self
                    .image_downloader
                    .download_images_from_string(markup, client, progress)
                    .await
                    .ok(),
                None => None,
            };

            if let Some(localized_html) = localized_html {
                // Keep every parsed field and swap in the processed HTML.
                return Ok(Article {
                    html: Some(localized_html),
                    ..article
                });
            }
        }

        Ok(article)
    }
}