use crate::{
article::Article,
cleaner,
content_extractor::grab_article,
dom_utils,
error::{ReadabilityError, Result},
metadata::{get_article_metadata, get_json_ld, Metadata},
options::ReadabilityOptions,
utils,
};
use scraper::{ElementRef, Html, Selector};
/// Extracts the main readable article content and metadata from an HTML
/// document, driven by [`ReadabilityOptions`].
pub struct Readability {
    /// DOM parsed from the input HTML; used for metadata and direction lookup.
    document: Html,
    /// The untouched input HTML, kept so preprocessing can re-parse from source.
    html: String,
    /// Base URL (validated as absolute in `new`) for resolving relative links.
    base_url: Option<String>,
    /// Extraction and output configuration.
    options: ReadabilityOptions,
    /// Metadata gathered during `parse` (title, byline, excerpt, …).
    metadata: Metadata,
}
impl Readability {
/// Builds a `Readability` parser from raw HTML.
///
/// # Errors
///
/// Returns [`ReadabilityError::InvalidUrl`] when `url` is present but does
/// not parse as an absolute URL. `options` falls back to
/// `ReadabilityOptions::default()` when `None`.
pub fn new(html: &str, url: Option<&str>, options: Option<ReadabilityOptions>) -> Result<Self> {
    // Validate the URL up front; we only keep the original string form.
    let base_url = match url {
        Some(raw) => match url::Url::parse(raw) {
            Ok(_) => Some(raw.to_string()),
            Err(_) => return Err(ReadabilityError::InvalidUrl(raw.to_string())),
        },
        None => None,
    };
    Ok(Self {
        document: Html::parse_document(html),
        html: html.to_string(),
        base_url,
        options: options.unwrap_or_default(),
        metadata: Metadata::default(),
    })
}
/// Runs the full extraction pipeline: metadata collection, preprocessing,
/// content grabbing, cleaning, excerpt generation, and (optionally)
/// markdown conversion.
///
/// Consumes `self`. Returns `None` when no article content is found or
/// extraction fails (errors are printed to stderr when `options.debug`).
pub fn parse(mut self) -> Option<Article> {
    // JSON-LD metadata is gathered first (unless disabled) and merged with
    // the document's other metadata sources by `get_article_metadata`.
    let json_ld = if !self.options.disable_json_ld {
        get_json_ld(&self.document)
    } else {
        Metadata::default()
    };
    self.metadata = get_article_metadata(&self.document, json_ld);
    // Preprocess the raw HTML and re-parse, so extraction works on a
    // normalized document rather than the original DOM.
    let preprocessed_html = cleaner::prep_document(&self.html);
    let preprocessed_doc = Html::parse_document(&preprocessed_html);
    match grab_article(&preprocessed_doc, &self.options) {
        Ok(Some(content_html)) => {
            // Light cleaning first; on error fall back to the raw grab.
            let cleaned_wrapper_html =
                cleaner::clean_article_content_light(&content_html, self.base_url.as_deref())
                    .unwrap_or_else(|_| content_html.clone());
            let mut prepped_html = crate::post_processor::prep_article(
                &cleaned_wrapper_html,
                self.options.clean_styles,
                self.options.clean_whitespace,
            );
            // Optionally strip the title from the body so it is not duplicated.
            if self.options.remove_title_from_content {
                if let Some(ref title) = self.metadata.title {
                    prepped_html =
                        crate::post_processor::remove_title_from_content(&prepped_html, title);
                }
            }
            // Full cleaning is best-effort: keep the prepped HTML on failure.
            let cleaned_html =
                match cleaner::clean_article_content(&prepped_html, self.base_url.as_deref()) {
                    Ok(html) => html,
                    Err(e) => {
                        if self.options.debug {
                            eprintln!("Error cleaning content: {e}");
                        }
                        prepped_html
                    }
                };
            let text_content = self.get_text_content(&cleaned_html);
            // NOTE(review): `len()` is a byte count, not a character count —
            // confirm callers expect bytes for `Article::length`.
            let length = text_content.len();
            // Excerpt priority: metadata, then cleaned HTML, then plain text.
            let excerpt = self.metadata.excerpt.clone().or_else(|| {
                self.generate_excerpt_from_html(&cleaned_html)
                    .or_else(|| self.generate_excerpt_from_text(&text_content))
            });
            let dir = crate::dom_utils::get_article_direction(&self.document);
            let markdown_content = if self.options.output_markdown {
                let md_opts = self
                    .options
                    .markdown_options
                    .as_ref()
                    .cloned()
                    .unwrap_or_default();
                // Standardize elements before the HTML → markdown conversion.
                let standardized = crate::elements::standardize_all(
                    &cleaned_html,
                    self.metadata.title.as_deref(),
                );
                Some(crate::markdown::html_to_markdown(&standardized, &md_opts))
            } else {
                None
            };
            Some(Article {
                title: self.metadata.title,
                content: Some(cleaned_html),
                raw_content: Some(content_html),
                text_content: Some(text_content),
                length,
                excerpt,
                image: self.metadata.image,
                byline: self.metadata.byline,
                dir,
                site_name: self.metadata.site_name,
                lang: self.metadata.lang,
                published_time: self.metadata.published_time,
                markdown_content,
            })
        }
        Ok(None) => None,
        Err(e) => {
            if self.options.debug {
                eprintln!("Error grabbing article: {e}");
            }
            None
        }
    }
}
/// Concatenates all text nodes of an HTML fragment into a single string.
fn get_text_content(&self, html: &str) -> String {
    let fragment = Html::parse_fragment(html);
    let mut out = String::new();
    for piece in fragment.root_element().text() {
        out.push_str(piece);
    }
    out
}
/// Picks the first substantial `<p>` from the cleaned HTML as the excerpt.
///
/// Skips paragraphs that are too short (< 25 bytes), look like bracket
/// menus, match noise class/id heuristics, or read like a byline.
fn generate_excerpt_from_html(&self, html: &str) -> Option<String> {
    let doc = Html::parse_fragment(html);
    let paragraphs = Selector::parse("p").ok()?;
    doc.select(&paragraphs).find_map(|p| {
        let raw: String = p.text().collect();
        let candidate = raw.trim();
        if candidate.len() < 25 || utils::looks_like_bracket_menu(candidate) {
            return None;
        }
        let class_lower = p.value().attr("class").unwrap_or("").to_lowercase();
        let id_lower = p.value().attr("id").unwrap_or("").to_lowercase();
        if Self::paragraph_is_excerpt_noise(&p, candidate, &class_lower, &id_lower) {
            return None;
        }
        // Bylines make poor excerpts; detect them by text or by class/id hints.
        let is_byline = utils::looks_like_byline(candidate)
            || ["byline", "author"]
                .iter()
                .any(|kw| class_lower.contains(kw) || id_lower.contains(kw));
        if is_byline {
            return None;
        }
        Some(candidate.to_string())
    })
}
/// Heuristic filter for paragraphs that should never become the excerpt:
/// wiki-style chrome (hatnotes, navboxes), `role="note"` asides,
/// boilerplate lead-ins, and link-dense navigation paragraphs.
fn paragraph_is_excerpt_noise(
    element: &ElementRef,
    text: &str,
    class_lower: &str,
    id_lower: &str,
) -> bool {
    // Class/id fragments that mark page chrome rather than article prose.
    let noise_keywords = [
        "hatnote",
        "shortdescription",
        "metadata",
        "navbox",
        "dablink",
        "noprint",
        "mwe-math-element",
        "mw-empty-elt",
    ];
    for kw in noise_keywords {
        if class_lower.contains(kw) || id_lower.contains(kw) {
            return true;
        }
    }
    // ARIA role="note" marks side commentary, not main content.
    if matches!(element.value().attr("role"), Some(r) if r.eq_ignore_ascii_case("note")) {
        return true;
    }
    // Common section/boilerplate lead-ins, compared case-insensitively.
    let lowered = text.to_lowercase();
    let boilerplate_prefixes = [
        "see also",
        "coordinates",
        "navigation menu",
        "external links",
        "further reading",
    ];
    if boilerplate_prefixes
        .iter()
        .any(|prefix| lowered.starts_with(prefix))
    {
        return true;
    }
    // Mostly-links paragraphs are navigation, not content.
    dom_utils::get_link_density(*element) > 0.8
}
/// Fallback excerpt generation from plain text when no suitable `<p>` exists.
///
/// Prefers the first blank-line-separated paragraph of at least 80 bytes
/// that is not a bracket menu; otherwise uses the whole text when it is
/// longer than 40 bytes. Results are capped at 300 characters.
fn generate_excerpt_from_text(&self, text: &str) -> Option<String> {
    let whole = text.trim();
    if whole.is_empty() {
        return None;
    }
    // First choice: a substantial, non-menu paragraph.
    let paragraph = whole
        .split("\n\n")
        .map(str::trim)
        .find(|para| para.len() >= 80 && !utils::looks_like_bracket_menu(para));
    if let Some(para) = paragraph {
        return Some(self.truncate_text(para, 300));
    }
    // Last resort: the full text, if it is long enough and not a menu.
    if !utils::looks_like_bracket_menu(whole) && whole.len() > 40 {
        Some(self.truncate_text(whole, 300))
    } else {
        None
    }
}
/// Caps `text` at `max_len` characters, preferring to break at the last
/// whitespace inside the limit so words are not cut mid-way.
fn truncate_text(&self, text: &str, max_len: usize) -> String {
    if text.chars().count() <= max_len {
        return text.to_string();
    }
    let head: String = text.chars().take(max_len).collect();
    // `rfind` yields the byte offset of the whitespace char's start, which
    // is always a valid slice boundary.
    match head.rfind(char::is_whitespace) {
        Some(cut) => head[..cut].trim().to_string(),
        None => head.trim().to_string(),
    }
}
/// Writes a prefixed debug line to stderr when debug output is enabled.
#[allow(dead_code)]
fn log(&self, message: &str) {
    if !self.options.debug {
        return;
    }
    eprintln!("Reader: (Readability) {message}");
}
}
#[cfg(test)]
mod tests {
    use super::*;

    // Constructing with valid HTML and no URL succeeds.
    #[test]
    fn test_new_readability() {
        let html = r#"<html><body><p>Test</p></body></html>"#;
        let result = Readability::new(html, None, None);
        assert!(result.is_ok());
    }

    // A non-parseable URL is rejected at construction time.
    #[test]
    fn test_invalid_url() {
        let html = r#"<html><body><p>Test</p></body></html>"#;
        let result = Readability::new(html, Some("not a url"), None);
        assert!(result.is_err());
    }

    // Smoke test: parse must not panic on a minimal article document.
    #[test]
    fn test_parse_simple() {
        let html = r#"
<html>
<body>
<article>
<h1>Test Article</h1>
<p>This is a test article with some content.</p>
</article>
</body>
</html>
"#;
        let readability = Readability::new(html, None, None).unwrap();
        let _article = readability.parse();
    }

    // Excerpt extraction must skip hatnote/role="note" paragraphs and
    // return the first real content paragraph instead.
    #[test]
    fn excerpt_skips_hatnote_paragraphs() {
        let html = r#"
<p class="hatnote" role="note">See also: Something else entirely.</p>
<p>This is the first real paragraph with sufficient length to act as an excerpt. It should be returned.</p>
"#;
        let reader = Readability::new(html, None, None).unwrap();
        let excerpt = reader.generate_excerpt_from_html(html);
        assert_eq!(
            excerpt,
            Some(
                "This is the first real paragraph with sufficient length to act as an excerpt. It should be returned."
                    .to_string()
            )
        );
    }
}