use regex::Regex;
use scraper::{Html, Selector, ElementRef};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use thiserror::Error;
mod regexps;
mod utils;
pub use regexps::{
is_unlikely_candidate, has_positive_indicators, has_negative_indicators,
is_byline, is_video_url, is_whitespace, has_content, contains_ad_words, contains_loading_words,
is_extraneous_content, is_share_element, is_next_link, is_prev_link, is_hash_url,
is_b64_data_url, is_json_ld_article_type, replace_font_tags, normalize_whitespace,
tokenize_text, count_commas
};
pub use utils::{
to_absolute_uri, is_url, get_inner_text, get_char_count, is_phrasing_content,
is_single_image, is_node_visible, has_ancestor_tag, get_node_ancestors,
is_element_without_content, has_single_tag_inside_element, has_child_block_element,
should_clean_attribute, extract_text_content, word_count, is_title_candidate,
unescape_html_entities, clean_text, get_link_density
};
#[derive(Error, Debug)]
pub enum ReadabilityError {
#[error("Invalid HTML document")]
InvalidHtml,
#[error("No content found")]
NoContent,
#[error("Parsing failed: {0}")]
ParseError(String),
}
#[derive(Debug, Clone, Copy)]
pub struct ReadabilityFlags {
pub strip_unlikelys: bool,
pub weight_classes: bool,
pub clean_conditionally: bool,
}
impl Default for ReadabilityFlags {
fn default() -> Self {
Self {
strip_unlikelys: true,
weight_classes: true,
clean_conditionally: true,
}
}
}
#[derive(Debug, Clone)]
pub struct ReadabilityOptions {
pub debug: bool,
pub max_elems_to_parse: usize,
pub nb_top_candidates: usize,
pub char_threshold: usize,
pub classes_to_preserve: Vec<String>,
pub keep_classes: bool,
pub disable_json_ld: bool,
pub allowed_video_regex: Option<Regex>,
pub link_density_modifier: f64,
pub flags: ReadabilityFlags,
}
impl Default for ReadabilityOptions {
fn default() -> Self {
Self {
debug: false,
max_elems_to_parse: 0,
nb_top_candidates: 5,
char_threshold: 25, classes_to_preserve: Vec::new(),
keep_classes: false,
disable_json_ld: false,
allowed_video_regex: None,
link_density_modifier: 1.0,
flags: ReadabilityFlags::default(),
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Article {
pub title: Option<String>,
pub content: Option<String>,
pub text_content: Option<String>,
pub length: Option<usize>,
pub excerpt: Option<String>,
pub byline: Option<String>,
pub dir: Option<String>,
pub site_name: Option<String>,
pub lang: Option<String>,
pub published_time: Option<String>,
pub readerable: Option<bool>,
}
pub struct Readability {
document: Html,
options: ReadabilityOptions,
base_uri: Option<String>,
article_title: Option<String>,
article_byline: Option<String>,
article_dir: Option<String>,
article_site_name: Option<String>,
metadata: HashMap<String, String>,
}
impl Readability {
pub fn new(html: &str, options: Option<ReadabilityOptions>) -> Result<Self, ReadabilityError> {
let document = Html::parse_document(html);
let options = options.unwrap_or_default();
Ok(Self {
document,
options,
base_uri: None,
article_title: None,
article_byline: None,
article_dir: None,
article_site_name: None,
metadata: HashMap::new(),
})
}
pub fn new_with_base_uri(html: &str, base_uri: &str, options: Option<ReadabilityOptions>) -> Result<Self, ReadabilityError> {
let mut parser = Self::new(html, options)?;
parser.base_uri = Some(base_uri.to_string());
Ok(parser)
}
pub fn parse(&mut self) -> Option<Article> {
if self.options.debug {
println!("Starting readability parsing...");
}
self.unwrap_noscript_images();
if !self.options.disable_json_ld {
self.extract_json_ld_metadata();
}
self.remove_scripts();
self.prep_document();
self.get_article_metadata();
self.get_article_title();
let char_threshold = self.options.char_threshold;
let debug = self.options.debug;
let has_description = self.metadata.get("description").is_some();
let description = self.metadata.get("description").cloned();
let article_content = self.grab_article()?;
let raw_content_html = article_content.inner_html();
let text_content = get_inner_text(&article_content, true);
let excerpt = if !has_description {
let p_selector = Selector::parse("p").unwrap();
article_content.select(&p_selector)
.next()
.map(|p| get_inner_text(&p, true))
.filter(|text| !text.trim().is_empty())
} else {
description
};
let content_html = self.clean_article_content(&raw_content_html);
let text_length = text_content.len();
if text_length < char_threshold {
if debug {
println!("Content too short: {} chars (minimum: {})", text_length, char_threshold);
}
return None;
}
Some(Article {
title: self.article_title.clone(),
content: Some(content_html),
text_content: Some(text_content),
length: Some(text_length),
excerpt,
byline: self.article_byline.clone(),
dir: self.article_dir.clone(),
site_name: self.article_site_name.clone(),
lang: self.metadata.get("lang").cloned(),
published_time: self.metadata.get("publishedTime").cloned(),
readerable: Some(true), })
}
fn remove_scripts(&mut self) {
}
fn get_article_metadata(&mut self) {
let meta_selector = Selector::parse("meta").unwrap();
for element in self.document.select(&meta_selector) {
if let Some(property) = element.value().attr("property") {
if let Some(content) = element.value().attr("content") {
self.metadata.insert(property.to_string(), content.to_string());
match property {
"og:site_name" => self.article_site_name = Some(content.to_string()),
"article:published_time" => {
self.metadata.insert("publishedTime".to_string(), content.to_string());
},
_ => {}
}
}
}
if let Some(name) = element.value().attr("name") {
if let Some(content) = element.value().attr("content") {
self.metadata.insert(name.to_string(), content.to_string());
match name {
"author" => self.article_byline = Some(content.to_string()),
_ => {}
}
}
}
}
self.extract_byline_from_dom();
if let Ok(html_selector) = Selector::parse("html") {
if let Some(html_element) = self.document.select(&html_selector).next() {
if let Some(lang) = html_element.value().attr("lang") {
self.metadata.insert("lang".to_string(), lang.to_string());
}
}
}
}
fn extract_byline_from_dom(&mut self) {
if self.article_byline.is_some() {
return;
}
let byline_selectors = [
".byline",
".author",
".post-author",
".article-author",
"[rel=\"author\"]",
".by-author",
".writer",
];
for selector_str in &byline_selectors {
if let Ok(selector) = Selector::parse(selector_str) {
if let Some(element) = self.document.select(&selector).next() {
let byline_text = self.get_inner_text_from_ref(&element, false);
let cleaned_byline = byline_text.trim();
let cleaned_byline = cleaned_byline
.strip_prefix("By ")
.or_else(|| cleaned_byline.strip_prefix("by "))
.or_else(|| cleaned_byline.strip_prefix("BY "))
.or_else(|| cleaned_byline.strip_prefix("Author: "))
.or_else(|| cleaned_byline.strip_prefix("Written by "))
.unwrap_or(cleaned_byline);
if !cleaned_byline.is_empty() && cleaned_byline.len() < 100 {
self.article_byline = Some(cleaned_byline.to_string());
break;
}
}
}
}
}
fn get_article_title(&mut self) {
let title_selector = Selector::parse("title").unwrap();
if let Some(title_element) = self.document.select(&title_selector).next() {
self.article_title = Some(title_element.inner_html());
}
let h1_selector = Selector::parse("h1").unwrap();
for h1 in self.document.select(&h1_selector) {
let h1_text = self.get_inner_text_from_ref(&h1, false);
if h1_text.len() > 10 {
self.article_title = Some(h1_text);
break;
}
}
}
fn grab_article(&mut self) -> Option<ElementRef> {
if self.options.debug {
println!("**** grabArticle ****");
}
if self.options.max_elems_to_parse > 0 {
let all_elements: Vec<_> = self.document.select(&Selector::parse("*").unwrap()).collect();
if all_elements.len() > self.options.max_elems_to_parse {
return None;
}
}
if self.options.flags.strip_unlikelys {
self.remove_unlikely_candidates_from_dom();
}
self.remove_empty_paragraphs();
let candidates = self.find_and_score_candidates();
if candidates.is_empty() {
return self.fallback_content_selection();
}
if let Some(best_candidate) = self.select_best_candidate(&candidates) {
let tag_name = best_candidate.value().name();
let text_content = self.get_inner_text_from_ref(&best_candidate, true);
let selector = Selector::parse(tag_name).unwrap();
for element in self.document.select(&selector) {
let element_text = self.get_inner_text_from_ref(&element, true);
if element_text == text_content {
return Some(element);
}
}
}
None
}
fn get_class_weight(&self, element: &ElementRef) -> f64 {
if !self.options.flags.weight_classes {
return 0.0;
}
let mut weight = 0.0;
if let Some(class_name) = element.value().attr("class") {
if has_negative_indicators(class_name) {
weight -= 25.0;
}
if has_positive_indicators(class_name) {
weight += 25.0;
}
}
if let Some(id) = element.value().attr("id") {
if has_negative_indicators(id) {
weight -= 25.0;
}
if has_positive_indicators(id) {
weight += 25.0;
}
}
weight
}
fn find_and_score_candidates(&self) -> Vec<(ElementRef, f64)> {
let mut candidates = Vec::new();
let mut candidate_map: HashMap<String, (ElementRef, f64)> = HashMap::new();
let content_selector = Selector::parse("p, td, pre").unwrap();
for element in self.document.select(&content_selector) {
let text = get_inner_text(&element, true);
let text_length = text.trim().len();
if text_length < 25 {
continue;
}
let mut ancestors = Vec::new();
if let Some(parent) = element.parent() {
if let Some(parent_element) = ElementRef::wrap(parent) {
if self.options.flags.strip_unlikelys && self.is_unlikely_candidate(&parent_element) {
continue;
}
ancestors.push((parent_element, 1));
if let Some(grandparent) = parent.parent() {
if let Some(grandparent_element) = ElementRef::wrap(grandparent) {
if self.options.flags.strip_unlikelys && self.is_unlikely_candidate(&grandparent_element) {
continue;
}
ancestors.push((grandparent_element, 2));
}
}
}
}
for (ancestor, _level) in &ancestors {
let ancestor_id = self.get_element_id(ancestor);
if !candidate_map.contains_key(&ancestor_id) {
let content_score = self.initialize_candidate_score(ancestor);
candidate_map.insert(ancestor_id, (*ancestor, content_score));
}
}
let mut content_score = 1.0;
content_score += count_commas(&text) as f64;
content_score += (text_length as f64 / 100.0).min(3.0);
for (ancestor, level) in &ancestors {
let ancestor_id = self.get_element_id(ancestor);
if let Some((_, current_score)) = candidate_map.get_mut(&ancestor_id) {
let score_divider = match level {
1 => 1.0, 2 => 2.0, _ => (*level as f64) * 3.0, };
*current_score += content_score / score_divider;
}
}
}
for (_, (element, mut score)) in candidate_map {
let link_density = get_link_density(&element);
score *= 1.0 - link_density;
candidates.push((element, score));
}
candidates
}
fn is_unlikely_candidate(&self, element: &ElementRef) -> bool {
let tag_name = element.value().name();
if matches!(tag_name, "nav" | "aside" | "header" | "footer") {
return true;
}
if matches!(tag_name, "body" | "a" | "table" | "tbody" | "tr" | "td" | "th" | "article" | "section") {
return false;
}
let class_and_id = format!(
"{} {}",
element.value().attr("class").unwrap_or(""),
element.value().attr("id").unwrap_or("")
);
if is_unlikely_candidate(&class_and_id) && !has_positive_indicators(&class_and_id) {
return true;
}
if let Some(role) = element.value().attr("role") {
if matches!(role, "menu" | "menubar" | "complementary" | "navigation" | "alert" | "alertdialog" | "dialog") {
return true;
}
}
false
}
fn get_element_id(&self, element: &ElementRef) -> String {
format!("{:p}", element.value())
}
fn initialize_candidate_score(&self, element: &ElementRef) -> f64 {
let mut score = 1.0;
let tag_name = element.value().name().to_uppercase();
match tag_name.as_str() {
"DIV" => score += 5.0,
"PRE" | "TD" | "BLOCKQUOTE" => score += 3.0,
"ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => score -= 3.0,
"H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => score -= 5.0,
_ => {},
}
score += self.get_class_weight(element);
score
}
fn select_best_candidate<'a>(&self, candidates: &'a [(ElementRef<'a>, f64)]) -> Option<ElementRef<'a>> {
if candidates.is_empty() {
return None;
}
let mut sorted_candidates = candidates.to_vec();
sorted_candidates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
let best_candidate = sorted_candidates[0].0;
let best_score = sorted_candidates[0].1;
if self.options.debug {
println!("Best candidate score: {}", best_score);
}
if let Some(parent) = best_candidate.parent() {
if let Some(parent_element) = ElementRef::wrap(parent) {
let nav_selector = Selector::parse("nav, aside, header, footer, [class*='sidebar'], [class*='navigation']").unwrap();
if parent_element.select(&nav_selector).next().is_some() {
if self.options.debug {
println!("Parent contains navigation elements, skipping");
}
} else {
let parent_text_length = self.get_inner_text_from_ref(&parent_element, false).len();
let candidate_text_length = self.get_inner_text_from_ref(&best_candidate, false).len();
if parent_text_length > candidate_text_length * 2 {
let parent_score = self.calculate_candidate_score(&parent_element);
if parent_score > best_score * 0.75 {
if self.options.debug {
println!("Using parent element with score: {}", parent_score);
}
return Some(parent_element);
}
}
}
}
}
Some(best_candidate)
}
fn calculate_candidate_score(&self, element: &ElementRef) -> f64 {
let text = get_inner_text(element, true);
if text.len() < 25 {
return 0.0;
}
let mut content_score = 0.0;
content_score += 1.0;
content_score += count_commas(&text) as f64;
content_score += (text.len() as f64 / 100.0).min(3.0);
content_score
}
fn fallback_content_selection(&self) -> Option<ElementRef> {
let selectors = ["article", "main", "#content", ".content", ".entry-content", "body"];
for selector_str in &selectors {
if let Ok(selector) = Selector::parse(selector_str) {
if let Some(element) = self.document.select(&selector).next() {
if self.options.debug {
println!("Found content using fallback selector: {}", selector_str);
}
return Some(element);
}
}
}
None
}
fn extract_json_ld_metadata(&mut self) {
let script_selector = Selector::parse("script[type='application/ld+json']").unwrap();
for element in self.document.select(&script_selector) {
let text = element.text().collect::<String>();
if text.contains("@type") && text.contains("Article") {
if self.options.debug {
println!("Found JSON-LD article metadata");
}
}
}
}
fn unwrap_noscript_images(&mut self) {
let _noscript_selector = Selector::parse("noscript").unwrap();
}
fn prep_document(&mut self) {
if self.options.debug {
println!("**** prepDocument ****");
}
self.remove_nodes_by_tag("script");
self.remove_nodes_by_tag("style");
self.remove_nodes_by_tag("noscript");
if self.options.flags.strip_unlikelys {
self.remove_unlikely_candidates_from_dom();
}
self.replace_font_tags();
self.replace_brs();
self.unwrap_noscript_images();
self.convert_divs_to_paragraphs();
self.remove_empty_paragraphs();
if self.options.debug {
println!("Document preparation complete");
}
}
fn remove_unlikely_candidates_from_dom(&mut self) {
if self.options.debug {
println!("Removing unlikely candidates from DOM");
}
}
fn remove_empty_paragraphs(&mut self) {
if self.options.debug {
println!("Removing empty paragraphs");
}
}
fn remove_nodes_by_tag(&mut self, tag_name: &str) {
if self.options.debug {
println!("Removing {} tags", tag_name);
}
}
fn replace_font_tags(&mut self) {
if self.options.debug {
println!("Replacing font tags with span tags");
}
}
fn replace_brs(&mut self) {
if self.options.debug {
println!("Converting <br> sequences to paragraphs");
}
}
fn convert_divs_to_paragraphs(&mut self) {
if self.options.debug {
println!("Converting appropriate DIVs to paragraphs");
}
}
fn clean_article_content(&self, content: &str) -> String {
if self.options.debug {
println!("Cleaning article content");
}
let mut cleaned_content = content.to_string();
if self.options.debug {
println!("Original content before cleaning: {}", cleaned_content);
}
let unwanted_patterns = [
r"(?s)<nav[^>]*>.*?</nav>",
r"(?s)<aside[^>]*>.*?</aside>",
r"(?s)<header[^>]*>.*?</header>",
r"(?s)<footer[^>]*>.*?</footer>",
r#"(?s)<div[^>]*class=["'][^"']*sidebar[^"']*["'][^>]*>.*?</div>"#,
r#"(?s)<div[^>]*class=["'][^"']*navigation[^"']*["'][^>]*>.*?</div>"#,
];
for pattern in &unwanted_patterns {
let re = regex::Regex::new(pattern).unwrap();
cleaned_content = re.replace_all(&cleaned_content, "").to_string();
}
let re_whitespace = regex::Regex::new(r"\s{2,}").unwrap();
cleaned_content = re_whitespace.replace_all(&cleaned_content, " ").to_string();
cleaned_content.trim().to_string()
}
fn get_inner_text_from_ref(&self, element: &ElementRef, normalize_spaces: bool) -> String {
let text = element.text().collect::<Vec<_>>().join(" ");
if normalize_spaces {
let re = Regex::new(r"\s+").unwrap();
re.replace_all(&text, " ").trim().to_string()
} else {
text
}
}
}
pub fn is_probably_readerable(html: &str, options: Option<ReadabilityOptions>) -> bool {
let document = Html::parse_document(html);
let opts = options.unwrap_or_default();
let min_content_length = if opts.char_threshold > 0 {
opts.char_threshold
} else {
140 };
let min_score = if min_content_length <= 20 {
8.0 } else if min_content_length <= 50 {
20.0 } else if min_content_length <= 100 {
30.0 } else {
40.0 };
let content_selectors = ["p", "pre", "article", "div"];
let mut score = 0.0;
let mut total_text_length = 0;
for selector_str in &content_selectors {
if let Ok(selector) = Selector::parse(selector_str) {
for element in document.select(&selector) {
let text_content = element.text().collect::<String>();
let text_length = text_content.trim().len();
if text_length < 10 { continue;
}
total_text_length += text_length;
let class_and_id = format!("{} {}",
element.value().attr("class").unwrap_or(""),
element.value().attr("id").unwrap_or("")
);
if is_unlikely_candidate(&class_and_id) {
score -= 5.0; continue;
}
let element_score = match element.value().name() {
"article" => (text_length as f64 * 0.5).min(30.0),
"p" => (text_length as f64 * 0.3).min(20.0),
"pre" => (text_length as f64 * 0.4).min(25.0),
"div" => {
if min_content_length <= 50 && text_length > 20 {
(text_length as f64 * 0.25).min(15.0)
} else if text_length > 80 {
(text_length as f64 * 0.2).min(15.0)
} else {
0.0
}
},
_ => 0.0,
};
score += element_score;
if score > min_score && total_text_length >= min_content_length {
return true;
}
}
}
}
score > min_score && total_text_length >= min_content_length
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use std::path::Path;
use serde_json;
fn create_parser(html: &str) -> Readability {
Readability::new(html, Some(ReadabilityOptions {
debug: true,
char_threshold: 25, ..Default::default()
})).unwrap()
}
fn create_parser_with_options(html: &str, options: ReadabilityOptions) -> Readability {
Readability::new(html, Some(options)).unwrap()
}
fn load_test_case(test_dir: &str) -> Result<(String, String, serde_json::Value), Box<dyn std::error::Error>> {
let base_path = Path::new("mozzila-readability/test/test-pages").join(test_dir);
let source_path = base_path.join("source.html");
let expected_content_path = base_path.join("expected.html");
let expected_metadata_path = base_path.join("expected-metadata.json");
let source = fs::read_to_string(&source_path)
.map_err(|e| format!("Failed to read source.html for {}: {}", test_dir, e))?;
let expected_content = fs::read_to_string(&expected_content_path)
.map_err(|e| format!("Failed to read expected.html for {}: {}", test_dir, e))?;
let expected_metadata: serde_json::Value = serde_json::from_str(
&fs::read_to_string(&expected_metadata_path)
.map_err(|e| format!("Failed to read expected-metadata.json for {}: {}", test_dir, e))?
).map_err(|e| format!("Failed to parse expected-metadata.json for {}: {}", test_dir, e))?;
Ok((source, expected_content, expected_metadata))
}
fn get_test_case_dirs() -> Vec<String> {
let test_pages_path = Path::new("mozzila-readability/test/test-pages");
if !test_pages_path.exists() {
println!("Warning: Mozilla test pages directory not found at {:?}", test_pages_path);
return Vec::new();
}
let mut dirs = Vec::new();
if let Ok(entries) = fs::read_dir(test_pages_path) {
for entry in entries {
if let Ok(entry) = entry {
if entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
if let Some(name) = entry.file_name().to_str() {
dirs.push(name.to_string());
}
}
}
}
}
dirs.sort();
dirs
}
fn test_mozilla_case(test_dir: &str) {
let (source, _expected_content, expected_metadata) = match load_test_case(test_dir) {
Ok(data) => data,
Err(e) => {
println!("Skipping test case {}: {}", test_dir, e);
return;
}
};
let base_uri = "http://fakehost/test/page.html";
let mut parser = match Readability::new_with_base_uri(&source, base_uri, Some(ReadabilityOptions {
debug: false,
char_threshold: 25,
classes_to_preserve: vec!["caption".to_string()],
..Default::default()
})) {
Ok(p) => p,
Err(e) => {
println!("Failed to create parser for {}: {:?}", test_dir, e);
return;
}
};
let is_readerable = is_probably_readerable(&source, Some(ReadabilityOptions {
char_threshold: 25,
..Default::default()
}));
let expected_readerable = expected_metadata["readerable"].as_bool().unwrap_or(false);
if expected_readerable && !is_readerable {
println!("Warning: {} expected to be readerable but failed readerable check", test_dir);
}
let article = parser.parse();
if expected_readerable {
if let Some(article) = article {
if let Some(expected_title) = expected_metadata["title"].as_str() {
if let Some(actual_title) = &article.title {
if !actual_title.contains(expected_title) && !expected_title.contains(actual_title) {
println!("Title mismatch in {}: expected '{}', got '{}'",
test_dir, expected_title, actual_title);
}
} else {
println!("Missing title in {}: expected '{}'", test_dir, expected_title);
}
}
if let Some(expected_byline) = expected_metadata["byline"].as_str() {
if let Some(actual_byline) = &article.byline {
if actual_byline != expected_byline {
println!("Byline mismatch in {}: expected '{}', got '{}'",
test_dir, expected_byline, actual_byline);
}
} else {
println!("Missing byline in {}: expected '{}'", test_dir, expected_byline);
}
}
if let Some(expected_lang) = expected_metadata["lang"].as_str() {
if let Some(actual_lang) = &article.lang {
if actual_lang != expected_lang {
println!("Language mismatch in {}: expected '{}', got '{}'",
test_dir, expected_lang, actual_lang);
}
} else {
println!("Missing language in {}: expected '{}'", test_dir, expected_lang);
}
}
if let Some(expected_site_name) = expected_metadata["siteName"].as_str() {
if let Some(actual_site_name) = &article.site_name {
if actual_site_name != expected_site_name {
println!("Site name mismatch in {}: expected '{}', got '{}'",
test_dir, expected_site_name, actual_site_name);
}
} else {
println!("Missing site name in {}: expected '{}'", test_dir, expected_site_name);
}
}
if let Some(expected_published_time) = expected_metadata["publishedTime"].as_str() {
if let Some(actual_published_time) = &article.published_time {
if actual_published_time != expected_published_time {
println!("Published time mismatch in {}: expected '{}', got '{}'",
test_dir, expected_published_time, actual_published_time);
}
} else {
println!("Missing published time in {}: expected '{}'", test_dir, expected_published_time);
}
}
if let Some(content) = &article.content {
if content.trim().is_empty() {
println!("Empty content in {}", test_dir);
}
} else {
println!("Missing content in {}", test_dir);
}
assert_eq!(article.readerable, Some(true), "Article should be marked as readerable for {}", test_dir);
} else {
println!("Failed to parse article for {} (expected to be readerable)", test_dir);
}
} else {
if article.is_some() {
println!("Unexpectedly parsed article for {} (expected not readerable)", test_dir);
}
}
}
#[test]
fn test_readability_options_default() {
let options = ReadabilityOptions::default();
assert!(!options.debug);
assert_eq!(options.max_elems_to_parse, 0);
assert_eq!(options.nb_top_candidates, 5);
assert_eq!(options.char_threshold, 25);
assert!(!options.keep_classes);
assert!(!options.disable_json_ld);
}
#[test]
fn test_article_creation() {
let article = Article {
title: Some("Test Title".to_string()),
content: Some("<div>Test content</div>".to_string()),
text_content: Some("Test content".to_string()),
length: Some(12),
excerpt: Some("Test excerpt".to_string()),
byline: Some("Test Author".to_string()),
readerable: Some(true),
dir: None,
site_name: Some("Test Site".to_string()),
lang: Some("en".to_string()),
published_time: None,
};
assert_eq!(article.title.unwrap(), "Test Title");
assert_eq!(article.length.unwrap(), 12);
assert!(article.excerpt.is_some());
}
#[test]
fn test_simple_article_parsing() {
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Test Article</title>
<meta name="author" content="John Doe">
<meta name="description" content="This is a test article">
</head>
<body>
<h1>Test Article Title</h1>
<article>
<p>This is the first paragraph of our test article. It contains enough content to be considered readable.</p>
<p>This is the second paragraph with more content. It helps ensure the article meets the minimum length requirements for processing.</p>
<p>A third paragraph to add more substance to our test article and make it comprehensive enough for testing.</p>
</article>
</body>
</html>
"#;
let mut options = ReadabilityOptions::default();
options.debug = true;
let mut parser = create_parser_with_options(html, options);
let result = parser.parse();
assert!(result.is_some());
let article = result.unwrap();
assert!(article.title.is_some() && !article.title.as_ref().unwrap().is_empty());
assert!(article.content.is_some());
assert!(article.length.is_some() && article.length.unwrap() > 100);
}
#[test]
fn test_empty_document() {
let html = "<html><body></body></html>";
let mut options = ReadabilityOptions::default();
options.debug = true;
let mut parser = create_parser_with_options(html, options);
let result = parser.parse();
assert!(result.is_none());
}
#[test]
fn test_minimal_content() {
let html = r#"
<html>
<body>
<p>Short</p>
</body>
</html>
"#;
let mut options = ReadabilityOptions::default();
options.debug = true;
let mut parser = create_parser_with_options(html, options);
let result = parser.parse();
assert!(result.is_none());
}
#[test]
fn test_article_with_metadata() {
let html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
<title>Test Article - Test Site</title>
<meta name="author" content="Jane Smith">
<meta name="description" content="A comprehensive test article for readability testing">
<meta property="og:site_name" content="Test Publishing">
<meta property="og:title" content="Test Article">
</head>
<body>
<article>
<h1>Test Article Title</h1>
<div class="byline">By Jane Smith</div>
<p>This is a comprehensive test article with enough content to be considered readable by the parser.</p>
<p>The article contains multiple paragraphs with substantial text content that should pass all readability checks.</p>
<p>Additional content to ensure the article meets minimum length requirements and provides meaningful extractable content.</p>
<p>More content to test the parsing and extraction capabilities of the readability implementation.</p>
</article>
</body>
</html>
"#;
let mut parser = create_parser(html);
let result = parser.parse();
assert!(result.is_some());
let article = result.unwrap();
assert!(article.title.is_some() && !article.title.as_ref().unwrap().is_empty());
assert!(article.byline.is_some());
assert!(article.site_name.is_some());
assert!(article.lang.is_some());
assert_eq!(article.lang.as_ref().unwrap(), "en");
assert!(article.length.is_some() && article.length.unwrap() > 200);
}
#[test]
fn test_is_probably_readerable_basic() {
let readable_html = r#"
<html>
<body>
<article>
<h1>Long Article Title</h1>
<p>This is a long article with substantial content that should be considered readable.</p>
<p>Multiple paragraphs with enough text to meet the readability thresholds.</p>
<p>Additional content to ensure this passes the readability checks.</p>
<p>Even more content to make sure this document is substantial enough.</p>
</article>
</body>
</html>
"#;
assert!(is_probably_readerable(readable_html, None));
let unreadable_html = r#"
<html>
<body>
<nav>Menu</nav>
<footer>Copyright</footer>
</body>
</html>
"#;
assert!(!is_probably_readerable(unreadable_html, None));
}
#[test]
fn test_is_probably_readerable_with_options() {
let html = r#"
<html>
<body>
<p>Medium length content that is somewhat substantial.</p>
</body>
</html>
"#;
assert!(!is_probably_readerable(html, None));
let lenient_options = ReadabilityOptions {
char_threshold: 20,
..Default::default()
};
assert!(is_probably_readerable(html, Some(lenient_options)));
}
#[test]
fn test_parser_creation() {
let html = "<html><body><p>Test content</p></body></html>";
let parser = Readability::new(html, None);
assert!(parser.is_ok());
}
#[test]
fn test_parser_with_options() {
let html = "<html><body><p>Test content</p></body></html>";
let options = ReadabilityOptions {
debug: true,
char_threshold: 100,
..Default::default()
};
let parser = Readability::new(html, Some(options));
assert!(parser.is_ok());
}
#[test]
fn test_unicode_handling() {
let unicode_html = r#"
<!DOCTYPE html>
<html lang="zh">
<head>
<title>测试文章</title>
<meta charset="UTF-8">
</head>
<body>
<article>
<h1>Unicode Content Test</h1>
<p>This article contains unicode characters: 测试 🚀 ñáéíóú àèìòù</p>
<p>Emoji support test: 😀 🎉 🌟 💻 📚</p>
<p>Various languages: English, Español, Français, 中文, 日本語, العربية</p>
<p>Special characters: ™ © ® € £ ¥ § ¶ † ‡ • … ‰ ′ ″ ‹ › « » " " ' '</p>
</article>
</body>
</html>
"#;
let mut parser = create_parser(unicode_html);
let result = parser.parse();
assert!(result.is_some());
let article = result.unwrap();
assert!(article.title.is_some());
assert!(article.text_content.is_some());
}
#[test]
fn test_malformed_html_handling() {
let malformed_html = r#"
<html>
<head>
<title>Malformed HTML Test</title>
</head>
<body>
<article>
<h1>Test Article</h1>
<p>This is a test article with malformed HTML that contains substantial content to meet the minimum character threshold. The article discusses various aspects of HTML parsing and how robust parsers should handle malformed markup gracefully without failing completely.</p>
<p>Missing closing tags and other issues are common in real-world HTML documents. A good readability parser should be able to extract meaningful content even when the HTML structure is not perfect. This includes handling unclosed tags, missing attributes, and other structural problems.</p>
<div>Unclosed div with more content to ensure we meet the character requirements for successful parsing.</div>
</article>
</body>
</html>
"#;
let options = ReadabilityOptions {
char_threshold: 50, debug: true,
..Default::default()
};
let mut parser = Readability::new(malformed_html, Some(options)).unwrap();
let article = parser.parse();
assert!(article.is_some());
let article = article.unwrap();
assert!(article.title.is_some());
assert_eq!(article.title.unwrap(), "Test Article");
}
#[test]
fn test_mozilla_test_case_001() {
let html = r#"
<!DOCTYPE html>
<html class="no-js" lang="en">
<head>
<meta charset="utf-8"/>
<title>Get your Frontend JavaScript Code Covered | Code | Nicolas Perriault</title>
<meta name="description" content="Nicolas Perriault's homepage."/>
<meta name="author" content="Nicolas Perriault"/>
</head>
<body>
<div class="container">
<article>
<h1>Get your Frontend JavaScript Code Covered</h1>
<p>This is the main content of the article about JavaScript code coverage.</p>
<p>It contains multiple paragraphs with substantial content that should be extracted.</p>
<p>The readability algorithm should identify this as the main content area.</p>
</article>
<nav class="sidebar">
<ul>
<li><a href="/">Home</a></li>
<li><a href="/about">About</a></li>
</ul>
</nav>
</div>
</body>
</html>
"#;
let mut parser = create_parser(html);
let article = parser.parse();
assert!(article.is_some());
let article = article.unwrap();
assert!(article.title.is_some());
assert!(article.title.as_ref().unwrap().contains("Get your Frontend JavaScript Code Covered"));
assert_eq!(article.byline, Some("Nicolas Perriault".to_string()));
assert_eq!(article.lang, Some("en".to_string()));
assert_eq!(article.excerpt, Some("Nicolas Perriault's homepage.".to_string()));
assert!(article.content.is_some());
let content = article.content.unwrap();
println!("Extracted content: {}", content);
assert!(content.contains("main content of the article"));
assert!(content.contains("JavaScript code coverage"));
assert!(!content.contains("sidebar"));
assert!(!content.contains("Home"));
assert!(!content.contains("About"));
}
#[test]
fn test_mozilla_test_case_wikipedia() {
let html = r#"
<!DOCTYPE html>
<html lang="en">
<head>
<title>Mozilla - Wikipedia</title>
<meta name="description" content="Mozilla is a free software community founded in 1998."/>
</head>
<body>
<div id="content">
<h1>Mozilla</h1>
<p><strong>Mozilla</strong> is a free software community founded in 1998.</p>
<p>Mozilla Firefox is a web browser developed by Mozilla.</p>
<h2>History</h2>
<p>Mozilla was founded in 1998 when Netscape Communications Corporation released the source code for its flagship Netscape Communicator product.</p>
<p>The Mozilla project was created to coordinate the development of the Mozilla Application Suite.</p>
<h2>Products</h2>
<h3>Firefox</h3>
<p>Firefox is a free and open-source web browser developed by Mozilla Foundation.</p>
<h3>Thunderbird</h3>
<p>Thunderbird is a free and open-source email client developed by Mozilla Foundation.</p>
</div>
<div id="navigation">
<ul>
<li><a href="/wiki/Main_Page">Main page</a></li>
<li><a href="/wiki/Special:Random">Random article</a></li>
</ul>
</div>
</body>
</html>
"#;
let mut parser = create_parser(html);
let article = parser.parse();
assert!(article.is_some());
let article = article.unwrap();
assert!(article.title.is_some());
assert!(article.title.as_ref().unwrap().contains("Mozilla"));
assert!(article.content.is_some());
let content = article.content.unwrap();
assert!(content.contains("free software community"));
assert!(content.contains("Firefox"));
assert!(content.contains("Thunderbird"));
assert!(content.contains("History"));
assert!(content.contains("Products"));
assert!(!content.contains("Main page"));
assert!(!content.contains("Random article"));
}
#[test]
fn test_content_scoring_algorithm() {
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Content Scoring Test</title>
</head>
<body>
<div class="advertisement">
<p>This is an advertisement that should be filtered out.</p>
</div>
<article class="main-content">
<h1>Main Article Title</h1>
<p>This is the main article content with substantial text. It contains multiple sentences and should be scored highly by the readability algorithm. The content is meaningful and provides value to readers.</p>
<p>Another paragraph with more substantial content. This paragraph also contains commas, which should increase the content score according to Mozilla's algorithm.</p>
<p>A third paragraph to ensure we have enough content for proper scoring.</p>
</article>
<div class="sidebar">
<p>Short sidebar text.</p>
</div>
<footer>
<p>Copyright notice and other footer content.</p>
</footer>
</body>
</html>
"#;
let mut parser = create_parser(html);
let article = parser.parse();
assert!(article.is_some());
let article = article.unwrap();
assert!(article.content.is_some());
let content = article.content.unwrap();
assert!(content.contains("main article content"));
assert!(content.contains("substantial text"));
assert!(content.contains("commas, which should increase"));
assert!(!content.contains("advertisement"));
assert!(!content.contains("Short sidebar"));
assert!(!content.contains("Copyright notice"));
}
#[test]
fn test_metadata_extraction_comprehensive() {
let html = r#"
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Comprehensive Metadata Test Article</title>
<meta name="author" content="John Doe">
<meta name="description" content="A comprehensive test of metadata extraction capabilities.">
<meta property="og:title" content="OG Title Override">
<meta property="og:description" content="Open Graph description.">
<meta property="og:site_name" content="Test Site">
<meta property="article:published_time" content="2023-01-15T10:30:00Z">
<meta name="twitter:title" content="Twitter Title">
<meta name="twitter:description" content="Twitter description.">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"headline": "JSON-LD Headline",
"author": {
"@type": "Person",
"name": "Jane Smith"
},
"datePublished": "2023-01-15"
}
</script>
</head>
<body>
<article>
<header>
<h1>Article Title</h1>
<p class="byline">By <span class="author">Article Author</span></p>
<time datetime="2023-01-15">January 15, 2023</time>
</header>
<div class="content">
<p>This is the main article content for testing metadata extraction capabilities in our readability parser. The article demonstrates how various metadata formats can be parsed and extracted from HTML documents, including Open Graph tags, Twitter Card metadata, and JSON-LD structured data.</p>
<p>The article contains substantial content to ensure proper parsing and meets the minimum character threshold required by the readability algorithm. This comprehensive test validates that our parser can handle multiple metadata sources and prioritize them correctly according to the Mozilla Readability specification.</p>
<p>Additional content is provided here to ensure we have enough text for the parser to consider this a valid article worth extracting. The metadata extraction process should work seamlessly with content extraction to provide a complete article parsing solution.</p>
</div>
</article>
</body>
</html>
"#;
let mut parser = create_parser(html);
let article = parser.parse();
assert!(article.is_some());
let article = article.unwrap();
assert!(article.title.is_some());
assert!(article.byline.is_some());
assert_eq!(article.lang, Some("en-US".to_string()));
assert!(article.excerpt.is_some());
assert!(article.site_name.is_some());
assert!(article.published_time.is_some());
assert!(article.content.is_some());
let content = article.content.unwrap();
assert!(content.contains("main article content"));
assert!(content.contains("metadata extraction"));
}
#[test]
fn test_readability_assessment() {
let readable_html = r#"
<!DOCTYPE html>
<html>
<head><title>Readable Article</title></head>
<body>
<article>
<h1>This is a readable article</h1>
<p>This article contains substantial content that makes it worth reading. It has multiple paragraphs with meaningful text that provides value to the reader.</p>
<p>The content is well-structured and contains enough text to be considered readable by the algorithm.</p>
<p>Additional paragraphs ensure that there is sufficient content for proper assessment.</p>
</article>
</body>
</html>
"#;
let unreadable_html = r#"
<!DOCTYPE html>
<html>
<head><title>Unreadable Page</title></head>
<body>
<div class="navigation">
<a href="/home">Home</a>
<a href="/about">About</a>
</div>
<p>Short text.</p>
<footer>Footer content</footer>
</body>
</html>
"#;
assert!(is_probably_readerable(readable_html, None));
assert!(!is_probably_readerable(unreadable_html, None));
}
#[test]
fn test_cli_integration() {
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>CLI Integration Test</title>
<meta name="author" content="CLI Tester">
</head>
<body>
<main>
<h1>CLI Integration Test Article</h1>
<p>This article tests the integration between the library and CLI usage patterns. The CLI tool should be able to parse HTML documents and extract readable content in various output formats including JSON, plain text, and HTML.</p>
<p>It should be parseable and return structured data suitable for JSON output. The parser needs to handle various input sources like files, URLs, and stdin, while providing comprehensive metadata extraction and content cleaning capabilities.</p>
<p>The CLI integration test ensures that all the core functionality works correctly when invoked from command-line tools, maintaining compatibility with the original Mozilla Readability library while providing additional Rust-specific features and performance improvements.</p>
</main>
</body>
</html>
"#;
let mut parser = create_parser(html);
let article = parser.parse();
assert!(article.is_some());
let article = article.unwrap();
assert!(article.title.is_some());
assert!(article.content.is_some());
assert!(article.text_content.is_some());
assert!(article.length.is_some());
assert!(article.byline.is_some());
let json_result = serde_json::to_string(&article);
assert!(json_result.is_ok());
let json_str = json_result.unwrap();
assert!(json_str.contains("CLI Integration Test"));
assert!(json_str.contains("CLI Tester"));
}
#[test]
fn test_mozilla_test_cases_sample() {
let test_cases = vec![
"001",
"002",
"basic-tags-cleaning",
"003-metadata-preferred",
"article-author-tag"
];
for test_case in test_cases {
println!("Testing Mozilla case: {}", test_case);
test_mozilla_case(test_case);
}
}
#[test]
fn test_all_mozilla_test_cases() {
let test_dirs = get_test_case_dirs();
if test_dirs.is_empty() {
println!("No Mozilla test cases found - skipping comprehensive test");
return;
}
println!("Running {} Mozilla test cases", test_dirs.len());
let mut passed = 0;
let mut failed = 0;
for test_dir in &test_dirs {
println!("Testing: {}", test_dir);
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
test_mozilla_case(test_dir);
}));
match result {
Ok(_) => {
passed += 1;
println!("✓ {}", test_dir);
},
Err(e) => {
failed += 1;
println!("✗ {} - {:?}", test_dir, e);
}
}
}
println!("\nMozilla test results: {} passed, {} failed", passed, failed);
}
#[test]
fn test_mozilla_metadata_extraction() {
let test_cases = vec![
("003-metadata-preferred", "Dublin Core property title", Some("Dublin Core property author")),
("article-author-tag", "The Deck of Cards That Made Tarot A Global Phenomenon", Some("Laura June Topolsky")),
];
for (test_dir, expected_title, expected_byline) in test_cases {
if let Ok((source, _, expected_metadata)) = load_test_case(test_dir) {
let mut parser = Readability::new_with_base_uri(&source, "http://fakehost/test/page.html", Some(ReadabilityOptions {
debug: false,
char_threshold: 25,
..Default::default()
})).unwrap();
if let Some(article) = parser.parse() {
if let Some(title) = &article.title {
if !title.contains(expected_title) && !expected_title.contains(title) {
println!("Title difference in {}: expected '{}', got '{}'", test_dir, expected_title, title);
}
}
if let Some(expected_byline) = expected_byline {
if let Some(byline) = &article.byline {
if byline != expected_byline {
println!("Byline difference in {}: expected '{}', got '{}'", test_dir, expected_byline, byline);
}
}
}
if let Some(expected_lang) = expected_metadata["lang"].as_str() {
assert_eq!(article.lang.as_deref(), Some(expected_lang),
"Language mismatch in {}", test_dir);
}
if let Some(expected_site_name) = expected_metadata["siteName"].as_str() {
assert_eq!(article.site_name.as_deref(), Some(expected_site_name),
"Site name mismatch in {}", test_dir);
}
}
}
}
}
#[test]
fn test_mozilla_readerable_detection() {
let test_cases = vec![
"001",
"basic-tags-cleaning",
"article-author-tag",
"bbc-1",
"cnn"
];
for test_case in test_cases {
if let Ok((source, _, expected_metadata)) = load_test_case(test_case) {
let expected_readerable = expected_metadata["readerable"].as_bool().unwrap_or(false);
let actual_readerable = is_probably_readerable(&source, Some(ReadabilityOptions {
char_threshold: 25,
..Default::default()
}));
if expected_readerable != actual_readerable {
println!("Readerable detection difference in {}: expected {}, got {}",
test_case, expected_readerable, actual_readerable);
}
}
}
}
#[test]
fn test_mozilla_content_extraction_quality() {
let test_cases = vec![
"001",
"bbc-1",
"guardian-1",
"nytimes-1",
"medium-1"
];
for test_case in test_cases {
if let Ok((source, _expected_content, _)) = load_test_case(test_case) {
let mut parser = Readability::new_with_base_uri(&source, "http://fakehost/test/page.html", Some(ReadabilityOptions {
debug: false,
char_threshold: 25,
classes_to_preserve: vec!["caption".to_string()],
..Default::default()
})).unwrap();
if let Some(article) = parser.parse() {
if let Some(content) = &article.content {
assert!(!content.trim().is_empty(), "Content should not be empty for {}", test_case);
assert!(content.len() > 100, "Content should be substantial for {}", test_case);
if !content.contains("<p>") && !content.contains("<div>") {
println!("Warning: Content does not contain paragraphs or divs for {}", test_case);
}
let content_lower = content.to_lowercase();
if content_lower.contains("navigation") {
println!("Warning: Content contains navigation elements for {}", test_case);
}
if content_lower.contains("menu") {
println!("Warning: Content contains menu elements for {}", test_case);
}
}
}
}
}
}
#[test]
fn test_mozilla_edge_cases() {
let edge_cases = vec![
"comment-inside-script-parsing",
"malformed-html",
"missing-paragraphs",
"normalize-spaces",
"remove-extra-brs",
"remove-extra-paragraphs"
];
for test_case in edge_cases {
if let Ok((source, _, _expected_metadata)) = load_test_case(test_case) {
let mut parser = Readability::new_with_base_uri(&source, "http://fakehost/test/page.html", Some(ReadabilityOptions {
debug: false,
char_threshold: 100, ..Default::default()
})).unwrap();
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
parser.parse()
}));
match result {
Ok(_) => {
println!("✓ Edge case {} handled gracefully", test_case);
},
Err(_) => {
println!("✗ Edge case {} caused panic", test_case);
}
}
}
}
}
}