readabilityrs/lib.rs
1//! # ReadabilityRS
2//!
3//! A Rust port of Mozilla's Readability library for extracting article content from web pages.
4//!
5//! This library is a faithful port of the [Mozilla Readability](https://github.com/mozilla/readability)
6//! JavaScript library, used in Firefox Reader View.
7//!
8//! ## Overview
9//!
10//! ReadabilityRS extracts the main article content from HTML documents, removing clutter such as
11//! advertisements, navigation elements, and other non-essential content. It also extracts metadata:
12//! the article title, author (byline), description, site name, language, and publish date.
13//!
14//! ## Key Features
15//!
16//! - **Content Extraction**: Intelligently identifies and extracts main article content
17//! - **Markdown Output**: Optional HTML-to-Markdown conversion with content standardization
18//! - **Metadata Extraction**: Extracts title, author, description, site name, language, and publish date
19//! - **JSON-LD Support**: Parses structured data from JSON-LD markup
20//! - **Multiple Retry Strategies**: Uses adaptive algorithms to handle various page layouts
21//! - **Customizable Options**: Configure thresholds, scoring, and behavior
22//! - **Pre-flight Check**: Quick check to determine if a page is likely readable
23//!
24//! ## Basic Usage
25//!
26//! ```rust,no_run
27//! use readabilityrs::{Readability, ReadabilityOptions};
28//!
29//! let html = r#"<html><body><article><h1>Title</h1><p>Content...</p></article></body></html>"#;
30//! let url = "https://example.com/article";
31//!
32//! let options = ReadabilityOptions::default();
33//! let readability = Readability::new(html, Some(url), Some(options)).unwrap();
34//!
35//! if let Some(article) = readability.parse() {
36//!     println!("Title: {:?}", article.title);
37//!     println!("Content: {:?}", article.content);
38//!     println!("Author: {:?}", article.byline);
39//! }
40//! ```
41//!
42//! ## Advanced Usage
43//!
44//! ### Custom Options
45//!
46//! ```rust,no_run
47//! use readabilityrs::{Readability, ReadabilityOptions};
48//!
49//! let html = "<html>...</html>";
50//!
51//! let options = ReadabilityOptions::builder()
52//!     .char_threshold(300)
53//!     .nb_top_candidates(10)
54//!     .keep_classes(true)
55//!     .build();
56//!
57//! let readability = Readability::new(html, None, Some(options)).unwrap();
58//! let article = readability.parse();
59//! ```
60//!
61//! ### Pre-flight Check
62//!
63//! Use [`is_probably_readerable`] to quickly check whether a document is likely to be readable
64//! before running the full parse:
65//!
66//! ```rust,no_run
67//! use readabilityrs::is_probably_readerable;
68//!
69//! let html = "<html>...</html>";
70//!
71//! if is_probably_readerable(html, None) {
72//!     // Proceed with full parsing
73//! } else {
74//!     // Skip parsing or use alternative strategy
75//! }
76//! ```
77//!
78//! ## Error Handling
79//!
80//! ```rust,no_run
81//! use readabilityrs::{Readability, ReadabilityError};
82//!
83//! let html = "<html>...</html>";
84//! let url = "not a valid url";
85//!
86//! match Readability::new(html, Some(url), None) {
87//!     Ok(readability) => {
88//!         if let Some(article) = readability.parse() {
89//!             println!("Success!");
90//!         }
91//!     }
92//!     Err(ReadabilityError::InvalidUrl(url)) => {
93//!         eprintln!("Invalid URL: {}", url);
94//!     }
95//!     Err(e) => {
96//!         eprintln!("Error: {}", e);
97//!     }
98//! }
99//! ```
100//!
101//! ## Algorithm
102//!
103//! The extraction algorithm works in several phases. First, scripts and styles are removed
104//! to prepare the document. Then potential content containers are identified throughout the page.
105//! These candidates are scored based on various content signals like paragraph count, text length,
106//! and link density. The best candidate is selected using adaptive strategies with multiple fallback
107//! approaches. Nearby high-quality content is aggregated by examining sibling elements. Finally,
108//! the extracted content goes through post-processing to clean and finalize the output.
109//!
110//! ## Compatibility
111//!
112//! This implementation strives to match the behavior of Mozilla's Readability.js as closely
113//! as possible while leveraging Rust's type system and safety guarantees.
114
115mod article;
116mod cleaner;
117mod constants;
118mod content_extractor;
119mod dom_utils;
120pub mod elements;
121mod error;
122pub mod markdown;
123mod metadata;
124mod options;
125mod post_processor;
126mod readability;
127mod readerable;
128mod scoring;
129mod utils;
130
131// Public exports
132pub use article::Article;
133pub use error::{ReadabilityError, Result};
134pub use markdown::MarkdownOptions;
135pub use options::ReadabilityOptions;
136pub use readability::Readability;
137pub use readerable::{is_probably_readerable, ReaderableOptions};