readability_js/
lib.rs

1//! Extract clean, readable content from web pages using Mozilla's Readability.js algorithm.
2//!
3//! This crate provides both a Rust library and a CLI tool for extracting the main content
4//! from HTML documents, removing navigation, ads, and other clutter. It uses the same
5//! algorithm as Firefox Reader Mode.
6//!
7//! For background on why this crate exists and design decisions, see the
8//! [blog post](https://egemengol.com/blog/readability/).
9//!
10//! # Algorithm
11//!
12//! This crate runs Mozilla's Readability.js library inside an embedded JavaScript engine.
13//! It uses the same algorithm that processes articles in Firefox Reader Mode,
14//! providing high accuracy on modern web content including single-page applications
15//! and complex layouts.
16//!
17//! Unlike the pure-Rust ports that are available, this approach sacrifices
18//! some performance for extraction accuracy and ongoing improvements
19//! from Mozilla's team.
20//!
21//! # Quick Start
22//!
23//! ```rust
24//! use readability_js::Readability;
25//!
26//! let html = r#"<html><body><h1>Article Title</h1><p>Main content...</p></body></html>"#;
27//! let reader = Readability::new()?;
28//! let article = reader.parse(&html)?;
29//!
30//! println!("Title: {}", article.title);
31//! println!("Content: {}", article.content);
32//! # Ok::<(), readability_js::ReadabilityError>(())
33//! ```
34//!
35//! # Parsing with URL Context
36//!
37//! Providing a URL improves link resolution and metadata extraction:
38//!
39//! ```rust
40//! use readability_js::Readability;
41//! # let html = "<html><body><h1>Article Title</h1><p>Main content...</p></body></html>";
42//! let reader = Readability::new()?;
43//! let article = reader.parse_with_url(&html, "https://example.com/article")?;
44//! # Ok::<(), readability_js::ReadabilityError>(())
45//! ```
46//!
47//! # Custom Options
48//!
49//! Configure the parsing behavior with [`ReadabilityOptions`]:
50//!
51//! ```rust
52//! use readability_js::{Readability, ReadabilityOptions};
53//!
54//! let options = ReadabilityOptions::new()
55//!     .char_threshold(500)
56//!     .keep_classes(true);
57//!
58//! let reader = Readability::new()?;
59//! let article = reader.parse_with_options(&html, Some("https://example.com"), Some(options))?;
60//! # Ok::<(), readability_js::ReadabilityError>(())
61//! ```
62//!
63//! # Performance Considerations
64//!
65//! Creating a [`Readability`] instance is expensive (~30ms) as it initializes a JavaScript
66//! engine. Once created, parsing individual documents is fast (~10ms). Reuse the same instance
67//! when processing multiple documents:
68//!
69//! ```rust
70//! use readability_js::Readability;
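71//! # let documents: Vec<String> = Vec::new(); fn process_article(_article: readability_js::Article) {}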
72//! let reader = Readability::new()?;
73//! for html in documents {
74//!     let article = reader.parse(&html)?;
75//!     process_article(article);
76//! }
77//! # Ok::<(), readability_js::ReadabilityError>(())
78//! ```
79//!
80//! # Error Handling
81//!
82//! The most common error is [`ReadabilityError::ReadabilityCheckFailed`], which occurs
83//! when the algorithm cannot extract sufficient readable content:
84//!
85//! ```rust
86//! use readability_js::{Readability, ReadabilityError, ReadabilityOptions};
87//!
88//! let reader = Readability::new()?;
89//! match reader.parse(&html) {
90//!     Ok(article) => println!("Extracted: {}", article.title),
91//!     Err(ReadabilityError::ReadabilityCheckFailed) => {
92//!         // Try with lower threshold
93//!         let options = ReadabilityOptions::new().char_threshold(100);
94//!         let article = reader.parse_with_options(&html, None, Some(options))?;
95//!         println!("Extracted with relaxed settings: {}", article.title);
96//!     }
97//!     Err(e) => return Err(e),
98//! }
99//! # Ok::<(), readability_js::ReadabilityError>(())
100//! ```
101//!
102//! # CLI Usage
103//!
104//! The CLI tool extracts content and converts it to clean Markdown:
105//!
106//! ```bash
107//! # Install the CLI tool
108//! cargo install readability-js-cli
109//!
110//! # Process local files
111//! readable article.html > article.md
112//!
113//! # Fetch and process URLs
114//! readable https://example.com/news > news.md
115//!
116//! # Process from stdin (great for pipelines)
117//! curl -s https://site.com/article | readable > clean.md
118//!
119//! # View directly in terminal
120//! readable https://news.site/story | less
121//! ```
122//!
123//! The CLI automatically:
124//! - Detects whether input is a file path or URL
125//! - Fetches web content with proper headers
126//! - Converts the clean HTML to Markdown
127//! - Handles errors gracefully
128//!
129//! # Troubleshooting
130//!
131//! ## "Content failed readability check"
132//!
133//! This happens when the page doesn't contain enough readable content or
134//! the algorithm can't distinguish content from navigation. Try:
135//!
136//! ```rust
137//! use readability_js::{Readability, ReadabilityOptions};
138//!
139//! let options = ReadabilityOptions::new()
140//!     .char_threshold(100)         // Lower threshold (default: ~140)
141//!     .nb_top_candidates(10)       // Consider more candidates
142//!     .link_density_modifier(2.0); // More permissive with links
143//!
144//! let reader = Readability::new()?;
145//! let article = reader.parse_with_options(&html, None, Some(options))?;
146//! # Ok::<(), readability_js::ReadabilityError>(())
147//! ```
148//!
149//! ## Poor extraction quality
150//!
151//! If the extracted content is incomplete or includes unwanted elements:
152//!
153//! ```rust
154//! use readability_js::{Readability, ReadabilityOptions};
155//!
156//! // Better link resolution and metadata extraction
157//! let reader = Readability::new()?;
158//! let article = reader.parse_with_url(&html, "https://example.com/article")?;
159//!
160//! // Or preserve important CSS classes
161//! let options = ReadabilityOptions::new()
162//!     .keep_classes(true)
163//!     .classes_to_preserve(vec!["highlight".into(), "code".into(), "caption".into()]);
164//! let article = reader.parse_with_options(&html, None, Some(options))?;
165//! # Ok::<(), readability_js::ReadabilityError>(())
166//! ```
167//!
168//! ## Memory or performance issues
169//!
170//! For very large documents or resource-constrained environments:
171//!
172//! ```rust
173//! use readability_js::{Readability, ReadabilityOptions};
174//!
175//! let options = ReadabilityOptions::new()
176//!     .max_elems_to_parse(1000)   // Limit processing
177//!     .nb_top_candidates(3);      // Fewer candidates = faster
178//!
179//! let reader = Readability::new()?;
180//! let article = reader.parse_with_options(&html, None, Some(options))?;
181//! # Ok::<(), readability_js::ReadabilityError>(())
182//! ```
183
184mod readability;
185pub use readability::{Article, Direction, Readability, ReadabilityError, ReadabilityOptions};