readability_js/
lib.rs

1//! Extract clean, readable content from web pages using Mozilla's Readability.js algorithm.
2//!
3//! This crate provides both a Rust library and a CLI tool for extracting the main content
4//! from HTML documents, removing navigation, ads, and other clutter. It uses the same
5//! algorithm as Firefox Reader Mode.
6//!
7//! # Algorithm
8//!
9//! This crate embeds Mozilla's Readability.js library using a JavaScript engine.
10//! It uses the same algorithm that processes articles in Firefox Reader Mode,
11//! providing high accuracy on modern web content including single-page applications
12//! and complex layouts.
13//!
14//! Unlike the pure-Rust ports that are available, this approach trades
15//! some performance for extraction accuracy and ongoing improvements
16//! from Mozilla's team.
17//!
18//! # Quick Start
19//!
20//! ```rust
21//! use readability_js::Readability;
22//!
23//! let html = r#"<html><body><h1>Article Title</h1><p>Main content...</p></body></html>"#;
24//! let reader = Readability::new()?;
25//! let article = reader.parse(&html)?;
26//!
27//! println!("Title: {}", article.title);
28//! println!("Content: {}", article.content);
29//! # Ok::<(), readability_js::ReadabilityError>(())
30//! ```
31//!
32//! # Parsing with URL Context
33//!
34//! Providing a URL improves link resolution and metadata extraction:
35//!
36//! ```rust
37//! use readability_js::Readability;
38//!
39//! let reader = Readability::new()?;
40//! let article = reader.parse_with_url(&html, "https://example.com/article")?;
41//! # Ok::<(), readability_js::ReadabilityError>(())
42//! ```
43//!
44//! # Custom Options
45//!
46//! Configure the parsing behavior with [`ReadabilityOptions`]:
47//!
48//! ```rust
49//! use readability_js::{Readability, ReadabilityOptions};
50//!
51//! let options = ReadabilityOptions::new()
52//!     .char_threshold(500)
53//!     .keep_classes(true);
54//!
55//! let reader = Readability::new()?;
56//! let article = reader.parse_with_options(&html, Some("https://example.com"), Some(options))?;
57//! # Ok::<(), readability_js::ReadabilityError>(())
58//! ```
59//!
60//! # Performance Considerations
61//!
62//! Creating a [`Readability`] instance is expensive (~30ms) as it initializes a JavaScript
63//! engine. Once created, parsing individual documents is fast (~10ms). Reuse the same instance
64//! when processing multiple documents:
65//!
66//! ```rust
67//! use readability_js::Readability;
68//!
69//! let reader = Readability::new()?;
70//! for html in documents {
71//!     let article = reader.parse(&html)?;
72//!     process_article(article);
73//! }
74//! # Ok::<(), readability_js::ReadabilityError>(())
75//! ```
76//!
77//! # Error Handling
78//!
79//! The most common error is [`ReadabilityError::ReadabilityCheckFailed`], which occurs
80//! when the algorithm cannot extract sufficient readable content:
81//!
82//! ```rust
83//! use readability_js::{Readability, ReadabilityError, ReadabilityOptions};
84//!
85//! let reader = Readability::new()?;
86//! match reader.parse(&html) {
87//!     Ok(article) => println!("Extracted: {}", article.title),
88//!     Err(ReadabilityError::ReadabilityCheckFailed) => {
89//!         // Try with lower threshold
90//!         let options = ReadabilityOptions::new().char_threshold(100);
91//!         let article = reader.parse_with_options(&html, None, Some(options))?;
92//!         println!("Extracted with relaxed settings: {}", article.title);
93//!     }
94//!     Err(e) => return Err(e),
95//! }
96//! # Ok::<(), readability_js::ReadabilityError>(())
97//! ```
98//!
99//! # CLI Usage
100//!
101//! The CLI tool extracts content and converts it to clean Markdown:
102//!
103//! ```bash
104//! # Install the CLI tool
105//! cargo install readability-js-cli
106//!
107//! # Process local files
108//! readable article.html > article.md
109//!
110//! # Fetch and process URLs
111//! readable https://example.com/news > news.md
112//!
113//! # Process from stdin (great for pipelines)
114//! curl -s https://site.com/article | readable > clean.md
115//!
116//! # View directly in terminal
117//! readable https://news.site/story | less
118//! ```
119//!
120//! The CLI automatically:
121//! - Detects whether input is a file path or URL
122//! - Fetches web content with proper headers
123//! - Converts the clean HTML to Markdown
124//! - Handles errors gracefully
125//!
126//! # Troubleshooting
127//!
128//! ## "Content failed readability check"
129//!
130//! This happens when the page doesn't contain enough readable content or
131//! the algorithm can't distinguish content from navigation. Try relaxing the extraction settings:
132//!
133//! ```rust
134//! use readability_js::{Readability, ReadabilityOptions};
135//!
136//! let options = ReadabilityOptions::new()
137//!     .char_threshold(100)         // Lower threshold (default: ~140)
138//!     .nb_top_candidates(10)       // Consider more candidates
139//!     .link_density_modifier(2.0); // More permissive with links
140//!
141//! let reader = Readability::new()?;
142//! let article = reader.parse_with_options(&html, None, Some(options))?;
143//! # Ok::<(), readability_js::ReadabilityError>(())
144//! ```
145//!
146//! ## Poor extraction quality
147//!
148//! If the extracted content is incomplete or includes unwanted elements, try the following:
149//!
150//! ```rust
151//! use readability_js::{Readability, ReadabilityOptions};
152//!
153//! // Better link resolution and metadata extraction
154//! let reader = Readability::new()?;
155//! let article = reader.parse_with_url(&html, "https://example.com/article")?;
156//!
157//! // Or preserve important CSS classes
158//! let options = ReadabilityOptions::new()
159//!     .keep_classes(true)
160//!     .classes_to_preserve(vec!["highlight".into(), "code".into(), "caption".into()]);
161//! let article = reader.parse_with_options(&html, None, Some(options))?;
162//! # Ok::<(), readability_js::ReadabilityError>(())
163//! ```
164//!
165//! ## Memory or performance issues
166//!
167//! For very large documents or resource-constrained environments, limit the parser's workload:
168//!
169//! ```rust
170//! use readability_js::{Readability, ReadabilityOptions};
171//!
172//! let options = ReadabilityOptions::new()
173//!     .max_elems_to_parse(1000)   // Limit processing
174//!     .nb_top_candidates(3);      // Fewer candidates = faster
175//!
176//! let reader = Readability::new()?;
177//! let article = reader.parse_with_options(&html, None, Some(options))?;
178//! # Ok::<(), readability_js::ReadabilityError>(())
179//! ```
180
181mod readability;
182pub use readability::{Article, Direction, Readability, ReadabilityError, ReadabilityOptions};