readable_rs/lib.rs
1//! A Rust port of Mozilla's [Readability](https://github.com/nicolo-ribaudo/readability) algorithm
2//! for extracting the main article content from an HTML page.
3//!
4//! ## Quick start
5//!
6//! ```rust
7//! use readable_rs::{extract, ExtractOptions};
8//!
9//! let html = "<html><body><article><p>The actual article text goes here.</p></article></body></html>";
10//! let product = extract(html, "https://example.com/article", ExtractOptions::default());
11//!
12//! // product.content holds the extracted DOM (or None if nothing was found)
13//! // product.title, product.by_line, product.sitename, etc. hold metadata
14//! ```
15//!
16//! ## Module layout
17//!
18//! * **Top level** – [`extract`] is the single entry-point. [`Product`] and
19//! [`ExtractOptions`] are the main public types.
20//! * [`parser`] – thin wrappers around the underlying HTML parser ([`parser::NodeRef`],
21//! [`parser::parse_html`]).
22//! * [`shared_utils`] – a curated set of DOM helpers useful when post-processing
23//! the extracted content (URL resolution, text normalisation, etc.).
24//! * [`NodeExt`] / [`NodeScoreStore`] – the trait and store that the scorer uses
25//! to attach readability metadata to DOM nodes without modifying the nodes themselves.
26
/// Runs the given block only when the crate is built with debug assertions.
///
/// The guard is the `cfg!(debug_assertions)` expression rather than a `#[cfg]`
/// attribute, so the block is still compiled and type-checked in every build
/// profile; only whether it executes depends on the profile.
macro_rules! d {
    ($body:block) => {
        if cfg!(debug_assertions) $body
    };
}
34
35#[macro_use]
36mod logging;
37mod extractor;
38
39mod models;
40mod node_ext;
41mod node_utils;
42mod utils;
43
44pub use models::{Product, ExtractOptions};
45pub use node_ext::NodeScoreStore;
46pub use node_utils::{new_html_element, NodeExt};
47
48/// Convenience re-exports of DOM helpers for post-processing extracted content.
49///
50/// These are a stable, curated subset of the internal utility library.
51pub mod shared_utils {
52 pub use crate::utils::{
53 apply, contains_single_tag_in_element, move_children, normalize_text,
54 replace_relative_urls_with_absolute, word_count,
55 };
56}
57
58/// Thin wrappers around the underlying HTML parser.
59///
60/// [`NodeRef`] is the reference-counted DOM node type used throughout the crate.
61/// [`parse_html`] parses a complete HTML document into a [`NodeRef`] tree.
62pub mod parser {
63 use kuchikikiki::traits::TendrilSink;
64 pub use kuchikikiki::{Attributes, NodeRef};
65 pub use crate::node_utils::{new_html_element, NodeExt};
66
67 /// Parse an HTML string into a [`NodeRef`] document tree.
68 ///
69 /// The parser follows the HTML5 specification; an implicit `<html>`, `<head>`,
70 /// and `<body>` are synthesised when missing.
71 ///
72 /// # Examples
73 ///
74 /// ```rust
75 /// use readable_rs::parser::parse_html;
76 ///
77 /// let doc = parse_html("<div><p>hello</p></div>");
78 /// assert!(doc.select_first("p").is_ok());
79 /// ```
80 pub fn parse_html(html: &str) -> NodeRef {
81 kuchikikiki::parse_html().one(html)
82 }
83}
84
85/// Extract the main article content from an HTML page.
86///
87/// This is the primary entry-point of the crate. It implements the Readability
88/// algorithm: scoring candidate nodes by content density, pruning navigation /
89/// boilerplate, and returning the best content subtree along with any metadata
90/// (title, byline, etc.) that could be extracted.
91///
92/// # Arguments
93///
94/// * `html_str` – the raw HTML source of the page.
95/// * `doc_uri` – the URL the page was fetched from. Used to resolve relative
96/// URLs in `<a href>`, `<img src>`, `srcset`, etc.
97/// * `options` – tuning knobs for the extraction algorithm. [`ExtractOptions::default()`]
98/// is a sensible starting point.
99///
100/// # Returns
101///
102/// A [`Product`] whose `content` field is `Some` if article content was found,
103/// or `None` if the page did not contain extractable content.
104///
105/// # Examples
106///
107/// ```rust
108/// use readable_rs::{extract, ExtractOptions};
109///
110/// let html = "<html><body><p>Short.</p></body></html>";
111/// let product = extract(html, "https://example.com", ExtractOptions::default());
112/// // product.content may be None — the paragraph is below the default char_threshold.
113/// ```
114pub fn extract(html_str: &str, doc_uri: &str, options: ExtractOptions) -> Product {
115 let processor = extractor::Extractor::new(html_str, doc_uri.to_string(), options);
116 processor.extract()
117}