readable_rs/
lib.rs

//! A Rust port of Mozilla's [Readability](https://github.com/mozilla/readability) algorithm
2//! for extracting the main article content from an HTML page.
3//!
4//! ## Quick start
5//!
6//! ```rust
7//! use readable_rs::{extract, ExtractOptions};
8//!
9//! let html = "<html><body><article><p>The actual article text goes here.</p></article></body></html>";
10//! let product = extract(html, "https://example.com/article", ExtractOptions::default());
11//!
12//! // product.content holds the extracted DOM (or None if nothing was found)
13//! // product.title, product.by_line, product.sitename, etc. hold metadata
14//! ```
15//!
16//! ## Module layout
17//!
18//! * **Top level** – [`extract`] is the single entry-point.  [`Product`] and
19//!   [`ExtractOptions`] are the main public types.
20//! * [`parser`] – thin wrappers around the underlying HTML parser ([`parser::NodeRef`],
21//!   [`parser::parse_html`]).
22//! * [`shared_utils`] – a curated set of DOM helpers useful when post-processing
23//!   the extracted content (URL resolution, text normalisation, etc.).
24//! * [`NodeExt`] / [`NodeScoreStore`] – the trait and store that the scorer uses
25//!   to attach readability metadata to DOM nodes without modifying the nodes themselves.
26
/// Runs a block of code only when the crate is built with `debug_assertions`.
///
/// The decision is made with `cfg!`, which expands to a plain boolean literal,
/// so the block is compiled and type-checked in every build profile and is
/// merely skipped at run time when debug assertions are disabled.
macro_rules! d {
    ($body:block) => {
        match cfg!(debug_assertions) {
            true => $body,
            false => (),
        }
    };
}
34
35#[macro_use]
36mod logging;
37mod extractor;
38
39mod models;
40mod node_ext;
41mod node_utils;
42mod utils;
43
44pub use models::{Product, ExtractOptions};
45pub use node_ext::NodeScoreStore;
46pub use node_utils::{new_html_element, NodeExt};
47
48/// Convenience re-exports of DOM helpers for post-processing extracted content.
49///
50/// These are a stable, curated subset of the internal utility library.
51pub mod shared_utils {
52    pub use crate::utils::{
53        apply, contains_single_tag_in_element, move_children, normalize_text,
54        replace_relative_urls_with_absolute, word_count,
55    };
56}
57
58/// Thin wrappers around the underlying HTML parser.
59///
60/// [`NodeRef`] is the reference-counted DOM node type used throughout the crate.
61/// [`parse_html`] parses a complete HTML document into a [`NodeRef`] tree.
62pub mod parser {
63    use kuchikikiki::traits::TendrilSink;
64    pub use kuchikikiki::{Attributes, NodeRef};
65    pub use crate::node_utils::{new_html_element, NodeExt};
66
67    /// Parse an HTML string into a [`NodeRef`] document tree.
68    ///
69    /// The parser follows the HTML5 specification; an implicit `<html>`, `<head>`,
70    /// and `<body>` are synthesised when missing.
71    ///
72    /// # Examples
73    ///
74    /// ```rust
75    /// use readable_rs::parser::parse_html;
76    ///
77    /// let doc = parse_html("<div><p>hello</p></div>");
78    /// assert!(doc.select_first("p").is_ok());
79    /// ```
80    pub fn parse_html(html: &str) -> NodeRef {
81        kuchikikiki::parse_html().one(html)
82    }
83}
84
85/// Extract the main article content from an HTML page.
86///
87/// This is the primary entry-point of the crate.  It implements the Readability
88/// algorithm: scoring candidate nodes by content density, pruning navigation /
89/// boilerplate, and returning the best content subtree along with any metadata
90/// (title, byline, etc.) that could be extracted.
91///
92/// # Arguments
93///
94/// * `html_str` – the raw HTML source of the page.
95/// * `doc_uri` – the URL the page was fetched from.  Used to resolve relative
96///   URLs in `<a href>`, `<img src>`, `srcset`, etc.
97/// * `options` – tuning knobs for the extraction algorithm.  [`ExtractOptions::default()`]
98///   is a sensible starting point.
99///
100/// # Returns
101///
102/// A [`Product`] whose `content` field is `Some` if article content was found,
103/// or `None` if the page did not contain extractable content.
104///
105/// # Examples
106///
107/// ```rust
108/// use readable_rs::{extract, ExtractOptions};
109///
110/// let html = "<html><body><p>Short.</p></body></html>";
111/// let product = extract(html, "https://example.com", ExtractOptions::default());
112/// // product.content may be None — the paragraph is below the default char_threshold.
113/// ```
114pub fn extract(html_str: &str, doc_uri: &str, options: ExtractOptions) -> Product {
115    let processor = extractor::Extractor::new(html_str, doc_uri.to_string(), options);
116    processor.extract()
117}
readable_rs/lib.rs

readable_rs/
lib.rs