Skip to main content

readable_rs/
models.rs

use crate::parser::NodeRef;
use crate::node_ext::NodeScoreStore;
use std::collections::HashSet;
4
/// Metadata extracted from a page's `<meta>` tags, JSON-LD, and heuristic
/// byline / title detection.  Used internally during extraction; the fields
/// are surfaced on [`Product`].
///
/// Every field is a plain `String`; an empty string means the corresponding
/// piece of metadata was not found on the page.  `Default` therefore yields
/// the "nothing detected" state, matching the `Default` derive on [`Product`].
#[derive(Debug, Clone, Default)]
pub struct Metadata {
    /// The page title, cleaned and de-duplicated against site-name suffixes.
    pub title: String,
    /// The article author / byline string, if one could be detected.
    pub by_line: String,
    /// The site name (e.g. from `og:site_name`).
    pub sitename: String,
    /// A short excerpt / description (e.g. from `og:description`).
    pub excerpt: String,
    /// An ISO-8601 publish timestamp, if present in the page metadata.
    pub published_time: String,
}
21
22/// The output of [`crate::extract`].  Contains the extracted article content
23/// as a DOM subtree together with any metadata that was found.
24#[derive(Debug, Clone, Default)]
25pub struct Product {
26    /// The cleaned page / article title.
27    pub title: String,
28    /// The extracted content subtree, or `None` if no article content could
29    /// be identified.  Serialise to HTML with `.as_ref().map(|n| n.to_string())`.
30    pub content: Option<NodeRef>,
31    /// The author / byline string, if detected.
32    pub by_line: String,
33    /// The dominant text direction (`"ltr"` or `"rtl"`), inferred from
34    /// ancestor `dir` attributes of the top candidate.  Empty string if unknown.
35    pub dir: String,
36    /// The site name from page metadata.
37    pub sitename: String,
38    /// A short excerpt / description from page metadata.
39    pub excerpt: String,
40    /// An ISO-8601 publish timestamp from page metadata.
41    pub published_time: String,
42    /// The per-node score store produced during scoring.  Useful for
43    /// introspection / debugging; not needed for normal consumers.
44    pub score_store: NodeScoreStore,
45}
46
/// Tunable knobs controlling the behaviour of the extraction algorithm.
///
/// Every field has a sensible default via [`Default`]; start from
/// `ExtractOptions::default()` and override only what you need.
///
/// # Examples
///
/// ```rust
/// use readable_rs::ExtractOptions;
///
/// let mut opts = ExtractOptions::default();
/// opts.char_threshold = 200;      // accept shorter articles
/// opts.remove_style_tags = false; // keep <style> elements
/// ```
#[derive(Debug, Clone)]
pub struct ExtractOptions {
    /// Emit extra `eprintln!` tracing from inside the algorithm (gated behind
    /// `debug_assertions` in release builds).
    pub debug: bool,
    /// Strip every `<style>` element from the document before extraction.
    pub remove_style_tags: bool,
    /// Run additional cleanup passes so the output is suitable for embedding
    /// in an EPUB (e.g. stricter image handling).
    pub ready_for_epub: bool,
    /// Drop elements whose class / id / role strongly suggest navigation,
    /// ads, or other non-content.  Turning this off is one of the retry
    /// strategies used when the first pass yields too little text.
    pub strip_unlikelys: bool,
    /// Adjust candidate scores using class-name / id heuristics
    /// (positive/negative word lists).  Turning this off is another retry
    /// strategy.
    pub weight_classes: bool,
    /// Keep CSS class attributes on output nodes (subject to
    /// [`classes_to_preserve`][Self::classes_to_preserve]); when `false`,
    /// all class attributes are stripped.
    pub keep_classes: bool,
    /// Class names that are *always* retained even when
    /// [`keep_classes`][Self::keep_classes] is `false`.  Readability's own
    /// marker classes (e.g. `"page"`) are added automatically.
    pub classes_to_preserve: HashSet<String>,
    /// Run the "clean conditionally" pass, which removes elements with low
    /// content density (few commas, high link density, etc.).  Turning this
    /// off is the third retry strategy.
    pub clean_conditionally: bool,
    /// Upper bound on the number of scoring candidates to evaluate;
    /// `0` disables the limit.
    pub max_elements_to_parse: u16,
    /// Number of top-scoring candidate nodes kept before the winner is
    /// picked.  Larger values make the algorithm slightly more robust
    /// against mis-scored nodes.
    pub n_top_candidates: u16,
    /// Minimum character count the extracted content must reach to be
    /// accepted.  When the first pass falls short, the algorithm retries
    /// with progressively relaxed options.
    pub char_threshold: u16,
    /// Additive modifier for the link-density thresholds used by the
    /// "clean conditionally" pass; positive values make the filter more
    /// lenient (higher link density tolerated).
    pub link_density_modifier: f64,
}
105
106impl Default for ExtractOptions {
107    fn default() -> ExtractOptions {
108        ExtractOptions {
109            debug: false,
110            remove_style_tags: true,
111            ready_for_epub: false,
112            strip_unlikelys: true,
113            weight_classes: true,
114            keep_classes: true,
115            classes_to_preserve: HashSet::new(),
116            clean_conditionally: true,
117            max_elements_to_parse: 0,
118            n_top_candidates: 5,
119            char_threshold: 500,
120            link_density_modifier: 0.0,
121        }
122    }
123}