//! readable_rs/models.rs
1use crate::parser::NodeRef;
2use crate::node_ext::NodeScoreStore;
3use std::collections::HashSet;
4
/// Metadata extracted from a page's `<meta>` tags, JSON-LD, and heuristic
/// byline / title detection. Used internally during extraction; the fields
/// are surfaced on [`Product`].
///
/// Every field is a plain `String`; a value that could not be detected is
/// left as the empty string (which is also what [`Default`] produces).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Metadata {
    /// The page title, cleaned and de-duplicated against site-name suffixes.
    pub title: String,
    /// The article author / byline string, if one could be detected.
    pub by_line: String,
    /// The site name (e.g. from `og:site_name`).
    pub sitename: String,
    /// A short excerpt / description (e.g. from `og:description`).
    pub excerpt: String,
    /// An ISO-8601 publish timestamp, if present in the page metadata.
    pub published_time: String,
}
21
/// The output of [`crate::extract`]. Contains the extracted article content
/// as a DOM subtree together with any metadata that was found.
///
/// String-typed metadata fields (`by_line`, `sitename`, `excerpt`,
/// `published_time`, `dir`) hold the empty string when no value was
/// detected rather than an `Option`.
#[derive(Debug, Clone, Default)]
pub struct Product {
    /// The cleaned page / article title.
    pub title: String,
    /// The extracted content subtree, or `None` if no article content could
    /// be identified. Serialise to HTML with `.as_ref().map(|n| n.to_string())`.
    pub content: Option<NodeRef>,
    /// The author / byline string, if detected; empty otherwise.
    pub by_line: String,
    /// The dominant text direction (`"ltr"` or `"rtl"`), inferred from
    /// ancestor `dir` attributes of the top candidate. Empty string if unknown.
    pub dir: String,
    /// The site name from page metadata (e.g. `og:site_name`).
    pub sitename: String,
    /// A short excerpt / description from page metadata.
    pub excerpt: String,
    /// An ISO-8601 publish timestamp from page metadata.
    pub published_time: String,
    /// The per-node score store produced during scoring. Useful for
    /// introspection / debugging; not needed for normal consumers.
    pub score_store: NodeScoreStore,
}
46
/// Knobs that control the behaviour of the extraction algorithm.
///
/// All fields have sensible defaults via [`Default`]; start there and only
/// override what you need.
///
/// # Examples
///
/// ```rust
/// use readable_rs::ExtractOptions;
///
/// let mut opts = ExtractOptions::default();
/// opts.char_threshold = 200; // accept shorter articles
/// opts.remove_style_tags = false; // keep <style> elements
/// ```
// PartialEq lets callers compare option sets (e.g. detect "still default")
// and makes the type usable in test assertions. Eq is not derivable because
// of the f64 field.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractOptions {
    /// Enable extra `eprintln!` tracing inside the algorithm (gated behind
    /// `debug_assertions` in release builds).
    pub debug: bool,
    /// Strip all `<style>` elements from the document before extraction.
    pub remove_style_tags: bool,
    /// When `true`, apply additional cleanup passes that produce output
    /// suitable for embedding in an EPUB (e.g. stricter image handling).
    pub ready_for_epub: bool,
    /// Remove elements whose class / id / role strongly suggest they are
    /// navigation, ads, or other non-content. Disabling this is one of the
    /// retry strategies when the first pass yields too little text.
    pub strip_unlikelys: bool,
    /// Use class-name / id heuristics (positive/negative word lists) to
    /// adjust candidate scores. Disabling is another retry strategy.
    pub weight_classes: bool,
    /// When `true`, preserve CSS class attributes on the output nodes
    /// (subject to [`classes_to_preserve`][Self::classes_to_preserve]).
    /// When `false`, all class attributes are stripped.
    pub keep_classes: bool,
    /// The set of class names that are *always* kept even when
    /// [`keep_classes`][Self::keep_classes] is `false`. Readability's own
    /// marker classes (e.g. `"page"`) are added automatically.
    pub classes_to_preserve: HashSet<String>,
    /// Apply the "clean conditionally" pass, which removes elements with
    /// low content density (few commas, high link density, etc.).
    /// Disabling is the third retry strategy.
    pub clean_conditionally: bool,
    /// Maximum number of DOM elements to parse before giving up.
    /// `0` means no limit.
    pub max_elements_to_parse: u16,
    /// How many top-scoring candidate nodes to retain before picking the
    /// winner. Higher values make the algorithm slightly more robust against
    /// mis-scored nodes.
    pub n_top_candidates: u16,
    /// Minimum character count the extracted content must reach before it is
    /// accepted. If the first pass falls short, the algorithm retries with
    /// progressively relaxed options.
    pub char_threshold: u16,
    /// An additive modifier applied to the link-density thresholds used in
    /// the "clean conditionally" pass. Positive values make the filter
    /// more lenient (tolerate higher link density).
    pub link_density_modifier: f64,
}
105
106impl Default for ExtractOptions {
107 fn default() -> ExtractOptions {
108 ExtractOptions {
109 debug: false,
110 remove_style_tags: true,
111 ready_for_epub: false,
112 strip_unlikelys: true,
113 weight_classes: true,
114 keep_classes: true,
115 classes_to_preserve: HashSet::new(),
116 clean_conditionally: true,
117 max_elements_to_parse: 0,
118 n_top_candidates: 5,
119 char_threshold: 500,
120 link_density_modifier: 0.0,
121 }
122 }
123}