1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
//! Quick readability check without full parsing.
//!
//! This module provides the [`is_probably_readerable`] function, which performs
//! a fast pre-flight check to determine if a document is likely to have extractable
//! article content without doing a full parse.
//!
//! ## Use Case
//!
//! Use this function to quickly filter out pages that are unlikely to contain article
//! content, saving the cost of a full parse:
//!
//! ```rust
//! use readabilityrs::{is_probably_readerable, Readability};
//!
//! let html = "<html>...</html>";
//!
//! // Quick check first
//! if is_probably_readerable(html, None) {
//!     // Do full parse
//!     let readability = Readability::new(html, None, None).unwrap();
//!     if let Some(article) = readability.parse() {
//!         println!("Article extracted!");
//!     }
//! } else {
//!     println!("Not an article page, skipping parse");
//! }
//! ```
//!
//! ## Performance
//!
//! This check is significantly faster than a full parse because it only looks
//! for basic content signals and computes a single lightweight score, rather
//! than performing the deep analysis and scoring of a full parse.
use ;
/// Options for the readability pre-flight check.
///
/// Controls the thresholds used by [`is_probably_readerable`] to determine
/// if a document is likely to be parseable.
///
/// ## Example
///
/// ```rust
/// use readabilityrs::{is_probably_readerable, ReaderableOptions};
///
/// let html = "<html>...</html>";
///
/// let options = ReaderableOptions {
///     min_content_length: 200,
///     min_score: 30.0,
/// };
///
/// let is_readerable = is_probably_readerable(html, Some(options));
/// ```
/// Quick check to determine if a document is likely to be readerable.
///
/// This function performs a fast analysis to predict whether full article extraction
/// is likely to succeed, without doing the expensive full parse. It looks for basic
/// content signals like paragraphs with sufficient text.
///
/// ## Arguments
///
/// * `html` - The HTML document to check
/// * `options` - Optional custom thresholds (uses defaults if `None`)
///
/// ## Returns
///
/// `true` if the document likely contains extractable article content, `false` otherwise.
///
/// ## Example
///
/// ```rust
/// use readabilityrs::is_probably_readerable;
///
/// let article_html = r#"
/// <html><body>
/// <article>
/// <p>This is a substantial paragraph with enough content to indicate that this page
/// likely contains article text that can be extracted by the readability algorithm.
/// The paragraph needs to be long enough to pass the minimum content length threshold.</p>
/// </article>
/// </body></html>
/// "#;
///
/// assert!(is_probably_readerable(article_html, None));
///
/// let non_article_html = "<html><body><p>Short</p></body></html>";
/// assert!(!is_probably_readerable(non_article_html, None));
/// ```
///
/// ## With Custom Options
///
/// ```rust
/// use readabilityrs::{is_probably_readerable, ReaderableOptions};
///
/// let html = "<html>...</html>";
/// let options = ReaderableOptions {
///     min_content_length: 200,
///     min_score: 30.0,
/// };
///
/// if is_probably_readerable(html, Some(options)) {
///     println!("Likely readerable with stricter thresholds");
/// }
/// ```
///
/// ## Algorithm
///
/// The function finds all `<p>`, `<pre>`, and `<article>` elements in the document,
/// then filters out paragraphs shorter than the configured `min_content_length`. A score
/// is calculated based on the remaining content length, and the function returns `true`
/// if this score exceeds the `min_score` threshold.
///
/// ## Performance
///
/// This function is much faster than a full parse, making it ideal for batch processing
/// large numbers of URLs, pre-filtering in crawlers or scrapers, and quick content
/// classification tasks.