1use rquickjs::{Context as QuickContext, Ctx, Function, Object, Runtime, Value};
2use thiserror::Error;
3
/// Text direction of an extracted article.
///
/// Derived from the `dir` property of the JavaScript extraction result:
/// `"ltr"` becomes [`Direction::Ltr`] and `"rtl"` becomes [`Direction::Rtl`].
///
/// The enum carries no data, so it is `Copy` and `Hash` in addition to the
/// comparison traits.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Direction {
    /// Left-to-right text (`"ltr"`).
    Ltr,
    /// Right-to-left text (`"rtl"`).
    Rtl,
}
11
12#[derive(Debug, Clone, PartialEq)]
14pub struct Article {
15 pub title: String,
17
18 pub byline: Option<String>,
20
21 pub direction: Option<Direction>,
23
24 pub content: String,
26
27 pub text_content: String,
29
30 pub length: u32,
32
33 pub excerpt: Option<String>,
35
36 pub site_name: Option<String>,
38
39 pub language: Option<String>,
41
42 pub published_time: Option<String>,
44}
45
46impl<'js> TryFrom<Value<'js>> for Article {
47 type Error = ReadabilityError;
48
49 fn try_from(value: Value<'js>) -> Result<Self> {
50 let obj = value.as_object().ok_or_else(|| {
51 ReadabilityError::ExtractionError(
52 "Expected JavaScript object, got a different type".into(),
53 )
54 })?;
55
56 let title = obj
57 .get::<_, String>("title")
58 .map_err(|e| ReadabilityError::JsEvaluation {
59 context: "failed to get title".into(),
60 source: e,
61 })?;
62
63 let byline = obj
64 .get::<_, Value>("byline")
65 .map_err(|e| ReadabilityError::JsEvaluation {
66 context: "failed to get byline".into(),
67 source: e,
68 })?;
69 let byline = if byline.is_null() || byline.is_undefined() {
70 None
71 } else {
72 Some(
73 byline
74 .get::<String>()
75 .map_err(|e| ReadabilityError::JsEvaluation {
76 context: "failed to get byline as string".into(),
77 source: e,
78 })?,
79 )
80 };
81
82 let dir = obj
83 .get::<_, Value>("dir")
84 .map_err(|e| ReadabilityError::JsEvaluation {
85 context: "failed to get dir".into(),
86 source: e,
87 })?;
88 let direction = if dir.is_null() || dir.is_undefined() {
89 None
90 } else {
91 let dir_str = dir
92 .get::<String>()
93 .map_err(|e| ReadabilityError::JsEvaluation {
94 context: "failed to get dir as string".into(),
95 source: e,
96 })?;
97 match dir_str.as_str() {
98 "ltr" => Some(Direction::Ltr),
99 "rtl" => Some(Direction::Rtl),
100 _ => None,
101 }
102 };
103
104 let content =
105 obj.get::<_, String>("content")
106 .map_err(|e| ReadabilityError::JsEvaluation {
107 context: "failed to get content".into(),
108 source: e,
109 })?;
110 let text_content =
111 obj.get::<_, String>("textContent")
112 .map_err(|e| ReadabilityError::JsEvaluation {
113 context: "failed to get text_content".into(),
114 source: e,
115 })?;
116 let length = obj
117 .get::<_, u32>("length")
118 .map_err(|e| ReadabilityError::JsEvaluation {
119 context: "failed to get length".into(),
120 source: e,
121 })?;
122
123 let excerpt =
124 obj.get::<_, Value>("excerpt")
125 .map_err(|e| ReadabilityError::JsEvaluation {
126 context: "failed to get excerpt".into(),
127 source: e,
128 })?;
129 let excerpt = if excerpt.is_null() || excerpt.is_undefined() {
130 None
131 } else {
132 Some(
133 excerpt
134 .get::<String>()
135 .map_err(|e| ReadabilityError::JsEvaluation {
136 context: "failed to get excerpt as string".into(),
137 source: e,
138 })?,
139 )
140 };
141
142 let site_name =
143 obj.get::<_, Value>("siteName")
144 .map_err(|e| ReadabilityError::JsEvaluation {
145 context: "failed to get site_name".into(),
146 source: e,
147 })?;
148 let site_name = if site_name.is_null() || site_name.is_undefined() {
149 None
150 } else {
151 Some(
152 site_name
153 .get::<String>()
154 .map_err(|e| ReadabilityError::JsEvaluation {
155 context: "failed to get site_name as string".into(),
156 source: e,
157 })?,
158 )
159 };
160
161 let language = obj
162 .get::<_, Value>("lang")
163 .map_err(|e| ReadabilityError::JsEvaluation {
164 context: "failed to get lang".into(),
165 source: e,
166 })?;
167 let language = if language.is_null() || language.is_undefined() {
168 None
169 } else {
170 Some(
171 language
172 .get::<String>()
173 .map_err(|e| ReadabilityError::JsEvaluation {
174 context: "failed to get lang as string".into(),
175 source: e,
176 })?,
177 )
178 };
179
180 let published_time =
181 obj.get::<_, Value>("publishedTime")
182 .map_err(|e| ReadabilityError::JsEvaluation {
183 context: "failed to get published_time".into(),
184 source: e,
185 })?;
186 let published_time =
187 if published_time.is_null() || published_time.is_undefined() {
188 None
189 } else {
190 Some(published_time.get::<String>().map_err(|e| {
191 ReadabilityError::JsEvaluation {
192 context: "failed to get published_time as string".into(),
193 source: e,
194 }
195 })?)
196 };
197
198 Ok(Article {
199 title,
200 byline,
201 direction,
202 content,
203 text_content,
204 length,
205 excerpt,
206 site_name,
207 language,
208 published_time,
209 })
210 }
211}
212
/// Optional settings passed to the JavaScript extractor.
///
/// Every field defaults to `None`. When the options are converted for the
/// JavaScript side, each `Some` field is written to the options object under
/// the JavaScript property name noted on the field; `None` fields are not
/// written at all.
///
/// `PartialEq` is derived so option sets can be compared (`Eq` is not
/// possible because of the `f32` field).
#[derive(Default, Debug, Clone, PartialEq)]
pub struct ReadabilityOptions {
    /// Written as `debug`.
    pub debug: Option<bool>,
    /// Written as `maxElemsToParse`.
    pub max_elems_to_parse: Option<usize>,
    /// Written as `nbTopCandidates`.
    pub nb_top_candidates: Option<usize>,
    /// Written as `charThreshold`.
    pub char_threshold: Option<usize>,
    /// Written as `classesToPreserve`.
    pub classes_to_preserve: Option<Vec<String>>,
    /// Written as `keepClasses`.
    pub keep_classes: Option<bool>,
    /// Written as `disableJSONLD`.
    pub disable_jsonld: Option<bool>,
    /// Written as `linkDensityModifier`.
    pub link_density_modifier: Option<f32>,
}
225
226impl ReadabilityOptions {
227 pub fn new() -> Self {
228 Self::default()
229 }
230 pub fn debug(mut self, val: bool) -> Self {
231 self.debug = Some(val);
232 self
233 }
234 pub fn max_elems_to_parse(mut self, val: usize) -> Self {
235 self.max_elems_to_parse = Some(val);
236 self
237 }
238 pub fn nb_top_candidates(mut self, val: usize) -> Self {
239 self.nb_top_candidates = Some(val);
240 self
241 }
242 pub fn char_threshold(mut self, val: usize) -> Self {
243 self.char_threshold = Some(val);
244 self
245 }
246 pub fn classes_to_preserve(mut self, val: Vec<String>) -> Self {
247 self.classes_to_preserve = Some(val);
248 self
249 }
250 pub fn keep_classes(mut self, val: bool) -> Self {
251 self.keep_classes = Some(val);
252 self
253 }
254 pub fn disable_jsonld(mut self, val: bool) -> Self {
255 self.disable_jsonld = Some(val);
256 self
257 }
258 pub fn link_density_modifier(mut self, val: f32) -> Self {
259 self.link_density_modifier = Some(val);
260 self
261 }
262
263 fn build<'js>(self, ctx: Ctx<'js>) -> Result<Object<'js>> {
264 let obj = Object::new(ctx).map_err(|e| ReadabilityError::JsEvaluation {
265 context: "failed to create options object".into(),
266 source: e,
267 })?;
268
269 if let Some(val) = self.debug {
270 obj.set("debug", val)
271 .map_err(|e| ReadabilityError::JsEvaluation {
272 context: "failed to set debug option".into(),
273 source: e,
274 })?;
275 }
276 if let Some(val) = self.max_elems_to_parse {
277 obj.set("maxElemsToParse", val)
278 .map_err(|e| ReadabilityError::JsEvaluation {
279 context: "failed to set maxElemsToParse option".into(),
280 source: e,
281 })?;
282 }
283 if let Some(val) = self.nb_top_candidates {
284 obj.set("nbTopCandidates", val)
285 .map_err(|e| ReadabilityError::JsEvaluation {
286 context: "failed to set nbTopCandidates option".into(),
287 source: e,
288 })?;
289 }
290 if let Some(val) = self.char_threshold {
291 obj.set("charThreshold", val)
292 .map_err(|e| ReadabilityError::JsEvaluation {
293 context: "failed to set charThreshold option".to_string(),
294 source: e,
295 })?;
296 }
297 if let Some(ref val) = self.classes_to_preserve {
298 obj.set("classesToPreserve", val.clone()).map_err(|e| {
299 ReadabilityError::JsEvaluation {
300 context: "failed to set classesToPreserve option".to_string(),
301 source: e,
302 }
303 })?;
304 }
305 if let Some(val) = self.keep_classes {
306 obj.set("keepClasses", val)
307 .map_err(|e| ReadabilityError::JsEvaluation {
308 context: "failed to set keepClasses option".to_string(),
309 source: e,
310 })?;
311 }
312 if let Some(val) = self.disable_jsonld {
313 obj.set("disableJSONLD", val)
314 .map_err(|e| ReadabilityError::JsEvaluation {
315 context: "failed to set disableJSONLD option".to_string(),
316 source: e,
317 })?;
318 }
319 if let Some(val) = self.link_density_modifier {
320 obj.set("linkDensityModifier", val)
321 .map_err(|e| ReadabilityError::JsEvaluation {
322 context: "failed to set linkDensityModifier option".to_string(),
323 source: e,
324 })?;
325 }
326 Ok(obj)
327 }
328}
329
330#[derive(Error, Debug)]
375pub enum ReadabilityError {
376 #[error("Failed to parse HTML: {0}")]
377 HtmlParseError(String),
378
379 #[error("Content failed readability check")]
380 ReadabilityCheckFailed,
381
382 #[error("Failed to extract readable content: {0}")]
383 ExtractionError(String),
384
385 #[error("Failed to evaluate JavaScript: {context}")]
386 JsEvaluation {
387 context: String,
388 #[source] source: rquickjs::Error,
390 },
391
392 #[error("Invalid options: {0}")]
393 InvalidOptions(String),
394}
395
396trait JsResultExt<T> {
397 fn js_context(self, context: &str) -> Result<T>;
398}
399
400impl<T> JsResultExt<T> for std::result::Result<T, rquickjs::Error> {
401 fn js_context(self, context: &str) -> Result<T> {
402 self.map_err(|source| ReadabilityError::JsEvaluation {
403 context: context.into(),
404 source,
405 })
406 }
407}
408
409type Result<T> = std::result::Result<T, ReadabilityError>;
410
411pub struct Readability {
412 context: QuickContext,
413}
414impl Readability {
415 pub fn new() -> Result<Self> {
416 let runtime = Runtime::new().js_context("Failed to create runtime")?;
417 let context = QuickContext::full(&runtime).js_context("Failed to create context")?;
418
419 context.with(|ctx| {
439 let readability_code = include_str!("../vendor/readability/Readability.js");
440 ctx.eval::<(), _>(readability_code)
441 .js_context("Failed to load Readability")?;
442
443 let bundle = include_str!("../js/bundled.js");
444 ctx.eval::<(), _>(bundle)
445 .js_context("Failed to load bundle")?;
446
447 Ok(())
448 })?;
449
450 Ok(Self { context })
451 }
452
453 fn validate_base_url(url: &str) -> Result<String> {
454 if url.starts_with("javascript:") || url.starts_with("data:") {
455 return Err(ReadabilityError::InvalidOptions(
456 "Invalid base URL scheme".into(),
457 ));
458 }
459
460 match url::Url::parse(url) {
462 Ok(parsed) if matches!(parsed.scheme(), "http" | "https") => Ok(url.to_string()),
463 _ => Err(ReadabilityError::InvalidOptions(
464 "Base URL must be HTTP(S)".into(),
465 )),
466 }
467 }
468
469 pub fn extract(
471 &self,
472 html: &str,
473 base_url: Option<&str>,
474 options: Option<ReadabilityOptions>,
475 ) -> Result<Article> {
476 let clean_base_url = match base_url {
477 None => None,
478 Some(url) => Some(Self::validate_base_url(url)?),
479 };
480 self.context.with(|ctx| {
481 let extract_fn: Function = ctx
482 .globals()
483 .get("extract")
484 .js_context("extract function not found")?;
485 let options_obj = match options {
486 None => None,
487 Some(options) => Some(options.build(ctx.clone())?),
488 };
489
490 let result: Value = extract_fn
491 .call((html, clean_base_url, options_obj))
492 .js_context("Failed to call extract")?;
493
494 if let Some(obj) = result.as_object()
496 && let Ok(error_type) = obj.get::<_, String>("errorType")
497 {
498 let error_msg = obj
499 .get::<_, String>("error")
500 .unwrap_or_else(|_| "Unknown error".to_string());
501
502 return Err(match error_type.as_str() {
503 "HtmlParseError" => ReadabilityError::HtmlParseError(error_msg),
504 "ExtractionError" => ReadabilityError::ExtractionError(error_msg),
505 "RuntimeError" => ReadabilityError::JsEvaluation {
506 context: format!("JavaScript runtime error: {}", error_msg),
507 source: rquickjs::Error::Unknown,
508 },
509 _ => ReadabilityError::ExtractionError(format!(
510 "Unknown error type '{}': {}",
511 error_type, error_msg
512 )),
513 });
514 }
515
516 Article::try_from(result)
518 })
519 }
520}
521
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extractor end to end on a small three-paragraph document and
    /// checks the title, the HTML content and the plain-text content.
    #[test]
    fn test_basic_extraction() {
        let html = r#"
 <html>
 <head><title>Test Article Title</title></head>
 <body>
 <h1>This is a test article</h1>
 <p>This is the first paragraph with some content that should be long enough to be considered readable content by the readability algorithm.</p>
 <p>This is another paragraph with more content. It has enough text to make the article substantial and worth reading.</p>
 <p>And here's a third paragraph to make sure we have enough content for the readability parser to work with.</p>
 </body>
 </html>
 "#;

        let extractor = Readability::new().unwrap();
        let parsed = extractor
            .extract(html, Some("https://example.com"), None)
            .unwrap();

        assert_eq!(parsed.title, "Test Article Title");

        // The HTML content keeps every paragraph and its markup.
        for fragment in [
            "first paragraph",
            "another paragraph",
            "third paragraph",
            "<p>",
        ] {
            assert!(parsed.content.contains(fragment));
        }

        // The text content carries the heading text and no markup.
        assert!(parsed.text_content.contains("This is a test article"));
        assert!(!parsed.text_content.contains("<"));
        assert!(parsed.length > 0);
    }
}
554}