readability_js/
readability.rs

1use rquickjs::{Context as QuickContext, Ctx, Function, Object, Runtime, Value};
2use thiserror::Error;
3
/// Text direction of an article, taken from the `dir` property of the
/// Readability.js result.
///
/// The enum is a plain tag, so it is `Copy` and usable as a hash-map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Direction {
    /// Left-to-right (`"ltr"`).
    Ltr,
    /// Right-to-left (`"rtl"`).
    Rtl,
}
11
12/// Represents a parsed article from Readability.js
13#[derive(Debug, Clone, PartialEq)]
14pub struct Article {
15    /// Title of the article (parsed or inferred from document)
16    pub title: String,
17
18    /// Author byline metadata
19    pub byline: Option<String>,
20
21    /// Content direction
22    pub direction: Option<Direction>,
23
24    /// HTML content of the processed article
25    pub content: String,
26
27    /// Plain-text content with all HTML tags removed
28    pub text_content: String,
29
30    /// Length of article content in characters
31    pub length: u32,
32
33    /// Article description or short excerpt
34    pub excerpt: Option<String>,
35
36    /// Name of the website
37    pub site_name: Option<String>,
38
39    /// Content language code (BCP 47), if detectable
40    pub language: Option<String>,
41
42    /// Published time in ISO 8601 or site format, if detectable
43    pub published_time: Option<String>,
44}
45
46impl<'js> TryFrom<Value<'js>> for Article {
47    type Error = ReadabilityError;
48
49    fn try_from(value: Value<'js>) -> Result<Self> {
50        let obj = value.as_object().ok_or_else(|| {
51            ReadabilityError::ExtractionError(
52                "Expected JavaScript object, got a different type".into(),
53            )
54        })?;
55
56        let title = obj
57            .get::<_, String>("title")
58            .map_err(|e| ReadabilityError::JsEvaluation {
59                context: "failed to get title".into(),
60                source: e,
61            })?;
62
63        let byline = obj
64            .get::<_, Value>("byline")
65            .map_err(|e| ReadabilityError::JsEvaluation {
66                context: "failed to get byline".into(),
67                source: e,
68            })?;
69        let byline = if byline.is_null() || byline.is_undefined() {
70            None
71        } else {
72            Some(
73                byline
74                    .get::<String>()
75                    .map_err(|e| ReadabilityError::JsEvaluation {
76                        context: "failed to get byline as string".into(),
77                        source: e,
78                    })?,
79            )
80        };
81
82        let dir = obj
83            .get::<_, Value>("dir")
84            .map_err(|e| ReadabilityError::JsEvaluation {
85                context: "failed to get dir".into(),
86                source: e,
87            })?;
88        let direction = if dir.is_null() || dir.is_undefined() {
89            None
90        } else {
91            let dir_str = dir
92                .get::<String>()
93                .map_err(|e| ReadabilityError::JsEvaluation {
94                    context: "failed to get dir as string".into(),
95                    source: e,
96                })?;
97            match dir_str.as_str() {
98                "ltr" => Some(Direction::Ltr),
99                "rtl" => Some(Direction::Rtl),
100                _ => None,
101            }
102        };
103
104        let content =
105            obj.get::<_, String>("content")
106                .map_err(|e| ReadabilityError::JsEvaluation {
107                    context: "failed to get content".into(),
108                    source: e,
109                })?;
110        let text_content =
111            obj.get::<_, String>("textContent")
112                .map_err(|e| ReadabilityError::JsEvaluation {
113                    context: "failed to get text_content".into(),
114                    source: e,
115                })?;
116        let length = obj
117            .get::<_, u32>("length")
118            .map_err(|e| ReadabilityError::JsEvaluation {
119                context: "failed to get length".into(),
120                source: e,
121            })?;
122
123        let excerpt =
124            obj.get::<_, Value>("excerpt")
125                .map_err(|e| ReadabilityError::JsEvaluation {
126                    context: "failed to get excerpt".into(),
127                    source: e,
128                })?;
129        let excerpt = if excerpt.is_null() || excerpt.is_undefined() {
130            None
131        } else {
132            Some(
133                excerpt
134                    .get::<String>()
135                    .map_err(|e| ReadabilityError::JsEvaluation {
136                        context: "failed to get excerpt as string".into(),
137                        source: e,
138                    })?,
139            )
140        };
141
142        let site_name =
143            obj.get::<_, Value>("siteName")
144                .map_err(|e| ReadabilityError::JsEvaluation {
145                    context: "failed to get site_name".into(),
146                    source: e,
147                })?;
148        let site_name = if site_name.is_null() || site_name.is_undefined() {
149            None
150        } else {
151            Some(
152                site_name
153                    .get::<String>()
154                    .map_err(|e| ReadabilityError::JsEvaluation {
155                        context: "failed to get site_name as string".into(),
156                        source: e,
157                    })?,
158            )
159        };
160
161        let language = obj
162            .get::<_, Value>("lang")
163            .map_err(|e| ReadabilityError::JsEvaluation {
164                context: "failed to get lang".into(),
165                source: e,
166            })?;
167        let language = if language.is_null() || language.is_undefined() {
168            None
169        } else {
170            Some(
171                language
172                    .get::<String>()
173                    .map_err(|e| ReadabilityError::JsEvaluation {
174                        context: "failed to get lang as string".into(),
175                        source: e,
176                    })?,
177            )
178        };
179
180        let published_time =
181            obj.get::<_, Value>("publishedTime")
182                .map_err(|e| ReadabilityError::JsEvaluation {
183                    context: "failed to get published_time".into(),
184                    source: e,
185                })?;
186        let published_time =
187            if published_time.is_null() || published_time.is_undefined() {
188                None
189            } else {
190                Some(published_time.get::<String>().map_err(|e| {
191                    ReadabilityError::JsEvaluation {
192                        context: "failed to get published_time as string".into(),
193                        source: e,
194                    }
195                })?)
196            };
197
198        Ok(Article {
199            title,
200            byline,
201            direction,
202            content,
203            text_content,
204            length,
205            excerpt,
206            site_name,
207            language,
208            published_time,
209        })
210    }
211}
212
/// Optional settings forwarded to Readability.js.
///
/// Every field defaults to `None`. Only fields that are `Some` are written to
/// the JavaScript options object, under the property name noted on each field.
/// See the Readability.js documentation for the meaning of each option.
#[derive(Default, Debug, Clone, PartialEq)]
pub struct ReadabilityOptions {
    /// Forwarded as `debug`.
    pub debug: Option<bool>,
    /// Forwarded as `maxElemsToParse`.
    pub max_elems_to_parse: Option<usize>,
    /// Forwarded as `nbTopCandidates`.
    pub nb_top_candidates: Option<usize>,
    /// Forwarded as `charThreshold`.
    pub char_threshold: Option<usize>,
    /// Forwarded as `classesToPreserve`.
    pub classes_to_preserve: Option<Vec<String>>,
    /// Forwarded as `keepClasses`.
    pub keep_classes: Option<bool>,
    /// Forwarded as `disableJSONLD`.
    pub disable_jsonld: Option<bool>,
    /// Forwarded as `linkDensityModifier`.
    pub link_density_modifier: Option<f32>,
    // TODO: serializer and allowed_video_regex
}
225
226impl ReadabilityOptions {
227    pub fn new() -> Self {
228        Self::default()
229    }
230    pub fn debug(mut self, val: bool) -> Self {
231        self.debug = Some(val);
232        self
233    }
234    pub fn max_elems_to_parse(mut self, val: usize) -> Self {
235        self.max_elems_to_parse = Some(val);
236        self
237    }
238    pub fn nb_top_candidates(mut self, val: usize) -> Self {
239        self.nb_top_candidates = Some(val);
240        self
241    }
242    pub fn char_threshold(mut self, val: usize) -> Self {
243        self.char_threshold = Some(val);
244        self
245    }
246    pub fn classes_to_preserve(mut self, val: Vec<String>) -> Self {
247        self.classes_to_preserve = Some(val);
248        self
249    }
250    pub fn keep_classes(mut self, val: bool) -> Self {
251        self.keep_classes = Some(val);
252        self
253    }
254    pub fn disable_jsonld(mut self, val: bool) -> Self {
255        self.disable_jsonld = Some(val);
256        self
257    }
258    pub fn link_density_modifier(mut self, val: f32) -> Self {
259        self.link_density_modifier = Some(val);
260        self
261    }
262
263    fn build<'js>(self, ctx: Ctx<'js>) -> Result<Object<'js>> {
264        let obj = Object::new(ctx).map_err(|e| ReadabilityError::JsEvaluation {
265            context: "failed to create options object".into(),
266            source: e,
267        })?;
268
269        if let Some(val) = self.debug {
270            obj.set("debug", val)
271                .map_err(|e| ReadabilityError::JsEvaluation {
272                    context: "failed to set debug option".into(),
273                    source: e,
274                })?;
275        }
276        if let Some(val) = self.max_elems_to_parse {
277            obj.set("maxElemsToParse", val)
278                .map_err(|e| ReadabilityError::JsEvaluation {
279                    context: "failed to set maxElemsToParse option".into(),
280                    source: e,
281                })?;
282        }
283        if let Some(val) = self.nb_top_candidates {
284            obj.set("nbTopCandidates", val)
285                .map_err(|e| ReadabilityError::JsEvaluation {
286                    context: "failed to set nbTopCandidates option".into(),
287                    source: e,
288                })?;
289        }
290        if let Some(val) = self.char_threshold {
291            obj.set("charThreshold", val)
292                .map_err(|e| ReadabilityError::JsEvaluation {
293                    context: "failed to set charThreshold option".to_string(),
294                    source: e,
295                })?;
296        }
297        if let Some(ref val) = self.classes_to_preserve {
298            obj.set("classesToPreserve", val.clone()).map_err(|e| {
299                ReadabilityError::JsEvaluation {
300                    context: "failed to set classesToPreserve option".to_string(),
301                    source: e,
302                }
303            })?;
304        }
305        if let Some(val) = self.keep_classes {
306            obj.set("keepClasses", val)
307                .map_err(|e| ReadabilityError::JsEvaluation {
308                    context: "failed to set keepClasses option".to_string(),
309                    source: e,
310                })?;
311        }
312        if let Some(val) = self.disable_jsonld {
313            obj.set("disableJSONLD", val)
314                .map_err(|e| ReadabilityError::JsEvaluation {
315                    context: "failed to set disableJSONLD option".to_string(),
316                    source: e,
317                })?;
318        }
319        if let Some(val) = self.link_density_modifier {
320            obj.set("linkDensityModifier", val)
321                .map_err(|e| ReadabilityError::JsEvaluation {
322                    context: "failed to set linkDensityModifier option".to_string(),
323                    source: e,
324                })?;
325        }
326        Ok(obj)
327    }
328}
329
330// #[derive(Default, Debug, Clone)]
331// struct ReadabilityCheckOptions {
332//     pub min_content_length: Option<usize>, // default 140
333//     pub min_score: Option<usize>,          // default 20
334//                                            // TODO visibility checker
335// }
336
337// impl ReadabilityCheckOptions {
338//     pub fn new() -> Self {
339//         Self::default()
340//     }
341//     pub fn min_content_length(mut self, val: usize) -> Self {
342//         self.min_content_length = Some(val);
343//         self
344//     }
345//     pub fn min_score(mut self, val: usize) -> Self {
346//         self.min_score = Some(val);
347//         self
348//     }
349
350//     fn build<'js>(self, ctx: Ctx<'js>) -> Result<Object<'js>> {
351//         let obj = Object::new(ctx).map_err(|e| ReadabilityError::JsEvaluation {
352//             context: "failed to create check options object".to_string(),
353//             source: e,
354//         })?;
355
356//         if let Some(val) = self.min_content_length {
357//             obj.set("minContentLength", val)
358//                 .map_err(|e| ReadabilityError::JsEvaluation {
359//                     context: "failed to set minContentLength option".to_string(),
360//                     source: e,
361//                 })?
362//         }
363//         if let Some(val) = self.min_score {
364//             obj.set("minScore", val)
365//                 .map_err(|e| ReadabilityError::JsEvaluation {
366//                     context: "failed to set minScore option".to_string(),
367//                     source: e,
368//                 })?;
369//         }
370//         Ok(obj)
371//     }
372// }
373
/// Errors produced while setting up the JavaScript engine or extracting an
/// article.
#[derive(Error, Debug)]
pub enum ReadabilityError {
    /// The JavaScript side reported an `HtmlParseError`; the payload is the
    /// message it supplied.
    #[error("Failed to parse HTML: {0}")]
    HtmlParseError(String),

    /// The content did not pass a readability check.
    // NOTE(review): nothing in this file constructs this variant — presumably
    // reserved for the (currently commented-out) check API; confirm.
    #[error("Content failed readability check")]
    ReadabilityCheckFailed,

    /// Extraction failed: the JavaScript side reported an extraction error (or
    /// an unrecognised error type), or the result was not a JavaScript object.
    #[error("Failed to extract readable content: {0}")]
    ExtractionError(String),

    /// A call into the JavaScript engine failed.
    #[error("Failed to evaluate JavaScript: {context}")]
    JsEvaluation {
        /// Short description of the operation that failed.
        context: String,
        /// Underlying engine error; `#[source]` exposes it through
        /// `std::error::Error::source` so callers can walk the error chain.
        #[source]
        source: rquickjs::Error,
    },

    /// A caller-supplied option was rejected (for example a base URL that is
    /// not HTTP(S)).
    #[error("Invalid options: {0}")]
    InvalidOptions(String),
}
395
/// Extension trait for attaching a human-readable description to a failed
/// JavaScript operation.
///
/// Implemented in this file for `Result<T, rquickjs::Error>`.
trait JsResultExt<T> {
    /// Converts `self` into this module's [`Result`], using `context` as the
    /// description of the operation that failed.
    fn js_context(self, context: &str) -> Result<T>;
}
399
400impl<T> JsResultExt<T> for std::result::Result<T, rquickjs::Error> {
401    fn js_context(self, context: &str) -> Result<T> {
402        self.map_err(|source| ReadabilityError::JsEvaluation {
403            context: context.into(),
404            source,
405        })
406    }
407}
408
/// Module-local result alias whose error type is always [`ReadabilityError`].
type Result<T> = std::result::Result<T, ReadabilityError>;
410
/// Article extractor backed by an embedded QuickJS context.
///
/// The context is prepared by `Readability::new`, which evaluates the vendored
/// Readability.js and the bundled support script in it.
pub struct Readability {
    /// JavaScript context in which the scripts were loaded and in which
    /// `extract` runs.
    context: QuickContext,
}
414impl Readability {
415    pub fn new() -> Result<Self> {
416        let runtime = Runtime::new().js_context("Failed to create runtime")?;
417        let context = QuickContext::full(&runtime).js_context("Failed to create context")?;
418
419        // context.with(|ctx| {
420        //     // Load JSDOMParser
421        //     let jsdom_parser_code = include_str!("../vendor/linkedom/worker.js");
422        //     ctx.eval::<(), _>(jsdom_parser_code)
423        //         .js_context("Failed to load linkedom")?;
424
425        //     // Load Readability
426        //     let readability_code = include_str!("../vendor/readability/Readability.js");
427        //     ctx.eval::<(), _>(readability_code)
428        //         .js_context("Failed to load Readability")?;
429
430        //     // Load our functions
431        //     let script = include_str!("./script.js");
432        //     ctx.eval::<(), _>(script)
433        //         .js_context("Failed to load script")?;
434
435        //     Ok(())
436        // })?;
437
438        context.with(|ctx| {
439            let readability_code = include_str!("../vendor/readability/Readability.js");
440            ctx.eval::<(), _>(readability_code)
441                .js_context("Failed to load Readability")?;
442
443            let bundle = include_str!("../js/bundled.js");
444            ctx.eval::<(), _>(bundle)
445                .js_context("Failed to load bundle")?;
446
447            Ok(())
448        })?;
449
450        Ok(Self { context })
451    }
452
453    fn validate_base_url(url: &str) -> Result<String> {
454        if url.starts_with("javascript:") || url.starts_with("data:") {
455            return Err(ReadabilityError::InvalidOptions(
456                "Invalid base URL scheme".into(),
457            ));
458        }
459
460        // Optional: Parse with url crate for stricter validation
461        match url::Url::parse(url) {
462            Ok(parsed) if matches!(parsed.scheme(), "http" | "https") => Ok(url.to_string()),
463            _ => Err(ReadabilityError::InvalidOptions(
464                "Base URL must be HTTP(S)".into(),
465            )),
466        }
467    }
468
469    /// Extract readable content unconditionally
470    pub fn extract(
471        &self,
472        html: &str,
473        base_url: Option<&str>,
474        options: Option<ReadabilityOptions>,
475    ) -> Result<Article> {
476        let clean_base_url = match base_url {
477            None => None,
478            Some(url) => Some(Self::validate_base_url(url)?),
479        };
480        self.context.with(|ctx| {
481            let extract_fn: Function = ctx
482                .globals()
483                .get("extract")
484                .js_context("extract function not found")?;
485            let options_obj = match options {
486                None => None,
487                Some(options) => Some(options.build(ctx.clone())?),
488            };
489
490            let result: Value = extract_fn
491                .call((html, clean_base_url, options_obj))
492                .js_context("Failed to call extract")?;
493
494            // Check if result is an error object
495            if let Some(obj) = result.as_object()
496                && let Ok(error_type) = obj.get::<_, String>("errorType")
497            {
498                let error_msg = obj
499                    .get::<_, String>("error")
500                    .unwrap_or_else(|_| "Unknown error".to_string());
501
502                return Err(match error_type.as_str() {
503                    "HtmlParseError" => ReadabilityError::HtmlParseError(error_msg),
504                    "ExtractionError" => ReadabilityError::ExtractionError(error_msg),
505                    "RuntimeError" => ReadabilityError::JsEvaluation {
506                        context: format!("JavaScript runtime error: {}", error_msg),
507                        source: rquickjs::Error::Unknown,
508                    },
509                    _ => ReadabilityError::ExtractionError(format!(
510                        "Unknown error type '{}': {}",
511                        error_type, error_msg
512                    )),
513                });
514            }
515
516            // If not an error object, try to parse as Article
517            Article::try_from(result)
518        })
519    }
520}
521
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end check: a small, well-formed document yields an article whose
    /// title, HTML content and plain-text content reflect the input.
    #[test]
    fn test_basic_extraction() {
        let html = r#"
            <html>
            <head><title>Test Article Title</title></head>
            <body>
                <h1>This is a test article</h1>
                <p>This is the first paragraph with some content that should be long enough to be considered readable content by the readability algorithm.</p>
                <p>This is another paragraph with more content. It has enough text to make the article substantial and worth reading.</p>
                <p>And here's a third paragraph to make sure we have enough content for the readability parser to work with.</p>
            </body>
            </html>
        "#;

        let extractor = Readability::new().unwrap();
        let parsed = extractor
            .extract(html, Some("https://example.com"), None)
            .unwrap();

        assert_eq!(parsed.title, "Test Article Title");

        // Every paragraph must survive extraction, still wrapped in markup.
        let expected_fragments = [
            "first paragraph",
            "another paragraph",
            "third paragraph",
            "<p>",
        ];
        for fragment in expected_fragments {
            assert!(parsed.content.contains(fragment));
        }

        // The plain-text rendering keeps the heading text but no tags.
        assert!(parsed.text_content.contains("This is a test article"));
        assert!(!parsed.text_content.contains("<"));
        assert!(parsed.length > 0);
    }
}