readability_js/
readability.rs

1use rquickjs::{Context as QuickContext, Ctx, Function, Object, Runtime, Value};
2use thiserror::Error;
3
/// Text direction of the extracted article.
///
/// Produced from the `dir` property of the JavaScript result: `"ltr"` maps to
/// [`Direction::Ltr`] and `"rtl"` maps to [`Direction::Rtl`].
///
/// The enum is a plain two-state flag, so it is `Copy` and `Hash` in addition
/// to the usual comparison traits.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum Direction {
    /// Left-to-Right
    Ltr,
    /// Right-to-Left
    Rtl,
}
12
13/// Parsed article content and metadata extracted by Readability.
14///
15/// All fields except `title`, `content`, `text_content`, and `length` are optional
16/// and depend on the input HTML having appropriate metadata.
17#[derive(Debug, Clone, PartialEq)]
18#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
19pub struct Article {
20    /// Extracted or inferred article title
21    pub title: String,
22
23    /// Clean HTML content (safe for display)
24    pub content: String,
25
26    /// Plain text with all HTML stripped
27    pub text_content: String,
28
29    /// Character count of the content
30    pub length: u32,
31
32    /// Author byline metadata
33    pub byline: Option<String>,
34
35    /// Content direction
36    pub direction: Option<Direction>,
37
38    /// Article description or short excerpt
39    pub excerpt: Option<String>,
40
41    /// Name of the website
42    pub site_name: Option<String>,
43
44    /// Content language code (BCP 47), if detectable
45    pub language: Option<String>,
46
47    /// Published time in ISO 8601 or site format, if detectable
48    pub published_time: Option<String>,
49}
50
51impl<'js> TryFrom<Value<'js>> for Article {
52    type Error = ReadabilityError;
53
54    fn try_from(value: Value<'js>) -> Result<Self> {
55        let obj = value.as_object().ok_or_else(|| {
56            ReadabilityError::ExtractionError(
57                "Expected JavaScript object, got a different type".into(),
58            )
59        })?;
60
61        let title = obj
62            .get::<_, String>("title")
63            .map_err(|e| ReadabilityError::JsEvaluation {
64                context: "failed to get title".into(),
65                source: e,
66            })?;
67
68        let byline = obj
69            .get::<_, Value>("byline")
70            .map_err(|e| ReadabilityError::JsEvaluation {
71                context: "failed to get byline".into(),
72                source: e,
73            })?;
74        let byline = if byline.is_null() || byline.is_undefined() {
75            None
76        } else {
77            Some(
78                byline
79                    .get::<String>()
80                    .map_err(|e| ReadabilityError::JsEvaluation {
81                        context: "failed to get byline as string".into(),
82                        source: e,
83                    })?,
84            )
85        };
86
87        let dir = obj
88            .get::<_, Value>("dir")
89            .map_err(|e| ReadabilityError::JsEvaluation {
90                context: "failed to get dir".into(),
91                source: e,
92            })?;
93        let direction = if dir.is_null() || dir.is_undefined() {
94            None
95        } else {
96            let dir_str = dir
97                .get::<String>()
98                .map_err(|e| ReadabilityError::JsEvaluation {
99                    context: "failed to get dir as string".into(),
100                    source: e,
101                })?;
102            match dir_str.as_str() {
103                "ltr" => Some(Direction::Ltr),
104                "rtl" => Some(Direction::Rtl),
105                _ => None,
106            }
107        };
108
109        let content =
110            obj.get::<_, String>("content")
111                .map_err(|e| ReadabilityError::JsEvaluation {
112                    context: "failed to get content".into(),
113                    source: e,
114                })?;
115        let text_content =
116            obj.get::<_, String>("textContent")
117                .map_err(|e| ReadabilityError::JsEvaluation {
118                    context: "failed to get text_content".into(),
119                    source: e,
120                })?;
121        let length = obj
122            .get::<_, u32>("length")
123            .map_err(|e| ReadabilityError::JsEvaluation {
124                context: "failed to get length".into(),
125                source: e,
126            })?;
127
128        let excerpt =
129            obj.get::<_, Value>("excerpt")
130                .map_err(|e| ReadabilityError::JsEvaluation {
131                    context: "failed to get excerpt".into(),
132                    source: e,
133                })?;
134        let excerpt = if excerpt.is_null() || excerpt.is_undefined() {
135            None
136        } else {
137            Some(
138                excerpt
139                    .get::<String>()
140                    .map_err(|e| ReadabilityError::JsEvaluation {
141                        context: "failed to get excerpt as string".into(),
142                        source: e,
143                    })?,
144            )
145        };
146
147        let site_name =
148            obj.get::<_, Value>("siteName")
149                .map_err(|e| ReadabilityError::JsEvaluation {
150                    context: "failed to get site_name".into(),
151                    source: e,
152                })?;
153        let site_name = if site_name.is_null() || site_name.is_undefined() {
154            None
155        } else {
156            Some(
157                site_name
158                    .get::<String>()
159                    .map_err(|e| ReadabilityError::JsEvaluation {
160                        context: "failed to get site_name as string".into(),
161                        source: e,
162                    })?,
163            )
164        };
165
166        let language = obj
167            .get::<_, Value>("lang")
168            .map_err(|e| ReadabilityError::JsEvaluation {
169                context: "failed to get lang".into(),
170                source: e,
171            })?;
172        let language = if language.is_null() || language.is_undefined() {
173            None
174        } else {
175            Some(
176                language
177                    .get::<String>()
178                    .map_err(|e| ReadabilityError::JsEvaluation {
179                        context: "failed to get lang as string".into(),
180                        source: e,
181                    })?,
182            )
183        };
184
185        let published_time =
186            obj.get::<_, Value>("publishedTime")
187                .map_err(|e| ReadabilityError::JsEvaluation {
188                    context: "failed to get published_time".into(),
189                    source: e,
190                })?;
191        let published_time =
192            if published_time.is_null() || published_time.is_undefined() {
193                None
194            } else {
195                Some(published_time.get::<String>().map_err(|e| {
196                    ReadabilityError::JsEvaluation {
197                        context: "failed to get published_time as string".into(),
198                        source: e,
199                    }
200                })?)
201            };
202
203        Ok(Article {
204            title,
205            byline,
206            direction,
207            content,
208            text_content,
209            length,
210            excerpt,
211            site_name,
212            language,
213            published_time,
214        })
215    }
216}
217
/// Configuration options for content extraction.
///
/// Created with [`ReadabilityOptions::new`] and used with
/// [`Readability::parse_with_options`].
///
/// Every field is optional. A field left as `None` is not written to the
/// JavaScript options object at all.
///
/// See also: [`Readability::parse`] for basic extraction without options.
///
/// # Examples
///
/// ```rust
/// use readability_js::ReadabilityOptions;
///
/// // Fine-tuned for news sites
/// let opts = ReadabilityOptions::new()
///     .char_threshold(500)        // Require more content
///     .nb_top_candidates(10)      // Consider more candidates
///     .keep_classes(true)         // Preserve CSS classes
///     .classes_to_preserve(vec!["highlight".into(), "code".into()]);
/// ```
#[derive(Default, Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ReadabilityOptions {
    /// Maximum number of DOM elements to parse (JS option `maxElemsToParse`).
    pub max_elems_to_parse: Option<usize>,
    /// Number of top content candidates to consider (JS option `nbTopCandidates`).
    pub nb_top_candidates: Option<usize>,
    /// Minimum character count for readable content (JS option `charThreshold`).
    pub char_threshold: Option<usize>,
    /// CSS class names to keep in the output (JS option `classesToPreserve`).
    pub classes_to_preserve: Option<Vec<String>>,
    /// Whether CSS classes are kept in the output (JS option `keepClasses`).
    pub keep_classes: Option<bool>,
    /// Whether JSON-LD metadata extraction is disabled (JS option `disableJSONLD`).
    pub disable_jsonld: Option<bool>,
    /// Link density adjustment (JS option `linkDensityModifier`).
    pub link_density_modifier: Option<f32>,
    // TODO: serializer and allowed_video_regex
}
248
249impl ReadabilityOptions {
250    /// Creates a new options builder with default values.
251    pub fn new() -> Self {
252        Self::default()
253    }
254
255    /// Set maximum number of DOM elements to parse.
256    ///
257    /// Limits processing to avoid performance issues on very large documents.
258    /// Default is typically around 0 (unlimited).
259    ///
260    /// # Arguments
261    /// * `val` - Maximum elements to process (0 = unlimited)
262    pub fn max_elems_to_parse(mut self, val: usize) -> Self {
263        self.max_elems_to_parse = Some(val);
264        self
265    }
266
267    /// Set number of top content candidates to consider.
268    ///
269    /// The algorithm identifies potential content containers and ranks them.
270    /// Higher values may improve accuracy but reduce performance.
271    /// Default is typically 5.
272    ///
273    /// # Arguments
274    /// * `val` - Number of candidates to consider (recommended: 5-15)
275    pub fn nb_top_candidates(mut self, val: usize) -> Self {
276        self.nb_top_candidates = Some(val);
277        self
278    }
279
280    /// Set minimum character threshold for readable content.
281    ///
282    /// Content with fewer characters will fail the readability check.
283    /// Lower values are more permissive but may include navigation/ads.
284    /// Default is typically 140 characters.
285    ///
286    /// # Arguments
287    /// * `val` - Minimum character count (recommended: 50-500)
288    pub fn char_threshold(mut self, val: usize) -> Self {
289        self.char_threshold = Some(val);
290        self
291    }
292
293    /// Specify CSS classes to preserve in the output.
294    ///
295    /// By default, most CSS classes are stripped from the cleaned HTML.
296    /// Use this to preserve important styling classes.
297    ///
298    /// # Arguments
299    /// * `val` - Vector of class names to preserve (e.g., `vec!["highlight".into()]`)
300    pub fn classes_to_preserve(mut self, val: Vec<String>) -> Self {
301        self.classes_to_preserve = Some(val);
302        self
303    }
304
305    /// Whether to preserve CSS classes in the output.
306    ///
307    /// When true, CSS classes are preserved in the cleaned HTML.
308    /// When false (default), most classes are stripped.
309    ///
310    /// # Arguments
311    /// * `val` - true to preserve classes, false to strip them
312    pub fn keep_classes(mut self, val: bool) -> Self {
313        self.keep_classes = Some(val);
314        self
315    }
316
317    /// Disable JSON-LD metadata extraction.
318    ///
319    /// JSON-LD structured data can provide additional article metadata
320    /// (author, publish date, etc.). Disable this if you don't need
321    /// metadata or if it causes issues.
322    ///
323    /// # Arguments
324    /// * `val` - true to disable JSON-LD parsing, false to enable it
325    pub fn disable_jsonld(mut self, val: bool) -> Self {
326        self.disable_jsonld = Some(val);
327        self
328    }
329
330    /// Modify the link density calculation.
331    ///
332    /// Content with high link density is often navigation rather than article
333    /// content. This modifier adjusts how strictly link density is evaluated.
334    /// Values > 1.0 are more permissive, < 1.0 are stricter.
335    ///
336    /// # Arguments
337    /// * `val` - Link density modifier (recommended: 0.5-2.0, default: 1.0)
338    pub fn link_density_modifier(mut self, val: f32) -> Self {
339        self.link_density_modifier = Some(val);
340        self
341    }
342
343    fn build<'js>(self, ctx: Ctx<'js>) -> Result<Object<'js>> {
344        let obj = Object::new(ctx).map_err(|e| ReadabilityError::JsEvaluation {
345            context: "failed to create options object".into(),
346            source: e,
347        })?;
348
349        if let Some(val) = self.max_elems_to_parse {
350            obj.set("maxElemsToParse", val)
351                .map_err(|e| ReadabilityError::JsEvaluation {
352                    context: "failed to set maxElemsToParse option".into(),
353                    source: e,
354                })?;
355        }
356        if let Some(val) = self.nb_top_candidates {
357            obj.set("nbTopCandidates", val)
358                .map_err(|e| ReadabilityError::JsEvaluation {
359                    context: "failed to set nbTopCandidates option".into(),
360                    source: e,
361                })?;
362        }
363        if let Some(val) = self.char_threshold {
364            obj.set("charThreshold", val)
365                .map_err(|e| ReadabilityError::JsEvaluation {
366                    context: "failed to set charThreshold option".to_string(),
367                    source: e,
368                })?;
369        }
370        if let Some(ref val) = self.classes_to_preserve {
371            obj.set("classesToPreserve", val.clone()).map_err(|e| {
372                ReadabilityError::JsEvaluation {
373                    context: "failed to set classesToPreserve option".to_string(),
374                    source: e,
375                }
376            })?;
377        }
378        if let Some(val) = self.keep_classes {
379            obj.set("keepClasses", val)
380                .map_err(|e| ReadabilityError::JsEvaluation {
381                    context: "failed to set keepClasses option".to_string(),
382                    source: e,
383                })?;
384        }
385        if let Some(val) = self.disable_jsonld {
386            obj.set("disableJSONLD", val)
387                .map_err(|e| ReadabilityError::JsEvaluation {
388                    context: "failed to set disableJSONLD option".to_string(),
389                    source: e,
390                })?;
391        }
392        if let Some(val) = self.link_density_modifier {
393            obj.set("linkDensityModifier", val)
394                .map_err(|e| ReadabilityError::JsEvaluation {
395                    context: "failed to set linkDensityModifier option".to_string(),
396                    source: e,
397                })?;
398        }
399        Ok(obj)
400    }
401}
402
403// #[derive(Default, Debug, Clone)]
404// struct ReadabilityCheckOptions {
405//     pub min_content_length: Option<usize>, // default 140
406//     pub min_score: Option<usize>,          // default 20
407//                                            // TODO visibility checker
408// }
409
410// impl ReadabilityCheckOptions {
411//     pub fn new() -> Self {
412//         Self::default()
413//     }
414//     pub fn min_content_length(mut self, val: usize) -> Self {
415//         self.min_content_length = Some(val);
416//         self
417//     }
418//     pub fn min_score(mut self, val: usize) -> Self {
419//         self.min_score = Some(val);
420//         self
421//     }
422
423//     fn build<'js>(self, ctx: Ctx<'js>) -> Result<Object<'js>> {
424//         let obj = Object::new(ctx).map_err(|e| ReadabilityError::JsEvaluation {
425//             context: "failed to create check options object".to_string(),
426//             source: e,
427//         })?;
428
429//         if let Some(val) = self.min_content_length {
430//             obj.set("minContentLength", val)
431//                 .map_err(|e| ReadabilityError::JsEvaluation {
432//                     context: "failed to set minContentLength option".to_string(),
433//                     source: e,
434//                 })?
435//         }
436//         if let Some(val) = self.min_score {
437//             obj.set("minScore", val)
438//                 .map_err(|e| ReadabilityError::JsEvaluation {
439//                     context: "failed to set minScore option".to_string(),
440//                     source: e,
441//                 })?;
442//         }
443//         Ok(obj)
444//     }
445// }
446//
/// Errors that can occur during content extraction.
#[derive(Error, Debug)]
pub enum ReadabilityError {
    /// HTML could not be parsed (malformed, empty, etc.)
    ///
    /// This typically occurs when:
    /// - HTML is severely malformed or incomplete
    /// - Empty or whitespace-only input
    /// - Input contains non-HTML content
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use readability_js::Readability;
    /// let reader = Readability::new()?;
    /// // This will likely fail with HtmlParseError
    /// let result = reader.parse("<not valid html>");
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Failed to parse HTML: {0}")]
    HtmlParseError(String),

    /// Content failed internal readability checks
    ///
    /// This usually means:
    /// - Page has too little readable content (< 140 characters by default)
    /// - Content couldn't be reliably distinguished from navigation/ads
    /// - Page is mostly navigation, ads, or other non-content elements
    /// - Content has too high link density (likely navigation)
    ///
    /// # What to do
    ///
    /// Try lowering the `char_threshold` in [`ReadabilityOptions`] or check
    /// if the HTML actually contains substantial article content:
    ///
    /// ```rust,no_run
    /// # use readability_js::{Readability, ReadabilityOptions};
    /// # let html = "<html><body><p>Short page.</p></body></html>";
    /// let options = ReadabilityOptions::new().char_threshold(50);
    /// let reader = Readability::new()?;
    /// let article = reader.parse_with_options(&html, None, Some(options))?;
    /// # Ok::<(), readability_js::ReadabilityError>(())
    /// ```
    #[error("Content failed readability check")]
    ReadabilityCheckFailed,

    /// Content extraction failed for other reasons
    ///
    /// This is a catch-all error for unexpected extraction failures that don't
    /// fit into other categories. Often indicates issues with the JavaScript
    /// execution environment or unexpected content structures.
    ///
    /// # Examples
    ///
    /// ```rust,no_run
    /// # use readability_js::{Readability, ReadabilityError};
    /// # let html = "<html><body><p>Example page.</p></body></html>";
    /// let reader = Readability::new()?;
    /// match reader.parse(&html) {
    ///     Err(ReadabilityError::ExtractionError(msg)) => {
    ///         eprintln!("Extraction failed: {}", msg);
    ///         // Maybe try with different options or fallback processing
    ///     }
    ///     Ok(article) => println!("Success: {}", article.title),
    ///     Err(e) => eprintln!("Other error: {}", e),
    /// }
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Failed to extract readable content: {0}")]
    ExtractionError(String),

    /// JavaScript engine evaluation error
    ///
    /// Occurs when the embedded JavaScript engine fails to execute Readability.js
    /// code. This could indicate:
    /// - Memory constraints
    /// - JavaScript syntax errors in the bundled code
    /// - Runtime exceptions in the JavaScript environment
    ///
    /// # Examples
    ///
    /// ```rust,no_run
    /// # use readability_js::{Readability, ReadabilityError};
    /// # let html = "<html><body><p>Example page.</p></body></html>";
    /// let reader = Readability::new()?;
    /// match reader.parse(&html) {
    ///     Err(ReadabilityError::JsEvaluation { context, source }) => {
    ///         eprintln!("JavaScript error in {}: {}", context, source);
    ///         // This usually indicates a bug - please report it!
    ///     }
    ///     Ok(article) => println!("Success: {}", article.title),
    ///     Err(e) => eprintln!("Other error: {}", e),
    /// }
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Failed to evaluate JavaScript: {context}")]
    JsEvaluation {
        /// Short description of the operation that failed.
        context: String,
        /// Underlying error reported by the JavaScript engine bindings.
        #[source]
        source: rquickjs::Error,
    },

    /// Invalid input parameters (usually base URL)
    ///
    /// This error occurs when:
    /// - Base URL has invalid format or unsupported scheme
    /// - URL uses dangerous schemes like `javascript:` or `data:`
    /// - URL is not HTTP(S) when validation is enabled
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use readability_js::{Readability, ReadabilityError};
    /// # let html = "<html><body><p>Example page.</p></body></html>";
    /// let reader = Readability::new()?;
    /// // This will fail with InvalidOptions
    /// let result = reader.parse_with_url(&html, "javascript:alert('xss')");
    /// assert!(matches!(result, Err(ReadabilityError::InvalidOptions(_))));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Invalid options: {0}")]
    InvalidOptions(String),
}
566
567trait JsResultExt<T> {
568    fn js_context(self, context: &str) -> Result<T>;
569}
570
571impl<T> JsResultExt<T> for std::result::Result<T, rquickjs::Error> {
572    fn js_context(self, context: &str) -> Result<T> {
573        self.map_err(|source| ReadabilityError::JsEvaluation {
574            context: context.into(),
575            source,
576        })
577    }
578}
579
/// Module-local shorthand for results whose error type is [`ReadabilityError`].
type Result<T> = std::result::Result<T, ReadabilityError>;
581
/// The main readability parser that extracts clean content from HTML.
///
/// Uses Mozilla's Readability.js algorithm running in an embedded JavaScript engine.
/// Create once and reuse for multiple extractions - the JS context initialization
/// is expensive.
///
/// # Examples
///
/// ```rust,no_run
/// use readability_js::{Readability, ReadabilityOptions};
/// # let html = "<html><body><article><p>Example article text.</p></article></body></html>";
///
/// // Create parser (expensive - reuse this!)
/// let reader = Readability::new()?;
///
/// // Basic extraction
/// let article = reader.parse_with_url(html, "https://example.com")?;
///
/// // With custom options
/// let options = ReadabilityOptions::new()
///     .char_threshold(500);
/// let article = reader.parse_with_options(html, Some("https://example.com"), Some(options))?;
/// # Ok::<(), readability_js::ReadabilityError>(())
/// ```
///
/// # Thread Safety
///
/// `Readability` instances are **not** thread-safe (`!Send + !Sync`). Each instance
/// contains an embedded JavaScript engine that cannot be moved between threads or
/// shared between threads.
pub struct Readability {
    // JavaScript context with Readability.js and the bundled glue code loaded.
    context: QuickContext,
}
614impl Readability {
615    /// Creates a new readability parser.
616    ///
617    /// # Performance
618    ///
619    /// This operation is expensive (50-100ms) as it initializes a JavaScript engine
620    /// and loads the Readability.js library. Create one instance and reuse it for
621    /// multiple extractions.
622    ///
623    /// # JavaScript Engine
624    ///
625    /// This method initializes an embedded QuickJS runtime. The JavaScript code
626    /// executed is Mozilla's Readability.js library and is considered safe for
627    /// processing untrusted HTML input.
628    pub fn new() -> Result<Self> {
629        let runtime = Runtime::new().js_context("Failed to create runtime")?;
630        let context = QuickContext::full(&runtime).js_context("Failed to create context")?;
631
632        context.with(|ctx| {
633            let readability_code = include_str!("../vendor/readability/Readability.js");
634            ctx.eval::<(), _>(readability_code)
635                .js_context("Failed to load Readability")?;
636
637            let bundle = include_str!("../js/bundled.js");
638            ctx.eval::<(), _>(bundle)
639                .js_context("Failed to load bundle")?;
640
641            Ok(())
642        })?;
643
644        Ok(Self { context })
645    }
646
647    fn validate_base_url(url: &str) -> Result<String> {
648        if url.starts_with("javascript:") || url.starts_with("data:") {
649            return Err(ReadabilityError::InvalidOptions(
650                "Invalid base URL scheme".into(),
651            ));
652        }
653
654        // Optional: Parse with url crate for stricter validation
655        match url::Url::parse(url) {
656            Ok(parsed) if matches!(parsed.scheme(), "http" | "https") => Ok(url.to_string()),
657            _ => Err(ReadabilityError::InvalidOptions(
658                "Base URL must be HTTP(S)".into(),
659            )),
660        }
661    }
662
663    /// Extract readable content from HTML.
664    ///
665    /// This is the main extraction method. It processes the HTML to remove
666    /// ads, navigation, sidebars and other clutter, leaving just the main article content.
667    ///
668    /// # Arguments
669    ///
670    /// * `html` - The HTML content to process. Should be a complete HTML document.
671    ///
672    /// # Examples
673    ///
674    /// ```rust
675    /// use readability_js::Readability;
676    ///
677    /// let html = r#"
678    ///   <html>
679    ///     <body>
680    ///       <article>
681    ///         <h1>Breaking News</h1>
682    ///         <p>Important news content here...</p>
683    ///       </article>
684    ///       <nav>Navigation menu</nav>
685    ///       <aside>Advertisement</aside>
686    ///     </body>
687    ///   </html>
688    /// "#;
689    ///
690    /// let reader = Readability::new()?;
691    /// let article = reader.parse(html)?;
692    ///
693    /// assert_eq!(article.title, "Breaking News");
694    /// assert!(article.content.contains("Important news content"));
695    /// // Navigation and ads are removed from the output
696    /// # Ok::<(), readability_js::ReadabilityError>(())
697    /// ```
698    ///
699    /// # Errors
700    ///
701    /// Returns [`ReadabilityError`] if:
702    /// * The HTML is malformed or empty (`HtmlParseError`)
703    /// * The page fails readability checks (`ReadabilityCheckFailed`)
704    /// * JavaScript evaluation fails (`JsEvaluation`)
705    ///
706    /// # Performance
707    ///
708    /// This method is fast (typically <10ms) once the [`Readability`] instance
709    /// is created. The expensive operation is [`Readability::new()`] which should
710    /// be called once and reused.
711    pub fn parse(&self, html: &str) -> Result<Article> {
712        self.extract(html, None, None)
713    }
714
715    /// Extract readable content from HTML with URL context.
716    ///
717    /// The URL helps with better link resolution and metadata extraction.
718    ///
719    /// # Arguments
720    ///
721    /// * `html` - The HTML content to extract from
722    /// * `base_url` - The original URL of the page for link resolution
723    ///
724    /// # Examples
725    /// ```rust
726    /// use readability_js::Readability;
727    ///
728    /// let reader = Readability::new()?;
729    /// let article = reader.parse_with_url(html, "https://example.com/article")?;
730    /// // Links in the article will be properly resolved
731    /// # Ok::<(), readability_js::ReadabilityError>(())
732    /// ```
733    ///
734    /// # Errors
735    ///
736    /// This function will return an error if:
737    /// * The HTML is malformed or cannot be parsed ([`ReadabilityError::HtmlParseError`])
738    /// * The base URL is invalid ([`ReadabilityError::InvalidOptions`])
739    /// * The content fails internal readability checks ([`ReadabilityError::ReadabilityCheckFailed`])
740    /// * JavaScript evaluation fails ([`ReadabilityError::JsEvaluation`])
741    pub fn parse_with_url(&self, html: &str, base_url: &str) -> Result<Article> {
742        self.extract(html, Some(base_url), None)
743    }
744
745    /// Extract readable content with custom parsing options.
746    ///
747    /// # Arguments
748    ///
749    /// * `html` - The HTML content to extract from
750    /// * `base_url` - Optional URL for link resolution
751    /// * `options` - Custom parsing options
752    ///
753    /// # Examples
754    /// ```rust
755    /// use readability_js::{Readability, ReadabilityOptions};
756    ///
757    /// let options = ReadabilityOptions::new()
758    ///     .char_threshold(500);
759    ///
760    /// let reader = Readability::new()?;
761    /// let article = reader.parse_with_options(html, Some("https://example.com"), Some(options))?;
762    /// # Ok::<(), readability_js::ReadabilityError>(())
763    /// ```
764    ///
765    /// # Errors
766    ///
767    /// This function will return an error if:
768    /// * The HTML is malformed or cannot be parsed ([`ReadabilityError::HtmlParseError`])
769    /// * The base URL is invalid ([`ReadabilityError::InvalidOptions`])
770    /// * The content fails internal readability checks ([`ReadabilityError::ReadabilityCheckFailed`])
771    /// * JavaScript evaluation fails ([`ReadabilityError::JsEvaluation`])
772    pub fn parse_with_options(
773        &self,
774        html: &str,
775        base_url: Option<&str>,
776        options: Option<ReadabilityOptions>,
777    ) -> Result<Article> {
778        self.extract(html, base_url, options)
779    }
780
781    fn extract(
782        &self,
783        html: &str,
784        base_url: Option<&str>,
785        options: Option<ReadabilityOptions>,
786    ) -> Result<Article> {
787        let clean_base_url = match base_url {
788            None => None,
789            Some(url) => Some(Self::validate_base_url(url)?),
790        };
791        self.context.with(|ctx| {
792            let extract_fn: Function = ctx
793                .globals()
794                .get("extract")
795                .js_context("extract function not found")?;
796            let options_obj = match options {
797                None => None,
798                Some(options) => Some(options.build(ctx.clone())?),
799            };
800
801            let result: Value = extract_fn
802                .call((html, clean_base_url, options_obj))
803                .js_context("Failed to call extract")?;
804
805            // Check if result is an error object
806            if let Some(obj) = result.as_object()
807                && let Ok(error_type) = obj.get::<_, String>("errorType")
808            {
809                let error_msg = obj
810                    .get::<_, String>("error")
811                    .unwrap_or_else(|_| "Unknown error".to_string());
812
813                return Err(match error_type.as_str() {
814                    "HtmlParseError" => ReadabilityError::HtmlParseError(error_msg),
815                    "ExtractionError" => ReadabilityError::ExtractionError(error_msg),
816                    "RuntimeError" => ReadabilityError::JsEvaluation {
817                        context: format!("JavaScript runtime error: {}", error_msg),
818                        source: rquickjs::Error::Unknown,
819                    },
820                    _ => ReadabilityError::ExtractionError(format!(
821                        "Unknown error type '{}': {}",
822                        error_type, error_msg
823                    )),
824                });
825            }
826
827            // If not an error object, try to parse as Article
828            Article::try_from(result)
829        })
830    }
831}
832
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: a small three-paragraph page is extracted with its title,
    /// paragraph markup and plain-text content intact.
    #[test]
    fn test_basic_extraction() {
        let page = r#"
            <html>
            <head><title>Test Article Title</title></head>
            <body>
                <h1>This is a test article</h1>
                <p>This is the first paragraph with some content that should be long enough to be considered readable content by the readability algorithm.</p>
                <p>This is another paragraph with more content. It has enough text to make the article substantial and worth reading.</p>
                <p>And here's a third paragraph to make sure we have enough content for the readability parser to work with.</p>
            </body>
            </html>
        "#;

        let parser = Readability::new().unwrap();
        let parsed = parser
            .extract(page, Some("https://example.com"), None)
            .unwrap();

        assert_eq!(parsed.title, "Test Article Title");

        // The cleaned HTML keeps every paragraph and its `<p>` markup.
        for fragment in [
            "first paragraph",
            "another paragraph",
            "third paragraph",
            "<p>",
        ] {
            assert!(parsed.content.contains(fragment));
        }

        // The plain-text rendering keeps the heading text but no tags.
        assert!(parsed.text_content.contains("This is a test article"));
        assert!(!parsed.text_content.contains("<"));
        assert!(parsed.length > 0);
    }
}
readability_js/readability.rs

readability_js/
readability.rs