readability_js/readability.rs
1use rquickjs::{Context as QuickContext, Ctx, Function, Object, Runtime, Value};
2use thiserror::Error;
3
/// Text direction of an extracted article.
///
/// Populated from the `dir` property of the JavaScript extraction result and
/// exposed through [`Article::direction`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum Direction {
    /// Left-to-Right
    Ltr,
    /// Right-to-Left
    Rtl,
}
12
/// Parsed article content and metadata extracted by Readability.
///
/// All fields except `title`, `content`, `text_content`, and `length` are optional
/// and depend on the input HTML having appropriate metadata. Optional fields are
/// `None` when the corresponding JavaScript property is `null` or `undefined`.
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Article {
    /// Extracted or inferred article title (JS property `title`).
    pub title: String,

    /// Cleaned article HTML (JS property `content`).
    ///
    /// NOTE(review): upstream Readability.js documents that its output is *not*
    /// sanitised; run it through an HTML sanitiser before rendering content that
    /// came from untrusted pages.
    pub content: String,

    /// Plain text with all HTML stripped (JS property `textContent`).
    pub text_content: String,

    /// Length of the article as reported by Readability.js (JS property `length`).
    pub length: u32,

    /// Author byline metadata (JS property `byline`).
    pub byline: Option<String>,

    /// Content direction (JS property `dir`).
    ///
    /// `None` when the property is missing or is not a recognised direction keyword.
    pub direction: Option<Direction>,

    /// Article description or short excerpt (JS property `excerpt`).
    pub excerpt: Option<String>,

    /// Name of the website (JS property `siteName`).
    pub site_name: Option<String>,

    /// Content language code (BCP 47), if detectable (JS property `lang`).
    pub language: Option<String>,

    /// Published time in ISO 8601 or site format, if detectable
    /// (JS property `publishedTime`).
    pub published_time: Option<String>,
}
50
51impl<'js> TryFrom<Value<'js>> for Article {
52 type Error = ReadabilityError;
53
54 fn try_from(value: Value<'js>) -> Result<Self> {
55 let obj = value.as_object().ok_or_else(|| {
56 ReadabilityError::ExtractionError(
57 "Expected JavaScript object, got a different type".into(),
58 )
59 })?;
60
61 let title = obj
62 .get::<_, String>("title")
63 .map_err(|e| ReadabilityError::JsEvaluation {
64 context: "failed to get title".into(),
65 source: e,
66 })?;
67
68 let byline = obj
69 .get::<_, Value>("byline")
70 .map_err(|e| ReadabilityError::JsEvaluation {
71 context: "failed to get byline".into(),
72 source: e,
73 })?;
74 let byline = if byline.is_null() || byline.is_undefined() {
75 None
76 } else {
77 Some(
78 byline
79 .get::<String>()
80 .map_err(|e| ReadabilityError::JsEvaluation {
81 context: "failed to get byline as string".into(),
82 source: e,
83 })?,
84 )
85 };
86
87 let dir = obj
88 .get::<_, Value>("dir")
89 .map_err(|e| ReadabilityError::JsEvaluation {
90 context: "failed to get dir".into(),
91 source: e,
92 })?;
93 let direction = if dir.is_null() || dir.is_undefined() {
94 None
95 } else {
96 let dir_str = dir
97 .get::<String>()
98 .map_err(|e| ReadabilityError::JsEvaluation {
99 context: "failed to get dir as string".into(),
100 source: e,
101 })?;
102 match dir_str.as_str() {
103 "ltr" => Some(Direction::Ltr),
104 "rtl" => Some(Direction::Rtl),
105 _ => None,
106 }
107 };
108
109 let content =
110 obj.get::<_, String>("content")
111 .map_err(|e| ReadabilityError::JsEvaluation {
112 context: "failed to get content".into(),
113 source: e,
114 })?;
115 let text_content =
116 obj.get::<_, String>("textContent")
117 .map_err(|e| ReadabilityError::JsEvaluation {
118 context: "failed to get text_content".into(),
119 source: e,
120 })?;
121 let length = obj
122 .get::<_, u32>("length")
123 .map_err(|e| ReadabilityError::JsEvaluation {
124 context: "failed to get length".into(),
125 source: e,
126 })?;
127
128 let excerpt =
129 obj.get::<_, Value>("excerpt")
130 .map_err(|e| ReadabilityError::JsEvaluation {
131 context: "failed to get excerpt".into(),
132 source: e,
133 })?;
134 let excerpt = if excerpt.is_null() || excerpt.is_undefined() {
135 None
136 } else {
137 Some(
138 excerpt
139 .get::<String>()
140 .map_err(|e| ReadabilityError::JsEvaluation {
141 context: "failed to get excerpt as string".into(),
142 source: e,
143 })?,
144 )
145 };
146
147 let site_name =
148 obj.get::<_, Value>("siteName")
149 .map_err(|e| ReadabilityError::JsEvaluation {
150 context: "failed to get site_name".into(),
151 source: e,
152 })?;
153 let site_name = if site_name.is_null() || site_name.is_undefined() {
154 None
155 } else {
156 Some(
157 site_name
158 .get::<String>()
159 .map_err(|e| ReadabilityError::JsEvaluation {
160 context: "failed to get site_name as string".into(),
161 source: e,
162 })?,
163 )
164 };
165
166 let language = obj
167 .get::<_, Value>("lang")
168 .map_err(|e| ReadabilityError::JsEvaluation {
169 context: "failed to get lang".into(),
170 source: e,
171 })?;
172 let language = if language.is_null() || language.is_undefined() {
173 None
174 } else {
175 Some(
176 language
177 .get::<String>()
178 .map_err(|e| ReadabilityError::JsEvaluation {
179 context: "failed to get lang as string".into(),
180 source: e,
181 })?,
182 )
183 };
184
185 let published_time =
186 obj.get::<_, Value>("publishedTime")
187 .map_err(|e| ReadabilityError::JsEvaluation {
188 context: "failed to get published_time".into(),
189 source: e,
190 })?;
191 let published_time =
192 if published_time.is_null() || published_time.is_undefined() {
193 None
194 } else {
195 Some(published_time.get::<String>().map_err(|e| {
196 ReadabilityError::JsEvaluation {
197 context: "failed to get published_time as string".into(),
198 source: e,
199 }
200 })?)
201 };
202
203 Ok(Article {
204 title,
205 byline,
206 direction,
207 content,
208 text_content,
209 length,
210 excerpt,
211 site_name,
212 language,
213 published_time,
214 })
215 }
216}
217
/// Configuration options for content extraction.
///
/// Created with [`ReadabilityOptions::new`] and used with
/// [`Readability::parse_with_options`].
///
/// Every field is optional; only fields that are `Some` are forwarded to the
/// JavaScript side.
///
/// See also: [`Readability::parse`] for basic extraction without options.
///
/// # Examples
///
/// ```rust
/// use readability_js::ReadabilityOptions;
///
/// // Fine-tuned for news sites
/// let opts = ReadabilityOptions::new()
///     .char_threshold(500) // Require more content
///     .nb_top_candidates(10) // Consider more candidates
///     .keep_classes(true) // Preserve CSS classes
///     .classes_to_preserve(vec!["highlight".into(), "code".into()]);
/// ```
#[derive(Default, Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ReadabilityOptions {
    /// Maximum number of DOM elements to parse (sent as `maxElemsToParse`).
    pub max_elems_to_parse: Option<usize>,
    /// Number of top content candidates to consider (sent as `nbTopCandidates`).
    pub nb_top_candidates: Option<usize>,
    /// Minimum character threshold for readable content (sent as `charThreshold`).
    pub char_threshold: Option<usize>,
    /// CSS class names to keep in the output (sent as `classesToPreserve`).
    pub classes_to_preserve: Option<Vec<String>>,
    /// Whether CSS classes are preserved in the output (sent as `keepClasses`).
    pub keep_classes: Option<bool>,
    /// Whether JSON-LD metadata extraction is disabled (sent as `disableJSONLD`).
    pub disable_jsonld: Option<bool>,
    /// Modifier for the link density calculation (sent as `linkDensityModifier`).
    pub link_density_modifier: Option<f32>,
    // TODO: serializer and allowed_video_regex
}
248
249impl ReadabilityOptions {
250 /// Creates a new options builder with default values.
251 pub fn new() -> Self {
252 Self::default()
253 }
254
255 /// Set maximum number of DOM elements to parse.
256 ///
257 /// Limits processing to avoid performance issues on very large documents.
258 /// Default is typically around 0 (unlimited).
259 ///
260 /// # Arguments
261 /// * `val` - Maximum elements to process (0 = unlimited)
262 pub fn max_elems_to_parse(mut self, val: usize) -> Self {
263 self.max_elems_to_parse = Some(val);
264 self
265 }
266
267 /// Set number of top content candidates to consider.
268 ///
269 /// The algorithm identifies potential content containers and ranks them.
270 /// Higher values may improve accuracy but reduce performance.
271 /// Default is typically 5.
272 ///
273 /// # Arguments
274 /// * `val` - Number of candidates to consider (recommended: 5-15)
275 pub fn nb_top_candidates(mut self, val: usize) -> Self {
276 self.nb_top_candidates = Some(val);
277 self
278 }
279
280 /// Set minimum character threshold for readable content.
281 ///
282 /// Content with fewer characters will fail the readability check.
283 /// Lower values are more permissive but may include navigation/ads.
284 /// Default is typically 140 characters.
285 ///
286 /// # Arguments
287 /// * `val` - Minimum character count (recommended: 50-500)
288 pub fn char_threshold(mut self, val: usize) -> Self {
289 self.char_threshold = Some(val);
290 self
291 }
292
293 /// Specify CSS classes to preserve in the output.
294 ///
295 /// By default, most CSS classes are stripped from the cleaned HTML.
296 /// Use this to preserve important styling classes.
297 ///
298 /// # Arguments
299 /// * `val` - Vector of class names to preserve (e.g., `vec!["highlight".into()]`)
300 pub fn classes_to_preserve(mut self, val: Vec<String>) -> Self {
301 self.classes_to_preserve = Some(val);
302 self
303 }
304
305 /// Whether to preserve CSS classes in the output.
306 ///
307 /// When true, CSS classes are preserved in the cleaned HTML.
308 /// When false (default), most classes are stripped.
309 ///
310 /// # Arguments
311 /// * `val` - true to preserve classes, false to strip them
312 pub fn keep_classes(mut self, val: bool) -> Self {
313 self.keep_classes = Some(val);
314 self
315 }
316
317 /// Disable JSON-LD metadata extraction.
318 ///
319 /// JSON-LD structured data can provide additional article metadata
320 /// (author, publish date, etc.). Disable this if you don't need
321 /// metadata or if it causes issues.
322 ///
323 /// # Arguments
324 /// * `val` - true to disable JSON-LD parsing, false to enable it
325 pub fn disable_jsonld(mut self, val: bool) -> Self {
326 self.disable_jsonld = Some(val);
327 self
328 }
329
330 /// Modify the link density calculation.
331 ///
332 /// Content with high link density is often navigation rather than article
333 /// content. This modifier adjusts how strictly link density is evaluated.
334 /// Values > 1.0 are more permissive, < 1.0 are stricter.
335 ///
336 /// # Arguments
337 /// * `val` - Link density modifier (recommended: 0.5-2.0, default: 1.0)
338 pub fn link_density_modifier(mut self, val: f32) -> Self {
339 self.link_density_modifier = Some(val);
340 self
341 }
342
343 fn build<'js>(self, ctx: Ctx<'js>) -> Result<Object<'js>> {
344 let obj = Object::new(ctx).map_err(|e| ReadabilityError::JsEvaluation {
345 context: "failed to create options object".into(),
346 source: e,
347 })?;
348
349 if let Some(val) = self.max_elems_to_parse {
350 obj.set("maxElemsToParse", val)
351 .map_err(|e| ReadabilityError::JsEvaluation {
352 context: "failed to set maxElemsToParse option".into(),
353 source: e,
354 })?;
355 }
356 if let Some(val) = self.nb_top_candidates {
357 obj.set("nbTopCandidates", val)
358 .map_err(|e| ReadabilityError::JsEvaluation {
359 context: "failed to set nbTopCandidates option".into(),
360 source: e,
361 })?;
362 }
363 if let Some(val) = self.char_threshold {
364 obj.set("charThreshold", val)
365 .map_err(|e| ReadabilityError::JsEvaluation {
366 context: "failed to set charThreshold option".to_string(),
367 source: e,
368 })?;
369 }
370 if let Some(ref val) = self.classes_to_preserve {
371 obj.set("classesToPreserve", val.clone()).map_err(|e| {
372 ReadabilityError::JsEvaluation {
373 context: "failed to set classesToPreserve option".to_string(),
374 source: e,
375 }
376 })?;
377 }
378 if let Some(val) = self.keep_classes {
379 obj.set("keepClasses", val)
380 .map_err(|e| ReadabilityError::JsEvaluation {
381 context: "failed to set keepClasses option".to_string(),
382 source: e,
383 })?;
384 }
385 if let Some(val) = self.disable_jsonld {
386 obj.set("disableJSONLD", val)
387 .map_err(|e| ReadabilityError::JsEvaluation {
388 context: "failed to set disableJSONLD option".to_string(),
389 source: e,
390 })?;
391 }
392 if let Some(val) = self.link_density_modifier {
393 obj.set("linkDensityModifier", val)
394 .map_err(|e| ReadabilityError::JsEvaluation {
395 context: "failed to set linkDensityModifier option".to_string(),
396 source: e,
397 })?;
398 }
399 Ok(obj)
400 }
401}
402
403// #[derive(Default, Debug, Clone)]
404// struct ReadabilityCheckOptions {
405// pub min_content_length: Option<usize>, // default 140
406// pub min_score: Option<usize>, // default 20
407// // TODO visibility checker
408// }
409
410// impl ReadabilityCheckOptions {
411// pub fn new() -> Self {
412// Self::default()
413// }
414// pub fn min_content_length(mut self, val: usize) -> Self {
415// self.min_content_length = Some(val);
416// self
417// }
418// pub fn min_score(mut self, val: usize) -> Self {
419// self.min_score = Some(val);
420// self
421// }
422
423// fn build<'js>(self, ctx: Ctx<'js>) -> Result<Object<'js>> {
424// let obj = Object::new(ctx).map_err(|e| ReadabilityError::JsEvaluation {
425// context: "failed to create check options object".to_string(),
426// source: e,
427// })?;
428
429// if let Some(val) = self.min_content_length {
430// obj.set("minContentLength", val)
431// .map_err(|e| ReadabilityError::JsEvaluation {
432// context: "failed to set minContentLength option".to_string(),
433// source: e,
434// })?
435// }
436// if let Some(val) = self.min_score {
437// obj.set("minScore", val)
438// .map_err(|e| ReadabilityError::JsEvaluation {
439// context: "failed to set minScore option".to_string(),
440// source: e,
441// })?;
442// }
443// Ok(obj)
444// }
445// }
446//
/// Errors that can occur during content extraction.
#[derive(Error, Debug)]
pub enum ReadabilityError {
    /// HTML could not be parsed (malformed, empty, etc.)
    ///
    /// This typically occurs when:
    /// - HTML is severely malformed or incomplete
    /// - Empty or whitespace-only input
    /// - Input contains non-HTML content
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use readability_js::Readability;
    /// let reader = Readability::new()?;
    /// // This will likely fail with HtmlParseError
    /// let result = reader.parse("<not valid html>");
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Failed to parse HTML: {0}")]
    HtmlParseError(String),

    /// Content failed internal readability checks
    ///
    /// This usually means:
    /// - Page has too little readable content (< 140 characters by default)
    /// - Content couldn't be reliably distinguished from navigation/ads
    /// - Page is mostly navigation, ads, or other non-content elements
    /// - Content has too high link density (likely navigation)
    ///
    /// # What to do
    ///
    /// Try lowering the `char_threshold` in [`ReadabilityOptions`] or check
    /// if the HTML actually contains substantial article content:
    ///
    /// ```rust,no_run
    /// # use readability_js::{Readability, ReadabilityOptions};
    /// # let html = "<html><body><p>Short page</p></body></html>";
    /// let options = ReadabilityOptions::new().char_threshold(50);
    /// let reader = Readability::new()?;
    /// let article = reader.parse_with_options(&html, None, Some(options))?;
    /// # Ok::<(), readability_js::ReadabilityError>(())
    /// ```
    // NOTE(review): nothing in this file constructs this variant; confirm that it
    // is produced elsewhere, or map the matching JS `errorType` to it.
    #[error("Content failed readability check")]
    ReadabilityCheckFailed,

    /// Content extraction failed for other reasons
    ///
    /// This is a catch-all error for unexpected extraction failures that don't
    /// fit into other categories. Often indicates issues with the JavaScript
    /// execution environment or unexpected content structures. It is also used
    /// when the JavaScript side reports an error type this crate does not know,
    /// or returns something that is not an object.
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use readability_js::{Readability, ReadabilityError};
    /// # let html = "<html><body><p>Example</p></body></html>";
    /// let reader = Readability::new()?;
    /// match reader.parse(&html) {
    ///     Err(ReadabilityError::ExtractionError(msg)) => {
    ///         eprintln!("Extraction failed: {}", msg);
    ///         // Maybe try with different options or fallback processing
    ///     }
    ///     Ok(article) => println!("Success: {}", article.title),
    ///     Err(e) => eprintln!("Other error: {}", e),
    /// }
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Failed to extract readable content: {0}")]
    ExtractionError(String),

    /// JavaScript engine evaluation error
    ///
    /// Occurs when the embedded JavaScript engine fails to execute Readability.js
    /// code. This could indicate:
    /// - Memory constraints
    /// - JavaScript syntax errors in the bundled code
    /// - Runtime exceptions in the JavaScript environment
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use readability_js::{Readability, ReadabilityError};
    /// # let html = "<html><body><p>Example</p></body></html>";
    /// let reader = Readability::new()?;
    /// match reader.parse(&html) {
    ///     Err(ReadabilityError::JsEvaluation { context, source }) => {
    ///         eprintln!("JavaScript error in {}: {}", context, source);
    ///         // This usually indicates a bug - please report it!
    ///     }
    ///     Ok(article) => println!("Success: {}", article.title),
    ///     Err(e) => eprintln!("Other error: {}", e),
    /// }
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Failed to evaluate JavaScript: {context}")]
    JsEvaluation {
        /// Short description of the operation that failed.
        context: String,
        /// Underlying error reported by `rquickjs`.
        #[source]
        source: rquickjs::Error,
    },

    /// Invalid input parameters (usually base URL)
    ///
    /// This error occurs when:
    /// - Base URL has invalid format or unsupported scheme
    /// - URL uses dangerous schemes like `javascript:` or `data:`
    /// - URL is not HTTP(S) when validation is enabled
    ///
    /// # Examples
    ///
    /// ```rust
    /// # use readability_js::{Readability, ReadabilityError};
    /// # let html = "<html><body><p>Example</p></body></html>";
    /// let reader = Readability::new()?;
    /// // This will fail with InvalidOptions
    /// let result = reader.parse_with_url(&html, "javascript:alert('xss')");
    /// assert!(matches!(result, Err(ReadabilityError::InvalidOptions(_))));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[error("Invalid options: {0}")]
    InvalidOptions(String),
}
566
/// Extension trait that attaches a human-readable context to `rquickjs` results.
trait JsResultExt<T> {
    /// Converts the error side into [`ReadabilityError::JsEvaluation`], storing
    /// `context` as the description of what was being attempted.
    fn js_context(self, context: &str) -> Result<T>;
}
570
571impl<T> JsResultExt<T> for std::result::Result<T, rquickjs::Error> {
572 fn js_context(self, context: &str) -> Result<T> {
573 self.map_err(|source| ReadabilityError::JsEvaluation {
574 context: context.into(),
575 source,
576 })
577 }
578}
579
/// Module-local result alias whose error type is always [`ReadabilityError`].
type Result<T> = std::result::Result<T, ReadabilityError>;
581
/// The main readability parser that extracts clean content from HTML.
///
/// Uses Mozilla's Readability.js algorithm running in an embedded JavaScript engine.
/// Create once and reuse for multiple extractions - the JS context initialization
/// is expensive.
///
/// # Examples
///
/// ```rust,no_run
/// use readability_js::{Readability, ReadabilityOptions};
/// # let html = "<html><body><article><p>Example content</p></article></body></html>";
///
/// // Create parser (expensive - reuse this!)
/// let reader = Readability::new()?;
///
/// // Basic extraction
/// let article = reader.parse_with_url(html, "https://example.com")?;
///
/// // With custom options
/// let options = ReadabilityOptions::new()
///     .char_threshold(500);
/// let article = reader.parse_with_options(html, Some("https://example.com"), Some(options))?;
/// # Ok::<(), readability_js::ReadabilityError>(())
/// ```
///
/// # Thread Safety
///
/// `Readability` instances are **not** thread-safe (`!Send + !Sync`). Each instance
/// contains an embedded JavaScript engine that cannot be moved between threads or
/// shared between threads.
pub struct Readability {
    /// QuickJS context with Readability.js and the bundled glue script loaded.
    context: QuickContext,
}
614impl Readability {
615 /// Creates a new readability parser.
616 ///
617 /// # Performance
618 ///
619 /// This operation is expensive (50-100ms) as it initializes a JavaScript engine
620 /// and loads the Readability.js library. Create one instance and reuse it for
621 /// multiple extractions.
622 ///
623 /// # JavaScript Engine
624 ///
625 /// This method initializes an embedded QuickJS runtime. The JavaScript code
626 /// executed is Mozilla's Readability.js library and is considered safe for
627 /// processing untrusted HTML input.
628 pub fn new() -> Result<Self> {
629 let runtime = Runtime::new().js_context("Failed to create runtime")?;
630 let context = QuickContext::full(&runtime).js_context("Failed to create context")?;
631
632 context.with(|ctx| {
633 let readability_code = include_str!("../vendor/readability/Readability.js");
634 ctx.eval::<(), _>(readability_code)
635 .js_context("Failed to load Readability")?;
636
637 let bundle = include_str!("../js/bundled.js");
638 ctx.eval::<(), _>(bundle)
639 .js_context("Failed to load bundle")?;
640
641 Ok(())
642 })?;
643
644 Ok(Self { context })
645 }
646
647 fn validate_base_url(url: &str) -> Result<String> {
648 if url.starts_with("javascript:") || url.starts_with("data:") {
649 return Err(ReadabilityError::InvalidOptions(
650 "Invalid base URL scheme".into(),
651 ));
652 }
653
654 // Optional: Parse with url crate for stricter validation
655 match url::Url::parse(url) {
656 Ok(parsed) if matches!(parsed.scheme(), "http" | "https") => Ok(url.to_string()),
657 _ => Err(ReadabilityError::InvalidOptions(
658 "Base URL must be HTTP(S)".into(),
659 )),
660 }
661 }
662
663 /// Extract readable content from HTML.
664 ///
665 /// This is the main extraction method. It processes the HTML to remove
666 /// ads, navigation, sidebars and other clutter, leaving just the main article content.
667 ///
668 /// # Arguments
669 ///
670 /// * `html` - The HTML content to process. Should be a complete HTML document.
671 ///
672 /// # Examples
673 ///
674 /// ```rust
675 /// use readability_js::Readability;
676 ///
677 /// let html = r#"
678 /// <html>
679 /// <body>
680 /// <article>
681 /// <h1>Breaking News</h1>
682 /// <p>Important news content here...</p>
683 /// </article>
684 /// <nav>Navigation menu</nav>
685 /// <aside>Advertisement</aside>
686 /// </body>
687 /// </html>
688 /// "#;
689 ///
690 /// let reader = Readability::new()?;
691 /// let article = reader.parse(html)?;
692 ///
693 /// assert_eq!(article.title, "Breaking News");
694 /// assert!(article.content.contains("Important news content"));
695 /// // Navigation and ads are removed from the output
696 /// # Ok::<(), readability_js::ReadabilityError>(())
697 /// ```
698 ///
699 /// # Errors
700 ///
701 /// Returns [`ReadabilityError`] if:
702 /// * The HTML is malformed or empty (`HtmlParseError`)
703 /// * The page fails readability checks (`ReadabilityCheckFailed`)
704 /// * JavaScript evaluation fails (`JsEvaluation`)
705 ///
706 /// # Performance
707 ///
708 /// This method is fast (typically <10ms) once the [`Readability`] instance
709 /// is created. The expensive operation is [`Readability::new()`] which should
710 /// be called once and reused.
711 pub fn parse(&self, html: &str) -> Result<Article> {
712 self.extract(html, None, None)
713 }
714
715 /// Extract readable content from HTML with URL context.
716 ///
717 /// The URL helps with better link resolution and metadata extraction.
718 ///
719 /// # Arguments
720 ///
721 /// * `html` - The HTML content to extract from
722 /// * `base_url` - The original URL of the page for link resolution
723 ///
724 /// # Examples
725 /// ```rust
726 /// use readability_js::Readability;
727 ///
728 /// let reader = Readability::new()?;
729 /// let article = reader.parse_with_url(html, "https://example.com/article")?;
730 /// // Links in the article will be properly resolved
731 /// # Ok::<(), readability_js::ReadabilityError>(())
732 /// ```
733 ///
734 /// # Errors
735 ///
736 /// This function will return an error if:
737 /// * The HTML is malformed or cannot be parsed ([`ReadabilityError::HtmlParseError`])
738 /// * The base URL is invalid ([`ReadabilityError::InvalidOptions`])
739 /// * The content fails internal readability checks ([`ReadabilityError::ReadabilityCheckFailed`])
740 /// * JavaScript evaluation fails ([`ReadabilityError::JsEvaluation`])
741 pub fn parse_with_url(&self, html: &str, base_url: &str) -> Result<Article> {
742 self.extract(html, Some(base_url), None)
743 }
744
745 /// Extract readable content with custom parsing options.
746 ///
747 /// # Arguments
748 ///
749 /// * `html` - The HTML content to extract from
750 /// * `base_url` - Optional URL for link resolution
751 /// * `options` - Custom parsing options
752 ///
753 /// # Examples
754 /// ```rust
755 /// use readability_js::{Readability, ReadabilityOptions};
756 ///
757 /// let options = ReadabilityOptions::new()
758 /// .char_threshold(500);
759 ///
760 /// let reader = Readability::new()?;
761 /// let article = reader.parse_with_options(html, Some("https://example.com"), Some(options))?;
762 /// # Ok::<(), readability_js::ReadabilityError>(())
763 /// ```
764 ///
765 /// # Errors
766 ///
767 /// This function will return an error if:
768 /// * The HTML is malformed or cannot be parsed ([`ReadabilityError::HtmlParseError`])
769 /// * The base URL is invalid ([`ReadabilityError::InvalidOptions`])
770 /// * The content fails internal readability checks ([`ReadabilityError::ReadabilityCheckFailed`])
771 /// * JavaScript evaluation fails ([`ReadabilityError::JsEvaluation`])
772 pub fn parse_with_options(
773 &self,
774 html: &str,
775 base_url: Option<&str>,
776 options: Option<ReadabilityOptions>,
777 ) -> Result<Article> {
778 self.extract(html, base_url, options)
779 }
780
781 fn extract(
782 &self,
783 html: &str,
784 base_url: Option<&str>,
785 options: Option<ReadabilityOptions>,
786 ) -> Result<Article> {
787 let clean_base_url = match base_url {
788 None => None,
789 Some(url) => Some(Self::validate_base_url(url)?),
790 };
791 self.context.with(|ctx| {
792 let extract_fn: Function = ctx
793 .globals()
794 .get("extract")
795 .js_context("extract function not found")?;
796 let options_obj = match options {
797 None => None,
798 Some(options) => Some(options.build(ctx.clone())?),
799 };
800
801 let result: Value = extract_fn
802 .call((html, clean_base_url, options_obj))
803 .js_context("Failed to call extract")?;
804
805 // Check if result is an error object
806 if let Some(obj) = result.as_object()
807 && let Ok(error_type) = obj.get::<_, String>("errorType")
808 {
809 let error_msg = obj
810 .get::<_, String>("error")
811 .unwrap_or_else(|_| "Unknown error".to_string());
812
813 return Err(match error_type.as_str() {
814 "HtmlParseError" => ReadabilityError::HtmlParseError(error_msg),
815 "ExtractionError" => ReadabilityError::ExtractionError(error_msg),
816 "RuntimeError" => ReadabilityError::JsEvaluation {
817 context: format!("JavaScript runtime error: {}", error_msg),
818 source: rquickjs::Error::Unknown,
819 },
820 _ => ReadabilityError::ExtractionError(format!(
821 "Unknown error type '{}': {}",
822 error_type, error_msg
823 )),
824 });
825 }
826
827 // If not an error object, try to parse as Article
828 Article::try_from(result)
829 })
830 }
831}
832
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end smoke test: a small three-paragraph page is run through the
    /// private `extract` entry point and the resulting `Article` is inspected.
    #[test]
    fn test_basic_extraction() {
        let page = r#"
            <html>
            <head><title>Test Article Title</title></head>
            <body>
                <h1>This is a test article</h1>
                <p>This is the first paragraph with some content that should be long enough to be considered readable content by the readability algorithm.</p>
                <p>This is another paragraph with more content. It has enough text to make the article substantial and worth reading.</p>
                <p>And here's a third paragraph to make sure we have enough content for the readability parser to work with.</p>
            </body>
            </html>
        "#;

        let reader = Readability::new().unwrap();
        let parsed = reader
            .extract(page, Some("https://example.com"), None)
            .unwrap();

        // Title comes from the <title> element.
        assert_eq!(parsed.title, "Test Article Title");

        // Every paragraph survives in the HTML output, still wrapped in <p> tags.
        assert!(parsed.content.contains("first paragraph"));
        assert!(parsed.content.contains("another paragraph"));
        assert!(parsed.content.contains("third paragraph"));
        assert!(parsed.content.contains("<p>"));

        // The plain-text view keeps the heading text and contains no markup.
        assert!(parsed.text_content.contains("This is a test article"));
        assert!(!parsed.text_content.contains("<"));
        assert!(parsed.length > 0);
    }
}
865}