1use regex::Regex;
32use scraper::{Html, Selector, ElementRef};
33use serde::{Deserialize, Serialize};
34use std::collections::HashMap;
35use thiserror::Error;
36mod regexps;
39mod utils;
40
41pub use regexps::{
43 is_unlikely_candidate, has_positive_indicators, has_negative_indicators,
44 is_byline, is_video_url, is_whitespace, has_content, contains_ad_words, contains_loading_words,
45 is_extraneous_content, is_share_element, is_next_link, is_prev_link, is_hash_url,
46 is_b64_data_url, is_json_ld_article_type, replace_font_tags, normalize_whitespace,
47 tokenize_text, count_commas
48};
49
50pub use utils::{
51 to_absolute_uri, is_url, get_inner_text, get_char_count, is_phrasing_content,
52 is_single_image, is_node_visible, has_ancestor_tag, get_node_ancestors,
53 is_element_without_content, has_single_tag_inside_element, has_child_block_element,
54 should_clean_attribute, extract_text_content, word_count, is_title_candidate,
55 unescape_html_entities, clean_text, get_link_density
56};
57
/// Errors surfaced by the readability parser.
#[derive(Error, Debug)]
pub enum ReadabilityError {
    /// The input could not be interpreted as an HTML document.
    #[error("Invalid HTML document")]
    InvalidHtml,
    /// Parsing succeeded but no article content could be extracted.
    #[error("No content found")]
    NoContent,
    /// Any other parsing failure, with a human-readable description.
    #[error("Parsing failed: {0}")]
    ParseError(String),
}
68
/// Toggles for the individual cleanup passes (the flag bits of the
/// classic Readability algorithm).
#[derive(Debug, Clone, Copy)]
pub struct ReadabilityFlags {
    // Remove elements whose class/id look like boilerplate.
    pub strip_unlikelys: bool,
    // Score elements up/down based on class/id indicators.
    pub weight_classes: bool,
    // Conditional cleaning pass.
    // NOTE(review): not consulted anywhere in this file yet — confirm
    // intended usage.
    pub clean_conditionally: bool,
}
76
77impl Default for ReadabilityFlags {
78 fn default() -> Self {
79 Self {
80 strip_unlikelys: true,
81 weight_classes: true,
82 clean_conditionally: true,
83 }
84 }
85}
86
/// Configuration for [`Readability`] and [`is_probably_readerable`].
#[derive(Debug, Clone)]
pub struct ReadabilityOptions {
    // Emit progress information to stdout while parsing.
    pub debug: bool,
    // Abort `grab_article` when the document holds more elements than
    // this; `0` disables the limit.
    pub max_elems_to_parse: usize,
    // Number of top-scoring candidates to consider.
    // NOTE(review): not read by the scoring code in this file — confirm.
    pub nb_top_candidates: usize,
    // Minimum extracted-text length (bytes) for a parse to succeed.
    pub char_threshold: usize,
    // CSS classes that should survive cleaning.
    pub classes_to_preserve: Vec<String>,
    // Keep all classes instead of stripping them.
    pub keep_classes: bool,
    // Skip JSON-LD metadata extraction in `parse`.
    pub disable_json_ld: bool,
    // Extra pattern for video URLs that may be kept.
    // NOTE(review): not referenced in this file — confirm.
    pub allowed_video_regex: Option<Regex>,
    // Multiplier applied to link-density thresholds.
    // NOTE(review): not referenced in this file — confirm.
    pub link_density_modifier: f64,
    // Per-pass enable flags.
    pub flags: ReadabilityFlags,
}
111
112impl Default for ReadabilityOptions {
113 fn default() -> Self {
114 Self {
115 debug: false,
116 max_elems_to_parse: 0,
117 nb_top_candidates: 5,
118 char_threshold: 25, classes_to_preserve: Vec::new(),
120 keep_classes: false,
121 disable_json_ld: false,
122 allowed_video_regex: None,
123 link_density_modifier: 1.0,
124 flags: ReadabilityFlags::default(),
125 }
126 }
127}
128
/// The result of a successful parse: extracted content plus metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Article {
    // Document title (from <title> or a prominent <h1>).
    pub title: Option<String>,
    // Cleaned article HTML.
    pub content: Option<String>,
    // Plain-text rendering of the article.
    pub text_content: Option<String>,
    // Length of `text_content` in bytes.
    pub length: Option<usize>,
    // Meta description, or the first non-empty paragraph.
    pub excerpt: Option<String>,
    // Author attribution, from <meta name="author"> or a DOM byline.
    pub byline: Option<String>,
    // Text direction.
    // NOTE(review): never populated in this file — confirm.
    pub dir: Option<String>,
    // Site name, from <meta property="og:site_name">.
    pub site_name: Option<String>,
    // Document language, from <html lang="…">.
    pub lang: Option<String>,
    // Publication timestamp, from <meta property="article:published_time">.
    pub published_time: Option<String>,
    // Whether the document was judged readerable.
    pub readerable: Option<bool>,
}
145
/// Stateful article extractor over a parsed HTML document.
pub struct Readability {
    // Parsed DOM of the input HTML.
    document: Html,
    options: ReadabilityOptions,
    // Base URI for resolving relative links, when supplied.
    base_uri: Option<String>,
    // Fields below are populated as side effects of `parse`.
    article_title: Option<String>,
    article_byline: Option<String>,
    article_dir: Option<String>,
    article_site_name: Option<String>,
    // Raw key/value metadata harvested from <meta> tags and <html lang>.
    metadata: HashMap<String, String>,
}
157
158impl Readability {
159 pub fn new(html: &str, options: Option<ReadabilityOptions>) -> Result<Self, ReadabilityError> {
161 let document = Html::parse_document(html);
162 let options = options.unwrap_or_default();
163
164 Ok(Self {
165 document,
166 options,
167 base_uri: None,
168 article_title: None,
169 article_byline: None,
170 article_dir: None,
171 article_site_name: None,
172 metadata: HashMap::new(),
173 })
174 }
175
176 pub fn new_with_base_uri(html: &str, base_uri: &str, options: Option<ReadabilityOptions>) -> Result<Self, ReadabilityError> {
178 let mut parser = Self::new(html, options)?;
179 parser.base_uri = Some(base_uri.to_string());
180 Ok(parser)
181 }
182
    /// Run the full extraction pipeline and return the article, or `None`
    /// when no content container is found or the extracted text is shorter
    /// than `char_threshold`.
    pub fn parse(&mut self) -> Option<Article> {
        if self.options.debug {
            println!("Starting readability parsing...");
        }

        self.unwrap_noscript_images();

        if !self.options.disable_json_ld {
            self.extract_json_ld_metadata();
        }

        self.remove_scripts();

        self.prep_document();

        self.get_article_metadata();

        self.get_article_title();

        // Copied out up-front: `grab_article` hands back an ElementRef that
        // borrows `self` mutably, so these reads must happen first.
        let char_threshold = self.options.char_threshold;
        let debug = self.options.debug;
        let has_description = self.metadata.get("description").is_some();
        let description = self.metadata.get("description").cloned();

        let article_content = self.grab_article()?;
        let raw_content_html = article_content.inner_html();
        let text_content = get_inner_text(&article_content, true);

        // Excerpt: prefer the meta description; otherwise the first
        // non-empty paragraph of the article.
        let excerpt = if !has_description {
            let p_selector = Selector::parse("p").unwrap();
            article_content.select(&p_selector)
                .next()
                .map(|p| get_inner_text(&p, true))
                .filter(|text| !text.trim().is_empty())
        } else {
            description
        };

        // NOTE(review): cleaning runs before the length check below, so
        // documents that end up rejected still pay the cleaning cost.
        let content_html = self.clean_article_content(&raw_content_html);
        // NOTE(review): byte length, not character count — multibyte text
        // passes the threshold more easily than intended; confirm.
        let text_length = text_content.len();

        if text_length < char_threshold {
            if debug {
                println!("Content too short: {} chars (minimum: {})", text_length, char_threshold);
            }
            return None;
        }

        Some(Article {
            title: self.article_title.clone(),
            content: Some(content_html),
            text_content: Some(text_content),
            length: Some(text_length),
            excerpt,
            byline: self.article_byline.clone(),
            dir: self.article_dir.clone(),
            site_name: self.article_site_name.clone(),
            lang: self.metadata.get("lang").cloned(),
            published_time: self.metadata.get("publishedTime").cloned(),
            // NOTE(review): hardcoded — any successfully parsed article is
            // reported as readerable.
            readerable: Some(true),
        })
    }
257
258
259
    // Intentionally empty: script/style/noscript removal is routed through
    // `prep_document` → `remove_nodes_by_tag`, and the scraper DOM is not
    // mutated in this implementation yet.
    fn remove_scripts(&mut self) {
    }
264
265
266
267 fn get_article_metadata(&mut self) {
268 let meta_selector = Selector::parse("meta").unwrap();
270
271 for element in self.document.select(&meta_selector) {
272 if let Some(property) = element.value().attr("property") {
273 if let Some(content) = element.value().attr("content") {
274 self.metadata.insert(property.to_string(), content.to_string());
275
276 match property {
278 "og:site_name" => self.article_site_name = Some(content.to_string()),
279 "article:published_time" => {
280 self.metadata.insert("publishedTime".to_string(), content.to_string());
281 },
282 _ => {}
283 }
284 }
285 }
286 if let Some(name) = element.value().attr("name") {
287 if let Some(content) = element.value().attr("content") {
288 self.metadata.insert(name.to_string(), content.to_string());
289
290 match name {
292 "author" => self.article_byline = Some(content.to_string()),
293 _ => {}
294 }
295 }
296 }
297 }
298
299 self.extract_byline_from_dom();
301
302 if let Ok(html_selector) = Selector::parse("html") {
304 if let Some(html_element) = self.document.select(&html_selector).next() {
305 if let Some(lang) = html_element.value().attr("lang") {
306 self.metadata.insert("lang".to_string(), lang.to_string());
307 }
308 }
309 }
310 }
311
312 fn extract_byline_from_dom(&mut self) {
313 if self.article_byline.is_some() {
315 return;
316 }
317
318 let byline_selectors = [
320 ".byline",
321 ".author",
322 ".post-author",
323 ".article-author",
324 "[rel=\"author\"]",
325 ".by-author",
326 ".writer",
327 ];
328
329 for selector_str in &byline_selectors {
330 if let Ok(selector) = Selector::parse(selector_str) {
331 if let Some(element) = self.document.select(&selector).next() {
332 let byline_text = self.get_inner_text_from_ref(&element, false);
333 let cleaned_byline = byline_text.trim();
334
335 let cleaned_byline = cleaned_byline
337 .strip_prefix("By ")
338 .or_else(|| cleaned_byline.strip_prefix("by "))
339 .or_else(|| cleaned_byline.strip_prefix("BY "))
340 .or_else(|| cleaned_byline.strip_prefix("Author: "))
341 .or_else(|| cleaned_byline.strip_prefix("Written by "))
342 .unwrap_or(cleaned_byline);
343
344 if !cleaned_byline.is_empty() && cleaned_byline.len() < 100 {
345 self.article_byline = Some(cleaned_byline.to_string());
346 break;
347 }
348 }
349 }
350 }
351 }
352
353 fn get_article_title(&mut self) {
354 let title_selector = Selector::parse("title").unwrap();
355 if let Some(title_element) = self.document.select(&title_selector).next() {
356 self.article_title = Some(title_element.inner_html());
357 }
358
359 let h1_selector = Selector::parse("h1").unwrap();
361 for h1 in self.document.select(&h1_selector) {
362 let h1_text = self.get_inner_text_from_ref(&h1, false);
363 if h1_text.len() > 10 {
364 self.article_title = Some(h1_text);
365 break;
366 }
367 }
368 }
369
    /// Locate the element most likely to contain the main article body.
    ///
    /// Returns `None` when the document exceeds `max_elems_to_parse` or no
    /// suitable container is found.
    fn grab_article(&mut self) -> Option<ElementRef> {
        if self.options.debug {
            println!("**** grabArticle ****");
        }

        // Optional guard against pathologically large documents.
        if self.options.max_elems_to_parse > 0 {
            let all_elements: Vec<_> = self.document.select(&Selector::parse("*").unwrap()).collect();
            if all_elements.len() > self.options.max_elems_to_parse {
                return None;
            }
        }

        if self.options.flags.strip_unlikelys {
            self.remove_unlikely_candidates_from_dom();
        }

        self.remove_empty_paragraphs();

        let candidates = self.find_and_score_candidates();

        // No scored candidates: fall back to well-known content containers.
        if candidates.is_empty() {
            return self.fallback_content_selection();
        }

        if let Some(best_candidate) = self.select_best_candidate(&candidates) {
            let tag_name = best_candidate.value().name();
            let text_content = self.get_inner_text_from_ref(&best_candidate, true);

            // Re-find the chosen element by tag + identical text so the
            // returned ElementRef borrows `self.document` directly instead of
            // the local `candidates` vector.
            // NOTE(review): O(elements) per call, and the first element with
            // matching text wins — distinct elements with identical text
            // could alias. Confirm this is acceptable.
            let selector = Selector::parse(tag_name).unwrap();
            for element in self.document.select(&selector) {
                let element_text = self.get_inner_text_from_ref(&element, true);
                if element_text == text_content {
                    return Some(element);
                }
            }
        }

        None
    }
417
418
419
420 fn get_class_weight(&self, element: &ElementRef) -> f64 {
421 if !self.options.flags.weight_classes {
423 return 0.0;
424 }
425
426 let mut weight = 0.0;
427
428 if let Some(class_name) = element.value().attr("class") {
430 if has_negative_indicators(class_name) {
431 weight -= 25.0;
432 }
433 if has_positive_indicators(class_name) {
434 weight += 25.0;
435 }
436 }
437
438 if let Some(id) = element.value().attr("id") {
440 if has_negative_indicators(id) {
441 weight -= 25.0;
442 }
443 if has_positive_indicators(id) {
444 weight += 25.0;
445 }
446 }
447
448 weight
449 }
450
    /// Score text-bearing elements (`p`, `td`, `pre`) and propagate their
    /// scores to parents (full weight) and grandparents (half weight).
    /// Returns each candidate ancestor with its link-density-adjusted score.
    fn find_and_score_candidates(&self) -> Vec<(ElementRef, f64)> {
        let mut candidates = Vec::new();
        // Keyed by element pointer identity (see `get_element_id`).
        let mut candidate_map: HashMap<String, (ElementRef, f64)> = HashMap::new();

        let content_selector = Selector::parse("p, td, pre").unwrap();

        for element in self.document.select(&content_selector) {
            let text = get_inner_text(&element, true);
            let text_length = text.trim().len();

            // Too short to contribute meaningful signal.
            if text_length < 25 {
                continue;
            }

            // Collect (ancestor, level): parent = 1, grandparent = 2.
            let mut ancestors = Vec::new();
            if let Some(parent) = element.parent() {
                if let Some(parent_element) = ElementRef::wrap(parent) {
                    if self.options.flags.strip_unlikelys && self.is_unlikely_candidate(&parent_element) {
                        continue;
                    }
                    ancestors.push((parent_element, 1));

                    if let Some(grandparent) = parent.parent() {
                        if let Some(grandparent_element) = ElementRef::wrap(grandparent) {
                            // NOTE(review): an unlikely grandparent discards
                            // the whole element, including the parent's
                            // contribution pushed above — confirm intended.
                            if self.options.flags.strip_unlikelys && self.is_unlikely_candidate(&grandparent_element) {
                                continue;
                            }
                            ancestors.push((grandparent_element, 2));
                        }
                    }
                }
            }

            // First sighting of an ancestor: seed it with its tag/class prior.
            for (ancestor, _level) in &ancestors {
                let ancestor_id = self.get_element_id(ancestor);
                if !candidate_map.contains_key(&ancestor_id) {
                    let content_score = self.initialize_candidate_score(ancestor);
                    candidate_map.insert(ancestor_id, (*ancestor, content_score));
                }
            }

            // Per-element score: base point + one per comma + up to three
            // points for length (one per 100 chars).
            let mut content_score = 1.0;

            content_score += count_commas(&text) as f64;

            content_score += (text_length as f64 / 100.0).min(3.0);

            // Propagate: parents get the full score, grandparents half,
            // deeper levels (not produced above) would get 1/(3*level).
            for (ancestor, level) in &ancestors {
                let ancestor_id = self.get_element_id(ancestor);
                if let Some((_, current_score)) = candidate_map.get_mut(&ancestor_id) {
                    let score_divider = match level {
                        1 => 1.0,
                        2 => 2.0,
                        _ => (*level as f64) * 3.0,
                    };
                    *current_score += content_score / score_divider;
                }
            }
        }

        // Penalize link-heavy containers: scale by (1 - link density).
        for (_, (element, mut score)) in candidate_map {
            let link_density = get_link_density(&element);
            score *= 1.0 - link_density;
            candidates.push((element, score));
        }

        candidates
    }
529
530 fn is_unlikely_candidate(&self, element: &ElementRef) -> bool {
531 let tag_name = element.value().name();
532
533 if matches!(tag_name, "nav" | "aside" | "header" | "footer") {
535 return true;
536 }
537
538 if matches!(tag_name, "body" | "a" | "table" | "tbody" | "tr" | "td" | "th" | "article" | "section") {
540 return false;
541 }
542
543 let class_and_id = format!(
545 "{} {}",
546 element.value().attr("class").unwrap_or(""),
547 element.value().attr("id").unwrap_or("")
548 );
549
550 if is_unlikely_candidate(&class_and_id) && !has_positive_indicators(&class_and_id) {
552 return true;
553 }
554
555 if let Some(role) = element.value().attr("role") {
557 if matches!(role, "menu" | "menubar" | "complementary" | "navigation" | "alert" | "alertdialog" | "dialog") {
558 return true;
559 }
560 }
561
562 false
563 }
564
565 fn get_element_id(&self, element: &ElementRef) -> String {
566 format!("{:p}", element.value())
568 }
569
570 fn initialize_candidate_score(&self, element: &ElementRef) -> f64 {
571 let mut score = 1.0;
572
573 let tag_name = element.value().name().to_uppercase();
575 match tag_name.as_str() {
576 "DIV" => score += 5.0,
577 "PRE" | "TD" | "BLOCKQUOTE" => score += 3.0,
578 "ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => score -= 3.0,
579 "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => score -= 5.0,
580 _ => {},
581 }
582
583 score += self.get_class_weight(element);
585
586 score
587 }
588
589
590
591
592
    /// Pick the highest-scoring candidate, optionally promoting its parent
    /// when the parent holds substantially more text and scores nearly as
    /// well (content likely split across sibling containers).
    fn select_best_candidate<'a>(&self, candidates: &'a [(ElementRef<'a>, f64)]) -> Option<ElementRef<'a>> {
        if candidates.is_empty() {
            return None;
        }

        // Sort a copy descending by score; `partial_cmp` fallback keeps the
        // sort total even if a NaN score slips in.
        let mut sorted_candidates = candidates.to_vec();
        sorted_candidates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        let best_candidate = sorted_candidates[0].0;
        let best_score = sorted_candidates[0].1;

        if self.options.debug {
            println!("Best candidate score: {}", best_score);
        }

        // Parent promotion heuristic.
        if let Some(parent) = best_candidate.parent() {
            if let Some(parent_element) = ElementRef::wrap(parent) {
                // Never promote a parent that contains navigation-like chrome.
                let nav_selector = Selector::parse("nav, aside, header, footer, [class*='sidebar'], [class*='navigation']").unwrap();
                if parent_element.select(&nav_selector).next().is_some() {
                    if self.options.debug {
                        println!("Parent contains navigation elements, skipping");
                    }
                } else {
                    let parent_text_length = self.get_inner_text_from_ref(&parent_element, false).len();
                    let candidate_text_length = self.get_inner_text_from_ref(&best_candidate, false).len();

                    // Promote only when the parent has more than twice the
                    // candidate's text AND scores at least 75% of the best.
                    if parent_text_length > candidate_text_length * 2 {
                        let parent_score = self.calculate_candidate_score(&parent_element);
                        if parent_score > best_score * 0.75 {
                            if self.options.debug {
                                println!("Using parent element with score: {}", parent_score);
                            }
                            return Some(parent_element);
                        }
                    }
                }
            }
        }

        Some(best_candidate)
    }
640
641
642
643 fn calculate_candidate_score(&self, element: &ElementRef) -> f64 {
644 let text = get_inner_text(element, true);
645
646 if text.len() < 25 {
648 return 0.0;
649 }
650
651 let mut content_score = 0.0;
652
653 content_score += 1.0;
655
656 content_score += count_commas(&text) as f64;
658
659 content_score += (text.len() as f64 / 100.0).min(3.0);
661
662 content_score
663 }
664
665 fn fallback_content_selection(&self) -> Option<ElementRef> {
666 let selectors = ["article", "main", "#content", ".content", ".entry-content", "body"];
667
668 for selector_str in &selectors {
669 if let Ok(selector) = Selector::parse(selector_str) {
670 if let Some(element) = self.document.select(&selector).next() {
671 if self.options.debug {
672 println!("Found content using fallback selector: {}", selector_str);
673 }
674 return Some(element);
675 }
676 }
677 }
678
679 None
680 }
681
682 fn extract_json_ld_metadata(&mut self) {
683 let script_selector = Selector::parse("script[type='application/ld+json']").unwrap();
685
686 for element in self.document.select(&script_selector) {
687 let text = element.text().collect::<String>();
688 if text.contains("@type") && text.contains("Article") {
691 if self.options.debug {
693 println!("Found JSON-LD article metadata");
694 }
695 }
696 }
697 }
698
699
700
    // Placeholder: Mozilla Readability replaces <img> placeholders with the
    // real images hidden inside <noscript> fallbacks. The DOM is not
    // mutated here yet; the selector is built only to mirror the intended
    // implementation.
    fn unwrap_noscript_images(&mut self) {
        let _noscript_selector = Selector::parse("noscript").unwrap();
    }
706
    /// Run the document-preparation pipeline ahead of candidate scoring.
    ///
    /// NOTE(review): most helpers invoked below are currently
    /// debug-logging stubs (see their bodies); the call order mirrors the
    /// intended cleanup sequence.
    fn prep_document(&mut self) {
        if self.options.debug {
            println!("**** prepDocument ****");
        }

        // Drop non-content nodes first.
        self.remove_nodes_by_tag("script");
        self.remove_nodes_by_tag("style");
        self.remove_nodes_by_tag("noscript");

        if self.options.flags.strip_unlikelys {
            self.remove_unlikely_candidates_from_dom();
        }

        // Normalize presentation markup before paragraph conversion.
        self.replace_font_tags();

        self.replace_brs();

        self.unwrap_noscript_images();

        self.convert_divs_to_paragraphs();

        self.remove_empty_paragraphs();

        if self.options.debug {
            println!("Document preparation complete");
        }
    }
741
    // Stub: would drop nodes matching unlikely-candidate patterns. The DOM
    // is not mutated in this implementation; only logs under debug.
    fn remove_unlikely_candidates_from_dom(&mut self) {
        if self.options.debug {
            println!("Removing unlikely candidates from DOM");
        }
    }

    // Stub: would delete <p> elements with no visible content.
    fn remove_empty_paragraphs(&mut self) {
        if self.options.debug {
            println!("Removing empty paragraphs");
        }
    }

    // Stub: would remove every element with the given tag name.
    fn remove_nodes_by_tag(&mut self, tag_name: &str) {
        if self.options.debug {
            println!("Removing {} tags", tag_name);
        }
    }

    // Stub: would rewrite <font> elements as <span>.
    fn replace_font_tags(&mut self) {
        if self.options.debug {
            println!("Replacing font tags with span tags");
        }
    }

    // Stub: would convert runs of <br> into paragraph breaks.
    fn replace_brs(&mut self) {
        if self.options.debug {
            println!("Converting <br> sequences to paragraphs");
        }
    }

    // Stub: would turn content-only <div>s into <p> elements.
    fn convert_divs_to_paragraphs(&mut self) {
        if self.options.debug {
            println!("Converting appropriate DIVs to paragraphs");
        }
    }
788
789 fn clean_article_content(&self, content: &str) -> String {
790 if self.options.debug {
791 println!("Cleaning article content");
792 }
793
794 let mut cleaned_content = content.to_string();
795
796 if self.options.debug {
797 println!("Original content before cleaning: {}", cleaned_content);
798 }
799
800 let unwanted_patterns = [
802 r"(?s)<nav[^>]*>.*?</nav>",
803 r"(?s)<aside[^>]*>.*?</aside>",
804 r"(?s)<header[^>]*>.*?</header>",
805 r"(?s)<footer[^>]*>.*?</footer>",
806 r#"(?s)<div[^>]*class=["'][^"']*sidebar[^"']*["'][^>]*>.*?</div>"#,
807 r#"(?s)<div[^>]*class=["'][^"']*navigation[^"']*["'][^>]*>.*?</div>"#,
808 ];
809
810 for pattern in &unwanted_patterns {
811 let re = regex::Regex::new(pattern).unwrap();
812 cleaned_content = re.replace_all(&cleaned_content, "").to_string();
813 }
814
815 let re_whitespace = regex::Regex::new(r"\s{2,}").unwrap();
817 cleaned_content = re_whitespace.replace_all(&cleaned_content, " ").to_string();
818
819 cleaned_content.trim().to_string()
820 }
821
822
823
824 fn get_inner_text_from_ref(&self, element: &ElementRef, normalize_spaces: bool) -> String {
825 let text = element.text().collect::<Vec<_>>().join(" ");
826 if normalize_spaces {
827 let re = Regex::new(r"\s+").unwrap();
828 re.replace_all(&text, " ").trim().to_string()
829 } else {
830 text
831 }
832 }
833}
834
835pub fn is_probably_readerable(html: &str, options: Option<ReadabilityOptions>) -> bool {
837 let document = Html::parse_document(html);
838 let opts = options.unwrap_or_default();
839
840 let min_content_length = if opts.char_threshold > 0 {
842 opts.char_threshold
843 } else {
844 140 };
846
847 let min_score = if min_content_length <= 20 {
849 8.0 } else if min_content_length <= 50 {
851 20.0 } else if min_content_length <= 100 {
853 30.0 } else {
855 40.0 };
857
858 let content_selectors = ["p", "pre", "article", "div"];
860 let mut score = 0.0;
861 let mut total_text_length = 0;
862
863 for selector_str in &content_selectors {
864 if let Ok(selector) = Selector::parse(selector_str) {
865 for element in document.select(&selector) {
866 let text_content = element.text().collect::<String>();
867 let text_length = text_content.trim().len();
868
869 if text_length < 10 { continue;
871 }
872
873 total_text_length += text_length;
874
875 let class_and_id = format!("{} {}",
877 element.value().attr("class").unwrap_or(""),
878 element.value().attr("id").unwrap_or("")
879 );
880
881 if is_unlikely_candidate(&class_and_id) {
882 score -= 5.0; continue;
884 }
885
886 let element_score = match element.value().name() {
888 "article" => (text_length as f64 * 0.5).min(30.0),
889 "p" => (text_length as f64 * 0.3).min(20.0),
890 "pre" => (text_length as f64 * 0.4).min(25.0),
891 "div" => {
892 if min_content_length <= 50 && text_length > 20 {
894 (text_length as f64 * 0.25).min(15.0)
895 } else if text_length > 80 {
896 (text_length as f64 * 0.2).min(15.0)
897 } else {
898 0.0
899 }
900 },
901 _ => 0.0,
902 };
903
904 score += element_score;
905
906 if score > min_score && total_text_length >= min_content_length {
908 return true;
909 }
910 }
911 }
912 }
913
914 score > min_score && total_text_length >= min_content_length
916}
917
918#[cfg(test)]
919mod tests {
920 use super::*;
921 use std::fs;
922 use std::path::Path;
923 use serde_json;
924
    // Test helper: parser with debug logging and the 25-byte threshold
    // used throughout these tests.
    fn create_parser(html: &str) -> Readability {
        Readability::new(html, Some(ReadabilityOptions {
            debug: true,
            char_threshold: 25,
            ..Default::default()
        })).unwrap()
    }

    // Test helper: parser with caller-supplied options.
    fn create_parser_with_options(html: &str, options: ReadabilityOptions) -> Readability {
        Readability::new(html, Some(options)).unwrap()
    }
938
    // Load one Mozilla fixture: (source HTML, expected HTML, expected
    // metadata JSON). Error messages identify both the file and test dir.
    // NOTE(review): the directory name is spelled "mozzila" — confirm it
    // matches the actual checkout path.
    fn load_test_case(test_dir: &str) -> Result<(String, String, serde_json::Value), Box<dyn std::error::Error>> {
        let base_path = Path::new("mozzila-readability/test/test-pages").join(test_dir);

        let source_path = base_path.join("source.html");
        let expected_content_path = base_path.join("expected.html");
        let expected_metadata_path = base_path.join("expected-metadata.json");

        let source = fs::read_to_string(&source_path)
            .map_err(|e| format!("Failed to read source.html for {}: {}", test_dir, e))?;
        let expected_content = fs::read_to_string(&expected_content_path)
            .map_err(|e| format!("Failed to read expected.html for {}: {}", test_dir, e))?;
        let expected_metadata: serde_json::Value = serde_json::from_str(
            &fs::read_to_string(&expected_metadata_path)
                .map_err(|e| format!("Failed to read expected-metadata.json for {}: {}", test_dir, e))?
        ).map_err(|e| format!("Failed to parse expected-metadata.json for {}: {}", test_dir, e))?;

        Ok((source, expected_content, expected_metadata))
    }
958
959 fn get_test_case_dirs() -> Vec<String> {
961 let test_pages_path = Path::new("mozzila-readability/test/test-pages");
962
963 if !test_pages_path.exists() {
964 println!("Warning: Mozilla test pages directory not found at {:?}", test_pages_path);
965 return Vec::new();
966 }
967
968 let mut dirs = Vec::new();
969 if let Ok(entries) = fs::read_dir(test_pages_path) {
970 for entry in entries {
971 if let Ok(entry) = entry {
972 if entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
973 if let Some(name) = entry.file_name().to_str() {
974 dirs.push(name.to_string());
975 }
976 }
977 }
978 }
979 }
980
981 dirs.sort();
982 dirs
983 }
984
    // Run one Mozilla fixture through the parser and compare against the
    // expected metadata. Mismatches are reported via println! (soft
    // failures); only the `readerable` flag is hard-asserted, so partial
    // implementations still produce useful diagnostics.
    fn test_mozilla_case(test_dir: &str) {
        let (source, _expected_content, expected_metadata) = match load_test_case(test_dir) {
            Ok(data) => data,
            Err(e) => {
                println!("Skipping test case {}: {}", test_dir, e);
                return;
            }
        };

        // Same fake base URI the Mozilla test harness uses.
        let base_uri = "http://fakehost/test/page.html";
        let mut parser = match Readability::new_with_base_uri(&source, base_uri, Some(ReadabilityOptions {
            debug: false,
            char_threshold: 25,
            classes_to_preserve: vec!["caption".to_string()],
            ..Default::default()
        })) {
            Ok(p) => p,
            Err(e) => {
                println!("Failed to create parser for {}: {:?}", test_dir, e);
                return;
            }
        };

        let is_readerable = is_probably_readerable(&source, Some(ReadabilityOptions {
            char_threshold: 25,
            ..Default::default()
        }));

        let expected_readerable = expected_metadata["readerable"].as_bool().unwrap_or(false);

        if expected_readerable && !is_readerable {
            println!("Warning: {} expected to be readerable but failed readerable check", test_dir);
        }

        let article = parser.parse();

        if expected_readerable {
            if let Some(article) = article {
                // Title comparison is fuzzy (either may contain the other,
                // e.g. site-name suffixes).
                if let Some(expected_title) = expected_metadata["title"].as_str() {
                    if let Some(actual_title) = &article.title {
                        if !actual_title.contains(expected_title) && !expected_title.contains(actual_title) {
                            println!("Title mismatch in {}: expected '{}', got '{}'",
                                test_dir, expected_title, actual_title);
                        }
                    } else {
                        println!("Missing title in {}: expected '{}'", test_dir, expected_title);
                    }
                }

                // Byline, language, site name, and published time are
                // compared exactly.
                if let Some(expected_byline) = expected_metadata["byline"].as_str() {
                    if let Some(actual_byline) = &article.byline {
                        if actual_byline != expected_byline {
                            println!("Byline mismatch in {}: expected '{}', got '{}'",
                                test_dir, expected_byline, actual_byline);
                        }
                    } else {
                        println!("Missing byline in {}: expected '{}'", test_dir, expected_byline);
                    }
                }

                if let Some(expected_lang) = expected_metadata["lang"].as_str() {
                    if let Some(actual_lang) = &article.lang {
                        if actual_lang != expected_lang {
                            println!("Language mismatch in {}: expected '{}', got '{}'",
                                test_dir, expected_lang, actual_lang);
                        }
                    } else {
                        println!("Missing language in {}: expected '{}'", test_dir, expected_lang);
                    }
                }

                if let Some(expected_site_name) = expected_metadata["siteName"].as_str() {
                    if let Some(actual_site_name) = &article.site_name {
                        if actual_site_name != expected_site_name {
                            println!("Site name mismatch in {}: expected '{}', got '{}'",
                                test_dir, expected_site_name, actual_site_name);
                        }
                    } else {
                        println!("Missing site name in {}: expected '{}'", test_dir, expected_site_name);
                    }
                }

                if let Some(expected_published_time) = expected_metadata["publishedTime"].as_str() {
                    if let Some(actual_published_time) = &article.published_time {
                        if actual_published_time != expected_published_time {
                            println!("Published time mismatch in {}: expected '{}', got '{}'",
                                test_dir, expected_published_time, actual_published_time);
                        }
                    } else {
                        println!("Missing published time in {}: expected '{}'", test_dir, expected_published_time);
                    }
                }

                if let Some(content) = &article.content {
                    if content.trim().is_empty() {
                        println!("Empty content in {}", test_dir);
                    }
                } else {
                    println!("Missing content in {}", test_dir);
                }

                assert_eq!(article.readerable, Some(true), "Article should be marked as readerable for {}", test_dir);
            } else {
                println!("Failed to parse article for {} (expected to be readerable)", test_dir);
            }
        } else {
            if article.is_some() {
                println!("Unexpectedly parsed article for {} (expected not readerable)", test_dir);
            }
        }
    }
1106
    // Pin the documented defaults of ReadabilityOptions.
    #[test]
    fn test_readability_options_default() {
        let options = ReadabilityOptions::default();
        assert!(!options.debug);
        assert_eq!(options.max_elems_to_parse, 0);
        assert_eq!(options.nb_top_candidates, 5);
        assert_eq!(options.char_threshold, 25);
        assert!(!options.keep_classes);
        assert!(!options.disable_json_ld);
    }

    // Article is a plain data struct; verify fields round-trip.
    #[test]
    fn test_article_creation() {
        let article = Article {
            title: Some("Test Title".to_string()),
            content: Some("<div>Test content</div>".to_string()),
            text_content: Some("Test content".to_string()),
            length: Some(12),
            excerpt: Some("Test excerpt".to_string()),
            byline: Some("Test Author".to_string()),
            readerable: Some(true),
            dir: None,
            site_name: Some("Test Site".to_string()),
            lang: Some("en".to_string()),
            published_time: None,
        };

        assert_eq!(article.title.unwrap(), "Test Title");
        assert_eq!(article.length.unwrap(), 12);
        assert!(article.excerpt.is_some());
    }
1138
    // End-to-end happy path: a titled document with an <article> of three
    // substantial paragraphs parses successfully.
    #[test]
    fn test_simple_article_parsing() {
        let html = r#"
        <!DOCTYPE html>
        <html>
        <head>
            <title>Test Article</title>
            <meta name="author" content="John Doe">
            <meta name="description" content="This is a test article">
        </head>
        <body>
            <h1>Test Article Title</h1>
            <article>
                <p>This is the first paragraph of our test article. It contains enough content to be considered readable.</p>
                <p>This is the second paragraph with more content. It helps ensure the article meets the minimum length requirements for processing.</p>
                <p>A third paragraph to add more substance to our test article and make it comprehensive enough for testing.</p>
            </article>
        </body>
        </html>
        "#;

        let mut options = ReadabilityOptions::default();
        options.debug = true;
        let mut parser = create_parser_with_options(html, options);
        let result = parser.parse();

        assert!(result.is_some());
        let article = result.unwrap();
        assert!(article.title.is_some() && !article.title.as_ref().unwrap().is_empty());
        assert!(article.content.is_some());
        assert!(article.length.is_some() && article.length.unwrap() > 100);
    }

    // A body with no text must yield no article.
    #[test]
    fn test_empty_document() {
        let html = "<html><body></body></html>";
        let mut options = ReadabilityOptions::default();
        options.debug = true;
        let mut parser = create_parser_with_options(html, options);
        let result = parser.parse();

        assert!(result.is_none());
    }

    // Content below the 25-byte threshold must be rejected.
    #[test]
    fn test_minimal_content() {
        let html = r#"
        <html>
        <body>
            <p>Short</p>
        </body>
        </html>
        "#;

        let mut options = ReadabilityOptions::default();
        options.debug = true;
        let mut parser = create_parser_with_options(html, options);
        let result = parser.parse();

        assert!(result.is_none());
    }
1202
    // Metadata extraction: author/description <meta> tags, og:site_name,
    // <html lang>, and a DOM byline should all surface on the Article.
    #[test]
    fn test_article_with_metadata() {
        let html = r#"
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <title>Test Article - Test Site</title>
            <meta name="author" content="Jane Smith">
            <meta name="description" content="A comprehensive test article for readability testing">
            <meta property="og:site_name" content="Test Publishing">
            <meta property="og:title" content="Test Article">
        </head>
        <body>
            <article>
                <h1>Test Article Title</h1>
                <div class="byline">By Jane Smith</div>
                <p>This is a comprehensive test article with enough content to be considered readable by the parser.</p>
                <p>The article contains multiple paragraphs with substantial text content that should pass all readability checks.</p>
                <p>Additional content to ensure the article meets minimum length requirements and provides meaningful extractable content.</p>
                <p>More content to test the parsing and extraction capabilities of the readability implementation.</p>
            </article>
        </body>
        </html>
        "#;

        let mut parser = create_parser(html);
        let result = parser.parse();

        assert!(result.is_some());
        let article = result.unwrap();

        assert!(article.title.is_some() && !article.title.as_ref().unwrap().is_empty());
        assert!(article.byline.is_some());
        assert!(article.site_name.is_some());
        assert!(article.lang.is_some());
        assert_eq!(article.lang.as_ref().unwrap(), "en");
        assert!(article.length.is_some() && article.length.unwrap() > 200);
    }
1241
    // A multi-paragraph <article> passes the pre-check; a document of pure
    // chrome (nav/footer) does not.
    #[test]
    fn test_is_probably_readerable_basic() {
        let readable_html = r#"
        <html>
        <body>
            <article>
                <h1>Long Article Title</h1>
                <p>This is a long article with substantial content that should be considered readable.</p>
                <p>Multiple paragraphs with enough text to meet the readability thresholds.</p>
                <p>Additional content to ensure this passes the readability checks.</p>
                <p>Even more content to make sure this document is substantial enough.</p>
            </article>
        </body>
        </html>
        "#;

        assert!(is_probably_readerable(readable_html, None));

        let unreadable_html = r#"
        <html>
        <body>
            <nav>Menu</nav>
            <footer>Copyright</footer>
        </body>
        </html>
        "#;

        assert!(!is_probably_readerable(unreadable_html, None));
    }
1273
1274 #[test]
1275 fn test_is_probably_readerable_with_options() {
1276 let html = r#"
1277 <html>
1278 <body>
1279 <p>Medium length content that is somewhat substantial.</p>
1280 </body>
1281 </html>
1282 "#;
1283
1284 assert!(!is_probably_readerable(html, None));
1286
1287 let lenient_options = ReadabilityOptions {
1289 char_threshold: 20,
1290 ..Default::default()
1291 };
1292 assert!(is_probably_readerable(html, Some(lenient_options)));
1293 }
1294
1295 #[test]
1296 fn test_parser_creation() {
1297 let html = "<html><body><p>Test content</p></body></html>";
1298 let parser = Readability::new(html, None);
1299 assert!(parser.is_ok());
1300 }
1301
1302 #[test]
1303 fn test_parser_with_options() {
1304 let html = "<html><body><p>Test content</p></body></html>";
1305 let options = ReadabilityOptions {
1306 debug: true,
1307 char_threshold: 100,
1308 ..Default::default()
1309 };
1310 let parser = Readability::new(html, Some(options));
1311 assert!(parser.is_ok());
1312 }
1313
1314 #[test]
1315 fn test_unicode_handling() {
1316 let unicode_html = r#"
1317 <!DOCTYPE html>
1318 <html lang="zh">
1319 <head>
1320 <title>测试文章</title>
1321 <meta charset="UTF-8">
1322 </head>
1323 <body>
1324 <article>
1325 <h1>Unicode Content Test</h1>
1326 <p>This article contains unicode characters: 测试 🚀 ñáéíóú àèìòù</p>
1327 <p>Emoji support test: 😀 🎉 🌟 💻 📚</p>
1328 <p>Various languages: English, Español, Français, 中文, 日本語, العربية</p>
1329 <p>Special characters: ™ © ® € £ ¥ § ¶ † ‡ • … ‰ ′ ″ ‹ › « » " " ' '</p>
1330 </article>
1331 </body>
1332 </html>
1333 "#;
1334
1335 let mut parser = create_parser(unicode_html);
1336 let result = parser.parse();
1337
1338 assert!(result.is_some());
1339 let article = result.unwrap();
1340
1341 assert!(article.title.is_some());
1343 assert!(article.text_content.is_some());
1344 }
1345
1346 #[test]
1347 fn test_malformed_html_handling() {
1348 let malformed_html = r#"
1349 <html>
1350 <head>
1351 <title>Malformed HTML Test</title>
1352 </head>
1353 <body>
1354 <article>
1355 <h1>Test Article</h1>
1356 <p>This is a test article with malformed HTML that contains substantial content to meet the minimum character threshold. The article discusses various aspects of HTML parsing and how robust parsers should handle malformed markup gracefully without failing completely.</p>
1357 <p>Missing closing tags and other issues are common in real-world HTML documents. A good readability parser should be able to extract meaningful content even when the HTML structure is not perfect. This includes handling unclosed tags, missing attributes, and other structural problems.</p>
1358 <div>Unclosed div with more content to ensure we meet the character requirements for successful parsing.</div>
1359 </article>
1360 </body>
1361 </html>
1362 "#;
1363
1364 let options = ReadabilityOptions {
1366 char_threshold: 50, debug: true,
1368 ..Default::default()
1369 };
1370 let mut parser = Readability::new(malformed_html, Some(options)).unwrap();
1371 let article = parser.parse();
1372
1373 assert!(article.is_some());
1375 let article = article.unwrap();
1376 assert!(article.title.is_some());
1377 assert_eq!(article.title.unwrap(), "Test Article");
1379 }
1380
1381 #[test]
1382 fn test_mozilla_test_case_001() {
1383 let html = r#"
1385 <!DOCTYPE html>
1386 <html class="no-js" lang="en">
1387 <head>
1388 <meta charset="utf-8"/>
1389 <title>Get your Frontend JavaScript Code Covered | Code | Nicolas Perriault</title>
1390 <meta name="description" content="Nicolas Perriault's homepage."/>
1391 <meta name="author" content="Nicolas Perriault"/>
1392 </head>
1393 <body>
1394 <div class="container">
1395 <article>
1396 <h1>Get your Frontend JavaScript Code Covered</h1>
1397 <p>This is the main content of the article about JavaScript code coverage.</p>
1398 <p>It contains multiple paragraphs with substantial content that should be extracted.</p>
1399 <p>The readability algorithm should identify this as the main content area.</p>
1400 </article>
1401 <nav class="sidebar">
1402 <ul>
1403 <li><a href="/">Home</a></li>
1404 <li><a href="/about">About</a></li>
1405 </ul>
1406 </nav>
1407 </div>
1408 </body>
1409 </html>
1410 "#;
1411
1412 let mut parser = create_parser(html);
1413 let article = parser.parse();
1414
1415 assert!(article.is_some());
1416 let article = article.unwrap();
1417
1418 assert!(article.title.is_some());
1420 assert!(article.title.as_ref().unwrap().contains("Get your Frontend JavaScript Code Covered"));
1421 assert_eq!(article.byline, Some("Nicolas Perriault".to_string()));
1422 assert_eq!(article.lang, Some("en".to_string()));
1423 assert_eq!(article.excerpt, Some("Nicolas Perriault's homepage.".to_string()));
1424
1425 assert!(article.content.is_some());
1427 let content = article.content.unwrap();
1428 println!("Extracted content: {}", content);
1429 assert!(content.contains("main content of the article"));
1430 assert!(content.contains("JavaScript code coverage"));
1431
1432 assert!(!content.contains("sidebar"));
1434 assert!(!content.contains("Home"));
1435 assert!(!content.contains("About"));
1436 }
1437
1438 #[test]
1439 fn test_mozilla_test_case_wikipedia() {
1440 let html = r#"
1442 <!DOCTYPE html>
1443 <html lang="en">
1444 <head>
1445 <title>Mozilla - Wikipedia</title>
1446 <meta name="description" content="Mozilla is a free software community founded in 1998."/>
1447 </head>
1448 <body>
1449 <div id="content">
1450 <h1>Mozilla</h1>
1451 <p><strong>Mozilla</strong> is a free software community founded in 1998.</p>
1452 <p>Mozilla Firefox is a web browser developed by Mozilla.</p>
1453 <h2>History</h2>
1454 <p>Mozilla was founded in 1998 when Netscape Communications Corporation released the source code for its flagship Netscape Communicator product.</p>
1455 <p>The Mozilla project was created to coordinate the development of the Mozilla Application Suite.</p>
1456 <h2>Products</h2>
1457 <h3>Firefox</h3>
1458 <p>Firefox is a free and open-source web browser developed by Mozilla Foundation.</p>
1459 <h3>Thunderbird</h3>
1460 <p>Thunderbird is a free and open-source email client developed by Mozilla Foundation.</p>
1461 </div>
1462 <div id="navigation">
1463 <ul>
1464 <li><a href="/wiki/Main_Page">Main page</a></li>
1465 <li><a href="/wiki/Special:Random">Random article</a></li>
1466 </ul>
1467 </div>
1468 </body>
1469 </html>
1470 "#;
1471
1472 let mut parser = create_parser(html);
1473 let article = parser.parse();
1474
1475 assert!(article.is_some());
1476 let article = article.unwrap();
1477
1478 assert!(article.title.is_some());
1480 assert!(article.title.as_ref().unwrap().contains("Mozilla"));
1481
1482 assert!(article.content.is_some());
1484 let content = article.content.unwrap();
1485 assert!(content.contains("free software community"));
1486 assert!(content.contains("Firefox"));
1487 assert!(content.contains("Thunderbird"));
1488 assert!(content.contains("History"));
1489 assert!(content.contains("Products"));
1490
1491 assert!(!content.contains("Main page"));
1493 assert!(!content.contains("Random article"));
1494 }
1495
1496 #[test]
1497 fn test_content_scoring_algorithm() {
1498 let html = r#"
1500 <!DOCTYPE html>
1501 <html>
1502 <head>
1503 <title>Content Scoring Test</title>
1504 </head>
1505 <body>
1506 <div class="advertisement">
1507 <p>This is an advertisement that should be filtered out.</p>
1508 </div>
1509 <article class="main-content">
1510 <h1>Main Article Title</h1>
1511 <p>This is the main article content with substantial text. It contains multiple sentences and should be scored highly by the readability algorithm. The content is meaningful and provides value to readers.</p>
1512 <p>Another paragraph with more substantial content. This paragraph also contains commas, which should increase the content score according to Mozilla's algorithm.</p>
1513 <p>A third paragraph to ensure we have enough content for proper scoring.</p>
1514 </article>
1515 <div class="sidebar">
1516 <p>Short sidebar text.</p>
1517 </div>
1518 <footer>
1519 <p>Copyright notice and other footer content.</p>
1520 </footer>
1521 </body>
1522 </html>
1523 "#;
1524
1525 let mut parser = create_parser(html);
1526 let article = parser.parse();
1527
1528 assert!(article.is_some());
1529 let article = article.unwrap();
1530
1531 assert!(article.content.is_some());
1533 let content = article.content.unwrap();
1534
1535 assert!(content.contains("main article content"));
1537 assert!(content.contains("substantial text"));
1538 assert!(content.contains("commas, which should increase"));
1539
1540 assert!(!content.contains("advertisement"));
1542 assert!(!content.contains("Short sidebar"));
1543 assert!(!content.contains("Copyright notice"));
1544 }
1545
1546 #[test]
1547 fn test_metadata_extraction_comprehensive() {
1548 let html = r#"
1550 <!DOCTYPE html>
1551 <html lang="en-US">
1552 <head>
1553 <title>Comprehensive Metadata Test Article</title>
1554 <meta name="author" content="John Doe">
1555 <meta name="description" content="A comprehensive test of metadata extraction capabilities.">
1556 <meta property="og:title" content="OG Title Override">
1557 <meta property="og:description" content="Open Graph description.">
1558 <meta property="og:site_name" content="Test Site">
1559 <meta property="article:published_time" content="2023-01-15T10:30:00Z">
1560 <meta name="twitter:title" content="Twitter Title">
1561 <meta name="twitter:description" content="Twitter description.">
1562 <script type="application/ld+json">
1563 {
1564 "@context": "https://schema.org",
1565 "@type": "Article",
1566 "headline": "JSON-LD Headline",
1567 "author": {
1568 "@type": "Person",
1569 "name": "Jane Smith"
1570 },
1571 "datePublished": "2023-01-15"
1572 }
1573 </script>
1574 </head>
1575 <body>
1576 <article>
1577 <header>
1578 <h1>Article Title</h1>
1579 <p class="byline">By <span class="author">Article Author</span></p>
1580 <time datetime="2023-01-15">January 15, 2023</time>
1581 </header>
1582 <div class="content">
1583 <p>This is the main article content for testing metadata extraction capabilities in our readability parser. The article demonstrates how various metadata formats can be parsed and extracted from HTML documents, including Open Graph tags, Twitter Card metadata, and JSON-LD structured data.</p>
1584 <p>The article contains substantial content to ensure proper parsing and meets the minimum character threshold required by the readability algorithm. This comprehensive test validates that our parser can handle multiple metadata sources and prioritize them correctly according to the Mozilla Readability specification.</p>
1585 <p>Additional content is provided here to ensure we have enough text for the parser to consider this a valid article worth extracting. The metadata extraction process should work seamlessly with content extraction to provide a complete article parsing solution.</p>
1586 </div>
1587 </article>
1588 </body>
1589 </html>
1590 "#;
1591
1592 let mut parser = create_parser(html);
1593 let article = parser.parse();
1594
1595 assert!(article.is_some());
1596 let article = article.unwrap();
1597
1598 assert!(article.title.is_some());
1600 assert!(article.byline.is_some());
1601 assert_eq!(article.lang, Some("en-US".to_string()));
1602 assert!(article.excerpt.is_some());
1603 assert!(article.site_name.is_some());
1604 assert!(article.published_time.is_some());
1605
1606 assert!(article.content.is_some());
1608 let content = article.content.unwrap();
1609 assert!(content.contains("main article content"));
1610 assert!(content.contains("metadata extraction"));
1611 }
1612
1613 #[test]
1614 fn test_readability_assessment() {
1615 let readable_html = r#"
1617 <!DOCTYPE html>
1618 <html>
1619 <head><title>Readable Article</title></head>
1620 <body>
1621 <article>
1622 <h1>This is a readable article</h1>
1623 <p>This article contains substantial content that makes it worth reading. It has multiple paragraphs with meaningful text that provides value to the reader.</p>
1624 <p>The content is well-structured and contains enough text to be considered readable by the algorithm.</p>
1625 <p>Additional paragraphs ensure that there is sufficient content for proper assessment.</p>
1626 </article>
1627 </body>
1628 </html>
1629 "#;
1630
1631 let unreadable_html = r#"
1632 <!DOCTYPE html>
1633 <html>
1634 <head><title>Unreadable Page</title></head>
1635 <body>
1636 <div class="navigation">
1637 <a href="/home">Home</a>
1638 <a href="/about">About</a>
1639 </div>
1640 <p>Short text.</p>
1641 <footer>Footer content</footer>
1642 </body>
1643 </html>
1644 "#;
1645
1646 assert!(is_probably_readerable(readable_html, None));
1648
1649 assert!(!is_probably_readerable(unreadable_html, None));
1651 }
1652
1653 #[test]
1654 fn test_cli_integration() {
1655 let html = r#"
1657 <!DOCTYPE html>
1658 <html>
1659 <head>
1660 <title>CLI Integration Test</title>
1661 <meta name="author" content="CLI Tester">
1662 </head>
1663 <body>
1664 <main>
1665 <h1>CLI Integration Test Article</h1>
1666 <p>This article tests the integration between the library and CLI usage patterns. The CLI tool should be able to parse HTML documents and extract readable content in various output formats including JSON, plain text, and HTML.</p>
1667 <p>It should be parseable and return structured data suitable for JSON output. The parser needs to handle various input sources like files, URLs, and stdin, while providing comprehensive metadata extraction and content cleaning capabilities.</p>
1668 <p>The CLI integration test ensures that all the core functionality works correctly when invoked from command-line tools, maintaining compatibility with the original Mozilla Readability library while providing additional Rust-specific features and performance improvements.</p>
1669 </main>
1670 </body>
1671 </html>
1672 "#;
1673
1674 let mut parser = create_parser(html);
1675 let article = parser.parse();
1676
1677 assert!(article.is_some());
1678 let article = article.unwrap();
1679
1680 assert!(article.title.is_some());
1682 assert!(article.content.is_some());
1683 assert!(article.text_content.is_some());
1684 assert!(article.length.is_some());
1685 assert!(article.byline.is_some());
1686
1687 let json_result = serde_json::to_string(&article);
1689 assert!(json_result.is_ok());
1690
1691 let json_str = json_result.unwrap();
1692 assert!(json_str.contains("CLI Integration Test"));
1693 assert!(json_str.contains("CLI Tester"));
1694 }
1695
1696 #[test]
1697 fn test_mozilla_test_cases_sample() {
1698 let test_cases = vec![
1700 "001",
1701 "002",
1702 "basic-tags-cleaning",
1703 "003-metadata-preferred",
1704 "article-author-tag"
1705 ];
1706
1707 for test_case in test_cases {
1708 println!("Testing Mozilla case: {}", test_case);
1709 test_mozilla_case(test_case);
1710 }
1711 }
1712
1713 #[test]
1714 fn test_all_mozilla_test_cases() {
1715 let test_dirs = get_test_case_dirs();
1717
1718 if test_dirs.is_empty() {
1719 println!("No Mozilla test cases found - skipping comprehensive test");
1720 return;
1721 }
1722
1723 println!("Running {} Mozilla test cases", test_dirs.len());
1724
1725 let mut passed = 0;
1726 let mut failed = 0;
1727
1728 for test_dir in &test_dirs {
1729 println!("Testing: {}", test_dir);
1730
1731 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
1733 test_mozilla_case(test_dir);
1734 }));
1735
1736 match result {
1737 Ok(_) => {
1738 passed += 1;
1739 println!("✓ {}", test_dir);
1740 },
1741 Err(e) => {
1742 failed += 1;
1743 println!("✗ {} - {:?}", test_dir, e);
1744 }
1745 }
1746 }
1747
1748 println!("\nMozilla test results: {} passed, {} failed", passed, failed);
1749
1750 }
1753
1754 #[test]
1755 fn test_mozilla_metadata_extraction() {
1756 let test_cases = vec![
1758 ("003-metadata-preferred", "Dublin Core property title", Some("Dublin Core property author")),
1759 ("article-author-tag", "The Deck of Cards That Made Tarot A Global Phenomenon", Some("Laura June Topolsky")),
1760 ];
1761
1762 for (test_dir, expected_title, expected_byline) in test_cases {
1763 if let Ok((source, _, expected_metadata)) = load_test_case(test_dir) {
1764 let mut parser = Readability::new_with_base_uri(&source, "http://fakehost/test/page.html", Some(ReadabilityOptions {
1765 debug: false,
1766 char_threshold: 25,
1767 ..Default::default()
1768 })).unwrap();
1769
1770 if let Some(article) = parser.parse() {
1771 if let Some(title) = &article.title {
1773 if !title.contains(expected_title) && !expected_title.contains(title) {
1774 println!("Title difference in {}: expected '{}', got '{}'", test_dir, expected_title, title);
1775 }
1776 }
1777
1778 if let Some(expected_byline) = expected_byline {
1780 if let Some(byline) = &article.byline {
1781 if byline != expected_byline {
1782 println!("Byline difference in {}: expected '{}', got '{}'", test_dir, expected_byline, byline);
1783 }
1784 }
1785 }
1786
1787 if let Some(expected_lang) = expected_metadata["lang"].as_str() {
1789 assert_eq!(article.lang.as_deref(), Some(expected_lang),
1790 "Language mismatch in {}", test_dir);
1791 }
1792
1793 if let Some(expected_site_name) = expected_metadata["siteName"].as_str() {
1794 assert_eq!(article.site_name.as_deref(), Some(expected_site_name),
1795 "Site name mismatch in {}", test_dir);
1796 }
1797 }
1798 }
1799 }
1800 }
1801
1802 #[test]
1803 fn test_mozilla_readerable_detection() {
1804 let test_cases = vec![
1806 "001",
1807 "basic-tags-cleaning",
1808 "article-author-tag",
1809 "bbc-1",
1810 "cnn"
1811 ];
1812
1813 for test_case in test_cases {
1814 if let Ok((source, _, expected_metadata)) = load_test_case(test_case) {
1815 let expected_readerable = expected_metadata["readerable"].as_bool().unwrap_or(false);
1816 let actual_readerable = is_probably_readerable(&source, Some(ReadabilityOptions {
1817 char_threshold: 25,
1818 ..Default::default()
1819 }));
1820
1821 if expected_readerable != actual_readerable {
1823 println!("Readerable detection difference in {}: expected {}, got {}",
1824 test_case, expected_readerable, actual_readerable);
1825 }
1826 }
1827 }
1828 }
1829
1830 #[test]
1831 fn test_mozilla_content_extraction_quality() {
1832 let test_cases = vec![
1834 "001",
1835 "bbc-1",
1836 "guardian-1",
1837 "nytimes-1",
1838 "medium-1"
1839 ];
1840
1841 for test_case in test_cases {
1842 if let Ok((source, _expected_content, _)) = load_test_case(test_case) {
1843 let mut parser = Readability::new_with_base_uri(&source, "http://fakehost/test/page.html", Some(ReadabilityOptions {
1844 debug: false,
1845 char_threshold: 25,
1846 classes_to_preserve: vec!["caption".to_string()],
1847 ..Default::default()
1848 })).unwrap();
1849
1850 if let Some(article) = parser.parse() {
1851 if let Some(content) = &article.content {
1852 assert!(!content.trim().is_empty(), "Content should not be empty for {}", test_case);
1854 assert!(content.len() > 100, "Content should be substantial for {}", test_case);
1855
1856 if !content.contains("<p>") && !content.contains("<div>") {
1858 println!("Warning: Content does not contain paragraphs or divs for {}", test_case);
1859 }
1860
1861 let content_lower = content.to_lowercase();
1863 if content_lower.contains("navigation") {
1864 println!("Warning: Content contains navigation elements for {}", test_case);
1865 }
1866 if content_lower.contains("menu") {
1867 println!("Warning: Content contains menu elements for {}", test_case);
1868 }
1869 }
1870 }
1871 }
1872 }
1873 }
1874
1875 #[test]
1876 fn test_mozilla_edge_cases() {
1877 let edge_cases = vec![
1879 "comment-inside-script-parsing",
1880 "malformed-html",
1881 "missing-paragraphs",
1882 "normalize-spaces",
1883 "remove-extra-brs",
1884 "remove-extra-paragraphs"
1885 ];
1886
1887 for test_case in edge_cases {
1888 if let Ok((source, _, _expected_metadata)) = load_test_case(test_case) {
1889 let mut parser = Readability::new_with_base_uri(&source, "http://fakehost/test/page.html", Some(ReadabilityOptions {
1890 debug: false,
1891 char_threshold: 100, ..Default::default()
1893 })).unwrap();
1894
1895 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
1897 parser.parse()
1898 }));
1899
1900 match result {
1901 Ok(_) => {
1902 println!("✓ Edge case {} handled gracefully", test_case);
1903 },
1904 Err(_) => {
1905 println!("✗ Edge case {} caused panic", test_case);
1906 }
1907 }
1908 }
1909 }
1910 }
1911}