1pub mod formatters;
41pub use formatters::ArticleFormat;
42
43use std::collections::HashMap;
44
45use serde::Serialize;
46use tree_sitter::{Node, Parser};
47use tree_sitter_html::LANGUAGE;
48
/// One inline fragment of a paragraph, stored in document order.
#[derive(Debug, Clone)]
pub enum InlineNode {
    /// Plain, already-trimmed text.
    Text(String),
    /// Bold text (`<b>` / `<strong>`).
    Bold(String),
    /// Italic text (`<i>` / `<em>`).
    Italic(String),
    /// A hyperlink; `href` has already been resolved against the base URL.
    Link { text: String, href: String },
    /// An inline citation marker; `note_id` keys into the references map.
    Ref { label: String, note_id: String },
}
69
/// An image extracted from a `<figure>` element.
#[derive(Debug, Clone, Serialize)]
pub struct ImageSegment {
    /// Resolved image URL.
    pub src: String,
    /// The `alt` attribute of the `<img>`; may be empty.
    pub alt: String,
    /// Text content of the `<figcaption>`; may be empty.
    pub caption: String,
    /// Section path (titles joined with " - ") the image appears under.
    pub section: String,
    /// Heading level of the innermost enclosing section; 0 before any heading.
    pub section_level: u8,
}
87
/// A top-level article fragment, emitted in document order.
#[derive(Debug, Clone)]
pub enum ArticleItem {
    /// A paragraph of text with its inline structure.
    Paragraph(TextSegment),
    /// An image with caption and section context.
    Image(ImageSegment),
    /// Citation texts keyed by note id; appended once, after all content.
    References(HashMap<String, String>),
}
105
106impl InlineNode {
107 pub fn plain_text(&self) -> &str {
110 match self {
111 InlineNode::Text(s) | InlineNode::Bold(s) | InlineNode::Italic(s) => s,
112 InlineNode::Link { text, .. } => text,
113 InlineNode::Ref { .. } => "",
114 }
115 }
116}
117
/// A paragraph of article text together with its inline structure.
#[derive(Debug, Clone, Serialize)]
pub struct TextSegment {
    /// Flattened plain text of the paragraph (citation labels excluded).
    pub text: String,
    /// Inline nodes making up the paragraph; not serialized.
    #[serde(skip)]
    pub content: Vec<InlineNode>,
    /// The `id` attribute of the source `<p>` element; empty if absent.
    pub mwid: String,
    /// Section path (titles joined with " - ") the paragraph appears under.
    pub section: String,
    /// Heading level of the innermost enclosing section; 0 outside any section.
    pub section_level: u8,
}
137
/// One entry on the stack of headings enclosing the current position.
#[derive(Debug, Clone)]
struct SectionInfo {
    /// Heading text.
    title: String,
    /// Heading level: 1 (`<h1>`) through 6 (`<h6>`).
    level: u8,
}
143
/// Extractor that turns Wikipedia (Parsoid) HTML into a flat list of
/// [`ArticleItem`]s using a tree-sitter HTML parse tree.
pub struct WikiPage {
    // tree-sitter parser configured with the HTML grammar.
    parser: Parser,
    // Items collected so far for the current document.
    items: Vec<ArticleItem>,
    // Stack of headings enclosing the current position in the walk.
    current_sections: Vec<SectionInfo>,
    // Origin used to absolutize "./"-relative hrefs, when configured.
    base_url: Option<String>,
    // Citation text keyed by note id, filled before the main walk.
    references: HashMap<String, String>,
}
171
172impl WikiPage {
173 pub fn new() -> anyhow::Result<Self> {
175 let language = LANGUAGE.into();
176 let mut parser = Parser::new();
177 parser.set_language(&language)?;
178 Ok(WikiPage {
179 parser,
180 items: Vec::new(),
181 current_sections: Vec::new(),
182 base_url: None,
183 references: HashMap::new(),
184 })
185 }
186
187 pub fn set_base_url(&mut self, language: &str) {
199 self.base_url = Some(format!("https://{language}.wikipedia.org/wiki/"));
200 }
201
202 fn resolve_href(&self, href: &str) -> String {
209 if href.starts_with("http://") || href.starts_with("https://") {
210 return href.to_string();
211 }
212 if let Some(rest) = href.strip_prefix("//") {
213 return format!("https://{rest}");
214 }
215 if let Some(path) = href.strip_prefix("./") {
216 if let Some(base) = &self.base_url {
217 return format!("{base}{path}");
218 }
219 }
220 href.to_string()
221 }
222
223 pub fn extract_text(&mut self, html: &str) -> anyhow::Result<Vec<ArticleItem>> {
235 self.items.clear();
236 self.current_sections.clear();
237 self.references.clear();
238 let tree = self
239 .parser
240 .parse(html, None)
241 .ok_or_else(|| anyhow::anyhow!("Failed to parse HTML"))?;
242 let source = html.as_bytes();
243 self.extract_references(&tree.root_node(), source);
244 self.walk_and_collect(&tree.root_node(), source, false);
245 if !self.references.is_empty() {
246 self.items
247 .push(ArticleItem::References(self.references.clone()));
248 }
249 Ok(self.items.clone())
250 }
251
252 pub fn extract_text_plain(html: &str) -> anyhow::Result<String> {
254 let mut page = WikiPage::new()?;
255 let items = page.extract_text(html)?;
256 let text = items
257 .iter()
258 .filter_map(|item| match item {
259 ArticleItem::Paragraph(seg) => {
260 let t = seg.text.trim();
261 if t.is_empty() { None } else { Some(t) }
262 }
263 ArticleItem::Image(_) | ArticleItem::References(_) => None,
264 })
265 .collect::<Vec<_>>()
266 .join("\n\n");
267 Ok(text)
268 }
269
270 fn get_header_level(tag_name: &str) -> Option<u8> {
271 match tag_name {
272 "h1" => Some(1),
273 "h2" => Some(2),
274 "h3" => Some(3),
275 "h4" => Some(4),
276 "h5" => Some(5),
277 "h6" => Some(6),
278 _ => None,
279 }
280 }
281
282 fn extract_text_from_element(&self, node: &Node, source: &[u8]) -> String {
283 let mut text = String::new();
284 for child in node.children(&mut node.walk()) {
285 match child.kind() {
286 "text" => {
287 if let Ok(t) = child.utf8_text(source) {
288 text.push_str(t.trim());
289 }
290 }
291 "element" => {
292 let child_text = self.extract_text_from_element(&child, source);
293 if !child_text.is_empty() {
294 if !text.is_empty() {
295 text.push(' ');
296 }
297 text.push_str(&child_text);
298 }
299 }
300 _ => {}
301 }
302 }
303 text
304 }
305
306 fn update_sections(&mut self, level: u8, title: String) {
307 self.current_sections
308 .retain(|section| section.level < level);
309 self.current_sections.push(SectionInfo { title, level });
310 }
311
312 fn get_current_section_string(&self) -> String {
313 self.current_sections
314 .iter()
315 .map(|s| s.title.as_str())
316 .collect::<Vec<_>>()
317 .join(" - ")
318 }
319
320 fn get_current_section_level(&self) -> u8 {
321 self.current_sections.last().map(|s| s.level).unwrap_or(0)
322 }
323
    /// Scans the tree for `<ol class="references">` lists and records each
    /// `<li id="…">`'s citation text into `self.references`, keyed by note id.
    fn extract_references(&mut self, node: &Node, source: &[u8]) {
        match node.kind() {
            "element" => {
                if let Some((tag, attrs)) = self.parse_element(node, source) {
                    let class = attrs
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");
                    let classes: Vec<&str> = class.split_whitespace().collect();

                    if tag == "ol" && classes.contains(&"references") {
                        // Each <li id="…"> inside the list is one citation.
                        for child in node.children(&mut node.walk()) {
                            if child.kind() != "element" {
                                continue;
                            }
                            if let Some((child_tag, child_attrs)) =
                                self.parse_element(&child, source)
                            {
                                if child_tag != "li" {
                                    continue;
                                }
                                // The id attribute is the note id inline refs point at.
                                let note_id = child_attrs
                                    .iter()
                                    .find(|(k, _)| k == "id")
                                    .map(|(_, v)| v.clone())
                                    .unwrap_or_default();
                                if note_id.is_empty() {
                                    continue;
                                }
                                let citation = self.find_reference_text(&child, source);
                                if !citation.is_empty() {
                                    self.references.insert(note_id, citation);
                                }
                            }
                        }
                        // Done with this list; do not descend into it again.
                        return;
                    }

                    for child in node.children(&mut node.walk()) {
                        self.extract_references(&child, source);
                    }
                }
                // NOTE(review): when parse_element returns None the children are
                // not visited — presumably unreachable for well-formed element
                // nodes, but worth confirming against tree-sitter-html output.
            }
            _ => {
                // Non-element nodes (document root, fragments, …): recurse.
                for child in node.children(&mut node.walk()) {
                    self.extract_references(&child, source);
                }
            }
        }
    }
383
384 fn find_reference_text(&self, li_node: &Node, source: &[u8]) -> String {
386 for child in li_node.children(&mut li_node.walk()) {
387 if child.kind() != "element" {
388 continue;
389 }
390 if let Some((tag, attrs)) = self.parse_element(&child, source) {
391 let class = attrs
392 .iter()
393 .find(|(k, _)| k == "class")
394 .map(|(_, v)| v.as_str())
395 .unwrap_or("");
396 if tag == "span" && class.split_whitespace().any(|c| c == "reference-text") {
397 return self.extract_text_from_element(&child, source);
398 }
399 let found = self.find_reference_text(&child, source);
401 if !found.is_empty() {
402 return found;
403 }
404 }
405 }
406 String::new()
407 }
408
409 fn extract_inline_ref(&self, sup_node: &Node, source: &[u8]) -> Option<InlineNode> {
414 let mut note_id = String::new();
415 let mut label = String::new();
416
417 self.find_ref_parts(sup_node, source, &mut note_id, &mut label);
418
419 if note_id.is_empty() || label.is_empty() {
420 return None;
421 }
422 Some(InlineNode::Ref { label, note_id })
423 }
424
425 fn find_ref_parts(&self, node: &Node, source: &[u8], note_id: &mut String, label: &mut String) {
428 for child in node.children(&mut node.walk()) {
429 if child.kind() != "element" {
430 continue;
431 }
432 if let Some((tag, attrs)) = self.parse_element(&child, source) {
433 match tag.as_str() {
434 "a" => {
435 if note_id.is_empty() {
436 let href = attrs
437 .iter()
438 .find(|(k, _)| k == "href")
439 .map(|(_, v)| v.as_str())
440 .unwrap_or_default();
441 if let Some(fragment) = href.rsplit_once('#') {
443 *note_id = fragment.1.to_string();
444 }
445 }
446 self.find_ref_parts(&child, source, note_id, label);
447 }
448 "span" => {
449 let class = attrs
450 .iter()
451 .find(|(k, _)| k == "class")
452 .map(|(_, v)| v.as_str())
453 .unwrap_or("");
454 if class.split_whitespace().any(|c| c == "mw-reflink-text") {
455 let raw = self.extract_text_from_element(&child, source);
457 *label = raw
458 .trim_matches(|c: char| c == '[' || c == ']' || c.is_whitespace())
459 .to_string();
460 } else {
461 self.find_ref_parts(&child, source, note_id, label);
462 }
463 }
464 _ => {
465 self.find_ref_parts(&child, source, note_id, label);
466 }
467 }
468 }
469 }
470 }
471
472 fn push_inline(&mut self, node: InlineNode) {
474 let last_seg = self.items.iter_mut().rev().find_map(|item| {
475 if let ArticleItem::Paragraph(seg) = item {
476 Some(seg)
477 } else {
478 None
479 }
480 });
481 if let Some(seg) = last_seg {
482 let plain = node.plain_text().to_string();
483 if !seg.text.is_empty() && !plain.is_empty() {
484 if !seg.text.ends_with(' ') {
485 seg.text.push(' ');
486 }
487 }
488 seg.text.push_str(plain.trim());
489 seg.content.push(node);
490 }
491 }
492
493 fn collect_inline_text(&self, node: &Node, source: &[u8]) -> String {
495 let mut text = String::new();
496 for child in node.children(&mut node.walk()) {
497 match child.kind() {
498 "text" => {
499 if let Ok(t) = child.utf8_text(source) {
500 let trimmed = t.trim();
501 if !trimmed.is_empty() {
502 if !text.is_empty() {
503 text.push(' ');
504 }
505 text.push_str(trimmed);
506 }
507 }
508 }
509 "element" => {
510 let child_text = self.collect_inline_text(&child, source);
511 if !child_text.is_empty() {
512 if !text.is_empty() {
513 text.push(' ');
514 }
515 text.push_str(&child_text);
516 }
517 }
518 _ => {}
519 }
520 }
521 text
522 }
523
    /// Depth-first traversal that flattens the parse tree into `self.items`.
    ///
    /// `inside_paragraph` is true while below a `<p>` element; it switches the
    /// handling of `<b>/<strong>`, `<i>/<em>`, `<a>` and `<sup class="mw-ref">`
    /// from recursion to inline-node emission.
    fn walk_and_collect(&mut self, node: &Node, source: &[u8], inside_paragraph: bool) {
        match node.kind() {
            "text" => {
                if let Ok(text) = node.utf8_text(source) {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        // Text before any paragraph: open an implicit,
                        // id-less paragraph to hold it.
                        if self.items.is_empty() {
                            self.items.push(ArticleItem::Paragraph(TextSegment {
                                text: String::new(),
                                content: Vec::new(),
                                mwid: String::new(),
                                section: self.get_current_section_string(),
                                section_level: self.get_current_section_level(),
                            }));
                        }
                        self.push_inline(InlineNode::Text(trimmed.to_string()));
                    }
                }
            }
            // Scripts and styles never contribute article text.
            "script_element" | "style_element" => (),
            "element" => {
                if let Some((tag_name, attributes)) = self.parse_element(node, source) {
                    if tag_name == "link" {
                        return;
                    }

                    let class_attr = attributes
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");

                    // Inline citation marker: emit a Ref node, do not recurse.
                    if inside_paragraph
                        && tag_name == "sup"
                        && class_attr.split_whitespace().any(|c| c == "mw-ref")
                    {
                        if let Some(r) = self.extract_inline_ref(node, source) {
                            self.push_inline(r);
                        }
                        return;
                    }

                    // Containers holding navigation/citation boilerplate rather
                    // than article prose; their whole subtree is skipped.
                    const EXCLUDED_CLASSES: &[&str] = &[
                        "shortdescription",
                        "hatnote",
                        "infobox",
                        "reference",
                        "navbox",
                        "noprint",
                        "reflist",
                        "citation",
                        "mw-references",
                    ];
                    if EXCLUDED_CLASSES
                        .iter()
                        .any(|c| class_attr.split_whitespace().any(|cls| cls == *c))
                    {
                        return;
                    }

                    // Headings update the section stack; their text is not
                    // emitted as paragraph content.
                    if let Some(level) = Self::get_header_level(&tag_name) {
                        let header_text = self.extract_text_from_element(node, source);
                        if !header_text.is_empty() {
                            self.update_sections(level, header_text);
                        }
                        return;
                    }

                    if tag_name == "p" {
                        let mwid = attributes
                            .iter()
                            .find(|(k, _)| k == "id")
                            .map(|(_, v)| v.clone())
                            .unwrap_or_default();
                        self.items.push(ArticleItem::Paragraph(TextSegment {
                            text: String::new(),
                            content: Vec::new(),
                            mwid,
                            section: self.get_current_section_string(),
                            section_level: self.get_current_section_level(),
                        }));
                        // Walk children with inside_paragraph = true so inline
                        // markup attaches to the paragraph just pushed.
                        for i in 0..node.child_count() {
                            if let Some(child) = node.child(i as u32) {
                                self.walk_and_collect(&child, source, true);
                            }
                        }
                        return;
                    }

                    if tag_name == "figure" {
                        if let Some(img) = self.extract_image(node, source) {
                            self.items.push(ArticleItem::Image(img));
                        }
                        return;
                    }

                    // Inline markup inside a paragraph becomes inline nodes.
                    if inside_paragraph {
                        match tag_name.as_str() {
                            "b" | "strong" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Bold(text));
                                }
                                return;
                            }
                            "i" | "em" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Italic(text));
                                }
                                return;
                            }
                            "a" => {
                                let raw_href = attributes
                                    .iter()
                                    .find(|(k, _)| k == "href")
                                    .map(|(_, v)| v.as_str())
                                    .unwrap_or_default();
                                let href = self.resolve_href(raw_href);
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Link { text, href });
                                }
                                return;
                            }
                            _ => {}
                        }
                    }

                    // Any other element: recurse with the same paragraph state.
                    for i in 0..node.child_count() {
                        if let Some(child) = node.child(i as u32) {
                            self.walk_and_collect(&child, source, inside_paragraph);
                        }
                    }
                }
            }
            _ => {
                for i in 0..node.child_count() {
                    if let Some(child) = node.child(i as u32) {
                        self.walk_and_collect(&child, source, inside_paragraph);
                    }
                }
            }
        }
    }
673
674 fn parse_element(
675 &self,
676 element_node: &Node,
677 source: &[u8],
678 ) -> Option<(String, Vec<(String, String)>)> {
679 let tag_container = element_node
681 .children(&mut element_node.walk())
682 .find(|child| child.kind() == "start_tag" || child.kind() == "self_closing_tag")?;
683
684 let tag_name_node = tag_container
685 .children(&mut tag_container.walk())
686 .find(|child| child.kind() == "tag_name")?;
687
688 let tag_name = tag_name_node.utf8_text(source).ok()?.to_string();
689 let mut attributes = Vec::new();
690
691 for child in tag_container.children(&mut tag_container.walk()) {
692 if child.kind() == "attribute" {
693 if let Some(pair) = self.parse_attribute(&child, source) {
694 attributes.push(pair);
695 }
696 }
697 }
698
699 Some((tag_name, attributes))
700 }
701
702 fn parse_attribute(&self, attr_node: &Node, source: &[u8]) -> Option<(String, String)> {
703 let mut attr_name = None;
704 let mut attr_value = String::new();
705
706 for child in attr_node.children(&mut attr_node.walk()) {
707 match child.kind() {
708 "attribute_name" => {
709 attr_name = child.utf8_text(source).ok().map(|s| s.to_string());
710 }
711 "quoted_attribute_value" => {
712 for grandchild in child.children(&mut child.walk()) {
713 if grandchild.kind() == "attribute_value" {
714 if let Ok(value) = grandchild.utf8_text(source) {
715 attr_value = value.to_string();
716 }
717 }
718 }
719 }
720 "attribute_value" => {
721 if let Ok(value) = child.utf8_text(source) {
722 attr_value = value.to_string();
723 }
724 }
725 _ => {}
726 }
727 }
728
729 attr_name.map(|name| (name, attr_value))
730 }
731
732 fn extract_image(&self, figure_node: &Node, source: &[u8]) -> Option<ImageSegment> {
737 let mut src = String::new();
738 let mut alt = String::new();
739 let mut caption = String::new();
740
741 for child in figure_node.children(&mut figure_node.walk()) {
742 if child.kind() == "element" {
743 if let Some((tag, attrs)) = self.parse_element(&child, source) {
744 if tag == "figcaption" {
745 caption = self.extract_text_from_element(&child, source);
746 } else {
747 self.find_img(&child, source, &tag, &attrs, &mut src, &mut alt);
749 }
750 }
751 }
752 }
753
754 if src.is_empty() {
755 return None;
756 }
757
758 Some(ImageSegment {
759 src: self.resolve_href(&src),
760 alt,
761 caption,
762 section: self.get_current_section_string(),
763 section_level: self.get_current_section_level(),
764 })
765 }
766
767 fn find_img(
769 &self,
770 node: &Node,
771 source: &[u8],
772 tag: &str,
773 attrs: &[(String, String)],
774 src: &mut String,
775 alt: &mut String,
776 ) {
777 if !src.is_empty() {
778 return;
779 }
780 if tag == "img" {
781 if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "src") {
782 *src = v.clone();
783 }
784 if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "alt") {
785 *alt = v.clone();
786 }
787 return;
788 }
789 for child in node.children(&mut node.walk()) {
790 if child.kind() == "element" {
791 if let Some((child_tag, child_attrs)) = self.parse_element(&child, source) {
792 self.find_img(&child, source, &child_tag, &child_attrs, src, alt);
793 }
794 }
795 }
796 }
797}
798
impl Default for WikiPage {
    /// Panics if the tree-sitter HTML grammar cannot be loaded; prefer
    /// [`WikiPage::new`] when fallible construction is acceptable.
    fn default() -> Self {
        Self::new().expect("Failed to initialise tree-sitter HTML parser")
    }
}
804
805pub fn strip_references(items: Vec<ArticleItem>) -> Vec<ArticleItem> {
811 items
812 .into_iter()
813 .filter_map(|item| match item {
814 ArticleItem::References(_) => None,
815 ArticleItem::Paragraph(mut seg) => {
816 seg.content.retain(|n| !matches!(n, InlineNode::Ref { .. }));
817 seg.text = seg
819 .content
820 .iter()
821 .map(|n| n.plain_text())
822 .filter(|s| !s.is_empty())
823 .collect::<Vec<_>>()
824 .join(" ");
825 Some(ArticleItem::Paragraph(seg))
826 }
827 other => Some(other),
828 })
829 .collect()
830}
831
#[cfg(any(feature = "cli", feature = "web"))]
/// Fetches the article `title` from the `language` Wikipedia and extracts it
/// into structured items, with links resolved against that wiki's base URL.
///
/// # Errors
/// Returns an error if the HTTP fetch fails or the HTML cannot be parsed.
pub async fn get_text(language: &str, title: &str) -> anyhow::Result<Vec<ArticleItem>> {
    let html = get_page_content_html(language, title).await?;
    let mut page = WikiPage::new()?;
    page.set_base_url(language);
    // extract_text already returns anyhow::Result; no need to re-wrap with Ok(…?).
    page.extract_text(&html)
}
842
#[cfg(any(feature = "cli", feature = "web"))]
/// Downloads the Parsoid HTML of a page from the Wikipedia REST API.
///
/// # Errors
/// Fails on network errors or any non-success HTTP status.
async fn get_page_content_html(language: &str, title: &str) -> anyhow::Result<String> {
    let normalized_title = normalize_title(title);
    let url = format!(
        "https://{language}.wikipedia.org/api/rest_v1/page/html/{normalized_title}?stash=false"
    );
    // The REST API requires a descriptive User-Agent.
    let response = reqwest::Client::new()
        .get(&url)
        .header(
            "User-Agent",
            "wikipedia-article-transform/0.1 (https://github.com/santhoshtr/wikipedia-article-transform)",
        )
        .send()
        .await?;
    if !response.status().is_success() {
        anyhow::bail!("Failed to fetch article: HTTP {}", response.status());
    }
    Ok(response.text().await?)
}
863
#[cfg(any(feature = "cli", feature = "web"))]
/// Collapses all whitespace runs in a page title into single underscores,
/// matching MediaWiki's URL title form (e.g. "Marie Curie" → "Marie_Curie").
fn normalize_title(title: &str) -> String {
    let mut out = String::with_capacity(title.len());
    for word in title.split_whitespace() {
        if !out.is_empty() {
            out.push('_');
        }
        out.push_str(word);
    }
    out
}
868
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(any(feature = "cli", feature = "web"))]
    #[test]
    fn test_normalize_title_replaces_whitespace_with_underscore() {
        assert_eq!(normalize_title("Marie Curie"), "Marie_Curie");
        assert_eq!(normalize_title(" Marie Curie "), "Marie_Curie");
        assert_eq!(normalize_title("Ada\tLovelace"), "Ada_Lovelace");
    }

    // Parses `html` into structured items. The plain-text helper is invoked
    // first purely as a smoke check that it does not panic on the input.
    fn extract(html: &str) -> Vec<ArticleItem> {
        WikiPage::extract_text_plain(html).unwrap();
        let mut page = WikiPage::new().unwrap();
        page.extract_text(html).unwrap()
    }

    // All paragraph segments of `items`, in document order.
    fn paragraphs(items: &[ArticleItem]) -> Vec<&TextSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Paragraph(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }

    // All image segments of `items`, in document order.
    fn images(items: &[ArticleItem]) -> Vec<&ImageSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Image(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }

    #[test]
    fn test_basic_paragraph() {
        let items = extract("<html><body><p id=\"p1\">Hello world.</p></body></html>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Hello world.");
        assert_eq!(segs[0].mwid, "p1");
        assert_eq!(segs[0].section, "");
        assert_eq!(segs[0].section_level, 0);
    }

    #[test]
    fn test_multiple_paragraphs() {
        let items = extract("<p>First.</p><p>Second.</p><p>Third.</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[0].text, "First.");
        assert_eq!(segs[1].text, "Second.");
        assert_eq!(segs[2].text, "Third.");
    }

    #[test]
    fn test_section_tracking() {
        let html = "<h2>History</h2><p>Para one.</p><h3>Early life</h3><p>Para two.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "History - Early life");
    }

    #[test]
    fn test_section_level() {
        let html = "<h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section_level, 2);
        assert_eq!(segs[1].section_level, 3);
    }

    #[test]
    fn test_section_resets_at_same_level() {
        let html = "<h2>History</h2><p>A.</p><h2>Geography</h2><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "Geography");
    }

    #[test]
    fn test_excluded_class_infobox() {
        let html = r#"<p>Visible.</p><table class="infobox"><tr><td>Hidden.</td></tr></table><p>Also visible.</p>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert!(segs.iter().all(|s| !s.text.contains("Hidden")));
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_excluded_class_reflist() {
        let html = r#"<p>Main text.</p><div class="reflist"><p>Ref text.</p></div>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Main text.");
    }

    #[test]
    fn test_script_and_style_skipped() {
        let html = "<p>Real.</p><script>var x=1;</script><style>body{}</style><p>Also real.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 2);
        assert!(segs.iter().all(|s| !s.text.contains("var x")));
    }

    #[test]
    fn test_empty_html() {
        let items = extract("");
        assert!(items.is_empty());
    }

    #[test]
    fn test_extract_text_plain() {
        let html = "<p>First paragraph.</p><p>Second paragraph.</p>";
        let text = WikiPage::extract_text_plain(html).unwrap();
        assert_eq!(text, "First paragraph.\n\nSecond paragraph.");
    }

    #[test]
    fn test_default_impl() {
        let mut page = WikiPage::default();
        let items = page.extract_text("<p>Works.</p>").unwrap();
        let segs = paragraphs(&items);
        assert_eq!(segs[0].text, "Works.");
    }

    #[test]
    fn test_inline_bold() {
        let items = extract("<p><b>Bold</b> text</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Bold text");
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "Bold"));
        assert!(matches!(&segs[0].content[1], InlineNode::Text(s) if s == "text"));
    }

    #[test]
    fn test_inline_italic() {
        let items = extract("<p><i>italic</i></p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0], InlineNode::Italic(s) if s == "italic"));
    }

    #[test]
    fn test_inline_strong_em() {
        let items = extract("<p><strong>S</strong> and <em>E</em></p>");
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "S"));
        assert!(matches!(&segs[0].content[2], InlineNode::Italic(s) if s == "E"));
    }

    #[test]
    fn test_inline_link() {
        let items = extract(r#"<p><a href="./X">anchor</a></p>"#);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        // No base URL configured, so "./X" stays relative.
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href } if text == "anchor" && href == "./X"));
    }

    #[test]
    fn test_inline_link_absolute() {
        let html = r#"<p><a href="./Cryogenics">Cryogenics</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        page.set_base_url("en");
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href }
                if text == "Cryogenics"
                    && href == "https://en.wikipedia.org/wiki/Cryogenics"));
    }

    #[test]
    fn test_resolve_href_protocol_relative() {
        let html = r#"<p><a href="//en.wikipedia.org/wiki/Oxygen">O</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { href, .. } if href == "https://en.wikipedia.org/wiki/Oxygen"));
    }

    #[test]
    fn test_format_plain_sections() {
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("\nIntro.\n"), "intro paragraph missing");
        assert!(out.contains("## History\n"), "h2 heading missing");
        assert!(out.contains("\nA.\n"), "first section paragraph missing");
        assert!(out.contains("### Early life\n"), "h3 heading missing");
        assert!(out.contains("\nB.\n"), "subsection paragraph missing");
        assert!(out.find("## History").unwrap() < out.find("\nA.\n").unwrap());
        assert!(out.find("### Early life").unwrap() < out.find("\nB.\n").unwrap());
    }

    #[test]
    fn test_format_json_tree() {
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["intro"][0]["text"], "Intro.");
        assert_eq!(v["intro"][0]["citations"].as_array().unwrap().len(), 0);
        assert_eq!(v["sections"][0]["heading"], "History");
        assert_eq!(v["sections"][0]["level"], 2);
        assert_eq!(v["sections"][0]["paragraphs"][0]["text"], "A.");
        assert_eq!(
            v["sections"][0]["paragraphs"][0]["citations"]
                .as_array()
                .unwrap()
                .len(),
            0
        );
        assert_eq!(v["sections"][0]["subsections"][0]["heading"], "Early life");
        assert_eq!(v["sections"][0]["subsections"][0]["level"], 3);
        assert_eq!(
            v["sections"][0]["subsections"][0]["paragraphs"][0]["text"],
            "B."
        );
    }

    #[test]
    fn test_format_markdown_inline() {
        let items = extract(
            "<h2>Title</h2><p><b>Bold</b> and <i>italic</i> and <a href=\"/x\">link</a></p>",
        );
        let out = items.format_markdown();
        assert!(out.contains("## Title"));
        assert!(out.contains("**Bold**"));
        assert!(out.contains("_italic_"));
        assert!(out.contains("[link](/x)"));
        assert!(
            out.contains("**Bold** and"),
            "space after bold missing: {out}"
        );
        assert!(
            out.contains("_italic_ and"),
            "space after italic missing: {out}"
        );
        assert!(
            out.contains("and [link]"),
            "space before link missing: {out}"
        );
    }

    #[test]
    fn test_image_extraction() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg" class="mw-file-description">
            <img alt="A description" src="//upload.wikimedia.org/thumb/foo.jpg" class="mw-file-element"/>
            </a>
            <figcaption>Caption text here.</figcaption>
            </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].src, "https://upload.wikimedia.org/thumb/foo.jpg");
        assert_eq!(imgs[0].alt, "A description");
        assert_eq!(imgs[0].caption, "Caption text here.");
    }

    #[test]
    fn test_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png" class="mw-file-description">
            <img alt="Bar" src="//upload.wikimedia.org/bar.png" class="mw-file-element"/>
            </a>
            <figcaption></figcaption>
            </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].caption, "");
    }

    #[test]
    fn test_image_section_tracking() {
        let html = r#"<h2>History</h2>
            <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>X caption</figcaption>
            </figure>
            <p>A paragraph.</p>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].section, "History");
        assert_eq!(imgs[0].section_level, 2);
    }

    #[test]
    fn test_image_interleaved_order() {
        let html = r#"<p>Before.</p>
            <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>Caption</figcaption>
            </figure>
            <p>After.</p>"#;
        let items = extract(html);
        assert!(matches!(&items[0], ArticleItem::Paragraph(s) if s.text == "Before."));
        assert!(matches!(&items[1], ArticleItem::Image(_)));
        assert!(matches!(&items[2], ArticleItem::Paragraph(s) if s.text == "After."));
    }

    #[test]
    fn test_markdown_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
            </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        // NOTE(review): the expected substring below appears to have been lost —
        // `contains("")` is vacuously true. Restore the intended image-markdown
        // pattern (likely `![Alt text](…)`). TODO confirm against formatters.
        assert!(out.contains(""));
        assert!(out.contains("_The caption._"));
    }

    #[test]
    fn test_markdown_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png"><img alt="Bar" src="//upload.wikimedia.org/bar.png"/></a>
            <figcaption></figcaption>
            </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        // NOTE(review): same as above — `contains("")` is vacuously true.
        assert!(out.contains(""));
        assert!(!out.contains("__"));
    }

    #[test]
    fn test_plain_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
            </figure>"#;
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("[Image: Alt text]"));
        assert!(out.contains("The caption."));
    }

    #[test]
    fn test_json_image() {
        let html = r#"<h2>Section</h2>
            <figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
            </figure>
            <p>A paragraph.</p>"#;
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["sections"][0]["images"][0]["alt"], "Alt text");
        assert_eq!(
            v["sections"][0]["images"][0]["src"],
            "https://upload.wikimedia.org/foo.jpg"
        );
        assert_eq!(v["sections"][0]["images"][0]["caption"], "The caption.");
    }

    // Fixture: a paragraph with two inline citations plus the matching
    // `<ol class="references">` list.
    fn ref_html() -> &'static str {
        r#"<p id="p1">Some text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
            ><a href="./Article#cite_note-Foo-1"><span class="mw-reflink-text">[1]</span></a
            ></sup> More text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
            ><a href="./Article#cite_note-Bar-2"><span class="mw-reflink-text">[2]</span></a
            ></sup></p>
            <ol class="mw-references references">
            <li id="cite_note-Foo-1" data-mw-footnote-number="1">
            <span class="mw-cite-backlink"><a href="./Article#cite_ref-Foo_1-0">↑</a></span>
            <span id="mw-reference-text-cite_note-Foo-1" class="mw-reference-text reference-text">Author A. <i>Title One</i>. Publisher, 2020.</span>
            </li>
            <li id="cite_note-Bar-2" data-mw-footnote-number="2">
            <span class="mw-cite-backlink"><a href="./Article#cite_ref-Bar_2-0">↑</a></span>
            <span id="mw-reference-text-cite_note-Bar-2" class="mw-reference-text reference-text">Author B. Title Two. Journal, 2021.</span>
            </li>
            </ol>"#
    }

    #[test]
    fn test_ref_inline_nodes() {
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0], InlineNode::Text(s) if s.contains("Some text")));
        assert!(
            matches!(&segs[0].content[1], InlineNode::Ref { label, note_id }
                if label == "1" && note_id == "cite_note-Foo-1")
        );
        assert!(
            matches!(&segs[0].content[3], InlineNode::Ref { label, note_id }
                if label == "2" && note_id == "cite_note-Bar-2")
        );
    }

    #[test]
    fn test_ref_plain_text_excludes_label() {
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert!(!segs[0].text.contains('['));
        assert!(segs[0].text.contains("Some text"));
        assert!(segs[0].text.contains("More text"));
    }

    #[test]
    fn test_ref_references_item_appended() {
        let items = extract(ref_html());
        let refs = items.iter().find_map(|i| {
            if let ArticleItem::References(r) = i {
                Some(r)
            } else {
                None
            }
        });
        assert!(refs.is_some());
        let refs = refs.unwrap();
        assert_eq!(refs.len(), 2);
        assert!(refs["cite_note-Foo-1"].contains("Title One"));
        assert!(refs["cite_note-Bar-2"].contains("Title Two"));
    }

    #[test]
    fn test_ref_no_refs_no_item() {
        let items = extract("<p>No citations here.</p>");
        assert!(
            !items
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
    }

    #[test]
    fn test_ref_markdown_inline_and_list() {
        let items = extract(ref_html());
        let out = items.format_markdown();
        assert!(out.contains("[^1]"), "inline [^1] missing");
        assert!(out.contains("[^2]"), "inline [^2] missing");
        assert!(out.contains("## References"), "References heading missing");
        assert!(out.contains("[^1]: "), "[^1]: definition missing");
        assert!(out.contains("Title One"), "citation text missing");
        assert!(out.contains("[^2]: "), "[^2]: definition missing");
        assert!(out.contains("Title Two"), "citation text missing");
        assert!(out.find("Some text").unwrap() < out.find("## References").unwrap());
    }

    #[test]
    fn test_ref_json_references_key() {
        let items = extract(ref_html());
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert!(v["references"].is_object(), "references key missing");
        assert!(
            v["references"]["cite_note-Foo-1"]
                .as_str()
                .unwrap()
                .contains("Title One")
        );
        assert!(
            v["references"]["cite_note-Bar-2"]
                .as_str()
                .unwrap()
                .contains("Title Two")
        );

        let para = &v["intro"][0];
        let citations = para["citations"].as_array().unwrap();
        assert_eq!(citations.len(), 2);
        assert_eq!(citations[0]["label"], "1");
        assert!(citations[0]["text"].as_str().unwrap().contains("Title One"));
        assert_eq!(citations[1]["label"], "2");
        assert!(citations[1]["text"].as_str().unwrap().contains("Title Two"));
    }

    #[test]
    fn test_strip_references() {
        let items = extract(ref_html());
        let stripped = strip_references(items);
        assert!(
            !stripped
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
        let segs = paragraphs(&stripped);
        for seg in segs {
            assert!(
                !seg.content
                    .iter()
                    .any(|n| matches!(n, InlineNode::Ref { .. }))
            );
            assert!(!seg.text.contains('['));
        }
    }
}