1pub mod formatters;
41pub use formatters::ArticleFormat;
42
43use std::collections::HashMap;
44
45use serde::Serialize;
46use tree_sitter::{Node, Parser};
47use tree_sitter_html::LANGUAGE;
48
/// A piece of inline content inside a paragraph.
#[derive(Debug, Clone)]
pub enum InlineNode {
    /// A run of plain text.
    Text(String),
    /// Bold (`<b>`/`<strong>`) text.
    Bold(String),
    /// Italic (`<i>`/`<em>`) text.
    Italic(String),
    /// A hyperlink with its display text and resolved target URL.
    Link { text: String, href: String },
    /// An inline citation marker (e.g. "1" from "[1]") pointing at the
    /// footnote element id it references.
    Ref { label: String, note_id: String },
}
69
/// An image extracted from a `<figure>`, with its section context.
#[derive(Debug, Clone, Serialize)]
pub struct ImageSegment {
    /// Image URL, resolved to an absolute `https` URL where possible.
    pub src: String,
    /// The `alt` attribute of the `<img>`; may be empty.
    pub alt: String,
    /// Text of the `<figcaption>`; may be empty.
    pub caption: String,
    /// Breadcrumb of enclosing section titles, e.g. "History - Early life".
    pub section: String,
    /// Heading level (1-6) of the innermost section; 0 outside any section.
    pub section_level: u8,
}
87
/// One ordered item of an extracted article.
#[derive(Debug, Clone)]
pub enum ArticleItem {
    /// A paragraph of (possibly formatted) text.
    Paragraph(TextSegment),
    /// An image with caption and section context.
    Image(ImageSegment),
    /// Footnote id -> citation text; appended once at the end of the item
    /// list when any references were collected.
    References(HashMap<String, String>),
}
105
106impl InlineNode {
107 pub fn plain_text(&self) -> &str {
110 match self {
111 InlineNode::Text(s) | InlineNode::Bold(s) | InlineNode::Italic(s) => s,
112 InlineNode::Link { text, .. } => text,
113 InlineNode::Ref { .. } => "",
114 }
115 }
116}
117
/// A paragraph of article text with its inline structure and section
/// context.
#[derive(Debug, Clone, Serialize)]
pub struct TextSegment {
    /// Flattened plain text of the whole paragraph.
    pub text: String,
    /// Inline structure (text/bold/italic/link/reference nodes) in
    /// document order; not serialized.
    #[serde(skip)]
    pub content: Vec<InlineNode>,
    /// The paragraph's `id` attribute from the source HTML, if any.
    pub mwid: String,
    /// Breadcrumb of enclosing section titles, e.g. "History - Early life".
    pub section: String,
    /// Heading level of the innermost section; 0 outside any section.
    pub section_level: u8,
}
137
/// A heading currently open on the section stack.
#[derive(Debug, Clone)]
struct SectionInfo {
    // Heading text.
    title: String,
    // Heading level: 1 for <h1> .. 6 for <h6>.
    level: u8,
}
143
/// Extractor that turns Wikipedia (Parsoid) HTML into an ordered list of
/// [`ArticleItem`]s using a tree-sitter HTML parse.
pub struct WikiPage {
    /// Reusable tree-sitter parser configured for HTML.
    parser: Parser,
    /// Items collected for the article currently being processed.
    items: Vec<ArticleItem>,
    /// Stack of currently open sections, outermost first.
    current_sections: Vec<SectionInfo>,
    /// Base URL used to resolve `./` links; set via
    /// [`WikiPage::set_base_url`].
    base_url: Option<String>,
    /// Footnote id -> citation text gathered in the first pass.
    references: HashMap<String, String>,
}
171
172impl WikiPage {
173 pub fn new() -> anyhow::Result<Self> {
175 let language = LANGUAGE.into();
176 let mut parser = Parser::new();
177 parser.set_language(&language)?;
178 Ok(WikiPage {
179 parser,
180 items: Vec::new(),
181 current_sections: Vec::new(),
182 base_url: None,
183 references: HashMap::new(),
184 })
185 }
186
187 pub fn set_base_url(&mut self, language: &str) {
199 self.base_url = Some(format!("https://{language}.wikipedia.org/wiki/"));
200 }
201
202 fn resolve_href(&self, href: &str) -> String {
209 if href.starts_with("http://") || href.starts_with("https://") {
210 return href.to_string();
211 }
212 if let Some(rest) = href.strip_prefix("//") {
213 return format!("https://{rest}");
214 }
215 if let Some(path) = href.strip_prefix("./") {
216 if let Some(base) = &self.base_url {
217 return format!("{base}{path}");
218 }
219 }
220 href.to_string()
221 }
222
    /// Parses `html` and returns the article as an ordered list of
    /// paragraphs and images, followed by a `References` item when any
    /// citations were collected.
    ///
    /// All per-article state is cleared first, so one `WikiPage` can be
    /// reused across documents.
    ///
    /// # Errors
    /// Fails when tree-sitter cannot produce a parse tree for `html`.
    pub fn extract_text(&mut self, html: &str) -> anyhow::Result<Vec<ArticleItem>> {
        self.items.clear();
        self.current_sections.clear();
        self.references.clear();
        let tree = self
            .parser
            .parse(html, None)
            .ok_or_else(|| anyhow::anyhow!("Failed to parse HTML"))?;
        let source = html.as_bytes();
        // First pass gathers the citation list so inline markers found in
        // the second (content) pass can refer to known footnote ids.
        self.extract_references(&tree.root_node(), source);
        self.walk_and_collect(&tree.root_node(), source, false);
        if !self.references.is_empty() {
            self.items
                .push(ArticleItem::References(self.references.clone()));
        }
        Ok(self.items.clone())
    }
251
252 pub fn extract_text_plain(html: &str) -> anyhow::Result<String> {
254 let mut page = WikiPage::new()?;
255 let items = page.extract_text(html)?;
256 let text = items
257 .iter()
258 .filter_map(|item| match item {
259 ArticleItem::Paragraph(seg) => {
260 let t = seg.text.trim();
261 if t.is_empty() { None } else { Some(t) }
262 }
263 ArticleItem::Image(_) | ArticleItem::References(_) => None,
264 })
265 .collect::<Vec<_>>()
266 .join("\n\n");
267 Ok(text)
268 }
269
270 fn get_header_level(tag_name: &str) -> Option<u8> {
271 match tag_name {
272 "h1" => Some(1),
273 "h2" => Some(2),
274 "h3" => Some(3),
275 "h4" => Some(4),
276 "h5" => Some(5),
277 "h6" => Some(6),
278 _ => None,
279 }
280 }
281
    /// Recursively concatenates the text content of an element.
    ///
    /// Non-empty child elements are joined with single spaces; bare text
    /// nodes are trimmed and appended directly.
    // NOTE(review): adjacent bare text nodes are concatenated without a
    // separating space (only element children get one) — confirm this is
    // intended for the heading/caption HTML this is used on.
    fn extract_text_from_element(&self, node: &Node, source: &[u8]) -> String {
        let mut text = String::new();
        for child in node.children(&mut node.walk()) {
            match child.kind() {
                "text" => {
                    if let Ok(t) = child.utf8_text(source) {
                        text.push_str(t.trim());
                    }
                }
                "element" => {
                    let child_text = self.extract_text_from_element(&child, source);
                    if !child_text.is_empty() {
                        if !text.is_empty() {
                            text.push(' ');
                        }
                        text.push_str(&child_text);
                    }
                }
                _ => {}
            }
        }
        text
    }
305
306 fn update_sections(&mut self, level: u8, title: String) {
307 self.current_sections
308 .retain(|section| section.level < level);
309 self.current_sections.push(SectionInfo { title, level });
310 }
311
312 fn get_current_section_string(&self) -> String {
313 self.current_sections
314 .iter()
315 .map(|s| s.title.as_str())
316 .collect::<Vec<_>>()
317 .join(" - ")
318 }
319
320 fn get_current_section_level(&self) -> u8 {
321 self.current_sections.last().map(|s| s.level).unwrap_or(0)
322 }
323
    /// First pass over the tree: finds `<ol class="references">` lists and
    /// records each `<li id="...">` footnote's citation text in
    /// `self.references`, keyed by the li's element id.
    fn extract_references(&mut self, node: &Node, source: &[u8]) {
        match node.kind() {
            "element" => {
                if let Some((tag, attrs)) = self.parse_element(node, source) {
                    let class = attrs
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");
                    let classes: Vec<&str> = class.split_whitespace().collect();

                    if tag == "ol" && classes.contains(&"references") {
                        for child in node.children(&mut node.walk()) {
                            if child.kind() != "element" {
                                continue;
                            }
                            if let Some((child_tag, child_attrs)) =
                                self.parse_element(&child, source)
                            {
                                if child_tag != "li" {
                                    continue;
                                }
                                // The li's id (e.g. "cite_note-Foo-1") is the
                                // key inline `Ref` nodes point back to; skip
                                // entries without one.
                                let note_id = child_attrs
                                    .iter()
                                    .find(|(k, _)| k == "id")
                                    .map(|(_, v)| v.clone())
                                    .unwrap_or_default();
                                if note_id.is_empty() {
                                    continue;
                                }
                                let citation = self.find_reference_text(&child, source);
                                if !citation.is_empty() {
                                    self.references.insert(note_id, citation);
                                }
                            }
                        }
                        // A reference list is a leaf for this pass; do not
                        // descend into it again.
                        return;
                    }

                    for child in node.children(&mut node.walk()) {
                        self.extract_references(&child, source);
                    }
                }
                // NOTE(review): elements whose start tag cannot be parsed
                // are not descended into here — confirm that is acceptable.
            }
            _ => {
                for child in node.children(&mut node.walk()) {
                    self.extract_references(&child, source);
                }
            }
        }
    }
383
    /// Finds the citation body inside a footnote `<li>`: the text of the
    /// first descendant `<span class="reference-text">`, searched
    /// depth-first. Returns an empty string when none exists.
    fn find_reference_text(&self, li_node: &Node, source: &[u8]) -> String {
        for child in li_node.children(&mut li_node.walk()) {
            if child.kind() != "element" {
                continue;
            }
            if let Some((tag, attrs)) = self.parse_element(&child, source) {
                let class = attrs
                    .iter()
                    .find(|(k, _)| k == "class")
                    .map(|(_, v)| v.as_str())
                    .unwrap_or("");
                if tag == "span" && class.split_whitespace().any(|c| c == "reference-text") {
                    return self.extract_text_from_element(&child, source);
                }
                // Not the target span — keep searching its subtree.
                let found = self.find_reference_text(&child, source);
                if !found.is_empty() {
                    return found;
                }
            }
        }
        String::new()
    }
408
409 fn extract_inline_ref(&self, sup_node: &Node, source: &[u8]) -> Option<InlineNode> {
414 let mut note_id = String::new();
415 let mut label = String::new();
416
417 self.find_ref_parts(sup_node, source, &mut note_id, &mut label);
418
419 if note_id.is_empty() || label.is_empty() {
420 return None;
421 }
422 Some(InlineNode::Ref { label, note_id })
423 }
424
    /// Walks a `<sup class="mw-ref">` subtree filling in the two halves of
    /// an inline reference: `note_id` from the first `<a href="...#id">`
    /// URL fragment, and `label` from the `<span class="mw-reflink-text">`
    /// text (with surrounding brackets/whitespace stripped).
    fn find_ref_parts(&self, node: &Node, source: &[u8], note_id: &mut String, label: &mut String) {
        for child in node.children(&mut node.walk()) {
            if child.kind() != "element" {
                continue;
            }
            if let Some((tag, attrs)) = self.parse_element(&child, source) {
                match tag.as_str() {
                    "a" => {
                        if note_id.is_empty() {
                            let href = attrs
                                .iter()
                                .find(|(k, _)| k == "href")
                                .map(|(_, v)| v.as_str())
                                .unwrap_or_default();
                            // Keep only the fragment after '#', which is the
                            // footnote element id.
                            if let Some(fragment) = href.rsplit_once('#') {
                                *note_id = fragment.1.to_string();
                            }
                        }
                        self.find_ref_parts(&child, source, note_id, label);
                    }
                    "span" => {
                        let class = attrs
                            .iter()
                            .find(|(k, _)| k == "class")
                            .map(|(_, v)| v.as_str())
                            .unwrap_or("");
                        if class.split_whitespace().any(|c| c == "mw-reflink-text") {
                            let raw = self.extract_text_from_element(&child, source);
                            // "[1]" -> "1"
                            *label = raw
                                .trim_matches(|c: char| c == '[' || c == ']' || c.is_whitespace())
                                .to_string();
                        } else {
                            self.find_ref_parts(&child, source, note_id, label);
                        }
                    }
                    _ => {
                        self.find_ref_parts(&child, source, note_id, label);
                    }
                }
            }
        }
    }
471
472 fn push_inline(&mut self, node: InlineNode) {
474 let last_seg = self.items.iter_mut().rev().find_map(|item| {
475 if let ArticleItem::Paragraph(seg) = item {
476 Some(seg)
477 } else {
478 None
479 }
480 });
481 if let Some(seg) = last_seg {
482 let plain = node.plain_text().to_string();
483 if !seg.text.is_empty() && !plain.is_empty() {
484 if !seg.text.ends_with(' ') {
485 seg.text.push(' ');
486 }
487 }
488 seg.text.push_str(plain.trim());
489 seg.content.push(node);
490 }
491 }
492
493 fn collect_inline_text(&self, node: &Node, source: &[u8]) -> String {
495 let mut text = String::new();
496 for child in node.children(&mut node.walk()) {
497 match child.kind() {
498 "text" => {
499 if let Ok(t) = child.utf8_text(source) {
500 let trimmed = t.trim();
501 if !trimmed.is_empty() {
502 if !text.is_empty() {
503 text.push(' ');
504 }
505 text.push_str(trimmed);
506 }
507 }
508 }
509 "element" => {
510 let child_text = self.collect_inline_text(&child, source);
511 if !child_text.is_empty() {
512 if !text.is_empty() {
513 text.push(' ');
514 }
515 text.push_str(&child_text);
516 }
517 }
518 _ => {}
519 }
520 }
521 text
522 }
523
    /// Second pass: walks the document tree collecting paragraphs, inline
    /// formatting, headings (for section tracking), and figures.
    ///
    /// `inside_paragraph` is true while descending under a `<p>`, which
    /// enables inline handling of bold/italic/link/citation elements.
    fn walk_and_collect(&mut self, node: &Node, source: &[u8], inside_paragraph: bool) {
        match node.kind() {
            "text" => {
                if let Ok(text) = node.utf8_text(source) {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        // Text that appears before any paragraph still needs
                        // a segment to land in; create one lazily.
                        if self.items.is_empty() {
                            self.items.push(ArticleItem::Paragraph(TextSegment {
                                text: String::new(),
                                content: Vec::new(),
                                mwid: String::new(),
                                section: self.get_current_section_string(),
                                section_level: self.get_current_section_level(),
                            }));
                        }
                        self.push_inline(InlineNode::Text(trimmed.to_string()));
                    }
                }
            }
            // Scripts and styles never contribute article text.
            "script_element" | "style_element" => (),
            "element" => {
                if let Some((tag_name, attributes)) = self.parse_element(node, source) {
                    if tag_name == "link" {
                        return;
                    }

                    let class_attr = attributes
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");

                    // Inline citation marker, e.g. <sup class="mw-ref">[1]</sup>.
                    if inside_paragraph
                        && tag_name == "sup"
                        && class_attr.split_whitespace().any(|c| c == "mw-ref")
                    {
                        if let Some(r) = self.extract_inline_ref(node, source) {
                            self.push_inline(r);
                        }
                        return;
                    }

                    // Boilerplate/navigation containers that never hold
                    // article prose — skip their whole subtree.
                    const EXCLUDED_CLASSES: &[&str] = &[
                        "shortdescription",
                        "hatnote",
                        "infobox",
                        "reference",
                        "navbox",
                        "noprint",
                        "reflist",
                        "citation",
                        "mw-references",
                    ];
                    if EXCLUDED_CLASSES
                        .iter()
                        .any(|c| class_attr.split_whitespace().any(|cls| cls == *c))
                    {
                        return;
                    }

                    // Headings update the section breadcrumb instead of
                    // producing content.
                    if let Some(level) = Self::get_header_level(&tag_name) {
                        let header_text = self.extract_text_from_element(node, source);
                        if !header_text.is_empty() {
                            self.update_sections(level, header_text);
                        }
                        return;
                    }

                    // A <p> opens a new segment, then its children are
                    // walked in inline mode.
                    if tag_name == "p" {
                        let mwid = attributes
                            .iter()
                            .find(|(k, _)| k == "id")
                            .map(|(_, v)| v.clone())
                            .unwrap_or_default();
                        self.items.push(ArticleItem::Paragraph(TextSegment {
                            text: String::new(),
                            content: Vec::new(),
                            mwid,
                            section: self.get_current_section_string(),
                            section_level: self.get_current_section_level(),
                        }));
                        for i in 0..node.child_count() {
                            if let Some(child) = node.child(i as u32) {
                                self.walk_and_collect(&child, source, true);
                            }
                        }
                        return;
                    }

                    if tag_name == "figure" {
                        if let Some(img) = self.extract_image(node, source) {
                            self.items.push(ArticleItem::Image(img));
                        }
                        return;
                    }

                    // Inline formatting is only recognised inside a <p>.
                    if inside_paragraph {
                        match tag_name.as_str() {
                            "b" | "strong" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Bold(text));
                                }
                                return;
                            }
                            "i" | "em" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Italic(text));
                                }
                                return;
                            }
                            "a" => {
                                let raw_href = attributes
                                    .iter()
                                    .find(|(k, _)| k == "href")
                                    .map(|(_, v)| v.as_str())
                                    .unwrap_or_default();
                                let href = self.resolve_href(raw_href);
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Link { text, href });
                                }
                                return;
                            }
                            _ => {}
                        }
                    }

                    // Any other element: recurse, preserving paragraph mode.
                    for i in 0..node.child_count() {
                        if let Some(child) = node.child(i as u32) {
                            self.walk_and_collect(&child, source, inside_paragraph);
                        }
                    }
                }
            }
            _ => {
                for i in 0..node.child_count() {
                    if let Some(child) = node.child(i as u32) {
                        self.walk_and_collect(&child, source, inside_paragraph);
                    }
                }
            }
        }
    }
673
    /// Reads an `element` node's start (or self-closing) tag, returning
    /// the tag name and its attributes as `(name, value)` pairs.
    ///
    /// Returns `None` for fragments without a recognisable tag.
    fn parse_element(
        &self,
        element_node: &Node,
        source: &[u8],
    ) -> Option<(String, Vec<(String, String)>)> {
        let tag_container = element_node
            .children(&mut element_node.walk())
            .find(|child| child.kind() == "start_tag" || child.kind() == "self_closing_tag")?;

        let tag_name_node = tag_container
            .children(&mut tag_container.walk())
            .find(|child| child.kind() == "tag_name")?;

        let tag_name = tag_name_node.utf8_text(source).ok()?.to_string();
        let mut attributes = Vec::new();

        for child in tag_container.children(&mut tag_container.walk()) {
            if child.kind() == "attribute" {
                if let Some(pair) = self.parse_attribute(&child, source) {
                    attributes.push(pair);
                }
            }
        }

        Some((tag_name, attributes))
    }
701
702 fn parse_attribute(&self, attr_node: &Node, source: &[u8]) -> Option<(String, String)> {
703 let mut attr_name = None;
704 let mut attr_value = String::new();
705
706 for child in attr_node.children(&mut attr_node.walk()) {
707 match child.kind() {
708 "attribute_name" => {
709 attr_name = child.utf8_text(source).ok().map(|s| s.to_string());
710 }
711 "quoted_attribute_value" => {
712 for grandchild in child.children(&mut child.walk()) {
713 if grandchild.kind() == "attribute_value" {
714 if let Ok(value) = grandchild.utf8_text(source) {
715 attr_value = value.to_string();
716 }
717 }
718 }
719 }
720 "attribute_value" => {
721 if let Ok(value) = child.utf8_text(source) {
722 attr_value = value.to_string();
723 }
724 }
725 _ => {}
726 }
727 }
728
729 attr_name.map(|name| (name, attr_value))
730 }
731
    /// Builds an [`ImageSegment`] from a `<figure>`: the first descendant
    /// `<img>` provides src/alt, and an optional `<figcaption>` the
    /// caption. Returns `None` when no image source is found.
    fn extract_image(&self, figure_node: &Node, source: &[u8]) -> Option<ImageSegment> {
        let mut src = String::new();
        let mut alt = String::new();
        let mut caption = String::new();

        for child in figure_node.children(&mut figure_node.walk()) {
            if child.kind() == "element" {
                if let Some((tag, attrs)) = self.parse_element(&child, source) {
                    if tag == "figcaption" {
                        caption = self.extract_text_from_element(&child, source);
                    } else {
                        self.find_img(&child, source, &tag, &attrs, &mut src, &mut alt);
                    }
                }
            }
        }

        if src.is_empty() {
            return None;
        }

        Some(ImageSegment {
            src: self.resolve_href(&src),
            alt,
            caption,
            section: self.get_current_section_string(),
            section_level: self.get_current_section_level(),
        })
    }
766
767 fn find_img(
769 &self,
770 node: &Node,
771 source: &[u8],
772 tag: &str,
773 attrs: &[(String, String)],
774 src: &mut String,
775 alt: &mut String,
776 ) {
777 if !src.is_empty() {
778 return;
779 }
780 if tag == "img" {
781 if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "src") {
782 *src = v.clone();
783 }
784 if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "alt") {
785 *alt = v.clone();
786 }
787 return;
788 }
789 for child in node.children(&mut node.walk()) {
790 if child.kind() == "element" {
791 if let Some((child_tag, child_attrs)) = self.parse_element(&child, source) {
792 self.find_img(&child, source, &child_tag, &child_attrs, src, alt);
793 }
794 }
795 }
796 }
797}
798
// Default construction panics if the HTML grammar cannot be loaded; use
// `WikiPage::new` when the error should be handled instead.
impl Default for WikiPage {
    fn default() -> Self {
        Self::new().expect("Failed to initialise tree-sitter HTML parser")
    }
}
804
805pub fn strip_references(items: Vec<ArticleItem>) -> Vec<ArticleItem> {
811 items
812 .into_iter()
813 .filter_map(|item| match item {
814 ArticleItem::References(_) => None,
815 ArticleItem::Paragraph(mut seg) => {
816 seg.content.retain(|n| !matches!(n, InlineNode::Ref { .. }));
817 seg.text = seg
819 .content
820 .iter()
821 .map(|n| n.plain_text())
822 .filter(|s| !s.is_empty())
823 .collect::<Vec<_>>()
824 .join(" ");
825 Some(ArticleItem::Paragraph(seg))
826 }
827 other => Some(other),
828 })
829 .collect()
830}
831
/// Fetches the article `title` from the given language's Wikipedia and
/// extracts it into structured items, resolving relative links against
/// that wiki's base URL.
///
/// # Errors
/// Propagates HTTP failures and HTML parse errors.
#[cfg(any(feature = "cli", feature = "web"))]
pub async fn get_text(language: &str, title: &str) -> anyhow::Result<Vec<ArticleItem>> {
    let html = get_page_content_html(language, title).await?;
    let mut page = WikiPage::new()?;
    page.set_base_url(language);
    // `extract_text` already returns `anyhow::Result`, so return it
    // directly instead of the needless `Ok(... ?)` re-wrap.
    page.extract_text(&html)
}
842
/// Fetches the Parsoid HTML of `title` from the given language's
/// Wikipedia REST API.
///
/// # Errors
/// Fails on network errors or any non-2xx HTTP status.
#[cfg(any(feature = "cli", feature = "web"))]
async fn get_page_content_html(language: &str, title: &str) -> anyhow::Result<String> {
    let url = format!("https://{language}.wikipedia.org/api/rest_v1/page/html/{title}?stash=false");
    let client = reqwest::Client::new();
    let response = client
        .get(&url)
        // Wikimedia APIs require an identifying User-Agent.
        .header(
            "User-Agent",
            "wikipedia-article-transform/0.1 (https://github.com/santhoshtr/wikipedia-article-transform)",
        )
        .send()
        .await?;
    if !response.status().is_success() {
        anyhow::bail!("Failed to fetch article: HTTP {}", response.status());
    }
    Ok(response.text().await?)
}
860
861#[cfg(test)]
862mod tests {
863 use super::*;
864
    /// Parses `html` twice: once through `extract_text_plain` as a smoke
    /// test (result discarded), then returns the structured items.
    fn extract(html: &str) -> Vec<ArticleItem> {
        WikiPage::extract_text_plain(html).unwrap();
        let mut page = WikiPage::new().unwrap();
        page.extract_text(html).unwrap()
    }

    /// Collects references to all paragraph segments, in document order.
    fn paragraphs(items: &[ArticleItem]) -> Vec<&TextSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Paragraph(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }

    /// Collects references to all image segments, in document order.
    fn images(items: &[ArticleItem]) -> Vec<&ImageSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Image(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }
896
    // A single <p> becomes one segment with its id and empty section info.
    #[test]
    fn test_basic_paragraph() {
        let items = extract("<html><body><p id=\"p1\">Hello world.</p></body></html>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Hello world.");
        assert_eq!(segs[0].mwid, "p1");
        assert_eq!(segs[0].section, "");
        assert_eq!(segs[0].section_level, 0);
    }

    // Paragraph order is preserved.
    #[test]
    fn test_multiple_paragraphs() {
        let items = extract("<p>First.</p><p>Second.</p><p>Third.</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[0].text, "First.");
        assert_eq!(segs[1].text, "Second.");
        assert_eq!(segs[2].text, "Third.");
    }

    // An h3 under an h2 nests into a breadcrumb.
    #[test]
    fn test_section_tracking() {
        let html = "<h2>History</h2><p>Para one.</p><h3>Early life</h3><p>Para two.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "History - Early life");
    }

    // section_level reflects the innermost heading.
    #[test]
    fn test_section_level() {
        let html = "<h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section_level, 2);
        assert_eq!(segs[1].section_level, 3);
    }

    // A sibling h2 replaces the previous one instead of nesting.
    #[test]
    fn test_section_resets_at_same_level() {
        let html = "<h2>History</h2><p>A.</p><h2>Geography</h2><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "Geography");
    }

    // Infobox content is excluded entirely.
    #[test]
    fn test_excluded_class_infobox() {
        let html = r#"<p>Visible.</p><table class="infobox"><tr><td>Hidden.</td></tr></table><p>Also visible.</p>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert!(segs.iter().all(|s| !s.text.contains("Hidden")));
        assert_eq!(segs.len(), 2);
    }

    // Reference lists are excluded from paragraph extraction.
    #[test]
    fn test_excluded_class_reflist() {
        let html = r#"<p>Main text.</p><div class="reflist"><p>Ref text.</p></div>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Main text.");
    }

    // <script>/<style> bodies never leak into text.
    #[test]
    fn test_script_and_style_skipped() {
        let html = "<p>Real.</p><script>var x=1;</script><style>body{}</style><p>Also real.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 2);
        assert!(segs.iter().all(|s| !s.text.contains("var x")));
    }

    // Empty input yields no items.
    #[test]
    fn test_empty_html() {
        let items = extract("");
        assert!(items.is_empty());
    }
977
    // extract_text_plain joins paragraphs with blank lines.
    #[test]
    fn test_extract_text_plain() {
        let html = "<p>First paragraph.</p><p>Second paragraph.</p>";
        let text = WikiPage::extract_text_plain(html).unwrap();
        assert_eq!(text, "First paragraph.\n\nSecond paragraph.");
    }

    // Default must produce a usable parser.
    #[test]
    fn test_default_impl() {
        let mut page = WikiPage::default();
        let items = page.extract_text("<p>Works.</p>").unwrap();
        let segs = paragraphs(&items);
        assert_eq!(segs[0].text, "Works.");
    }

    // <b> produces a Bold node and contributes to flattened text.
    #[test]
    fn test_inline_bold() {
        let items = extract("<p><b>Bold</b> text</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Bold text");
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "Bold"));
        assert!(matches!(&segs[0].content[1], InlineNode::Text(s) if s == "text"));
    }

    #[test]
    fn test_inline_italic() {
        let items = extract("<p><i>italic</i></p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0], InlineNode::Italic(s) if s == "italic"));
    }

    // <strong>/<em> map to the same nodes as <b>/<i>.
    #[test]
    fn test_inline_strong_em() {
        let items = extract("<p><strong>S</strong> and <em>E</em></p>");
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "S"));
        assert!(matches!(&segs[0].content[2], InlineNode::Italic(s) if s == "E"));
    }

    // Without a base URL, ./ links are kept verbatim.
    #[test]
    fn test_inline_link() {
        let items = extract(r#"<p><a href="./X">anchor</a></p>"#);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href } if text == "anchor" && href == "./X"));
    }

    // With a base URL, ./ links resolve to absolute wiki URLs.
    #[test]
    fn test_inline_link_absolute() {
        let html = r#"<p><a href="./Cryogenics">Cryogenics</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        page.set_base_url("en");
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href }
                if text == "Cryogenics"
                    && href == "https://en.wikipedia.org/wiki/Cryogenics"));
    }

    // Protocol-relative URLs are forced onto https.
    #[test]
    fn test_resolve_href_protocol_relative() {
        let html = r#"<p><a href="//en.wikipedia.org/wiki/Oxygen">O</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { href, .. } if href == "https://en.wikipedia.org/wiki/Oxygen"));
    }
1051
    // Plain formatter emits headings and paragraphs in document order.
    #[test]
    fn test_format_plain_sections() {
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("\nIntro.\n"), "intro paragraph missing");
        assert!(out.contains("## History\n"), "h2 heading missing");
        assert!(out.contains("\nA.\n"), "first section paragraph missing");
        assert!(out.contains("### Early life\n"), "h3 heading missing");
        assert!(out.contains("\nB.\n"), "subsection paragraph missing");
        assert!(out.find("## History").unwrap() < out.find("\nA.\n").unwrap());
        assert!(out.find("### Early life").unwrap() < out.find("\nB.\n").unwrap());
    }

    // JSON formatter nests subsections under their parent section.
    #[test]
    fn test_format_json_tree() {
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["intro"][0], "Intro.");
        assert_eq!(v["sections"][0]["heading"], "History");
        assert_eq!(v["sections"][0]["level"], 2);
        assert_eq!(v["sections"][0]["paragraphs"][0], "A.");
        assert_eq!(v["sections"][0]["subsections"][0]["heading"], "Early life");
        assert_eq!(v["sections"][0]["subsections"][0]["level"], 3);
        assert_eq!(v["sections"][0]["subsections"][0]["paragraphs"][0], "B.");
    }

    // Markdown formatter renders inline nodes with correct spacing.
    #[test]
    fn test_format_markdown_inline() {
        let items = extract(
            "<h2>Title</h2><p><b>Bold</b> and <i>italic</i> and <a href=\"/x\">link</a></p>",
        );
        let out = items.format_markdown();
        assert!(out.contains("## Title"));
        assert!(out.contains("**Bold**"));
        assert!(out.contains("_italic_"));
        assert!(out.contains("[link](/x)"));
        assert!(
            out.contains("**Bold** and"),
            "space after bold missing: {out}"
        );
        assert!(
            out.contains("_italic_ and"),
            "space after italic missing: {out}"
        );
        assert!(
            out.contains("and [link]"),
            "space before link missing: {out}"
        );
    }
1105
    // A thumb figure yields src (https-resolved), alt and caption.
    #[test]
    fn test_image_extraction() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg" class="mw-file-description">
              <img alt="A description" src="//upload.wikimedia.org/thumb/foo.jpg" class="mw-file-element"/>
            </a>
            <figcaption>Caption text here.</figcaption>
        </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].src, "https://upload.wikimedia.org/thumb/foo.jpg");
        assert_eq!(imgs[0].alt, "A description");
        assert_eq!(imgs[0].caption, "Caption text here.");
    }

    // An empty <figcaption> yields an empty caption.
    #[test]
    fn test_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png" class="mw-file-description">
              <img alt="Bar" src="//upload.wikimedia.org/bar.png" class="mw-file-element"/>
            </a>
            <figcaption></figcaption>
        </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].caption, "");
    }

    // Images inherit the current section context.
    #[test]
    fn test_image_section_tracking() {
        let html = r#"<h2>History</h2>
        <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>X caption</figcaption>
        </figure>
        <p>A paragraph.</p>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].section, "History");
        assert_eq!(imgs[0].section_level, 2);
    }

    // Images keep their position between paragraphs.
    #[test]
    fn test_image_interleaved_order() {
        let html = r#"<p>Before.</p>
        <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>Caption</figcaption>
        </figure>
        <p>After.</p>"#;
        let items = extract(html);
        assert!(matches!(&items[0], ArticleItem::Paragraph(s) if s.text == "Before."));
        assert!(matches!(&items[1], ArticleItem::Image(_)));
        assert!(matches!(&items[2], ArticleItem::Paragraph(s) if s.text == "After."));
    }

    #[test]
    fn test_markdown_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
        </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        // NOTE(review): `contains("")` is vacuously true — the expected
        // markdown image snippet (likely `![Alt text](...)`) appears to
        // have been lost; restore the real expectation.
        assert!(out.contains(""));
        assert!(out.contains("_The caption._"));
    }

    #[test]
    fn test_markdown_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png"><img alt="Bar" src="//upload.wikimedia.org/bar.png"/></a>
            <figcaption></figcaption>
        </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        // NOTE(review): vacuous `contains("")` — expected snippet lost.
        assert!(out.contains(""));
        assert!(!out.contains("__"));
    }
1190
    // Plain formatter renders images as "[Image: alt]" plus caption.
    #[test]
    fn test_plain_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
        </figure>"#;
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("[Image: Alt text]"));
        assert!(out.contains("The caption."));
    }

    // JSON formatter places images inside their section object.
    #[test]
    fn test_json_image() {
        let html = r#"<h2>Section</h2>
        <figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
        </figure>
        <p>A paragraph.</p>"#;
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["sections"][0]["images"][0]["alt"], "Alt text");
        assert_eq!(
            v["sections"][0]["images"][0]["src"],
            "https://upload.wikimedia.org/foo.jpg"
        );
        assert_eq!(v["sections"][0]["images"][0]["caption"], "The caption.");
    }

    /// Fixture: a paragraph with two inline citations plus the matching
    /// reference list, in Parsoid-style markup.
    fn ref_html() -> &'static str {
        r#"<p id="p1">Some text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
><a href="./Article#cite_note-Foo-1"><span class="mw-reflink-text">[1]</span></a
></sup> More text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
><a href="./Article#cite_note-Bar-2"><span class="mw-reflink-text">[2]</span></a
></sup></p>
<ol class="mw-references references">
<li id="cite_note-Foo-1" data-mw-footnote-number="1">
<span class="mw-cite-backlink"><a href="./Article#cite_ref-Foo_1-0">↑</a></span>
<span id="mw-reference-text-cite_note-Foo-1" class="mw-reference-text reference-text">Author A. <i>Title One</i>. Publisher, 2020.</span>
</li>
<li id="cite_note-Bar-2" data-mw-footnote-number="2">
<span class="mw-cite-backlink"><a href="./Article#cite_ref-Bar_2-0">↑</a></span>
<span id="mw-reference-text-cite_note-Bar-2" class="mw-reference-text reference-text">Author B. Title Two. Journal, 2021.</span>
</li>
</ol>"#
    }
1241
    // Inline <sup class="mw-ref"> markers become Ref nodes with label and
    // note id.
    #[test]
    fn test_ref_inline_nodes() {
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0], InlineNode::Text(s) if s.contains("Some text")));
        assert!(
            matches!(&segs[0].content[1], InlineNode::Ref { label, note_id }
                if label == "1" && note_id == "cite_note-Foo-1")
        );
        assert!(
            matches!(&segs[0].content[3], InlineNode::Ref { label, note_id }
                if label == "2" && note_id == "cite_note-Bar-2")
        );
    }

    // Citation labels like "[1]" must not leak into flattened text.
    #[test]
    fn test_ref_plain_text_excludes_label() {
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert!(!segs[0].text.contains('['));
        assert!(segs[0].text.contains("Some text"));
        assert!(segs[0].text.contains("More text"));
    }

    // A References item is appended with one entry per footnote.
    #[test]
    fn test_ref_references_item_appended() {
        let items = extract(ref_html());
        let refs = items.iter().find_map(|i| {
            if let ArticleItem::References(r) = i {
                Some(r)
            } else {
                None
            }
        });
        assert!(refs.is_some());
        let refs = refs.unwrap();
        assert_eq!(refs.len(), 2);
        assert!(refs["cite_note-Foo-1"].contains("Title One"));
        assert!(refs["cite_note-Bar-2"].contains("Title Two"));
    }

    // No citations -> no References item.
    #[test]
    fn test_ref_no_refs_no_item() {
        let items = extract("<p>No citations here.</p>");
        assert!(
            !items
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
    }

    // Markdown formatter emits [^n] markers plus a footnote list.
    #[test]
    fn test_ref_markdown_inline_and_list() {
        let items = extract(ref_html());
        let out = items.format_markdown();
        assert!(out.contains("[^1]"), "inline [^1] missing");
        assert!(out.contains("[^2]"), "inline [^2] missing");
        assert!(out.contains("## References"), "References heading missing");
        assert!(out.contains("[^1]: "), "[^1]: definition missing");
        assert!(out.contains("Title One"), "citation text missing");
        assert!(out.contains("[^2]: "), "[^2]: definition missing");
        assert!(out.contains("Title Two"), "citation text missing");
        assert!(out.find("Some text").unwrap() < out.find("## References").unwrap());
    }

    // JSON formatter exposes references as an id -> citation map.
    #[test]
    fn test_ref_json_references_key() {
        let items = extract(ref_html());
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert!(v["references"].is_object(), "references key missing");
        assert!(
            v["references"]["cite_note-Foo-1"]
                .as_str()
                .unwrap()
                .contains("Title One")
        );
        assert!(
            v["references"]["cite_note-Bar-2"]
                .as_str()
                .unwrap()
                .contains("Title Two")
        );
    }

    // strip_references removes the References item and inline Ref nodes,
    // rebuilding paragraph text without citation labels.
    #[test]
    fn test_strip_references() {
        let items = extract(ref_html());
        let stripped = strip_references(items);
        assert!(
            !stripped
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
        let segs = paragraphs(&stripped);
        for seg in segs {
            assert!(
                !seg.content
                    .iter()
                    .any(|n| matches!(n, InlineNode::Ref { .. }))
            );
            assert!(!seg.text.contains('['));
        }
    }
1354}