1pub mod formatters;
41pub use formatters::ArticleFormat;
42
43use std::collections::HashMap;
44
45use serde::Serialize;
46use tree_sitter::{Node, Parser};
47use tree_sitter_html::LANGUAGE;
48
/// One inline fragment of a paragraph, stored in document order.
#[derive(Debug, Clone)]
pub enum InlineNode {
    /// Plain, already-trimmed text.
    Text(String),
    /// Bold text (`<b>` / `<strong>`).
    Bold(String),
    /// Italic text (`<i>` / `<em>`).
    Italic(String),
    /// A hyperlink; `href` has already been resolved against the base URL.
    Link { text: String, href: String },
    /// An inline citation marker; `note_id` keys into the references map.
    Ref { label: String, note_id: String },
}
69
/// An image extracted from a `<figure>` element.
#[derive(Debug, Clone, Serialize)]
pub struct ImageSegment {
    /// Resolved image URL.
    pub src: String,
    /// The `alt` attribute of the `<img>`; may be empty.
    pub alt: String,
    /// Text content of the `<figcaption>`; may be empty.
    pub caption: String,
    /// Section path (titles joined with " - ") the image appears under.
    pub section: String,
    /// Heading level of the innermost enclosing section; 0 before any heading.
    pub section_level: u8,
}
87
/// A top-level article fragment, emitted in document order.
#[derive(Debug, Clone)]
pub enum ArticleItem {
    /// A paragraph of text with its inline structure.
    Paragraph(TextSegment),
    /// An image with caption and section context.
    Image(ImageSegment),
    /// Citation texts keyed by note id; appended once, after all content.
    References(HashMap<String, String>),
}
105
106impl InlineNode {
107 pub fn plain_text(&self) -> &str {
110 match self {
111 InlineNode::Text(s) | InlineNode::Bold(s) | InlineNode::Italic(s) => s,
112 InlineNode::Link { text, .. } => text,
113 InlineNode::Ref { .. } => "",
114 }
115 }
116}
117
/// A paragraph of article text together with its inline structure.
#[derive(Debug, Clone, Serialize)]
pub struct TextSegment {
    /// Flattened plain text of the paragraph (citation labels excluded).
    pub text: String,
    /// Inline nodes making up the paragraph; not serialized.
    #[serde(skip)]
    pub content: Vec<InlineNode>,
    /// The `id` attribute of the source `<p>` element; empty if absent.
    pub mwid: String,
    /// Section path (titles joined with " - ") the paragraph appears under.
    pub section: String,
    /// Heading level of the innermost enclosing section; 0 outside any section.
    pub section_level: u8,
}
137
/// One entry on the stack of headings enclosing the current position.
#[derive(Debug, Clone)]
struct SectionInfo {
    /// Heading text.
    title: String,
    /// Heading level: 1 (`<h1>`) through 6 (`<h6>`).
    level: u8,
}
143
/// Extractor that turns Wikipedia (Parsoid) HTML into a flat list of
/// [`ArticleItem`]s using a tree-sitter HTML parse tree.
pub struct WikiPage {
    // tree-sitter parser configured with the HTML grammar.
    parser: Parser,
    // Items collected so far for the current document.
    items: Vec<ArticleItem>,
    // Stack of headings enclosing the current position in the walk.
    current_sections: Vec<SectionInfo>,
    // Origin used to absolutize "./"-relative hrefs, when configured.
    base_url: Option<String>,
    // Citation text keyed by note id, filled before the main walk.
    references: HashMap<String, String>,
}
171
172impl WikiPage {
173 pub fn new() -> anyhow::Result<Self> {
175 let language = LANGUAGE.into();
176 let mut parser = Parser::new();
177 parser.set_language(&language)?;
178 Ok(WikiPage {
179 parser,
180 items: Vec::new(),
181 current_sections: Vec::new(),
182 base_url: None,
183 references: HashMap::new(),
184 })
185 }
186
187 pub fn set_base_url(&mut self, language: &str) {
199 self.base_url = Some(format!("https://{language}.wikipedia.org/wiki/"));
200 }
201
202 fn resolve_href(&self, href: &str) -> String {
209 if href.starts_with("http://") || href.starts_with("https://") {
210 return href.to_string();
211 }
212 if let Some(rest) = href.strip_prefix("//") {
213 return format!("https://{rest}");
214 }
215 if let Some(path) = href.strip_prefix("./") {
216 if let Some(base) = &self.base_url {
217 return format!("{base}{path}");
218 }
219 }
220 href.to_string()
221 }
222
223 pub fn extract_text(&mut self, html: &str) -> anyhow::Result<Vec<ArticleItem>> {
235 self.items.clear();
236 self.current_sections.clear();
237 self.references.clear();
238 let tree = self
239 .parser
240 .parse(html, None)
241 .ok_or_else(|| anyhow::anyhow!("Failed to parse HTML"))?;
242 let source = html.as_bytes();
243 self.extract_references(&tree.root_node(), source);
244 self.walk_and_collect(&tree.root_node(), source, false);
245 if !self.references.is_empty() {
246 self.items
247 .push(ArticleItem::References(self.references.clone()));
248 }
249 Ok(self.items.clone())
250 }
251
252 pub fn extract_text_plain(html: &str) -> anyhow::Result<String> {
254 let mut page = WikiPage::new()?;
255 let items = page.extract_text(html)?;
256 let text = items
257 .iter()
258 .filter_map(|item| match item {
259 ArticleItem::Paragraph(seg) => {
260 let t = seg.text.trim();
261 if t.is_empty() { None } else { Some(t) }
262 }
263 ArticleItem::Image(_) | ArticleItem::References(_) => None,
264 })
265 .collect::<Vec<_>>()
266 .join("\n\n");
267 Ok(text)
268 }
269
270 fn get_header_level(tag_name: &str) -> Option<u8> {
271 match tag_name {
272 "h1" => Some(1),
273 "h2" => Some(2),
274 "h3" => Some(3),
275 "h4" => Some(4),
276 "h5" => Some(5),
277 "h6" => Some(6),
278 _ => None,
279 }
280 }
281
282 fn extract_text_from_element(&self, node: &Node, source: &[u8]) -> String {
283 let mut text = String::new();
284 for child in node.children(&mut node.walk()) {
285 match child.kind() {
286 "text" => {
287 if let Ok(t) = child.utf8_text(source) {
288 text.push_str(t.trim());
289 }
290 }
291 "element" => {
292 let child_text = self.extract_text_from_element(&child, source);
293 if !child_text.is_empty() {
294 if !text.is_empty() {
295 text.push(' ');
296 }
297 text.push_str(&child_text);
298 }
299 }
300 _ => {}
301 }
302 }
303 text
304 }
305
306 fn update_sections(&mut self, level: u8, title: String) {
307 self.current_sections
308 .retain(|section| section.level < level);
309 self.current_sections.push(SectionInfo { title, level });
310 }
311
312 fn get_current_section_string(&self) -> String {
313 self.current_sections
314 .iter()
315 .map(|s| s.title.as_str())
316 .collect::<Vec<_>>()
317 .join(" - ")
318 }
319
320 fn get_current_section_level(&self) -> u8 {
321 self.current_sections.last().map(|s| s.level).unwrap_or(0)
322 }
323
    /// Scans the tree for `<ol class="references">` lists and records each
    /// `<li id="…">`'s citation text into `self.references`, keyed by note id.
    fn extract_references(&mut self, node: &Node, source: &[u8]) {
        match node.kind() {
            "element" => {
                if let Some((tag, attrs)) = self.parse_element(node, source) {
                    let class = attrs
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");
                    let classes: Vec<&str> = class.split_whitespace().collect();

                    if tag == "ol" && classes.contains(&"references") {
                        // Each <li id="…"> inside the list is one citation.
                        for child in node.children(&mut node.walk()) {
                            if child.kind() != "element" {
                                continue;
                            }
                            if let Some((child_tag, child_attrs)) =
                                self.parse_element(&child, source)
                            {
                                if child_tag != "li" {
                                    continue;
                                }
                                // The id attribute is the note id inline refs point at.
                                let note_id = child_attrs
                                    .iter()
                                    .find(|(k, _)| k == "id")
                                    .map(|(_, v)| v.clone())
                                    .unwrap_or_default();
                                if note_id.is_empty() {
                                    continue;
                                }
                                let citation = self.find_reference_text(&child, source);
                                if !citation.is_empty() {
                                    self.references.insert(note_id, citation);
                                }
                            }
                        }
                        // Done with this list; do not descend into it again.
                        return;
                    }

                    for child in node.children(&mut node.walk()) {
                        self.extract_references(&child, source);
                    }
                }
                // NOTE(review): when parse_element returns None the children are
                // not visited — presumably unreachable for well-formed element
                // nodes, but worth confirming against tree-sitter-html output.
            }
            _ => {
                // Non-element nodes (document root, fragments, …): recurse.
                for child in node.children(&mut node.walk()) {
                    self.extract_references(&child, source);
                }
            }
        }
    }
383
384 fn find_reference_text(&self, li_node: &Node, source: &[u8]) -> String {
386 for child in li_node.children(&mut li_node.walk()) {
387 if child.kind() != "element" {
388 continue;
389 }
390 if let Some((tag, attrs)) = self.parse_element(&child, source) {
391 let class = attrs
392 .iter()
393 .find(|(k, _)| k == "class")
394 .map(|(_, v)| v.as_str())
395 .unwrap_or("");
396 if tag == "span" && class.split_whitespace().any(|c| c == "reference-text") {
397 return self.extract_text_from_element(&child, source);
398 }
399 let found = self.find_reference_text(&child, source);
401 if !found.is_empty() {
402 return found;
403 }
404 }
405 }
406 String::new()
407 }
408
409 fn extract_inline_ref(&self, sup_node: &Node, source: &[u8]) -> Option<InlineNode> {
414 let mut note_id = String::new();
415 let mut label = String::new();
416
417 self.find_ref_parts(sup_node, source, &mut note_id, &mut label);
418
419 if note_id.is_empty() || label.is_empty() {
420 return None;
421 }
422 Some(InlineNode::Ref { label, note_id })
423 }
424
425 fn find_ref_parts(&self, node: &Node, source: &[u8], note_id: &mut String, label: &mut String) {
428 for child in node.children(&mut node.walk()) {
429 if child.kind() != "element" {
430 continue;
431 }
432 if let Some((tag, attrs)) = self.parse_element(&child, source) {
433 match tag.as_str() {
434 "a" => {
435 if note_id.is_empty() {
436 let href = attrs
437 .iter()
438 .find(|(k, _)| k == "href")
439 .map(|(_, v)| v.as_str())
440 .unwrap_or_default();
441 if let Some(fragment) = href.rsplit_once('#') {
443 *note_id = fragment.1.to_string();
444 }
445 }
446 self.find_ref_parts(&child, source, note_id, label);
447 }
448 "span" => {
449 let class = attrs
450 .iter()
451 .find(|(k, _)| k == "class")
452 .map(|(_, v)| v.as_str())
453 .unwrap_or("");
454 if class.split_whitespace().any(|c| c == "mw-reflink-text") {
455 let raw = self.extract_text_from_element(&child, source);
457 *label = raw
458 .trim_matches(|c: char| c == '[' || c == ']' || c.is_whitespace())
459 .to_string();
460 } else {
461 self.find_ref_parts(&child, source, note_id, label);
462 }
463 }
464 _ => {
465 self.find_ref_parts(&child, source, note_id, label);
466 }
467 }
468 }
469 }
470 }
471
472 fn push_inline(&mut self, node: InlineNode) {
474 let last_seg = self.items.iter_mut().rev().find_map(|item| {
475 if let ArticleItem::Paragraph(seg) = item {
476 Some(seg)
477 } else {
478 None
479 }
480 });
481 if let Some(seg) = last_seg {
482 let plain = node.plain_text().to_string();
483 if !seg.text.is_empty() && !plain.is_empty() {
484 if !seg.text.ends_with(' ') {
485 seg.text.push(' ');
486 }
487 }
488 seg.text.push_str(plain.trim());
489 seg.content.push(node);
490 }
491 }
492
493 fn collect_inline_text(&self, node: &Node, source: &[u8]) -> String {
495 let mut text = String::new();
496 for child in node.children(&mut node.walk()) {
497 match child.kind() {
498 "text" => {
499 if let Ok(t) = child.utf8_text(source) {
500 let trimmed = t.trim();
501 if !trimmed.is_empty() {
502 if !text.is_empty() {
503 text.push(' ');
504 }
505 text.push_str(trimmed);
506 }
507 }
508 }
509 "element" => {
510 let child_text = self.collect_inline_text(&child, source);
511 if !child_text.is_empty() {
512 if !text.is_empty() {
513 text.push(' ');
514 }
515 text.push_str(&child_text);
516 }
517 }
518 _ => {}
519 }
520 }
521 text
522 }
523
    /// Depth-first traversal that flattens the parse tree into `self.items`.
    ///
    /// `inside_paragraph` is true while below a `<p>` element; it switches the
    /// handling of `<b>/<strong>`, `<i>/<em>`, `<a>` and `<sup class="mw-ref">`
    /// from recursion to inline-node emission.
    fn walk_and_collect(&mut self, node: &Node, source: &[u8], inside_paragraph: bool) {
        match node.kind() {
            "text" => {
                if let Ok(text) = node.utf8_text(source) {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        // Text before any paragraph: open an implicit,
                        // id-less paragraph to hold it.
                        if self.items.is_empty() {
                            self.items.push(ArticleItem::Paragraph(TextSegment {
                                text: String::new(),
                                content: Vec::new(),
                                mwid: String::new(),
                                section: self.get_current_section_string(),
                                section_level: self.get_current_section_level(),
                            }));
                        }
                        self.push_inline(InlineNode::Text(trimmed.to_string()));
                    }
                }
            }
            // Scripts and styles never contribute article text.
            "script_element" | "style_element" => (),
            "element" => {
                if let Some((tag_name, attributes)) = self.parse_element(node, source) {
                    if tag_name == "link" {
                        return;
                    }

                    let class_attr = attributes
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");

                    // Inline citation marker: emit a Ref node, do not recurse.
                    if inside_paragraph
                        && tag_name == "sup"
                        && class_attr.split_whitespace().any(|c| c == "mw-ref")
                    {
                        if let Some(r) = self.extract_inline_ref(node, source) {
                            self.push_inline(r);
                        }
                        return;
                    }

                    // Containers holding navigation/citation boilerplate rather
                    // than article prose; their whole subtree is skipped.
                    const EXCLUDED_CLASSES: &[&str] = &[
                        "shortdescription",
                        "hatnote",
                        "infobox",
                        "reference",
                        "navbox",
                        "noprint",
                        "reflist",
                        "citation",
                        "mw-references",
                    ];
                    if EXCLUDED_CLASSES
                        .iter()
                        .any(|c| class_attr.split_whitespace().any(|cls| cls == *c))
                    {
                        return;
                    }

                    // Headings update the section stack; their text is not
                    // emitted as paragraph content.
                    if let Some(level) = Self::get_header_level(&tag_name) {
                        let header_text = self.extract_text_from_element(node, source);
                        if !header_text.is_empty() {
                            self.update_sections(level, header_text);
                        }
                        return;
                    }

                    if tag_name == "p" {
                        let mwid = attributes
                            .iter()
                            .find(|(k, _)| k == "id")
                            .map(|(_, v)| v.clone())
                            .unwrap_or_default();
                        self.items.push(ArticleItem::Paragraph(TextSegment {
                            text: String::new(),
                            content: Vec::new(),
                            mwid,
                            section: self.get_current_section_string(),
                            section_level: self.get_current_section_level(),
                        }));
                        // Walk children with inside_paragraph = true so inline
                        // markup attaches to the paragraph just pushed.
                        for i in 0..node.child_count() {
                            if let Some(child) = node.child(i as u32) {
                                self.walk_and_collect(&child, source, true);
                            }
                        }
                        return;
                    }

                    if tag_name == "figure" {
                        if let Some(img) = self.extract_image(node, source) {
                            self.items.push(ArticleItem::Image(img));
                        }
                        return;
                    }

                    // Inline markup inside a paragraph becomes inline nodes.
                    if inside_paragraph {
                        match tag_name.as_str() {
                            "b" | "strong" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Bold(text));
                                }
                                return;
                            }
                            "i" | "em" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Italic(text));
                                }
                                return;
                            }
                            "a" => {
                                let raw_href = attributes
                                    .iter()
                                    .find(|(k, _)| k == "href")
                                    .map(|(_, v)| v.as_str())
                                    .unwrap_or_default();
                                let href = self.resolve_href(raw_href);
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Link { text, href });
                                }
                                return;
                            }
                            _ => {}
                        }
                    }

                    // Any other element: recurse with the same paragraph state.
                    for i in 0..node.child_count() {
                        if let Some(child) = node.child(i as u32) {
                            self.walk_and_collect(&child, source, inside_paragraph);
                        }
                    }
                }
            }
            _ => {
                for i in 0..node.child_count() {
                    if let Some(child) = node.child(i as u32) {
                        self.walk_and_collect(&child, source, inside_paragraph);
                    }
                }
            }
        }
    }
673
674 fn parse_element(
675 &self,
676 element_node: &Node,
677 source: &[u8],
678 ) -> Option<(String, Vec<(String, String)>)> {
679 let tag_container = element_node
681 .children(&mut element_node.walk())
682 .find(|child| child.kind() == "start_tag" || child.kind() == "self_closing_tag")?;
683
684 let tag_name_node = tag_container
685 .children(&mut tag_container.walk())
686 .find(|child| child.kind() == "tag_name")?;
687
688 let tag_name = tag_name_node.utf8_text(source).ok()?.to_string();
689 let mut attributes = Vec::new();
690
691 for child in tag_container.children(&mut tag_container.walk()) {
692 if child.kind() == "attribute" {
693 if let Some(pair) = self.parse_attribute(&child, source) {
694 attributes.push(pair);
695 }
696 }
697 }
698
699 Some((tag_name, attributes))
700 }
701
702 fn parse_attribute(&self, attr_node: &Node, source: &[u8]) -> Option<(String, String)> {
703 let mut attr_name = None;
704 let mut attr_value = String::new();
705
706 for child in attr_node.children(&mut attr_node.walk()) {
707 match child.kind() {
708 "attribute_name" => {
709 attr_name = child.utf8_text(source).ok().map(|s| s.to_string());
710 }
711 "quoted_attribute_value" => {
712 for grandchild in child.children(&mut child.walk()) {
713 if grandchild.kind() == "attribute_value" {
714 if let Ok(value) = grandchild.utf8_text(source) {
715 attr_value = value.to_string();
716 }
717 }
718 }
719 }
720 "attribute_value" => {
721 if let Ok(value) = child.utf8_text(source) {
722 attr_value = value.to_string();
723 }
724 }
725 _ => {}
726 }
727 }
728
729 attr_name.map(|name| (name, attr_value))
730 }
731
732 fn extract_image(&self, figure_node: &Node, source: &[u8]) -> Option<ImageSegment> {
737 let mut src = String::new();
738 let mut alt = String::new();
739 let mut caption = String::new();
740
741 for child in figure_node.children(&mut figure_node.walk()) {
742 if child.kind() == "element" {
743 if let Some((tag, attrs)) = self.parse_element(&child, source) {
744 if tag == "figcaption" {
745 caption = self.extract_text_from_element(&child, source);
746 } else {
747 self.find_img(&child, source, &tag, &attrs, &mut src, &mut alt);
749 }
750 }
751 }
752 }
753
754 if src.is_empty() {
755 return None;
756 }
757
758 Some(ImageSegment {
759 src: self.resolve_href(&src),
760 alt,
761 caption,
762 section: self.get_current_section_string(),
763 section_level: self.get_current_section_level(),
764 })
765 }
766
767 fn find_img(
769 &self,
770 node: &Node,
771 source: &[u8],
772 tag: &str,
773 attrs: &[(String, String)],
774 src: &mut String,
775 alt: &mut String,
776 ) {
777 if !src.is_empty() {
778 return;
779 }
780 if tag == "img" {
781 if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "src") {
782 *src = v.clone();
783 }
784 if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "alt") {
785 *alt = v.clone();
786 }
787 return;
788 }
789 for child in node.children(&mut node.walk()) {
790 if child.kind() == "element" {
791 if let Some((child_tag, child_attrs)) = self.parse_element(&child, source) {
792 self.find_img(&child, source, &child_tag, &child_attrs, src, alt);
793 }
794 }
795 }
796 }
797}
798
impl Default for WikiPage {
    /// Panics if the tree-sitter HTML grammar cannot be loaded; prefer
    /// [`WikiPage::new`] when fallible construction is acceptable.
    fn default() -> Self {
        Self::new().expect("Failed to initialise tree-sitter HTML parser")
    }
}
804
805pub fn strip_references(items: Vec<ArticleItem>) -> Vec<ArticleItem> {
811 items
812 .into_iter()
813 .filter_map(|item| match item {
814 ArticleItem::References(_) => None,
815 ArticleItem::Paragraph(mut seg) => {
816 seg.content.retain(|n| !matches!(n, InlineNode::Ref { .. }));
817 seg.text = seg
819 .content
820 .iter()
821 .map(|n| n.plain_text())
822 .filter(|s| !s.is_empty())
823 .collect::<Vec<_>>()
824 .join(" ");
825 Some(ArticleItem::Paragraph(seg))
826 }
827 other => Some(other),
828 })
829 .collect()
830}
831
#[cfg(any(feature = "cli", feature = "web"))]
/// Fetches the article `title` from the `language` Wikipedia and extracts it
/// into structured items, with links resolved against that wiki's base URL.
///
/// # Errors
/// Returns an error if the HTTP fetch fails or the HTML cannot be parsed.
pub async fn get_text(language: &str, title: &str) -> anyhow::Result<Vec<ArticleItem>> {
    let html = get_page_content_html(language, title).await?;
    let mut page = WikiPage::new()?;
    page.set_base_url(language);
    // extract_text already returns anyhow::Result; no need to re-wrap with Ok(…?).
    page.extract_text(&html)
}
842
#[cfg(any(feature = "cli", feature = "web"))]
/// Downloads the Parsoid HTML of a page from the Wikipedia REST API.
///
/// # Errors
/// Fails on network errors or any non-success HTTP status.
async fn get_page_content_html(language: &str, title: &str) -> anyhow::Result<String> {
    let normalized_title = normalize_title(title);
    let url = format!(
        "https://{language}.wikipedia.org/api/rest_v1/page/html/{normalized_title}?stash=false"
    );
    // The REST API requires a descriptive User-Agent.
    let response = reqwest::Client::new()
        .get(&url)
        .header(
            "User-Agent",
            "wikipedia-article-transform/0.1 (https://github.com/santhoshtr/wikipedia-article-transform)",
        )
        .send()
        .await?;
    if !response.status().is_success() {
        anyhow::bail!("Failed to fetch article: HTTP {}", response.status());
    }
    Ok(response.text().await?)
}
863
#[cfg(any(feature = "cli", feature = "web"))]
/// Collapses all whitespace runs in a page title into single underscores,
/// matching MediaWiki's URL title form (e.g. "Marie Curie" → "Marie_Curie").
fn normalize_title(title: &str) -> String {
    let mut out = String::with_capacity(title.len());
    for word in title.split_whitespace() {
        if !out.is_empty() {
            out.push('_');
        }
        out.push_str(word);
    }
    out
}
868
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(any(feature = "cli", feature = "web"))]
    #[test]
    fn test_normalize_title_replaces_whitespace_with_underscore() {
        assert_eq!(normalize_title("Marie Curie"), "Marie_Curie");
        assert_eq!(normalize_title(" Marie Curie "), "Marie_Curie");
        assert_eq!(normalize_title("Ada\tLovelace"), "Ada_Lovelace");
    }

    // Parses `html` into structured items. The plain-text helper is invoked
    // first purely as a smoke check that it does not panic on the input.
    fn extract(html: &str) -> Vec<ArticleItem> {
        WikiPage::extract_text_plain(html).unwrap();
        let mut page = WikiPage::new().unwrap();
        page.extract_text(html).unwrap()
    }

    // All paragraph segments of `items`, in document order.
    fn paragraphs(items: &[ArticleItem]) -> Vec<&TextSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Paragraph(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }

    // All image segments of `items`, in document order.
    fn images(items: &[ArticleItem]) -> Vec<&ImageSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Image(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }

    #[test]
    fn test_basic_paragraph() {
        let items = extract("<html><body><p id=\"p1\">Hello world.</p></body></html>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Hello world.");
        assert_eq!(segs[0].mwid, "p1");
        assert_eq!(segs[0].section, "");
        assert_eq!(segs[0].section_level, 0);
    }

    #[test]
    fn test_multiple_paragraphs() {
        let items = extract("<p>First.</p><p>Second.</p><p>Third.</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[0].text, "First.");
        assert_eq!(segs[1].text, "Second.");
        assert_eq!(segs[2].text, "Third.");
    }

    #[test]
    fn test_section_tracking() {
        let html = "<h2>History</h2><p>Para one.</p><h3>Early life</h3><p>Para two.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "History - Early life");
    }

    #[test]
    fn test_section_level() {
        let html = "<h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section_level, 2);
        assert_eq!(segs[1].section_level, 3);
    }

    #[test]
    fn test_section_resets_at_same_level() {
        let html = "<h2>History</h2><p>A.</p><h2>Geography</h2><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "Geography");
    }

    #[test]
    fn test_excluded_class_infobox() {
        let html = r#"<p>Visible.</p><table class="infobox"><tr><td>Hidden.</td></tr></table><p>Also visible.</p>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert!(segs.iter().all(|s| !s.text.contains("Hidden")));
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_excluded_class_reflist() {
        let html = r#"<p>Main text.</p><div class="reflist"><p>Ref text.</p></div>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Main text.");
    }

    #[test]
    fn test_script_and_style_skipped() {
        let html = "<p>Real.</p><script>var x=1;</script><style>body{}</style><p>Also real.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 2);
        assert!(segs.iter().all(|s| !s.text.contains("var x")));
    }

    #[test]
    fn test_empty_html() {
        let items = extract("");
        assert!(items.is_empty());
    }

    #[test]
    fn test_extract_text_plain() {
        let html = "<p>First paragraph.</p><p>Second paragraph.</p>";
        let text = WikiPage::extract_text_plain(html).unwrap();
        assert_eq!(text, "First paragraph.\n\nSecond paragraph.");
    }

    #[test]
    fn test_default_impl() {
        let mut page = WikiPage::default();
        let items = page.extract_text("<p>Works.</p>").unwrap();
        let segs = paragraphs(&items);
        assert_eq!(segs[0].text, "Works.");
    }

    #[test]
    fn test_inline_bold() {
        let items = extract("<p><b>Bold</b> text</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Bold text");
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "Bold"));
        assert!(matches!(&segs[0].content[1], InlineNode::Text(s) if s == "text"));
    }

    #[test]
    fn test_inline_italic() {
        let items = extract("<p><i>italic</i></p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0], InlineNode::Italic(s) if s == "italic"));
    }

    #[test]
    fn test_inline_strong_em() {
        let items = extract("<p><strong>S</strong> and <em>E</em></p>");
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "S"));
        assert!(matches!(&segs[0].content[2], InlineNode::Italic(s) if s == "E"));
    }

    #[test]
    fn test_inline_link() {
        let items = extract(r#"<p><a href="./X">anchor</a></p>"#);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        // No base URL configured, so "./X" stays relative.
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href } if text == "anchor" && href == "./X"));
    }

    #[test]
    fn test_inline_link_absolute() {
        let html = r#"<p><a href="./Cryogenics">Cryogenics</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        page.set_base_url("en");
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href }
                if text == "Cryogenics"
                    && href == "https://en.wikipedia.org/wiki/Cryogenics"));
    }

    #[test]
    fn test_resolve_href_protocol_relative() {
        let html = r#"<p><a href="//en.wikipedia.org/wiki/Oxygen">O</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { href, .. } if href == "https://en.wikipedia.org/wiki/Oxygen"));
    }

    #[test]
    fn test_format_plain_sections() {
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("\nIntro.\n"), "intro paragraph missing");
        assert!(out.contains("## History\n"), "h2 heading missing");
        assert!(out.contains("\nA.\n"), "first section paragraph missing");
        assert!(out.contains("### Early life\n"), "h3 heading missing");
        assert!(out.contains("\nB.\n"), "subsection paragraph missing");
        assert!(out.find("## History").unwrap() < out.find("\nA.\n").unwrap());
        assert!(out.find("### Early life").unwrap() < out.find("\nB.\n").unwrap());
    }

    #[test]
    fn test_format_json_tree() {
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["intro"][0]["text"], "Intro.");
        assert_eq!(v["intro"][0]["citations"].as_array().unwrap().len(), 0);
        assert_eq!(v["sections"][0]["heading"], "History");
        assert_eq!(v["sections"][0]["level"], 2);
        assert_eq!(v["sections"][0]["paragraphs"][0]["text"], "A.");
        assert_eq!(
            v["sections"][0]["paragraphs"][0]["citations"]
                .as_array()
                .unwrap()
                .len(),
            0
        );
        assert_eq!(v["sections"][0]["subsections"][0]["heading"], "Early life");
        assert_eq!(v["sections"][0]["subsections"][0]["level"], 3);
        assert_eq!(
            v["sections"][0]["subsections"][0]["paragraphs"][0]["text"],
            "B."
        );
    }

    #[test]
    fn test_format_markdown_inline() {
        let items = extract(
            "<h2>Title</h2><p><b>Bold</b> and <i>italic</i> and <a href=\"/x\">link</a></p>",
        );
        let out = items.format_markdown();
        assert!(out.contains("## Title"));
        assert!(out.contains("**Bold**"));
        assert!(out.contains("_italic_"));
        assert!(out.contains("[link](/x)"));
        assert!(
            out.contains("**Bold** and"),
            "space after bold missing: {out}"
        );
        assert!(
            out.contains("_italic_ and"),
            "space after italic missing: {out}"
        );
        assert!(
            out.contains("and [link]"),
            "space before link missing: {out}"
        );
    }

    #[test]
    fn test_image_extraction() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg" class="mw-file-description">
            <img alt="A description" src="//upload.wikimedia.org/thumb/foo.jpg" class="mw-file-element"/>
            </a>
            <figcaption>Caption text here.</figcaption>
            </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].src, "https://upload.wikimedia.org/thumb/foo.jpg");
        assert_eq!(imgs[0].alt, "A description");
        assert_eq!(imgs[0].caption, "Caption text here.");
    }

    #[test]
    fn test_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png" class="mw-file-description">
            <img alt="Bar" src="//upload.wikimedia.org/bar.png" class="mw-file-element"/>
            </a>
            <figcaption></figcaption>
            </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].caption, "");
    }

    #[test]
    fn test_image_section_tracking() {
        let html = r#"<h2>History</h2>
            <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>X caption</figcaption>
            </figure>
            <p>A paragraph.</p>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].section, "History");
        assert_eq!(imgs[0].section_level, 2);
    }

    #[test]
    fn test_image_interleaved_order() {
        let html = r#"<p>Before.</p>
            <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>Caption</figcaption>
            </figure>
            <p>After.</p>"#;
        let items = extract(html);
        assert!(matches!(&items[0], ArticleItem::Paragraph(s) if s.text == "Before."));
        assert!(matches!(&items[1], ArticleItem::Image(_)));
        assert!(matches!(&items[2], ArticleItem::Paragraph(s) if s.text == "After."));
    }

    #[test]
    fn test_markdown_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
            </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        // NOTE(review): the expected substring below appears to have been lost —
        // `contains("")` is vacuously true. Restore the intended image-markdown
        // pattern (likely `![Alt text](…)`). TODO confirm against formatters.
        assert!(out.contains(""));
        assert!(out.contains("_The caption._"));
    }

    #[test]
    fn test_markdown_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png"><img alt="Bar" src="//upload.wikimedia.org/bar.png"/></a>
            <figcaption></figcaption>
            </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        // NOTE(review): same as above — `contains("")` is vacuously true.
        assert!(out.contains(""));
        assert!(!out.contains("__"));
    }

    #[test]
    fn test_plain_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
            </figure>"#;
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("[Image: Alt text]"));
        assert!(out.contains("The caption."));
    }

    #[test]
    fn test_json_image() {
        let html = r#"<h2>Section</h2>
            <figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
            </figure>
            <p>A paragraph.</p>"#;
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["sections"][0]["images"][0]["alt"], "Alt text");
        assert_eq!(
            v["sections"][0]["images"][0]["src"],
            "https://upload.wikimedia.org/foo.jpg"
        );
        assert_eq!(v["sections"][0]["images"][0]["caption"], "The caption.");
    }

    // Fixture: a paragraph with two inline citations plus the matching
    // `<ol class="references">` list.
    fn ref_html() -> &'static str {
        r#"<p id="p1">Some text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
            ><a href="./Article#cite_note-Foo-1"><span class="mw-reflink-text">[1]</span></a
            ></sup> More text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
            ><a href="./Article#cite_note-Bar-2"><span class="mw-reflink-text">[2]</span></a
            ></sup></p>
            <ol class="mw-references references">
            <li id="cite_note-Foo-1" data-mw-footnote-number="1">
            <span class="mw-cite-backlink"><a href="./Article#cite_ref-Foo_1-0">↑</a></span>
            <span id="mw-reference-text-cite_note-Foo-1" class="mw-reference-text reference-text">Author A. <i>Title One</i>. Publisher, 2020.</span>
            </li>
            <li id="cite_note-Bar-2" data-mw-footnote-number="2">
            <span class="mw-cite-backlink"><a href="./Article#cite_ref-Bar_2-0">↑</a></span>
            <span id="mw-reference-text-cite_note-Bar-2" class="mw-reference-text reference-text">Author B. Title Two. Journal, 2021.</span>
            </li>
            </ol>"#
    }

    #[test]
    fn test_ref_inline_nodes() {
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0], InlineNode::Text(s) if s.contains("Some text")));
        assert!(
            matches!(&segs[0].content[1], InlineNode::Ref { label, note_id }
                if label == "1" && note_id == "cite_note-Foo-1")
        );
        assert!(
            matches!(&segs[0].content[3], InlineNode::Ref { label, note_id }
                if label == "2" && note_id == "cite_note-Bar-2")
        );
    }

    #[test]
    fn test_ref_plain_text_excludes_label() {
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert!(!segs[0].text.contains('['));
        assert!(segs[0].text.contains("Some text"));
        assert!(segs[0].text.contains("More text"));
    }

    #[test]
    fn test_ref_references_item_appended() {
        let items = extract(ref_html());
        let refs = items.iter().find_map(|i| {
            if let ArticleItem::References(r) = i {
                Some(r)
            } else {
                None
            }
        });
        assert!(refs.is_some());
        let refs = refs.unwrap();
        assert_eq!(refs.len(), 2);
        assert!(refs["cite_note-Foo-1"].contains("Title One"));
        assert!(refs["cite_note-Bar-2"].contains("Title Two"));
    }

    #[test]
    fn test_ref_no_refs_no_item() {
        let items = extract("<p>No citations here.</p>");
        assert!(
            !items
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
    }

    #[test]
    fn test_ref_markdown_inline_and_list() {
        let items = extract(ref_html());
        let out = items.format_markdown();
        assert!(out.contains("[^1]"), "inline [^1] missing");
        assert!(out.contains("[^2]"), "inline [^2] missing");
        assert!(out.contains("## References"), "References heading missing");
        assert!(out.contains("[^1]: "), "[^1]: definition missing");
        assert!(out.contains("Title One"), "citation text missing");
        assert!(out.contains("[^2]: "), "[^2]: definition missing");
        assert!(out.contains("Title Two"), "citation text missing");
        assert!(out.find("Some text").unwrap() < out.find("## References").unwrap());
    }

    #[test]
    fn test_ref_json_references_key() {
        let items = extract(ref_html());
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert!(v["references"].is_object(), "references key missing");
        assert!(
            v["references"]["cite_note-Foo-1"]
                .as_str()
                .unwrap()
                .contains("Title One")
        );
        assert!(
            v["references"]["cite_note-Bar-2"]
                .as_str()
                .unwrap()
                .contains("Title Two")
        );

        let para = &v["intro"][0];
        let citations = para["citations"].as_array().unwrap();
        assert_eq!(citations.len(), 2);
        assert_eq!(citations[0]["label"], "1");
        assert!(citations[0]["text"].as_str().unwrap().contains("Title One"));
        assert_eq!(citations[1]["label"], "2");
        assert!(citations[1]["text"].as_str().unwrap().contains("Title Two"));
    }

    #[test]
    fn test_strip_references() {
        let items = extract(ref_html());
        let stripped = strip_references(items);
        assert!(
            !stripped
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
        let segs = paragraphs(&stripped);
        for seg in segs {
            assert!(
                !seg.content
                    .iter()
                    .any(|n| matches!(n, InlineNode::Ref { .. }))
            );
            assert!(!seg.text.contains('['));
        }
    }
}