Skip to main content

wikipedia_article_transform/
lib.rs

1//! Extract plain text from Wikipedia article HTML.
2//!
3//! This crate parses Wikipedia article HTML using [tree-sitter](https://tree-sitter.github.io/)
4//! and extracts clean, structured plain text — skipping navigation, infoboxes, references,
5//! and other non-prose content.
6//!
7//! # Quick start
8//!
9//! ```rust
10//! use wikipedia_article_transform::WikiPage;
11//!
12//! let html = r#"<html><body><p id="intro">Hello world.</p></body></html>"#;
13//! let text = WikiPage::extract_text_plain(html).unwrap();
14//! assert_eq!(text, "Hello world.");
15//! ```
16//!
17//! For richer output with section tracking and inline structure, use [`WikiPage::extract_text`]:
18//!
19//! ```rust
20//! use wikipedia_article_transform::{WikiPage, ArticleItem};
21//!
22//! let html = r#"<html><body><h2>History</h2><p id="p1">Some text.</p></body></html>"#;
23//! let mut page = WikiPage::new().unwrap();
24//! let items = page.extract_text(html).unwrap();
25//! if let ArticleItem::Paragraph(seg) = &items[0] {
26//!     assert_eq!(seg.section, "History");
27//!     assert_eq!(seg.section_level, 2);
28//!     assert_eq!(seg.text, "Some text.");
29//! }
30//! ```
31//!
32//! # Optional feature: `fetch`
33//!
34//! Enable the `fetch` feature to fetch Wikipedia articles directly via the REST API:
35//!
36//! ```toml
//! wikipedia-article-transform = { version = "0.1", features = ["fetch"] }
38//! ```
39
40pub mod formatters;
41pub use formatters::ArticleFormat;
42
43use std::collections::HashMap;
44
45use serde::Serialize;
46use tree_sitter::{Node, Parser};
47use tree_sitter_html::LANGUAGE;
48
/// An inline content node within a paragraph.
///
/// Captures the inline structure of paragraph text so formatters can render
/// bold, italic, link, and citation reference markup.
#[derive(Debug, Clone)]
pub enum InlineNode {
    /// Plain text.
    Text(String),
    /// Bold text (`<b>` or `<strong>`).
    Bold(String),
    /// Italic text (`<i>` or `<em>`).
    Italic(String),
    /// A hyperlink (`<a href="...">`). `href` is resolved against the base
    /// URL when one was set via `WikiPage::set_base_url`; otherwise relative
    /// `./…` hrefs are kept as-is.
    Link { text: String, href: String },
    /// A citation reference (`<sup class="mw-ref reference">`).
    ///
    /// `label` is the display number (e.g. `"1"`), `note_id` is the fragment
    /// identifying the entry in the reference list (e.g. `"cite_note-Foo-1"`).
    Ref { label: String, note_id: String },
}
69
/// An image extracted from a `<figure>` block in a Wikipedia article.
///
/// Wikipedia wraps images in `<figure>` elements containing an `<img>` and an
/// optional `<figcaption>`. Images appear between paragraphs, not inside them.
#[derive(Debug, Clone, Serialize)]
pub struct ImageSegment {
    /// Resolved URL of the image (thumbnail size as served by Wikimedia).
    /// Protocol-relative `//…` sources are rewritten to `https://…`.
    pub src: String,
    /// Alt text from the `<img alt="...">` attribute; empty when absent.
    pub alt: String,
    /// Plain text of the `<figcaption>` element, if present; empty otherwise.
    pub caption: String,
    /// The section heading path at the point where the image appears,
    /// e.g. `"History - Early life"`.
    pub section: String,
    /// The heading level of the current section (1–6). 0 if before any heading.
    pub section_level: u8,
}
87
/// A single item extracted from a Wikipedia article, in document order.
///
/// Paragraphs and images are interleaved as they appear in the source HTML,
/// so formatters can reproduce the original reading order. If any references
/// were found, a single [`ArticleItem::References`] item is appended last;
/// articles with no references get no such item.
#[derive(Debug, Clone)]
pub enum ArticleItem {
    /// A paragraph extracted from a `<p>` element.
    Paragraph(TextSegment),
    /// An image extracted from a `<figure>` element.
    Image(ImageSegment),
    /// All citation references collected from `<ol class="references">` lists.
    ///
    /// Keyed by the fragment id (e.g. `"cite_note-Foo-1"`), valued by the
    /// full plain-text citation string.
    References(HashMap<String, String>),
}
105
106impl InlineNode {
107    /// Returns the plain text content, stripping any markup.
108    /// Returns an empty string for `Ref` nodes — citations are not prose text.
109    pub fn plain_text(&self) -> &str {
110        match self {
111            InlineNode::Text(s) | InlineNode::Bold(s) | InlineNode::Italic(s) => s,
112            InlineNode::Link { text, .. } => text,
113            InlineNode::Ref { .. } => "",
114        }
115    }
116}
117
/// A single paragraph-level text segment extracted from a Wikipedia article.
///
/// Each segment corresponds to a `<p>` block in the HTML. It captures the plain
/// text, the inline content structure, the MediaWiki paragraph ID, the section
/// heading path, and the heading depth.
#[derive(Debug, Clone, Serialize)]
pub struct TextSegment {
    /// The extracted plain text of this segment (inline markup stripped).
    pub text: String,
    /// The inline content nodes, preserving bold/italic/link/ref structure.
    /// Skipped during serialization — serialized output carries `text` only.
    #[serde(skip)]
    pub content: Vec<InlineNode>,
    /// The `id` attribute of the enclosing `<p>` element; empty when absent.
    pub mwid: String,
    /// The section heading path, e.g. `"History - Early life"`.
    pub section: String,
    /// The heading level of the current section (1–6). 0 if before any heading.
    pub section_level: u8,
}
137
/// One entry on the heading stack: a section title and its `<h1>`–`<h6>` level.
#[derive(Debug, Clone)]
struct SectionInfo {
    // Heading text, e.g. "History".
    title: String,
    // Heading level 1–6, from the tag name.
    level: u8,
}
143
/// A reusable Wikipedia HTML parser.
///
/// Reusing a single `WikiPage` instance across multiple articles is more efficient
/// than creating one per article, since it avoids re-initialising the tree-sitter
/// parser and grammar on each call.
///
/// # Example
///
/// ```rust
/// use wikipedia_article_transform::{WikiPage, ArticleItem};
///
/// let mut page = WikiPage::new().unwrap();
/// let items = page.extract_text("<p>Hello.</p>").unwrap();
/// if let ArticleItem::Paragraph(seg) = &items[0] {
///     assert_eq!(seg.text, "Hello.");
/// }
/// ```
pub struct WikiPage {
    /// tree-sitter parser pre-configured with the HTML grammar.
    parser: Parser,
    /// Working buffer of extracted items; cleared at the start of each
    /// `extract_text` call.
    items: Vec<ArticleItem>,
    /// Stack of enclosing headings; cleared at the start of each
    /// `extract_text` call.
    current_sections: Vec<SectionInfo>,
    /// Base URL used to resolve relative hrefs, e.g. `https://en.wikipedia.org/wiki/`.
    /// `None` until `set_base_url` is called.
    base_url: Option<String>,
    /// Citation references collected by `extract_references()`.
    /// Keyed by note id (e.g. `"cite_note-Foo-1"`), valued by plain-text citation.
    references: HashMap<String, String>,
}
171
172impl WikiPage {
173    /// Creates a new `WikiPage`, initialising the tree-sitter HTML parser.
174    pub fn new() -> anyhow::Result<Self> {
175        let language = LANGUAGE.into();
176        let mut parser = Parser::new();
177        parser.set_language(&language)?;
178        Ok(WikiPage {
179            parser,
180            items: Vec::new(),
181            current_sections: Vec::new(),
182            base_url: None,
183            references: HashMap::new(),
184        })
185    }
186
187    /// Set the base URL for resolving relative link hrefs.
188    ///
189    /// Call this before [`extract_text`] when the HTML comes from a known origin.
190    /// The `language` parameter is a Wikipedia language code (e.g. `"en"`, `"ml"`).
191    ///
192    /// ```rust
193    /// use wikipedia_article_transform::WikiPage;
194    ///
195    /// let mut page = WikiPage::new().unwrap();
196    /// page.set_base_url("en");
197    /// ```
198    pub fn set_base_url(&mut self, language: &str) {
199        self.base_url = Some(format!("https://{language}.wikipedia.org/wiki/"));
200    }
201
202    /// Resolve an href against the base URL.
203    ///
204    /// - `./Foo`           → `{base}Foo`
205    /// - `//en.wikipedia.org/wiki/Foo` → `https://en.wikipedia.org/wiki/Foo`
206    /// - already `http(s)://` → unchanged
207    /// - anything else (anchors, mw-data:, etc.) → unchanged
208    fn resolve_href(&self, href: &str) -> String {
209        if href.starts_with("http://") || href.starts_with("https://") {
210            return href.to_string();
211        }
212        if let Some(rest) = href.strip_prefix("//") {
213            return format!("https://{rest}");
214        }
215        if let Some(path) = href.strip_prefix("./") {
216            if let Some(base) = &self.base_url {
217                return format!("{base}{path}");
218            }
219        }
220        href.to_string()
221    }
222
223    /// Parses `html` and returns one [`ArticleItem`] per paragraph or image, in document order.
224    ///
225    /// If any `<ol class="references">` lists are found, a final
226    /// [`ArticleItem::References`] item is appended containing all citations.
227    ///
228    /// The parser state is reset on each call, so the same `WikiPage` can be
229    /// reused safely across multiple articles.
230    ///
231    /// Skipped elements: `<script>`, `<style>`, `<link>`, and elements with
232    /// classes `shortdescription`, `hatnote`, `infobox`, `reference`, `navbox`,
233    /// `noprint`, `reflist`, `citation`.
234    pub fn extract_text(&mut self, html: &str) -> anyhow::Result<Vec<ArticleItem>> {
235        self.items.clear();
236        self.current_sections.clear();
237        self.references.clear();
238        let tree = self
239            .parser
240            .parse(html, None)
241            .ok_or_else(|| anyhow::anyhow!("Failed to parse HTML"))?;
242        let source = html.as_bytes();
243        self.extract_references(&tree.root_node(), source);
244        self.walk_and_collect(&tree.root_node(), source, false);
245        if !self.references.is_empty() {
246            self.items
247                .push(ArticleItem::References(self.references.clone()));
248        }
249        Ok(self.items.clone())
250    }
251
252    /// Convenience method: parse `html` and return all paragraph text joined by `"\n\n"`.
253    pub fn extract_text_plain(html: &str) -> anyhow::Result<String> {
254        let mut page = WikiPage::new()?;
255        let items = page.extract_text(html)?;
256        let text = items
257            .iter()
258            .filter_map(|item| match item {
259                ArticleItem::Paragraph(seg) => {
260                    let t = seg.text.trim();
261                    if t.is_empty() { None } else { Some(t) }
262                }
263                ArticleItem::Image(_) | ArticleItem::References(_) => None,
264            })
265            .collect::<Vec<_>>()
266            .join("\n\n");
267        Ok(text)
268    }
269
270    fn get_header_level(tag_name: &str) -> Option<u8> {
271        match tag_name {
272            "h1" => Some(1),
273            "h2" => Some(2),
274            "h3" => Some(3),
275            "h4" => Some(4),
276            "h5" => Some(5),
277            "h6" => Some(6),
278            _ => None,
279        }
280    }
281
282    fn extract_text_from_element(&self, node: &Node, source: &[u8]) -> String {
283        let mut text = String::new();
284        for child in node.children(&mut node.walk()) {
285            match child.kind() {
286                "text" => {
287                    if let Ok(t) = child.utf8_text(source) {
288                        text.push_str(t.trim());
289                    }
290                }
291                "element" => {
292                    let child_text = self.extract_text_from_element(&child, source);
293                    if !child_text.is_empty() {
294                        if !text.is_empty() {
295                            text.push(' ');
296                        }
297                        text.push_str(&child_text);
298                    }
299                }
300                _ => {}
301            }
302        }
303        text
304    }
305
306    fn update_sections(&mut self, level: u8, title: String) {
307        self.current_sections
308            .retain(|section| section.level < level);
309        self.current_sections.push(SectionInfo { title, level });
310    }
311
312    fn get_current_section_string(&self) -> String {
313        self.current_sections
314            .iter()
315            .map(|s| s.title.as_str())
316            .collect::<Vec<_>>()
317            .join(" - ")
318    }
319
320    fn get_current_section_level(&self) -> u8 {
321        self.current_sections.last().map(|s| s.level).unwrap_or(0)
322    }
323
    /// Pre-scan the parse tree for `<ol>` elements whose class list contains
    /// `references` and populate `self.references` with `note_id → citation_text`
    /// pairs.
    ///
    /// This runs before `walk_and_collect` so that inline `<sup>` nodes encountered
    /// during the main walk can be cross-referenced.
    fn extract_references(&mut self, node: &Node, source: &[u8]) {
        match node.kind() {
            "element" => {
                // NOTE(review): when parse_element returns None (no start tag
                // found), this element's children are not visited at all —
                // confirm that is intended for malformed fragments.
                if let Some((tag, attrs)) = self.parse_element(node, source) {
                    let class = attrs
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");
                    let classes: Vec<&str> = class.split_whitespace().collect();

                    // Found a reference list: collect its <li> children
                    if tag == "ol" && classes.contains(&"references") {
                        for child in node.children(&mut node.walk()) {
                            if child.kind() != "element" {
                                continue;
                            }
                            if let Some((child_tag, child_attrs)) =
                                self.parse_element(&child, source)
                            {
                                if child_tag != "li" {
                                    continue;
                                }
                                // The li's id is the note id inline <sup> refs
                                // point at; entries without an id are skipped.
                                let note_id = child_attrs
                                    .iter()
                                    .find(|(k, _)| k == "id")
                                    .map(|(_, v)| v.clone())
                                    .unwrap_or_default();
                                if note_id.is_empty() {
                                    continue;
                                }
                                // Find the <span class="mw-reference-text reference-text">
                                let citation = self.find_reference_text(&child, source);
                                if !citation.is_empty() {
                                    self.references.insert(note_id, citation);
                                }
                            }
                        }
                        return; // don't recurse into the ol further
                    }

                    // Recurse into other elements looking for more reference lists
                    for child in node.children(&mut node.walk()) {
                        self.extract_references(&child, source);
                    }
                }
            }
            _ => {
                // Non-element nodes (document root, fragments): recurse.
                for child in node.children(&mut node.walk()) {
                    self.extract_references(&child, source);
                }
            }
        }
    }
383
384    /// Find and return the plain text of the `<span class="mw-reference-text">` inside a `<li>`.
385    fn find_reference_text(&self, li_node: &Node, source: &[u8]) -> String {
386        for child in li_node.children(&mut li_node.walk()) {
387            if child.kind() != "element" {
388                continue;
389            }
390            if let Some((tag, attrs)) = self.parse_element(&child, source) {
391                let class = attrs
392                    .iter()
393                    .find(|(k, _)| k == "class")
394                    .map(|(_, v)| v.as_str())
395                    .unwrap_or("");
396                if tag == "span" && class.split_whitespace().any(|c| c == "reference-text") {
397                    return self.extract_text_from_element(&child, source);
398                }
399                // Recurse — the span may be nested
400                let found = self.find_reference_text(&child, source);
401                if !found.is_empty() {
402                    return found;
403                }
404            }
405        }
406        String::new()
407    }
408
409    /// Extract an [`InlineNode::Ref`] from a `<sup class="mw-ref reference">` node.
410    ///
411    /// Finds the inner `<a href="...#note_id">` for the note_id and the
412    /// `<span class="mw-reflink-text">` for the display label.
413    fn extract_inline_ref(&self, sup_node: &Node, source: &[u8]) -> Option<InlineNode> {
414        let mut note_id = String::new();
415        let mut label = String::new();
416
417        self.find_ref_parts(sup_node, source, &mut note_id, &mut label);
418
419        if note_id.is_empty() || label.is_empty() {
420            return None;
421        }
422        Some(InlineNode::Ref { label, note_id })
423    }
424
425    /// Recursively walk a `<sup>` subtree collecting the anchor fragment (note_id)
426    /// and the mw-reflink-text content (label).
427    fn find_ref_parts(&self, node: &Node, source: &[u8], note_id: &mut String, label: &mut String) {
428        for child in node.children(&mut node.walk()) {
429            if child.kind() != "element" {
430                continue;
431            }
432            if let Some((tag, attrs)) = self.parse_element(&child, source) {
433                match tag.as_str() {
434                    "a" => {
435                        if note_id.is_empty() {
436                            let href = attrs
437                                .iter()
438                                .find(|(k, _)| k == "href")
439                                .map(|(_, v)| v.as_str())
440                                .unwrap_or_default();
441                            // href is like "./Article#cite_note-Foo-1" — take the fragment
442                            if let Some(fragment) = href.rsplit_once('#') {
443                                *note_id = fragment.1.to_string();
444                            }
445                        }
446                        self.find_ref_parts(&child, source, note_id, label);
447                    }
448                    "span" => {
449                        let class = attrs
450                            .iter()
451                            .find(|(k, _)| k == "class")
452                            .map(|(_, v)| v.as_str())
453                            .unwrap_or("");
454                        if class.split_whitespace().any(|c| c == "mw-reflink-text") {
455                            // Inner text is like "[1]" — strip the brackets
456                            let raw = self.extract_text_from_element(&child, source);
457                            *label = raw
458                                .trim_matches(|c: char| c == '[' || c == ']' || c.is_whitespace())
459                                .to_string();
460                        } else {
461                            self.find_ref_parts(&child, source, note_id, label);
462                        }
463                    }
464                    _ => {
465                        self.find_ref_parts(&child, source, note_id, label);
466                    }
467                }
468            }
469        }
470    }
471
472    /// Push an inline node onto the last text segment, also updating the plain text.
473    fn push_inline(&mut self, node: InlineNode) {
474        let last_seg = self.items.iter_mut().rev().find_map(|item| {
475            if let ArticleItem::Paragraph(seg) = item {
476                Some(seg)
477            } else {
478                None
479            }
480        });
481        if let Some(seg) = last_seg {
482            let plain = node.plain_text().to_string();
483            if !seg.text.is_empty() && !plain.is_empty() {
484                if !seg.text.ends_with(' ') {
485                    seg.text.push(' ');
486                }
487            }
488            seg.text.push_str(plain.trim());
489            seg.content.push(node);
490        }
491    }
492
493    /// Collect inline text from an element node into a single String (used for bold/italic).
494    fn collect_inline_text(&self, node: &Node, source: &[u8]) -> String {
495        let mut text = String::new();
496        for child in node.children(&mut node.walk()) {
497            match child.kind() {
498                "text" => {
499                    if let Ok(t) = child.utf8_text(source) {
500                        let trimmed = t.trim();
501                        if !trimmed.is_empty() {
502                            if !text.is_empty() {
503                                text.push(' ');
504                            }
505                            text.push_str(trimmed);
506                        }
507                    }
508                }
509                "element" => {
510                    let child_text = self.collect_inline_text(&child, source);
511                    if !child_text.is_empty() {
512                        if !text.is_empty() {
513                            text.push(' ');
514                        }
515                        text.push_str(&child_text);
516                    }
517                }
518                _ => {}
519            }
520        }
521        text
522    }
523
    /// Main recursive walk over the parse tree: appends paragraphs and images
    /// to `self.items` in document order and tracks section headings.
    ///
    /// `inside_paragraph` is true while walking the children of a `<p>`; it
    /// switches `<b>/<strong>/<i>/<em>/<a>/<sup>` handling to inline collection.
    fn walk_and_collect(&mut self, node: &Node, source: &[u8], inside_paragraph: bool) {
        match node.kind() {
            "text" => {
                if let Ok(text) = node.utf8_text(source) {
                    let trimmed = text.trim();
                    if !trimmed.is_empty() {
                        // Text seen before any item exists: open an implicit,
                        // id-less paragraph so the text is not dropped.
                        if self.items.is_empty() {
                            self.items.push(ArticleItem::Paragraph(TextSegment {
                                text: String::new(),
                                content: Vec::new(),
                                mwid: String::new(),
                                section: self.get_current_section_string(),
                                section_level: self.get_current_section_level(),
                            }));
                        }
                        self.push_inline(InlineNode::Text(trimmed.to_string()));
                    }
                }
            }
            // Scripts and styles contribute no prose; skip their subtrees.
            "script_element" | "style_element" => (),
            "element" => {
                if let Some((tag_name, attributes)) = self.parse_element(node, source) {
                    if tag_name == "link" {
                        return;
                    }

                    let class_attr = attributes
                        .iter()
                        .find(|(k, _)| k == "class")
                        .map(|(_, v)| v.as_str())
                        .unwrap_or("");

                    // Handle citation refs before the class exclusion check:
                    // <sup class="mw-ref reference"> contains "reference" which would
                    // otherwise be excluded, but these are inline markers we want to keep.
                    if inside_paragraph
                        && tag_name == "sup"
                        && class_attr.split_whitespace().any(|c| c == "mw-ref")
                    {
                        if let Some(r) = self.extract_inline_ref(node, source) {
                            self.push_inline(r);
                        }
                        return;
                    }

                    // Non-prose containers (navigation, infoboxes, reference
                    // lists, ...) are skipped along with their entire subtree.
                    const EXCLUDED_CLASSES: &[&str] = &[
                        "shortdescription",
                        "hatnote",
                        "infobox",
                        "reference",
                        "navbox",
                        "noprint",
                        "reflist",
                        "citation",
                        "mw-references",
                    ];
                    if EXCLUDED_CLASSES
                        .iter()
                        .any(|c| class_attr.split_whitespace().any(|cls| cls == *c))
                    {
                        return;
                    }

                    // Headings update the section stack but are not emitted as items.
                    if let Some(level) = Self::get_header_level(&tag_name) {
                        let header_text = self.extract_text_from_element(node, source);
                        if !header_text.is_empty() {
                            self.update_sections(level, header_text);
                        }
                        return;
                    }

                    if tag_name == "p" {
                        let mwid = attributes
                            .iter()
                            .find(|(k, _)| k == "id")
                            .map(|(_, v)| v.clone())
                            .unwrap_or_default();
                        // Start a fresh paragraph, then walk the children with
                        // inline collection enabled.
                        self.items.push(ArticleItem::Paragraph(TextSegment {
                            text: String::new(),
                            content: Vec::new(),
                            mwid,
                            section: self.get_current_section_string(),
                            section_level: self.get_current_section_level(),
                        }));
                        for i in 0..node.child_count() {
                            if let Some(child) = node.child(i as u32) {
                                self.walk_and_collect(&child, source, true);
                            }
                        }
                        return;
                    }

                    if tag_name == "figure" {
                        if let Some(img) = self.extract_image(node, source) {
                            self.items.push(ArticleItem::Image(img));
                        }
                        return;
                    }

                    // Inline elements inside a paragraph
                    if inside_paragraph {
                        match tag_name.as_str() {
                            "b" | "strong" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Bold(text));
                                }
                                return;
                            }
                            "i" | "em" => {
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Italic(text));
                                }
                                return;
                            }
                            "a" => {
                                let raw_href = attributes
                                    .iter()
                                    .find(|(k, _)| k == "href")
                                    .map(|(_, v)| v.as_str())
                                    .unwrap_or_default();
                                let href = self.resolve_href(raw_href);
                                let text = self.collect_inline_text(node, source);
                                if !text.is_empty() {
                                    self.push_inline(InlineNode::Link { text, href });
                                }
                                return;
                            }
                            _ => {}
                        }
                    }

                    // Any other element: recurse into its children.
                    for i in 0..node.child_count() {
                        if let Some(child) = node.child(i as u32) {
                            self.walk_and_collect(&child, source, inside_paragraph);
                        }
                    }
                }
            }
            _ => {
                // Non-element wrapper nodes (document root, fragments): recurse.
                for i in 0..node.child_count() {
                    if let Some(child) = node.child(i as u32) {
                        self.walk_and_collect(&child, source, inside_paragraph);
                    }
                }
            }
        }
    }
673
674    fn parse_element(
675        &self,
676        element_node: &Node,
677        source: &[u8],
678    ) -> Option<(String, Vec<(String, String)>)> {
679        // Handle both normal elements (<tag>) and self-closing elements (<img/>)
680        let tag_container = element_node
681            .children(&mut element_node.walk())
682            .find(|child| child.kind() == "start_tag" || child.kind() == "self_closing_tag")?;
683
684        let tag_name_node = tag_container
685            .children(&mut tag_container.walk())
686            .find(|child| child.kind() == "tag_name")?;
687
688        let tag_name = tag_name_node.utf8_text(source).ok()?.to_string();
689        let mut attributes = Vec::new();
690
691        for child in tag_container.children(&mut tag_container.walk()) {
692            if child.kind() == "attribute" {
693                if let Some(pair) = self.parse_attribute(&child, source) {
694                    attributes.push(pair);
695                }
696            }
697        }
698
699        Some((tag_name, attributes))
700    }
701
702    fn parse_attribute(&self, attr_node: &Node, source: &[u8]) -> Option<(String, String)> {
703        let mut attr_name = None;
704        let mut attr_value = String::new();
705
706        for child in attr_node.children(&mut attr_node.walk()) {
707            match child.kind() {
708                "attribute_name" => {
709                    attr_name = child.utf8_text(source).ok().map(|s| s.to_string());
710                }
711                "quoted_attribute_value" => {
712                    for grandchild in child.children(&mut child.walk()) {
713                        if grandchild.kind() == "attribute_value" {
714                            if let Ok(value) = grandchild.utf8_text(source) {
715                                attr_value = value.to_string();
716                            }
717                        }
718                    }
719                }
720                "attribute_value" => {
721                    if let Ok(value) = child.utf8_text(source) {
722                        attr_value = value.to_string();
723                    }
724                }
725                _ => {}
726            }
727        }
728
729        attr_name.map(|name| (name, attr_value))
730    }
731
732    /// Extract an [`ImageSegment`] from a `<figure>` element node.
733    ///
734    /// Looks for a descendant `<img>` (self-closing) for `src`/`alt`, and a
735    /// `<figcaption>` child for the caption text.
736    fn extract_image(&self, figure_node: &Node, source: &[u8]) -> Option<ImageSegment> {
737        let mut src = String::new();
738        let mut alt = String::new();
739        let mut caption = String::new();
740
741        for child in figure_node.children(&mut figure_node.walk()) {
742            if child.kind() == "element" {
743                if let Some((tag, attrs)) = self.parse_element(&child, source) {
744                    if tag == "figcaption" {
745                        caption = self.extract_text_from_element(&child, source);
746                    } else {
747                        // Recurse into <a class="mw-file-description"> to find <img>
748                        self.find_img(&child, source, &tag, &attrs, &mut src, &mut alt);
749                    }
750                }
751            }
752        }
753
754        if src.is_empty() {
755            return None;
756        }
757
758        Some(ImageSegment {
759            src: self.resolve_href(&src),
760            alt,
761            caption,
762            section: self.get_current_section_string(),
763            section_level: self.get_current_section_level(),
764        })
765    }
766
767    /// Recursively find the first `<img>` inside `node`, writing into `src`/`alt`.
768    fn find_img(
769        &self,
770        node: &Node,
771        source: &[u8],
772        tag: &str,
773        attrs: &[(String, String)],
774        src: &mut String,
775        alt: &mut String,
776    ) {
777        if !src.is_empty() {
778            return;
779        }
780        if tag == "img" {
781            if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "src") {
782                *src = v.clone();
783            }
784            if let Some((_, v)) = attrs.iter().find(|(k, _)| k == "alt") {
785                *alt = v.clone();
786            }
787            return;
788        }
789        for child in node.children(&mut node.walk()) {
790            if child.kind() == "element" {
791                if let Some((child_tag, child_attrs)) = self.parse_element(&child, source) {
792                    self.find_img(&child, source, &child_tag, &child_attrs, src, alt);
793                }
794            }
795        }
796    }
797}
798
799impl Default for WikiPage {
800    fn default() -> Self {
801        Self::new().expect("Failed to initialise tree-sitter HTML parser")
802    }
803}
804
805/// Remove all reference-related content from a list of [`ArticleItem`]s.
806///
807/// Drops the [`ArticleItem::References`] item and removes [`InlineNode::Ref`]
808/// nodes from every paragraph's content (also rebuilds the plain text).
809/// Call this when `--include-references=false` is requested.
810pub fn strip_references(items: Vec<ArticleItem>) -> Vec<ArticleItem> {
811    items
812        .into_iter()
813        .filter_map(|item| match item {
814            ArticleItem::References(_) => None,
815            ArticleItem::Paragraph(mut seg) => {
816                seg.content.retain(|n| !matches!(n, InlineNode::Ref { .. }));
817                // Rebuild plain text without the ref labels
818                seg.text = seg
819                    .content
820                    .iter()
821                    .map(|n| n.plain_text())
822                    .filter(|s| !s.is_empty())
823                    .collect::<Vec<_>>()
824                    .join(" ");
825                Some(ArticleItem::Paragraph(seg))
826            }
827            other => Some(other),
828        })
829        .collect()
830}
831
/// Fetch a Wikipedia article by language code and title, returning article items in document order.
///
/// NOTE(review): the crate-level docs describe this as the `fetch` feature,
/// but the `#[cfg]` gate below is on `cli`/`web` — requires one of those two
/// features; reconcile the naming.
///
/// # Errors
///
/// Returns an error if the HTTP request fails, the server responds with a
/// non-success status, or parsing the returned HTML fails.
#[cfg(any(feature = "cli", feature = "web"))]
pub async fn get_text(language: &str, title: &str) -> anyhow::Result<Vec<ArticleItem>> {
    let html = get_page_content_html(language, title).await?;
    let mut page = WikiPage::new()?;
    page.set_base_url(language);
    Ok(page.extract_text(&html)?)
}
842
/// Download the Parsoid HTML for `title` from the `{language}.wikipedia.org`
/// REST API, returning the raw response body.
///
/// # Errors
///
/// Returns an error when the request fails or the server replies with a
/// non-success HTTP status.
#[cfg(any(feature = "cli", feature = "web"))]
async fn get_page_content_html(language: &str, title: &str) -> anyhow::Result<String> {
    let url = format!("https://{language}.wikipedia.org/api/rest_v1/page/html/{title}?stash=false");
    // Wikipedia's API etiquette asks for an identifying User-Agent.
    let response = reqwest::Client::new()
        .get(&url)
        .header(
            "User-Agent",
            "wikipedia-article-transform/0.1 (https://github.com/santhoshtr/wikipedia-article-transform)",
        )
        .send()
        .await?;
    if !response.status().is_success() {
        anyhow::bail!("Failed to fetch article: HTTP {}", response.status());
    }
    Ok(response.text().await?)
}
860
#[cfg(test)]
mod tests {
    use super::*;

    // Shared harness: smoke-tests the one-shot plain-text entry point on the
    // same input (result intentionally discarded — presumably to exercise
    // that path on every fixture; confirm), then returns the structured
    // items from `extract_text`.
    fn extract(html: &str) -> Vec<ArticleItem> {
        WikiPage::extract_text_plain(html).unwrap();
        let mut page = WikiPage::new().unwrap();
        page.extract_text(html).unwrap()
    }

    // Collect references to all paragraph segments, in document order.
    fn paragraphs(items: &[ArticleItem]) -> Vec<&TextSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Paragraph(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }

    // Collect references to all image segments, in document order.
    fn images(items: &[ArticleItem]) -> Vec<&ImageSegment> {
        items
            .iter()
            .filter_map(|i| {
                if let ArticleItem::Image(s) = i {
                    Some(s)
                } else {
                    None
                }
            })
            .collect()
    }

    #[test]
    fn test_basic_paragraph() {
        let items = extract("<html><body><p id=\"p1\">Hello world.</p></body></html>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Hello world.");
        assert_eq!(segs[0].mwid, "p1");
        assert_eq!(segs[0].section, "");
        assert_eq!(segs[0].section_level, 0);
    }

    #[test]
    fn test_multiple_paragraphs() {
        let items = extract("<p>First.</p><p>Second.</p><p>Third.</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[0].text, "First.");
        assert_eq!(segs[1].text, "Second.");
        assert_eq!(segs[2].text, "Third.");
    }

    #[test]
    fn test_section_tracking() {
        // Nested headings accumulate into "Parent - Child" section paths.
        let html = "<h2>History</h2><p>Para one.</p><h3>Early life</h3><p>Para two.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "History - Early life");
    }

    #[test]
    fn test_section_level() {
        let html = "<h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section_level, 2);
        assert_eq!(segs[1].section_level, 3);
    }

    #[test]
    fn test_section_resets_at_same_level() {
        // A sibling <h2> replaces, not appends to, the current section path.
        let html = "<h2>History</h2><p>A.</p><h2>Geography</h2><p>B.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs[0].section, "History");
        assert_eq!(segs[1].section, "Geography");
    }

    #[test]
    fn test_excluded_class_infobox() {
        let html = r#"<p>Visible.</p><table class="infobox"><tr><td>Hidden.</td></tr></table><p>Also visible.</p>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert!(segs.iter().all(|s| !s.text.contains("Hidden")));
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_excluded_class_reflist() {
        let html = r#"<p>Main text.</p><div class="reflist"><p>Ref text.</p></div>"#;
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Main text.");
    }

    #[test]
    fn test_script_and_style_skipped() {
        let html = "<p>Real.</p><script>var x=1;</script><style>body{}</style><p>Also real.</p>";
        let items = extract(html);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 2);
        assert!(segs.iter().all(|s| !s.text.contains("var x")));
    }

    #[test]
    fn test_empty_html() {
        let items = extract("");
        assert!(items.is_empty());
    }

    #[test]
    fn test_extract_text_plain() {
        // Paragraphs are joined with a blank line in the plain-text output.
        let html = "<p>First paragraph.</p><p>Second paragraph.</p>";
        let text = WikiPage::extract_text_plain(html).unwrap();
        assert_eq!(text, "First paragraph.\n\nSecond paragraph.");
    }

    #[test]
    fn test_default_impl() {
        let mut page = WikiPage::default();
        let items = page.extract_text("<p>Works.</p>").unwrap();
        let segs = paragraphs(&items);
        assert_eq!(segs[0].text, "Works.");
    }

    #[test]
    fn test_inline_bold() {
        let items = extract("<p><b>Bold</b> text</p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].text, "Bold text");
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "Bold"));
        assert!(matches!(&segs[0].content[1], InlineNode::Text(s) if s == "text"));
    }

    #[test]
    fn test_inline_italic() {
        let items = extract("<p><i>italic</i></p>");
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        assert!(matches!(&segs[0].content[0], InlineNode::Italic(s) if s == "italic"));
    }

    #[test]
    fn test_inline_strong_em() {
        // <strong>/<em> map to the same variants as <b>/<i>.
        let items = extract("<p><strong>S</strong> and <em>E</em></p>");
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0], InlineNode::Bold(s) if s == "S"));
        assert!(matches!(&segs[0].content[2], InlineNode::Italic(s) if s == "E"));
    }

    #[test]
    fn test_inline_link() {
        let items = extract(r#"<p><a href="./X">anchor</a></p>"#);
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        // No base URL set: ./X passes through unchanged
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href } if text == "anchor" && href == "./X"));
    }

    #[test]
    fn test_inline_link_absolute() {
        // With a base URL configured, ./Title resolves to the full wiki URL.
        let html = r#"<p><a href="./Cryogenics">Cryogenics</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        page.set_base_url("en");
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { text, href }
                if text == "Cryogenics"
                && href == "https://en.wikipedia.org/wiki/Cryogenics"));
    }

    #[test]
    fn test_resolve_href_protocol_relative() {
        // //host/... links get an https: scheme even without a base URL.
        let html = r#"<p><a href="//en.wikipedia.org/wiki/Oxygen">O</a></p>"#;
        let mut page = WikiPage::new().unwrap();
        let items = page.extract_text(html).unwrap();
        let segs = paragraphs(&items);
        assert!(matches!(&segs[0].content[0],
            InlineNode::Link { href, .. } if href == "https://en.wikipedia.org/wiki/Oxygen"));
    }

    #[test]
    fn test_format_plain_sections() {
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("\nIntro.\n"), "intro paragraph missing");
        assert!(out.contains("## History\n"), "h2 heading missing");
        assert!(out.contains("\nA.\n"), "first section paragraph missing");
        assert!(out.contains("### Early life\n"), "h3 heading missing");
        assert!(out.contains("\nB.\n"), "subsection paragraph missing");
        assert!(out.find("## History").unwrap() < out.find("\nA.\n").unwrap());
        assert!(out.find("### Early life").unwrap() < out.find("\nB.\n").unwrap());
    }

    #[test]
    fn test_format_json_tree() {
        // JSON output nests h3 sections under their h2 parent.
        let html = "<p>Intro.</p><h2>History</h2><p>A.</p><h3>Early life</h3><p>B.</p>";
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["intro"][0], "Intro.");
        assert_eq!(v["sections"][0]["heading"], "History");
        assert_eq!(v["sections"][0]["level"], 2);
        assert_eq!(v["sections"][0]["paragraphs"][0], "A.");
        assert_eq!(v["sections"][0]["subsections"][0]["heading"], "Early life");
        assert_eq!(v["sections"][0]["subsections"][0]["level"], 3);
        assert_eq!(v["sections"][0]["subsections"][0]["paragraphs"][0], "B.");
    }

    #[test]
    fn test_format_markdown_inline() {
        let items = extract(
            "<h2>Title</h2><p><b>Bold</b> and <i>italic</i> and <a href=\"/x\">link</a></p>",
        );
        let out = items.format_markdown();
        assert!(out.contains("## Title"));
        assert!(out.contains("**Bold**"));
        assert!(out.contains("_italic_"));
        assert!(out.contains("[link](/x)"));
        // spaces between inline nodes must be preserved
        assert!(
            out.contains("**Bold** and"),
            "space after bold missing: {out}"
        );
        assert!(
            out.contains("_italic_ and"),
            "space after italic missing: {out}"
        );
        assert!(
            out.contains("and [link]"),
            "space before link missing: {out}"
        );
    }

    #[test]
    fn test_image_extraction() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg" class="mw-file-description">
                <img alt="A description" src="//upload.wikimedia.org/thumb/foo.jpg" class="mw-file-element"/>
            </a>
            <figcaption>Caption text here.</figcaption>
        </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].src, "https://upload.wikimedia.org/thumb/foo.jpg");
        assert_eq!(imgs[0].alt, "A description");
        assert_eq!(imgs[0].caption, "Caption text here.");
    }

    #[test]
    fn test_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png" class="mw-file-description">
                <img alt="Bar" src="//upload.wikimedia.org/bar.png" class="mw-file-element"/>
            </a>
            <figcaption></figcaption>
        </figure>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].caption, "");
    }

    #[test]
    fn test_image_section_tracking() {
        let html = r#"<h2>History</h2>
        <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>X caption</figcaption>
        </figure>
        <p>A paragraph.</p>"#;
        let items = extract(html);
        let imgs = images(&items);
        assert_eq!(imgs.len(), 1);
        assert_eq!(imgs[0].section, "History");
        assert_eq!(imgs[0].section_level, 2);
    }

    #[test]
    fn test_image_interleaved_order() {
        // Images must keep their document position between paragraphs.
        let html = r#"<p>Before.</p>
        <figure typeof="mw:File/Thumb">
            <a href="./File:X.jpg"><img alt="X" src="//upload.wikimedia.org/x.jpg"/></a>
            <figcaption>Caption</figcaption>
        </figure>
        <p>After.</p>"#;
        let items = extract(html);
        assert!(matches!(&items[0], ArticleItem::Paragraph(s) if s.text == "Before."));
        assert!(matches!(&items[1], ArticleItem::Image(_)));
        assert!(matches!(&items[2], ArticleItem::Paragraph(s) if s.text == "After."));
    }

    #[test]
    fn test_markdown_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
        </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        // caption is used as alt text intentionally
        assert!(out.contains("![Alt text](https://upload.wikimedia.org/foo.jpg)"));
        assert!(out.contains("_The caption._"));
    }

    #[test]
    fn test_markdown_image_no_caption() {
        let html = r#"<figure typeof="mw:File/Frameless">
            <a href="./File:Bar.png"><img alt="Bar" src="//upload.wikimedia.org/bar.png"/></a>
            <figcaption></figcaption>
        </figure>"#;
        let items = extract(html);
        let out = items.format_markdown();
        assert!(out.contains("![Bar](https://upload.wikimedia.org/bar.png)"));
        // no caption line when caption is empty
        assert!(!out.contains("__"));
    }

    #[test]
    fn test_plain_image() {
        let html = r#"<figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
        </figure>"#;
        let items = extract(html);
        let out = items.format_plain();
        assert!(out.contains("[Image: Alt text]"));
        assert!(out.contains("The caption."));
    }

    #[test]
    fn test_json_image() {
        let html = r#"<h2>Section</h2>
        <figure typeof="mw:File/Thumb">
            <a href="./File:Foo.jpg"><img alt="Alt text" src="//upload.wikimedia.org/foo.jpg"/></a>
            <figcaption>The caption.</figcaption>
        </figure>
        <p>A paragraph.</p>"#;
        let items = extract(html);
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert_eq!(v["sections"][0]["images"][0]["alt"], "Alt text");
        assert_eq!(
            v["sections"][0]["images"][0]["src"],
            "https://upload.wikimedia.org/foo.jpg"
        );
        assert_eq!(v["sections"][0]["images"][0]["caption"], "The caption.");
    }

    // ── Reference tests ─────────────────────────────────────────────────────

    // Minimal Parsoid-style fixture: a paragraph with two inline citations
    // plus the matching <ol class="mw-references"> list of citation bodies.
    fn ref_html() -> &'static str {
        r#"<p id="p1">Some text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
            ><a href="./Article#cite_note-Foo-1"><span class="mw-reflink-text">[1]</span></a
            ></sup> More text.<sup class="mw-ref reference" typeof="mw:Extension/ref"
            ><a href="./Article#cite_note-Bar-2"><span class="mw-reflink-text">[2]</span></a
            ></sup></p>
        <ol class="mw-references references">
            <li id="cite_note-Foo-1" data-mw-footnote-number="1">
                <span class="mw-cite-backlink"><a href="./Article#cite_ref-Foo_1-0">↑</a></span>
                <span id="mw-reference-text-cite_note-Foo-1" class="mw-reference-text reference-text">Author A. <i>Title One</i>. Publisher, 2020.</span>
            </li>
            <li id="cite_note-Bar-2" data-mw-footnote-number="2">
                <span class="mw-cite-backlink"><a href="./Article#cite_ref-Bar_2-0">↑</a></span>
                <span id="mw-reference-text-cite_note-Bar-2" class="mw-reference-text reference-text">Author B. Title Two. Journal, 2021.</span>
            </li>
        </ol>"#
    }

    #[test]
    fn test_ref_inline_nodes() {
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert_eq!(segs.len(), 1);
        // content: Text, Ref[1], Text, Ref[2]
        assert!(matches!(&segs[0].content[0], InlineNode::Text(s) if s.contains("Some text")));
        assert!(
            matches!(&segs[0].content[1], InlineNode::Ref { label, note_id }
            if label == "1" && note_id == "cite_note-Foo-1")
        );
        assert!(
            matches!(&segs[0].content[3], InlineNode::Ref { label, note_id }
            if label == "2" && note_id == "cite_note-Bar-2")
        );
    }

    #[test]
    fn test_ref_plain_text_excludes_label() {
        // plain_text() on Ref returns "" so the ref label is not in seg.text
        let items = extract(ref_html());
        let segs = paragraphs(&items);
        assert!(!segs[0].text.contains('['));
        assert!(segs[0].text.contains("Some text"));
        assert!(segs[0].text.contains("More text"));
    }

    #[test]
    fn test_ref_references_item_appended() {
        let items = extract(ref_html());
        let refs = items.iter().find_map(|i| {
            if let ArticleItem::References(r) = i {
                Some(r)
            } else {
                None
            }
        });
        assert!(refs.is_some());
        let refs = refs.unwrap();
        assert_eq!(refs.len(), 2);
        assert!(refs["cite_note-Foo-1"].contains("Title One"));
        assert!(refs["cite_note-Bar-2"].contains("Title Two"));
    }

    #[test]
    fn test_ref_no_refs_no_item() {
        // No citation markup means no References item is emitted at all.
        let items = extract("<p>No citations here.</p>");
        assert!(
            !items
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
    }

    #[test]
    fn test_ref_markdown_inline_and_list() {
        let items = extract(ref_html());
        let out = items.format_markdown();
        // Inline labels appear attached to surrounding text
        assert!(out.contains("[^1]"), "inline [^1] missing");
        assert!(out.contains("[^2]"), "inline [^2] missing");
        // Reference definitions at the bottom
        assert!(out.contains("## References"), "References heading missing");
        assert!(out.contains("[^1]: "), "[^1]: definition missing");
        assert!(out.contains("Title One"), "citation text missing");
        assert!(out.contains("[^2]: "), "[^2]: definition missing");
        assert!(out.contains("Title Two"), "citation text missing");
        // Definitions must appear after body text
        assert!(out.find("Some text").unwrap() < out.find("## References").unwrap());
    }

    #[test]
    fn test_ref_json_references_key() {
        let items = extract(ref_html());
        let json_str = items.format_json().unwrap();
        let v: serde_json::Value = serde_json::from_str(&json_str).unwrap();
        assert!(v["references"].is_object(), "references key missing");
        assert!(
            v["references"]["cite_note-Foo-1"]
                .as_str()
                .unwrap()
                .contains("Title One")
        );
        assert!(
            v["references"]["cite_note-Bar-2"]
                .as_str()
                .unwrap()
                .contains("Title Two")
        );
    }

    #[test]
    fn test_strip_references() {
        let items = extract(ref_html());
        let stripped = strip_references(items);
        // No References item
        assert!(
            !stripped
                .iter()
                .any(|i| matches!(i, ArticleItem::References(_)))
        );
        // No Ref inline nodes in paragraphs
        let segs = paragraphs(&stripped);
        for seg in segs {
            assert!(
                !seg.content
                    .iter()
                    .any(|n| matches!(n, InlineNode::Ref { .. }))
            );
            assert!(!seg.text.contains('['));
        }
    }
}