Skip to main content

wikipedia_article_transform/
formatters.rs

//! Output formatters for Wikipedia article items.
//!
//! Provides the [`ArticleFormat`] trait with three output formats:
//! - [`ArticleFormat::format_plain`] — plain text with heading lines and image placeholders
//! - [`ArticleFormat::format_json`] — semantic JSON section tree with images and references
//! - [`ArticleFormat::format_markdown`] — Markdown with inline formatting, images, and footnotes
7
8use std::collections::HashMap;
9
10use crate::{ArticleItem, ImageSegment, InlineNode, TextSegment};
11use serde::Serialize;
12
13/// Output formatting for a collection of [`ArticleItem`]s.
14pub trait ArticleFormat {
15    /// Format as plain text.
16    ///
17    /// Section headings are emitted as `#`/`##`/`###` lines. Images are rendered
18    /// as `[Image: alt text]` followed by caption. References are omitted.
19    fn format_plain(&self) -> String;
20
21    /// Format as a semantic JSON section tree.
22    ///
23    /// Structure:
24    /// ```json
25    /// {
26    ///   "intro": [{"text":"...","citations":[]}], "intro_images": [...],
27    ///   "sections": [{"heading":"...","level":2,"paragraphs":[{"text":"...","citations":[]}],"images":[...],"subsections":[...]}],
28    ///   "references": {"cite_note-Foo-1": "Full citation text..."}
29    /// }
30    /// ```
31    fn format_json(&self) -> anyhow::Result<String>;
32
33    /// Format as Markdown.
34    ///
35    /// Inline: bold → `**text**`, italic → `_text_`, links → `[text](href)`,
36    /// citation refs → `[N]`. Images → `![alt](src)` with italic caption.
37    /// A `## References` section with `[N]: citation` definitions is appended
38    /// when references are present.
39    fn format_markdown(&self) -> String;
40}
41
42impl ArticleFormat for Vec<ArticleItem> {
43    fn format_plain(&self) -> String {
44        format_plain(self)
45    }
46    fn format_json(&self) -> anyhow::Result<String> {
47        format_json(self)
48    }
49    fn format_markdown(&self) -> String {
50        format_markdown(self)
51    }
52}
53
54impl ArticleFormat for &[ArticleItem] {
55    fn format_plain(&self) -> String {
56        format_plain(self)
57    }
58    fn format_json(&self) -> anyhow::Result<String> {
59        format_json(self)
60    }
61    fn format_markdown(&self) -> String {
62        format_markdown(self)
63    }
64}
65
/// Writes a heading line to `out` when the section differs from the last one seen.
///
/// * `out` — the document being built.
/// * `seg_section` — full section path of the current item; components are
///   joined by `" - "` and only the last component is printed.
/// * `seg_level` — number of `#` characters to emit (treated as at least 1).
/// * `last_section` — section path of the previously emitted item; updated to
///   `seg_section` whenever a change is detected.
///
/// A blank line separates the heading from any earlier output. An empty
/// `seg_section` emits no heading line but is still recorded as the current
/// section.
fn emit_section_heading(
    out: &mut String,
    seg_section: &str,
    seg_level: u8,
    last_section: &mut String,
) {
    // Same section as before: nothing to announce.
    if last_section.as_str() == seg_section {
        return;
    }

    if !out.is_empty() {
        out.push('\n');
    }

    if !seg_section.is_empty() {
        // Only the innermost component of "Parent - Child" is shown.
        let title = seg_section
            .rsplit_once(" - ")
            .map_or(seg_section, |(_, innermost)| innermost);
        for _ in 0..seg_level.max(1) {
            out.push('#');
        }
        out.push(' ');
        out.push_str(title);
        out.push('\n');
    }

    last_section.clear();
    last_section.push_str(seg_section);
}
87
88fn format_plain(items: &[ArticleItem]) -> String {
89    let mut out = String::new();
90    let mut last_section = String::new();
91
92    for item in items {
93        match item {
94            ArticleItem::Paragraph(seg) => {
95                let text = seg.text.trim();
96                if text.is_empty() {
97                    continue;
98                }
99                emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
100                out.push('\n');
101                out.push_str(text);
102                out.push('\n');
103            }
104            ArticleItem::Image(img) => {
105                emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
106                out.push('\n');
107                out.push_str("[Image: ");
108                out.push_str(&img.alt);
109                out.push(']');
110                out.push('\n');
111                if !img.caption.is_empty() {
112                    out.push_str(&img.caption);
113                    out.push('\n');
114                }
115            }
116            ArticleItem::References(_) => {} // omit from plain output
117        }
118    }
119
120    out
121}
122
123fn format_json(items: &[ArticleItem]) -> anyhow::Result<String> {
124    #[derive(Serialize)]
125    struct CitationEntry {
126        label: String,
127        text: String,
128    }
129
130    #[derive(Serialize)]
131    struct ParagraphEntry {
132        text: String,
133        citations: Vec<CitationEntry>,
134    }
135
136    #[derive(Serialize)]
137    struct ImageEntry {
138        src: String,
139        alt: String,
140        caption: String,
141    }
142
143    impl From<&ImageSegment> for ImageEntry {
144        fn from(img: &ImageSegment) -> Self {
145            ImageEntry {
146                src: img.src.clone(),
147                alt: img.alt.clone(),
148                caption: img.caption.clone(),
149            }
150        }
151    }
152
153    #[derive(Serialize)]
154    struct Section {
155        heading: String,
156        level: u8,
157        paragraphs: Vec<ParagraphEntry>,
158        images: Vec<ImageEntry>,
159        subsections: Vec<Section>,
160    }
161
162    #[derive(Serialize)]
163    struct ArticleTree {
164        intro: Vec<ParagraphEntry>,
165        intro_images: Vec<ImageEntry>,
166        sections: Vec<Section>,
167        references: HashMap<String, String>,
168    }
169
170    fn paragraph_from_segment(
171        seg: &TextSegment,
172        references: &HashMap<String, String>,
173    ) -> Option<ParagraphEntry> {
174        let text = seg.text.trim().to_string();
175        if text.is_empty() {
176            return None;
177        }
178
179        let mut citations = Vec::new();
180        let mut seen_note_ids: Vec<&str> = Vec::new();
181        for node in &seg.content {
182            if let InlineNode::Ref { label, note_id } = node {
183                if seen_note_ids.iter().any(|seen| *seen == note_id) {
184                    continue;
185                }
186                seen_note_ids.push(note_id);
187                citations.push(CitationEntry {
188                    label: label.clone(),
189                    text: references.get(note_id).cloned().unwrap_or_default(),
190                });
191            }
192        }
193
194        Some(ParagraphEntry { text, citations })
195    }
196
197    let references = items
198        .iter()
199        .find_map(|item| {
200            if let ArticleItem::References(refs) = item {
201                Some(refs.clone())
202            } else {
203                None
204            }
205        })
206        .unwrap_or_default();
207
208    let mut tree = ArticleTree {
209        intro: Vec::new(),
210        intro_images: Vec::new(),
211        sections: Vec::new(),
212        references: references.clone(),
213    };
214
215    for item in items {
216        match item {
217            ArticleItem::Paragraph(seg) => {
218                let Some(paragraph) = paragraph_from_segment(seg, &references) else {
219                    continue;
220                };
221                if seg.section.is_empty() {
222                    tree.intro.push(paragraph);
223                    continue;
224                }
225                let parts: Vec<&str> = seg.section.split(" - ").collect();
226                let mut siblings = &mut tree.sections;
227                for (i, part) in parts.iter().enumerate() {
228                    let depth_from_bottom = (parts.len() - 1 - i) as u8;
229                    let level = seg.section_level.saturating_sub(depth_from_bottom);
230                    if !siblings.iter().any(|s: &Section| s.heading == *part) {
231                        siblings.push(Section {
232                            heading: part.to_string(),
233                            level,
234                            paragraphs: Vec::new(),
235                            images: Vec::new(),
236                            subsections: Vec::new(),
237                        });
238                    }
239                    let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
240                    if i == parts.len() - 1 {
241                        siblings[idx].paragraphs.push(paragraph);
242                        break;
243                    } else {
244                        siblings = &mut siblings[idx].subsections;
245                    }
246                }
247            }
248            ArticleItem::Image(img) => {
249                let entry = ImageEntry::from(img);
250                if img.section.is_empty() {
251                    tree.intro_images.push(entry);
252                    continue;
253                }
254                let parts: Vec<&str> = img.section.split(" - ").collect();
255                let mut siblings = &mut tree.sections;
256                for (i, part) in parts.iter().enumerate() {
257                    let depth_from_bottom = (parts.len() - 1 - i) as u8;
258                    let level = img.section_level.saturating_sub(depth_from_bottom);
259                    if !siblings.iter().any(|s: &Section| s.heading == *part) {
260                        siblings.push(Section {
261                            heading: part.to_string(),
262                            level,
263                            paragraphs: Vec::new(),
264                            images: Vec::new(),
265                            subsections: Vec::new(),
266                        });
267                    }
268                    let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
269                    if i == parts.len() - 1 {
270                        siblings[idx].images.push(entry);
271                        break;
272                    } else {
273                        siblings = &mut siblings[idx].subsections;
274                    }
275                }
276            }
277            ArticleItem::References(_) => {}
278        }
279    }
280
281    Ok(serde_json::to_string_pretty(&tree)?)
282}
283
/// Sort a reference map by the trailing integer in the note_id (`cite_note-Name-N`).
///
/// Returns borrowed `(note_id, citation)` pairs. Ids whose last `-`-separated
/// component is not a `u32` sort after all numbered ids. Entries with the same
/// sort number are ordered by the full note id, so the result never depends on
/// the map's iteration order.
fn sorted_refs(refs: &HashMap<String, String>) -> Vec<(&String, &String)> {
    /// Numeric suffix of a note id, or `u32::MAX` when there is none.
    fn trailing_number(note_id: &str) -> u32 {
        note_id
            .rsplit('-')
            .next()
            .and_then(|n| n.parse::<u32>().ok())
            .unwrap_or(u32::MAX)
    }

    let mut entries: Vec<(&String, &String)> = refs.iter().collect();
    // Map keys are unique, so (number, id) is a total order and an unstable
    // sort is fully deterministic here.
    entries.sort_unstable_by(|lhs, rhs| {
        trailing_number(lhs.0)
            .cmp(&trailing_number(rhs.0))
            .then_with(|| lhs.0.cmp(rhs.0))
    });
    entries
}
296
297fn format_markdown(items: &[ArticleItem]) -> String {
298    let mut out = String::new();
299    let mut last_section = String::new();
300
301    for item in items {
302        match item {
303            ArticleItem::Paragraph(seg) => {
304                if seg.text.trim().is_empty() {
305                    continue;
306                }
307                emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
308                out.push('\n');
309                let mut para = String::new();
310                for node in &seg.content {
311                    match node {
312                        InlineNode::Text(s) => para.push_str(s),
313                        InlineNode::Bold(s) => {
314                            if !para.ends_with(' ') && !para.is_empty() {
315                                para.push(' ');
316                            }
317                            para.push_str("**");
318                            para.push_str(s);
319                            para.push_str("** ");
320                        }
321                        InlineNode::Italic(s) => {
322                            if !para.ends_with(' ') && !para.is_empty() {
323                                para.push(' ');
324                            }
325                            para.push('_');
326                            para.push_str(s);
327                            para.push_str("_ ");
328                        }
329                        InlineNode::Link { text, href } => {
330                            if !para.ends_with(' ') && !para.is_empty() {
331                                para.push(' ');
332                            }
333                            para.push('[');
334                            para.push_str(text);
335                            para.push_str("](");
336                            para.push_str(href);
337                            para.push_str(") ");
338                        }
339                        InlineNode::Ref { label, .. } => {
340                            // Append [N] directly — no space before a superscript marker
341                            para.push('[');
342                            para.push('^');
343                            para.push_str(label);
344                            para.push(']');
345                        }
346                    }
347                }
348                out.push_str(para.trim_end());
349                out.push('\n');
350            }
351            ArticleItem::Image(img) => {
352                emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
353                out.push('\n');
354                out.push_str("![");
355                out.push_str(&img.alt);
356                out.push_str("](");
357                out.push_str(&img.src);
358                out.push(')');
359                out.push('\n');
360                if !img.caption.is_empty() {
361                    out.push('_');
362                    out.push_str(&img.caption);
363                    out.push('_');
364                    out.push('\n');
365                }
366            }
367            ArticleItem::References(refs) => {
368                if refs.is_empty() {
369                    continue;
370                }
371                out.push_str("\n## References\n");
372                for (note_id, citation) in sorted_refs(refs) {
373                    // Extract the numeric label from the note_id tail
374                    let label = note_id.rsplit('-').next().unwrap_or(note_id.as_str());
375                    out.push('\n');
376                    out.push('[');
377                    out.push('^');
378                    out.push_str(label);
379                    out.push_str("]: ");
380                    out.push_str(citation);
381                    out.push('\n');
382                }
383            }
384        }
385    }
386
387    out
388}