Skip to main content

wikipedia_article_transform/
formatters.rs

1//! Output formatters for Wikipedia article items.
2//!
3//! Provides the [`ArticleFormat`] trait with three output formats:
4//! - [`ArticleFormat::format_plain`] — plain text with heading lines and image placeholders
5//! - [`ArticleFormat::format_json`] — semantic JSON section tree with images and references
6//! - [`ArticleFormat::format_markdown`] — Markdown with inline formatting, images, and footnotes
7
8use std::collections::HashMap;
9
10use crate::{ArticleItem, ImageSegment, InlineNode};
11use serde::Serialize;
12
/// Output formatting for a collection of [`ArticleItem`]s.
///
/// Each method renders the whole collection into a single `String`; the
/// receiver is only borrowed and never modified.
pub trait ArticleFormat {
    /// Format as plain text.
    ///
    /// Section headings are emitted as `#`/`##`/`###` lines. Images are rendered
    /// as `[Image: alt text]` followed by the caption when it is non-empty.
    /// References are omitted.
    fn format_plain(&self) -> String;

    /// Format as a pretty-printed semantic JSON section tree.
    ///
    /// Structure:
    /// ```json
    /// {
    ///   "intro": ["..."], "intro_images": [...],
    ///   "sections": [{"heading":"...","level":2,"paragraphs":[...],"images":[...],"subsections":[...]}],
    ///   "references": {"cite_note-Foo-1": "Full citation text..."}
    /// }
    /// ```
    ///
    /// # Errors
    ///
    /// Returns an error if JSON serialization fails.
    fn format_json(&self) -> anyhow::Result<String>;

    /// Format as Markdown.
    ///
    /// Inline: bold → `**text**`, italic → `_text_`, links → `[text](href)`,
    /// citation refs → footnote markers `[^N]`. Images → `![alt](src)` with an
    /// italic caption line when the caption is non-empty.
    /// A `## References` section with `[^N]: citation` footnote definitions is
    /// appended when references are present.
    fn format_markdown(&self) -> String;
}
41
42impl ArticleFormat for Vec<ArticleItem> {
43    fn format_plain(&self) -> String {
44        format_plain(self)
45    }
46    fn format_json(&self) -> anyhow::Result<String> {
47        format_json(self)
48    }
49    fn format_markdown(&self) -> String {
50        format_markdown(self)
51    }
52}
53
54impl ArticleFormat for &[ArticleItem] {
55    fn format_plain(&self) -> String {
56        format_plain(self)
57    }
58    fn format_json(&self) -> anyhow::Result<String> {
59        format_json(self)
60    }
61    fn format_markdown(&self) -> String {
62        format_markdown(self)
63    }
64}
65
/// Appends a heading line to `out` whenever the section changes.
///
/// `seg_section` is the section path of the item about to be written, with
/// nesting levels separated by `" - "`; only the last path component is
/// printed. `seg_level` is the number of `#` characters to use (a level of
/// `0` is treated as `1`). `last_section` remembers the path of the previous
/// item and is updated to `seg_section` when a change is detected.
///
/// When the section changes and `out` already has content, a blank separator
/// line is written first. An empty `seg_section` produces no heading line.
fn emit_section_heading(
    out: &mut String,
    seg_section: &str,
    seg_level: u8,
    last_section: &mut String,
) {
    // Same section as the previous item: nothing to emit.
    if seg_section == last_section.as_str() {
        return;
    }

    if !out.is_empty() {
        out.push('\n');
    }

    if !seg_section.is_empty() {
        let depth = usize::from(seg_level.max(1));
        // Only the innermost component of the path is shown as the title.
        let title = seg_section
            .rsplit_once(" - ")
            .map_or(seg_section, |(_, tail)| tail);

        out.extend(std::iter::repeat('#').take(depth));
        out.push(' ');
        out.push_str(title);
        out.push('\n');
    }

    last_section.clear();
    last_section.push_str(seg_section);
}
87
88fn format_plain(items: &[ArticleItem]) -> String {
89    let mut out = String::new();
90    let mut last_section = String::new();
91
92    for item in items {
93        match item {
94            ArticleItem::Paragraph(seg) => {
95                let text = seg.text.trim();
96                if text.is_empty() {
97                    continue;
98                }
99                emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
100                out.push('\n');
101                out.push_str(text);
102                out.push('\n');
103            }
104            ArticleItem::Image(img) => {
105                emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
106                out.push('\n');
107                out.push_str("[Image: ");
108                out.push_str(&img.alt);
109                out.push(']');
110                out.push('\n');
111                if !img.caption.is_empty() {
112                    out.push_str(&img.caption);
113                    out.push('\n');
114                }
115            }
116            ArticleItem::References(_) => {} // omit from plain output
117        }
118    }
119
120    out
121}
122
123fn format_json(items: &[ArticleItem]) -> anyhow::Result<String> {
124    #[derive(Serialize)]
125    struct ImageEntry {
126        src: String,
127        alt: String,
128        caption: String,
129    }
130
131    impl From<&ImageSegment> for ImageEntry {
132        fn from(img: &ImageSegment) -> Self {
133            ImageEntry {
134                src: img.src.clone(),
135                alt: img.alt.clone(),
136                caption: img.caption.clone(),
137            }
138        }
139    }
140
141    #[derive(Serialize)]
142    struct Section {
143        heading: String,
144        level: u8,
145        paragraphs: Vec<String>,
146        images: Vec<ImageEntry>,
147        subsections: Vec<Section>,
148    }
149
150    #[derive(Serialize)]
151    struct ArticleTree {
152        intro: Vec<String>,
153        intro_images: Vec<ImageEntry>,
154        sections: Vec<Section>,
155        references: HashMap<String, String>,
156    }
157
158    let mut tree = ArticleTree {
159        intro: Vec::new(),
160        intro_images: Vec::new(),
161        sections: Vec::new(),
162        references: HashMap::new(),
163    };
164
165    for item in items {
166        match item {
167            ArticleItem::Paragraph(seg) => {
168                let text = seg.text.trim().to_string();
169                if text.is_empty() {
170                    continue;
171                }
172                if seg.section.is_empty() {
173                    tree.intro.push(text);
174                    continue;
175                }
176                let parts: Vec<&str> = seg.section.split(" - ").collect();
177                let mut siblings = &mut tree.sections;
178                for (i, part) in parts.iter().enumerate() {
179                    let depth_from_bottom = (parts.len() - 1 - i) as u8;
180                    let level = seg.section_level.saturating_sub(depth_from_bottom);
181                    if !siblings.iter().any(|s: &Section| s.heading == *part) {
182                        siblings.push(Section {
183                            heading: part.to_string(),
184                            level,
185                            paragraphs: Vec::new(),
186                            images: Vec::new(),
187                            subsections: Vec::new(),
188                        });
189                    }
190                    let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
191                    if i == parts.len() - 1 {
192                        siblings[idx].paragraphs.push(text.clone());
193                        break;
194                    } else {
195                        siblings = &mut siblings[idx].subsections;
196                    }
197                }
198            }
199            ArticleItem::Image(img) => {
200                let entry = ImageEntry::from(img);
201                if img.section.is_empty() {
202                    tree.intro_images.push(entry);
203                    continue;
204                }
205                let parts: Vec<&str> = img.section.split(" - ").collect();
206                let mut siblings = &mut tree.sections;
207                for (i, part) in parts.iter().enumerate() {
208                    let depth_from_bottom = (parts.len() - 1 - i) as u8;
209                    let level = img.section_level.saturating_sub(depth_from_bottom);
210                    if !siblings.iter().any(|s: &Section| s.heading == *part) {
211                        siblings.push(Section {
212                            heading: part.to_string(),
213                            level,
214                            paragraphs: Vec::new(),
215                            images: Vec::new(),
216                            subsections: Vec::new(),
217                        });
218                    }
219                    let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
220                    if i == parts.len() - 1 {
221                        siblings[idx].images.push(entry);
222                        break;
223                    } else {
224                        siblings = &mut siblings[idx].subsections;
225                    }
226                }
227            }
228            ArticleItem::References(refs) => {
229                tree.references = refs.clone();
230            }
231        }
232    }
233
234    Ok(serde_json::to_string_pretty(&tree)?)
235}
236
/// Sort a reference map by the trailing integer in the note_id (`cite_note-Name-N`).
///
/// The sort key is the text after the last `-` in the note id, parsed as a
/// `u32`. Ids whose tail is not a number sort after all numbered ids. Entries
/// that share the same key are ordered by the note id itself, so the result
/// does not depend on the iteration order of the `HashMap`.
///
/// Returns borrowed `(note_id, citation)` pairs in that order.
fn sorted_refs(refs: &HashMap<String, String>) -> Vec<(&String, &String)> {
    let mut entries: Vec<(&String, &String)> = refs.iter().collect();
    // Map keys are unique, so the (number, id) key is a strict total order
    // and an unstable sort is sufficient.
    entries.sort_unstable_by_key(|&(note_id, _)| {
        let number = note_id
            .rsplit('-')
            .next()
            .and_then(|tail| tail.parse::<u32>().ok())
            .unwrap_or(u32::MAX);
        (number, note_id)
    });
    entries
}
249
250fn format_markdown(items: &[ArticleItem]) -> String {
251    let mut out = String::new();
252    let mut last_section = String::new();
253
254    for item in items {
255        match item {
256            ArticleItem::Paragraph(seg) => {
257                if seg.text.trim().is_empty() {
258                    continue;
259                }
260                emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
261                out.push('\n');
262                let mut para = String::new();
263                for node in &seg.content {
264                    match node {
265                        InlineNode::Text(s) => para.push_str(s),
266                        InlineNode::Bold(s) => {
267                            if !para.ends_with(' ') && !para.is_empty() {
268                                para.push(' ');
269                            }
270                            para.push_str("**");
271                            para.push_str(s);
272                            para.push_str("** ");
273                        }
274                        InlineNode::Italic(s) => {
275                            if !para.ends_with(' ') && !para.is_empty() {
276                                para.push(' ');
277                            }
278                            para.push('_');
279                            para.push_str(s);
280                            para.push_str("_ ");
281                        }
282                        InlineNode::Link { text, href } => {
283                            if !para.ends_with(' ') && !para.is_empty() {
284                                para.push(' ');
285                            }
286                            para.push('[');
287                            para.push_str(text);
288                            para.push_str("](");
289                            para.push_str(href);
290                            para.push_str(") ");
291                        }
292                        InlineNode::Ref { label, .. } => {
293                            // Append [N] directly — no space before a superscript marker
294                            para.push('[');
295                            para.push('^');
296                            para.push_str(label);
297                            para.push(']');
298                        }
299                    }
300                }
301                out.push_str(para.trim_end());
302                out.push('\n');
303            }
304            ArticleItem::Image(img) => {
305                emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
306                out.push('\n');
307                out.push_str("![");
308                out.push_str(&img.alt);
309                out.push_str("](");
310                out.push_str(&img.src);
311                out.push(')');
312                out.push('\n');
313                if !img.caption.is_empty() {
314                    out.push('_');
315                    out.push_str(&img.caption);
316                    out.push('_');
317                    out.push('\n');
318                }
319            }
320            ArticleItem::References(refs) => {
321                if refs.is_empty() {
322                    continue;
323                }
324                out.push_str("\n## References\n");
325                for (note_id, citation) in sorted_refs(refs) {
326                    // Extract the numeric label from the note_id tail
327                    let label = note_id.rsplit('-').next().unwrap_or(note_id.as_str());
328                    out.push('\n');
329                    out.push('[');
330                    out.push('^');
331                    out.push_str(label);
332                    out.push_str("]: ");
333                    out.push_str(citation);
334                    out.push('\n');
335                }
336            }
337        }
338    }
339
340    out
341}