wikipedia_article_transform/
formatters.rs1use std::collections::HashMap;
9
10use crate::{ArticleItem, ImageSegment, InlineNode, TextSegment};
11use serde::Serialize;
12
13pub trait ArticleFormat {
15 fn format_plain(&self) -> String;
20
21 fn format_json(&self) -> anyhow::Result<String>;
32
33 fn format_markdown(&self) -> String;
40}
41
42impl ArticleFormat for Vec<ArticleItem> {
43 fn format_plain(&self) -> String {
44 format_plain(self)
45 }
46 fn format_json(&self) -> anyhow::Result<String> {
47 format_json(self)
48 }
49 fn format_markdown(&self) -> String {
50 format_markdown(self)
51 }
52}
53
54impl ArticleFormat for &[ArticleItem] {
55 fn format_plain(&self) -> String {
56 format_plain(self)
57 }
58 fn format_json(&self) -> anyhow::Result<String> {
59 format_json(self)
60 }
61 fn format_markdown(&self) -> String {
62 format_markdown(self)
63 }
64}
65
/// Appends a `#`-prefixed heading to `out` when the section changes.
///
/// Nothing happens when `seg_section` equals `last_section`. Otherwise:
///
/// * a blank separator line is pushed if `out` already has content;
/// * if `seg_section` is non-empty, a heading line is written using
///   `seg_level` hashes (at least one) followed by the last
///   `" - "`-separated component of `seg_section`;
/// * `last_section` is updated to `seg_section`.
fn emit_section_heading(
    out: &mut String,
    seg_section: &str,
    seg_level: u8,
    last_section: &mut String,
) {
    if seg_section == last_section.as_str() {
        return;
    }

    if !out.is_empty() {
        out.push('\n');
    }

    if !seg_section.is_empty() {
        // Only the innermost heading of a "Parent - Child" path is printed.
        let heading = seg_section.rsplit(" - ").next().unwrap_or(seg_section);
        // A level of 0 still yields a single '#'.
        out.extend(std::iter::repeat('#').take(usize::from(seg_level.max(1))));
        out.push(' ');
        out.push_str(heading);
        out.push('\n');
    }

    // Reuse the existing buffer instead of allocating a fresh String.
    last_section.clear();
    last_section.push_str(seg_section);
}
87
88fn format_plain(items: &[ArticleItem]) -> String {
89 let mut out = String::new();
90 let mut last_section = String::new();
91
92 for item in items {
93 match item {
94 ArticleItem::Paragraph(seg) => {
95 let text = seg.text.trim();
96 if text.is_empty() {
97 continue;
98 }
99 emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
100 out.push('\n');
101 out.push_str(text);
102 out.push('\n');
103 }
104 ArticleItem::Image(img) => {
105 emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
106 out.push('\n');
107 out.push_str("[Image: ");
108 out.push_str(&img.alt);
109 out.push(']');
110 out.push('\n');
111 if !img.caption.is_empty() {
112 out.push_str(&img.caption);
113 out.push('\n');
114 }
115 }
116 ArticleItem::References(_) => {} }
118 }
119
120 out
121}
122
123fn format_json(items: &[ArticleItem]) -> anyhow::Result<String> {
124 #[derive(Serialize)]
125 struct CitationEntry {
126 label: String,
127 text: String,
128 }
129
130 #[derive(Serialize)]
131 struct ParagraphEntry {
132 text: String,
133 citations: Vec<CitationEntry>,
134 }
135
136 #[derive(Serialize)]
137 struct ImageEntry {
138 src: String,
139 alt: String,
140 caption: String,
141 }
142
143 impl From<&ImageSegment> for ImageEntry {
144 fn from(img: &ImageSegment) -> Self {
145 ImageEntry {
146 src: img.src.clone(),
147 alt: img.alt.clone(),
148 caption: img.caption.clone(),
149 }
150 }
151 }
152
153 #[derive(Serialize)]
154 struct Section {
155 heading: String,
156 level: u8,
157 paragraphs: Vec<ParagraphEntry>,
158 images: Vec<ImageEntry>,
159 subsections: Vec<Section>,
160 }
161
162 #[derive(Serialize)]
163 struct ArticleTree {
164 intro: Vec<ParagraphEntry>,
165 intro_images: Vec<ImageEntry>,
166 sections: Vec<Section>,
167 references: HashMap<String, String>,
168 }
169
170 fn paragraph_from_segment(
171 seg: &TextSegment,
172 references: &HashMap<String, String>,
173 ) -> Option<ParagraphEntry> {
174 let text = seg.text.trim().to_string();
175 if text.is_empty() {
176 return None;
177 }
178
179 let mut citations = Vec::new();
180 let mut seen_note_ids: Vec<&str> = Vec::new();
181 for node in &seg.content {
182 if let InlineNode::Ref { label, note_id } = node {
183 if seen_note_ids.iter().any(|seen| *seen == note_id) {
184 continue;
185 }
186 seen_note_ids.push(note_id);
187 citations.push(CitationEntry {
188 label: label.clone(),
189 text: references.get(note_id).cloned().unwrap_or_default(),
190 });
191 }
192 }
193
194 Some(ParagraphEntry { text, citations })
195 }
196
197 let references = items
198 .iter()
199 .find_map(|item| {
200 if let ArticleItem::References(refs) = item {
201 Some(refs.clone())
202 } else {
203 None
204 }
205 })
206 .unwrap_or_default();
207
208 let mut tree = ArticleTree {
209 intro: Vec::new(),
210 intro_images: Vec::new(),
211 sections: Vec::new(),
212 references: references.clone(),
213 };
214
215 for item in items {
216 match item {
217 ArticleItem::Paragraph(seg) => {
218 let Some(paragraph) = paragraph_from_segment(seg, &references) else {
219 continue;
220 };
221 if seg.section.is_empty() {
222 tree.intro.push(paragraph);
223 continue;
224 }
225 let parts: Vec<&str> = seg.section.split(" - ").collect();
226 let mut siblings = &mut tree.sections;
227 for (i, part) in parts.iter().enumerate() {
228 let depth_from_bottom = (parts.len() - 1 - i) as u8;
229 let level = seg.section_level.saturating_sub(depth_from_bottom);
230 if !siblings.iter().any(|s: &Section| s.heading == *part) {
231 siblings.push(Section {
232 heading: part.to_string(),
233 level,
234 paragraphs: Vec::new(),
235 images: Vec::new(),
236 subsections: Vec::new(),
237 });
238 }
239 let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
240 if i == parts.len() - 1 {
241 siblings[idx].paragraphs.push(paragraph);
242 break;
243 } else {
244 siblings = &mut siblings[idx].subsections;
245 }
246 }
247 }
248 ArticleItem::Image(img) => {
249 let entry = ImageEntry::from(img);
250 if img.section.is_empty() {
251 tree.intro_images.push(entry);
252 continue;
253 }
254 let parts: Vec<&str> = img.section.split(" - ").collect();
255 let mut siblings = &mut tree.sections;
256 for (i, part) in parts.iter().enumerate() {
257 let depth_from_bottom = (parts.len() - 1 - i) as u8;
258 let level = img.section_level.saturating_sub(depth_from_bottom);
259 if !siblings.iter().any(|s: &Section| s.heading == *part) {
260 siblings.push(Section {
261 heading: part.to_string(),
262 level,
263 paragraphs: Vec::new(),
264 images: Vec::new(),
265 subsections: Vec::new(),
266 });
267 }
268 let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
269 if i == parts.len() - 1 {
270 siblings[idx].images.push(entry);
271 break;
272 } else {
273 siblings = &mut siblings[idx].subsections;
274 }
275 }
276 }
277 ArticleItem::References(_) => {}
278 }
279 }
280
281 Ok(serde_json::to_string_pretty(&tree)?)
282}
283
/// Returns the entries of `refs` in a deterministic citation order.
///
/// The primary key is the number after the last `-` of the note id; ids
/// without a numeric suffix sort last (as `u32::MAX`). Entries with equal
/// primary keys are ordered by note id, so the result does not depend on
/// the map's iteration order.
fn sorted_refs(refs: &HashMap<String, String>) -> Vec<(&String, &String)> {
    let mut entries: Vec<(&String, &String)> = refs.iter().collect();
    entries.sort_by_cached_key(|(note_id, _)| {
        let number = note_id
            .rsplit('-')
            .next()
            .and_then(|n| n.parse::<u32>().ok())
            .unwrap_or(u32::MAX);
        // The note id breaks ties; map keys are unique, so the order is total.
        (number, *note_id)
    });
    entries
}
296
297fn format_markdown(items: &[ArticleItem]) -> String {
298 let mut out = String::new();
299 let mut last_section = String::new();
300
301 for item in items {
302 match item {
303 ArticleItem::Paragraph(seg) => {
304 if seg.text.trim().is_empty() {
305 continue;
306 }
307 emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
308 out.push('\n');
309 let mut para = String::new();
310 for node in &seg.content {
311 match node {
312 InlineNode::Text(s) => para.push_str(s),
313 InlineNode::Bold(s) => {
314 if !para.ends_with(' ') && !para.is_empty() {
315 para.push(' ');
316 }
317 para.push_str("**");
318 para.push_str(s);
319 para.push_str("** ");
320 }
321 InlineNode::Italic(s) => {
322 if !para.ends_with(' ') && !para.is_empty() {
323 para.push(' ');
324 }
325 para.push('_');
326 para.push_str(s);
327 para.push_str("_ ");
328 }
329 InlineNode::Link { text, href } => {
330 if !para.ends_with(' ') && !para.is_empty() {
331 para.push(' ');
332 }
333 para.push('[');
334 para.push_str(text);
335 para.push_str("](");
336 para.push_str(href);
337 para.push_str(") ");
338 }
339 InlineNode::Ref { label, .. } => {
340 para.push('[');
342 para.push('^');
343 para.push_str(label);
344 para.push(']');
345 }
346 }
347 }
348 out.push_str(para.trim_end());
349 out.push('\n');
350 }
351 ArticleItem::Image(img) => {
352 emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
353 out.push('\n');
354 out.push_str(";
357 out.push_str(&img.src);
358 out.push(')');
359 out.push('\n');
360 if !img.caption.is_empty() {
361 out.push('_');
362 out.push_str(&img.caption);
363 out.push('_');
364 out.push('\n');
365 }
366 }
367 ArticleItem::References(refs) => {
368 if refs.is_empty() {
369 continue;
370 }
371 out.push_str("\n## References\n");
372 for (note_id, citation) in sorted_refs(refs) {
373 let label = note_id.rsplit('-').next().unwrap_or(note_id.as_str());
375 out.push('\n');
376 out.push('[');
377 out.push('^');
378 out.push_str(label);
379 out.push_str("]: ");
380 out.push_str(citation);
381 out.push('\n');
382 }
383 }
384 }
385 }
386
387 out
388}