wikipedia_article_transform/
formatters.rs1use std::collections::HashMap;
9
10use crate::{ArticleItem, ImageSegment, InlineNode};
11use serde::Serialize;
12
13pub trait ArticleFormat {
15 fn format_plain(&self) -> String;
20
21 fn format_json(&self) -> anyhow::Result<String>;
32
33 fn format_markdown(&self) -> String;
40}
41
42impl ArticleFormat for Vec<ArticleItem> {
43 fn format_plain(&self) -> String {
44 format_plain(self)
45 }
46 fn format_json(&self) -> anyhow::Result<String> {
47 format_json(self)
48 }
49 fn format_markdown(&self) -> String {
50 format_markdown(self)
51 }
52}
53
54impl ArticleFormat for &[ArticleItem] {
55 fn format_plain(&self) -> String {
56 format_plain(self)
57 }
58 fn format_json(&self) -> anyhow::Result<String> {
59 format_json(self)
60 }
61 fn format_markdown(&self) -> String {
62 format_markdown(self)
63 }
64}
65
/// Appends a `#`-style heading to `out` when the section has changed.
///
/// Nothing is written when `seg_section` equals `last_section`. Otherwise:
///
/// * a blank separator line is pushed if `out` already has content;
/// * if `seg_section` is non-empty, a heading line is emitted. Only the text
///   after the last `" - "` separator is used as the title, and the number of
///   `#` characters is `seg_level` clamped to `1..=6`;
/// * `last_section` is updated to `seg_section`.
///
/// The clamp matters because CommonMark recognises at most six `#` in an ATX
/// heading; a longer run would be rendered as literal text.
fn emit_section_heading(
    out: &mut String,
    seg_section: &str,
    seg_level: u8,
    last_section: &mut String,
) {
    // Deepest heading level that Markdown renderers accept.
    const MAX_HEADING_LEVEL: u8 = 6;

    if seg_section == last_section.as_str() {
        return;
    }

    // Separate the new section from whatever was written before it.
    if !out.is_empty() {
        out.push('\n');
    }

    if !seg_section.is_empty() {
        let depth = usize::from(seg_level.clamp(1, MAX_HEADING_LEVEL));
        // The section string is a " - "-joined path; print the innermost title only.
        let heading = seg_section.rsplit(" - ").next().unwrap_or(seg_section);
        out.push_str(&"#".repeat(depth));
        out.push(' ');
        out.push_str(heading);
        out.push('\n');
    }

    // Reuse the existing allocation instead of building a fresh `String`.
    last_section.clear();
    last_section.push_str(seg_section);
}
87
88fn format_plain(items: &[ArticleItem]) -> String {
89 let mut out = String::new();
90 let mut last_section = String::new();
91
92 for item in items {
93 match item {
94 ArticleItem::Paragraph(seg) => {
95 let text = seg.text.trim();
96 if text.is_empty() {
97 continue;
98 }
99 emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
100 out.push('\n');
101 out.push_str(text);
102 out.push('\n');
103 }
104 ArticleItem::Image(img) => {
105 emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
106 out.push('\n');
107 out.push_str("[Image: ");
108 out.push_str(&img.alt);
109 out.push(']');
110 out.push('\n');
111 if !img.caption.is_empty() {
112 out.push_str(&img.caption);
113 out.push('\n');
114 }
115 }
116 ArticleItem::References(_) => {} }
118 }
119
120 out
121}
122
123fn format_json(items: &[ArticleItem]) -> anyhow::Result<String> {
124 #[derive(Serialize)]
125 struct ImageEntry {
126 src: String,
127 alt: String,
128 caption: String,
129 }
130
131 impl From<&ImageSegment> for ImageEntry {
132 fn from(img: &ImageSegment) -> Self {
133 ImageEntry {
134 src: img.src.clone(),
135 alt: img.alt.clone(),
136 caption: img.caption.clone(),
137 }
138 }
139 }
140
141 #[derive(Serialize)]
142 struct Section {
143 heading: String,
144 level: u8,
145 paragraphs: Vec<String>,
146 images: Vec<ImageEntry>,
147 subsections: Vec<Section>,
148 }
149
150 #[derive(Serialize)]
151 struct ArticleTree {
152 intro: Vec<String>,
153 intro_images: Vec<ImageEntry>,
154 sections: Vec<Section>,
155 references: HashMap<String, String>,
156 }
157
158 let mut tree = ArticleTree {
159 intro: Vec::new(),
160 intro_images: Vec::new(),
161 sections: Vec::new(),
162 references: HashMap::new(),
163 };
164
165 for item in items {
166 match item {
167 ArticleItem::Paragraph(seg) => {
168 let text = seg.text.trim().to_string();
169 if text.is_empty() {
170 continue;
171 }
172 if seg.section.is_empty() {
173 tree.intro.push(text);
174 continue;
175 }
176 let parts: Vec<&str> = seg.section.split(" - ").collect();
177 let mut siblings = &mut tree.sections;
178 for (i, part) in parts.iter().enumerate() {
179 let depth_from_bottom = (parts.len() - 1 - i) as u8;
180 let level = seg.section_level.saturating_sub(depth_from_bottom);
181 if !siblings.iter().any(|s: &Section| s.heading == *part) {
182 siblings.push(Section {
183 heading: part.to_string(),
184 level,
185 paragraphs: Vec::new(),
186 images: Vec::new(),
187 subsections: Vec::new(),
188 });
189 }
190 let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
191 if i == parts.len() - 1 {
192 siblings[idx].paragraphs.push(text.clone());
193 break;
194 } else {
195 siblings = &mut siblings[idx].subsections;
196 }
197 }
198 }
199 ArticleItem::Image(img) => {
200 let entry = ImageEntry::from(img);
201 if img.section.is_empty() {
202 tree.intro_images.push(entry);
203 continue;
204 }
205 let parts: Vec<&str> = img.section.split(" - ").collect();
206 let mut siblings = &mut tree.sections;
207 for (i, part) in parts.iter().enumerate() {
208 let depth_from_bottom = (parts.len() - 1 - i) as u8;
209 let level = img.section_level.saturating_sub(depth_from_bottom);
210 if !siblings.iter().any(|s: &Section| s.heading == *part) {
211 siblings.push(Section {
212 heading: part.to_string(),
213 level,
214 paragraphs: Vec::new(),
215 images: Vec::new(),
216 subsections: Vec::new(),
217 });
218 }
219 let idx = siblings.iter().position(|s| s.heading == *part).unwrap();
220 if i == parts.len() - 1 {
221 siblings[idx].images.push(entry);
222 break;
223 } else {
224 siblings = &mut siblings[idx].subsections;
225 }
226 }
227 }
228 ArticleItem::References(refs) => {
229 tree.references = refs.clone();
230 }
231 }
232 }
233
234 Ok(serde_json::to_string_pretty(&tree)?)
235}
236
/// Returns the entries of `refs` in a stable, deterministic order.
///
/// Entries are ordered by the number that follows the last `-` in the note id
/// (for example `10` in `cite_note-10`). Ids whose suffix does not parse as a
/// `u32` sort after all numbered ones. Entries with the same number are
/// ordered by the full note id, so the result never depends on the map's
/// iteration order.
fn sorted_refs(refs: &HashMap<String, String>) -> Vec<(&String, &String)> {
    let mut entries: Vec<(&String, &String)> = refs.iter().collect();
    // Map keys are unique, so `(number, id)` is a total order and an unstable
    // sort yields the same result as a stable one.
    entries.sort_unstable_by_key(|&(note_id, _)| {
        let number = note_id
            .rsplit('-')
            .next()
            .and_then(|suffix| suffix.parse::<u32>().ok())
            .unwrap_or(u32::MAX);
        (number, note_id)
    });
    entries
}
249
250fn format_markdown(items: &[ArticleItem]) -> String {
251 let mut out = String::new();
252 let mut last_section = String::new();
253
254 for item in items {
255 match item {
256 ArticleItem::Paragraph(seg) => {
257 if seg.text.trim().is_empty() {
258 continue;
259 }
260 emit_section_heading(&mut out, &seg.section, seg.section_level, &mut last_section);
261 out.push('\n');
262 let mut para = String::new();
263 for node in &seg.content {
264 match node {
265 InlineNode::Text(s) => para.push_str(s),
266 InlineNode::Bold(s) => {
267 if !para.ends_with(' ') && !para.is_empty() {
268 para.push(' ');
269 }
270 para.push_str("**");
271 para.push_str(s);
272 para.push_str("** ");
273 }
274 InlineNode::Italic(s) => {
275 if !para.ends_with(' ') && !para.is_empty() {
276 para.push(' ');
277 }
278 para.push('_');
279 para.push_str(s);
280 para.push_str("_ ");
281 }
282 InlineNode::Link { text, href } => {
283 if !para.ends_with(' ') && !para.is_empty() {
284 para.push(' ');
285 }
286 para.push('[');
287 para.push_str(text);
288 para.push_str("](");
289 para.push_str(href);
290 para.push_str(") ");
291 }
292 InlineNode::Ref { label, .. } => {
293 para.push('[');
295 para.push('^');
296 para.push_str(label);
297 para.push(']');
298 }
299 }
300 }
301 out.push_str(para.trim_end());
302 out.push('\n');
303 }
304 ArticleItem::Image(img) => {
305 emit_section_heading(&mut out, &img.section, img.section_level, &mut last_section);
306 out.push('\n');
307 out.push_str(";
310 out.push_str(&img.src);
311 out.push(')');
312 out.push('\n');
313 if !img.caption.is_empty() {
314 out.push('_');
315 out.push_str(&img.caption);
316 out.push('_');
317 out.push('\n');
318 }
319 }
320 ArticleItem::References(refs) => {
321 if refs.is_empty() {
322 continue;
323 }
324 out.push_str("\n## References\n");
325 for (note_id, citation) in sorted_refs(refs) {
326 let label = note_id.rsplit('-').next().unwrap_or(note_id.as_str());
328 out.push('\n');
329 out.push('[');
330 out.push('^');
331 out.push_str(label);
332 out.push_str("]: ");
333 out.push_str(citation);
334 out.push('\n');
335 }
336 }
337 }
338 }
339
340 out
341}