hwpforge_smithy_hwpx/decoder/
mod.rs1pub(crate) mod chart;
9pub(crate) mod header;
10pub(crate) mod package;
11pub(crate) mod section;
12pub(crate) mod shapes;
13
14use std::path::Path;
15
16use hwpforge_core::document::{Document, Draft};
17use hwpforge_core::image::ImageStore;
18use hwpforge_core::section::{MasterPage, Section};
19use hwpforge_core::PageSettings;
20use hwpforge_foundation::ApplyPageType;
21
22use crate::error::HwpxResult;
23use crate::style_store::HwpxStyleStore;
24
25#[derive(Debug)]
33#[non_exhaustive]
34pub struct HwpxDocument {
35 pub document: Document<Draft>,
37 pub style_store: HwpxStyleStore,
39 pub image_store: ImageStore,
41}
42
43pub struct HwpxDecoder;
57
58impl HwpxDecoder {
59 pub fn decode(bytes: &[u8]) -> HwpxResult<HwpxDocument> {
67 let mut pkg = package::PackageReader::new(bytes)?;
69
70 let header_xml = pkg.read_header_xml()?;
72 let header_result = header::parse_header(&header_xml)?;
73 let style_store = header_result.style_store;
74 let begin_num = header_result.begin_num;
75
76 let chart_xmls = pkg.read_chart_xmls()?;
78
79 let masterpage_xmls = pkg.read_masterpage_xmls()?;
81 let parsed_masterpages = parse_masterpages(masterpage_xmls);
82
83 let mut document = Document::<Draft>::new();
85 let section_count = pkg.section_count();
86 let mut masterpage_cursor = 0usize;
88
89 for i in 0..section_count {
90 let section_xml = pkg.read_section_xml(i)?;
91 let result = section::parse_section(§ion_xml, i, &chart_xmls)?;
92
93 let page_settings = result.page_settings.unwrap_or_else(PageSettings::a4);
94
95 let mp_cnt = extract_master_page_cnt(§ion_xml);
99 let section_master_pages: Option<Vec<MasterPage>> = if mp_cnt > 0 {
100 let end = (masterpage_cursor + mp_cnt).min(parsed_masterpages.len());
101 let slice = parsed_masterpages[masterpage_cursor..end].to_vec();
102 masterpage_cursor = end;
103 if slice.is_empty() {
104 result.master_pages
105 } else {
106 Some(slice)
107 }
108 } else {
109 result.master_pages
110 };
111
112 let section = Section {
113 paragraphs: result.paragraphs,
114 page_settings,
115 header: result.header,
116 footer: result.footer,
117 page_number: result.page_number,
118 column_settings: result.column_settings,
119 visibility: result.visibility,
120 line_number_shape: result.line_number_shape,
121 page_border_fills: result.page_border_fills,
122 master_pages: section_master_pages,
123 begin_num: {
126 let mut bn = result.begin_num;
127 if i == 0 {
128 if let (Some(ref mut section_bn), Some(ref header_bn)) =
129 (&mut bn, &begin_num)
130 {
131 section_bn.footnote = header_bn.footnote;
132 section_bn.endnote = header_bn.endnote;
133 } else if bn.is_none() {
134 bn = begin_num;
135 }
136 }
137 bn
138 },
139 text_direction: result.text_direction,
140 };
141
142 document.add_section(section);
143 }
144
145 let image_store = pkg.read_all_bindata()?;
147
148 Ok(HwpxDocument { document, style_store, image_store })
149 }
150
151 pub fn decode_file(path: impl AsRef<Path>) -> HwpxResult<HwpxDocument> {
153 let bytes = std::fs::read(path.as_ref()).map_err(crate::error::HwpxError::Io)?;
154 Self::decode(&bytes)
155 }
156}
157
158fn parse_masterpages(xmls: std::collections::HashMap<usize, String>) -> Vec<MasterPage> {
165 let mut entries: Vec<(usize, String)> = xmls.into_iter().collect();
166 entries.sort_by_key(|(idx, _)| *idx);
167 entries.into_iter().map(|(_, xml)| parse_masterpage_xml(&xml)).collect()
168}
169
170fn parse_masterpage_xml(xml: &str) -> MasterPage {
176 use hwpforge_core::paragraph::Paragraph;
177 use hwpforge_core::run::{Run, RunContent};
178 use hwpforge_foundation::{CharShapeIndex, ParaShapeIndex};
179
180 let apply_page_type = extract_masterpage_apply_type(xml);
182
183 let mut paragraphs = Vec::new();
187 let mut search = xml;
188 while let Some(p_start) = search.find("<hp:p ").or_else(|| search.find("<hp:p>")) {
189 let after_p = &search[p_start..];
190 let Some(tag_end) = after_p.find('>') else { break };
192 let open_tag = &after_p[..tag_end];
193 let after_tag = &after_p[tag_end + 1..];
194 let Some(p_close) = after_tag.find("</hp:p>") else { break };
195 let p_content = &after_tag[..p_close];
196
197 let para_pr_id = extract_attr_u32(open_tag, "paraPrIDRef");
199
200 let mut runs = Vec::new();
202 let mut run_search = p_content;
203 while let Some(r_start) =
204 run_search.find("<hp:run ").or_else(|| run_search.find("<hp:run>"))
205 {
206 let after_r = &run_search[r_start..];
207 let Some(r_tag_end) = after_r.find('>') else { break };
208 let run_open = &after_r[..r_tag_end];
209 let char_pr_id = extract_attr_u32(run_open, "charPrIDRef");
210
211 let after_run_tag = &after_r[r_tag_end + 1..];
213 if let Some(t_start) = after_run_tag.find("<hp:t>") {
214 let after_t = &after_run_tag[t_start + "<hp:t>".len()..];
215 if let Some(t_end) = after_t.find("</hp:t>") {
216 let text = &after_t[..t_end];
217 if !text.is_empty() {
218 runs.push(Run {
219 content: RunContent::Text(text.to_string()),
220 char_shape_id: CharShapeIndex::new(char_pr_id as usize),
221 });
222 }
223 }
224 }
225
226 let run_end_tag = "</hp:run>";
228 if let Some(re) = after_r.find(run_end_tag) {
229 run_search = &after_r[re + run_end_tag.len()..];
230 } else {
231 break;
232 }
233 }
234
235 let mut para = Paragraph::new(ParaShapeIndex::new(para_pr_id as usize));
236 for run in runs {
237 para.runs.push(run);
238 }
239 paragraphs.push(para);
240
241 search = &after_tag[p_close + "</hp:p>".len()..];
243 }
244
245 MasterPage { apply_page_type, paragraphs }
246}
247
/// Reads the double-quoted attribute `attr_name` from an opening tag and
/// parses it as `u32`.
///
/// Only occurrences that start at an attribute boundary are considered: the
/// name must be preceded by whitespace (or sit at the very start of
/// `open_tag`). This stops `paraPrIDRef` from being read out of an unrelated
/// attribute such as `oldparaPrIDRef`.
///
/// Returns `0` when the attribute is absent, its closing quote is missing, or
/// its value is not a valid `u32`.
fn extract_attr_u32(open_tag: &str, attr_name: &str) -> u32 {
    let needle = format!("{attr_name}=\"");
    let mut offset = 0;

    while let Some(rel) = open_tag[offset..].find(&needle) {
        let pos = offset + rel;
        let value_start = pos + needle.len();

        let at_boundary = open_tag[..pos].chars().next_back().map_or(true, char::is_whitespace);
        if at_boundary {
            let after = &open_tag[value_start..];
            return after.find('"').and_then(|end| after[..end].parse().ok()).unwrap_or(0);
        }

        // The hit was the tail of a longer attribute name; keep looking.
        offset = value_start;
    }

    0
}
261
262fn extract_masterpage_apply_type(xml: &str) -> ApplyPageType {
264 if let Some(pos) = xml.find("type=\"") {
267 let after = &xml[pos + "type=\"".len()..];
268 if let Some(end) = after.find('"') {
269 return match &after[..end] {
270 "BOTH" => ApplyPageType::Both,
271 "EVEN" => ApplyPageType::Even,
272 "ODD" => ApplyPageType::Odd,
273 _ => ApplyPageType::Both,
274 };
275 }
276 }
277 ApplyPageType::Both
278}
279
/// Reads the value of the first `masterPageCnt="…"` occurrence in a section
/// XML document.
///
/// Returns `0` when the attribute is absent, its closing quote is missing, or
/// the value does not parse as `usize`.
fn extract_master_page_cnt(section_xml: &str) -> usize {
    const NEEDLE: &str = "masterPageCnt=\"";

    let Some(pos) = section_xml.find(NEEDLE) else { return 0 };
    let after = &section_xml[pos + NEEDLE.len()..];
    after.find('"').and_then(|end| after[..end].parse().ok()).unwrap_or(0)
}
294
#[cfg(test)]
mod tests {
    use super::*;
    use hwpforge_foundation::{HeadingType, NumberFormatType};
    use std::io::{Cursor, Write};
    use std::path::PathBuf;
    use zip::write::SimpleFileOptions;
    use zip::ZipWriter;

    /// Builds an in-memory HWPX archive.
    ///
    /// The archive contains a stored `mimetype` entry, the given header XML,
    /// one `Contents/section{i}.xml` per element of `section_xmls`, and every
    /// `(path, bytes)` pair of `extra_files` written uncompressed.
    fn build_hwpx(
        header_xml: &str,
        section_xmls: &[&str],
        extra_files: &[(&str, &[u8])],
    ) -> Vec<u8> {
        let mut archive = ZipWriter::new(Cursor::new(Vec::new()));

        let stored =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
        let deflate = SimpleFileOptions::default();

        archive.start_file("mimetype", stored).unwrap();
        archive.write_all(b"application/hwp+zip").unwrap();

        archive.start_file("Contents/header.xml", deflate).unwrap();
        archive.write_all(header_xml.as_bytes()).unwrap();

        for (i, xml) in section_xmls.iter().enumerate() {
            let path = format!("Contents/section{}.xml", i);
            archive.start_file(&path, deflate).unwrap();
            archive.write_all(xml.as_bytes()).unwrap();
        }

        for &(path, data) in extra_files {
            archive.start_file(path, stored).unwrap();
            archive.write_all(data).unwrap();
        }

        archive.finish().unwrap().into_inner()
    }

    /// Builds an in-memory HWPX archive from a header and section XML strings.
    fn make_test_hwpx(header_xml: &str, section_xmls: &[&str]) -> Vec<u8> {
        build_hwpx(header_xml, section_xmls, &[])
    }

    /// Resolves `name` relative to the workspace-level `tests/fixtures` directory.
    fn fixture_path(name: &str) -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../tests/fixtures").join(name)
    }

    /// Loads and decodes a fixture, panicking with the underlying error so a
    /// failing test shows *why* the fixture could not be read or decoded.
    fn decode_fixture(name: &str) -> HwpxDocument {
        let path = fixture_path(name);
        let bytes = std::fs::read(&path)
            .unwrap_or_else(|err| panic!("fixture should exist: {path:?}: {err:?}"));
        HwpxDecoder::decode(&bytes)
            .unwrap_or_else(|err| panic!("fixture should decode: {path:?}: {err:?}"))
    }

    /// Collects `(heading_type, heading_id_ref, heading_level)` for every
    /// top-level paragraph of every section.
    fn collect_body_heading_triples(doc: &HwpxDocument) -> Vec<(HeadingType, u32, u32)> {
        doc.document
            .sections()
            .iter()
            .flat_map(|section| section.paragraphs.iter())
            .map(|paragraph| {
                let shape = doc
                    .style_store
                    .para_shape(paragraph.para_shape_id)
                    .expect("paragraph para shape should exist");
                (shape.heading_type, shape.heading_id_ref, shape.heading_level)
            })
            .collect()
    }

    // `r##` is required: the `"#000000"` colour value contains `"#`.
    const HEADER: &str = r##"<head version="1.4" secCnt="1">
        <refList>
            <fontfaces itemCnt="1">
                <fontface lang="HANGUL" fontCnt="1">
                    <font id="0" face="함초롬돋움" type="TTF" isEmbedded="0"/>
                </fontface>
            </fontfaces>
            <charProperties itemCnt="1">
                <charPr id="0" height="1000" textColor="#000000" shadeColor="none"
                        useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0">
                    <fontRef hangul="0" latin="0" hanja="0" japanese="0" other="0" symbol="0" user="0"/>
                </charPr>
            </charProperties>
            <paraProperties itemCnt="1">
                <paraPr id="0">
                    <align horizontal="LEFT" vertical="BASELINE"/>
                    <switch><default>
                        <lineSpacing type="PERCENT" value="160"/>
                    </default></switch>
                </paraPr>
            </paraProperties>
        </refList>
    </head>"##;

    const SECTION_TEXT: &str = r#"<sec>
        <p paraPrIDRef="0">
            <run charPrIDRef="0">
                <secPr textDirection="HORIZONTAL">
                    <pagePr landscape="WIDELY" width="59528" height="84188">
                        <margin header="4252" footer="4252" gutter="0"
                                left="8504" right="8504" top="5668" bottom="4252"/>
                    </pagePr>
                </secPr>
                <t>안녕하세요</t>
            </run>
        </p>
    </sec>"#;

    #[test]
    fn decode_minimal_hwpx() {
        let bytes = make_test_hwpx(HEADER, &[SECTION_TEXT]);
        let result = HwpxDecoder::decode(&bytes).unwrap();

        assert_eq!(result.document.sections().len(), 1);
        let section = &result.document.sections()[0];
        assert_eq!(section.paragraphs.len(), 1);

        let text = section.paragraphs[0].runs[0].content.as_text();
        assert_eq!(text, Some("안녕하세요"));

        assert_eq!(section.page_settings.width.as_i32(), 59528);
        assert_eq!(section.page_settings.height.as_i32(), 84188);

        assert_eq!(result.style_store.font_count(), 1);
        assert_eq!(result.style_store.char_shape_count(), 1);
        assert_eq!(result.style_store.para_shape_count(), 1);
    }

    #[test]
    fn decode_multiple_sections() {
        let s0 = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Section 0</t></run></p></sec>"#;
        let s1 = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Section 1</t></run></p></sec>"#;
        let bytes = make_test_hwpx(HEADER, &[s0, s1]);
        let result = HwpxDecoder::decode(&bytes).unwrap();
        assert_eq!(result.document.sections().len(), 2);
    }

    #[test]
    fn decode_with_table() {
        let section = r#"<sec>
            <p paraPrIDRef="0">
                <run charPrIDRef="0">
                    <tbl rowCnt="1" colCnt="1">
                        <tr>
                            <tc name="A1">
                                <cellSz width="5000" height="1000"/>
                                <subList><p paraPrIDRef="0"><run charPrIDRef="0"><t>Cell</t></run></p></subList>
                            </tc>
                        </tr>
                    </tbl>
                </run>
            </p>
        </sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();
        let run = &result.document.sections()[0].paragraphs[0].runs[0];
        assert!(run.content.is_table());
    }

    #[test]
    fn decode_section_without_secpr_uses_a4_defaults() {
        let section = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Text</t></run></p></sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();
        let ps = &result.document.sections()[0].page_settings;
        assert_eq!(*ps, PageSettings::a4());
    }

    #[test]
    fn decode_not_a_zip() {
        let err = HwpxDecoder::decode(b"not a zip").unwrap_err();
        assert!(matches!(err, crate::error::HwpxError::Zip(_)));
    }

    #[test]
    fn decode_file_nonexistent() {
        let err = HwpxDecoder::decode_file("/nonexistent/path.hwpx").unwrap_err();
        assert!(matches!(err, crate::error::HwpxError::Io(_)));
    }

    #[test]
    fn decode_section_with_header_ctrl() {
        let section = r#"<sec>
            <p paraPrIDRef="0">
                <run charPrIDRef="0">
                    <ctrl>
                        <header id="0" applyPageType="BOTH">
                            <subList id="0" textDirection="HORIZONTAL" lineWrap="BREAK" vertAlign="TOP"
                                     linkListIDRef="0" linkListNextIDRef="0" textWidth="0" textHeight="0">
                                <p paraPrIDRef="0">
                                    <run charPrIDRef="0"><t>Page Header</t></run>
                                </p>
                            </subList>
                        </header>
                    </ctrl>
                    <t>Body text</t>
                </run>
            </p>
        </sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();

        let sec = &result.document.sections()[0];
        let header = sec.header.as_ref().expect("section should have header");
        assert_eq!(header.apply_page_type, hwpforge_foundation::ApplyPageType::Both);
        assert_eq!(header.paragraphs.len(), 1);
        assert_eq!(header.paragraphs[0].runs[0].content.as_text(), Some("Page Header"));
    }

    #[test]
    fn decode_section_with_footer_and_pagenum() {
        let section = r#"<sec>
            <p paraPrIDRef="0">
                <run charPrIDRef="0">
                    <ctrl>
                        <footer id="0" applyPageType="ODD">
                            <subList id="0" textDirection="HORIZONTAL" lineWrap="BREAK" vertAlign="TOP"
                                     linkListIDRef="0" linkListNextIDRef="0" textWidth="0" textHeight="0">
                                <p paraPrIDRef="0">
                                    <run charPrIDRef="0"><t>Footer</t></run>
                                </p>
                            </subList>
                        </footer>
                    </ctrl>
                    <ctrl>
                        <pageNum pos="BOTTOM_CENTER" formatType="DIGIT" sideChar="- "/>
                    </ctrl>
                    <t>Body</t>
                </run>
            </p>
        </sec>"#;
        let bytes = make_test_hwpx(HEADER, &[section]);
        let result = HwpxDecoder::decode(&bytes).unwrap();

        let sec = &result.document.sections()[0];
        let footer = sec.footer.as_ref().expect("section should have footer");
        assert_eq!(footer.apply_page_type, hwpforge_foundation::ApplyPageType::Odd);
        assert_eq!(footer.paragraphs[0].runs[0].content.as_text(), Some("Footer"));

        let pn = sec.page_number.as_ref().expect("section should have page number");
        assert_eq!(pn.position, hwpforge_foundation::PageNumberPosition::BottomCenter);
        assert_eq!(pn.number_format, NumberFormatType::Digit);
        assert_eq!(pn.decoration, "- ");
    }

    #[test]
    fn decode_extracts_bindata_images() {
        let section = r#"<sec><p paraPrIDRef="0"><run charPrIDRef="0"><t>Body</t></run></p></sec>"#;

        // Only the four-byte PNG magic prefix; the decoder is expected to
        // hand the bytes back untouched.
        let fake_png = vec![0x89, 0x50, 0x4E, 0x47];
        let extra_files: [(&str, &[u8]); 1] = [("BinData/logo.png", fake_png.as_slice())];

        let bytes = build_hwpx(HEADER, &[section], &extra_files);
        let result = HwpxDecoder::decode(&bytes).unwrap();

        assert!(!result.image_store.is_empty(), "image store should contain extracted images");
        let data = result.image_store.get("logo.png").expect("should find logo.png");
        assert_eq!(data, &fake_png);
    }

    #[test]
    fn decode_user_sample_bullet_list_preserves_bullet_semantics() {
        let decoded = decode_fixture("user_samples/lists/sample-bullet-list.hwpx");
        let headings = collect_body_heading_triples(&decoded);

        assert!(headings.contains(&(HeadingType::Bullet, 1, 0)));
        assert_eq!(decoded.style_store.bullet_count(), 1);
        assert_eq!(decoded.style_store.numbering_count(), 1);
        assert_eq!(decoded.style_store.iter_bullets().next().map(|bullet| bullet.id), Some(1));
    }

    #[test]
    fn decode_user_sample_numbered_list_preserves_numbering_semantics() {
        let decoded = decode_fixture("user_samples/lists/sample-numbered-list.hwpx");
        let headings = collect_body_heading_triples(&decoded);

        assert!(headings.contains(&(HeadingType::Number, 2, 0)));
        assert!(decoded.style_store.numbering_count() >= 2);
    }

    #[test]
    fn decode_user_sample_mixed_lists_with_outline_preserves_all_list_kinds() {
        let decoded = decode_fixture("user_samples/lists/sample-mixed-lists-with-outline.hwpx");
        let headings = collect_body_heading_triples(&decoded);

        assert!(headings.contains(&(HeadingType::Outline, 0, 0)));
        assert!(headings.contains(&(HeadingType::Outline, 0, 1)));
        assert!(headings.contains(&(HeadingType::Outline, 0, 2)));
        assert!(headings.contains(&(HeadingType::Bullet, 1, 0)));
        assert!(headings.contains(&(HeadingType::Number, 2, 0)));
        assert!(headings.contains(&(HeadingType::Number, 3, 0)));
        assert_eq!(decoded.style_store.bullet_count(), 1);
        assert!(decoded.style_store.numbering_count() >= 3);
    }

    #[test]
    fn decode_user_sample_numbered_custom_formats_preserves_distinct_numbering_ids() {
        let decoded = decode_fixture("user_samples/lists/sample-numbered-list-custom-formats.hwpx");
        let headings = collect_body_heading_triples(&decoded);

        for id_ref in [2, 3, 4, 5] {
            assert!(headings.contains(&(HeadingType::Number, id_ref, 0)));
        }
        assert!(decoded.style_store.numbering_count() >= 5);
        let numberings: Vec<_> = decoded.style_store.iter_numberings().collect();
        assert_eq!(numberings[1].levels[0].text, "^1)");
        assert_eq!(numberings[2].levels[0].text, "(^1)");
        assert_eq!(numberings[4].levels[6].num_format, NumberFormatType::CircledLatinSmall);
    }

    #[test]
    fn decode_user_sample_checkable_bullet_basic_preserves_checked_glyph_and_item_state() {
        let decoded = decode_fixture("user_samples/lists/sample-checkable-bullet-basic.hwpx");
        let paragraphs = &decoded.document.sections()[0].paragraphs;

        let unchecked = paragraphs
            .iter()
            .find(|paragraph| paragraph.text_content().contains("unchecked item A"))
            .expect("fixture should contain unchecked item A");
        // NOTE(review): "unchecked item A" does not contain the substring
        // "checked item B", so this lookup cannot hit the unchecked paragraph.
        let checked = paragraphs
            .iter()
            .find(|paragraph| paragraph.text_content().contains("checked item B"))
            .expect("fixture should contain checked item B");

        let unchecked_shape = decoded.style_store.para_shape(unchecked.para_shape_id).unwrap();
        let checked_shape = decoded.style_store.para_shape(checked.para_shape_id).unwrap();
        let bullet = decoded
            .style_store
            .iter_bullets()
            .find(|bullet| bullet.id == unchecked_shape.heading_id_ref)
            .expect("checkable bullet definition should exist");

        assert_eq!(unchecked_shape.heading_type, HeadingType::Bullet);
        assert_eq!(checked_shape.heading_type, HeadingType::Bullet);
        assert!(bullet.is_checkable());
        assert_eq!(bullet.checked_char.as_deref(), Some("☑"));
        assert!(!unchecked_shape.checked);
        assert!(checked_shape.checked);
    }

    #[test]
    fn decode_user_sample_checkable_bullet_nested_preserves_depth() {
        let decoded = decode_fixture("user_samples/lists/sample-checkable-bullet-nested.hwpx");
        let paragraphs = &decoded.document.sections()[0].paragraphs;

        let level1 = paragraphs
            .iter()
            .find(|paragraph| paragraph.text_content().contains("level 1 unchecked"))
            .expect("fixture should contain level 1 item");
        let level2 = paragraphs
            .iter()
            .find(|paragraph| paragraph.text_content().contains("level 2 checked"))
            .expect("fixture should contain level 2 item");
        let level3 = paragraphs
            .iter()
            .find(|paragraph| paragraph.text_content().contains("level 3 unchecked"))
            .expect("fixture should contain level 3 item");

        assert_eq!(decoded.style_store.para_shape(level1.para_shape_id).unwrap().heading_level, 0);
        assert_eq!(decoded.style_store.para_shape(level2.para_shape_id).unwrap().heading_level, 1);
        assert_eq!(decoded.style_store.para_shape(level3.para_shape_id).unwrap().heading_level, 2);
    }
}