use crate::container::Container;
use crate::doc_info;
use crate::error::{Error, Location, WarningCode, WarningCollector};
use crate::model::{HwpDocument, ParagraphDetail, Section};
use crate::record::{
Record, RecordIter, HWPTAG_CTRL_HEADER, HWPTAG_EQEDIT, HWPTAG_LIST_HEADER,
HWPTAG_PARA_CHAR_SHAPE, HWPTAG_PARA_HEADER, HWPTAG_PARA_TEXT, HWPTAG_SHAPE_COMPONENT_PICTURE,
HWPTAG_TABLE,
};
use crate::structure::{classify, make_preview, NodeKind, StructureNode};
use crate::summary;
use crate::table::{parse_cell_list_header, parse_table_payload, Cell, Table, TableProps};
use crate::text::decode_para_text;
use std::path::Path;
pub fn read_document(path: &Path) -> Result<HwpDocument, Error> {
let mut container = Container::open(path)?;
let v = container.header.version;
let version = format!("{}.{}.{}.{}", v.major, v.minor, v.build, v.revision);
let mut warnings = WarningCollector::default();
let metadata = summary::read(&mut container).unwrap_or_default();
let properties = doc_info::read(&mut container).unwrap_or_default();
let shapes = doc_info::read_shape_tables(&mut container).unwrap_or_default();
let assets = doc_info::read_asset_catalog(&mut container).unwrap_or_default();
let mut sections = Vec::new();
let names = container.section_names();
for (section_index, name) in names.iter().enumerate() {
let bytes = container.read_section(name)?;
sections.push(read_section(section_index, &bytes, &mut warnings));
}
Ok(HwpDocument {
version,
metadata,
properties,
shapes,
sections,
assets,
warnings: warnings.take(),
})
}
/// Accumulates the pieces of one paragraph while its records are being read.
///
/// A builder is created for each `HWPTAG_PARA_HEADER` and turned into the
/// final paragraph text, structure node and detail by `finalize_para`.
struct ParaBuilder {
    /// Decoded text appended from the paragraph's `HWPTAG_PARA_TEXT` records.
    text: String,
    /// Four-byte id of the first control header seen in the paragraph, if any.
    ctrl_id: Option<[u8; 4]>,
    /// Node kind; starts as `NodeKind::Paragraph` and is replaced by
    /// `classify(ctrl_id)` when the first control header is seen.
    kind: NodeKind,
    /// Value read from bytes 8..10 of the paragraph header payload (0 if the
    /// payload is shorter than 10 bytes).
    para_shape_id: u32,
    /// Pairs of little-endian `u32` values read from `HWPTAG_PARA_CHAR_SHAPE`,
    /// stored as `(pos, cid)`.
    runs: Vec<(u32, u32)>,
    /// Bodies of the footnote/endnote controls found in the paragraph.
    footnotes_collected: Vec<crate::model::FootnoteBody>,
    /// Script text decoded from an `HWPTAG_EQEDIT` record, if one was found.
    equation_script: Option<String>,
    /// One entry per `gso ` control found in the paragraph.
    image_refs: Vec<crate::model::ImageRef>,
}
impl ParaBuilder {
fn new() -> Self {
Self {
text: String::new(),
ctrl_id: None,
kind: NodeKind::Paragraph,
para_shape_id: 0,
runs: Vec::new(),
footnotes_collected: Vec::new(),
equation_script: None,
image_refs: Vec::new(),
}
}
}
fn finalize_para(
b: ParaBuilder,
para_index: usize,
section_index: usize,
) -> (String, StructureNode, ParagraphDetail) {
let node = StructureNode {
id: format!("{}:{}", section_index, para_index),
kind: b.kind,
preview: make_preview(&b.text),
ctrl_id: b.ctrl_id.map(|c| String::from_utf8_lossy(&c).into_owned()),
};
let detail = ParagraphDetail {
text: b.text.clone(),
para_shape_id: b.para_shape_id,
runs: b.runs,
footnotes: b.footnotes_collected,
equation: b.equation_script,
image_refs: b.image_refs,
};
(b.text, node, detail)
}
fn read_section(index: usize, bytes: &[u8], warnings: &mut WarningCollector) -> Section {
let mut paragraphs: Vec<String> = Vec::new();
let mut paragraph_details: Vec<ParagraphDetail> = Vec::new();
let mut structure: Vec<StructureNode> = Vec::new();
let mut tables: Vec<Table> = Vec::new();
let mut current: Option<ParaBuilder> = None;
let mut iter = RecordIter::new(bytes);
while let Some(rec_res) = iter.next() {
let rec = match rec_res {
Ok(r) => r,
Err(e) => {
warnings.push(
WarningCode::TruncatedRecord,
e.to_string(),
Some(Location {
section: Some(index),
..Default::default()
}),
);
break;
}
};
let Record { header, payload } = rec;
match header.tag_id {
HWPTAG_PARA_HEADER => {
if let Some(b) = current.take() {
let pi = paragraphs.len();
let (t, n, d) = finalize_para(b, pi, index);
paragraphs.push(t);
structure.push(n);
paragraph_details.push(d);
}
let mut pb = ParaBuilder::new();
if payload.len() >= 10 {
pb.para_shape_id =
u16::from_le_bytes(payload[8..10].try_into().unwrap()) as u32;
}
current = Some(pb);
}
HWPTAG_PARA_CHAR_SHAPE => {
if let Some(b) = current.as_mut() {
let n = payload.len() / 8;
for i in 0..n {
let off = i * 8;
let pos = u32::from_le_bytes(payload[off..off + 4].try_into().unwrap());
let cid = u32::from_le_bytes(payload[off + 4..off + 8].try_into().unwrap());
b.runs.push((pos, cid));
}
}
}
HWPTAG_PARA_TEXT => {
let text = decode_para_text(payload);
match current.as_mut() {
Some(b) => b.text.push_str(&text),
None => {
let pi = paragraphs.len();
structure.push(StructureNode {
id: format!("{}:{}", index, pi),
kind: NodeKind::Paragraph,
preview: make_preview(&text),
ctrl_id: None,
});
paragraphs.push(text);
}
}
}
HWPTAG_CTRL_HEADER => {
if payload.len() >= 4 {
let word = u32::from_le_bytes(payload[..4].try_into().unwrap());
let ascii = word.to_be_bytes();
if let Some(b) = current.as_mut() {
if b.ctrl_id.is_none() {
b.ctrl_id = Some(ascii);
b.kind = classify(ascii);
}
}
match &ascii {
b"tbl " => {
match consume_table(index, paragraphs.len(), &mut iter, warnings) {
Some(table) => tables.push(table),
None => {
warnings.push(
WarningCode::UnsupportedContent,
"table parse aborted".to_string(),
Some(Location {
section: Some(index),
paragraph: Some(paragraphs.len()),
..Default::default()
}),
);
}
}
}
b"%fn " | b"%en " => {
let kind = if &ascii == b"%fn " {
"footnote"
} else {
"endnote"
};
let paragraphs_collected =
consume_list_body(&mut iter, warnings, index);
if let Some(b) = current.as_mut() {
b.footnotes_collected.push(crate::model::FootnoteBody {
kind: kind.into(),
text: paragraphs_collected.join("\n"),
});
}
}
b"eqed" => {
let saved = iter.offset();
if let Some(Ok(rec)) = iter.next() {
if rec.header.tag_id == HWPTAG_EQEDIT && rec.payload.len() >= 6 {
let p = rec.payload;
let script_len =
u16::from_le_bytes(p[4..6].try_into().unwrap()) as usize;
let nbytes = script_len * 2;
if 6 + nbytes <= p.len() {
let mut units = Vec::with_capacity(script_len);
for i in 0..script_len {
units.push(u16::from_le_bytes(
p[6 + i * 2..8 + i * 2].try_into().unwrap(),
));
}
let s = String::from_utf16_lossy(&units);
if let Some(b) = current.as_mut() {
b.equation_script = Some(s);
}
}
} else {
iter.set_offset(saved);
}
}
}
b"gso " => {
let mut bin_id: Option<u16> = None;
loop {
let saved = iter.offset();
let Some(rec_res) = iter.next() else { break };
let rec = match rec_res {
Ok(r) => r,
Err(_) => break,
};
if rec.header.tag_id == HWPTAG_PARA_HEADER
|| rec.header.tag_id == HWPTAG_CTRL_HEADER
|| rec.header.tag_id == HWPTAG_PARA_TEXT
|| rec.header.tag_id == HWPTAG_PARA_CHAR_SHAPE
{
iter.set_offset(saved);
break;
}
if rec.header.tag_id == HWPTAG_SHAPE_COMPONENT_PICTURE
&& rec.payload.len() >= 2
&& bin_id.is_none()
{
bin_id = Some(u16::from_le_bytes(
rec.payload[0..2].try_into().unwrap(),
));
}
}
if let Some(b) = current.as_mut() {
b.image_refs.push(crate::model::ImageRef {
bin_id: bin_id.unwrap_or(0),
});
}
}
_ => {}
}
}
}
other => {
warnings.push(
WarningCode::UnknownRecordTag,
format!(
"tag=0x{:03X} level={} size={}",
other, header.level, header.size
),
Some(Location {
section: Some(index),
paragraph: Some(paragraphs.len()),
..Default::default()
}),
);
}
}
}
if let Some(b) = current.take() {
let pi = paragraphs.len();
let (t, n, d) = finalize_para(b, pi, index);
paragraphs.push(t);
structure.push(n);
paragraph_details.push(d);
}
Section {
index,
paragraphs,
paragraph_details,
structure,
tables,
}
}
/// Reads the records of a table that follow a `tbl ` control header.
///
/// Expects an `HWPTAG_TABLE` record next, then collects one cell per
/// `HWPTAG_LIST_HEADER` record (together with that cell's paragraphs) into a
/// `rows x cols` grid. Grid slots for which no cell record was read stay
/// `None`; cells whose coordinates fall outside the grid are read but dropped.
///
/// Cell collection stops when any of these happens first:
/// * `rows * cols` cells were read,
/// * a record at a shallower level than the `HWPTAG_TABLE` record appears
///   (it belongs to the enclosing content and is left unread), or
/// * the stream ends.
///
/// The last two cases return the partially filled table instead of failing,
/// because a table with merged cells has fewer cell records than grid slots.
///
/// Returns `None`, after pushing a warning, when the next record is not a
/// table record (that record is left unread for the caller), when the table
/// or a cell header payload cannot be parsed, or when a record fails to
/// decode. Returns `None` without a warning if the stream ends before the
/// table record.
fn consume_table(
    section_index: usize,
    para_index: usize,
    iter: &mut RecordIter<'_>,
    warnings: &mut WarningCollector,
) -> Option<Table> {
    // Location attached to every table-specific warning below.
    let here = || {
        Some(Location {
            section: Some(section_index),
            paragraph: Some(para_index),
            ..Default::default()
        })
    };

    let table_start = iter.offset();
    let tbl_rec = next_record(iter, warnings, section_index)?;
    if tbl_rec.header.tag_id != HWPTAG_TABLE {
        warnings.push(
            WarningCode::UnsupportedContent,
            format!("expected HWPTAG_TABLE, got 0x{:03X}", tbl_rec.header.tag_id),
            here(),
        );
        // The record is not ours: rewind so the caller can still process it
        // instead of silently losing it.
        iter.set_offset(table_start);
        return None;
    }

    let table_level = tbl_rec.header.level;
    let TableProps { rows, cols } = match parse_table_payload(tbl_rec.payload) {
        Ok(tp) => tp,
        Err(e) => {
            warnings.push(WarningCode::UnsupportedContent, e.to_string(), here());
            return None;
        }
    };

    let mut cells_grid: Vec<Vec<Option<Cell>>> = (0..rows)
        .map(|_| (0..cols).map(|_| None).collect())
        .collect();
    let expected_cells = rows as usize * cols as usize;
    let mut seen_cells = 0usize;

    while seen_cells < expected_cells {
        let before = iter.offset();
        let rec = match iter.next() {
            // End of stream: keep the cells collected so far.
            None => break,
            Some(Ok(r)) => r,
            Some(Err(e)) => {
                warnings.push(
                    WarningCode::TruncatedRecord,
                    e.to_string(),
                    Some(Location {
                        section: Some(section_index),
                        ..Default::default()
                    }),
                );
                return None;
            }
        };

        // A shallower record is outside the table; leave it for the caller.
        if rec.header.level < table_level {
            iter.set_offset(before);
            break;
        }
        // Anything other than a cell header between cells is skipped.
        if rec.header.tag_id != HWPTAG_LIST_HEADER {
            continue;
        }

        let ch = match parse_cell_list_header(rec.payload) {
            Ok(c) => c,
            Err(e) => {
                warnings.push(WarningCode::UnsupportedContent, e.to_string(), here());
                return None;
            }
        };
        let para_count = ch.para_count.max(0) as usize;
        let (paragraphs, text) =
            consume_cell_paragraphs(iter, para_count, warnings, section_index);

        // Checked lookup: out-of-range coordinates simply find no slot.
        if let Some(slot) = cells_grid
            .get_mut(ch.row as usize)
            .and_then(|row| row.get_mut(ch.col as usize))
        {
            *slot = Some(Cell {
                col: ch.col,
                row: ch.row,
                col_span: ch.col_span,
                row_span: ch.row_span,
                text,
                paragraphs,
            });
        }
        seen_cells += 1;
    }

    Some(Table {
        id: format!("{}:{}", section_index, para_index),
        rows,
        cols,
        caption: None,
        cells: cells_grid,
    })
}
/// Pulls the next record from `iter`.
///
/// Returns `None` both at the end of the stream and when the record fails to
/// decode; in the latter case a `TruncatedRecord` warning located at
/// `section_index` is pushed first.
fn next_record<'a>(
    iter: &mut RecordIter<'a>,
    warnings: &mut WarningCollector,
    section_index: usize,
) -> Option<Record<'a>> {
    let failure = match iter.next() {
        None => return None,
        Some(Ok(record)) => return Some(record),
        Some(Err(err)) => err,
    };

    let location = Location {
        section: Some(section_index),
        ..Default::default()
    };
    warnings.push(
        WarningCode::TruncatedRecord,
        failure.to_string(),
        Some(location),
    );
    None
}
/// Collects the paragraph texts of a list body (used for footnote and
/// endnote controls).
///
/// The next record must be an `HWPTAG_LIST_HEADER`; otherwise the iterator is
/// rewound to where it started and an empty vector is returned. The paragraph
/// count is the non-negative little-endian `i16` in the first two payload
/// bytes (0 if the payload is shorter).
///
/// Reading stops, leaving the offending record unread, at a record whose
/// level is below the list header's level, at another list header, or at a
/// paragraph header beyond the declared count. It also stops at the end of
/// the stream or on a decode error. Decode errors are not reported here; the
/// warning collector and section index parameters are currently unused.
fn consume_list_body(
    iter: &mut RecordIter<'_>,
    _warnings: &mut WarningCollector,
    _section_index: usize,
) -> Vec<String> {
    let start = iter.offset();
    let list_header = match iter.next() {
        Some(Ok(rec)) if rec.header.tag_id == HWPTAG_LIST_HEADER => rec,
        // End of stream, decode error or a different tag: undo the read.
        _ => {
            iter.set_offset(start);
            return Vec::new();
        }
    };

    let mut para_count = 0usize;
    if list_header.payload.len() >= 2 {
        let raw = i16::from_le_bytes([list_header.payload[0], list_header.payload[1]]);
        para_count = raw.max(0) as usize;
    }
    let floor_level = list_header.header.level;

    let mut texts: Vec<String> = Vec::with_capacity(para_count);
    let mut open: Option<String> = None;
    let mut headers_seen = 0usize;

    // Keep reading while paragraphs are still expected or one is still open.
    while headers_seen < para_count || open.is_some() {
        let mark = iter.offset();
        let Some(Ok(rec)) = iter.next() else { break };

        // Shallower than the list header: the body is over.
        if rec.header.level < floor_level {
            iter.set_offset(mark);
            break;
        }

        match rec.header.tag_id {
            HWPTAG_PARA_HEADER => {
                // Close the open paragraph (if any) before deciding whether
                // this header still belongs to the body.
                texts.extend(open.take());
                if headers_seen >= para_count {
                    iter.set_offset(mark);
                    break;
                }
                open = Some(String::new());
                headers_seen += 1;
            }
            HWPTAG_PARA_TEXT => {
                let decoded = decode_para_text(rec.payload);
                match open.as_mut() {
                    Some(buf) => buf.push_str(&decoded),
                    None => texts.push(decoded),
                }
            }
            HWPTAG_LIST_HEADER => {
                iter.set_offset(mark);
                break;
            }
            _ => {}
        }
    }

    // Whatever paragraph is still open when the loop ends is kept.
    texts.extend(open.take());
    texts
}
/// Collects the text of up to `para_count` paragraphs belonging to one table
/// cell.
///
/// Returns the individual paragraph texts and the same texts joined with
/// `"\n"`.
///
/// Reading stops, leaving the offending record unread, at:
/// * a paragraph header beyond `para_count`,
/// * an `HWPTAG_LIST_HEADER` (the next cell), or
/// * a record at a shallower level than the cell's first paragraph header.
///   Such a record belongs to the content enclosing the table and must not
///   be absorbed into the cell.
///
/// It also stops at the end of the stream, and on a decode error after
/// pushing a `TruncatedRecord` warning. Text records that arrive before any
/// paragraph header are kept as paragraphs of their own.
fn consume_cell_paragraphs(
    iter: &mut RecordIter<'_>,
    para_count: usize,
    warnings: &mut WarningCollector,
    section_index: usize,
) -> (Vec<String>, String) {
    let mut collected: Vec<String> = Vec::with_capacity(para_count);
    let mut current: Option<String> = None;
    let mut seen_headers = 0usize;
    // Level of the first paragraph header of this cell, once seen.
    let mut cell_level = None;

    // Keep reading while paragraphs are still expected or one is still open.
    while seen_headers < para_count || current.is_some() {
        let saved = iter.offset();
        let rec = match iter.next() {
            None => break,
            Some(Ok(r)) => r,
            Some(Err(e)) => {
                warnings.push(
                    WarningCode::TruncatedRecord,
                    e.to_string(),
                    Some(Location {
                        section: Some(section_index),
                        ..Default::default()
                    }),
                );
                break;
            }
        };

        // Same guard as the list-body reader: a shallower record ends the
        // cell and is handed back to the caller.
        if matches!(cell_level, Some(level) if rec.header.level < level) {
            iter.set_offset(saved);
            break;
        }

        match rec.header.tag_id {
            HWPTAG_PARA_HEADER => {
                if seen_headers >= para_count {
                    iter.set_offset(saved);
                    break;
                }
                collected.extend(current.take());
                current = Some(String::new());
                seen_headers += 1;
                if cell_level.is_none() {
                    cell_level = Some(rec.header.level);
                }
            }
            HWPTAG_PARA_TEXT => {
                let t = decode_para_text(rec.payload);
                match current.as_mut() {
                    Some(buf) => buf.push_str(&t),
                    None => collected.push(t),
                }
            }
            HWPTAG_LIST_HEADER => {
                iter.set_offset(saved);
                break;
            }
            _ => {}
        }
    }

    // Whatever paragraph is still open when the loop ends is kept.
    collected.extend(current.take());
    let text = collected.join("\n");
    (collected, text)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes a record header word: tag in bits 0..10, level in bits 10..20
    /// and size in bits 20..32. Sizes of 0xFFF and above store 0xFFF in the
    /// word and append the real size as an extra little-endian `u32`.
    fn encode_header(tag: u16, level: u16, size: u32) -> Vec<u8> {
        let word = (tag as u32 & 0x3FF) | ((level as u32 & 0x3FF) << 10) | (size.min(0xFFF) << 20);
        let mut v = word.to_le_bytes().to_vec();
        if size >= 0xFFF {
            v.extend_from_slice(&size.to_le_bytes());
        }
        v
    }

    /// Encodes `s` as little-endian UTF-16 bytes.
    fn le_utf16(s: &str) -> Vec<u8> {
        s.encode_utf16().flat_map(|u| u.to_le_bytes()).collect()
    }

    /// Builds a table-record payload for a `rows x cols` table.
    ///
    /// NOTE(review): the field layout is whatever `parse_table_payload`
    /// expects; only the row and column counts are meaningful to these tests.
    fn build_table_props_payload(rows: u16, cols: u16) -> Vec<u8> {
        let mut p = Vec::new();
        p.extend_from_slice(&0u32.to_le_bytes());
        p.extend_from_slice(&rows.to_le_bytes());
        p.extend_from_slice(&cols.to_le_bytes());
        p.extend_from_slice(&0u16.to_le_bytes());
        p.extend_from_slice(&[0u8; 8]);
        // One u16 per row.
        for _ in 0..rows {
            p.extend_from_slice(&1000u16.to_le_bytes());
        }
        p.extend_from_slice(&0u16.to_le_bytes());
        p.extend_from_slice(&0u16.to_le_bytes());
        p
    }

    /// Builds a cell list-header payload with the given paragraph count and
    /// cell coordinates.
    ///
    /// NOTE(review): the layout is whatever `parse_cell_list_header` expects;
    /// the two `1u16` values after `row` are presumably the column and row
    /// spans. Confirm against the parser.
    fn build_cell_list_header_payload(para_count: i16, col: u16, row: u16) -> Vec<u8> {
        let mut p = Vec::new();
        p.extend_from_slice(&para_count.to_le_bytes());
        p.extend_from_slice(&[0u8; 4]);
        p.extend_from_slice(&[0u8; 2]);
        p.extend_from_slice(&col.to_le_bytes());
        p.extend_from_slice(&row.to_le_bytes());
        p.extend_from_slice(&1u16.to_le_bytes());
        p.extend_from_slice(&1u16.to_le_bytes());
        p.extend_from_slice(&[0u8; 18]);
        p
    }

    #[test]
    fn classifies_paragraph_with_table_ctrl() {
        let mut buf = Vec::new();
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, 0));
        let t = le_utf16("before");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, t.len() as u32));
        buf.extend(&t);
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, 0));
        buf.extend(encode_header(HWPTAG_CTRL_HEADER, 1, 4));
        // "tbl " with its bytes reversed, as the control id is stored.
        buf.extend_from_slice(&[0x20, 0x6C, 0x62, 0x74]);
        let t2 = le_utf16("caption");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, t2.len() as u32));
        buf.extend(&t2);
        let mut w = WarningCollector::default();
        let s = read_section(0, &buf, &mut w);
        assert_eq!(s.paragraphs.len(), 2);
        assert_eq!(s.structure.len(), 2);
        assert_eq!(s.structure[0].kind, NodeKind::Paragraph);
        assert_eq!(s.structure[1].kind, NodeKind::Table);
        assert_eq!(s.structure[1].ctrl_id.as_deref(), Some("tbl "));
        assert_eq!(s.structure[1].id, "0:1");
    }

    #[test]
    fn structure_ids_are_stable() {
        let mut buf = Vec::new();
        for _ in 0..3 {
            buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, 0));
            let t = le_utf16("x");
            buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, t.len() as u32));
            buf.extend(&t);
        }
        let mut w = WarningCollector::default();
        let s = read_section(2, &buf, &mut w);
        assert_eq!(
            s.structure
                .iter()
                .map(|n| n.id.as_str())
                .collect::<Vec<_>>(),
            ["2:0", "2:1", "2:2"]
        );
    }

    #[test]
    fn captures_para_shape_id_and_char_shape_runs() {
        let mut buf = Vec::new();
        // The paragraph shape id is read from bytes 8..10 of the header payload.
        let mut ph = vec![0u8; 22];
        ph[8..10].copy_from_slice(&7u16.to_le_bytes());
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, ph.len() as u32));
        buf.extend(&ph);
        let t = le_utf16("hello");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, t.len() as u32));
        buf.extend(&t);
        // Two runs: (position 0, id 3) and (position 2, id 5).
        let mut pcs = Vec::new();
        pcs.extend_from_slice(&0u32.to_le_bytes());
        pcs.extend_from_slice(&3u32.to_le_bytes());
        pcs.extend_from_slice(&2u32.to_le_bytes());
        pcs.extend_from_slice(&5u32.to_le_bytes());
        buf.extend(encode_header(HWPTAG_PARA_CHAR_SHAPE, 1, pcs.len() as u32));
        buf.extend(&pcs);
        let mut w = WarningCollector::default();
        let s = read_section(0, &buf, &mut w);
        assert_eq!(s.paragraph_details.len(), 1);
        assert_eq!(s.paragraph_details[0].para_shape_id, 7);
        assert_eq!(s.paragraph_details[0].runs, vec![(0, 3), (2, 5)]);
        assert_eq!(s.paragraph_details[0].text, "hello");
    }

    #[test]
    fn captures_footnote_body() {
        let mut buf = Vec::new();
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, 0));
        buf.extend(encode_header(HWPTAG_CTRL_HEADER, 1, 4));
        // "%fn " with its bytes reversed.
        buf.extend_from_slice(&[0x20, 0x6E, 0x66, 0x25]);
        // List header declaring one paragraph.
        let mut lh = Vec::new();
        lh.extend_from_slice(&1i16.to_le_bytes());
        lh.extend_from_slice(&[0u8; 30]);
        buf.extend(encode_header(HWPTAG_LIST_HEADER, 2, lh.len() as u32));
        buf.extend(&lh);
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 2, 0));
        let t_inner = le_utf16("nota bene");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 2, t_inner.len() as u32));
        buf.extend(&t_inner);
        // Level 1 is shallower than the list header, so this text belongs to
        // the outer paragraph.
        let t_outer = le_utf16("see below");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, t_outer.len() as u32));
        buf.extend(&t_outer);
        let mut w = WarningCollector::default();
        let s = read_section(0, &buf, &mut w);
        assert_eq!(s.paragraphs.len(), 1);
        assert_eq!(s.paragraphs[0], "see below");
        assert_eq!(s.paragraph_details[0].footnotes.len(), 1);
        assert_eq!(s.paragraph_details[0].footnotes[0].kind, "footnote");
        assert_eq!(s.paragraph_details[0].footnotes[0].text, "nota bene");
    }

    #[test]
    fn captures_equation_script() {
        let mut buf = Vec::new();
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, 0));
        buf.extend(encode_header(HWPTAG_CTRL_HEADER, 1, 4));
        // "eqed" with its bytes reversed.
        buf.extend_from_slice(b"deqe");
        // Payload: 4 leading bytes, u16 script length in code units, script.
        let script: Vec<u16> = "x=1".encode_utf16().collect();
        let mut pl = vec![0u8; 4];
        pl.extend_from_slice(&(script.len() as u16).to_le_bytes());
        for u in &script {
            pl.extend_from_slice(&u.to_le_bytes());
        }
        buf.extend(encode_header(HWPTAG_EQEDIT, 2, pl.len() as u32));
        buf.extend(&pl);
        let t = le_utf16("see eqn");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, t.len() as u32));
        buf.extend(&t);
        let mut w = WarningCollector::default();
        let s = read_section(0, &buf, &mut w);
        assert_eq!(s.paragraph_details.len(), 1);
        assert_eq!(s.paragraph_details[0].equation.as_deref(), Some("x=1"));
        assert_eq!(s.paragraphs[0], "see eqn");
    }

    #[test]
    fn captures_image_bin_id() {
        let mut buf = Vec::new();
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, 0));
        buf.extend(encode_header(HWPTAG_CTRL_HEADER, 1, 4));
        // "gso " with its bytes reversed.
        buf.extend_from_slice(&[0x20, 0x6F, 0x73, 0x67]);
        // The id is read from the first two bytes of the picture payload.
        let mut pic = vec![0u8; 20];
        pic[0..2].copy_from_slice(&7u16.to_le_bytes());
        buf.extend(encode_header(
            HWPTAG_SHAPE_COMPONENT_PICTURE,
            2,
            pic.len() as u32,
        ));
        buf.extend(&pic);
        let t = le_utf16("image below");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 1, t.len() as u32));
        buf.extend(&t);
        let mut w = WarningCollector::default();
        let s = read_section(0, &buf, &mut w);
        assert_eq!(s.paragraph_details.len(), 1);
        assert_eq!(s.paragraph_details[0].image_refs.len(), 1);
        assert_eq!(s.paragraph_details[0].image_refs[0].bin_id, 7);
        assert_eq!(s.paragraphs[0], "image below");
    }

    #[test]
    fn assembles_single_cell_table() {
        let mut buf = Vec::new();
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 0, 0));
        buf.extend(encode_header(HWPTAG_CTRL_HEADER, 1, 4));
        // "tbl " with its bytes reversed.
        buf.extend_from_slice(&[0x20, 0x6C, 0x62, 0x74]);
        let tbl = build_table_props_payload(1, 1);
        buf.extend(encode_header(HWPTAG_TABLE, 2, tbl.len() as u32));
        buf.extend(&tbl);
        // One cell at (col 0, row 0) holding one paragraph.
        let ch = build_cell_list_header_payload(1, 0, 0);
        buf.extend(encode_header(HWPTAG_LIST_HEADER, 2, ch.len() as u32));
        buf.extend(&ch);
        buf.extend(encode_header(HWPTAG_PARA_HEADER, 2, 0));
        let t = le_utf16("cell A");
        buf.extend(encode_header(HWPTAG_PARA_TEXT, 2, t.len() as u32));
        buf.extend(&t);
        let mut w = WarningCollector::default();
        let s = read_section(0, &buf, &mut w);
        assert_eq!(s.tables.len(), 1);
        let tbl = &s.tables[0];
        assert_eq!((tbl.rows, tbl.cols), (1, 1));
        let cell = tbl.cells[0][0].as_ref().unwrap();
        assert_eq!(cell.text, "cell A");
    }
}