use crate::error::Error;
use crate::error::Warning;
use crate::model::{FootnoteBody, ImageRef, ParagraphDetail, Section};
use crate::structure::{make_preview, NodeKind, StructureNode};
use crate::table::{Cell, Table};
use quick_xml::events::{BytesEnd, BytesStart, Event};
use quick_xml::Reader;
/// Value returned by `parse`: the assembled section together with a warning list.
pub struct SectionOut {
/// Section assembled from the parsed XML (paragraphs, details, structure, tables).
pub section: Section,
/// Warnings returned next to the section. NOTE(review): the visible `parse`
/// never pushes to this list, so it is always empty there.
pub warnings: Vec<Warning>,
}
/// Mutable state collected for one `p` element while it is open.
///
/// A builder is pushed when a `p` start tag is seen and popped at the
/// matching end tag.
#[derive(Default)]
struct ParaBuilder {
/// Value of the `paraPrIDRef` attribute (0 when missing or not a number).
para_shape_id: u32,
/// Text accumulated from text events received while a `t` element is open.
text: String,
/// One `(start, char_id)` entry per `run` start tag: `start` is the number
/// of chars already in `text`, `char_id` is the run's `charPrIDRef`.
runs: Vec<(u32, u32)>,
/// `charPrIDRef` of the currently open run, `None` outside a run.
/// It is written on run start/end but not read in the visible code.
current_run_char_id: Option<u32>,
/// Set when a `tbl` start tag is seen while this paragraph is the innermost one.
contains_table: bool,
/// True between a `t` start tag and its end tag; gates text capture.
in_t_element: bool,
/// Footnote/endnote bodies attached when a `footNote`/`endNote` element closes.
footnotes: Vec<FootnoteBody>,
/// Equation script, taken from an `equation` element's `script` attribute
/// or from the text of a `script` element.
equation: Option<String>,
/// Image references collected from `pic` elements.
image_refs: Vec<ImageRef>,
}
/// Mutable state collected for one `tbl` element while it is open.
struct TableBuilder {
/// Row count from the `rowCnt` attribute (0 when missing), cast to `u16`.
rows: u16,
/// Column count from the `colCnt` attribute (0 when missing), cast to `u16`.
cols: u16,
/// `rows` x `cols` grid indexed as `cells[row][col]`; every slot starts as `None`.
cells: Vec<Vec<Option<Cell>>>,
/// Column address of the current cell, from `cellAddr`'s `colAddr`.
cur_cell_col: u16,
/// Row address of the current cell, from `cellAddr`'s `rowAddr`.
cur_cell_row: u16,
/// Column span of the current cell, from `cellSpan`'s `colSpan` (reset to 1 at each `tc`).
cur_cell_col_span: u16,
/// Row span of the current cell, from `cellSpan`'s `rowSpan` (reset to 1 at each `tc`).
cur_cell_row_span: u16,
/// Text of the cell-context paragraphs closed so far inside the current cell.
cur_cell_paragraphs: Vec<String>,
}
/// Where a paragraph was opened; decides what happens to it when it closes.
///
/// The context is chosen at the `p` start tag: `Note` if any note is open,
/// otherwise `Cell` if any table is open, otherwise `TopLevel`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ParaContext {
/// Opened with no table and no note open; becomes a section paragraph.
TopLevel,
/// Opened while a table (and no note) is open; its text goes to the current cell.
Cell,
/// Opened while a footnote/endnote is open; its text goes to the note frame.
Note,
}
/// State collected for one open `footNote` / `endNote` element.
struct NoteFrame {
/// `"footnote"` or `"endnote"`, depending on the opening tag.
kind: String,
/// Text of each note-context paragraph closed inside this note, in order.
paragraphs: Vec<String>,
}
pub fn parse(bytes: &[u8], section_index: usize) -> Result<SectionOut, Error> {
let mut reader = Reader::from_reader(bytes);
reader.config_mut().trim_text(false);
let mut paragraphs: Vec<String> = Vec::new();
let mut paragraph_details: Vec<ParagraphDetail> = Vec::new();
let mut structure: Vec<StructureNode> = Vec::new();
let mut tables: Vec<Table> = Vec::new();
let warnings: Vec<Warning> = Vec::new();
let mut para_stack: Vec<ParaBuilder> = Vec::new();
let mut table_stack: Vec<TableBuilder> = Vec::new();
let mut context_stack: Vec<ParaContext> = Vec::new();
let mut note_stack: Vec<NoteFrame> = Vec::new();
let mut in_equation_script: bool = false;
let mut cur_equation_script: String = String::new();
loop {
let ev = reader
.read_event()
.map_err(|e| Error::Container(format!("section xml: {e}")))?;
match ev {
Event::Start(e) => handle_start(
&e,
&mut para_stack,
&mut context_stack,
&mut table_stack,
&mut note_stack,
&mut in_equation_script,
&mut cur_equation_script,
),
Event::Empty(e) => handle_empty(&e, &mut para_stack, &mut table_stack),
Event::Text(t) => {
let s = t
.unescape()
.map_err(|e| Error::Container(format!("text: {e}")))?
.into_owned();
if in_equation_script {
cur_equation_script.push_str(&s);
} else if let Some(pb) = para_stack.last_mut() {
if pb.in_t_element {
pb.text.push_str(&s);
}
}
}
Event::End(e) => handle_end(
&e,
&mut para_stack,
&mut context_stack,
&mut table_stack,
&mut note_stack,
&mut in_equation_script,
&mut cur_equation_script,
&mut paragraphs,
&mut paragraph_details,
&mut structure,
&mut tables,
section_index,
),
Event::Eof => break,
_ => {}
}
}
Ok(SectionOut {
section: Section {
index: section_index,
paragraphs,
paragraph_details,
structure,
tables,
},
warnings,
})
}
impl ParaBuilder {
    /// Creates an empty builder for a paragraph with the given shape id.
    ///
    /// Every other field starts at its `Default` value: empty text and
    /// lists, no current run, no equation, and both flags `false`.
    fn new(para_shape_id: u32) -> Self {
        Self {
            para_shape_id,
            ..Self::default()
        }
    }
}
/// Returns the innermost open paragraph whose context is `TopLevel`.
///
/// `para_stack` and `context_stack` are read as parallel stacks; only the
/// indices present in both are considered. Yields `None` when no open
/// paragraph has the `TopLevel` context.
fn top_level_para<'a>(
    para_stack: &'a mut [ParaBuilder],
    context_stack: &[ParaContext],
) -> Option<&'a mut ParaBuilder> {
    let depth = para_stack.len().min(context_stack.len());
    // Search from the top of the stack so the innermost match wins.
    let idx = context_stack[..depth]
        .iter()
        .rposition(|ctx| *ctx == ParaContext::TopLevel)?;
    para_stack.get_mut(idx)
}
/// Updates parser state for an opening tag, dispatching on its local name.
///
/// - `p`: pushes a new paragraph builder and its context (`Note` if a note
///   is open, else `Cell` if a table is open, else `TopLevel`).
/// - `run`: records the run's start offset (in chars) and `charPrIDRef` on
///   the innermost paragraph.
/// - `t`: enables text capture on the innermost paragraph.
/// - `tbl`: pushes a table builder sized from `rowCnt` / `colCnt` and marks
///   the innermost paragraph as containing a table.
/// - `tc`: resets the per-cell state of the innermost table.
/// - `footNote` / `endNote`: pushes a note frame.
/// - `equation` / `pic`: stores the `script` attribute / image id on the
///   innermost top-level paragraph.
/// - `script`: switches text capture to the equation script buffer.
///
/// Any other tag is ignored. `_cur_equation_script` is accepted but unused.
// The handler needs every piece of parser state, hence the long parameter list.
#[allow(clippy::too_many_arguments)]
fn handle_start(
    e: &BytesStart,
    para_stack: &mut Vec<ParaBuilder>,
    context_stack: &mut Vec<ParaContext>,
    table_stack: &mut Vec<TableBuilder>,
    note_stack: &mut Vec<NoteFrame>,
    in_equation_script: &mut bool,
    _cur_equation_script: &mut String,
) {
    let qname = e.name();
    let local_name = qname.local_name();
    let tag: &[u8] = local_name.as_ref();
    match tag {
        b"p" => {
            // An open note takes precedence over an open table.
            let ctx = match (note_stack.is_empty(), table_stack.is_empty()) {
                (false, _) => ParaContext::Note,
                (true, false) => ParaContext::Cell,
                (true, true) => ParaContext::TopLevel,
            };
            let shape_id = get_attr_u32(e, b"paraPrIDRef").unwrap_or(0);
            para_stack.push(ParaBuilder::new(shape_id));
            context_stack.push(ctx);
        }
        b"run" => {
            if let Some(pb) = para_stack.last_mut() {
                let char_id = get_attr_u32(e, b"charPrIDRef").unwrap_or(0);
                // Offsets are counted in chars, not bytes.
                let offset = pb.text.chars().count() as u32;
                pb.current_run_char_id = Some(char_id);
                pb.runs.push((offset, char_id));
            }
        }
        b"t" => {
            if let Some(pb) = para_stack.last_mut() {
                pb.in_t_element = true;
            }
        }
        b"tbl" => {
            let rows = get_attr_u32(e, b"rowCnt").unwrap_or(0) as u16;
            let cols = get_attr_u32(e, b"colCnt").unwrap_or(0) as u16;
            if let Some(pb) = para_stack.last_mut() {
                pb.contains_table = true;
            }
            let grid = vec![vec![None; usize::from(cols)]; usize::from(rows)];
            table_stack.push(TableBuilder {
                rows,
                cols,
                cells: grid,
                cur_cell_col: 0,
                cur_cell_row: 0,
                cur_cell_col_span: 1,
                cur_cell_row_span: 1,
                cur_cell_paragraphs: Vec::new(),
            });
        }
        b"tc" => {
            if let Some(tb) = table_stack.last_mut() {
                tb.cur_cell_col_span = 1;
                tb.cur_cell_row_span = 1;
                tb.cur_cell_paragraphs.clear();
            }
        }
        b"footNote" | b"endNote" => {
            let kind = match tag {
                b"footNote" => "footnote",
                _ => "endnote",
            };
            note_stack.push(NoteFrame {
                kind: kind.into(),
                paragraphs: Vec::new(),
            });
        }
        b"equation" => {
            if let (Some(script), Some(pb)) = (
                get_attr(e, b"script"),
                top_level_para(para_stack, context_stack),
            ) {
                pb.equation = Some(script);
            }
        }
        b"script" => *in_equation_script = true,
        b"pic" => {
            if let Some(item_ref) = get_attr(e, b"binaryItemIDRef") {
                // An id without a usable number falls back to 0.
                let bin_id = digit_prefix(&item_ref).unwrap_or(0);
                if let Some(pb) = top_level_para(para_stack, context_stack) {
                    pb.image_refs.push(ImageRef { bin_id });
                }
            }
        }
        _ => {}
    }
}
/// Extracts the first run of ASCII digits in `s` and parses it as a `u16`.
///
/// Leading non-digit characters are skipped, so `"image3"` yields `Some(3)`.
/// Returns `None` when `s` contains no ASCII digit or when the digit run
/// does not fit in a `u16`.
fn digit_prefix(s: &str) -> Option<u16> {
    let start = s.find(|c: char| c.is_ascii_digit())?;
    let tail = &s[start..];
    let len = tail
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..len].parse().ok()
}
/// Updates parser state for a self-closing tag, dispatching on its local name.
///
/// - `cellAddr`: stores `colAddr` / `rowAddr` (default 0) on the innermost table.
/// - `cellSpan`: stores `colSpan` / `rowSpan` (default 1) on the innermost table.
/// - `pic`: pushes an image reference parsed from `binaryItemIDRef`
///   (0 when the id has no usable number) onto the innermost paragraph.
/// - `equation`: stores the `script` attribute on the innermost paragraph.
///
/// Any other tag is ignored.
///
/// NOTE(review): `handle_start` attaches `pic` / `equation` to the innermost
/// *top-level* paragraph, while this function uses the innermost paragraph.
/// `handle_end` keeps only the text of cell and note paragraphs, so a
/// self-closing `pic` / `equation` inside one of those appears to be
/// discarded. Confirm whether that is intended.
fn handle_empty(e: &BytesStart, para_stack: &mut [ParaBuilder], table_stack: &mut [TableBuilder]) {
    let qname = e.name();
    let local_name = qname.local_name();
    let tag: &[u8] = local_name.as_ref();
    match tag {
        b"cellAddr" => {
            if let Some(tb) = table_stack.last_mut() {
                let col = get_attr_u32(e, b"colAddr").unwrap_or(0);
                let row = get_attr_u32(e, b"rowAddr").unwrap_or(0);
                tb.cur_cell_col = col as u16;
                tb.cur_cell_row = row as u16;
            }
        }
        b"cellSpan" => {
            if let Some(tb) = table_stack.last_mut() {
                let col_span = get_attr_u32(e, b"colSpan").unwrap_or(1);
                let row_span = get_attr_u32(e, b"rowSpan").unwrap_or(1);
                tb.cur_cell_col_span = col_span as u16;
                tb.cur_cell_row_span = row_span as u16;
            }
        }
        b"pic" => {
            if let (Some(item_ref), Some(pb)) =
                (get_attr(e, b"binaryItemIDRef"), para_stack.last_mut())
            {
                let bin_id = digit_prefix(&item_ref).unwrap_or(0);
                pb.image_refs.push(ImageRef { bin_id });
            }
        }
        b"equation" => {
            if let (Some(script), Some(pb)) = (get_attr(e, b"script"), para_stack.last_mut()) {
                pb.equation = Some(script);
            }
        }
        _ => {}
    }
}
#[allow(clippy::too_many_arguments)]
fn handle_end(
e: &BytesEnd,
para_stack: &mut Vec<ParaBuilder>,
context_stack: &mut Vec<ParaContext>,
table_stack: &mut Vec<TableBuilder>,
note_stack: &mut Vec<NoteFrame>,
in_equation_script: &mut bool,
cur_equation_script: &mut String,
paragraphs: &mut Vec<String>,
paragraph_details: &mut Vec<ParagraphDetail>,
structure: &mut Vec<StructureNode>,
tables: &mut Vec<Table>,
section_index: usize,
) {
let local_owned = e.name().local_name().as_ref().to_vec();
let local = local_owned.as_slice();
match local {
b"t" => {
if let Some(pb) = para_stack.last_mut() {
pb.in_t_element = false;
}
}
b"run" => {
if let Some(pb) = para_stack.last_mut() {
pb.current_run_char_id = None;
}
}
b"script" => {
*in_equation_script = false;
if !cur_equation_script.is_empty() {
if let Some(pb) = top_level_para(para_stack, context_stack) {
if pb.equation.is_none() {
pb.equation = Some(std::mem::take(cur_equation_script));
} else {
cur_equation_script.clear();
}
} else {
cur_equation_script.clear();
}
}
}
b"footNote" | b"endNote" => {
if let Some(frame) = note_stack.pop() {
let text = frame.paragraphs.join("\n");
if let Some(pb) = top_level_para(para_stack, context_stack) {
pb.footnotes.push(FootnoteBody {
kind: frame.kind,
text,
});
}
}
}
b"p" => {
if let Some(pb) = para_stack.pop() {
let ctx = context_stack.pop().unwrap_or(ParaContext::TopLevel);
match ctx {
ParaContext::TopLevel => {
let text = pb.text.clone();
let kind = if pb.contains_table {
NodeKind::Table
} else {
NodeKind::Paragraph
};
let preview = make_preview(&text);
let id = format!("{}:{}", section_index, paragraphs.len());
paragraphs.push(text.clone());
paragraph_details.push(ParagraphDetail {
text,
para_shape_id: pb.para_shape_id,
runs: pb.runs,
footnotes: pb.footnotes,
equation: pb.equation,
image_refs: pb.image_refs,
});
structure.push(StructureNode {
id,
kind,
preview,
ctrl_id: None,
});
}
ParaContext::Cell => {
if let Some(tb) = table_stack.last_mut() {
tb.cur_cell_paragraphs.push(pb.text);
}
}
ParaContext::Note => {
if let Some(frame) = note_stack.last_mut() {
frame.paragraphs.push(pb.text);
}
}
}
}
}
b"tc" => {
if let Some(tb) = table_stack.last_mut() {
let row = tb.cur_cell_row as usize;
let col = tb.cur_cell_col as usize;
let paragraphs = std::mem::take(&mut tb.cur_cell_paragraphs);
let text = paragraphs.join("\n");
let cell = Cell {
col: tb.cur_cell_col,
row: tb.cur_cell_row,
col_span: tb.cur_cell_col_span,
row_span: tb.cur_cell_row_span,
text,
paragraphs,
};
if row < tb.cells.len() && col < tb.cells[row].len() {
tb.cells[row][col] = Some(cell);
}
}
}
b"tbl" => {
if let Some(tb) = table_stack.pop() {
let id = format!("{}:{}", section_index, paragraphs.len());
tables.push(Table {
id,
rows: tb.rows,
cols: tb.cols,
caption: None,
cells: tb.cells,
});
}
}
_ => {}
}
}
/// Looks up an attribute on `e` and returns its unescaped value.
///
/// An attribute matches when either its full key or its local name equals
/// `key`. Attributes that fail to parse are skipped, and a matching
/// attribute whose value cannot be unescaped is passed over in favour of a
/// later match. Returns `None` when nothing usable is found.
fn get_attr(e: &BytesStart, key: &[u8]) -> Option<String> {
    e.attributes()
        .flatten()
        .filter(|attr| attr.key.as_ref() == key || attr.key.local_name().as_ref() == key)
        .find_map(|attr| attr.unescape_value().ok().map(|value| value.into_owned()))
}
/// Looks up an attribute via `get_attr` and parses its value as a `u32`.
///
/// Returns `None` when the attribute is absent or its value is not a valid `u32`.
fn get_attr_u32(e: &BytesStart, key: &[u8]) -> Option<u32> {
    let raw = get_attr(e, key)?;
    raw.parse().ok()
}
// Unit tests that run `parse` on small inline section documents.
#[cfg(test)]
mod tests {
use super::*;
// A footnote nested in a paragraph is attached to that paragraph's details
// and does not add a paragraph of its own or leak into the main text.
#[test]
fn captures_footnote_in_section_xml() {
let xml = r#"<?xml version="1.0"?>
<hs:sec xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section">
<hp:p paraPrIDRef="0">
<hp:run charPrIDRef="0"><hp:t>main text</hp:t></hp:run>
<hp:footNote>
<hp:subList>
<hp:p paraPrIDRef="1"><hp:run charPrIDRef="0"><hp:t>footnote body</hp:t></hp:run></hp:p>
</hp:subList>
</hp:footNote>
</hp:p>
</hs:sec>"#;
let out = parse(xml.as_bytes(), 0).expect("parse");
let details = &out.section.paragraph_details;
assert_eq!(details.len(), 1);
assert_eq!(out.section.paragraphs[0], "main text");
assert_eq!(details[0].footnotes.len(), 1);
assert_eq!(details[0].footnotes[0].kind, "footnote");
assert_eq!(details[0].footnotes[0].text, "footnote body");
}
// A self-closing `equation` element's `script` attribute is stored as the
// paragraph's equation, while the paragraph text is still captured.
#[test]
fn captures_equation_script_attribute() {
let xml = r#"<?xml version="1.0"?>
<hs:sec xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section">
<hp:p paraPrIDRef="0">
<hp:run charPrIDRef="0">
<hp:equation script="x=1"/>
<hp:t>hello</hp:t>
</hp:run>
</hp:p>
</hs:sec>"#;
let out = parse(xml.as_bytes(), 0).expect("parse");
let d = &out.section.paragraph_details[0];
assert_eq!(d.equation.as_deref(), Some("x=1"));
assert_eq!(d.text, "hello");
}
// A self-closing `pic` element's `binaryItemIDRef` ("image3") is reduced to
// its numeric part (3) and recorded as an image reference.
#[test]
fn captures_pic_bin_id() {
let xml = r#"<?xml version="1.0"?>
<hs:sec xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section">
<hp:p paraPrIDRef="0">
<hp:run charPrIDRef="0">
<hp:pic binaryItemIDRef="image3"/>
<hp:t>caption</hp:t>
</hp:run>
</hp:p>
</hs:sec>"#;
let out = parse(xml.as_bytes(), 0).expect("parse");
let d = &out.section.paragraph_details[0];
assert_eq!(d.image_refs.len(), 1);
assert_eq!(d.image_refs[0].bin_id, 3);
assert_eq!(d.text, "caption");
}
}