use std::collections::HashMap;
use std::io::BufRead;
use quick_xml::events::{BytesStart, Event};
use quick_xml::Reader;
use crate::error::Result;
use crate::ir::{BBox, Block, Cell, Paragraph, Table, TableSource};
use super::{attr_by_local, next_event, styles};
/// Scans the document XML in `xml` and returns its blocks in the order they appear.
///
/// Every opening `p` element becomes a [`Block::Paragraph`] and every opening
/// `tbl` element becomes a [`Block::Table`]; all other events at this level are
/// skipped. `styles` is handed through unchanged to paragraph parsing, where it
/// is used for the heading-level lookup.
///
/// # Errors
///
/// Propagates any error from `next_event`, `read_paragraph` or `read_table`.
pub(super) fn parse_document(xml: &[u8], styles: &HashMap<String, u8>) -> Result<Vec<Block>> {
    let mut reader = Reader::from_reader(xml);
    let mut scratch = Vec::new();
    let mut out = Vec::new();
    loop {
        match next_event(&mut reader, &mut scratch)? {
            Event::Eof => break,
            Event::Start(tag) if tag.local_name().as_ref() == b"p" => {
                let paragraph = read_paragraph(&mut reader, styles)?;
                out.push(Block::Paragraph(paragraph));
            }
            Event::Start(tag) if tag.local_name().as_ref() == b"tbl" => {
                let table = read_table(&mut reader)?;
                out.push(Block::Table(table));
            }
            _ => {}
        }
        // The event borrowed from the scratch buffer is gone by now; reuse the allocation.
        scratch.clear();
    }
    Ok(out)
}
fn read_paragraph<R: BufRead>(
reader: &mut Reader<R>,
styles: &HashMap<String, u8>,
) -> Result<Paragraph> {
let mut buf = Vec::new();
let mut text = String::new();
let mut style_id: Option<String> = None;
let mut in_text = false;
loop {
match next_event(reader, &mut buf)? {
Event::Start(e) => match e.local_name().as_ref() {
b"t" => in_text = true,
b"pStyle" => style_id = attr_by_local(&e, b"val"),
_ => {}
},
Event::Empty(e) if e.local_name().as_ref() == b"pStyle" => {
style_id = attr_by_local(&e, b"val");
}
Event::Text(e) if in_text => {
let t = e
.unescape()
.map_err(|err| crate::error::PdfmuseError::Malformed(format!("invalid DOCX text: {err}")))?;
text.push_str(&t);
}
Event::End(e) => match e.local_name().as_ref() {
b"t" => in_text = false,
b"p" => break,
_ => {}
},
Event::Eof => break,
_ => {}
}
buf.clear();
}
let heading_level = style_id.as_deref().and_then(|id| styles::heading_level(styles, id));
Ok(Paragraph { bbox: BBox::default(), text, heading_level })
}
/// A table cell as read from the XML, before merged cells are resolved by
/// `build_table`.
struct RawCell {
/// Text collected from the cell by `read_cell`.
text: String,
/// Number of grid columns the cell covers; `read_cell` starts it at 1 and
/// overwrites it from a `gridSpan` element when one is present.
col_span: u16,
/// Vertical-merge marker taken from a `vMerge` element, or `VMerge::None`.
vmerge: VMerge,
/// Grid column at which the cell starts. `read_cell` leaves this at 0;
/// `build_table` fills it in as the sum of the preceding cells' `col_span`.
grid_col: u32,
}
/// Vertical-merge state of a raw table cell.
///
/// Produced by `parse_vmerge` from a `vMerge` element and consumed when the
/// table is assembled.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum VMerge {
    /// The cell carries no `vMerge` element and stands on its own.
    None,
    /// The cell opens a vertically merged region (`val="restart"`).
    Restart,
    /// The cell belongs to a merged region opened in an earlier row; it is
    /// dropped from the output and counted towards that region's row span.
    Continue,
}
/// Reads events up to the closing `tbl` tag (or end of input) and builds a
/// [`Table`] from the rows found on the way.
///
/// The caller is expected to have already consumed the opening `tbl` tag. Each
/// opening `tr` element is delegated to `read_row`; the collected raw rows are
/// then turned into the final table by `build_table`.
///
/// # Errors
///
/// Propagates errors from `next_event` and `read_row`.
fn read_table<R: BufRead>(reader: &mut Reader<R>) -> Result<Table> {
    let mut scratch = Vec::new();
    let mut collected: Vec<Vec<RawCell>> = Vec::new();
    loop {
        match next_event(reader, &mut scratch)? {
            Event::Eof => break,
            Event::End(tag) if tag.local_name().as_ref() == b"tbl" => break,
            Event::Start(tag) if tag.local_name().as_ref() == b"tr" => {
                let row = read_row(reader)?;
                collected.push(row);
            }
            _ => {}
        }
        scratch.clear();
    }
    let table = build_table(collected);
    Ok(table)
}
/// Reads events up to the closing `tr` tag (or end of input) and returns the
/// cells of that row in order.
///
/// The caller is expected to have already consumed the opening `tr` tag. Each
/// opening `tc` element is delegated to `read_cell`.
///
/// # Errors
///
/// Propagates errors from `next_event` and `read_cell`.
fn read_row<R: BufRead>(reader: &mut Reader<R>) -> Result<Vec<RawCell>> {
    let mut scratch = Vec::new();
    let mut row = Vec::new();
    loop {
        match next_event(reader, &mut scratch)? {
            Event::Eof => break,
            Event::End(tag) if tag.local_name().as_ref() == b"tr" => break,
            Event::Start(tag) if tag.local_name().as_ref() == b"tc" => {
                let cell = read_cell(reader)?;
                row.push(cell);
            }
            _ => {}
        }
        scratch.clear();
    }
    Ok(row)
}
fn read_cell<R: BufRead>(reader: &mut Reader<R>) -> Result<RawCell> {
let mut buf = Vec::new();
let mut text = String::new();
let mut col_span = 1u16;
let mut vmerge = VMerge::None;
let mut in_text = false;
let mut tc_depth = 1usize;
let mut paragraphs = 0usize;
loop {
match next_event(reader, &mut buf)? {
Event::Start(e) => match e.local_name().as_ref() {
b"tc" => tc_depth += 1, b"t" => in_text = true,
b"p" if tc_depth == 1 => {
if paragraphs > 0 {
text.push('\n');
}
paragraphs += 1;
}
b"gridSpan" if tc_depth == 1 => col_span = parse_span(&e),
b"vMerge" if tc_depth == 1 => vmerge = parse_vmerge(&e),
_ => {}
},
Event::Empty(e) => match e.local_name().as_ref() {
b"gridSpan" if tc_depth == 1 => col_span = parse_span(&e),
b"vMerge" if tc_depth == 1 => vmerge = parse_vmerge(&e),
_ => {}
},
Event::Text(e) if in_text => {
let t = e
.unescape()
.map_err(|err| crate::error::PdfmuseError::Malformed(format!("invalid DOCX text: {err}")))?;
text.push_str(&t);
}
Event::End(e) => match e.local_name().as_ref() {
b"t" => in_text = false,
b"tc" => {
tc_depth -= 1;
if tc_depth == 0 {
break;
}
}
_ => {}
},
Event::Eof => break,
_ => {}
}
buf.clear();
}
Ok(RawCell { text, col_span, vmerge, grid_col: 0 })
}
/// Extracts the column span from the `val` attribute of `e`.
///
/// Falls back to 1 when the attribute is missing, is not a valid `u16`, or is 0.
fn parse_span(e: &BytesStart) -> u16 {
    match attr_by_local(e, b"val").map(|raw| raw.parse::<u16>()) {
        Some(Ok(n)) if n >= 1 => n,
        _ => 1,
    }
}
/// Classifies a `vMerge` element by its `val` attribute.
///
/// A value equal to `restart` (ASCII case-insensitive) yields
/// `VMerge::Restart`; any other value, or no attribute at all, yields
/// `VMerge::Continue`.
fn parse_vmerge(e: &BytesStart) -> VMerge {
    let restarts = matches!(
        attr_by_local(e, b"val"),
        Some(v) if v.eq_ignore_ascii_case("restart")
    );
    if restarts {
        VMerge::Restart
    } else {
        VMerge::Continue
    }
}
/// Turns the raw rows of a table into a [`Table`] with merged cells resolved.
///
/// Works in three steps:
/// 1. Assign each cell its starting grid column (running sum of the preceding
///    cells' `col_span` within the row).
/// 2. Decide each cell's fate: `VMerge::Continue` cells are dropped,
///    `VMerge::Restart` cells get their row span from `vertical_span`, and
///    unmerged cells span one row.
/// 3. Consume the raw rows and emit the surviving cells, moving each cell's
///    text into the output instead of cloning it.
///
/// The result has one output row per raw row (possibly empty), default
/// bounding boxes, and `TableSource::Docx` as its source.
fn build_table(mut raw_rows: Vec<Vec<RawCell>>) -> Table {
    for row in &mut raw_rows {
        let mut col = 0u32;
        for cell in row.iter_mut() {
            cell.grid_col = col;
            // Saturate rather than overflow on absurdly wide rows.
            col = col.saturating_add(u32::from(cell.col_span));
        }
    }

    // Row spans need a view of the later rows, so resolve all of them while
    // `raw_rows` is still intact. `None` marks a cell that is not emitted.
    let row_spans: Vec<Vec<Option<u16>>> = raw_rows
        .iter()
        .enumerate()
        .map(|(r, row)| {
            row.iter()
                .map(|cell| match cell.vmerge {
                    VMerge::Continue => None,
                    VMerge::Restart => Some(vertical_span(&raw_rows, r, cell.grid_col)),
                    VMerge::None => Some(1),
                })
                .collect::<Vec<_>>()
        })
        .collect();

    // With the spans known, the raw rows can be consumed and their text moved.
    let rows: Vec<Vec<Cell>> = raw_rows
        .into_iter()
        .zip(row_spans)
        .map(|(row, spans)| {
            row.into_iter()
                .zip(spans)
                .filter_map(|(cell, span)| {
                    span.map(|row_span| Cell {
                        text: cell.text,
                        bbox: BBox::default(),
                        row_span,
                        col_span: cell.col_span,
                    })
                })
                .collect::<Vec<_>>()
        })
        .collect();

    Table {
        bbox: BBox::default(),
        rows,
        source: TableSource::Docx,
    }
}
/// Counts how many rows a vertically merged cell covers.
///
/// Starting from the row after `start`, each consecutive row that contains a
/// `VMerge::Continue` cell beginning at `grid_col` extends the span by one; the
/// first row without such a cell ends it. The row at `start` itself always
/// counts, so the result is at least 1.
///
/// The count saturates at `u16::MAX` instead of overflowing when a merge runs
/// through more rows than a `u16` can represent.
fn vertical_span(rows: &[Vec<RawCell>], start: usize, grid_col: u32) -> u16 {
    rows.iter()
        .skip(start + 1)
        .take_while(|row| {
            row.iter()
                .any(|c| c.grid_col == grid_col && matches!(c.vmerge, VMerge::Continue))
        })
        .fold(1u16, |span, _| span.saturating_add(1))
}