use super::record::{Record, RecordIterator, TagId};
use crate::error::Result;
use crate::model::{
ImageRef, InlineContent, Paragraph, ParagraphStyle, Section, StyleRegistry, Table, TableCell,
TableRow, TextRun, TextStyle,
};
/// UTF-16 code units below `0x0020` that `parse_para_text` matches by name
/// inside a PARA_TEXT record.
///
/// Constants are listed in ascending code order. A few codes handled by the
/// parser (`0x0013`, `0x0014`, `0x0019..=0x001D`) have no named constant here
/// and are matched as literals.
mod control_char {
    /// Stops text parsing for the current record.
    pub const UNUSABLE: u16 = 0x0000;

    // 0x0001..=0x0009: the parser skips a 14-byte payload after each of these
    // (when that many bytes remain). Only `TAB` additionally emits a
    // character (`'\t'`).
    pub const RESERVED: u16 = 0x0001;
    pub const SECTION_DEF: u16 = 0x0002;
    pub const FIELD_START: u16 = 0x0003;
    pub const FIELD_END: u16 = 0x0004;
    pub const INLINE_RESERVED_1: u16 = 0x0005;
    pub const INLINE_RESERVED_2: u16 = 0x0006;
    pub const INLINE_RESERVED_3: u16 = 0x0007;
    pub const INLINE_TITLE_MARK: u16 = 0x0008;
    pub const TAB: u16 = 0x0009;

    /// Emitted as a line-break inline item.
    pub const LINE_BREAK: u16 = 0x000A;

    /// Carries a 14-byte payload whose first four bytes are inspected for the
    /// `gso` control id (either byte order) to detect embedded pictures.
    pub const EXTENDED_CONTROL: u16 = 0x000B;

    /// Skipped together with a 14-byte payload.
    pub const HYPHEN: u16 = 0x000C;

    /// Stops text parsing for the current record.
    pub const PARA_BREAK: u16 = 0x000D;

    // 0x000E..=0x0018: skipped together with a 14-byte payload.
    pub const EXT_RESERVED_0E: u16 = 0x000E;
    pub const HIDDEN_COMMENT: u16 = 0x000F;
    pub const EXT_RESERVED_10: u16 = 0x0010;
    pub const FOOTNOTE: u16 = 0x0011;
    pub const AUTO_NUMBERING: u16 = 0x0012;
    pub const PAGE_CTRL: u16 = 0x0015;
    pub const BOOKMARK: u16 = 0x0016;
    pub const OLE_OVERLAY: u16 = 0x0017;
    pub const TITLE_MARK: u16 = 0x0018;

    // Both space variants are emitted as a plain `' '`.
    pub const NBSP: u16 = 0x001E;
    pub const FIXED_SPACE: u16 = 0x001F;
}
pub fn parse_section(
data: &[u8],
section_index: usize,
styles: &StyleRegistry,
picture_counter: &mut u32,
) -> Result<Section> {
let records: Vec<Record> = RecordIterator::new(data).filter_map(|r| r.ok()).collect();
let mut section = Section::new(section_index);
let mut paragraph_context = ParagraphContext::new();
let mut skip_until_idx: usize = 0;
let mut idx = 0;
while idx < records.len() {
if idx < skip_until_idx {
idx += 1;
continue;
}
let record = &records[idx];
match record.tag() {
TagId::ParaHeader => {
let base_level = record.level();
if let Some(para) = paragraph_context.finish(styles) {
section.content.push(crate::model::Block::Paragraph(para));
}
let para_shape_id = record.read_u32(0).unwrap_or(0);
let style_id = record.read_u16(4).unwrap_or(0) as u32;
let mut style = styles
.get_para_style(para_shape_id)
.cloned()
.unwrap_or_default();
if let Some(named_style) = styles.get_para_style(style_id) {
if named_style.heading_level > 0 {
style.heading_level = named_style.heading_level;
}
}
paragraph_context.start(style);
paragraph_context.base_level = base_level;
}
TagId::ParaText => {
let text_data = record.data();
parse_para_text(text_data, &mut paragraph_context, picture_counter, styles)?;
}
TagId::ParaCharShape => {
parse_char_shape_positions(record, &mut paragraph_context, styles)?;
}
TagId::Table => {
let table_level = record.level();
if let Some(para) = paragraph_context.finish(styles) {
section.content.push(crate::model::Block::Paragraph(para));
}
let table_end = find_block_end(&records, idx, table_level);
if let Some(table) =
parse_table_records(&records[idx..table_end], styles, picture_counter)
{
section.content.push(crate::model::Block::Table(table));
}
skip_until_idx = table_end;
}
_ => {
}
}
idx += 1;
}
if let Some(para) = paragraph_context.finish(styles) {
section.content.push(crate::model::Block::Paragraph(para));
}
Ok(section)
}
/// Returns the index of the first record after `start_idx` whose level is
/// strictly below `base_level`, or `records.len()` when no such record exists.
///
/// The record at `start_idx` itself is never examined.
fn find_block_end(records: &[Record], start_idx: usize, base_level: u16) -> usize {
    let first_candidate = start_idx + 1;
    records
        .iter()
        .skip(first_candidate)
        .position(|record| record.level() < base_level)
        .map_or(records.len(), |offset| first_candidate + offset)
}
fn parse_table_records(
records: &[Record],
styles: &StyleRegistry,
picture_counter: &mut u32,
) -> Option<Table> {
if records.is_empty() {
return None;
}
let table_record = &records[0];
if table_record.tag() != TagId::Table {
return None;
}
let data = table_record.data();
if data.len() < 14 {
return None;
}
let row_count = u16::from_le_bytes([data[4], data[5]]) as usize;
let col_count = u16::from_le_bytes([data[6], data[7]]) as usize;
if row_count == 0 || col_count == 0 {
return None;
}
let mut cells_data: Vec<CellData> = Vec::new();
let mut i = 1;
while i < records.len() {
let record = &records[i];
if record.tag() == TagId::ListHeader {
let cell_end = find_cell_end(records, i, record.level());
let cell_content = parse_cell_content(&records[i..cell_end], styles, picture_counter);
cells_data.push(cell_content);
i = cell_end;
} else {
i += 1;
}
}
let mut grid: Vec<Vec<Option<usize>>> = vec![vec![None; col_count]; row_count];
for (cell_idx, cell_data) in cells_data.iter().enumerate() {
let r = cell_data.row as usize;
let c = cell_data.col as usize;
if r < row_count && c < col_count {
grid[r][c] = Some(cell_idx);
for dr in 0..cell_data.rowspan as usize {
for dc in 0..cell_data.colspan as usize {
if dr == 0 && dc == 0 {
continue; }
let nr = r + dr;
let nc = c + dc;
if nr < row_count && nc < col_count {
}
}
}
}
}
let mut table = Table::new();
for (row_idx, grid_row) in grid.iter().enumerate() {
let mut row = TableRow::new();
row.is_header = row_idx == 0;
for &cell_idx_opt in grid_row {
if let Some(cell_idx) = cell_idx_opt {
let cell_data = &cells_data[cell_idx];
let cell = TableCell {
content: cell_data.paragraphs.clone(),
rowspan: cell_data.rowspan,
colspan: cell_data.colspan,
..Default::default()
};
row.cells.push(cell);
} else {
}
}
table.rows.push(row);
}
table.has_header = !table.rows.is_empty();
Some(table)
}
/// One table cell as decoded by `parse_cell_content`, before it is placed into
/// a `Table` by `parse_table_records`.
struct CellData {
    /// Row address of the cell's anchor, used as a zero-based grid index.
    row: u16,
    /// Column address of the cell's anchor, used as a zero-based grid index.
    col: u16,
    /// Number of rows the cell spans; the parser never leaves this at 0.
    rowspan: u32,
    /// Number of columns the cell spans; the parser never leaves this at 0.
    colspan: u32,
    /// Non-empty paragraphs found inside the cell, in stream order.
    paragraphs: Vec<Paragraph>,
}
/// Returns the index just past the records belonging to the cell whose
/// `ListHeader` sits at `start_idx`.
///
/// Scanning starts after `start_idx` and stops at the first record that either
/// has a level below `cell_level` or is another `ListHeader` at exactly
/// `cell_level`. If neither occurs, `records.len()` is returned.
fn find_cell_end(records: &[Record], start_idx: usize, cell_level: u16) -> usize {
    let first_candidate = start_idx + 1;
    records
        .iter()
        .skip(first_candidate)
        .position(|record| {
            let level = record.level();
            let left_cell = level < cell_level;
            let next_cell = level == cell_level && record.tag() == TagId::ListHeader;
            left_cell || next_cell
        })
        .map_or(records.len(), |offset| first_candidate + offset)
}
fn parse_cell_content(
records: &[Record],
styles: &StyleRegistry,
picture_counter: &mut u32,
) -> CellData {
let mut paragraphs = Vec::new();
let mut rowspan = 1u32;
let mut colspan = 1u32;
let mut row = 0u16;
let mut col = 0u16;
if records.is_empty() {
return CellData {
paragraphs,
rowspan,
colspan,
row,
col,
};
}
let list_header = &records[0];
let data = list_header.data();
if data.len() >= 16 {
col = u16::from_le_bytes([data[8], data[9]]);
row = u16::from_le_bytes([data[10], data[11]]);
colspan = u16::from_le_bytes([data[12], data[13]]) as u32;
rowspan = u16::from_le_bytes([data[14], data[15]]) as u32;
if colspan == 0 {
colspan = 1;
}
if rowspan == 0 {
rowspan = 1;
}
}
let mut para_context = ParagraphContext::new();
for record in records.iter().skip(1) {
match record.tag() {
TagId::ParaHeader => {
if let Some(para) = para_context.finish(styles) {
paragraphs.push(para);
}
let para_shape_id = record.read_u32(0).unwrap_or(0);
let style = styles
.get_para_style(para_shape_id)
.cloned()
.unwrap_or_default();
para_context.start(style);
}
TagId::ParaText => {
let _ = parse_para_text(record.data(), &mut para_context, picture_counter, styles);
}
TagId::ParaCharShape => {
let _ = parse_char_shape_positions(record, &mut para_context, styles);
}
_ => {}
}
}
if let Some(para) = para_context.finish(styles) {
paragraphs.push(para);
}
CellData {
paragraphs,
rowspan,
colspan,
row,
col,
}
}
/// Mutable scratch state used while one paragraph is assembled from its
/// records.
struct ParagraphContext {
    /// `true` between `start` and the next `finish`.
    in_paragraph: bool,
    /// Record level of the paragraph header; assigned by `parse_section`.
    base_level: u16,
    /// Paragraph style handed to `start`; moved into the finished paragraph.
    style: ParagraphStyle,

    /// Inline items completed so far.
    content: Vec<InlineContent>,
    /// Parallel to `content`: for each item, the position of every character
    /// it contains. Non-text items get an empty list.
    content_wchar_positions: Vec<Vec<usize>>,

    /// Text accumulated for the run that has not been flushed yet.
    current_text: String,
    /// Position recorded for each character in `current_text`.
    current_text_wchar_positions: Vec<usize>,
    /// Style attached to a run when it is flushed.
    current_style: TextStyle,

    /// Position of the code unit currently being processed; the text parser
    /// sets it to the unit's index within the PARA_TEXT payload.
    wchar_pos: usize,
    /// `(start position, char shape id)` pairs collected from the paragraph's
    /// char-shape record.
    char_shape_positions: Vec<(usize, u32)>,
}
impl ParagraphContext {
fn new() -> Self {
Self {
style: ParagraphStyle::default(),
content: Vec::new(),
current_text: String::new(),
current_style: TextStyle::default(),
char_shape_positions: Vec::new(),
in_paragraph: false,
base_level: 0,
wchar_pos: 0,
current_text_wchar_positions: Vec::new(),
content_wchar_positions: Vec::new(),
}
}
fn start(&mut self, style: ParagraphStyle) {
self.style = style;
self.content.clear();
self.current_text.clear();
self.current_style = TextStyle::default();
self.char_shape_positions.clear();
self.in_paragraph = true;
self.wchar_pos = 0;
self.current_text_wchar_positions.clear();
self.content_wchar_positions.clear();
}
fn push_char(&mut self, ch: char) {
self.current_text.push(ch);
self.current_text_wchar_positions.push(self.wchar_pos);
}
fn push_line_break(&mut self) {
self.flush_text();
self.content.push(InlineContent::LineBreak);
self.content_wchar_positions.push(Vec::new());
}
fn push_image(&mut self, filename: &str) {
self.flush_text();
self.content
.push(InlineContent::Image(ImageRef::new(filename)));
self.content_wchar_positions.push(Vec::new());
}
fn flush_text(&mut self) {
if !self.current_text.is_empty() {
let text = std::mem::take(&mut self.current_text);
let positions = std::mem::take(&mut self.current_text_wchar_positions);
let style = self.current_style.clone();
self.content
.push(InlineContent::Text(TextRun::with_style(text, style)));
self.content_wchar_positions.push(positions);
}
}
fn finish(&mut self, styles: &StyleRegistry) -> Option<Paragraph> {
if !self.in_paragraph {
return None;
}
self.flush_text();
self.in_paragraph = false;
if self.content.is_empty() {
return None;
}
if !self.char_shape_positions.is_empty() {
self.apply_char_shapes(styles);
}
Some(Paragraph {
style: std::mem::take(&mut self.style),
content: std::mem::take(&mut self.content),
})
}
fn apply_char_shapes(&mut self, styles: &StyleRegistry) {
let old_content = std::mem::take(&mut self.content);
let old_positions = std::mem::take(&mut self.content_wchar_positions);
for (i, item) in old_content.into_iter().enumerate() {
match item {
InlineContent::Text(run) => {
let char_positions = old_positions.get(i).map(|v| v.as_slice()).unwrap_or(&[]);
if char_positions.is_empty() {
self.content.push(InlineContent::Text(run));
continue;
}
let split_runs = split_text_run_by_shapes(
&run.text,
char_positions,
&self.char_shape_positions,
styles,
);
for split_run in split_runs {
self.content.push(InlineContent::Text(split_run));
}
}
other => {
self.content.push(other);
}
}
}
}
}
/// Splits `text` into consecutive runs so that each run carries a single
/// character style.
///
/// `char_wchar_positions[i]` is the position of the `i`-th `char` of `text`; a
/// missing entry is treated as position 0. The shape id for each position is
/// resolved with `find_shape_id_at` over `shape_positions`. A shape id that is
/// not registered in `styles` yields the default text style.
///
/// When `text` or `shape_positions` is empty, a single run built with
/// `TextRun::new(text)` is returned.
fn split_text_run_by_shapes(
    text: &str,
    char_wchar_positions: &[usize],
    shape_positions: &[(usize, u32)],
    styles: &StyleRegistry,
) -> Vec<TextRun> {
    if text.is_empty() || shape_positions.is_empty() {
        return vec![TextRun::new(text)];
    }

    let position_of = |char_idx: usize| char_wchar_positions.get(char_idx).copied().unwrap_or(0);
    let styled_run = |segment: String, shape_id: u32| {
        let style = styles.get_char_style(shape_id).cloned().unwrap_or_default();
        TextRun::with_style(segment, style)
    };

    let mut runs = Vec::new();
    let mut pending = String::new();
    let mut active_shape = find_shape_id_at(shape_positions, position_of(0));

    for (char_idx, ch) in text.chars().enumerate() {
        let shape_here = find_shape_id_at(shape_positions, position_of(char_idx));
        // A shape change closes the run collected so far.
        if shape_here != active_shape && !pending.is_empty() {
            runs.push(styled_run(std::mem::take(&mut pending), active_shape));
            active_shape = shape_here;
        }
        pending.push(ch);
    }

    if !pending.is_empty() {
        runs.push(styled_run(pending, active_shape));
    }
    runs
}
/// Returns the char-shape id in effect at `wchar_pos`.
///
/// `shape_positions` holds `(start position, shape id)` pairs and is scanned
/// from the front. The id of the last entry whose start is `<= wchar_pos` is
/// returned, and the scan stops at the first entry that starts beyond
/// `wchar_pos`. If even the first entry starts beyond `wchar_pos`, the first
/// entry's id is used.
///
/// An empty table yields shape id `0` instead of panicking.
fn find_shape_id_at(shape_positions: &[(usize, u32)], wchar_pos: usize) -> u32 {
    let first_id = match shape_positions.first() {
        Some(&(_, id)) => id,
        None => return 0,
    };
    shape_positions
        .iter()
        .take_while(|&&(start, _)| start <= wchar_pos)
        .last()
        .map_or(first_id, |&(_, id)| id)
}
/// Decodes a PARA_TEXT payload (little-endian UTF-16 code units) into
/// `context`.
///
/// Before each code unit is handled, `context.wchar_pos` is set to that unit's
/// index within the payload, so every pushed character is tagged with the
/// position of its first code unit.
///
/// Handling per code unit:
///
/// * `UNUSABLE` and `PARA_BREAK` stop parsing.
/// * `LINE_BREAK` emits a line break.
/// * `NBSP` and `FIXED_SPACE` emit a plain space.
/// * `EXTENDED_CONTROL` reads a 14-byte payload. If its first four bytes spell
///   the `gso` control id (either byte order), `picture_counter` is
///   incremented and, when `styles` knows a bin-data filename for that number,
///   an image is emitted. Parsing stops if the payload is truncated.
/// * `TAB` skips a 14-byte payload (if present) and emits `'\t'`.
/// * The remaining listed control codes skip a 14-byte payload (if present)
///   and emit nothing.
/// * Anything else is text. A high surrogate immediately followed by a low
///   surrogate is combined into one supplementary-plane character; an unpaired
///   surrogate is dropped.
///
/// # Errors
///
/// Returns `Error::InvalidData` when `data` has an odd length.
fn parse_para_text(
    data: &[u8],
    context: &mut ParagraphContext,
    picture_counter: &mut u32,
    styles: &StyleRegistry,
) -> Result<()> {
    if !data.len().is_multiple_of(2) {
        return Err(crate::error::Error::InvalidData(
            "PARA_TEXT data must be even length".into(),
        ));
    }
    let mut i = 0;
    while i + 1 < data.len() {
        context.wchar_pos = i / 2;
        let ch = u16::from_le_bytes([data[i], data[i + 1]]);
        i += 2;
        match ch {
            control_char::UNUSABLE => {
                break;
            }
            control_char::LINE_BREAK => {
                context.push_line_break();
            }
            control_char::PARA_BREAK => {
                break;
            }
            control_char::NBSP | control_char::FIXED_SPACE => {
                context.push_char(' ');
            }
            control_char::EXTENDED_CONTROL => {
                if i + 14 > data.len() {
                    break;
                }
                context.flush_text();
                let ctrl_type = &data[i..i + 4];
                let is_gso = ctrl_type == b" osg" || ctrl_type == b"gso ";
                if is_gso {
                    // The counter advances for every gso control, whether or
                    // not a filename is registered for it.
                    *picture_counter += 1;
                    if let Some(filename) = styles.get_bindata_filename(*picture_counter) {
                        context.push_image(filename);
                    }
                }
                i += 14;
            }
            control_char::TAB => {
                if i + 14 <= data.len() {
                    i += 14;
                }
                context.push_char('\t');
            }
            // NOTE(review): this arm assumes 0x0019..=0x001D carry a 14-byte
            // payload. The published HWP 5.0 control-character table appears to
            // list 0x18..=0x1F as single-unit controls - confirm against the spec.
            control_char::FIELD_END
            | control_char::INLINE_RESERVED_1
            | control_char::INLINE_RESERVED_2
            | control_char::INLINE_RESERVED_3
            | control_char::INLINE_TITLE_MARK
            | control_char::HYPHEN
            | 0x0013
            | 0x0014
            | 0x0019..=0x001D => {
                if i + 14 <= data.len() {
                    i += 14;
                }
            }
            // NOTE(review): the same question applies to TITLE_MARK (0x0018).
            control_char::RESERVED
            | control_char::SECTION_DEF
            | control_char::FIELD_START
            | control_char::EXT_RESERVED_0E
            | control_char::HIDDEN_COMMENT
            | control_char::EXT_RESERVED_10
            | control_char::FOOTNOTE
            | control_char::AUTO_NUMBERING
            | control_char::PAGE_CTRL
            | control_char::BOOKMARK
            | control_char::OLE_OVERLAY
            | control_char::TITLE_MARK => {
                if i + 14 <= data.len() {
                    i += 14;
                }
            }
            _ => {
                // Characters outside the BMP arrive as a surrogate pair.
                // `char::from_u32` rejects each half on its own, so the pair
                // must be joined here or the character is lost.
                let low_half = if (0xD800..=0xDBFF).contains(&ch) {
                    data.get(i..i + 2)
                        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
                        .filter(|unit| (0xDC00..=0xDFFF).contains(unit))
                } else {
                    None
                };
                let decoded = match low_half {
                    Some(low) => {
                        // Consume the low surrogate as part of this character.
                        i += 2;
                        let high_bits = (u32::from(ch) - 0xD800) << 10;
                        let low_bits = u32::from(low) - 0xDC00;
                        char::from_u32(0x1_0000 + high_bits + low_bits)
                    }
                    None => char::from_u32(u32::from(ch)),
                };
                if let Some(c) = decoded {
                    context.push_char(c);
                }
            }
        }
    }
    Ok(())
}
/// Reads `(position, shape id)` pairs from a char-shape record into
/// `context.char_shape_positions`.
///
/// The payload is consumed as consecutive 8-byte entries, each holding two
/// little-endian `u32` values: the start position followed by the shape id.
/// Trailing bytes that do not fill a whole entry are ignored.
///
/// For every entry that starts at position 0 and whose shape id is known to
/// `styles`, that style also becomes `context.current_style`.
///
/// This function currently always returns `Ok(())`.
fn parse_char_shape_positions(
    record: &Record,
    context: &mut ParagraphContext,
    styles: &StyleRegistry,
) -> Result<()> {
    for entry in record.data().chunks_exact(8) {
        let word_at =
            |at: usize| u32::from_le_bytes([entry[at], entry[at + 1], entry[at + 2], entry[at + 3]]);
        let position = word_at(0) as usize;
        let shape_id = word_at(4);

        context.char_shape_positions.push((position, shape_id));

        if position != 0 {
            continue;
        }
        if let Some(style) = styles.get_char_style(shape_id) {
            context.current_style = style.clone();
        }
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::{InlineContent, StyleRegistry, TextStyle};

    /// Text style with only `bold` switched on.
    fn bold_style() -> TextStyle {
        TextStyle {
            bold: true,
            ..Default::default()
        }
    }

    /// Text style with only `italic` switched on.
    fn italic_style() -> TextStyle {
        TextStyle {
            italic: true,
            ..Default::default()
        }
    }

    /// Returns the text run stored at `index`, failing the test otherwise.
    fn text_run_at(content: &[InlineContent], index: usize) -> &TextRun {
        match &content[index] {
            InlineContent::Text(run) => run,
            _ => panic!("Expected text run at index {}", index),
        }
    }

    /// A single shape covering the whole text must not split the run.
    #[test]
    fn test_split_text_run_single_style() {
        let mut styles = StyleRegistry::new();
        styles.register_char_style(0, TextStyle::default());

        let shapes = vec![(0, 0u32)];
        let positions = vec![0, 1, 2, 3, 4];
        let runs = split_text_run_by_shapes("Hello", &positions, &shapes, &styles);

        assert_eq!(runs.len(), 1);
        assert_eq!(runs[0].text, "Hello");
        assert!(!runs[0].style.bold);
    }

    /// A shape change at position 3 splits "Hello" into "Hel" + "lo".
    #[test]
    fn test_split_text_run_two_styles() {
        let mut styles = StyleRegistry::new();
        styles.register_char_style(0, TextStyle::default());
        styles.register_char_style(1, bold_style());

        let shapes = vec![(0, 0u32), (3, 1u32)];
        let positions = vec![0, 1, 2, 3, 4];
        let runs = split_text_run_by_shapes("Hello", &positions, &shapes, &styles);

        assert_eq!(runs.len(), 2);
        assert_eq!(runs[0].text, "Hel");
        assert!(!runs[0].style.bold);
        assert_eq!(runs[1].text, "lo");
        assert!(runs[1].style.bold);
    }

    /// Three shapes with non-contiguous ids produce three runs.
    #[test]
    fn test_split_text_run_three_styles() {
        let mut styles = StyleRegistry::new();
        styles.register_char_style(10, bold_style());
        styles.register_char_style(20, TextStyle::default());
        styles.register_char_style(30, italic_style());

        let shapes = vec![(0, 10u32), (2, 20u32), (5, 30u32)];
        let positions: Vec<usize> = (0..8).collect();
        let runs = split_text_run_by_shapes("ABCDEFGH", &positions, &shapes, &styles);

        assert_eq!(runs.len(), 3);
        assert_eq!(runs[0].text, "AB");
        assert!(runs[0].style.bold);
        assert_eq!(runs[1].text, "CDE");
        assert!(!runs[1].style.bold);
        assert!(!runs[1].style.italic);
        assert_eq!(runs[2].text, "FGH");
        assert!(runs[2].style.italic);
    }

    /// Lookups resolve to the last entry starting at or before the position.
    #[test]
    fn test_find_shape_id_at() {
        let positions = vec![(0, 10u32), (5, 20u32), (10, 30u32)];
        let expectations = [(0, 10), (3, 10), (5, 20), (7, 20), (10, 30), (15, 30)];
        for (wchar_pos, expected_id) in expectations {
            assert_eq!(find_shape_id_at(&positions, wchar_pos), expected_id);
        }
    }

    /// Char shapes are applied across a line break: "AB", break, "CD" with a
    /// shape change at position 1 yields "A", "B", break, "CD".
    #[test]
    fn test_paragraph_context_char_shape_integration() {
        let mut styles = StyleRegistry::new();
        styles.register_char_style(0, TextStyle::default());
        styles.register_char_style(1, bold_style());

        let mut ctx = ParagraphContext::new();
        ctx.start(ParagraphStyle::default());

        ctx.wchar_pos = 0;
        ctx.push_char('A');
        ctx.wchar_pos = 1;
        ctx.push_char('B');
        ctx.wchar_pos = 2;
        ctx.push_line_break();
        ctx.wchar_pos = 3;
        ctx.push_char('C');
        ctx.wchar_pos = 4;
        ctx.push_char('D');

        ctx.char_shape_positions = vec![(0, 0), (1, 1)];

        let para = ctx.finish(&styles).expect("should produce paragraph");
        assert_eq!(para.content.len(), 4);

        let first = text_run_at(&para.content, 0);
        assert_eq!(first.text, "A");
        assert!(!first.style.bold);

        let second = text_run_at(&para.content, 1);
        assert_eq!(second.text, "B");
        assert!(second.style.bold);

        assert!(matches!(para.content[2], InlineContent::LineBreak));

        let last = text_run_at(&para.content, 3);
        assert_eq!(last.text, "CD");
        assert!(last.style.bold);
    }

    /// Character positions that start at 8 (as if preceded by a control
    /// payload) pick up the shape that begins at position 8.
    #[test]
    fn test_wchar_positions_after_extended_control() {
        let shapes = vec![(0, 0u32), (8, 1u32)];
        let positions = vec![8, 9];

        let mut styles = StyleRegistry::new();
        styles.register_char_style(0, TextStyle::default());
        styles.register_char_style(1, italic_style());

        let runs = split_text_run_by_shapes("AB", &positions, &shapes, &styles);

        assert_eq!(runs.len(), 1);
        assert_eq!(runs[0].text, "AB");
        assert!(runs[0].style.italic);
    }
}