use std::fmt::Write as FmtWrite;
/// One node of the structure tree being assembled by `TagBuilder`.
struct StructElement {
/// Structure type name written after `/S` when the element is serialised
/// (`"Document"` for the root, otherwise a value produced by `map_role`).
role: &'static str,
/// Index into `TagBuilder::elements` of the enclosing element.
/// The root stores 0, i.e. it points at itself.
parent_idx: usize,
/// Children in insertion order: the element's own marked-content reference
/// followed by any nested structure elements.
kids: Vec<StructKid>,
/// Optional alternate description; emitted as `/Alt (...)` when present.
alt: Option<String>,
}
/// A single entry in a structure element's `/K` (kids) array.
///
/// The type is plain `Copy` data, so it derives the usual value-type traits;
/// `Debug` and `PartialEq` in particular let kids be printed and compared in
/// assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StructKid {
    /// A nested structure element, identified by its index in
    /// `TagBuilder::elements`.
    StructRef(usize),
    /// A marked-content sequence on a page: `page_idx` selects the page and
    /// `mcid` is the marked-content id allocated for it on that page.
    MarkedContent { page_idx: usize, mcid: u32 },
}
/// Incrementally builds a structure tree from `begin_element` /
/// `end_element` calls and serialises it with `write_objects`.
pub struct TagBuilder {
/// Every structure element created so far; index 0 is always the root
/// `"Document"` element.
elements: Vec<StructElement>,
/// Indices (into `elements`) of the currently open elements, innermost last.
/// Starts out containing only the root.
parent_stack: Vec<usize>,
/// Next marked-content id to hand out, one counter per page index.
page_mcid_counters: Vec<u32>,
/// One `(page_idx, mcid, elem_idx)` triple per allocated marked-content id,
/// recorded in allocation order.
mcid_to_struct: Vec<(usize, u32, usize)>,
/// True while a `"P"` element is open; makes nested `"Text"` nodes map to
/// `"Span"` instead of opening another paragraph.
inside_paragraph: bool,
}
impl TagBuilder {
pub fn new(num_pages: usize) -> Self {
let root = StructElement {
role: "Document",
parent_idx: 0,
kids: Vec::new(),
alt: None,
};
TagBuilder {
elements: vec![root],
parent_stack: vec![0],
page_mcid_counters: vec![0; num_pages],
mcid_to_struct: Vec::new(),
inside_paragraph: false,
}
}
/// Opens a new structure element as a child of the innermost open element
/// and returns the marked-content id allocated for it on `page_idx`.
///
/// * `node_type` / `is_header_row` – select the structure role via
///   `map_role`.
/// * `alt` – optional alternate description stored on the element.
/// * `page_idx` – page whose marked-content counter supplies the id. An
///   index beyond the page count given to `new` grows the counter table
///   instead of panicking.
///
/// The element stays open (and is the parent of subsequent elements) until
/// the matching `end_element` call.
pub fn begin_element(
    &mut self,
    node_type: &str,
    is_header_row: bool,
    alt: Option<&str>,
    page_idx: usize,
) -> u32 {
    // Resolve the role before flipping `inside_paragraph`, so the element
    // that opens a paragraph is itself tagged "P" rather than "Span".
    let role = self.map_role(node_type, is_header_row);
    if role == "P" {
        self.inside_paragraph = true;
    }

    // Allocate the next marked-content id for this page, extending the
    // counter table when the page was not anticipated at construction.
    if page_idx >= self.page_mcid_counters.len() {
        self.page_mcid_counters.resize(page_idx + 1, 0);
    }
    let counter = &mut self.page_mcid_counters[page_idx];
    let mcid = *counter;
    *counter += 1;

    // An empty stack (unbalanced `end_element` calls) falls back to the root.
    let parent_idx = self.parent_stack.last().copied().unwrap_or(0);
    let elem_idx = self.elements.len();
    self.elements.push(StructElement {
        role,
        parent_idx,
        kids: vec![StructKid::MarkedContent { page_idx, mcid }],
        alt: alt.map(str::to_owned),
    });
    self.elements[parent_idx]
        .kids
        .push(StructKid::StructRef(elem_idx));
    self.mcid_to_struct.push((page_idx, mcid, elem_idx));
    self.parent_stack.push(elem_idx);
    mcid
}
/// Closes the innermost open element.
///
/// Closing a `"P"` element also leaves paragraph mode, so the next `"Text"`
/// node opens a fresh paragraph. Calling this with nothing left on the
/// stack is a no-op.
pub fn end_element(&mut self) {
    let closed_paragraph = match self.parent_stack.pop() {
        Some(closed_idx) => self.elements[closed_idx].role == "P",
        None => false,
    };
    if closed_paragraph {
        self.inside_paragraph = false;
    }
}
pub fn map_role_public(&self, node_type: &str, is_header_row: bool) -> &'static str {
self.map_role(node_type, is_header_row)
}
/// Maps a layout node type to its structure role name.
///
/// `"Text"` depends on builder state: it is `"Span"` while a paragraph is
/// open and `"P"` otherwise. `"TableCell"` is `"TH"` in a header row and
/// `"TD"` elsewhere. Container types and anything unrecognised become
/// `"Div"`.
fn map_role(&self, node_type: &str, is_header_row: bool) -> &'static str {
    match node_type {
        "Text" if self.inside_paragraph => "Span",
        "Text" => "P",
        "TextLine" => "Span",
        "Image" | "Svg" => "Figure",
        "Table" => "Table",
        "TableRow" => "TR",
        "TableCell" if is_header_row => "TH",
        "TableCell" => "TD",
        // "View", "FixedHeader", "FixedFooter" and every unknown node type.
        _ => "Div",
    }
}
pub fn write_objects(
&self,
objects: &mut Vec<super::PdfObject>,
page_obj_ids: &[usize],
) -> (usize, usize) {
let num_pages = page_obj_ids.len();
let base_id = objects.len();
let elem_obj_ids: Vec<usize> = (0..self.elements.len()).map(|i| base_id + i).collect();
for i in 0..self.elements.len() {
objects.push(super::PdfObject {
id: base_id + i,
data: Vec::new(),
});
}
let parent_tree_id = objects.len();
objects.push(super::PdfObject {
id: parent_tree_id,
data: Vec::new(),
});
let role_map_id = objects.len();
objects.push(super::PdfObject {
id: role_map_id,
data: Vec::new(),
});
let root_obj_id = elem_obj_ids[0];
{
let root = &self.elements[0];
let kids_str = self.format_kids(&root.kids, &elem_obj_ids, page_obj_ids);
let data = format!(
"<< /Type /StructTreeRoot /K [{kids}] /ParentTree {pt} 0 R /RoleMap {rm} 0 R >>",
kids = kids_str,
pt = parent_tree_id,
rm = role_map_id,
);
objects[root_obj_id].data = data.into_bytes();
}
for (i, elem) in self.elements.iter().enumerate().skip(1) {
let obj_id = elem_obj_ids[i];
let parent_obj_id = elem_obj_ids[elem.parent_idx];
let kids_str = self.format_kids(&elem.kids, &elem_obj_ids, page_obj_ids);
let mut dict = format!(
"<< /Type /StructElem /S /{role} /P {parent} 0 R /K [{kids}]",
role = elem.role,
parent = parent_obj_id,
kids = kids_str,
);
if let Some(ref alt) = elem.alt {
let escaped = super::PdfWriter::escape_pdf_string(alt);
let _ = write!(dict, " /Alt ({})", escaped);
}
dict.push_str(" >>");
objects[obj_id].data = dict.into_bytes();
}
let mut nums = String::new();
for page_idx in 0..num_pages {
let mcid_count = self.page_mcid_counters[page_idx];
if mcid_count == 0 {
continue;
}
let mut refs: Vec<(u32, usize)> = self
.mcid_to_struct
.iter()
.filter(|(pi, _, _)| *pi == page_idx)
.map(|(_, mcid, elem_idx)| (*mcid, elem_obj_ids[*elem_idx]))
.collect();
refs.sort_by_key(|(mcid, _)| *mcid);
let ref_strs: Vec<String> =
refs.iter().map(|(_, oid)| format!("{} 0 R", oid)).collect();
let _ = write!(nums, " {} [{}]", page_idx, ref_strs.join(" "));
}
let parent_tree_data = format!("<< /Nums [{}] >>", nums.trim());
objects[parent_tree_id].data = parent_tree_data.into_bytes();
let role_map_data = "<< /Document /Document /Div /Div /P /P /Span /Span /Table /Table \
/TR /TR /TH /TH /TD /TD /Figure /Figure >>"
.to_string();
objects[role_map_id].data = role_map_data.into_bytes();
(root_obj_id, parent_tree_id)
}
/// Renders the contents of a `/K` array as a space-separated string.
///
/// Nested elements become indirect references (`"<id> 0 R"`, looked up in
/// `elem_obj_ids`); marked content becomes an inline `/MCR` dictionary
/// pointing at the page object taken from `page_obj_ids`.
fn format_kids(
    &self,
    kids: &[StructKid],
    elem_obj_ids: &[usize],
    page_obj_ids: &[usize],
) -> String {
    let render = |kid: &StructKid| -> String {
        match *kid {
            StructKid::StructRef(elem_idx) => format!("{} 0 R", elem_obj_ids[elem_idx]),
            StructKid::MarkedContent { page_idx, mcid } => format!(
                "<< /Type /MCR /Pg {} 0 R /MCID {} >>",
                page_obj_ids[page_idx], mcid
            ),
        }
    };
    kids.iter().map(render).collect::<Vec<String>>().join(" ")
}
/// Test helper: number of marked-content ids allocated on `page_idx` so far,
/// or 0 for a page the builder has no counter for.
#[cfg(test)]
pub fn page_mcid_count(&self, page_idx: usize) -> u32 {
    self.page_mcid_counters
        .get(page_idx)
        .map_or(0, |&allocated| allocated)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A `View` wrapping a `Text` yields Document > Div > P with consecutive
    /// marked-content ids on the page.
    #[test]
    fn test_tag_builder_basic() {
        let mut tb = TagBuilder::new(1);
        let mcid = tb.begin_element("View", false, None, 0);
        assert_eq!(mcid, 0);
        let mcid2 = tb.begin_element("Text", false, None, 0);
        assert_eq!(mcid2, 1);
        tb.end_element();
        tb.end_element();

        // Root + Div + P.
        assert_eq!(tb.elements.len(), 3);
        assert_eq!(tb.elements[1].role, "Div");
        assert_eq!(tb.elements[2].role, "P");
    }

    /// A `Text` opened while a paragraph is already open becomes a `Span`.
    #[test]
    fn test_nested_text_maps_to_span() {
        let mut tb = TagBuilder::new(1);
        let _mcid = tb.begin_element("Text", false, None, 0);
        assert_eq!(tb.elements.last().unwrap().role, "P");
        let _mcid = tb.begin_element("Text", false, None, 0);
        assert_eq!(tb.elements.last().unwrap().role, "Span");
        tb.end_element();
        tb.end_element();
    }

    /// Cells in a header row map to `TH`, cells in other rows to `TD`, and
    /// both rows are children of the same table.
    #[test]
    fn test_table_header_maps_to_th() {
        let mut tb = TagBuilder::new(1);
        tb.begin_element("Table", false, None, 0);

        tb.begin_element("TableRow", true, None, 0);
        tb.begin_element("TableCell", true, None, 0);
        assert_eq!(tb.elements.last().unwrap().role, "TH");
        tb.end_element(); // header cell
        tb.end_element(); // header row; the table itself stays open

        tb.begin_element("TableRow", false, None, 0);
        tb.begin_element("TableCell", false, None, 0);
        assert_eq!(tb.elements.last().unwrap().role, "TD");
        tb.end_element(); // data cell
        tb.end_element(); // data row
        tb.end_element(); // table

        // Elements: 0 root, 1 Table, 2 TR, 3 TH, 4 TR, 5 TD.
        assert_eq!(tb.elements[1].role, "Table");
        assert_eq!(tb.elements[2].parent_idx, 1);
        assert_eq!(tb.elements[4].parent_idx, 1);
    }

    /// Images map to `Figure` and keep their alternate description.
    #[test]
    fn test_figure_with_alt_text() {
        let mut tb = TagBuilder::new(1);
        tb.begin_element("Image", false, Some("A photo of a cat"), 0);
        let elem = tb.elements.last().unwrap();
        assert_eq!(elem.role, "Figure");
        assert_eq!(elem.alt.as_deref(), Some("A photo of a cat"));
        tb.end_element();
    }

    /// Marked-content ids are counted per page and each one is recorded
    /// against the element that owns it.
    #[test]
    fn test_parent_tree_consistency() {
        let mut tb = TagBuilder::new(2);
        tb.begin_element("Text", false, None, 0);
        tb.end_element();
        tb.begin_element("Text", false, None, 0);
        tb.end_element();
        tb.begin_element("Text", false, None, 1);
        tb.end_element();

        assert_eq!(tb.page_mcid_count(0), 2);
        assert_eq!(tb.page_mcid_count(1), 1);

        // Triples are (page_idx, mcid, elem_idx).
        assert_eq!(tb.mcid_to_struct.len(), 3);
        assert_eq!(tb.mcid_to_struct[0], (0, 0, 1));
        assert_eq!(tb.mcid_to_struct[1], (0, 1, 2));
        assert_eq!(tb.mcid_to_struct[2], (1, 0, 3));
    }
}