use std::collections::HashMap;
use rpdfium_core::{Name, PdfSource};
use rpdfium_parser::{Object, ObjectId, ObjectStore};
use crate::error::{DocError, DocResult};
use crate::struct_element::{StructElement, parse_struct_element};
pub use crate::struct_element::{AttributeValue, StructAttribute};
/// Upper bound on the number of structure elements gathered in one parse;
/// `parse_k_children` stops reading further entries once this many were collected.
const MAX_ELEMENTS: usize = 100_000;
/// Deepest nesting level accepted while parsing; `parse_k_children` returns
/// `DocError::DepthExceeded` for an entry nested deeper than this.
const MAX_TREE_DEPTH: usize = 64;
/// Structure type names as string constants, so callers can compare an
/// element's `struct_type` against a named constant instead of a literal.
pub mod structure_types {
// Grouping elements.
pub const DOCUMENT: &str = "Document";
pub const PART: &str = "Part";
pub const SECT: &str = "Sect";
// Paragraph and headings (`H` is the unnumbered form, `H1`..`H6` the numbered levels).
pub const P: &str = "P";
pub const H: &str = "H";
pub const H1: &str = "H1";
pub const H2: &str = "H2";
pub const H3: &str = "H3";
pub const H4: &str = "H4";
pub const H5: &str = "H5";
pub const H6: &str = "H6";
// Table, row, data cell, header cell.
pub const TABLE: &str = "Table";
pub const TR: &str = "TR";
pub const TD: &str = "TD";
pub const TH: &str = "TH";
// Illustration and inline-level elements.
pub const FIGURE: &str = "Figure";
pub const SPAN: &str = "Span";
pub const LINK: &str = "Link";
// List, list item, item label, item body.
pub const LIST: &str = "L";
pub const LIST_ITEM: &str = "LI";
pub const LABEL: &str = "Lbl";
pub const LIST_BODY: &str = "LBody";
}
/// Parsed structure tree: the top-level structure elements together with the
/// role map read from the structure tree root dictionary.
#[derive(Debug, Clone)]
pub struct StructTree {
/// Top-level structure elements, parsed from the root dictionary's `Name::k()` entry
/// (empty when that entry is absent).
pub root_elements: Vec<StructElement>,
/// Role map entries (structure type name → mapped name) read from the root
/// dictionary; empty when the role map is absent or cannot be resolved.
pub role_map: HashMap<String, String>,
}
/// Returns the indices of the top-level entries of `elements` whose subtree
/// references the marked-content ID `mcid`.
///
/// An entry matches when it, or any element nested below it, lists `mcid` in
/// its `mcids`. Indices always refer to positions in `elements`: a match found
/// in a descendant is reported through its top-level ancestor. Indices are
/// returned in ascending order and each one appears at most once, even when
/// several elements of the same subtree reference the ID.
pub fn find_elements_for_mcid(elements: &[StructElement], mcid: i32) -> Vec<usize> {
    // `mcids` holds 64-bit values; widening an `i32` is lossless.
    let target = i64::from(mcid);
    elements
        .iter()
        .enumerate()
        .filter(|(_, root)| subtree_references_mcid(root, target))
        .map(|(idx, _)| idx)
        .collect()
}

/// Reports whether `root` or any of its descendants lists `target` among its MCIDs.
fn subtree_references_mcid(root: &StructElement, target: i64) -> bool {
    // Explicit stack rather than recursion, so deep nesting cannot exhaust the call stack.
    let mut pending = vec![root];
    while let Some(elem) = pending.pop() {
        if elem.mcids.contains(&target) {
            return true;
        }
        pending.extend(elem.children.iter());
    }
    false
}
impl StructTree {
    /// Builds the structure tree from the document catalog dictionary.
    ///
    /// Returns `Ok(None)` when the catalog has no structure-tree-root entry or
    /// when that entry does not resolve to a dictionary.
    ///
    /// # Errors
    ///
    /// Returns `DocError::Parser` when the root entry cannot be resolved, and
    /// forwards any error produced while parsing the root's kids.
    pub fn from_catalog<S: PdfSource>(
        catalog_dict: &HashMap<Name, Object>,
        store: &ObjectStore<S>,
    ) -> DocResult<Option<Self>> {
        let root_obj = if let Some(entry) = catalog_dict.get(&Name::struct_tree_root()) {
            store
                .deep_resolve(entry)
                .map_err(|err| DocError::Parser(err.to_string()))?
        } else {
            return Ok(None);
        };
        let root_dict = if let Some(dict) = root_obj.as_dict() {
            dict
        } else {
            return Ok(None);
        };

        // The role map is read first, then the kids; a missing kids entry means an empty tree.
        let role_map = parse_role_map(root_dict, store);
        let root_elements = root_dict
            .get(&Name::k())
            .map(|kids| parse_k_children(kids, store))
            .transpose()?
            .unwrap_or_default();

        Ok(Some(StructTree {
            root_elements,
            role_map,
        }))
    }

    /// Looks `struct_type` up in the role map and returns the mapped name, or
    /// `struct_type` itself when the map has no entry for it (single lookup,
    /// no chaining).
    pub fn role_map_name_for<'a>(&'a self, struct_type: &'a str) -> &'a str {
        match self.role_map.get(struct_type) {
            Some(mapped) => mapped.as_str(),
            None => struct_type,
        }
    }

    /// Alias of [`StructTree::role_map_name_for`].
    #[inline]
    pub fn get_role_map_name_for<'a>(&'a self, struct_type: &'a str) -> &'a str {
        self.role_map_name_for(struct_type)
    }

    /// Number of top-level structure elements.
    pub fn child_count(&self) -> usize {
        self.root_elements.len()
    }

    /// Alias of [`StructTree::child_count`].
    #[inline]
    pub fn struct_tree_count_children(&self) -> usize {
        self.child_count()
    }

    /// Deprecated alias of [`StructTree::child_count`].
    #[deprecated(
        since = "0.1.0",
        note = "use `struct_tree_count_children()` — matches upstream `FPDF_StructTree_CountChildren`"
    )]
    #[inline]
    pub fn count_children(&self) -> usize {
        self.child_count()
    }

    /// Top-level element at `index`, or `None` when `index` is out of range.
    pub fn child_at_index(&self, index: usize) -> Option<&StructElement> {
        self.root_elements.as_slice().get(index)
    }

    /// Alias of [`StructTree::child_at_index`].
    #[inline]
    pub fn struct_tree_get_child_at_index(&self, index: usize) -> Option<&StructElement> {
        self.child_at_index(index)
    }

    /// Deprecated alias of [`StructTree::child_at_index`].
    #[deprecated(
        since = "0.1.0",
        note = "use `struct_tree_get_child_at_index()` — matches upstream `FPDF_StructTree_GetChildAtIndex`"
    )]
    #[inline]
    pub fn get_child_at_index(&self, index: usize) -> Option<&StructElement> {
        self.child_at_index(index)
    }

    /// Iterator over every element of the tree (at any depth) whose `page_ref`
    /// equals `page_ref`.
    pub fn elements_for_page_ref(&self, page_ref: ObjectId) -> ElementsForPage<'_> {
        // Seed the stack back-to-front so the first root is popped first.
        let stack: Vec<&StructElement> = self.root_elements.iter().rev().collect();
        ElementsForPage { stack, page_ref }
    }

    /// Runs [`find_elements_for_mcid`] over the top-level elements.
    pub fn elements_for_mcid(&self, mcid: i32) -> Vec<usize> {
        find_elements_for_mcid(&self.root_elements, mcid)
    }

    /// Deprecated alias of [`StructTree::elements_for_mcid`].
    #[deprecated(
        note = "use `elements_for_mcid()` — no public `FPDF_StructTree_GetElementsForMcid` API"
    )]
    #[inline]
    pub fn get_elements_for_mcid(&self, mcid: i32) -> Vec<usize> {
        self.elements_for_mcid(mcid)
    }

    /// Like [`StructTree::elements_for_page_ref`], with the page chosen by its
    /// position in `page_ids`. An out-of-range `page_index` yields an iterator
    /// that produces nothing.
    pub fn elements_for_page<'a>(
        &'a self,
        page_index: usize,
        page_ids: &[ObjectId],
    ) -> ElementsForPage<'a> {
        if let Some(&page_ref) = page_ids.get(page_index) {
            self.elements_for_page_ref(page_ref)
        } else {
            // Nothing to walk: the page id is a placeholder that is never compared.
            ElementsForPage {
                stack: Vec::new(),
                page_ref: ObjectId::new(0, 0),
            }
        }
    }
}
/// Iterator over the structure elements whose `page_ref` equals a given page,
/// produced by `StructTree::elements_for_page_ref` and `StructTree::elements_for_page`.
pub struct ElementsForPage<'a> {
// Elements still to be visited; the next one to inspect is at the end.
stack: Vec<&'a StructElement>,
// Page object id that yielded elements must reference.
page_ref: ObjectId,
}
impl<'a> Iterator for ElementsForPage<'a> {
    type Item = &'a StructElement;

    /// Advances the walk until an element on the requested page is found.
    ///
    /// Every popped element has its children queued (reversed, so they pop in
    /// document order) before its own page is checked, so descendants of
    /// non-matching elements are still visited.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(current) = self.stack.pop() {
            self.stack.extend(current.children.iter().rev());
            if current.page_ref == Some(self.page_ref) {
                return Some(current);
            }
        }
        None
    }
}
/// Reads the role map of the structure tree root into a `String → String` map.
///
/// Best effort: a missing entry, a resolution failure, or a non-dictionary
/// value all produce an empty map. Entries whose value is not a name are skipped.
fn parse_role_map<S: PdfSource>(
    root_dict: &HashMap<Name, Object>,
    store: &ObjectStore<S>,
) -> HashMap<String, String> {
    let lookup = root_dict
        .get(&Name::role_map())
        .map(|entry| store.deep_resolve(entry));
    let resolved = match lookup {
        Some(Ok(obj)) => obj,
        _ => return HashMap::new(),
    };

    match resolved.as_dict() {
        Some(dict) => dict
            .iter()
            .filter_map(|(key, value)| {
                value
                    .as_name()
                    .map(|target| (key.as_str().into_owned(), target.as_str().into_owned()))
            })
            .collect(),
        None => HashMap::new(),
    }
}
/// Parses the kids (`K`) value of the structure tree root into a forest of
/// `StructElement`s.
///
/// The walk is iterative: entries are visited in document order with an
/// explicit stack, each parsed element is recorded together with its depth in
/// a flat list, and `build_tree_from_flat` turns that list into the nested result.
///
/// Returns `DocError::Parser` when `k_obj` itself cannot be resolved and
/// `DocError::DepthExceeded` when an entry is nested deeper than
/// `MAX_TREE_DEPTH`. Entries that cannot be resolved or are not dictionaries
/// are skipped, and parsing stops without error once `MAX_ELEMENTS` elements
/// were collected.
fn parse_k_children<S: PdfSource>(
k_obj: &Object,
store: &ObjectStore<S>,
) -> DocResult<Vec<StructElement>> {
let resolved = store
.deep_resolve(k_obj)
.map_err(|e| DocError::Parser(e.to_string()))?;
// An array contributes one top-level entry per item; any other value is a single entry.
let top_items: Vec<&Object> = match resolved {
Object::Array(arr) => arr.iter().collect(),
_ => vec![resolved],
};
// (depth, element) pairs in visit order; a parent always precedes its children.
let mut flat: Vec<(usize, StructElement)> = Vec::new();
// Pending object together with its nesting depth (0 = top level).
struct StackEntry<'a> {
obj: &'a Object,
depth: usize,
}
let mut stack: Vec<StackEntry<'_>> = Vec::new();
// Pushed in reverse so the first item is popped first.
for item in top_items.iter().rev() {
stack.push(StackEntry {
obj: item,
depth: 0,
});
}
while let Some(entry) = stack.pop() {
// Element cap reached: stop quietly and keep what was collected so far.
if flat.len() >= MAX_ELEMENTS {
break;
}
if entry.depth > MAX_TREE_DEPTH {
return Err(DocError::DepthExceeded);
}
// An entry that cannot be resolved is dropped.
let resolved = match store.deep_resolve(entry.obj) {
Ok(r) => r,
Err(_) => continue,
};
// Integer entries carry no element of their own here and are skipped.
if resolved.as_i64().is_some() {
continue;
}
let dict = match resolved.as_dict() {
Some(d) => d,
None => continue,
};
// NOTE(review): presumably fills the element's own fields and leaves `mcids`
// and `children` empty — confirm in `struct_element`.
let mut elem = parse_struct_element(dict, store);
if let Some(k_val) = dict.get(&Name::k()) {
let k_resolved = match store.deep_resolve(k_val) {
Ok(r) => r,
Err(_) => {
// Kids cannot be resolved: keep the element itself, without kids.
flat.push((entry.depth, elem));
continue;
}
};
match k_resolved {
// A bare integer is recorded as an MCID of this element.
Object::Integer(n) => {
elem.mcids.push(*n);
}
// A dictionary with an MCID entry contributes that MCID; any other
// dictionary is queued as a child one level deeper.
Object::Dictionary(child_dict) => {
if let Some(mcid) = extract_mcid_from_dict(child_dict) {
elem.mcids.push(mcid);
} else {
stack.push(StackEntry {
obj: k_val,
depth: entry.depth + 1,
});
}
}
Object::Array(arr) => {
// Walked back-to-front so queued children pop in their original order.
for child in arr.iter().rev() {
let child_resolved = match store.deep_resolve(child) {
Ok(r) => r,
Err(_) => continue,
};
match child_resolved {
Object::Integer(n) => {
elem.mcids.push(*n);
}
Object::Dictionary(child_dict) => {
if let Some(mcid) = extract_mcid_from_dict(child_dict) {
elem.mcids.push(mcid);
} else {
stack.push(StackEntry {
obj: child,
depth: entry.depth + 1,
});
}
}
_ => {}
}
}
// MCIDs were gathered back-to-front above; restore array order.
// This reverses the whole list, so it relies on `mcids` having been
// empty before the loop (see the note on `parse_struct_element`).
elem.mcids.reverse();
}
_ => {}
}
}
// Recorded before any queued child is popped, so parents precede children in `flat`.
flat.push((entry.depth, elem));
}
build_tree_from_flat(flat)
}
/// Returns the integer stored under the MCID key of `dict`, or `None` when
/// the key is absent or its value is not an integer.
fn extract_mcid_from_dict(dict: &HashMap<Name, Object>) -> Option<i64> {
    let entry = dict.get(&Name::mcid())?;
    entry.as_i64()
}
/// Rebuilds the nested element forest from a flat `(depth, element)` list in
/// which every parent precedes its children.
///
/// `trail` holds, per nesting level, the index of the most recently inserted
/// element; truncating it to the incoming depth selects the sibling list the
/// element belongs to. Expects each depth to exceed the previous one by at
/// most one.
///
/// NOTE(review): for nested elements `parent_index` receives the element's own
/// position among its siblings, not the position of its parent — confirm this
/// is the intended meaning of the field.
fn build_tree_from_flat(flat: Vec<(usize, StructElement)>) -> DocResult<Vec<StructElement>> {
    let mut roots: Vec<StructElement> = Vec::new();
    let mut trail: Vec<usize> = Vec::new();

    for (level, mut node) in flat {
        trail.truncate(level);
        let siblings = get_children_at_path(&mut roots, &trail);
        let slot = siblings.len();
        if level != 0 {
            node.parent_index = Some(slot);
        }
        siblings.push(node);
        if trail.len() <= level {
            trail.push(slot);
        }
    }

    Ok(roots)
}
/// Follows `path` (one child index per level) down from `root` and returns
/// the child list reached at the end; an empty path returns `root` itself.
///
/// Panics if an index in `path` is out of bounds for its level.
fn get_children_at_path<'a>(
    root: &'a mut Vec<StructElement>,
    path: &[usize],
) -> &'a mut Vec<StructElement> {
    path.iter()
        .fold(root, |level, &idx| &mut level[idx].children)
}
/// Lookup table from `(page object id, MCID)` to the structure element that
/// references that marked content, built by `McidMapping::from_struct_tree`.
#[derive(Debug, Clone)]
pub struct McidMapping {
// (page id, MCID) → index into `elements`.
entries: HashMap<(ObjectId, i64), usize>,
// Clones of the tree elements that have a page reference and at least one MCID.
elements: Vec<StructElement>,
}
impl McidMapping {
    /// Walks the whole tree and indexes every element that has both a page
    /// reference and at least one MCID.
    ///
    /// Each such element is cloned into the mapping once, and all of its MCIDs
    /// point at that clone. When two elements claim the same `(page, MCID)`
    /// pair, the one visited later wins.
    pub fn from_struct_tree(tree: &StructTree) -> Self {
        let mut mapping = McidMapping {
            entries: HashMap::new(),
            elements: Vec::new(),
        };
        let mut pending: Vec<&StructElement> = tree.root_elements.iter().rev().collect();

        while let Some(node) = pending.pop() {
            if let Some(page_id) = node.page_ref.filter(|_| !node.mcids.is_empty()) {
                let slot = mapping.elements.len();
                mapping.elements.push(node.clone());
                mapping
                    .entries
                    .extend(node.mcids.iter().map(|&mcid| ((page_id, mcid), slot)));
            }
            // Reversed so children are popped in document order.
            pending.extend(node.children.iter().rev());
        }

        mapping
    }

    /// Element registered for `mcid` on page `page_id`, if any.
    pub fn element_for_mcid(&self, page_id: ObjectId, mcid: i64) -> Option<&StructElement> {
        let slot = *self.entries.get(&(page_id, mcid))?;
        Some(&self.elements[slot])
    }
}
/// The structure elements that reference one page, collected by `PageStructure::for_page`.
#[derive(Debug, Clone)]
pub struct PageStructure {
/// Clones of the matching elements, in the order the tree walk visited them.
pub elements: Vec<StructElement>,
}
impl PageStructure {
    /// Collects clones of every element of `tree` (at any depth) whose
    /// `page_ref` equals `page_id`, in pre-order.
    pub fn for_page(tree: &StructTree, page_id: ObjectId) -> Self {
        // `elements_for_page_ref` performs the same pre-order walk and page filter.
        let elements = tree.elements_for_page_ref(page_id).cloned().collect();
        PageStructure { elements }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use rpdfium_core::PdfString;
// Opens an object store over the minimal in-memory PDF, in lenient parsing mode.
fn build_store() -> ObjectStore<Vec<u8>> {
    let bytes = build_minimal_pdf();
    let mode = rpdfium_core::ParsingMode::Lenient;
    ObjectStore::open(bytes, mode).unwrap()
}
// Assembles a two-object PDF (catalog + empty page tree) with a matching
// cross-reference table, recording each object's byte offset as it is written.
fn build_minimal_pdf() -> Vec<u8> {
    let mut bytes: Vec<u8> = b"%PDF-1.4\n".to_vec();

    let catalog_offset = bytes.len();
    bytes.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
    let pages_offset = bytes.len();
    bytes.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

    let xref_offset = bytes.len();
    bytes.extend_from_slice(b"xref\n0 3\n");
    bytes.extend_from_slice(b"0000000000 65535 f \r\n");
    for offset in &[catalog_offset, pages_offset] {
        bytes.extend_from_slice(format!("{:010} 00000 n \r\n", offset).as_bytes());
    }

    bytes.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
    bytes.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
    bytes
}
// Wraps the bytes of `s` in a PDF string object.
fn str_obj(s: &str) -> Object {
    let raw = s.as_bytes().to_vec();
    Object::String(PdfString::from_bytes(raw))
}
// Wraps `s` in a PDF name object.
fn name_obj(s: &str) -> Object {
    let name = Name::from(s);
    Object::Name(name)
}
// Dictionary for a structure element whose only entry is its type name `tag`
// (stored under `Name::s()`).
fn struct_elem_dict(tag: &str) -> HashMap<Name, Object> {
    std::iter::once((Name::s(), name_obj(tag))).collect()
}
#[test]
fn test_no_struct_tree_root_returns_none() {
    // A catalog without a structure-tree-root entry yields no tree at all.
    let store = build_store();
    let empty_catalog = HashMap::new();
    let parsed = StructTree::from_catalog(&empty_catalog, &store).unwrap();
    assert!(parsed.is_none());
}
#[test]
fn test_empty_struct_tree_root() {
    // An empty root dictionary gives a tree with no elements and no role map.
    let store = build_store();
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(
        Name::struct_tree_root(),
        Object::Dictionary(HashMap::new()),
    );
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    assert!(parsed.root_elements.is_empty());
    assert!(parsed.role_map.is_empty());
}
#[test]
fn test_basic_structure_tree_document_with_paragraphs() {
    // Document element with two paragraph kids.
    let store = build_store();
    let paragraphs = vec![
        Object::Dictionary(struct_elem_dict("P")),
        Object::Dictionary(struct_elem_dict("P")),
    ];
    let mut document = struct_elem_dict("Document");
    document.insert(Name::k(), Object::Array(paragraphs));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(document));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));

    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    assert_eq!(parsed.root_elements.len(), 1);
    let document_elem = &parsed.root_elements[0];
    assert_eq!(document_elem.struct_type, "Document");
    assert_eq!(document_elem.children.len(), 2);
    for paragraph in &document_elem.children {
        assert_eq!(paragraph.struct_type, "P");
    }
}
#[test]
fn test_mcid_from_integer_in_k() {
    // A bare integer under the kids key is read as the element's MCID.
    let store = build_store();
    let page = ObjectId::new(5, 0);
    let mut paragraph = struct_elem_dict("P");
    paragraph.insert(Name::pg(), Object::Reference(page));
    paragraph.insert(Name::k(), Object::Integer(42));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(paragraph));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));

    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    assert_eq!(parsed.root_elements.len(), 1);
    let only = &parsed.root_elements[0];
    assert_eq!(only.mcids, vec![42]);
    assert_eq!(only.page_ref, Some(ObjectId::new(5, 0)));
}
#[test]
fn test_mcid_from_dict_in_k() {
    // A kid dictionary carrying an MCID entry contributes that MCID.
    let store = build_store();
    let mut content_ref = HashMap::new();
    content_ref.insert(Name::mcid(), Object::Integer(7));

    let mut span = struct_elem_dict("Span");
    span.insert(Name::pg(), Object::Reference(ObjectId::new(3, 0)));
    span.insert(Name::k(), Object::Dictionary(content_ref));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(span));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));

    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    assert_eq!(parsed.root_elements[0].mcids, vec![7]);
}
#[test]
fn test_alt_text_extraction() {
    // The alternate-text entry of a figure ends up in `alt_text`.
    let store = build_store();
    let mut figure = struct_elem_dict("Figure");
    figure.insert(Name::alt(), str_obj("A photo of a cat"));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(figure));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));

    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    let alt = parsed.root_elements[0].alt_text.as_deref();
    assert_eq!(alt, Some("A photo of a cat"));
}
#[test]
fn test_nested_structure_elements() {
    // Document > P > Span, each level nested through a single-dictionary kid.
    let store = build_store();
    let mut paragraph = struct_elem_dict("P");
    paragraph.insert(Name::k(), Object::Dictionary(struct_elem_dict("Span")));
    let mut document = struct_elem_dict("Document");
    document.insert(Name::k(), Object::Dictionary(paragraph));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(document));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));

    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    let document_elem = &parsed.root_elements[0];
    assert_eq!(document_elem.struct_type, "Document");
    let paragraph_elem = &document_elem.children[0];
    assert_eq!(paragraph_elem.struct_type, "P");
    let span_elem = &paragraph_elem.children[0];
    assert_eq!(span_elem.struct_type, "Span");
}
#[test]
fn test_role_mapping() {
    // Role map entries are exposed as plain string pairs.
    let store = build_store();
    let mut roles = HashMap::new();
    roles.insert(Name::from("CustomH"), name_obj("H1"));
    roles.insert(Name::from("MyTag"), name_obj("P"));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::role_map(), Object::Dictionary(roles));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));

    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    let role_map = &parsed.role_map;
    assert_eq!(role_map.get("MyTag"), Some(&"P".to_string()));
    assert_eq!(role_map.get("CustomH"), Some(&"H1".to_string()));
}
#[test]
fn test_mixed_k_content_dicts_and_integers() {
    // Kids array mixing a bare MCID, an MCID dictionary and a child element.
    let store = build_store();
    let mut content_ref = HashMap::new();
    content_ref.insert(Name::mcid(), Object::Integer(3));
    let kids = vec![
        Object::Integer(1),
        Object::Dictionary(content_ref),
        Object::Dictionary(struct_elem_dict("Span")),
    ];

    let mut paragraph = struct_elem_dict("P");
    paragraph.insert(Name::pg(), Object::Reference(ObjectId::new(10, 0)));
    paragraph.insert(Name::k(), Object::Array(kids));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(paragraph));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));

    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    let paragraph_elem = &parsed.root_elements[0];
    assert_eq!(paragraph_elem.struct_type, "P");
    assert_eq!(paragraph_elem.mcids, vec![1, 3]);
    assert_eq!(paragraph_elem.children.len(), 1);
    assert_eq!(paragraph_elem.children[0].struct_type, "Span");
}
#[test]
fn test_page_structure_filtering() {
    // Three paragraphs: two on the first page, one on the second.
    let store = build_store();
    let first_page = ObjectId::new(5, 0);
    let second_page = ObjectId::new(6, 0);
    let paragraph_on = |page: ObjectId| {
        let mut dict = struct_elem_dict("P");
        dict.insert(Name::pg(), Object::Reference(page));
        Object::Dictionary(dict)
    };

    let mut document = struct_elem_dict("Document");
    document.insert(
        Name::k(),
        Object::Array(vec![
            paragraph_on(first_page),
            paragraph_on(second_page),
            paragraph_on(first_page),
        ]),
    );
    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(document));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let on_first = PageStructure::for_page(&parsed, first_page);
    assert_eq!(on_first.elements.len(), 2);
    for found in &on_first.elements {
        assert_eq!(found.page_ref, Some(first_page));
    }
    let on_second = PageStructure::for_page(&parsed, second_page);
    assert_eq!(on_second.elements.len(), 1);
}
#[test]
fn test_mcid_mapping_lookup() {
    // One paragraph owning MCIDs 0 and 1 on a single page.
    let store = build_store();
    let page = ObjectId::new(7, 0);
    let mut paragraph = struct_elem_dict("P");
    paragraph.insert(
        Name::k(),
        Object::Array(vec![Object::Integer(0), Object::Integer(1)]),
    );
    paragraph.insert(Name::pg(), Object::Reference(page));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(paragraph));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let lookup = McidMapping::from_struct_tree(&parsed);
    for &mcid in &[0_i64, 1] {
        let owner = lookup.element_for_mcid(page, mcid).unwrap();
        assert_eq!(owner.struct_type, "P");
    }
    // Unknown MCID on a known page, and known MCID on an unknown page.
    assert!(lookup.element_for_mcid(page, 99).is_none());
    assert!(lookup.element_for_mcid(ObjectId::new(999, 0), 0).is_none());
}
#[test]
fn test_security_limit_truncates_large_tree() {
    // More kids than the element cap allows; parsing must truncate, not fail.
    let store = build_store();
    let kid_count = MAX_ELEMENTS + 10;
    let kids: Vec<Object> = std::iter::repeat_with(|| Object::Dictionary(struct_elem_dict("P")))
        .take(kid_count)
        .collect();
    let mut document = struct_elem_dict("Document");
    document.insert(Name::k(), Object::Array(kids));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(document));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let parsed_total = count_elements(&parsed.root_elements);
    assert!(parsed_total <= MAX_ELEMENTS + 1);
}
// Counts every element reachable from `roots`, the roots included.
fn count_elements(roots: &[StructElement]) -> usize {
    let mut pending: Vec<&StructElement> = roots.iter().collect();
    let mut visited = 0;
    while let Some(node) = pending.pop() {
        visited += 1;
        pending.extend(node.children.iter());
    }
    visited
}
#[test]
fn test_title_and_id_extraction() {
    // Title and element id strings are surfaced on the parsed element.
    let store = build_store();
    let mut table = struct_elem_dict("Table");
    table.insert(Name::id(), str_obj("table-001"));
    table.insert(Name::t(), str_obj("Sales Data 2026"));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(table));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let table_elem = &parsed.root_elements[0];
    assert_eq!(table_elem.title.as_deref(), Some("Sales Data 2026"));
    assert_eq!(table_elem.id.as_deref(), Some("table-001"));
}
#[test]
fn test_role_map_name_for_lookup() {
    // Mapped names are translated; unmapped names come back unchanged.
    let store = build_store();
    let mut roles = HashMap::new();
    roles.insert(Name::from("CustomH"), name_obj("H1"));
    roles.insert(Name::from("MyTag"), name_obj("P"));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::role_map(), Object::Dictionary(roles));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let expectations = [
        ("MyTag", "P"),
        ("CustomH", "H1"),
        ("P", "P"),
        ("UnknownTag", "UnknownTag"),
    ];
    for &(input, expected) in expectations.iter() {
        assert_eq!(parsed.role_map_name_for(input), expected);
    }
}
#[test]
fn test_actual_text_and_lang() {
    // Replacement text and language strings are surfaced on the parsed element.
    let store = build_store();
    let mut span = struct_elem_dict("Span");
    span.insert(Name::lang(), str_obj("en-US"));
    span.insert(Name::actual_text(), str_obj("Hello World"));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(span));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let span_elem = &parsed.root_elements[0];
    assert_eq!(span_elem.actual_text.as_deref(), Some("Hello World"));
    assert_eq!(span_elem.lang.as_deref(), Some("en-US"));
}
#[test]
fn test_struct_element_with_attributes() {
    // A single attribute dictionary owned by "Layout" with two extra entries.
    let store = build_store();
    let mut layout = HashMap::new();
    layout.insert(Name::o(), name_obj("Layout"));
    layout.insert(Name::from("SpaceBefore"), Object::Real(12.0));
    layout.insert(Name::from("WritingMode"), name_obj("LrTb"));

    let mut cell = struct_elem_dict("TD");
    cell.insert(Name::a(), Object::Dictionary(layout));
    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(cell));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let cell_elem = &parsed.root_elements[0];
    assert_eq!(cell_elem.struct_type, "TD");
    assert_eq!(cell_elem.attributes.len(), 1);
    let attribute = &cell_elem.attributes[0];
    assert_eq!(attribute.owner, "Layout");
    assert!(attribute.entries.len() >= 2);

    let writing_mode = attribute.entries.iter().find(|(k, _)| k == "WritingMode");
    assert!(writing_mode.is_some());
    let (_, value) = writing_mode.unwrap();
    if let AttributeValue::Name(n) = value {
        assert_eq!(n, "LrTb");
    } else {
        panic!("expected Name attribute value");
    }
}
#[test]
fn test_struct_element_with_attribute_array() {
    // Two attribute dictionaries in an array keep their order.
    let store = build_store();
    let mut layout = HashMap::new();
    layout.insert(Name::o(), name_obj("Layout"));
    layout.insert(Name::from("TextAlign"), name_obj("Center"));
    let mut table = HashMap::new();
    table.insert(Name::o(), name_obj("Table"));
    table.insert(Name::from("RowSpan"), Object::Integer(2));
    let attribute_list = vec![Object::Dictionary(layout), Object::Dictionary(table)];

    let mut cell = struct_elem_dict("TD");
    cell.insert(Name::a(), Object::Array(attribute_list));
    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(cell));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let attributes = &parsed.root_elements[0].attributes;
    assert_eq!(attributes.len(), 2);
    assert_eq!(attributes[0].owner, "Layout");
    assert_eq!(attributes[1].owner, "Table");
}
#[test]
fn test_struct_element_no_attributes() {
    // An element without an attribute entry has an empty attribute list.
    let store = build_store();
    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(struct_elem_dict("P")));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    let paragraph_elem = &parsed.root_elements[0];
    assert!(paragraph_elem.attributes.is_empty());
}
#[test]
fn test_struct_element_obj_type_none_by_default() {
    // Without an object-type entry, `obj_type` stays unset.
    let store = build_store();
    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(struct_elem_dict("P")));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    let paragraph_elem = &parsed.root_elements[0];
    assert!(paragraph_elem.obj_type.is_none());
}
#[test]
fn test_struct_element_obj_type_parsed() {
    // An explicit object-type name is surfaced as `obj_type`.
    let store = build_store();
    let mut span = struct_elem_dict("Span");
    span.insert(Name::obj_type(), Object::Name(Name::from("Elem")));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(span));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();
    let obj_type = parsed.root_elements[0].obj_type.as_deref();
    assert_eq!(obj_type, Some("Elem"));
}
#[test]
fn test_elements_for_page_ref_filters_correctly() {
    // P and Span live on the first page, H1 on the second.
    let store = build_store();
    let first_page = ObjectId::new(5, 0);
    let second_page = ObjectId::new(6, 0);
    let tagged_on = |tag: &str, page: ObjectId| {
        let mut dict = struct_elem_dict(tag);
        dict.insert(Name::pg(), Object::Reference(page));
        Object::Dictionary(dict)
    };

    let mut document = struct_elem_dict("Document");
    document.insert(
        Name::k(),
        Object::Array(vec![
            tagged_on("P", first_page),
            tagged_on("H1", second_page),
            tagged_on("Span", first_page),
        ]),
    );
    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(document));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let on_first: Vec<&StructElement> = parsed.elements_for_page_ref(first_page).collect();
    assert_eq!(on_first.len(), 2);
    assert!(on_first.iter().all(|e| e.page_ref == Some(first_page)));
    let first_types: Vec<&str> = on_first.iter().map(|e| e.struct_type.as_str()).collect();
    assert!(first_types.contains(&"P"));
    assert!(first_types.contains(&"Span"));

    let on_second: Vec<&StructElement> = parsed.elements_for_page_ref(second_page).collect();
    assert_eq!(on_second.len(), 1);
    assert_eq!(on_second[0].struct_type, "H1");

    // A page nothing refers to produces no elements.
    let unused_page = ObjectId::new(99, 0);
    let on_unused: Vec<&StructElement> = parsed.elements_for_page_ref(unused_page).collect();
    assert!(on_unused.is_empty());
}
#[test]
fn test_elements_for_page_with_valid_index() {
    // H1 sits on the page at index 0, P on the page at index 1.
    let store = build_store();
    let first_page = ObjectId::new(10, 0);
    let second_page = ObjectId::new(11, 0);
    let mut heading = struct_elem_dict("H1");
    heading.insert(Name::pg(), Object::Reference(first_page));
    let mut paragraph = struct_elem_dict("P");
    paragraph.insert(Name::pg(), Object::Reference(second_page));

    let mut document = struct_elem_dict("Document");
    document.insert(
        Name::k(),
        Object::Array(vec![
            Object::Dictionary(heading),
            Object::Dictionary(paragraph),
        ]),
    );
    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(document));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let page_order = vec![first_page, second_page];
    let at_index_0: Vec<&StructElement> = parsed.elements_for_page(0, &page_order).collect();
    assert_eq!(at_index_0.len(), 1);
    assert_eq!(at_index_0[0].struct_type, "H1");
    let at_index_1: Vec<&StructElement> = parsed.elements_for_page(1, &page_order).collect();
    assert_eq!(at_index_1.len(), 1);
    assert_eq!(at_index_1[0].struct_type, "P");
}
#[test]
fn test_find_elements_for_mcid_empty() {
    // No elements to search means no matches.
    assert!(find_elements_for_mcid(&[], 42).is_empty());
}
#[test]
fn test_find_elements_for_mcid_found() {
    // Builds an element with the given tag, MCIDs and children; all other fields unset.
    fn element(tag: &str, mcids: Vec<i64>, children: Vec<StructElement>) -> StructElement {
        StructElement {
            struct_type: tag.to_string(),
            obj_type: None,
            alt_text: None,
            actual_text: None,
            lang: None,
            title: None,
            id: None,
            page_ref: None,
            mcids,
            children,
            attributes: Vec::new(),
            parent_index: None,
        }
    }

    // Direct matches report the position of the matching top-level element.
    let siblings = vec![
        element("P", vec![5], Vec::new()),
        element("Span", vec![10], Vec::new()),
    ];
    assert_eq!(find_elements_for_mcid(&siblings, 5), vec![0]);
    assert_eq!(find_elements_for_mcid(&siblings, 10), vec![1]);

    // A match in a child is reported through its top-level ancestor.
    let nested = element(
        "P",
        Vec::new(),
        vec![element("Span", vec![99], Vec::new())],
    );
    assert_eq!(find_elements_for_mcid(&[nested], 99), vec![0]);
}
#[test]
fn test_find_elements_for_mcid_not_found() {
    // An MCID that no element lists gives an empty result.
    let paragraph = StructElement {
        mcids: vec![1, 2, 3],
        struct_type: "P".to_string(),
        children: Vec::new(),
        attributes: Vec::new(),
        page_ref: None,
        parent_index: None,
        obj_type: None,
        alt_text: None,
        actual_text: None,
        lang: None,
        title: None,
        id: None,
    };
    let hits = find_elements_for_mcid(&[paragraph], 999);
    assert!(hits.is_empty());
}
#[test]
fn test_elements_for_page_out_of_range_returns_empty() {
    // Only one page is known; asking for index 5 must yield nothing.
    let store = build_store();
    let only_page = ObjectId::new(20, 0);
    let mut paragraph = struct_elem_dict("P");
    paragraph.insert(Name::pg(), Object::Reference(only_page));

    let mut tree_root = HashMap::new();
    tree_root.insert(Name::k(), Object::Dictionary(paragraph));
    let mut catalog_dict = HashMap::new();
    catalog_dict.insert(Name::struct_tree_root(), Object::Dictionary(tree_root));
    let parsed = StructTree::from_catalog(&catalog_dict, &store)
        .unwrap()
        .unwrap();

    let page_order = vec![only_page];
    let found: Vec<&StructElement> = parsed.elements_for_page(5, &page_order).collect();
    assert!(found.is_empty());
}
}