use super::types::{
ActualTextIndex, McidScope, StructChild, StructElem, StructTreeRoot, StructType,
};
use crate::error::Error;
use std::sync::Arc;
/// Role that a piece of content plays inside a list, as tracked while the
/// structure tree is traversed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListRole {
/// Content under an `LI` element when no list role was already inherited.
LI,
/// Content under an `Lbl` element.
Lbl,
/// Content under an `LBody` element.
LBody,
}
/// One entry of the reading-order output produced by the traversal functions
/// in this file: either a marked-content reference or a word-break marker,
/// together with the context inherited from its ancestors.
#[derive(Debug, Clone)]
pub struct OrderedContent {
/// Page of the marked-content reference, or the page a word-break marker
/// was emitted for.
pub page: u32,
/// Marked-content ID; `None` for word-break markers.
pub mcid: Option<u32>,
/// `Debug` rendering of the emitting element's structure type.
pub struct_type: String,
/// Structure type of the emitting element.
pub parsed_type: StructType,
/// `true` for marked content that has a heading level in effect (its own
/// or an inherited one); always `false` on word-break markers.
pub is_heading: bool,
/// Heading level in effect for marked content; `None` on word-break markers.
pub heading_level: Option<u8>,
/// List role in effect, if any.
pub list_role: Option<ListRole>,
/// Value of `is_block()` on the emitting element's structure type; always
/// `false` on word-break markers.
pub is_block: bool,
/// `true` only for word-break markers.
pub is_word_break: bool,
/// Identifier of the innermost block-opening element on the path to this
/// item; `0` when no such element has been entered.
pub block_id: u32,
/// `true` when the element or one of its ancestors is a table element.
pub in_table: bool,
/// `true` when the element or one of its ancestors is a `Code` element.
pub preformatted: bool,
/// Block identifier of the innermost `Sect`, `Art` or `Part` on the path
/// to this item, if any.
pub section_id: Option<u32>,
/// The traversals in this file always store `None` here.
/// NOTE(review): presumably filled in elsewhere or superseded by
/// `ActualTextIndex` — confirm.
pub actual_text: Option<String>,
/// Scope of the marked-content reference; `None` for word-break markers.
pub mcid_scope: Option<McidScope>,
}
/// Context handed down from a structure element to its descendants during
/// traversal. The `Default` value (all `None` / `0` / `false`) is the context
/// the root elements start from.
#[derive(Debug, Clone, Copy, Default)]
struct InheritedContext {
/// Heading level in effect, if a heading element is on the path.
heading_level: Option<u8>,
/// List role in effect, if a list item element is on the path.
list_role: Option<ListRole>,
/// Identifier of the innermost block-opening element; `0` before any.
block_id: u32,
/// Block identifier of the innermost `Sect`, `Art` or `Part`, if any.
section_id: Option<u32>,
/// Whether a table element is on the path.
in_table: bool,
/// Whether a `Code` element is on the path.
preformatted: bool,
}
impl InheritedContext {
    /// Reports whether `t` opens a new block, i.e. whether
    /// [`descend`](Self::descend) allocates a fresh `block_id` for it.
    fn is_paragraph_block(t: &StructType) -> bool {
        matches!(
            t,
            // Grouping elements.
            StructType::Part
                | StructType::Art
                | StructType::Sect
                | StructType::Div
                // Paragraph-like elements and headings.
                | StructType::P
                | StructType::H
                | StructType::H1
                | StructType::H2
                | StructType::H3
                | StructType::H4
                | StructType::H5
                | StructType::H6
                | StructType::Note
                | StructType::Reference
                | StructType::BibEntry
                | StructType::Code
                // List item parts.
                | StructType::LI
                | StructType::Lbl
                | StructType::LBody
                // Table rows and cells.
                | StructType::TR
                | StructType::TH
                | StructType::TD
        )
    }

    /// Computes the context that applies inside an element of type `child`
    /// whose parent context is `self`.
    ///
    /// `counter` is the running block counter shared by the whole traversal;
    /// it is incremented exactly when `child` opens a new block.
    fn descend(self, child: &StructType, counter: &mut u32) -> Self {
        // Allocate the block id first: a section records its own block id.
        let block_id = if Self::is_paragraph_block(child) {
            *counter += 1;
            *counter
        } else {
            self.block_id
        };

        let section_id = if matches!(
            child,
            StructType::Sect | StructType::Art | StructType::Part
        ) {
            Some(block_id)
        } else {
            self.section_id
        };

        let heading_level = match child {
            // A generic heading keeps an inherited level and otherwise
            // counts as level 1.
            StructType::H => self.heading_level.or(Some(1)),
            StructType::H1 => Some(1),
            StructType::H2 => Some(2),
            StructType::H3 => Some(3),
            StructType::H4 => Some(4),
            StructType::H5 => Some(5),
            StructType::H6 => Some(6),
            _ => self.heading_level,
        };

        let list_role = match child {
            // An item only claims the `LI` role when nothing more specific
            // was inherited.
            StructType::LI => self.list_role.or(Some(ListRole::LI)),
            StructType::Lbl => Some(ListRole::Lbl),
            StructType::LBody => Some(ListRole::LBody),
            // Everything else, the list container `L` included, passes the
            // inherited role through unchanged.
            _ => self.list_role,
        };

        let enters_table = matches!(
            child,
            StructType::Table
                | StructType::THead
                | StructType::TBody
                | StructType::TFoot
                | StructType::TR
                | StructType::TH
                | StructType::TD
        );
        let enters_code = matches!(child, StructType::Code);

        Self {
            heading_level,
            list_role,
            block_id,
            section_id,
            in_table: self.in_table || enters_table,
            preformatted: self.preformatted || enters_code,
        }
    }
}
/// Walks the whole structure tree in document order and returns the items
/// that belong to `page_num`.
///
/// Block identifiers are numbered by a single counter shared across all root
/// elements, starting from a default (empty) inherited context.
///
/// # Errors
///
/// Propagates any error reported by the per-element traversal.
pub fn traverse_structure_tree(
    struct_tree: &StructTreeRoot,
    page_num: u32,
) -> Result<Vec<OrderedContent>, Error> {
    let mut ordered = Vec::new();
    let mut next_block = 0u32;
    struct_tree.root_elements.iter().try_for_each(|root| {
        traverse_element(
            root,
            page_num,
            InheritedContext::default(),
            &mut next_block,
            &mut ordered,
        )
    })?;
    Ok(ordered)
}
/// Walks the whole structure tree once and groups the emitted items by page.
///
/// Within each page's vector the items keep the order in which the traversal
/// encountered them. Pages for which nothing was emitted have no entry.
pub fn traverse_structure_tree_all_pages(
    struct_tree: &StructTreeRoot,
) -> std::collections::HashMap<u32, Vec<OrderedContent>> {
    let mut by_page = std::collections::HashMap::new();
    // One counter for the whole tree so block ids are unique across roots.
    let mut next_block = 0u32;
    for root in struct_tree.root_elements.iter() {
        traverse_element_all_pages(
            root,
            InheritedContext::default(),
            &mut next_block,
            &mut by_page,
        );
    }
    by_page
}
fn traverse_element_all_pages(
elem: &StructElem,
ctx: InheritedContext,
block_counter: &mut u32,
result: &mut std::collections::HashMap<u32, Vec<OrderedContent>>,
) {
let struct_type_str = format!("{:?}", elem.struct_type);
let parsed_type = elem.struct_type.clone();
let descended = ctx.descend(&parsed_type, block_counter);
let is_heading_inherited = descended.heading_level.is_some();
let is_block = elem.struct_type.is_block();
let is_word_break = elem.struct_type.is_word_break();
for child in &elem.children {
match child {
StructChild::MarkedContentRef {
mcid,
page,
scope: mcid_scope,
} => {
result.entry(*page).or_default().push(OrderedContent {
page: *page,
mcid: Some(*mcid),
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading: is_heading_inherited,
heading_level: descended.heading_level,
list_role: descended.list_role,
is_block,
is_word_break: false,
block_id: descended.block_id,
section_id: descended.section_id,
in_table: descended.in_table,
preformatted: descended.preformatted,
actual_text: None,
mcid_scope: Some(mcid_scope.clone()),
});
},
StructChild::StructElem(child_elem) => {
if is_word_break {
let child_pages = collect_pages(child_elem);
for page in child_pages {
result.entry(page).or_default().push(OrderedContent {
page,
mcid: None,
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading: false,
heading_level: None,
list_role: descended.list_role,
is_block: false,
is_word_break: true,
block_id: descended.block_id,
section_id: descended.section_id,
in_table: descended.in_table,
preformatted: descended.preformatted,
actual_text: None,
mcid_scope: None,
});
}
}
traverse_element_all_pages(child_elem, descended, block_counter, result);
},
StructChild::ObjectRef(_obj_num, _gen) => {
log::debug!("Skipping unresolved ObjectRef({}, {})", _obj_num, _gen);
},
}
}
}
/// Returns the distinct pages referenced anywhere in `elem`'s subtree, in
/// ascending order.
fn collect_pages(elem: &StructElem) -> Vec<u32> {
    let mut seen = Vec::new();
    collect_pages_recursive(elem, &mut seen);
    // An ordered set both sorts and removes duplicates.
    seen.into_iter()
        .collect::<std::collections::BTreeSet<u32>>()
        .into_iter()
        .collect()
}
/// Appends to `pages` every page mentioned in `elem`'s subtree: the element's
/// own `page` when set, and the page of each marked-content reference.
/// Duplicates are not removed and no ordering is applied.
fn collect_pages_recursive(elem: &StructElem, pages: &mut Vec<u32>) {
    // `Option` iterates over zero or one values.
    pages.extend(elem.page);
    for child in elem.children.iter() {
        match child {
            StructChild::MarkedContentRef { page, .. } => pages.push(*page),
            StructChild::StructElem(nested) => collect_pages_recursive(nested, pages),
            // Unresolved references carry no page information.
            StructChild::ObjectRef(..) => {},
        }
    }
}
fn traverse_element(
elem: &StructElem,
target_page: u32,
ctx: InheritedContext,
block_counter: &mut u32,
result: &mut Vec<OrderedContent>,
) -> Result<(), Error> {
let struct_type_str = format!("{:?}", elem.struct_type);
let parsed_type = elem.struct_type.clone();
let descended = ctx.descend(&parsed_type, block_counter);
let is_heading_inherited = descended.heading_level.is_some();
let is_block = elem.struct_type.is_block();
let is_word_break = elem.struct_type.is_word_break();
if is_word_break {
result.push(OrderedContent {
page: target_page,
mcid: None,
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading: false,
heading_level: None,
list_role: descended.list_role,
is_block: false,
is_word_break: true,
block_id: descended.block_id,
section_id: descended.section_id,
in_table: descended.in_table,
preformatted: descended.preformatted,
actual_text: None,
mcid_scope: None,
});
}
for child in &elem.children {
match child {
StructChild::MarkedContentRef {
mcid,
page,
scope: mcid_scope,
} => {
if *page == target_page {
result.push(OrderedContent {
page: *page,
mcid: Some(*mcid),
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading: is_heading_inherited,
heading_level: descended.heading_level,
list_role: descended.list_role,
is_block,
is_word_break: false,
block_id: descended.block_id,
section_id: descended.section_id,
in_table: descended.in_table,
preformatted: descended.preformatted,
actual_text: None,
mcid_scope: Some(mcid_scope.clone()),
});
}
},
StructChild::StructElem(child_elem) => {
traverse_element(child_elem, target_page, descended, block_counter, result)?;
},
StructChild::ObjectRef(_obj_num, _gen) => {
log::debug!("Skipping unresolved ObjectRef({}, {})", _obj_num, _gen);
},
}
}
Ok(())
}
/// Test-only helper: reports whether `elem` itself is assigned to
/// `target_page` or any marked-content reference in its subtree lies on it.
#[cfg(test)]
fn has_content_on_page(elem: &StructElem, target_page: u32) -> bool {
    elem.page == Some(target_page)
        || elem.children.iter().any(|child| match child {
            StructChild::MarkedContentRef { page, .. } => *page == target_page,
            StructChild::StructElem(nested) => has_content_on_page(nested, target_page),
            StructChild::ObjectRef(..) => false,
        })
}
/// Builds the ActualText index for the whole structure tree by walking every
/// root element with no inherited ActualText scope.
pub fn build_actualtext_index(struct_tree: &StructTreeRoot) -> ActualTextIndex {
    let mut index = ActualTextIndex::new();
    struct_tree
        .root_elements
        .iter()
        .for_each(|root| walk_actualtext(root, None, &mut index));
    index
}
/// ActualText replacement currently in effect while walking the tree.
#[derive(Clone)]
struct ActiveScope {
/// The non-empty replacement text of the element that opened the scope.
text: Arc<str>,
/// Page of the first page-scoped marked-content reference in that
/// element's subtree, if there is one.
first_page: Option<u32>,
}
fn walk_actualtext(elem: &StructElem, inherited: Option<ActiveScope>, idx: &mut ActualTextIndex) {
let own_text: Option<Arc<str>> = elem
.actual_text
.as_deref()
.filter(|s| !s.is_empty())
.map(Arc::from);
let active = if let Some(text) = own_text {
if has_any_mcr(elem) {
Some(ActiveScope {
text,
first_page: first_page_in_subtree(elem),
})
} else {
None
}
} else {
None
};
let scope = active.clone().or(inherited.clone());
for child in &elem.children {
match child {
StructChild::MarkedContentRef {
mcid,
page,
scope: mcid_scope,
} => {
if let Some(ref s) = scope {
let key = (mcid_scope.clone(), *mcid);
idx.covered_mcids.insert(key.clone());
let should_emit = match mcid_scope {
crate::structure::McidScope::Page(_) => s.first_page == Some(*page),
crate::structure::McidScope::Form(_)
| crate::structure::McidScope::Pattern(_) => true,
};
if should_emit {
idx.mcid_to_actual_text.insert(key, s.text.clone());
} else {
idx.suppress_only.insert(key);
}
}
},
StructChild::StructElem(child_elem) => {
walk_actualtext(child_elem, scope.clone(), idx);
},
StructChild::ObjectRef(_, _) => {
},
}
}
}
fn first_page_in_subtree(elem: &StructElem) -> Option<u32> {
for child in &elem.children {
match child {
StructChild::MarkedContentRef { page, scope, .. } => {
if matches!(scope, crate::structure::McidScope::Page(_)) {
return Some(*page);
}
},
StructChild::StructElem(c) => {
if let Some(p) = first_page_in_subtree(c) {
return Some(p);
}
},
StructChild::ObjectRef(_, _) => {},
}
}
None
}
/// Reports whether `elem`'s subtree contains at least one marked-content
/// reference of any scope.
fn has_any_mcr(elem: &StructElem) -> bool {
    elem.children.iter().any(|child| match child {
        StructChild::MarkedContentRef { .. } => true,
        StructChild::StructElem(nested) => has_any_mcr(nested),
        StructChild::ObjectRef(..) => false,
    })
}
/// Returns the marked-content IDs on `page_num` in structure-tree order.
///
/// Items without an ID (word-break markers) are dropped.
///
/// # Errors
///
/// Propagates any error from [`traverse_structure_tree`].
pub fn extract_reading_order(
    struct_tree: &StructTreeRoot,
    page_num: u32,
) -> Result<Vec<u32>, Error> {
    let mcids: Vec<u32> = traverse_structure_tree(struct_tree, page_num)?
        .into_iter()
        .filter_map(|item| item.mcid)
        .collect();
    Ok(mcids)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::structure::types::{StructChild, StructElem, StructType};
#[test]
fn test_simple_traversal() {
let mut root = StructElem::new(StructType::Document);
let mut p1 = StructElem::new(StructType::P);
p1.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut p2 = StructElem::new(StructType::P);
p2.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
root.add_child(StructChild::StructElem(Box::new(p1)));
root.add_child(StructChild::StructElem(Box::new(p2)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order, vec![0, 1]);
}
#[test]
fn test_page_filtering() {
let mut root = StructElem::new(StructType::Document);
let mut p1 = StructElem::new(StructType::P);
p1.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut p2 = StructElem::new(StructType::P);
p2.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 1,
scope: crate::structure::McidScope::Page(1),
});
root.add_child(StructChild::StructElem(Box::new(p1)));
root.add_child(StructChild::StructElem(Box::new(p2)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order_page_0 = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order_page_0, vec![0]);
let order_page_1 = extract_reading_order(&struct_tree, 1).unwrap();
assert_eq!(order_page_1, vec![1]);
}
#[test]
fn test_nested_structure() {
let mut root = StructElem::new(StructType::Document);
let mut sect = StructElem::new(StructType::Sect);
let mut h1 = StructElem::new(StructType::H1);
h1.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut p = StructElem::new(StructType::P);
p.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
sect.add_child(StructChild::StructElem(Box::new(h1)));
sect.add_child(StructChild::StructElem(Box::new(p)));
root.add_child(StructChild::StructElem(Box::new(sect)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order, vec![0, 1]);
}
#[test]
fn test_word_break_elements() {
let mut root = StructElem::new(StructType::P);
let mut span1 = StructElem::new(StructType::Span);
span1.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let wb = StructElem::new(StructType::WB);
let mut span2 = StructElem::new(StructType::Span);
span2.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
root.add_child(StructChild::StructElem(Box::new(span1)));
root.add_child(StructChild::StructElem(Box::new(wb)));
root.add_child(StructChild::StructElem(Box::new(span2)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert_eq!(ordered.len(), 3); assert_eq!(ordered[0].mcid, Some(0));
assert!(!ordered[0].is_word_break);
assert_eq!(ordered[1].mcid, None); assert!(ordered[1].is_word_break);
assert_eq!(ordered[2].mcid, Some(1));
assert!(!ordered[2].is_word_break);
let mcids = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(mcids, vec![0, 1]); }
#[test]
fn test_empty_tree() {
let struct_tree = StructTreeRoot::new();
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert!(order.is_empty());
}
#[test]
fn test_empty_page() {
let mut root = StructElem::new(StructType::Document);
let mut p = StructElem::new(StructType::P);
p.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
root.add_child(StructChild::StructElem(Box::new(p)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 5).unwrap();
assert!(order.is_empty());
}
#[test]
fn test_nested_heading_propagates_is_heading_to_inner_mcr() {
let mut h1 = StructElem::new(StructType::H1);
let mut span = StructElem::new(StructType::Span);
span.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
h1.add_child(StructChild::StructElem(Box::new(span)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(h1);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
let heading_mcrs: Vec<_> = ordered.iter().filter(|c| c.is_heading).collect();
assert_eq!(
heading_mcrs.len(),
1,
"H1 → Span → MCR must propagate is_heading=true to the inner MCR"
);
assert_eq!(heading_mcrs[0].mcid, Some(0));
let by_page = traverse_structure_tree_all_pages(&struct_tree);
let heading_mcrs_all: Vec<_> = by_page
.get(&0)
.unwrap()
.iter()
.filter(|c| c.is_heading)
.collect();
assert_eq!(heading_mcrs_all.len(), 1);
}
#[test]
fn test_nested_li_lbody_keeps_list_context() {
let mut li = StructElem::new(StructType::LI);
let mut lbody = StructElem::new(StructType::LBody);
lbody.add_child(StructChild::MarkedContentRef {
mcid: 7,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
li.add_child(StructChild::StructElem(Box::new(lbody)));
let mut l = StructElem::new(StructType::L);
l.add_child(StructChild::StructElem(Box::new(li)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(l);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
let li_mcrs: Vec<_> = ordered
.iter()
.filter(|c| matches!(c.list_role, Some(crate::structure::ListRole::LBody)))
.collect();
assert_eq!(
li_mcrs.len(),
1,
"LI → LBody → MCR must carry list_role=LBody on the inner MCR"
);
}
#[test]
fn test_nested_heading_propagates_for_h1_through_h6() {
let levels = [
(StructType::H1, 1u8),
(StructType::H2, 2),
(StructType::H3, 3),
(StructType::H4, 4),
(StructType::H5, 5),
(StructType::H6, 6),
];
for (h_type, expected_level) in levels {
let mut head = StructElem::new(h_type.clone());
let mut sect = StructElem::new(StructType::Sect);
let mut span = StructElem::new(StructType::Span);
span.add_child(StructChild::MarkedContentRef {
mcid: 42,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
sect.add_child(StructChild::StructElem(Box::new(span)));
head.add_child(StructChild::StructElem(Box::new(sect)));
let mut tree = StructTreeRoot::new();
tree.add_root_element(head);
let ordered = traverse_structure_tree(&tree, 0).unwrap();
let item = ordered.iter().find(|c| c.mcid == Some(42)).unwrap();
assert!(
item.is_heading,
"H{} → Sect → Span → MCR must carry is_heading=true",
expected_level
);
assert_eq!(
item.heading_level,
Some(expected_level),
"H{} ancestor must propagate heading_level={}",
expected_level,
expected_level
);
}
}
#[test]
fn test_generic_h_without_level_defaults_to_h1() {
let mut h = StructElem::new(StructType::H);
let mut span = StructElem::new(StructType::Span);
span.add_child(StructChild::MarkedContentRef {
mcid: 9,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
h.add_child(StructChild::StructElem(Box::new(span)));
let mut tree = StructTreeRoot::new();
tree.add_root_element(h);
let ordered = traverse_structure_tree(&tree, 0).unwrap();
let item = ordered.iter().find(|c| c.mcid == Some(9)).unwrap();
assert!(item.is_heading);
assert_eq!(item.heading_level, Some(1));
}
#[test]
fn test_heading_role_does_not_bleed_into_following_paragraph() {
let mut doc = StructElem::new(StructType::Document);
let mut h1 = StructElem::new(StructType::H1);
h1.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut p = StructElem::new(StructType::P);
p.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
doc.add_child(StructChild::StructElem(Box::new(h1)));
doc.add_child(StructChild::StructElem(Box::new(p)));
let mut tree = StructTreeRoot::new();
tree.add_root_element(doc);
let ordered = traverse_structure_tree(&tree, 0).unwrap();
let h_item = ordered.iter().find(|c| c.mcid == Some(0)).unwrap();
let p_item = ordered.iter().find(|c| c.mcid == Some(1)).unwrap();
assert!(h_item.is_heading);
assert!(!p_item.is_heading, "sibling P must not inherit H1's flag");
assert_eq!(p_item.heading_level, None);
}
#[test]
fn test_list_role_variants() {
let mut l = StructElem::new(StructType::L);
let mut li_a = StructElem::new(StructType::LI);
li_a.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut li_b = StructElem::new(StructType::LI);
let mut lbl = StructElem::new(StructType::Lbl);
lbl.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut lbody = StructElem::new(StructType::LBody);
lbody.add_child(StructChild::MarkedContentRef {
mcid: 2,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
li_b.add_child(StructChild::StructElem(Box::new(lbl)));
li_b.add_child(StructChild::StructElem(Box::new(lbody)));
l.add_child(StructChild::StructElem(Box::new(li_a)));
l.add_child(StructChild::StructElem(Box::new(li_b)));
let mut tree = StructTreeRoot::new();
tree.add_root_element(l);
let ordered = traverse_structure_tree(&tree, 0).unwrap();
let m0 = ordered.iter().find(|c| c.mcid == Some(0)).unwrap();
let m1 = ordered.iter().find(|c| c.mcid == Some(1)).unwrap();
let m2 = ordered.iter().find(|c| c.mcid == Some(2)).unwrap();
assert!(matches!(m0.list_role, Some(ListRole::LI)));
assert!(matches!(m1.list_role, Some(ListRole::Lbl)));
assert!(matches!(m2.list_role, Some(ListRole::LBody)));
assert!(!m0.is_heading && !m1.is_heading && !m2.is_heading);
}
#[test]
fn test_block_id_groups_within_block_and_changes_across() {
let mut doc = StructElem::new(StructType::Document);
let mut p1 = StructElem::new(StructType::P);
let mut span_a = StructElem::new(StructType::Span);
span_a.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut span_b = StructElem::new(StructType::Span);
span_b.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
p1.add_child(StructChild::StructElem(Box::new(span_a)));
p1.add_child(StructChild::StructElem(Box::new(span_b)));
let mut p2 = StructElem::new(StructType::P);
p2.add_child(StructChild::MarkedContentRef {
mcid: 2,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
doc.add_child(StructChild::StructElem(Box::new(p1)));
doc.add_child(StructChild::StructElem(Box::new(p2)));
let mut tree = StructTreeRoot::new();
tree.add_root_element(doc);
let ordered = traverse_structure_tree(&tree, 0).unwrap();
let m0 = ordered.iter().find(|c| c.mcid == Some(0)).unwrap();
let m1 = ordered.iter().find(|c| c.mcid == Some(1)).unwrap();
let m2 = ordered.iter().find(|c| c.mcid == Some(2)).unwrap();
assert_eq!(m0.block_id, m1.block_id, "two MCRs inside the same /P must share block_id");
assert_ne!(
m0.block_id, m2.block_id,
"MCRs in different /P elements must have different block_id"
);
assert!(m0.block_id > 0, "block_id should be positive once any block is entered");
}
#[test]
fn test_root_span_has_block_id_zero() {
let mut span = StructElem::new(StructType::Span);
span.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut tree = StructTreeRoot::new();
tree.add_root_element(span);
let ordered = traverse_structure_tree(&tree, 0).unwrap();
assert_eq!(ordered[0].block_id, 0);
}
#[test]
fn test_object_ref_skipped() {
let mut root = StructElem::new(StructType::Document);
root.add_child(StructChild::ObjectRef(42, 0));
root.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order, vec![0]);
}
#[test]
fn test_traverse_all_pages() {
let mut root = StructElem::new(StructType::Document);
let mut p1 = StructElem::new(StructType::P);
p1.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut p2 = StructElem::new(StructType::P);
p2.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 1,
scope: crate::structure::McidScope::Page(1),
});
let mut p3 = StructElem::new(StructType::P);
p3.add_child(StructChild::MarkedContentRef {
mcid: 2,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
root.add_child(StructChild::StructElem(Box::new(p1)));
root.add_child(StructChild::StructElem(Box::new(p2)));
root.add_child(StructChild::StructElem(Box::new(p3)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
assert_eq!(all_pages.len(), 2); assert_eq!(all_pages[&0].len(), 2); assert_eq!(all_pages[&1].len(), 1); }
#[test]
fn test_actual_text_descendants_recorded_for_assembler_suppression() {
let mut root = StructElem::new(StructType::Document);
let mut elem = StructElem::new(StructType::Span);
elem.actual_text = Some("Replacement text".to_string());
elem.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
root.add_child(StructChild::StructElem(Box::new(elem)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert_eq!(ordered.len(), 1);
assert_eq!(ordered[0].mcid, Some(0));
assert_eq!(ordered[0].actual_text, None);
let idx = build_actualtext_index(&struct_tree);
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(0), 0)));
assert_eq!(
idx.mcid_to_actual_text
.get(&(crate::structure::McidScope::Page(0), 0))
.map(|s| &**s),
Some("Replacement text")
);
}
#[test]
fn test_actual_text_wrong_page() {
let mut root = StructElem::new(StructType::Document);
let mut elem = StructElem::new(StructType::Span);
elem.actual_text = Some("Replacement".to_string());
elem.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 1,
scope: crate::structure::McidScope::Page(1),
});
root.add_child(StructChild::StructElem(Box::new(elem)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert!(ordered.is_empty());
let idx = build_actualtext_index(&struct_tree);
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(1), 0)));
assert_eq!(
idx.mcid_to_actual_text
.get(&(crate::structure::McidScope::Page(1), 0))
.map(|s| &**s),
Some("Replacement")
);
}
#[test]
fn test_heading_and_block_flags() {
let mut root = StructElem::new(StructType::Document);
let mut h1 = StructElem::new(StructType::H1);
h1.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut span = StructElem::new(StructType::Span);
span.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
root.add_child(StructChild::StructElem(Box::new(h1)));
root.add_child(StructChild::StructElem(Box::new(span)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert_eq!(ordered.len(), 2);
assert!(ordered[0].is_heading);
assert!(ordered[0].is_block);
assert!(!ordered[1].is_heading);
assert!(!ordered[1].is_block);
}
#[test]
fn test_collect_pages() {
let mut elem = StructElem::new(StructType::Document);
elem.page = Some(0);
let mut child = StructElem::new(StructType::P);
child.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 1,
scope: crate::structure::McidScope::Page(1),
});
child.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 2,
scope: crate::structure::McidScope::Page(2),
});
elem.add_child(StructChild::StructElem(Box::new(child)));
let pages = collect_pages(&elem);
assert_eq!(pages, vec![0, 1, 2]);
}
#[test]
fn test_traverse_all_pages_with_actual_text_does_not_repeat_per_page() {
let mut root = StructElem::new(StructType::Document);
let mut elem = StructElem::new(StructType::Span);
elem.actual_text = Some("Hello".to_string());
elem.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
elem.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 1,
scope: crate::structure::McidScope::Page(1),
});
root.add_child(StructChild::StructElem(Box::new(elem)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
assert!(all_pages.contains_key(&0));
assert!(all_pages.contains_key(&1));
for items in all_pages.values() {
for item in items {
assert!(item.actual_text.is_none());
}
}
let idx = build_actualtext_index(&struct_tree);
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(0), 0)));
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(1), 1)));
assert!(idx
.mcid_to_actual_text
.contains_key(&(crate::structure::McidScope::Page(0), 0)));
assert!(idx
.suppress_only
.contains(&(crate::structure::McidScope::Page(1), 1)));
}
#[test]
fn test_traverse_all_pages_word_break_with_children() {
let mut root = StructElem::new(StructType::P);
let mut wb = StructElem::new(StructType::WB);
let mut child = StructElem::new(StructType::Span);
child.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
wb.add_child(StructChild::StructElem(Box::new(child)));
root.add_child(StructChild::StructElem(Box::new(wb)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
let page0 = &all_pages[&0];
assert!(page0.iter().any(|c| c.is_word_break));
assert!(page0.iter().any(|c| c.mcid == Some(0)));
}
#[test]
fn test_traverse_all_pages_object_ref() {
let mut root = StructElem::new(StructType::Document);
root.add_child(StructChild::ObjectRef(99, 0));
root.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
assert_eq!(all_pages[&0].len(), 1);
assert_eq!(all_pages[&0][0].mcid, Some(0));
}
#[test]
fn test_has_content_on_page_deep() {
let mut root = StructElem::new(StructType::Document);
let mut sect = StructElem::new(StructType::Sect);
let mut p = StructElem::new(StructType::P);
p.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 3,
scope: crate::structure::McidScope::Page(3),
});
sect.add_child(StructChild::StructElem(Box::new(p)));
root.add_child(StructChild::StructElem(Box::new(sect)));
assert!(has_content_on_page(&root, 3));
assert!(!has_content_on_page(&root, 0));
}
#[test]
fn test_actualtext_index_simple_single_mcid() {
let mut span = StructElem::new(StructType::Span);
span.actual_text = Some("fi".to_string());
span.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut tree = StructTreeRoot::new();
tree.add_root_element(span);
let idx = build_actualtext_index(&tree);
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(0), 0)));
assert_eq!(
idx.mcid_to_actual_text
.get(&(crate::structure::McidScope::Page(0), 0))
.map(|s| &**s),
Some("fi")
);
assert!(idx.suppress_only.is_empty());
}
#[test]
fn test_actualtext_index_nested_inner_wins() {
let mut inner = StructElem::new(StructType::Span);
inner.actual_text = Some("inner".to_string());
inner.add_child(StructChild::MarkedContentRef {
mcid: 5,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut outer = StructElem::new(StructType::Span);
outer.actual_text = Some("outer".to_string());
outer.add_child(StructChild::StructElem(Box::new(inner)));
let mut tree = StructTreeRoot::new();
tree.add_root_element(outer);
let idx = build_actualtext_index(&tree);
assert_eq!(
idx.mcid_to_actual_text
.get(&(crate::structure::McidScope::Page(0), 5))
.map(|s| &**s),
Some("inner")
);
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(0), 5)));
}
#[test]
fn test_actualtext_index_nested_outer_sibling_with_inner_subtree() {
let mut inner = StructElem::new(StructType::Span);
inner.actual_text = Some("I".to_string());
inner.add_child(StructChild::MarkedContentRef {
mcid: 0,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut outer = StructElem::new(StructType::Span);
outer.actual_text = Some("O".to_string());
outer.add_child(StructChild::StructElem(Box::new(inner)));
outer.add_child(StructChild::MarkedContentRef {
mcid: 1,
page: 0,
scope: crate::structure::McidScope::Page(0),
});
let mut tree = StructTreeRoot::new();
tree.add_root_element(outer);
let idx = build_actualtext_index(&tree);
assert_eq!(
idx.mcid_to_actual_text
.get(&(crate::structure::McidScope::Page(0), 0))
.map(|s| &**s),
Some("I")
);
assert_eq!(
idx.mcid_to_actual_text
.get(&(crate::structure::McidScope::Page(0), 1))
.map(|s| &**s),
Some("O")
);
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(0), 0)));
assert!(idx
.covered_mcids
.contains(&(crate::structure::McidScope::Page(0), 1)));
}
/// One `ActualText` heading whose marked content spans two pages.
///
/// The children are added page 1 first, then page 0. The test expects both
/// MCIDs to be covered, the text to be attached to the first child added
/// (page 1, MCID 0), and the second child (page 0, MCID 1) to appear only in
/// `suppress_only`.
#[test]
fn test_actualtext_index_multi_page_first_page_emits_others_suppress() {
    let mut heading = StructElem::new(StructType::H1);
    heading.actual_text = Some(String::from("Heading X"));
    heading.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 1,
        scope: crate::structure::McidScope::Page(1),
    });
    heading.add_child(StructChild::MarkedContentRef {
        mcid: 1,
        page: 0,
        scope: crate::structure::McidScope::Page(0),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(heading);
    let index = build_actualtext_index(&tree);

    let emitting_key = (crate::structure::McidScope::Page(1), 0);
    let suppressed_key = (crate::structure::McidScope::Page(0), 1);

    // Both references fall under the ActualText scope.
    assert!(index.covered_mcids.contains(&emitting_key));
    assert!(index.covered_mcids.contains(&suppressed_key));

    // Only the first reference carries the replacement text.
    let emitted = index.mcid_to_actual_text.get(&emitting_key).map(|t| &**t);
    assert_eq!(emitted, Some("Heading X"));

    // The other reference is suppressed without text of its own.
    assert!(index.suppress_only.contains(&suppressed_key));
    assert!(!index.mcid_to_actual_text.contains_key(&suppressed_key));
}
/// A single `ActualText` span with three same-page MCIDs: every MCID must be
/// covered and must resolve to the span's text.
#[test]
fn test_actualtext_index_multi_mcid_subtree() {
    let mcids = [7, 8, 9];

    let mut expanded_span = StructElem::new(StructType::Span);
    expanded_span.actual_text = Some(String::from("expanded"));
    for mcid in mcids {
        expanded_span.add_child(StructChild::MarkedContentRef {
            mcid,
            page: 0,
            scope: crate::structure::McidScope::Page(0),
        });
    }

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(expanded_span);
    let index = build_actualtext_index(&tree);

    for mcid in mcids {
        let key = (crate::structure::McidScope::Page(0), mcid);
        assert!(index.covered_mcids.contains(&key));
        let text = index.mcid_to_actual_text.get(&key).map(|t| &**t);
        assert_eq!(text, Some("expanded"));
    }
}
/// A paragraph with marked content but no `ActualText` must produce an index
/// that reports empty and has no text entries or covered MCIDs.
#[test]
fn test_actualtext_index_no_actualtext_yields_empty() {
    let mut paragraph = StructElem::new(StructType::P);
    paragraph.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Page(0),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(paragraph);
    let index = build_actualtext_index(&tree);

    assert!(index.is_empty());
    assert!(index.mcid_to_actual_text.is_empty());
    assert!(index.covered_mcids.is_empty());
}
/// An `ActualText` that is the empty string must leave the index empty even
/// though the span owns a marked-content reference.
#[test]
fn test_actualtext_index_empty_actualtext_is_ignored() {
    let mut blank_span = StructElem::new(StructType::Span);
    blank_span.actual_text = Some(String::new());
    blank_span.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Page(0),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(blank_span);

    let index = build_actualtext_index(&tree);
    assert!(index.is_empty());
}
/// An `ActualText` span with no children (hence no MCIDs) must not contribute
/// anything to the index.
#[test]
fn test_actualtext_index_no_descendant_mcid_drops_scope() {
    let mut childless = StructElem::new(StructType::Span);
    childless.actual_text = Some(String::from("ghost"));

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(childless);

    let index = build_actualtext_index(&tree);
    assert!(index.is_empty());
}
/// `ActualText` on a `Figure` element is indexed under the figure's MCID.
#[test]
fn test_actualtext_index_figure_with_actualtext() {
    let mut figure = StructElem::new(StructType::Figure);
    figure.actual_text = Some(String::from("logo text"));
    figure.add_child(StructChild::MarkedContentRef {
        mcid: 4,
        page: 2,
        scope: crate::structure::McidScope::Page(2),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(figure);
    let index = build_actualtext_index(&tree);

    let figure_key = (crate::structure::McidScope::Page(2), 4);
    let figure_text = index.mcid_to_actual_text.get(&figure_key).map(|t| &**t);
    assert_eq!(figure_text, Some("logo text"));
}
/// Two elements reuse MCID 0 on different pages; only the heading carries
/// `ActualText`. The page-0 entry must be covered and mapped to the heading
/// text, while the page-1 entry must appear in neither `covered_mcids` nor
/// `suppress_only`.
#[test]
fn test_actualtext_index_cross_page_mcid_collision() {
    // Heading with ActualText: MCID 0 on page 0.
    let mut heading = StructElem::new(StructType::H1);
    heading.actual_text = Some(String::from("Heading"));
    heading.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Page(0),
    });

    // Plain paragraph without ActualText: MCID 0 again, but on page 1.
    let mut paragraph = StructElem::new(StructType::P);
    paragraph.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 1,
        scope: crate::structure::McidScope::Page(1),
    });

    let mut document = StructElem::new(StructType::Document);
    document.add_child(StructChild::StructElem(Box::new(heading)));
    document.add_child(StructChild::StructElem(Box::new(paragraph)));

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(document);
    let index = build_actualtext_index(&tree);

    let heading_key = (crate::structure::McidScope::Page(0), 0);
    let paragraph_key = (crate::structure::McidScope::Page(1), 0);

    assert!(index.covered_mcids.contains(&heading_key));
    assert!(!index.covered_mcids.contains(&paragraph_key));
    assert!(!index.suppress_only.contains(&paragraph_key));

    let heading_text = index.mcid_to_actual_text.get(&heading_key).map(|t| &**t);
    assert_eq!(heading_text, Some("Heading"));
}
/// Two spans use MCID 0 on the same page, each scoped to a different form
/// object. Each form-scoped key must be covered and keep its own text.
#[test]
fn two_forms_with_same_mcid_on_same_page_do_not_collide() {
    let first_form = crate::object::ObjectRef::new(100, 0);
    let second_form = crate::object::ObjectRef::new(101, 0);

    let mut first_span = StructElem::new(StructType::Span);
    first_span.actual_text = Some(String::from("X"));
    first_span.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Form(first_form),
    });

    let mut second_span = StructElem::new(StructType::Span);
    second_span.actual_text = Some(String::from("Y"));
    second_span.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Form(second_form),
    });

    let mut document = StructElem::new(StructType::Document);
    document.add_child(StructChild::StructElem(Box::new(first_span)));
    document.add_child(StructChild::StructElem(Box::new(second_span)));

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(document);
    let index = build_actualtext_index(&tree);

    let first_key = (crate::structure::McidScope::Form(first_form), 0);
    let second_key = (crate::structure::McidScope::Form(second_form), 0);

    assert!(index.covered_mcids.contains(&first_key));
    assert!(index.covered_mcids.contains(&second_key));

    let first_text = index.mcid_to_actual_text.get(&first_key).map(|t| &**t);
    assert_eq!(first_text, Some("X"));
    let second_text = index.mcid_to_actual_text.get(&second_key).map(|t| &**t);
    assert_eq!(second_text, Some("Y"));
}
/// A form-scoped marked-content reference is indexed under the form scope
/// and must not leak into the page scope for the same page and MCID.
#[test]
fn actualtext_with_stm_form_resolves_to_form_scope() {
    let form = crate::object::ObjectRef::new(42, 0);

    let mut alt_span = StructElem::new(StructType::Span);
    alt_span.actual_text = Some(String::from("alt"));
    alt_span.add_child(StructChild::MarkedContentRef {
        mcid: 3,
        page: 0,
        scope: crate::structure::McidScope::Form(form),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(alt_span);
    let index = build_actualtext_index(&tree);

    let form_key = (crate::structure::McidScope::Form(form), 3);
    let page_key = (crate::structure::McidScope::Page(0), 3);

    assert!(index.covered_mcids.contains(&form_key));
    let form_text = index.mcid_to_actual_text.get(&form_key).map(|t| &**t);
    assert_eq!(form_text, Some("alt"));

    // The page-scoped key with the same MCID must stay untouched.
    assert!(!index.covered_mcids.contains(&page_key));
}
/// A pattern-scoped marked-content reference is covered and mapped to its
/// text under the pattern scope.
#[test]
fn actualtext_with_stm_pattern_resolves_to_pattern_scope() {
    let pattern = crate::object::ObjectRef::new(7, 0);

    let mut decorated = StructElem::new(StructType::Span);
    decorated.actual_text = Some(String::from("dec"));
    decorated.add_child(StructChild::MarkedContentRef {
        mcid: 1,
        page: 0,
        scope: crate::structure::McidScope::Pattern(pattern),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(decorated);
    let index = build_actualtext_index(&tree);

    let pattern_key = (crate::structure::McidScope::Pattern(pattern), 1);
    assert!(index.covered_mcids.contains(&pattern_key));
    let pattern_text = index.mcid_to_actual_text.get(&pattern_key).map(|t| &**t);
    assert_eq!(pattern_text, Some("dec"));
}
/// Two spans share MCID 0 on the same page but sit in different pattern
/// scopes; each pattern-scoped key must resolve to its own text.
#[test]
fn pattern_with_actualtext_keys_under_pattern_scope() {
    let first_pattern = crate::object::ObjectRef::new(70, 0);
    let second_pattern = crate::object::ObjectRef::new(71, 0);

    let mut alpha_span = StructElem::new(StructType::Span);
    alpha_span.actual_text = Some(String::from("alpha"));
    alpha_span.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Pattern(first_pattern),
    });

    let mut beta_span = StructElem::new(StructType::Span);
    beta_span.actual_text = Some(String::from("beta"));
    beta_span.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Pattern(second_pattern),
    });

    let mut document = StructElem::new(StructType::Document);
    document.add_child(StructChild::StructElem(Box::new(alpha_span)));
    document.add_child(StructChild::StructElem(Box::new(beta_span)));

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(document);
    let index = build_actualtext_index(&tree);

    let alpha_key = (crate::structure::McidScope::Pattern(first_pattern), 0);
    let beta_key = (crate::structure::McidScope::Pattern(second_pattern), 0);

    let alpha_text = index.mcid_to_actual_text.get(&alpha_key).map(|t| &**t);
    assert_eq!(alpha_text, Some("alpha"));
    let beta_text = index.mcid_to_actual_text.get(&beta_key).map(|t| &**t);
    assert_eq!(beta_text, Some("beta"));
}
/// A page-scoped marked-content reference is covered and mapped to its text
/// under the page scope.
#[test]
fn actualtext_without_stm_falls_back_to_page_scope() {
    let mut plain_span = StructElem::new(StructType::Span);
    plain_span.actual_text = Some(String::from("plain"));
    plain_span.add_child(StructChild::MarkedContentRef {
        mcid: 5,
        page: 2,
        scope: crate::structure::McidScope::Page(2),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(plain_span);
    let index = build_actualtext_index(&tree);

    let page_key = (crate::structure::McidScope::Page(2), 5);
    assert!(index.covered_mcids.contains(&page_key));
    let page_text = index.mcid_to_actual_text.get(&page_key).map(|t| &**t);
    assert_eq!(page_text, Some("plain"));
}
/// Feeds the builder an `ActualText` span that has no children and expects
/// an empty index (and, implicitly, no panic).
///
/// NOTE(review): no malformed dictionary is constructed here; presumably the
/// childless span stands in for a reference that was dropped earlier — confirm.
/// The body is the same as `test_actualtext_index_no_descendant_mcid_drops_scope`.
#[test]
fn malformed_mcr_dict_does_not_panic_in_builder() {
    let mut orphan = StructElem::new(StructType::Span);
    orphan.actual_text = Some(String::from("ghost"));

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(orphan);

    let index = build_actualtext_index(&tree);
    assert!(index.is_empty());
}
/// One `ActualText` span owns three references: page 0, page 1, and a form
/// scope. The test expects the page-0 and form keys to carry the text, and
/// the page-1 key to be suppress-only with no text entry.
#[test]
fn mixed_scopes_under_one_actualtext_use_per_namespace_rules() {
    let form = crate::object::ObjectRef::new(50, 0);

    let mut mixed = StructElem::new(StructType::Span);
    mixed.actual_text = Some(String::from("alt"));
    // Page-scoped reference on page 0.
    mixed.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Page(0),
    });
    // Page-scoped reference on page 1.
    mixed.add_child(StructChild::MarkedContentRef {
        mcid: 1,
        page: 1,
        scope: crate::structure::McidScope::Page(1),
    });
    // Form-scoped reference reusing MCID 0 on page 0.
    mixed.add_child(StructChild::MarkedContentRef {
        mcid: 0,
        page: 0,
        scope: crate::structure::McidScope::Form(form),
    });

    let mut tree = StructTreeRoot::new();
    tree.add_root_element(mixed);
    let index = build_actualtext_index(&tree);

    let first_page_key = (crate::structure::McidScope::Page(0), 0);
    let second_page_key = (crate::structure::McidScope::Page(1), 1);
    let form_key = (crate::structure::McidScope::Form(form), 0);

    let first_page_text = index
        .mcid_to_actual_text
        .get(&first_page_key)
        .map(|t| &**t);
    assert_eq!(first_page_text, Some("alt"));

    assert!(index.suppress_only.contains(&second_page_key));
    assert!(!index.mcid_to_actual_text.contains_key(&second_page_key));

    let form_text = index.mcid_to_actual_text.get(&form_key).map(|t| &**t);
    assert_eq!(form_text, Some("alt"));
}
}