use super::types::{StructChild, StructElem, StructTreeRoot, StructType};
use crate::error::Error;
#[derive(Debug, Clone)]
pub struct OrderedContent {
pub page: u32,
pub mcid: Option<u32>,
pub struct_type: String,
pub parsed_type: StructType,
pub is_heading: bool,
pub is_block: bool,
pub is_word_break: bool,
pub actual_text: Option<String>,
}
pub fn traverse_structure_tree(
struct_tree: &StructTreeRoot,
page_num: u32,
) -> Result<Vec<OrderedContent>, Error> {
let mut result = Vec::new();
for root_elem in &struct_tree.root_elements {
traverse_element(root_elem, page_num, &mut result)?;
}
Ok(result)
}
pub fn traverse_structure_tree_all_pages(
struct_tree: &StructTreeRoot,
) -> std::collections::HashMap<u32, Vec<OrderedContent>> {
let mut result: std::collections::HashMap<u32, Vec<OrderedContent>> =
std::collections::HashMap::new();
for root_elem in &struct_tree.root_elements {
traverse_element_all_pages(root_elem, &mut result);
}
result
}
fn traverse_element_all_pages(
elem: &StructElem,
result: &mut std::collections::HashMap<u32, Vec<OrderedContent>>,
) {
let struct_type_str = format!("{:?}", elem.struct_type);
let parsed_type = elem.struct_type.clone();
let is_heading = elem.struct_type.is_heading();
let is_block = elem.struct_type.is_block();
let is_word_break = elem.struct_type.is_word_break();
if let Some(ref actual_text) = elem.actual_text {
let pages = collect_pages(elem);
for page in pages {
result.entry(page).or_default().push(OrderedContent {
page,
mcid: None,
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading,
is_block,
is_word_break: false,
actual_text: Some(actual_text.clone()),
});
}
return;
}
if is_word_break {
}
for child in &elem.children {
match child {
StructChild::MarkedContentRef { mcid, page } => {
result.entry(*page).or_default().push(OrderedContent {
page: *page,
mcid: Some(*mcid),
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading,
is_block,
is_word_break: false,
actual_text: None,
});
},
StructChild::StructElem(child_elem) => {
if is_word_break {
let child_pages = collect_pages(child_elem);
for page in child_pages {
result.entry(page).or_default().push(OrderedContent {
page,
mcid: None,
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading: false,
is_block: false,
is_word_break: true,
actual_text: None,
});
}
}
traverse_element_all_pages(child_elem, result);
},
StructChild::ObjectRef(_obj_num, _gen) => {
log::debug!("Skipping unresolved ObjectRef({}, {})", _obj_num, _gen);
},
}
}
}
fn collect_pages(elem: &StructElem) -> Vec<u32> {
let mut pages = Vec::new();
collect_pages_recursive(elem, &mut pages);
pages.sort_unstable();
pages.dedup();
pages
}
fn collect_pages_recursive(elem: &StructElem, pages: &mut Vec<u32>) {
if let Some(page) = elem.page {
pages.push(page);
}
for child in &elem.children {
match child {
StructChild::MarkedContentRef { page, .. } => {
pages.push(*page);
},
StructChild::StructElem(child_elem) => {
collect_pages_recursive(child_elem, pages);
},
_ => {},
}
}
}
fn traverse_element(
elem: &StructElem,
target_page: u32,
result: &mut Vec<OrderedContent>,
) -> Result<(), Error> {
let struct_type_str = format!("{:?}", elem.struct_type);
let parsed_type = elem.struct_type.clone();
let is_heading = elem.struct_type.is_heading();
let is_block = elem.struct_type.is_block();
let is_word_break = elem.struct_type.is_word_break();
if let Some(ref actual_text) = elem.actual_text {
if has_content_on_page(elem, target_page) {
result.push(OrderedContent {
page: target_page,
mcid: None,
struct_type: struct_type_str,
parsed_type,
is_heading,
is_block,
is_word_break: false,
actual_text: Some(actual_text.clone()),
});
return Ok(());
}
}
if is_word_break {
result.push(OrderedContent {
page: target_page,
mcid: None,
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading: false,
is_block: false,
is_word_break: true,
actual_text: None,
});
}
for child in &elem.children {
match child {
StructChild::MarkedContentRef { mcid, page } => {
if *page == target_page {
result.push(OrderedContent {
page: *page,
mcid: Some(*mcid),
struct_type: struct_type_str.clone(),
parsed_type: parsed_type.clone(),
is_heading,
is_block,
is_word_break: false,
actual_text: None,
});
}
},
StructChild::StructElem(child_elem) => {
traverse_element(child_elem, target_page, result)?;
},
StructChild::ObjectRef(_obj_num, _gen) => {
log::debug!("Skipping unresolved ObjectRef({}, {})", _obj_num, _gen);
},
}
}
Ok(())
}
fn has_content_on_page(elem: &StructElem, target_page: u32) -> bool {
if elem.page == Some(target_page) {
return true;
}
for child in &elem.children {
match child {
StructChild::MarkedContentRef { page, .. } => {
if *page == target_page {
return true;
}
},
StructChild::StructElem(child_elem) => {
if has_content_on_page(child_elem, target_page) {
return true;
}
},
_ => {},
}
}
false
}
pub fn extract_reading_order(
struct_tree: &StructTreeRoot,
page_num: u32,
) -> Result<Vec<u32>, Error> {
let ordered_content = traverse_structure_tree(struct_tree, page_num)?;
Ok(ordered_content
.into_iter()
.filter_map(|c| c.mcid) .collect())
}
#[cfg(test)]
mod tests {
use super::*;
use crate::structure::types::{StructChild, StructElem, StructType};
#[test]
fn test_simple_traversal() {
let mut root = StructElem::new(StructType::Document);
let mut p1 = StructElem::new(StructType::P);
p1.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let mut p2 = StructElem::new(StructType::P);
p2.add_child(StructChild::MarkedContentRef { mcid: 1, page: 0 });
root.add_child(StructChild::StructElem(Box::new(p1)));
root.add_child(StructChild::StructElem(Box::new(p2)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order, vec![0, 1]);
}
#[test]
fn test_page_filtering() {
let mut root = StructElem::new(StructType::Document);
let mut p1 = StructElem::new(StructType::P);
p1.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let mut p2 = StructElem::new(StructType::P);
p2.add_child(StructChild::MarkedContentRef { mcid: 1, page: 1 });
root.add_child(StructChild::StructElem(Box::new(p1)));
root.add_child(StructChild::StructElem(Box::new(p2)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order_page_0 = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order_page_0, vec![0]);
let order_page_1 = extract_reading_order(&struct_tree, 1).unwrap();
assert_eq!(order_page_1, vec![1]);
}
#[test]
fn test_nested_structure() {
let mut root = StructElem::new(StructType::Document);
let mut sect = StructElem::new(StructType::Sect);
let mut h1 = StructElem::new(StructType::H1);
h1.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let mut p = StructElem::new(StructType::P);
p.add_child(StructChild::MarkedContentRef { mcid: 1, page: 0 });
sect.add_child(StructChild::StructElem(Box::new(h1)));
sect.add_child(StructChild::StructElem(Box::new(p)));
root.add_child(StructChild::StructElem(Box::new(sect)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order, vec![0, 1]);
}
#[test]
fn test_word_break_elements() {
let mut root = StructElem::new(StructType::P);
let mut span1 = StructElem::new(StructType::Span);
span1.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let wb = StructElem::new(StructType::WB);
let mut span2 = StructElem::new(StructType::Span);
span2.add_child(StructChild::MarkedContentRef { mcid: 1, page: 0 });
root.add_child(StructChild::StructElem(Box::new(span1)));
root.add_child(StructChild::StructElem(Box::new(wb)));
root.add_child(StructChild::StructElem(Box::new(span2)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert_eq!(ordered.len(), 3); assert_eq!(ordered[0].mcid, Some(0));
assert!(!ordered[0].is_word_break);
assert_eq!(ordered[1].mcid, None); assert!(ordered[1].is_word_break);
assert_eq!(ordered[2].mcid, Some(1));
assert!(!ordered[2].is_word_break);
let mcids = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(mcids, vec![0, 1]); }
#[test]
fn test_empty_tree() {
let struct_tree = StructTreeRoot::new();
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert!(order.is_empty());
}
#[test]
fn test_empty_page() {
let mut root = StructElem::new(StructType::Document);
let mut p = StructElem::new(StructType::P);
p.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
root.add_child(StructChild::StructElem(Box::new(p)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 5).unwrap();
assert!(order.is_empty());
}
#[test]
fn test_object_ref_skipped() {
let mut root = StructElem::new(StructType::Document);
root.add_child(StructChild::ObjectRef(42, 0));
root.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let order = extract_reading_order(&struct_tree, 0).unwrap();
assert_eq!(order, vec![0]);
}
#[test]
fn test_traverse_all_pages() {
let mut root = StructElem::new(StructType::Document);
let mut p1 = StructElem::new(StructType::P);
p1.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let mut p2 = StructElem::new(StructType::P);
p2.add_child(StructChild::MarkedContentRef { mcid: 1, page: 1 });
let mut p3 = StructElem::new(StructType::P);
p3.add_child(StructChild::MarkedContentRef { mcid: 2, page: 0 });
root.add_child(StructChild::StructElem(Box::new(p1)));
root.add_child(StructChild::StructElem(Box::new(p2)));
root.add_child(StructChild::StructElem(Box::new(p3)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
assert_eq!(all_pages.len(), 2); assert_eq!(all_pages[&0].len(), 2); assert_eq!(all_pages[&1].len(), 1); }
#[test]
fn test_actual_text_replaces_descendants() {
let mut root = StructElem::new(StructType::Document);
let mut elem = StructElem::new(StructType::Span);
elem.actual_text = Some("Replacement text".to_string());
elem.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
root.add_child(StructChild::StructElem(Box::new(elem)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert_eq!(ordered.len(), 1);
assert_eq!(ordered[0].actual_text, Some("Replacement text".to_string()));
assert_eq!(ordered[0].mcid, None); }
#[test]
fn test_actual_text_wrong_page() {
let mut root = StructElem::new(StructType::Document);
let mut elem = StructElem::new(StructType::Span);
elem.actual_text = Some("Replacement".to_string());
elem.add_child(StructChild::MarkedContentRef { mcid: 0, page: 1 });
root.add_child(StructChild::StructElem(Box::new(elem)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert!(ordered.is_empty());
}
#[test]
fn test_heading_and_block_flags() {
let mut root = StructElem::new(StructType::Document);
let mut h1 = StructElem::new(StructType::H1);
h1.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let mut span = StructElem::new(StructType::Span);
span.add_child(StructChild::MarkedContentRef { mcid: 1, page: 0 });
root.add_child(StructChild::StructElem(Box::new(h1)));
root.add_child(StructChild::StructElem(Box::new(span)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let ordered = traverse_structure_tree(&struct_tree, 0).unwrap();
assert_eq!(ordered.len(), 2);
assert!(ordered[0].is_heading);
assert!(ordered[0].is_block);
assert!(!ordered[1].is_heading);
assert!(!ordered[1].is_block);
}
#[test]
fn test_collect_pages() {
let mut elem = StructElem::new(StructType::Document);
elem.page = Some(0);
let mut child = StructElem::new(StructType::P);
child.add_child(StructChild::MarkedContentRef { mcid: 0, page: 1 });
child.add_child(StructChild::MarkedContentRef { mcid: 1, page: 2 });
elem.add_child(StructChild::StructElem(Box::new(child)));
let pages = collect_pages(&elem);
assert_eq!(pages, vec![0, 1, 2]);
}
#[test]
fn test_traverse_all_pages_with_actual_text() {
let mut root = StructElem::new(StructType::Document);
let mut elem = StructElem::new(StructType::Span);
elem.actual_text = Some("Hello".to_string());
elem.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
elem.add_child(StructChild::MarkedContentRef { mcid: 1, page: 1 });
root.add_child(StructChild::StructElem(Box::new(elem)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
assert!(all_pages.contains_key(&0));
assert!(all_pages.contains_key(&1));
assert_eq!(all_pages[&0][0].actual_text, Some("Hello".to_string()));
}
#[test]
fn test_traverse_all_pages_word_break_with_children() {
let mut root = StructElem::new(StructType::P);
let mut wb = StructElem::new(StructType::WB);
let mut child = StructElem::new(StructType::Span);
child.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
wb.add_child(StructChild::StructElem(Box::new(child)));
root.add_child(StructChild::StructElem(Box::new(wb)));
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
let page0 = &all_pages[&0];
assert!(page0.iter().any(|c| c.is_word_break));
assert!(page0.iter().any(|c| c.mcid == Some(0)));
}
#[test]
fn test_traverse_all_pages_object_ref() {
let mut root = StructElem::new(StructType::Document);
root.add_child(StructChild::ObjectRef(99, 0));
root.add_child(StructChild::MarkedContentRef { mcid: 0, page: 0 });
let mut struct_tree = StructTreeRoot::new();
struct_tree.add_root_element(root);
let all_pages = traverse_structure_tree_all_pages(&struct_tree);
assert_eq!(all_pages[&0].len(), 1);
assert_eq!(all_pages[&0][0].mcid, Some(0));
}
#[test]
fn test_has_content_on_page_deep() {
let mut root = StructElem::new(StructType::Document);
let mut sect = StructElem::new(StructType::Sect);
let mut p = StructElem::new(StructType::P);
p.add_child(StructChild::MarkedContentRef { mcid: 0, page: 3 });
sect.add_child(StructChild::StructElem(Box::new(p)));
root.add_child(StructChild::StructElem(Box::new(sect)));
assert!(has_content_on_page(&root, 3));
assert!(!has_content_on_page(&root, 0));
}
}