use std::collections::HashMap;
/// Closed set of structure type tags that have a fixed PDF name.
///
/// Every variant is spelled exactly like the name it converts to; the
/// conversions live in [`StandardStructureType::as_pdf_name`] and
/// [`StandardStructureType::from_pdf_name`].
///
/// NOTE(review): the variant list appears to follow the standard structure
/// types of ISO 32000-1 §14.8.4 — confirm against the spec revision this
/// crate targets before adding or removing variants.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum StandardStructureType {
    Document,
    Part,
    Sect,
    Div,
    Art,
    BlockQuote,
    Caption,
    TOC,
    TOCI,
    Index,
    P,
    H,
    H1,
    H2,
    H3,
    H4,
    H5,
    H6,
    L,
    LI,
    Lbl,
    LBody,
    Table,
    TR,
    TH,
    TD,
    THead,
    TBody,
    TFoot,
    Span,
    Quote,
    Note,
    Reference,
    BibEntry,
    Code,
    Link,
    Annot,
    Figure,
    Formula,
    Form,
    Ruby,
    RB,
    RT,
    RP,
    Warichu,
    WT,
    WP,
    NonStruct,
    Private,
}

impl StandardStructureType {
    /// Returns the PDF name for this type as a static string.
    ///
    /// The match is deliberately exhaustive (no wildcard arm) so that adding a
    /// variant without giving it a name is a compile error.
    pub fn as_pdf_name(&self) -> &'static str {
        match self {
            Self::Document => "Document",
            Self::Part => "Part",
            Self::Sect => "Sect",
            Self::Div => "Div",
            Self::Art => "Art",
            Self::BlockQuote => "BlockQuote",
            Self::Caption => "Caption",
            Self::TOC => "TOC",
            Self::TOCI => "TOCI",
            Self::Index => "Index",
            Self::P => "P",
            Self::H => "H",
            Self::H1 => "H1",
            Self::H2 => "H2",
            Self::H3 => "H3",
            Self::H4 => "H4",
            Self::H5 => "H5",
            Self::H6 => "H6",
            Self::L => "L",
            Self::LI => "LI",
            Self::Lbl => "Lbl",
            Self::LBody => "LBody",
            Self::Table => "Table",
            Self::TR => "TR",
            Self::TH => "TH",
            Self::TD => "TD",
            Self::THead => "THead",
            Self::TBody => "TBody",
            Self::TFoot => "TFoot",
            Self::Span => "Span",
            Self::Quote => "Quote",
            Self::Note => "Note",
            Self::Reference => "Reference",
            Self::BibEntry => "BibEntry",
            Self::Code => "Code",
            Self::Link => "Link",
            Self::Annot => "Annot",
            Self::Figure => "Figure",
            Self::Formula => "Formula",
            Self::Form => "Form",
            Self::Ruby => "Ruby",
            Self::RB => "RB",
            Self::RT => "RT",
            Self::RP => "RP",
            Self::Warichu => "Warichu",
            Self::WT => "WT",
            Self::WP => "WP",
            Self::NonStruct => "NonStruct",
            Self::Private => "Private",
        }
    }

    /// Parses a PDF name into the matching standard type.
    ///
    /// The comparison is exact and case-sensitive; any name that is not one of
    /// the strings produced by [`Self::as_pdf_name`] yields `None`.
    pub fn from_pdf_name(name: &str) -> Option<Self> {
        // Resolve the variant first and bail out early for unknown names, so
        // each arm only has to name the variant.
        let kind = match name {
            "Document" => Self::Document,
            "Part" => Self::Part,
            "Sect" => Self::Sect,
            "Div" => Self::Div,
            "Art" => Self::Art,
            "BlockQuote" => Self::BlockQuote,
            "Caption" => Self::Caption,
            "TOC" => Self::TOC,
            "TOCI" => Self::TOCI,
            "Index" => Self::Index,
            "P" => Self::P,
            "H" => Self::H,
            "H1" => Self::H1,
            "H2" => Self::H2,
            "H3" => Self::H3,
            "H4" => Self::H4,
            "H5" => Self::H5,
            "H6" => Self::H6,
            "L" => Self::L,
            "LI" => Self::LI,
            "Lbl" => Self::Lbl,
            "LBody" => Self::LBody,
            "Table" => Self::Table,
            "TR" => Self::TR,
            "TH" => Self::TH,
            "TD" => Self::TD,
            "THead" => Self::THead,
            "TBody" => Self::TBody,
            "TFoot" => Self::TFoot,
            "Span" => Self::Span,
            "Quote" => Self::Quote,
            "Note" => Self::Note,
            "Reference" => Self::Reference,
            "BibEntry" => Self::BibEntry,
            "Code" => Self::Code,
            "Link" => Self::Link,
            "Annot" => Self::Annot,
            "Figure" => Self::Figure,
            "Formula" => Self::Formula,
            "Form" => Self::Form,
            "Ruby" => Self::Ruby,
            "RB" => Self::RB,
            "RT" => Self::RT,
            "RP" => Self::RP,
            "Warichu" => Self::Warichu,
            "WT" => Self::WT,
            "WP" => Self::WP,
            "NonStruct" => Self::NonStruct,
            "Private" => Self::Private,
            _ => return None,
        };
        Some(kind)
    }
}
/// Optional descriptive attributes carried by a structure element.
///
/// All fields start out unset (`None`, or an empty map for `custom`). Each
/// `with_*` method sets exactly one field and returns the updated value, so
/// calls can be chained.
#[derive(Debug, Clone, Default)]
pub struct StructureAttributes {
    /// Language tag of the element's content, e.g. `"en-US"`.
    pub lang: Option<String>,
    /// Alternate description of the element.
    pub alt: Option<String>,
    /// Text that stands in for the element's content.
    pub actual_text: Option<String>,
    /// Expanded form of the element's content.
    /// NOTE(review): presumably the expansion of an abbreviation/acronym —
    /// confirm against whatever consumes this field.
    pub expanded: Option<String>,
    /// Title of the element.
    pub title: Option<String>,
    /// Bounding box as four numbers.
    /// NOTE(review): coordinate order is not established in this file — confirm.
    pub bbox: Option<[f64; 4]>,
    /// Free-form additional attributes, keyed by attribute name.
    pub custom: HashMap<String, String>,
}

impl StructureAttributes {
    /// Creates an attribute set with every field unset.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the language tag.
    pub fn with_language(mut self, lang: impl Into<String>) -> Self {
        self.lang = Some(lang.into());
        self
    }

    /// Sets the alternate description.
    pub fn with_alt_text(mut self, alt: impl Into<String>) -> Self {
        self.alt = Some(alt.into());
        self
    }

    /// Sets the replacement text.
    pub fn with_actual_text(mut self, text: impl Into<String>) -> Self {
        self.actual_text = Some(text.into());
        self
    }

    /// Sets the expanded form.
    ///
    /// Added so that `expanded` can be set through the builder like every
    /// other optional field instead of requiring direct field assignment.
    pub fn with_expanded(mut self, expanded: impl Into<String>) -> Self {
        self.expanded = Some(expanded.into());
        self
    }

    /// Sets the title.
    pub fn with_title(mut self, title: impl Into<String>) -> Self {
        self.title = Some(title.into());
        self
    }

    /// Sets the bounding box.
    pub fn with_bbox(mut self, bbox: [f64; 4]) -> Self {
        self.bbox = Some(bbox);
        self
    }

    /// Adds one custom attribute.
    ///
    /// Inserting a key that is already present replaces its previous value.
    pub fn with_custom(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
        self.custom.insert(key.into(), value.into());
        self
    }
}
/// One node of a structure tree.
///
/// A node does not own its children; it stores their indices instead.
/// `StructTree::add_child` fills `children` with positions in the tree's
/// element list.
#[derive(Debug, Clone)]
pub struct StructureElement {
/// Type of this element: a standard type or a custom-named one.
pub structure_type: StructureType,
/// Optional identifier; `StructTree` records it so the element can be
/// found again with `StructTree::get_by_id`.
pub id: Option<String>,
/// Descriptive attributes (language, alternate text, title, ...).
pub attributes: StructureAttributes,
/// Indices of this element's children, in insertion order.
pub children: Vec<usize>,
/// Marked-content references attached to this element, in insertion order.
pub mcids: Vec<MarkedContentReference>,
}
/// Type tag of a structure element: either one of the predefined standard
/// types or a custom type identified only by its name.
#[derive(Debug, Clone, PartialEq)]
pub enum StructureType {
    /// One of the predefined [`StandardStructureType`] values.
    Standard(StandardStructureType),
    /// A non-standard type, stored as the name it was created with.
    Custom(String),
}

impl StructureType {
    /// Returns the name of this type as a newly allocated `String`.
    ///
    /// Standard types yield their fixed name; custom types yield a copy of
    /// the stored name.
    pub fn as_pdf_name(&self) -> String {
        // Pick the borrowed name first so there is a single allocation site.
        let name: &str = match self {
            Self::Standard(standard) => standard.as_pdf_name(),
            Self::Custom(custom) => custom.as_str(),
        };
        name.to_owned()
    }
}
/// Reference to a piece of marked content, identified by the page it is on
/// and its marked-content identifier.
#[derive(Debug, Clone, PartialEq)]
pub struct MarkedContentReference {
/// Index of the page holding the content.
/// NOTE(review): presumably zero-based (the tests in this file use 0) — confirm.
pub page_index: usize,
/// Marked-content identifier.
/// NOTE(review): presumably unique per page — confirm against the writer.
pub mcid: u32,
}
impl StructureElement {
pub fn new(structure_type: StandardStructureType) -> Self {
Self {
structure_type: StructureType::Standard(structure_type),
id: None,
attributes: StructureAttributes::new(),
children: Vec::new(),
mcids: Vec::new(),
}
}
pub fn new_custom(type_name: impl Into<String>) -> Self {
Self {
structure_type: StructureType::Custom(type_name.into()),
id: None,
attributes: StructureAttributes::new(),
children: Vec::new(),
mcids: Vec::new(),
}
}
pub fn with_id(mut self, id: impl Into<String>) -> Self {
self.id = Some(id.into());
self
}
pub fn with_language(mut self, lang: impl Into<String>) -> Self {
self.attributes.lang = Some(lang.into());
self
}
pub fn with_alt_text(mut self, alt: impl Into<String>) -> Self {
self.attributes.alt = Some(alt.into());
self
}
pub fn with_actual_text(mut self, text: impl Into<String>) -> Self {
self.attributes.actual_text = Some(text.into());
self
}
pub fn with_title(mut self, title: impl Into<String>) -> Self {
self.attributes.title = Some(title.into());
self
}
pub fn add_mcid(&mut self, page_index: usize, mcid: u32) {
self.mcids.push(MarkedContentReference { page_index, mcid });
}
pub fn add_child(&mut self, child_index: usize) {
self.children.push(child_index);
}
}
/// Lookup table from custom structure type names to the standard types they
/// are mapped to.
#[derive(Debug, Clone, Default)]
pub struct RoleMap {
    /// Custom type name -> standard type.
    mappings: HashMap<String, StandardStructureType>,
}

impl RoleMap {
    /// Creates a role map with no mappings.
    pub fn new() -> Self {
        Self {
            mappings: HashMap::new(),
        }
    }

    /// Maps `custom_type` to `standard_type`.
    ///
    /// If `custom_type` already has a mapping, it is replaced.
    pub fn add_mapping(
        &mut self,
        custom_type: impl Into<String>,
        standard_type: StandardStructureType,
    ) {
        let key: String = custom_type.into();
        self.mappings.insert(key, standard_type);
    }

    /// Returns the standard type mapped to `custom_type`, or `None` when no
    /// mapping has been added for that exact name.
    pub fn get_mapping(&self, custom_type: &str) -> Option<&StandardStructureType> {
        let Self { mappings } = self;
        mappings.get(custom_type)
    }

    /// Read-only view of every mapping.
    pub fn mappings(&self) -> &HashMap<String, StandardStructureType> {
        let Self { mappings } = self;
        mappings
    }
}
#[derive(Debug, Clone)]
pub struct StructTree {
elements: Vec<StructureElement>,
root_index: Option<usize>,
pub role_map: RoleMap,
id_map: HashMap<String, usize>,
}
impl Default for StructTree {
fn default() -> Self {
Self::new()
}
}
impl StructTree {
pub fn new() -> Self {
Self {
elements: Vec::new(),
root_index: None,
role_map: RoleMap::new(),
id_map: HashMap::new(),
}
}
pub fn set_root(&mut self, element: StructureElement) -> usize {
let index = self.elements.len();
if let Some(ref id) = element.id {
self.id_map.insert(id.clone(), index);
}
self.elements.push(element);
self.root_index = Some(index);
index
}
pub fn add_child(
&mut self,
parent_index: usize,
element: StructureElement,
) -> Result<usize, String> {
if parent_index >= self.elements.len() {
return Err(format!("Parent index {} out of bounds", parent_index));
}
let child_index = self.elements.len();
if let Some(ref id) = element.id {
self.id_map.insert(id.clone(), child_index);
}
self.elements.push(element);
self.elements[parent_index].add_child(child_index);
Ok(child_index)
}
pub fn get(&self, index: usize) -> Option<&StructureElement> {
self.elements.get(index)
}
pub fn get_mut(&mut self, index: usize) -> Option<&mut StructureElement> {
self.elements.get_mut(index)
}
pub fn get_by_id(&self, id: &str) -> Option<&StructureElement> {
self.id_map.get(id).and_then(|&index| self.get(index))
}
pub fn root_index(&self) -> Option<usize> {
self.root_index
}
pub fn root(&self) -> Option<&StructureElement> {
self.root_index.and_then(|index| self.get(index))
}
pub fn len(&self) -> usize {
self.elements.len()
}
pub fn is_empty(&self) -> bool {
self.elements.is_empty()
}
pub fn iter(&self) -> impl Iterator<Item = &StructureElement> {
self.elements.iter()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks the variant -> name direction.
    #[test]
    fn test_standard_structure_type_names() {
        let expected = [
            (StandardStructureType::Document, "Document"),
            (StandardStructureType::H1, "H1"),
            (StandardStructureType::P, "P"),
            (StandardStructureType::Figure, "Figure"),
            (StandardStructureType::Table, "Table"),
        ];
        for (kind, name) in expected.iter() {
            assert_eq!(kind.as_pdf_name(), *name);
        }
    }

    /// Spot-checks the name -> variant direction, including an unknown name.
    #[test]
    fn test_standard_structure_type_parsing() {
        let parse = StandardStructureType::from_pdf_name;
        assert_eq!(parse("Document"), Some(StandardStructureType::Document));
        assert_eq!(parse("H1"), Some(StandardStructureType::H1));
        assert_eq!(parse("Invalid"), None);
    }

    /// The element builder stores id, language and actual text.
    #[test]
    fn test_structure_element_creation() {
        let heading = StructureElement::new(StandardStructureType::H1)
            .with_id("heading1")
            .with_language("en-US")
            .with_actual_text("Chapter One");

        assert_eq!(heading.id.as_deref(), Some("heading1"));
        assert_eq!(heading.attributes.lang.as_deref(), Some("en-US"));
        assert_eq!(heading.attributes.actual_text.as_deref(), Some("Chapter One"));
    }

    /// The attribute builder stores language, alt text and bounding box.
    #[test]
    fn test_structure_attributes_builder() {
        let bbox = [0.0, 0.0, 100.0, 100.0];
        let attrs = StructureAttributes::new()
            .with_language("es-ES")
            .with_alt_text("Imagen de ejemplo")
            .with_bbox(bbox);

        assert_eq!(attrs.lang.as_deref(), Some("es-ES"));
        assert_eq!(attrs.alt.as_deref(), Some("Imagen de ejemplo"));
        assert_eq!(attrs.bbox, Some(bbox));
    }

    /// Added mappings can be read back; unknown names yield `None`.
    #[test]
    fn test_role_map() {
        let mut roles = RoleMap::new();
        roles.add_mapping("MyHeading", StandardStructureType::H1);
        roles.add_mapping("MyParagraph", StandardStructureType::P);

        let heading = StandardStructureType::H1;
        let paragraph = StandardStructureType::P;
        assert_eq!(roles.get_mapping("MyHeading"), Some(&heading));
        assert_eq!(roles.get_mapping("MyParagraph"), Some(&paragraph));
        assert_eq!(roles.get_mapping("Unknown"), None);
    }

    /// Setting a root records its index and stores exactly one element.
    #[test]
    fn test_struct_tree_creation() {
        let mut tree = StructTree::new();
        let root = tree.set_root(StructureElement::new(StandardStructureType::Document));

        assert_eq!(tree.root_index(), Some(root));
        assert_eq!(tree.len(), 1);
    }

    /// Children are attached to their parent in order and can be found by id.
    #[test]
    fn test_struct_tree_hierarchy() {
        let mut tree = StructTree::new();
        let doc_idx =
            tree.set_root(StructureElement::new(StandardStructureType::Document).with_id("doc1"));

        let heading = StructureElement::new(StandardStructureType::H1)
            .with_id("h1")
            .with_actual_text("Title");
        let h1_idx = tree.add_child(doc_idx, heading).unwrap();

        let paragraph = StructureElement::new(StandardStructureType::P).with_id("p1");
        let p_idx = tree.add_child(doc_idx, paragraph).unwrap();

        assert_eq!(tree.len(), 3);
        let children = &tree.get(doc_idx).unwrap().children;
        assert_eq!(children, &vec![h1_idx, p_idx]);

        assert!(tree.get_by_id("h1").is_some());
        assert!(tree.get_by_id("p1").is_some());
        assert!(tree.get_by_id("unknown").is_none());
    }

    /// Marked-content references are stored in insertion order.
    #[test]
    fn test_marked_content_references() {
        let mut paragraph = StructureElement::new(StandardStructureType::P);
        paragraph.add_mcid(0, 1);
        paragraph.add_mcid(0, 2);

        assert_eq!(paragraph.mcids.len(), 2);
        let first = &paragraph.mcids[0];
        assert_eq!((first.page_index, first.mcid), (0, 1));
        assert_eq!(paragraph.mcids[1].mcid, 2);
    }

    /// `new_custom` produces a `Custom` type carrying the given name.
    #[test]
    fn test_custom_structure_type() {
        let elem = StructureElement::new_custom("MyCustomType");
        if let StructureType::Custom(name) = &elem.structure_type {
            assert_eq!(name, "MyCustomType");
        } else {
            panic!("Expected custom structure type");
        }
    }

    /// Adding a child under a non-existent parent reports an error.
    #[test]
    fn test_struct_tree_error_handling() {
        let mut tree = StructTree::new();
        let orphan = StructureElement::new(StandardStructureType::P);

        let message = tree.add_child(999, orphan).unwrap_err();
        assert!(message.contains("out of bounds"));
    }
}