1use std::collections::HashMap;
66
/// The fixed set of structure types this module knows by name.
///
/// Every variant maps to exactly one PDF name string; see
/// [`StandardStructureType::as_pdf_name`] and
/// [`StandardStructureType::from_pdf_name`] for the two directions.
///
/// The enum carries no data, so it derives `Copy` in addition to the
/// comparison and hashing traits: values can be passed by value and used as
/// map keys without cloning.
///
/// NOTE(review): the variant groups (separated by blank lines) look like the
/// categories of the PDF standard structure types (grouping, paragraph-like,
/// list, table, inline, illustration, ruby/warichu) — confirm against the
/// specification before relying on that reading.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum StandardStructureType {
    /// Written as the PDF name `Document`.
    Document,
    /// Written as the PDF name `Part`.
    Part,
    /// Written as the PDF name `Sect`.
    Sect,
    /// Written as the PDF name `Div`.
    Div,
    /// Written as the PDF name `Art`.
    Art,
    /// Written as the PDF name `BlockQuote`.
    BlockQuote,
    /// Written as the PDF name `Caption`.
    Caption,
    /// Written as the PDF name `TOC`.
    TOC,
    /// Written as the PDF name `TOCI`.
    TOCI,
    /// Written as the PDF name `Index`.
    Index,

    /// Written as the PDF name `P`.
    P,
    /// Written as the PDF name `H`.
    H,
    /// Written as the PDF name `H1`.
    H1,
    /// Written as the PDF name `H2`.
    H2,
    /// Written as the PDF name `H3`.
    H3,
    /// Written as the PDF name `H4`.
    H4,
    /// Written as the PDF name `H5`.
    H5,
    /// Written as the PDF name `H6`.
    H6,

    /// Written as the PDF name `L`.
    L,
    /// Written as the PDF name `LI`.
    LI,
    /// Written as the PDF name `Lbl`.
    Lbl,
    /// Written as the PDF name `LBody`.
    LBody,

    /// Written as the PDF name `Table`.
    Table,
    /// Written as the PDF name `TR`.
    TR,
    /// Written as the PDF name `TH`.
    TH,
    /// Written as the PDF name `TD`.
    TD,
    /// Written as the PDF name `THead`.
    THead,
    /// Written as the PDF name `TBody`.
    TBody,
    /// Written as the PDF name `TFoot`.
    TFoot,

    /// Written as the PDF name `Span`.
    Span,
    /// Written as the PDF name `Quote`.
    Quote,
    /// Written as the PDF name `Note`.
    Note,
    /// Written as the PDF name `Reference`.
    Reference,
    /// Written as the PDF name `BibEntry`.
    BibEntry,
    /// Written as the PDF name `Code`.
    Code,
    /// Written as the PDF name `Link`.
    Link,
    /// Written as the PDF name `Annot`.
    Annot,

    /// Written as the PDF name `Figure`.
    Figure,
    /// Written as the PDF name `Formula`.
    Formula,
    /// Written as the PDF name `Form`.
    Form,

    /// Written as the PDF name `Ruby`.
    Ruby,
    /// Written as the PDF name `RB`.
    RB,
    /// Written as the PDF name `RT`.
    RT,
    /// Written as the PDF name `RP`.
    RP,
    /// Written as the PDF name `Warichu`.
    Warichu,
    /// Written as the PDF name `WT`.
    WT,
    /// Written as the PDF name `WP`.
    WP,

    /// Written as the PDF name `NonStruct`.
    NonStruct,
    /// Written as the PDF name `Private`.
    Private,
}

impl StandardStructureType {
    /// Returns the PDF name of this structure type.
    ///
    /// The result is a `'static` string literal (no allocation) and is the
    /// exact string that [`StandardStructureType::from_pdf_name`] accepts for
    /// the same variant.
    pub fn as_pdf_name(&self) -> &'static str {
        // Exhaustive on purpose: adding a variant without a name must fail to
        // compile here.
        match *self {
            Self::Document => "Document",
            Self::Part => "Part",
            Self::Sect => "Sect",
            Self::Div => "Div",
            Self::Art => "Art",
            Self::BlockQuote => "BlockQuote",
            Self::Caption => "Caption",
            Self::TOC => "TOC",
            Self::TOCI => "TOCI",
            Self::Index => "Index",
            Self::P => "P",
            Self::H => "H",
            Self::H1 => "H1",
            Self::H2 => "H2",
            Self::H3 => "H3",
            Self::H4 => "H4",
            Self::H5 => "H5",
            Self::H6 => "H6",
            Self::L => "L",
            Self::LI => "LI",
            Self::Lbl => "Lbl",
            Self::LBody => "LBody",
            Self::Table => "Table",
            Self::TR => "TR",
            Self::TH => "TH",
            Self::TD => "TD",
            Self::THead => "THead",
            Self::TBody => "TBody",
            Self::TFoot => "TFoot",
            Self::Span => "Span",
            Self::Quote => "Quote",
            Self::Note => "Note",
            Self::Reference => "Reference",
            Self::BibEntry => "BibEntry",
            Self::Code => "Code",
            Self::Link => "Link",
            Self::Annot => "Annot",
            Self::Figure => "Figure",
            Self::Formula => "Formula",
            Self::Form => "Form",
            Self::Ruby => "Ruby",
            Self::RB => "RB",
            Self::RT => "RT",
            Self::RP => "RP",
            Self::Warichu => "Warichu",
            Self::WT => "WT",
            Self::WP => "WP",
            Self::NonStruct => "NonStruct",
            Self::Private => "Private",
        }
    }

    /// Parses a PDF name into the matching standard structure type.
    ///
    /// `name` is compared verbatim against the strings produced by
    /// [`StandardStructureType::as_pdf_name`]: the comparison is
    /// case-sensitive and no prefix (such as a leading `/`) is stripped.
    ///
    /// Returns `None` when `name` is not one of the known names.
    pub fn from_pdf_name(name: &str) -> Option<Self> {
        let parsed = match name {
            "Document" => Self::Document,
            "Part" => Self::Part,
            "Sect" => Self::Sect,
            "Div" => Self::Div,
            "Art" => Self::Art,
            "BlockQuote" => Self::BlockQuote,
            "Caption" => Self::Caption,
            "TOC" => Self::TOC,
            "TOCI" => Self::TOCI,
            "Index" => Self::Index,
            "P" => Self::P,
            "H" => Self::H,
            "H1" => Self::H1,
            "H2" => Self::H2,
            "H3" => Self::H3,
            "H4" => Self::H4,
            "H5" => Self::H5,
            "H6" => Self::H6,
            "L" => Self::L,
            "LI" => Self::LI,
            "Lbl" => Self::Lbl,
            "LBody" => Self::LBody,
            "Table" => Self::Table,
            "TR" => Self::TR,
            "TH" => Self::TH,
            "TD" => Self::TD,
            "THead" => Self::THead,
            "TBody" => Self::TBody,
            "TFoot" => Self::TFoot,
            "Span" => Self::Span,
            "Quote" => Self::Quote,
            "Note" => Self::Note,
            "Reference" => Self::Reference,
            "BibEntry" => Self::BibEntry,
            "Code" => Self::Code,
            "Link" => Self::Link,
            "Annot" => Self::Annot,
            "Figure" => Self::Figure,
            "Formula" => Self::Formula,
            "Form" => Self::Form,
            "Ruby" => Self::Ruby,
            "RB" => Self::RB,
            "RT" => Self::RT,
            "RP" => Self::RP,
            "Warichu" => Self::Warichu,
            "WT" => Self::WT,
            "WP" => Self::WP,
            "NonStruct" => Self::NonStruct,
            "Private" => Self::Private,
            // Anything else is not a standard type; callers decide how to
            // treat it (for example as a custom type).
            _ => return None,
        };
        Some(parsed)
    }
}
297
/// Optional descriptive attributes that can be attached to a structure element.
///
/// Every field starts out unset (`None`, or an empty map for `custom`) and can
/// be filled in either directly or through the chainable `with_*` builders.
///
/// NOTE(review): the field names suggest the PDF structure-element entries
/// `/Lang`, `/Alt`, `/ActualText`, `/E`, `/T` and the `BBox` layout attribute;
/// how they are serialized is not visible here — confirm against the writer.
#[derive(Debug, Clone, Default)]
pub struct StructureAttributes {
    /// Language of the content, e.g. a tag such as `"en-US"`.
    pub lang: Option<String>,

    /// Alternate description of the content (presumably for non-text content
    /// such as figures — confirm).
    pub alt: Option<String>,

    /// Replacement text standing in for the element's content.
    pub actual_text: Option<String>,

    /// Expanded form of the content (presumably of an abbreviation or
    /// acronym — confirm).
    pub expanded: Option<String>,

    /// Title of the element.
    pub title: Option<String>,

    /// Bounding box as four numbers.
    ///
    /// NOTE(review): the component order and coordinate space are not
    /// established in this file — presumably `[x0, y0, x1, y1]`; confirm.
    pub bbox: Option<[f64; 4]>,

    /// Additional attributes as free-form key/value string pairs.
    pub custom: HashMap<String, String>,
}

impl StructureAttributes {
    /// Creates an attribute set with every field unset and no custom entries.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the attributes with `lang` set, replacing any previous value.
    pub fn with_language(self, lang: impl Into<String>) -> Self {
        Self {
            lang: Some(lang.into()),
            ..self
        }
    }

    /// Returns the attributes with `alt` set, replacing any previous value.
    pub fn with_alt_text(self, alt: impl Into<String>) -> Self {
        Self {
            alt: Some(alt.into()),
            ..self
        }
    }

    /// Returns the attributes with `actual_text` set, replacing any previous
    /// value.
    pub fn with_actual_text(self, text: impl Into<String>) -> Self {
        Self {
            actual_text: Some(text.into()),
            ..self
        }
    }

    /// Returns the attributes with `expanded` set, replacing any previous
    /// value.
    ///
    /// Provided so that every optional text field has a matching builder.
    pub fn with_expanded(self, expanded: impl Into<String>) -> Self {
        Self {
            expanded: Some(expanded.into()),
            ..self
        }
    }

    /// Returns the attributes with `title` set, replacing any previous value.
    pub fn with_title(self, title: impl Into<String>) -> Self {
        Self {
            title: Some(title.into()),
            ..self
        }
    }

    /// Returns the attributes with `bbox` set, replacing any previous value.
    pub fn with_bbox(self, bbox: [f64; 4]) -> Self {
        Self {
            bbox: Some(bbox),
            ..self
        }
    }
}
362
363#[derive(Debug, Clone)]
368pub struct StructureElement {
369 pub structure_type: StructureType,
371
372 pub id: Option<String>,
374
375 pub attributes: StructureAttributes,
377
378 pub children: Vec<usize>,
380
381 pub mcids: Vec<MarkedContentReference>,
383}
384
385#[derive(Debug, Clone, PartialEq)]
387pub enum StructureType {
388 Standard(StandardStructureType),
390 Custom(String),
392}
393
394impl StructureType {
395 pub fn as_pdf_name(&self) -> String {
397 match self {
398 Self::Standard(std_type) => std_type.as_pdf_name().to_string(),
399 Self::Custom(name) => name.clone(),
400 }
401 }
402}
403
/// Reference to a piece of marked content: a page plus a marked-content ID.
///
/// The record is two plain integers, so it is `Copy` and supports total
/// equality and hashing; references can be compared, deduplicated in a set, or
/// used as map keys without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct MarkedContentReference {
    /// Index of the page holding the marked content (presumably zero-based —
    /// confirm against the page list this indexes into).
    pub page_index: usize,

    /// Marked-content identifier on that page.
    pub mcid: u32,
}
416
417impl StructureElement {
418 pub fn new(structure_type: StandardStructureType) -> Self {
420 Self {
421 structure_type: StructureType::Standard(structure_type),
422 id: None,
423 attributes: StructureAttributes::new(),
424 children: Vec::new(),
425 mcids: Vec::new(),
426 }
427 }
428
429 pub fn new_custom(type_name: impl Into<String>) -> Self {
431 Self {
432 structure_type: StructureType::Custom(type_name.into()),
433 id: None,
434 attributes: StructureAttributes::new(),
435 children: Vec::new(),
436 mcids: Vec::new(),
437 }
438 }
439
440 pub fn with_id(mut self, id: impl Into<String>) -> Self {
442 self.id = Some(id.into());
443 self
444 }
445
446 pub fn with_language(mut self, lang: impl Into<String>) -> Self {
448 self.attributes.lang = Some(lang.into());
449 self
450 }
451
452 pub fn with_alt_text(mut self, alt: impl Into<String>) -> Self {
454 self.attributes.alt = Some(alt.into());
455 self
456 }
457
458 pub fn with_actual_text(mut self, text: impl Into<String>) -> Self {
460 self.attributes.actual_text = Some(text.into());
461 self
462 }
463
464 pub fn with_title(mut self, title: impl Into<String>) -> Self {
466 self.attributes.title = Some(title.into());
467 self
468 }
469
470 pub fn add_mcid(&mut self, page_index: usize, mcid: u32) {
472 self.mcids.push(MarkedContentReference { page_index, mcid });
473 }
474
475 pub fn add_child(&mut self, child_index: usize) {
477 self.children.push(child_index);
478 }
479}
480
481#[derive(Debug, Clone, Default)]
486pub struct RoleMap {
487 mappings: HashMap<String, StandardStructureType>,
488}
489
490impl RoleMap {
491 pub fn new() -> Self {
493 Self::default()
494 }
495
496 pub fn add_mapping(
498 &mut self,
499 custom_type: impl Into<String>,
500 standard_type: StandardStructureType,
501 ) {
502 self.mappings.insert(custom_type.into(), standard_type);
503 }
504
505 pub fn get_mapping(&self, custom_type: &str) -> Option<&StandardStructureType> {
507 self.mappings.get(custom_type)
508 }
509
510 pub fn mappings(&self) -> &HashMap<String, StandardStructureType> {
512 &self.mappings
513 }
514}
515
516#[derive(Debug, Clone)]
521pub struct StructTree {
522 elements: Vec<StructureElement>,
524
525 root_index: Option<usize>,
527
528 pub role_map: RoleMap,
530
531 id_map: HashMap<String, usize>,
533}
534
535impl Default for StructTree {
536 fn default() -> Self {
537 Self::new()
538 }
539}
540
541impl StructTree {
542 pub fn new() -> Self {
544 Self {
545 elements: Vec::new(),
546 root_index: None,
547 role_map: RoleMap::new(),
548 id_map: HashMap::new(),
549 }
550 }
551
552 pub fn set_root(&mut self, element: StructureElement) -> usize {
554 let index = self.elements.len();
555
556 if let Some(ref id) = element.id {
558 self.id_map.insert(id.clone(), index);
559 }
560
561 self.elements.push(element);
562 self.root_index = Some(index);
563 index
564 }
565
566 pub fn add_child(
568 &mut self,
569 parent_index: usize,
570 element: StructureElement,
571 ) -> Result<usize, String> {
572 if parent_index >= self.elements.len() {
573 return Err(format!("Parent index {} out of bounds", parent_index));
574 }
575
576 let child_index = self.elements.len();
577
578 if let Some(ref id) = element.id {
580 self.id_map.insert(id.clone(), child_index);
581 }
582
583 self.elements.push(element);
584 self.elements[parent_index].add_child(child_index);
585
586 Ok(child_index)
587 }
588
589 pub fn get(&self, index: usize) -> Option<&StructureElement> {
591 self.elements.get(index)
592 }
593
594 pub fn get_mut(&mut self, index: usize) -> Option<&mut StructureElement> {
596 self.elements.get_mut(index)
597 }
598
599 pub fn get_by_id(&self, id: &str) -> Option<&StructureElement> {
601 self.id_map.get(id).and_then(|&index| self.get(index))
602 }
603
604 pub fn root_index(&self) -> Option<usize> {
606 self.root_index
607 }
608
609 pub fn root(&self) -> Option<&StructureElement> {
611 self.root_index.and_then(|index| self.get(index))
612 }
613
614 pub fn len(&self) -> usize {
616 self.elements.len()
617 }
618
619 pub fn is_empty(&self) -> bool {
621 self.elements.is_empty()
622 }
623
624 pub fn iter(&self) -> impl Iterator<Item = &StructureElement> {
626 self.elements.iter()
627 }
628}
629
#[cfg(test)]
mod tests {
    use super::*;

    /// A sample of standard types serialize to the expected names.
    #[test]
    fn test_standard_structure_type_names() {
        let expectations = [
            (StandardStructureType::Document, "Document"),
            (StandardStructureType::H1, "H1"),
            (StandardStructureType::P, "P"),
            (StandardStructureType::Figure, "Figure"),
            (StandardStructureType::Table, "Table"),
        ];

        for (structure_type, expected) in expectations.iter() {
            assert_eq!(structure_type.as_pdf_name(), *expected);
        }
    }

    /// Known names parse to their variant; unknown names parse to `None`.
    #[test]
    fn test_standard_structure_type_parsing() {
        let document = StandardStructureType::from_pdf_name("Document");
        assert_eq!(document, Some(StandardStructureType::Document));

        let heading = StandardStructureType::from_pdf_name("H1");
        assert_eq!(heading, Some(StandardStructureType::H1));

        let unknown = StandardStructureType::from_pdf_name("Invalid");
        assert_eq!(unknown, None);
    }

    /// The element builders store the ID and the attributes they are given.
    #[test]
    fn test_structure_element_creation() {
        let elem = StructureElement::new(StandardStructureType::H1)
            .with_id("heading1")
            .with_language("en-US")
            .with_actual_text("Chapter One");

        assert_eq!(elem.id, Some(String::from("heading1")));
        assert_eq!(elem.attributes.lang, Some(String::from("en-US")));
        assert_eq!(
            elem.attributes.actual_text,
            Some(String::from("Chapter One"))
        );
    }

    /// The attribute builders store the values they are given.
    #[test]
    fn test_structure_attributes_builder() {
        let bbox = [0.0, 0.0, 100.0, 100.0];
        let attrs = StructureAttributes::new()
            .with_language("es-ES")
            .with_alt_text("Imagen de ejemplo")
            .with_bbox(bbox);

        assert_eq!(attrs.lang, Some(String::from("es-ES")));
        assert_eq!(attrs.alt, Some(String::from("Imagen de ejemplo")));
        assert_eq!(attrs.bbox, Some(bbox));
    }

    /// Added mappings can be looked up; unmapped names yield `None`.
    #[test]
    fn test_role_map() {
        let mut role_map = RoleMap::new();
        role_map.add_mapping("MyHeading", StandardStructureType::H1);
        role_map.add_mapping("MyParagraph", StandardStructureType::P);

        let heading = role_map.get_mapping("MyHeading");
        assert_eq!(heading, Some(&StandardStructureType::H1));

        let paragraph = role_map.get_mapping("MyParagraph");
        assert_eq!(paragraph, Some(&StandardStructureType::P));

        assert_eq!(role_map.get_mapping("Unknown"), None);
    }

    /// Setting a root on an empty tree stores one element and records it as root.
    #[test]
    fn test_struct_tree_creation() {
        let mut tree = StructTree::new();

        let doc_idx = tree.set_root(StructureElement::new(StandardStructureType::Document));

        assert_eq!(tree.root_index(), Some(doc_idx));
        assert_eq!(tree.len(), 1);
    }

    /// Children are linked to their parent in insertion order and indexed by ID.
    #[test]
    fn test_struct_tree_hierarchy() {
        let mut tree = StructTree::new();

        let doc_idx =
            tree.set_root(StructureElement::new(StandardStructureType::Document).with_id("doc1"));

        let heading = StructureElement::new(StandardStructureType::H1)
            .with_id("h1")
            .with_actual_text("Title");
        let h1_idx = tree.add_child(doc_idx, heading).unwrap();

        let paragraph = StructureElement::new(StandardStructureType::P).with_id("p1");
        let p_idx = tree.add_child(doc_idx, paragraph).unwrap();

        assert_eq!(tree.len(), 3);

        let root_children = &tree.get(doc_idx).unwrap().children;
        assert_eq!(root_children.len(), 2);
        assert_eq!(root_children[0], h1_idx);
        assert_eq!(root_children[1], p_idx);

        assert!(tree.get_by_id("h1").is_some());
        assert!(tree.get_by_id("p1").is_some());
        assert!(tree.get_by_id("unknown").is_none());
    }

    /// Marked-content references are appended in order with the given values.
    #[test]
    fn test_marked_content_references() {
        let mut elem = StructureElement::new(StandardStructureType::P);
        for mcid in 1..3 {
            elem.add_mcid(0, mcid);
        }

        assert_eq!(elem.mcids.len(), 2);

        let first = &elem.mcids[0];
        assert_eq!(first.page_index, 0);
        assert_eq!(first.mcid, 1);

        let second = &elem.mcids[1];
        assert_eq!(second.mcid, 2);
    }

    /// `new_custom` produces a `Custom` structure type carrying the given name.
    #[test]
    fn test_custom_structure_type() {
        let elem = StructureElement::new_custom("MyCustomType");

        if let StructureType::Custom(ref name) = elem.structure_type {
            assert_eq!(name, "MyCustomType");
        } else {
            panic!("Expected custom structure type");
        }
    }

    /// Adding a child under a non-existent parent is reported as an error.
    #[test]
    fn test_struct_tree_error_handling() {
        let mut tree = StructTree::new();

        let orphan = StructureElement::new(StandardStructureType::P);
        let outcome = tree.add_child(999, orphan);

        assert!(outcome.is_err());
        let message = outcome.unwrap_err();
        assert!(message.contains("out of bounds"));
    }
}