1use std::collections::HashMap;
18
19use rpdfium_core::{Name, PdfSource};
20use rpdfium_parser::{Object, ObjectId, ObjectStore};
21
22use crate::error::{DocError, DocResult};
23use crate::struct_element::{StructElement, parse_struct_element};
24
25pub use crate::struct_element::{AttributeValue, StructAttribute};
26
/// Upper bound on the number of structure elements gathered while parsing a
/// tree; the traversal stops collecting once this many have been recorded.
const MAX_ELEMENTS: usize = 100_000;

/// Deepest nesting level accepted while parsing; encountering an element
/// nested deeper than this makes parsing fail with `DocError::DepthExceeded`.
const MAX_TREE_DEPTH: usize = 64;
32
/// String constants for commonly used structure type names.
///
/// Each constant holds the exact name as it is spelled in a structure
/// element's type entry, so it can be compared directly against
/// `StructElement::struct_type` or a role-map target.
pub mod structure_types {
    // --- Grouping elements ---

    /// Whole-document grouping element.
    pub const DOCUMENT: &str = "Document";
    /// Large division of a document.
    pub const PART: &str = "Part";
    /// Section grouping element.
    pub const SECT: &str = "Sect";

    // --- Paragraph-like elements ---

    /// Paragraph.
    pub const P: &str = "P";
    /// Heading without an explicit level.
    pub const H: &str = "H";
    /// Level-1 heading.
    pub const H1: &str = "H1";
    /// Level-2 heading.
    pub const H2: &str = "H2";
    /// Level-3 heading.
    pub const H3: &str = "H3";
    /// Level-4 heading.
    pub const H4: &str = "H4";
    /// Level-5 heading.
    pub const H5: &str = "H5";
    /// Level-6 heading.
    pub const H6: &str = "H6";

    // --- Lists ---

    /// List container.
    pub const LIST: &str = "L";
    /// List item.
    pub const LIST_ITEM: &str = "LI";
    /// Label of a list item.
    pub const LABEL: &str = "Lbl";
    /// Body of a list item.
    pub const LIST_BODY: &str = "LBody";

    // --- Tables ---

    /// Table container.
    pub const TABLE: &str = "Table";
    /// Table row.
    pub const TR: &str = "TR";
    /// Table header cell.
    pub const TH: &str = "TH";
    /// Table data cell.
    pub const TD: &str = "TD";

    // --- Inline and illustration elements ---

    /// Generic inline span.
    pub const SPAN: &str = "Span";
    /// Hyperlink.
    pub const LINK: &str = "Link";
    /// Figure / illustration.
    pub const FIGURE: &str = "Figure";
}
58
/// In-memory form of a document's structure tree, as produced by
/// `StructTree::from_catalog`.
#[derive(Debug, Clone)]
pub struct StructTree {
    /// Top-level structure elements parsed from the structure tree root's
    /// kids entry (`Name::k()`); empty when the root has no such entry.
    pub root_elements: Vec<StructElement>,
    /// Role map parsed from the structure tree root: each key is a structure
    /// type name and each value is the name it maps to. Only entries whose
    /// value is a name object are kept.
    pub role_map: HashMap<String, String>,
}
67
68pub fn find_elements_for_mcid(elements: &[StructElement], mcid: i32) -> Vec<usize> {
83 let target = mcid as i64;
84 let mut result = Vec::new();
85 let mut stack: Vec<(usize, &StructElement)> = elements.iter().enumerate().rev().collect();
89
90 while let Some((idx, elem)) = stack.pop() {
91 if elem.mcids.contains(&target) {
92 result.push(idx);
93 }
94 for child in elem.children.iter().rev() {
96 stack.push((idx, child));
97 }
98 }
99 result
100}
101
102impl StructTree {
103 pub fn from_catalog<S: PdfSource>(
108 catalog_dict: &HashMap<Name, Object>,
109 store: &ObjectStore<S>,
110 ) -> DocResult<Option<Self>> {
111 let root_obj = match catalog_dict.get(&Name::struct_tree_root()) {
113 Some(obj) => store
114 .deep_resolve(obj)
115 .map_err(|e| DocError::Parser(e.to_string()))?,
116 None => return Ok(None),
117 };
118
119 let root_dict = match root_obj.as_dict() {
120 Some(d) => d,
121 None => return Ok(None),
122 };
123
124 let role_map = parse_role_map(root_dict, store);
126
127 let root_elements = match root_dict.get(&Name::k()) {
129 Some(k_obj) => parse_k_children(k_obj, store)?,
130 None => Vec::new(),
131 };
132
133 Ok(Some(StructTree {
134 root_elements,
135 role_map,
136 }))
137 }
138
139 pub fn role_map_name_for<'a>(&'a self, struct_type: &'a str) -> &'a str {
146 self.role_map
147 .get(struct_type)
148 .map(|s| s.as_str())
149 .unwrap_or(struct_type)
150 }
151
152 #[inline]
156 pub fn get_role_map_name_for<'a>(&'a self, struct_type: &'a str) -> &'a str {
157 self.role_map_name_for(struct_type)
158 }
159
160 pub fn child_count(&self) -> usize {
164 self.root_elements.len()
165 }
166
167 #[inline]
171 pub fn struct_tree_count_children(&self) -> usize {
172 self.child_count()
173 }
174
175 #[deprecated(
178 since = "0.1.0",
179 note = "use `struct_tree_count_children()` — matches upstream `FPDF_StructTree_CountChildren`"
180 )]
181 #[inline]
182 pub fn count_children(&self) -> usize {
183 self.child_count()
184 }
185
186 pub fn child_at_index(&self, index: usize) -> Option<&StructElement> {
190 self.root_elements.get(index)
191 }
192
193 #[inline]
197 pub fn struct_tree_get_child_at_index(&self, index: usize) -> Option<&StructElement> {
198 self.child_at_index(index)
199 }
200
201 #[deprecated(
204 since = "0.1.0",
205 note = "use `struct_tree_get_child_at_index()` — matches upstream `FPDF_StructTree_GetChildAtIndex`"
206 )]
207 #[inline]
208 pub fn get_child_at_index(&self, index: usize) -> Option<&StructElement> {
209 self.child_at_index(index)
210 }
211
212 pub fn elements_for_page_ref(&self, page_ref: ObjectId) -> ElementsForPage<'_> {
222 ElementsForPage {
223 stack: self.root_elements.iter().rev().collect(),
224 page_ref,
225 }
226 }
227
228 pub fn elements_for_mcid(&self, mcid: i32) -> Vec<usize> {
233 find_elements_for_mcid(&self.root_elements, mcid)
234 }
235
236 #[deprecated(
238 note = "use `elements_for_mcid()` — no public `FPDF_StructTree_GetElementsForMcid` API"
239 )]
240 #[inline]
241 pub fn get_elements_for_mcid(&self, mcid: i32) -> Vec<usize> {
242 self.elements_for_mcid(mcid)
243 }
244
245 pub fn elements_for_page<'a>(
254 &'a self,
255 page_index: usize,
256 page_ids: &[ObjectId],
257 ) -> ElementsForPage<'a> {
258 match page_ids.get(page_index) {
259 Some(&page_ref) => ElementsForPage {
260 stack: self.root_elements.iter().rev().collect(),
261 page_ref,
262 },
263 None => ElementsForPage {
264 stack: Vec::new(),
265 page_ref: ObjectId::new(0, 0),
266 },
267 }
268 }
269}
270
271pub struct ElementsForPage<'a> {
277 stack: Vec<&'a StructElement>,
279 page_ref: ObjectId,
281}
282
283impl<'a> Iterator for ElementsForPage<'a> {
284 type Item = &'a StructElement;
285
286 fn next(&mut self) -> Option<Self::Item> {
287 loop {
288 let elem = self.stack.pop()?;
289 for child in elem.children.iter().rev() {
291 self.stack.push(child);
292 }
293 if elem.page_ref == Some(self.page_ref) {
294 return Some(elem);
295 }
296 }
297 }
298}
299
300fn parse_role_map<S: PdfSource>(
302 root_dict: &HashMap<Name, Object>,
303 store: &ObjectStore<S>,
304) -> HashMap<String, String> {
305 let mut map = HashMap::new();
306 let role_map_obj = match root_dict.get(&Name::role_map()) {
307 Some(obj) => match store.deep_resolve(obj) {
308 Ok(resolved) => resolved,
309 Err(_) => return map,
310 },
311 None => return map,
312 };
313
314 if let Some(dict) = role_map_obj.as_dict() {
315 for (key, value) in dict {
316 if let Some(target) = value.as_name() {
317 map.insert(key.as_str().into_owned(), target.as_str().into_owned());
318 }
319 }
320 }
321 map
322}
323
324fn parse_k_children<S: PdfSource>(
327 k_obj: &Object,
328 store: &ObjectStore<S>,
329) -> DocResult<Vec<StructElement>> {
330 let resolved = store
331 .deep_resolve(k_obj)
332 .map_err(|e| DocError::Parser(e.to_string()))?;
333
334 let top_items: Vec<&Object> = match resolved {
336 Object::Array(arr) => arr.iter().collect(),
337 _ => vec![resolved],
338 };
339
340 let mut flat: Vec<(usize, StructElement)> = Vec::new();
344
345 struct StackEntry<'a> {
347 obj: &'a Object,
348 depth: usize,
349 }
350
351 let mut stack: Vec<StackEntry<'_>> = Vec::new();
353 for item in top_items.iter().rev() {
354 stack.push(StackEntry {
355 obj: item,
356 depth: 0,
357 });
358 }
359
360 while let Some(entry) = stack.pop() {
361 if flat.len() >= MAX_ELEMENTS {
362 break;
363 }
364 if entry.depth > MAX_TREE_DEPTH {
365 return Err(DocError::DepthExceeded);
366 }
367
368 let resolved = match store.deep_resolve(entry.obj) {
369 Ok(r) => r,
370 Err(_) => continue,
371 };
372
373 if resolved.as_i64().is_some() {
375 continue;
376 }
377
378 let dict = match resolved.as_dict() {
379 Some(d) => d,
380 None => continue,
381 };
382
383 let mut elem = parse_struct_element(dict, store);
385
386 if let Some(k_val) = dict.get(&Name::k()) {
388 let k_resolved = match store.deep_resolve(k_val) {
389 Ok(r) => r,
390 Err(_) => {
391 flat.push((entry.depth, elem));
392 continue;
393 }
394 };
395
396 match k_resolved {
397 Object::Integer(n) => {
398 elem.mcids.push(*n);
399 }
400 Object::Dictionary(child_dict) => {
401 if let Some(mcid) = extract_mcid_from_dict(child_dict) {
403 elem.mcids.push(mcid);
404 } else {
405 stack.push(StackEntry {
407 obj: k_val,
408 depth: entry.depth + 1,
409 });
410 }
411 }
412 Object::Array(arr) => {
413 for child in arr.iter().rev() {
416 let child_resolved = match store.deep_resolve(child) {
417 Ok(r) => r,
418 Err(_) => continue,
419 };
420 match child_resolved {
421 Object::Integer(n) => {
422 elem.mcids.push(*n);
423 }
424 Object::Dictionary(child_dict) => {
425 if let Some(mcid) = extract_mcid_from_dict(child_dict) {
426 elem.mcids.push(mcid);
427 } else {
428 stack.push(StackEntry {
429 obj: child,
430 depth: entry.depth + 1,
431 });
432 }
433 }
434 _ => {}
435 }
436 }
437 elem.mcids.reverse();
439 }
440 _ => {}
441 }
442 }
443
444 flat.push((entry.depth, elem));
445 }
446
447 build_tree_from_flat(flat)
448}
449
450fn extract_mcid_from_dict(dict: &HashMap<Name, Object>) -> Option<i64> {
452 dict.get(&Name::mcid()).and_then(|obj| obj.as_i64())
453}
454
455fn build_tree_from_flat(flat: Vec<(usize, StructElement)>) -> DocResult<Vec<StructElement>> {
458 if flat.is_empty() {
459 return Ok(Vec::new());
460 }
461
462 let mut root: Vec<StructElement> = Vec::new();
463 let mut path: Vec<usize> = Vec::new();
464
465 for (depth, mut elem) in flat {
466 path.truncate(depth);
467 let container = get_children_at_path(&mut root, &path);
468 let idx = container.len();
469 if depth > 0 {
472 elem.parent_index = Some(idx);
473 }
474 container.push(elem);
475 if path.len() <= depth {
476 path.push(idx);
477 }
478 }
479
480 Ok(root)
481}
482
483fn get_children_at_path<'a>(
485 root: &'a mut Vec<StructElement>,
486 path: &[usize],
487) -> &'a mut Vec<StructElement> {
488 let mut current = root;
489 for &idx in path {
490 current = &mut current[idx].children;
491 }
492 current
493}
494
495#[derive(Debug, Clone)]
499pub struct McidMapping {
500 entries: HashMap<(ObjectId, i64), usize>,
502 elements: Vec<StructElement>,
504}
505
506impl McidMapping {
507 pub fn from_struct_tree(tree: &StructTree) -> Self {
509 let mut entries = HashMap::new();
510 let mut elements = Vec::new();
511
512 let mut stack: Vec<&StructElement> = tree.root_elements.iter().rev().collect();
514
515 while let Some(elem) = stack.pop() {
516 if !elem.mcids.is_empty() {
517 if let Some(page_id) = elem.page_ref {
518 let idx = elements.len();
519 elements.push(elem.clone());
520 for &mcid in &elem.mcids {
521 entries.insert((page_id, mcid), idx);
522 }
523 }
524 }
525 for child in elem.children.iter().rev() {
527 stack.push(child);
528 }
529 }
530
531 McidMapping { entries, elements }
532 }
533
534 pub fn element_for_mcid(&self, page_id: ObjectId, mcid: i64) -> Option<&StructElement> {
536 self.entries
537 .get(&(page_id, mcid))
538 .map(|&idx| &self.elements[idx])
539 }
540}
541
542#[derive(Debug, Clone)]
544pub struct PageStructure {
545 pub elements: Vec<StructElement>,
547}
548
549impl PageStructure {
550 pub fn for_page(tree: &StructTree, page_id: ObjectId) -> Self {
552 let mut elements = Vec::new();
553
554 let mut stack: Vec<&StructElement> = tree.root_elements.iter().rev().collect();
556
557 while let Some(elem) = stack.pop() {
558 if elem.page_ref == Some(page_id) {
559 elements.push(elem.clone());
560 }
561 for child in elem.children.iter().rev() {
562 stack.push(child);
563 }
564 }
565
566 PageStructure { elements }
567 }
568}
569
#[cfg(test)]
mod tests {
    use super::*;
    use rpdfium_core::PdfString;

    // ------------------------------------------------------------------
    // Fixtures
    // ------------------------------------------------------------------

    /// Serialises a tiny PDF (catalog + empty page tree) so that an
    /// `ObjectStore` can be opened; the tests feed direct objects only.
    fn build_minimal_pdf() -> Vec<u8> {
        let mut pdf: Vec<u8> = Vec::new();
        pdf.extend_from_slice(b"%PDF-1.4\n");

        let catalog_offset = pdf.len();
        pdf.extend_from_slice(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n");
        let pages_offset = pdf.len();
        pdf.extend_from_slice(b"2 0 obj\n<< /Type /Pages /Kids [] /Count 0 >>\nendobj\n");

        let xref_offset = pdf.len();
        pdf.extend_from_slice(b"xref\n0 3\n");
        pdf.extend_from_slice(b"0000000000 65535 f \r\n");
        for offset in [catalog_offset, pages_offset] {
            pdf.extend_from_slice(format!("{:010} 00000 n \r\n", offset).as_bytes());
        }
        pdf.extend_from_slice(b"trailer\n<< /Size 3 /Root 1 0 R >>\n");
        pdf.extend_from_slice(format!("startxref\n{}\n%%EOF", xref_offset).as_bytes());
        pdf
    }

    /// Opens an object store over the minimal PDF in lenient mode.
    fn build_store() -> ObjectStore<Vec<u8>> {
        ObjectStore::open(build_minimal_pdf(), rpdfium_core::ParsingMode::Lenient).unwrap()
    }

    /// PDF string object holding the bytes of `s`.
    fn str_obj(s: &str) -> Object {
        Object::String(PdfString::from_bytes(s.as_bytes().to_vec()))
    }

    /// PDF name object for `s`.
    fn name_obj(s: &str) -> Object {
        Object::Name(Name::from(s))
    }

    /// Structure element dictionary carrying only its type name.
    fn struct_elem_dict(tag: &str) -> HashMap<Name, Object> {
        let mut dict = HashMap::new();
        dict.insert(Name::s(), name_obj(tag));
        dict
    }

    /// Structure element dictionary with a page reference.
    fn elem_on_page(tag: &str, page: ObjectId) -> HashMap<Name, Object> {
        let mut dict = struct_elem_dict(tag);
        dict.insert(Name::pg(), Object::Reference(page));
        dict
    }

    /// Structure element dictionary with a kids entry.
    fn elem_with_k(tag: &str, kids: Object) -> HashMap<Name, Object> {
        let mut dict = struct_elem_dict(tag);
        dict.insert(Name::k(), kids);
        dict
    }

    /// Parses a structure tree whose root dictionary is `root_dict`.
    fn parse_root(root_dict: HashMap<Name, Object>) -> StructTree {
        let store = build_store();
        let mut catalog = HashMap::new();
        catalog.insert(Name::struct_tree_root(), Object::Dictionary(root_dict));
        StructTree::from_catalog(&catalog, &store).unwrap().unwrap()
    }

    /// Parses a structure tree whose root has `kids` as its kids entry.
    fn parse_kids(kids: Object) -> StructTree {
        let mut root_dict = HashMap::new();
        root_dict.insert(Name::k(), kids);
        parse_root(root_dict)
    }

    /// Parses a tree whose root only has the role map used by the role tests.
    fn parse_sample_role_map() -> StructTree {
        let mut role_map_dict = HashMap::new();
        role_map_dict.insert(Name::from("MyTag"), name_obj("P"));
        role_map_dict.insert(Name::from("CustomH"), name_obj("H1"));

        let mut root_dict = HashMap::new();
        root_dict.insert(Name::role_map(), Object::Dictionary(role_map_dict));
        parse_root(root_dict)
    }

    /// Hand-built element with only type, MCIDs and children populated.
    fn bare_element(tag: &str, mcids: Vec<i64>, children: Vec<StructElement>) -> StructElement {
        StructElement {
            struct_type: tag.to_string(),
            obj_type: None,
            alt_text: None,
            actual_text: None,
            lang: None,
            title: None,
            id: None,
            page_ref: None,
            mcids,
            children,
            attributes: Vec::new(),
            parent_index: None,
        }
    }

    /// Counts every element in the forest, including nested ones.
    fn count_elements(roots: &[StructElement]) -> usize {
        let mut total = 0;
        let mut pending: Vec<&StructElement> = roots.iter().collect();
        while let Some(node) = pending.pop() {
            total += 1;
            pending.extend(node.children.iter());
        }
        total
    }

    // ------------------------------------------------------------------
    // from_catalog basics
    // ------------------------------------------------------------------

    #[test]
    fn test_no_struct_tree_root_returns_none() {
        let store = build_store();
        let catalog = HashMap::new();
        let parsed = StructTree::from_catalog(&catalog, &store).unwrap();
        assert!(parsed.is_none());
    }

    #[test]
    fn test_empty_struct_tree_root() {
        let tree = parse_root(HashMap::new());
        assert!(tree.root_elements.is_empty());
        assert!(tree.role_map.is_empty());
    }

    #[test]
    fn test_basic_structure_tree_document_with_paragraphs() {
        let paragraphs = vec![
            Object::Dictionary(struct_elem_dict("P")),
            Object::Dictionary(struct_elem_dict("P")),
        ];
        let doc = elem_with_k("Document", Object::Array(paragraphs));

        let tree = parse_kids(Object::Dictionary(doc));
        assert_eq!(tree.root_elements.len(), 1);

        let document = &tree.root_elements[0];
        assert_eq!(document.struct_type, "Document");
        assert_eq!(document.children.len(), 2);
        assert_eq!(document.children[0].struct_type, "P");
        assert_eq!(document.children[1].struct_type, "P");
    }

    // ------------------------------------------------------------------
    // MCID extraction
    // ------------------------------------------------------------------

    #[test]
    fn test_mcid_from_integer_in_k() {
        let mut p = elem_with_k("P", Object::Integer(42));
        p.insert(Name::pg(), Object::Reference(ObjectId::new(5, 0)));

        let tree = parse_kids(Object::Dictionary(p));
        assert_eq!(tree.root_elements.len(), 1);
        assert_eq!(tree.root_elements[0].mcids, vec![42]);
        assert_eq!(tree.root_elements[0].page_ref, Some(ObjectId::new(5, 0)));
    }

    #[test]
    fn test_mcid_from_dict_in_k() {
        let mut mcr = HashMap::new();
        mcr.insert(Name::mcid(), Object::Integer(7));

        let mut span = elem_with_k("Span", Object::Dictionary(mcr));
        span.insert(Name::pg(), Object::Reference(ObjectId::new(3, 0)));

        let tree = parse_kids(Object::Dictionary(span));
        assert_eq!(tree.root_elements[0].mcids, vec![7]);
    }

    #[test]
    fn test_mixed_k_content_dicts_and_integers() {
        let mut mcr = HashMap::new();
        mcr.insert(Name::mcid(), Object::Integer(3));

        let child = struct_elem_dict("Span");

        let mut p = elem_on_page("P", ObjectId::new(10, 0));
        p.insert(
            Name::k(),
            Object::Array(vec![
                Object::Integer(1),
                Object::Dictionary(mcr),
                Object::Dictionary(child),
            ]),
        );

        let tree = parse_kids(Object::Dictionary(p));
        let elem = &tree.root_elements[0];
        assert_eq!(elem.struct_type, "P");
        assert_eq!(elem.mcids, vec![1, 3]);
        assert_eq!(elem.children.len(), 1);
        assert_eq!(elem.children[0].struct_type, "Span");
    }

    // ------------------------------------------------------------------
    // Element fields
    // ------------------------------------------------------------------

    #[test]
    fn test_alt_text_extraction() {
        let mut fig = struct_elem_dict("Figure");
        fig.insert(Name::alt(), str_obj("A photo of a cat"));

        let tree = parse_kids(Object::Dictionary(fig));
        assert_eq!(
            tree.root_elements[0].alt_text.as_deref(),
            Some("A photo of a cat")
        );
    }

    #[test]
    fn test_title_and_id_extraction() {
        let mut elem = struct_elem_dict("Table");
        elem.insert(Name::t(), str_obj("Sales Data 2026"));
        elem.insert(Name::id(), str_obj("table-001"));

        let tree = parse_kids(Object::Dictionary(elem));
        assert_eq!(
            tree.root_elements[0].title.as_deref(),
            Some("Sales Data 2026")
        );
        assert_eq!(tree.root_elements[0].id.as_deref(), Some("table-001"));
    }

    #[test]
    fn test_actual_text_and_lang() {
        let mut span = struct_elem_dict("Span");
        span.insert(Name::actual_text(), str_obj("Hello World"));
        span.insert(Name::lang(), str_obj("en-US"));

        let tree = parse_kids(Object::Dictionary(span));
        assert_eq!(
            tree.root_elements[0].actual_text.as_deref(),
            Some("Hello World")
        );
        assert_eq!(tree.root_elements[0].lang.as_deref(), Some("en-US"));
    }

    #[test]
    fn test_struct_element_obj_type_none_by_default() {
        let tree = parse_kids(Object::Dictionary(struct_elem_dict("P")));
        assert!(tree.root_elements[0].obj_type.is_none());
    }

    #[test]
    fn test_struct_element_obj_type_parsed() {
        let mut elem = struct_elem_dict("Span");
        elem.insert(Name::obj_type(), Object::Name(Name::from("Elem")));

        let tree = parse_kids(Object::Dictionary(elem));
        assert_eq!(tree.root_elements[0].obj_type.as_deref(), Some("Elem"));
    }

    // ------------------------------------------------------------------
    // Nesting and limits
    // ------------------------------------------------------------------

    #[test]
    fn test_nested_structure_elements() {
        let span = struct_elem_dict("Span");
        let p = elem_with_k("P", Object::Dictionary(span));
        let doc = elem_with_k("Document", Object::Dictionary(p));

        let tree = parse_kids(Object::Dictionary(doc));
        let document = &tree.root_elements[0];
        assert_eq!(document.struct_type, "Document");
        assert_eq!(document.children[0].struct_type, "P");
        assert_eq!(document.children[0].children[0].struct_type, "Span");
    }

    #[test]
    fn test_security_limit_truncates_large_tree() {
        // More children than the parser is willing to collect.
        let count = MAX_ELEMENTS + 10;
        let kids: Vec<Object> = (0..count)
            .map(|_| Object::Dictionary(struct_elem_dict("P")))
            .collect();
        let doc = elem_with_k("Document", Object::Array(kids));

        let tree = parse_kids(Object::Dictionary(doc));
        let total = count_elements(&tree.root_elements);
        assert!(total <= MAX_ELEMENTS + 1);
    }

    // ------------------------------------------------------------------
    // Role map
    // ------------------------------------------------------------------

    #[test]
    fn test_role_mapping() {
        let tree = parse_sample_role_map();
        assert_eq!(tree.role_map.get("MyTag"), Some(&"P".to_string()));
        assert_eq!(tree.role_map.get("CustomH"), Some(&"H1".to_string()));
    }

    #[test]
    fn test_role_map_name_for_lookup() {
        let tree = parse_sample_role_map();
        assert_eq!(tree.role_map_name_for("MyTag"), "P");
        assert_eq!(tree.role_map_name_for("CustomH"), "H1");
        // Names without a mapping come back unchanged.
        assert_eq!(tree.role_map_name_for("P"), "P");
        assert_eq!(tree.role_map_name_for("UnknownTag"), "UnknownTag");
    }

    // ------------------------------------------------------------------
    // Attributes
    // ------------------------------------------------------------------

    #[test]
    fn test_struct_element_with_attributes() {
        let mut attr_dict = HashMap::new();
        attr_dict.insert(Name::o(), name_obj("Layout"));
        attr_dict.insert(Name::from("WritingMode"), name_obj("LrTb"));
        attr_dict.insert(Name::from("SpaceBefore"), Object::Real(12.0));

        let mut td = struct_elem_dict("TD");
        td.insert(Name::a(), Object::Dictionary(attr_dict));

        let tree = parse_kids(Object::Dictionary(td));
        let elem = &tree.root_elements[0];
        assert_eq!(elem.struct_type, "TD");
        assert_eq!(elem.attributes.len(), 1);
        assert_eq!(elem.attributes[0].owner, "Layout");
        assert!(elem.attributes[0].entries.len() >= 2);

        let writing_mode = elem.attributes[0]
            .entries
            .iter()
            .find(|(k, _)| k == "WritingMode");
        assert!(writing_mode.is_some());
        match &writing_mode.unwrap().1 {
            AttributeValue::Name(n) => assert_eq!(n, "LrTb"),
            _ => panic!("expected Name attribute value"),
        }
    }

    #[test]
    fn test_struct_element_with_attribute_array() {
        let mut attr1 = HashMap::new();
        attr1.insert(Name::o(), name_obj("Layout"));
        attr1.insert(Name::from("TextAlign"), name_obj("Center"));

        let mut attr2 = HashMap::new();
        attr2.insert(Name::o(), name_obj("Table"));
        attr2.insert(Name::from("RowSpan"), Object::Integer(2));

        let mut td = struct_elem_dict("TD");
        td.insert(
            Name::a(),
            Object::Array(vec![Object::Dictionary(attr1), Object::Dictionary(attr2)]),
        );

        let tree = parse_kids(Object::Dictionary(td));
        let elem = &tree.root_elements[0];
        assert_eq!(elem.attributes.len(), 2);
        assert_eq!(elem.attributes[0].owner, "Layout");
        assert_eq!(elem.attributes[1].owner, "Table");
    }

    #[test]
    fn test_struct_element_no_attributes() {
        let tree = parse_kids(Object::Dictionary(struct_elem_dict("P")));
        assert!(tree.root_elements[0].attributes.is_empty());
    }

    // ------------------------------------------------------------------
    // Page filtering
    // ------------------------------------------------------------------

    #[test]
    fn test_page_structure_filtering() {
        let page1 = ObjectId::new(5, 0);
        let page2 = ObjectId::new(6, 0);

        let doc = elem_with_k(
            "Document",
            Object::Array(vec![
                Object::Dictionary(elem_on_page("P", page1)),
                Object::Dictionary(elem_on_page("P", page2)),
                Object::Dictionary(elem_on_page("P", page1)),
            ]),
        );

        let tree = parse_kids(Object::Dictionary(doc));

        let page1_struct = PageStructure::for_page(&tree, page1);
        assert_eq!(page1_struct.elements.len(), 2);
        for elem in &page1_struct.elements {
            assert_eq!(elem.page_ref, Some(page1));
        }

        let page2_struct = PageStructure::for_page(&tree, page2);
        assert_eq!(page2_struct.elements.len(), 1);
    }

    #[test]
    fn test_elements_for_page_ref_filters_correctly() {
        let page1 = ObjectId::new(5, 0);
        let page2 = ObjectId::new(6, 0);

        let doc = elem_with_k(
            "Document",
            Object::Array(vec![
                Object::Dictionary(elem_on_page("P", page1)),
                Object::Dictionary(elem_on_page("H1", page2)),
                Object::Dictionary(elem_on_page("Span", page1)),
            ]),
        );

        let tree = parse_kids(Object::Dictionary(doc));

        let page1_elems: Vec<&StructElement> = tree.elements_for_page_ref(page1).collect();
        assert_eq!(page1_elems.len(), 2);
        assert!(page1_elems.iter().all(|e| e.page_ref == Some(page1)));
        let types: Vec<&str> = page1_elems.iter().map(|e| e.struct_type.as_str()).collect();
        assert!(types.contains(&"P"));
        assert!(types.contains(&"Span"));

        let page2_elems: Vec<&StructElement> = tree.elements_for_page_ref(page2).collect();
        assert_eq!(page2_elems.len(), 1);
        assert_eq!(page2_elems[0].struct_type, "H1");

        // A page that no element references yields nothing.
        let absent = ObjectId::new(99, 0);
        let absent_elems: Vec<&StructElement> = tree.elements_for_page_ref(absent).collect();
        assert!(absent_elems.is_empty());
    }

    #[test]
    fn test_elements_for_page_with_valid_index() {
        let page0 = ObjectId::new(10, 0);
        let page1 = ObjectId::new(11, 0);

        let doc = elem_with_k(
            "Document",
            Object::Array(vec![
                Object::Dictionary(elem_on_page("H1", page0)),
                Object::Dictionary(elem_on_page("P", page1)),
            ]),
        );

        let tree = parse_kids(Object::Dictionary(doc));
        let page_ids = vec![page0, page1];

        let idx0_elems: Vec<&StructElement> = tree.elements_for_page(0, &page_ids).collect();
        assert_eq!(idx0_elems.len(), 1);
        assert_eq!(idx0_elems[0].struct_type, "H1");

        let idx1_elems: Vec<&StructElement> = tree.elements_for_page(1, &page_ids).collect();
        assert_eq!(idx1_elems.len(), 1);
        assert_eq!(idx1_elems[0].struct_type, "P");
    }

    #[test]
    fn test_elements_for_page_out_of_range_returns_empty() {
        let page0 = ObjectId::new(20, 0);
        let tree = parse_kids(Object::Dictionary(elem_on_page("P", page0)));

        let page_ids = vec![page0];
        let elems: Vec<&StructElement> = tree.elements_for_page(5, &page_ids).collect();
        assert!(elems.is_empty());
    }

    // ------------------------------------------------------------------
    // MCID lookups
    // ------------------------------------------------------------------

    #[test]
    fn test_mcid_mapping_lookup() {
        let page_id = ObjectId::new(7, 0);

        let mut p = elem_on_page("P", page_id);
        p.insert(
            Name::k(),
            Object::Array(vec![Object::Integer(0), Object::Integer(1)]),
        );

        let tree = parse_kids(Object::Dictionary(p));
        let mapping = McidMapping::from_struct_tree(&tree);

        let elem = mapping.element_for_mcid(page_id, 0).unwrap();
        assert_eq!(elem.struct_type, "P");
        let elem1 = mapping.element_for_mcid(page_id, 1).unwrap();
        assert_eq!(elem1.struct_type, "P");
        assert!(mapping.element_for_mcid(page_id, 99).is_none());
        assert!(mapping.element_for_mcid(ObjectId::new(999, 0), 0).is_none());
    }

    #[test]
    fn test_find_elements_for_mcid_empty() {
        let result = find_elements_for_mcid(&[], 42);
        assert!(result.is_empty());
    }

    #[test]
    fn test_find_elements_for_mcid_found() {
        let elements = vec![
            bare_element("P", vec![5], Vec::new()),
            bare_element("Span", vec![10], Vec::new()),
        ];

        let result = find_elements_for_mcid(&elements, 5);
        assert_eq!(result, vec![0]);

        let result = find_elements_for_mcid(&elements, 10);
        assert_eq!(result, vec![1]);

        // An MCID carried by a nested child is reported under its root.
        let child = bare_element("Span", vec![99], Vec::new());
        let parent = bare_element("P", Vec::new(), vec![child]);
        let result = find_elements_for_mcid(&[parent], 99);
        assert_eq!(result, vec![0]);
    }

    #[test]
    fn test_find_elements_for_mcid_not_found() {
        let elem = bare_element("P", vec![1, 2, 3], Vec::new());
        let result = find_elements_for_mcid(&[elem], 999);
        assert!(result.is_empty());
    }
}