// oxidize_pdf/structure/tagged.rs

//! Tagged PDF Support - ISO 32000-1 Section 14.8 (Experimental)
//!
//! Tagged PDF is a framework for document structure and accessibility.
//! This module provides **structure tree generation** with parent references,
//! attributes, and role mapping.
//!
//! # ⚠️ Status (v1.4.0)
//!
//! - ✅ **Structure tree hierarchy** - Fully implemented
//! - ✅ **Parent references** - ISO 32000-1 §14.7.2 compliant
//! - ✅ **Attributes** - Lang, Alt, ActualText, Title, BBox
//! - ✅ **RoleMap** - Custom to standard type mapping
//! - ❌ **Marked content operators** - Not yet implemented (v1.5.0)
//! - ❌ **Automatic MCID assignment** - Not yet implemented (v1.5.0)
//! - ❌ **PDF/UA compliance** - Not yet achieved (v1.5.0)
//!
//! **Note**: While the structure tree is generated correctly, the content is NOT
//! automatically marked with BMC/BDC/EMC operators. This means screen readers
//! will see the structure but cannot navigate the actual content. For full
//! accessibility, wait for v1.5.0 or add marked content manually.
//!
//! # Key Components
//!
//! - **Structure Tree Root** (`/StructTreeRoot` in catalog): Root of structure hierarchy
//! - **Structure Elements** (`/StructElem`): Elements forming the document tree
//! - **Role Map**: Maps custom structure types to standard types
//! - **Marked Content** (v1.5.0): Associates content with structure elements via MCIDs
//!
//! # Standard Structure Types
//!
//! ISO 32000-1 Table 337 defines standard structure types:
//! - Grouping: Document, Part, Sect, Div, Art, BlockQuote
//! - Paragraphs: P, H, H1-H6
//! - Lists: L, LI, Lbl, LBody
//! - Tables: Table, TR, TH, TD
//! - Inline: Span, Quote, Note, Reference, Code
//! - Illustration: Figure, Formula, Form
//!
//! # Example
//!
//! ```rust,no_run
//! use oxidize_pdf::structure::{StructTree, StructureElement, StandardStructureType};
//!
//! # fn example() -> Result<(), Box<dyn std::error::Error>> {
//! let mut tree = StructTree::new();
//!
//! // Create document root
//! let doc_elem = StructureElement::new(StandardStructureType::Document);
//! let doc_idx = tree.set_root(doc_elem);
//!
//! // Add heading with language attribute
//! let heading = StructureElement::new(StandardStructureType::H1)
//!     .with_language("en-US")
//!     .with_actual_text("Welcome to Tagged PDF");
//! let h1_idx = tree.add_child(doc_idx, heading)?;
//!
//! // Add paragraph
//! let para = StructureElement::new(StandardStructureType::P);
//! tree.add_child(doc_idx, para)?;
//!
//! # Ok(())
//! # }
//! ```

use std::collections::HashMap;

/// Standard structure types defined in ISO 32000-1 Table 337
///
/// Each variant corresponds to exactly one PDF name; use
/// [`StandardStructureType::as_pdf_name`] and
/// [`StandardStructureType::from_pdf_name`] to convert in either direction.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum StandardStructureType {
    // Grouping Elements
    /// Document root element
    Document,
    /// Part of a document
    Part,
    /// Section (generic division)
    Sect,
    /// Generic block-level division
    Div,
    /// Article
    Art,
    /// Block quotation
    BlockQuote,
    /// Caption (for figures, tables, etc.)
    Caption,
    /// Table of contents
    TOC,
    /// Table of contents item
    TOCI,
    /// Index
    Index,

    // Paragraph-like Elements
    /// Generic paragraph
    P,
    /// Generic heading (when level unknown)
    H,
    /// Heading level 1
    H1,
    /// Heading level 2
    H2,
    /// Heading level 3
    H3,
    /// Heading level 4
    H4,
    /// Heading level 5
    H5,
    /// Heading level 6
    H6,

    // List Elements
    /// List
    L,
    /// List item
    LI,
    /// Label for list item (bullet or number)
    Lbl,
    /// List item body
    LBody,

    // Table Elements
    /// Table
    Table,
    /// Table row
    TR,
    /// Table header cell
    TH,
    /// Table data cell
    TD,
    /// Table header row group
    THead,
    /// Table body row group
    TBody,
    /// Table footer row group
    TFoot,

    // Inline Elements
    /// Generic inline span
    Span,
    /// Quotation
    Quote,
    /// Note or footnote
    Note,
    /// Reference to external content
    Reference,
    /// Bibliographic entry
    BibEntry,
    /// Computer code
    Code,
    /// Hyperlink
    Link,
    /// Annotation reference
    Annot,

    // Illustration Elements
    /// Figure or illustration
    Figure,
    /// Mathematical formula
    Formula,
    /// Interactive form element
    Form,

    // Ruby and Warichu (for Asian languages)
    /// Ruby annotation (Asian text)
    Ruby,
    /// Ruby base text
    RB,
    /// Ruby text
    RT,
    /// Ruby punctuation
    RP,
    /// Warichu annotation
    Warichu,
    /// Warichu text
    WT,
    /// Warichu punctuation
    WP,

    // Special
    /// Non-structural element (decorative content)
    NonStruct,
    /// Private element (application-specific)
    Private,
}

impl StandardStructureType {
    /// Returns the PDF name for this structure type.
    ///
    /// The returned string is the variant's identifier spelled exactly as the
    /// variant itself (e.g. `H1` -> `"H1"`), without a leading `/`.
    pub fn as_pdf_name(&self) -> &'static str {
        match self {
            // Grouping
            Self::Document => "Document",
            Self::Part => "Part",
            Self::Sect => "Sect",
            Self::Div => "Div",
            Self::Art => "Art",
            Self::BlockQuote => "BlockQuote",
            Self::Caption => "Caption",
            Self::TOC => "TOC",
            Self::TOCI => "TOCI",
            Self::Index => "Index",
            // Paragraph-like
            Self::P => "P",
            Self::H => "H",
            Self::H1 => "H1",
            Self::H2 => "H2",
            Self::H3 => "H3",
            Self::H4 => "H4",
            Self::H5 => "H5",
            Self::H6 => "H6",
            // Lists
            Self::L => "L",
            Self::LI => "LI",
            Self::Lbl => "Lbl",
            Self::LBody => "LBody",
            // Tables
            Self::Table => "Table",
            Self::TR => "TR",
            Self::TH => "TH",
            Self::TD => "TD",
            Self::THead => "THead",
            Self::TBody => "TBody",
            Self::TFoot => "TFoot",
            // Inline
            Self::Span => "Span",
            Self::Quote => "Quote",
            Self::Note => "Note",
            Self::Reference => "Reference",
            Self::BibEntry => "BibEntry",
            Self::Code => "Code",
            Self::Link => "Link",
            Self::Annot => "Annot",
            // Illustration
            Self::Figure => "Figure",
            Self::Formula => "Formula",
            Self::Form => "Form",
            // Ruby and Warichu
            Self::Ruby => "Ruby",
            Self::RB => "RB",
            Self::RT => "RT",
            Self::RP => "RP",
            Self::Warichu => "Warichu",
            Self::WT => "WT",
            Self::WP => "WP",
            // Special
            Self::NonStruct => "NonStruct",
            Self::Private => "Private",
        }
    }

    /// Parses a PDF name into a standard structure type.
    ///
    /// Matching is exact and case-sensitive, and `name` must not include a
    /// leading `/`. Returns `None` for any name that is not one of the
    /// standard types listed on this enum.
    pub fn from_pdf_name(name: &str) -> Option<Self> {
        let parsed = match name {
            // Grouping
            "Document" => Self::Document,
            "Part" => Self::Part,
            "Sect" => Self::Sect,
            "Div" => Self::Div,
            "Art" => Self::Art,
            "BlockQuote" => Self::BlockQuote,
            "Caption" => Self::Caption,
            "TOC" => Self::TOC,
            "TOCI" => Self::TOCI,
            "Index" => Self::Index,
            // Paragraph-like
            "P" => Self::P,
            "H" => Self::H,
            "H1" => Self::H1,
            "H2" => Self::H2,
            "H3" => Self::H3,
            "H4" => Self::H4,
            "H5" => Self::H5,
            "H6" => Self::H6,
            // Lists
            "L" => Self::L,
            "LI" => Self::LI,
            "Lbl" => Self::Lbl,
            "LBody" => Self::LBody,
            // Tables
            "Table" => Self::Table,
            "TR" => Self::TR,
            "TH" => Self::TH,
            "TD" => Self::TD,
            "THead" => Self::THead,
            "TBody" => Self::TBody,
            "TFoot" => Self::TFoot,
            // Inline
            "Span" => Self::Span,
            "Quote" => Self::Quote,
            "Note" => Self::Note,
            "Reference" => Self::Reference,
            "BibEntry" => Self::BibEntry,
            "Code" => Self::Code,
            "Link" => Self::Link,
            "Annot" => Self::Annot,
            // Illustration
            "Figure" => Self::Figure,
            "Formula" => Self::Formula,
            "Form" => Self::Form,
            // Ruby and Warichu
            "Ruby" => Self::Ruby,
            "RB" => Self::RB,
            "RT" => Self::RT,
            "RP" => Self::RP,
            "Warichu" => Self::Warichu,
            "WT" => Self::WT,
            "WP" => Self::WP,
            // Special
            "NonStruct" => Self::NonStruct,
            "Private" => Self::Private,
            // Anything else is not a standard type (it may be a custom one).
            _ => return None,
        };
        Some(parsed)
    }
}

/// Attributes that can be attached to structure elements
///
/// These attributes provide additional semantic information for accessibility
/// and document understanding. Every field is optional; a freshly created
/// value has all `Option` fields set to `None` and an empty `custom` map.
#[derive(Debug, Clone, Default)]
pub struct StructureAttributes {
    /// Language of the element content (e.g., "en-US", "es-ES", "zh-CN")
    pub lang: Option<String>,

    /// Alternate description (for accessibility - used when content cannot be extracted)
    pub alt: Option<String>,

    /// Actual text representation (replacement text for abbreviations, symbols, etc.)
    pub actual_text: Option<String>,

    /// Expansion of an abbreviation
    pub expanded: Option<String>,

    /// Title or label for the element
    pub title: Option<String>,

    /// Bounding box (Left, Bottom, Right, Top)
    pub bbox: Option<[f64; 4]>,

    /// Custom attributes (for application-specific metadata)
    pub custom: HashMap<String, String>,
}

impl StructureAttributes {
    /// Creates a new empty attributes set (equivalent to `Default::default()`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the language attribute, replacing any previous value.
    pub fn with_language(mut self, lang: impl Into<String>) -> Self {
        self.lang = Some(lang.into());
        self
    }

    /// Sets the alt text attribute (for accessibility), replacing any previous value.
    pub fn with_alt_text(mut self, alt: impl Into<String>) -> Self {
        self.alt = Some(alt.into());
        self
    }

    /// Sets the actual text attribute, replacing any previous value.
    pub fn with_actual_text(mut self, text: impl Into<String>) -> Self {
        self.actual_text = Some(text.into());
        self
    }

    /// Sets the abbreviation expansion attribute, replacing any previous value.
    pub fn with_expanded(mut self, expanded: impl Into<String>) -> Self {
        self.expanded = Some(expanded.into());
        self
    }

    /// Sets the title attribute, replacing any previous value.
    pub fn with_title(mut self, title: impl Into<String>) -> Self {
        self.title = Some(title.into());
        self
    }

    /// Sets the bounding box attribute as `[left, bottom, right, top]`.
    ///
    /// The values are stored as given; no ordering or range check is applied.
    pub fn with_bbox(mut self, bbox: [f64; 4]) -> Self {
        self.bbox = Some(bbox);
        self
    }
}

363/// A structure element in the document structure tree
364///
365/// Structure elements form a hierarchical tree that describes the logical
366/// organization of the document content.
367#[derive(Debug, Clone)]
368pub struct StructureElement {
369    /// The structure type (either standard or custom)
370    pub structure_type: StructureType,
371
372    /// Element ID (optional, used for referencing)
373    pub id: Option<String>,
374
375    /// Attributes for this element
376    pub attributes: StructureAttributes,
377
378    /// Child elements (element IDs)
379    pub children: Vec<usize>,
380
381    /// Marked content references (MCIDs) associated with this element
382    pub mcids: Vec<MarkedContentReference>,
383}
384
385/// Structure type - either standard or custom (mapped via RoleMap)
386#[derive(Debug, Clone, PartialEq)]
387pub enum StructureType {
388    /// Standard structure type defined by PDF spec
389    Standard(StandardStructureType),
390    /// Custom structure type (must be mapped in RoleMap)
391    Custom(String),
392}
393
394impl StructureType {
395    /// Returns the PDF name for this structure type
396    pub fn as_pdf_name(&self) -> String {
397        match self {
398            Self::Standard(std_type) => std_type.as_pdf_name().to_string(),
399            Self::Custom(name) => name.clone(),
400        }
401    }
402}
403
/// Reference to marked content in a content stream
///
/// Marked content is delimited by BMC/BDC (begin) and EMC (end) operators,
/// and associated with structure elements via MCIDs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkedContentReference {
    /// Page index where the marked content appears
    pub page_index: usize,

    /// Marked Content ID within the page's content stream
    pub mcid: u32,
}

417impl StructureElement {
418    /// Creates a new structure element with the given type
419    pub fn new(structure_type: StandardStructureType) -> Self {
420        Self {
421            structure_type: StructureType::Standard(structure_type),
422            id: None,
423            attributes: StructureAttributes::new(),
424            children: Vec::new(),
425            mcids: Vec::new(),
426        }
427    }
428
429    /// Creates a new structure element with a custom type
430    pub fn new_custom(type_name: impl Into<String>) -> Self {
431        Self {
432            structure_type: StructureType::Custom(type_name.into()),
433            id: None,
434            attributes: StructureAttributes::new(),
435            children: Vec::new(),
436            mcids: Vec::new(),
437        }
438    }
439
440    /// Sets the element ID
441    pub fn with_id(mut self, id: impl Into<String>) -> Self {
442        self.id = Some(id.into());
443        self
444    }
445
446    /// Sets the language attribute
447    pub fn with_language(mut self, lang: impl Into<String>) -> Self {
448        self.attributes.lang = Some(lang.into());
449        self
450    }
451
452    /// Sets the alt text attribute
453    pub fn with_alt_text(mut self, alt: impl Into<String>) -> Self {
454        self.attributes.alt = Some(alt.into());
455        self
456    }
457
458    /// Sets the actual text attribute
459    pub fn with_actual_text(mut self, text: impl Into<String>) -> Self {
460        self.attributes.actual_text = Some(text.into());
461        self
462    }
463
464    /// Sets the title attribute
465    pub fn with_title(mut self, title: impl Into<String>) -> Self {
466        self.attributes.title = Some(title.into());
467        self
468    }
469
470    /// Adds a marked content reference to this element
471    pub fn add_mcid(&mut self, page_index: usize, mcid: u32) {
472        self.mcids.push(MarkedContentReference { page_index, mcid });
473    }
474
475    /// Adds a child element (by index in the structure tree)
476    pub fn add_child(&mut self, child_index: usize) {
477        self.children.push(child_index);
478    }
479}
480
481/// Role map - maps custom structure types to standard types
482///
483/// Allows extending the structure type system while maintaining
484/// compatibility with standard types.
485#[derive(Debug, Clone, Default)]
486pub struct RoleMap {
487    mappings: HashMap<String, StandardStructureType>,
488}
489
490impl RoleMap {
491    /// Creates a new empty role map
492    pub fn new() -> Self {
493        Self::default()
494    }
495
496    /// Adds a mapping from a custom type to a standard type
497    pub fn add_mapping(
498        &mut self,
499        custom_type: impl Into<String>,
500        standard_type: StandardStructureType,
501    ) {
502        self.mappings.insert(custom_type.into(), standard_type);
503    }
504
505    /// Gets the standard type for a custom type (if mapped)
506    pub fn get_mapping(&self, custom_type: &str) -> Option<&StandardStructureType> {
507        self.mappings.get(custom_type)
508    }
509
510    /// Returns all mappings
511    pub fn mappings(&self) -> &HashMap<String, StandardStructureType> {
512        &self.mappings
513    }
514}
515
516/// Structure tree - hierarchical organization of document structure
517///
518/// The structure tree describes the logical organization of content in
519/// a tagged PDF document.
520#[derive(Debug, Clone)]
521pub struct StructTree {
522    /// All structure elements in the tree (indexed by position)
523    elements: Vec<StructureElement>,
524
525    /// Index of the root element (typically Document)
526    root_index: Option<usize>,
527
528    /// Role map for custom structure types
529    pub role_map: RoleMap,
530
531    /// ID tree for quick lookup by element ID
532    id_map: HashMap<String, usize>,
533}
534
535impl Default for StructTree {
536    fn default() -> Self {
537        Self::new()
538    }
539}
540
541impl StructTree {
542    /// Creates a new empty structure tree
543    pub fn new() -> Self {
544        Self {
545            elements: Vec::new(),
546            root_index: None,
547            role_map: RoleMap::new(),
548            id_map: HashMap::new(),
549        }
550    }
551
552    /// Adds a root element to the tree (typically Document)
553    pub fn set_root(&mut self, element: StructureElement) -> usize {
554        let index = self.elements.len();
555
556        // Update ID map if element has an ID
557        if let Some(ref id) = element.id {
558            self.id_map.insert(id.clone(), index);
559        }
560
561        self.elements.push(element);
562        self.root_index = Some(index);
563        index
564    }
565
566    /// Adds an element as a child of another element
567    pub fn add_child(
568        &mut self,
569        parent_index: usize,
570        element: StructureElement,
571    ) -> Result<usize, String> {
572        if parent_index >= self.elements.len() {
573            return Err(format!("Parent index {} out of bounds", parent_index));
574        }
575
576        let child_index = self.elements.len();
577
578        // Update ID map if element has an ID
579        if let Some(ref id) = element.id {
580            self.id_map.insert(id.clone(), child_index);
581        }
582
583        self.elements.push(element);
584        self.elements[parent_index].add_child(child_index);
585
586        Ok(child_index)
587    }
588
589    /// Gets an element by index
590    pub fn get(&self, index: usize) -> Option<&StructureElement> {
591        self.elements.get(index)
592    }
593
594    /// Gets a mutable reference to an element by index
595    pub fn get_mut(&mut self, index: usize) -> Option<&mut StructureElement> {
596        self.elements.get_mut(index)
597    }
598
599    /// Gets an element by ID
600    pub fn get_by_id(&self, id: &str) -> Option<&StructureElement> {
601        self.id_map.get(id).and_then(|&index| self.get(index))
602    }
603
604    /// Gets the root element index
605    pub fn root_index(&self) -> Option<usize> {
606        self.root_index
607    }
608
609    /// Gets the root element
610    pub fn root(&self) -> Option<&StructureElement> {
611        self.root_index.and_then(|index| self.get(index))
612    }
613
614    /// Returns the total number of elements in the tree
615    pub fn len(&self) -> usize {
616        self.elements.len()
617    }
618
619    /// Returns true if the tree is empty
620    pub fn is_empty(&self) -> bool {
621        self.elements.is_empty()
622    }
623
624    /// Returns an iterator over all elements
625    pub fn iter(&self) -> impl Iterator<Item = &StructureElement> {
626        self.elements.iter()
627    }
628}
629
// Unit tests covering name conversion, the builder APIs, role mapping and
// structure tree construction.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_standard_structure_type_names() {
        assert_eq!(StandardStructureType::Document.as_pdf_name(), "Document");
        assert_eq!(StandardStructureType::H1.as_pdf_name(), "H1");
        assert_eq!(StandardStructureType::P.as_pdf_name(), "P");
        assert_eq!(StandardStructureType::Figure.as_pdf_name(), "Figure");
        assert_eq!(StandardStructureType::Table.as_pdf_name(), "Table");
    }

    #[test]
    fn test_standard_structure_type_parsing() {
        assert_eq!(
            StandardStructureType::from_pdf_name("Document"),
            Some(StandardStructureType::Document)
        );
        assert_eq!(
            StandardStructureType::from_pdf_name("H1"),
            Some(StandardStructureType::H1)
        );
        assert_eq!(StandardStructureType::from_pdf_name("Invalid"), None);
    }

    #[test]
    fn test_structure_element_creation() {
        let elem = StructureElement::new(StandardStructureType::H1)
            .with_id("heading1")
            .with_language("en-US")
            .with_actual_text("Chapter One");

        assert_eq!(elem.id, Some("heading1".to_string()));
        assert_eq!(elem.attributes.lang, Some("en-US".to_string()));
        assert_eq!(elem.attributes.actual_text, Some("Chapter One".to_string()));
    }

    #[test]
    fn test_structure_attributes_builder() {
        let attrs = StructureAttributes::new()
            .with_language("es-ES")
            .with_alt_text("Imagen de ejemplo")
            .with_bbox([0.0, 0.0, 100.0, 100.0]);

        assert_eq!(attrs.lang, Some("es-ES".to_string()));
        assert_eq!(attrs.alt, Some("Imagen de ejemplo".to_string()));
        assert_eq!(attrs.bbox, Some([0.0, 0.0, 100.0, 100.0]));
    }

    #[test]
    fn test_role_map() {
        let mut role_map = RoleMap::new();
        role_map.add_mapping("MyHeading", StandardStructureType::H1);
        role_map.add_mapping("MyParagraph", StandardStructureType::P);

        assert_eq!(
            role_map.get_mapping("MyHeading"),
            Some(&StandardStructureType::H1)
        );
        assert_eq!(
            role_map.get_mapping("MyParagraph"),
            Some(&StandardStructureType::P)
        );
        assert_eq!(role_map.get_mapping("Unknown"), None);
    }

    #[test]
    fn test_struct_tree_creation() {
        let mut tree = StructTree::new();

        // Add root document element
        let doc = StructureElement::new(StandardStructureType::Document);
        let doc_idx = tree.set_root(doc);

        assert_eq!(tree.root_index(), Some(doc_idx));
        assert_eq!(tree.len(), 1);
    }

    #[test]
    fn test_struct_tree_hierarchy() {
        let mut tree = StructTree::new();

        // Create document root
        let doc = StructureElement::new(StandardStructureType::Document).with_id("doc1");
        let doc_idx = tree.set_root(doc);

        // Add heading
        let h1 = StructureElement::new(StandardStructureType::H1)
            .with_id("h1")
            .with_actual_text("Title");
        let h1_idx = tree.add_child(doc_idx, h1).unwrap();

        // Add paragraph
        let para = StructureElement::new(StandardStructureType::P).with_id("p1");
        let p_idx = tree.add_child(doc_idx, para).unwrap();

        assert_eq!(tree.len(), 3);
        assert_eq!(tree.get(doc_idx).unwrap().children.len(), 2);
        assert_eq!(tree.get(doc_idx).unwrap().children[0], h1_idx);
        assert_eq!(tree.get(doc_idx).unwrap().children[1], p_idx);

        // Test ID lookup
        assert!(tree.get_by_id("h1").is_some());
        assert!(tree.get_by_id("p1").is_some());
        assert!(tree.get_by_id("unknown").is_none());
    }

    #[test]
    fn test_marked_content_references() {
        let mut elem = StructureElement::new(StandardStructureType::P);
        elem.add_mcid(0, 1);
        elem.add_mcid(0, 2);

        assert_eq!(elem.mcids.len(), 2);
        assert_eq!(elem.mcids[0].page_index, 0);
        assert_eq!(elem.mcids[0].mcid, 1);
        assert_eq!(elem.mcids[1].mcid, 2);
    }

    #[test]
    fn test_custom_structure_type() {
        let elem = StructureElement::new_custom("MyCustomType");

        // Comparing the whole value reports both sides on failure.
        assert_eq!(
            elem.structure_type,
            StructureType::Custom("MyCustomType".to_string())
        );
    }

    #[test]
    fn test_struct_tree_error_handling() {
        let mut tree = StructTree::new();

        // Try to add child to non-existent parent
        let elem = StructureElement::new(StandardStructureType::P);
        let result = tree.add_child(999, elem);

        assert!(result.is_err());
        assert!(result.unwrap_err().contains("out of bounds"));
    }
}