Skip to main content

pdfplumber_core/
struct_tree.rs

1//! PDF structure tree types for tagged PDF access.
2//!
3//! Tagged PDFs contain a logical structure tree that describes the document's
4//! semantic structure (headings, paragraphs, tables, lists). This module provides
5//! the [`StructElement`] type for representing structure tree nodes.
6
7use crate::geometry::BBox;
8
9/// A node in the PDF structure tree.
10///
11/// Represents a logical structure element from a tagged PDF's `/StructTreeRoot`.
12/// Each element has a type (e.g., "H1", "P", "Table"), optional marked content
13/// identifiers (MCIDs) linking it to page content, and optional child elements
14/// forming a tree structure.
15///
16/// # Tagged PDF Support
17///
18/// Tagged PDFs (ISO 32000-1, Section 14.8) embed semantic structure that is
19/// critical for accessibility and increasingly important for AI/LLM document
20/// understanding. The structure tree maps logical elements (headings, paragraphs,
21/// tables) to their visual representation on the page via MCID references.
22///
23/// # Example
24///
25/// ```
26/// use pdfplumber_core::StructElement;
27///
28/// let heading = StructElement {
29///     element_type: "H1".to_string(),
30///     mcids: vec![0],
31///     alt_text: None,
32///     actual_text: Some("Chapter 1".to_string()),
33///     lang: Some("en".to_string()),
34///     bbox: None,
35///     children: vec![],
36///     page_index: Some(0),
37/// };
38/// assert_eq!(heading.element_type, "H1");
39/// assert_eq!(heading.mcids, vec![0]);
40/// ```
41#[derive(Debug, Clone, PartialEq)]
42#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
43pub struct StructElement {
44    /// The structure type name (e.g., "Document", "H1", "P", "Table", "TR", "TD",
45    /// "L", "LI", "Span", "Figure").
46    pub element_type: String,
47    /// Marked content identifiers linking this element to page content.
48    /// Each MCID corresponds to a marked-content sequence in a page's content stream.
49    pub mcids: Vec<u32>,
50    /// Alternative text for accessibility (from `/Alt` entry).
51    pub alt_text: Option<String>,
52    /// Replacement text for the element's content (from `/ActualText` entry).
53    pub actual_text: Option<String>,
54    /// Language of the element's content (from `/Lang` entry, e.g., "en-US").
55    pub lang: Option<String>,
56    /// Bounding box of the element, if available.
57    pub bbox: Option<BBox>,
58    /// Child structure elements forming the tree hierarchy.
59    pub children: Vec<StructElement>,
60    /// Page index (0-based) this element belongs to, if determinable.
61    pub page_index: Option<usize>,
62}
63
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a node with the given structure type, MCIDs and page index.
    ///
    /// All remaining fields are left empty (`None` / no children) so each test
    /// only has to override what it is actually exercising.
    fn node(tag: &str, mcids: Vec<u32>, page_index: Option<usize>) -> StructElement {
        StructElement {
            element_type: tag.to_string(),
            mcids,
            alt_text: None,
            actual_text: None,
            lang: None,
            bbox: None,
            children: Vec::new(),
            page_index,
        }
    }

    /// A plain paragraph keeps its type, MCIDs and page, with nothing else set.
    #[test]
    fn struct_element_basic_creation() {
        let para = node("P", vec![1, 2], Some(0));

        assert_eq!(para.element_type, "P");
        assert_eq!(para.mcids, vec![1, 2]);
        assert_eq!(para.alt_text, None);
        assert_eq!(para.actual_text, None);
        assert_eq!(para.lang, None);
        assert!(para.bbox.is_none());
        assert_eq!(para.children.len(), 0);
        assert_eq!(para.page_index, Some(0));
    }

    /// Every optional attribute can be populated and read back.
    #[test]
    fn struct_element_with_all_fields() {
        let figure = StructElement {
            alt_text: Some("A bar chart showing quarterly revenue".to_string()),
            actual_text: Some("Revenue chart".to_string()),
            lang: Some("en-US".to_string()),
            bbox: Some(BBox::new(72.0, 100.0, 540.0, 400.0)),
            ..node("Figure", vec![5], Some(2))
        };

        assert_eq!(figure.element_type, "Figure");
        assert_eq!(
            figure.alt_text.as_deref(),
            Some("A bar chart showing quarterly revenue")
        );
        assert_eq!(figure.actual_text.as_deref(), Some("Revenue chart"));
        assert_eq!(figure.lang.as_deref(), Some("en-US"));
        assert!(figure.bbox.is_some());
        assert_eq!(figure.page_index, Some(2));
    }

    /// Direct children are stored in insertion order.
    #[test]
    fn struct_element_with_children() {
        let spans = vec![
            node("Span", vec![1], Some(0)),
            node("Span", vec![2], Some(0)),
        ];
        let para = StructElement {
            children: spans,
            ..node("P", Vec::new(), Some(0))
        };

        let kids = &para.children;
        assert_eq!(kids.len(), 2);
        assert_eq!(kids[0].element_type, "Span");
        assert_eq!(kids[1].mcids, vec![2]);
    }

    /// A Table -> TR -> TD hierarchy can be built and walked level by level.
    #[test]
    fn struct_element_nested_tree() {
        let row = StructElement {
            children: vec![
                node("TD", vec![10], Some(0)),
                node("TD", vec![11], Some(0)),
            ],
            ..node("TR", Vec::new(), Some(0))
        };
        let table = StructElement {
            children: vec![row],
            ..node("Table", Vec::new(), Some(0))
        };

        let rows = &table.children;
        assert_eq!(rows.len(), 1);
        assert_eq!(rows[0].element_type, "TR");

        let cells = &rows[0].children;
        assert_eq!(cells.len(), 2);
        assert_eq!(cells[0].element_type, "TD");
        assert_eq!(cells[0].mcids, vec![10]);
    }

    /// A clone compares equal to the value it was cloned from.
    #[test]
    fn struct_element_clone() {
        let heading = StructElement {
            alt_text: Some("Title".to_string()),
            lang: Some("en".to_string()),
            bbox: Some(BBox::new(72.0, 72.0, 540.0, 100.0)),
            ..node("H1", vec![0], Some(0))
        };

        assert_eq!(heading, heading.clone());
    }

    /// A node may have no page index at all.
    #[test]
    fn struct_element_no_page_index() {
        let document = node("Document", Vec::new(), None);
        assert_eq!(document.page_index, None);
    }

    /// A node may carry no MCIDs.
    #[test]
    fn struct_element_empty_mcids() {
        let div = node("Div", Vec::new(), None);
        assert_eq!(div.mcids.len(), 0);
    }

    /// Heading levels H1 through H6 are all representable as element types.
    #[test]
    fn struct_element_heading_types() {
        for level in 1u32..=6 {
            let expected_tag = format!("H{level}");
            let heading = node(&expected_tag, vec![level], Some(0));
            assert_eq!(heading.element_type, expected_tag);
        }
    }
}