oxidize_pdf/pipeline/graph.rs
1use crate::pipeline::Element;
2use std::collections::HashMap;
3
4/// Index-based graph of relationships between document elements.
5///
6/// Built from a `&[Element]` slice, the graph stores parent/child and next/prev
7/// relationships as indices into the original slice. No ownership is taken and
8/// no lifetimes are introduced — the graph is a standalone value that can be
9/// stored, cloned, or sent across threads independently of the elements.
10///
11/// # Example
12///
13/// ```rust,no_run
14/// use oxidize_pdf::parser::PdfDocument;
15/// use oxidize_pdf::pipeline::ElementGraph;
16///
17/// let doc = PdfDocument::open("document.pdf")?;
18/// let (elements, graph) = doc.partition_graph(Default::default())?;
19///
20/// for title_idx in graph.top_level_sections() {
21/// println!("Section: {}", elements[title_idx].text());
22/// for &child_idx in graph.children_of(title_idx) {
23/// println!(" {}", elements[child_idx].text());
24/// }
25/// }
26/// # Ok::<(), Box<dyn std::error::Error>>(())
27/// ```
28///
29/// # Parent / child semantics
30///
31/// An element at index `i` is a child of the nearest preceding `Title` element
32/// whose text matches `elements[i].metadata().parent_heading`. A `Title` element
33/// whose own `parent_heading` equals its own text is treated as a root (no parent)
34/// unless a *different* preceding title has the same text.
35pub struct ElementGraph {
36 parent: Vec<Option<usize>>,
37 children: Vec<Vec<usize>>,
38 next: Vec<Option<usize>>,
39 prev: Vec<Option<usize>>,
40 /// Which indices are Title elements (needed for `top_level_sections`).
41 is_title: Vec<bool>,
42}
43
44impl ElementGraph {
45 /// Build a graph from a slice of elements.
46 pub fn build(elements: &[Element]) -> Self {
47 let n = elements.len();
48
49 let mut parent: Vec<Option<usize>> = vec![None; n];
50 let mut children: Vec<Vec<usize>> = vec![Vec::new(); n];
51 let mut next: Vec<Option<usize>> = vec![None; n];
52 let mut prev: Vec<Option<usize>> = vec![None; n];
53 let mut is_title: Vec<bool> = vec![false; n];
54
55 // ── next / prev ──────────────────────────────────────────────────────
56 for i in 0..n {
57 if i + 1 < n {
58 next[i] = Some(i + 1);
59 }
60 if i > 0 {
61 prev[i] = Some(i - 1);
62 }
63 }
64
65 // ── parent / child ───────────────────────────────────────────────────
66 // Map heading text → most recent Title index with that text.
67 // We update this map as we scan forward so that "most recent" is always correct.
68 let mut latest_title_for_heading: HashMap<String, usize> = HashMap::new();
69
70 for i in 0..n {
71 if matches!(elements[i], Element::Title(_)) {
72 is_title[i] = true;
73 let title_text = elements[i].text().to_string();
74 latest_title_for_heading.insert(title_text, i);
75 }
76 }
77
78 // Second pass: assign parent / child.
79 // We need to process in order so we can handle the "most recent title"
80 // requirement by rebuilding the map incrementally.
81 let mut active_title_for_heading: HashMap<String, usize> = HashMap::new();
82
83 for i in 0..n {
84 if matches!(elements[i], Element::Title(_)) {
85 let title_text = elements[i].text().to_string();
86 active_title_for_heading.insert(title_text, i);
87 // A title is a root unless a *different* preceding title has an
88 // explicit `parent_heading` pointing to yet another title.
89 // Per the spec: "A Title element's own parent_heading equals its
90 // own text — it should NOT be its own parent."
91 // So titles are always roots in the current model.
92 // (parent stays None for titles)
93 } else {
94 // Non-title element: look up its parent_heading.
95 if let Some(heading_text) = elements[i].metadata().parent_heading.as_deref() {
96 if let Some(&title_idx) = active_title_for_heading.get(heading_text) {
97 parent[i] = Some(title_idx);
98 children[title_idx].push(i);
99 }
100 }
101 }
102 }
103
104 Self {
105 parent,
106 children,
107 next,
108 prev,
109 is_title,
110 }
111 }
112
113 /// Number of elements in the graph (equals the length of the source slice).
114 pub fn len(&self) -> usize {
115 self.parent.len()
116 }
117
118 /// Returns `true` when the graph was built from an empty slice.
119 pub fn is_empty(&self) -> bool {
120 self.len() == 0
121 }
122
123 /// Returns the index of the parent element of `idx`, or `None` if it is a root.
124 ///
125 /// # Panics
126 ///
127 /// Panics if `idx >= self.len()`.
128 pub fn parent_of(&self, idx: usize) -> Option<usize> {
129 self.parent[idx]
130 }
131
132 /// Returns the indices of the children of element `idx`.
133 ///
134 /// # Panics
135 ///
136 /// Panics if `idx >= self.len()`.
137 pub fn children_of(&self, idx: usize) -> &[usize] {
138 &self.children[idx]
139 }
140
141 /// Returns the index of the element immediately after `idx`, or `None` if `idx`
142 /// is the last element.
143 ///
144 /// # Panics
145 ///
146 /// Panics if `idx >= self.len()`.
147 pub fn next_of(&self, idx: usize) -> Option<usize> {
148 self.next[idx]
149 }
150
151 /// Returns the index of the element immediately before `idx`, or `None` if `idx`
152 /// is the first element.
153 ///
154 /// # Panics
155 ///
156 /// Panics if `idx >= self.len()`.
157 pub fn prev_of(&self, idx: usize) -> Option<usize> {
158 self.prev[idx]
159 }
160
161 /// Returns the child indices of a Title element (i.e., the elements belonging
162 /// to its section).
163 ///
164 /// This is an alias for [`children_of`](Self::children_of) with a semantically
165 /// clearer name for the common case of iterating a document section.
166 pub fn elements_in_section(&self, title_idx: usize) -> Vec<usize> {
167 self.children_of(title_idx).to_vec()
168 }
169
170 /// Returns the indices of all Title elements that have no parent (top-level sections).
171 pub fn top_level_sections(&self) -> Vec<usize> {
172 (0..self.len())
173 .filter(|&i| self.is_title[i] && self.parent[i].is_none())
174 .collect()
175 }
176}