oxidize_pdf/pipeline/graph.rs
1use crate::pipeline::Element;
2use std::collections::HashMap;
3
4/// Index-based graph of relationships between document elements.
5///
6/// Built from a `&[Element]` slice, the graph stores parent/child and next/prev
7/// relationships as indices into the original slice. No ownership is taken and
8/// no lifetimes are introduced in the type — the graph is a standalone value that
9/// can be stored, cloned, or sent across threads independently of the elements.
10///
11/// # Building
12///
13/// ```rust,ignore
14/// let elements = doc.partition()?;
15/// let graph = ElementGraph::build(&elements);
16/// ```
17///
18/// # Parent / child semantics
19///
20/// An element at index `i` is a child of the nearest preceding `Title` element
21/// whose text matches `elements[i].metadata().parent_heading`. A `Title` element
22/// whose own `parent_heading` equals its own text is treated as a root (no parent)
23/// unless a *different* preceding title has the same text.
24pub struct ElementGraph {
25 parent: Vec<Option<usize>>,
26 children: Vec<Vec<usize>>,
27 next: Vec<Option<usize>>,
28 prev: Vec<Option<usize>>,
29 /// Which indices are Title elements (needed for `top_level_sections`).
30 is_title: Vec<bool>,
31}
32
33impl ElementGraph {
34 /// Build a graph from a slice of elements.
35 pub fn build(elements: &[Element]) -> Self {
36 let n = elements.len();
37
38 let mut parent: Vec<Option<usize>> = vec![None; n];
39 let mut children: Vec<Vec<usize>> = vec![Vec::new(); n];
40 let mut next: Vec<Option<usize>> = vec![None; n];
41 let mut prev: Vec<Option<usize>> = vec![None; n];
42 let mut is_title: Vec<bool> = vec![false; n];
43
44 // ── next / prev ──────────────────────────────────────────────────────
45 for i in 0..n {
46 if i + 1 < n {
47 next[i] = Some(i + 1);
48 }
49 if i > 0 {
50 prev[i] = Some(i - 1);
51 }
52 }
53
54 // ── parent / child ───────────────────────────────────────────────────
55 // Map heading text → most recent Title index with that text.
56 // We update this map as we scan forward so that "most recent" is always correct.
57 let mut latest_title_for_heading: HashMap<String, usize> = HashMap::new();
58
59 for i in 0..n {
60 if matches!(elements[i], Element::Title(_)) {
61 is_title[i] = true;
62 let title_text = elements[i].text().to_string();
63 latest_title_for_heading.insert(title_text, i);
64 }
65 }
66
67 // Second pass: assign parent / child.
68 // We need to process in order so we can handle the "most recent title"
69 // requirement by rebuilding the map incrementally.
70 let mut active_title_for_heading: HashMap<String, usize> = HashMap::new();
71
72 for i in 0..n {
73 if matches!(elements[i], Element::Title(_)) {
74 let title_text = elements[i].text().to_string();
75 active_title_for_heading.insert(title_text, i);
76 // A title is a root unless a *different* preceding title has an
77 // explicit `parent_heading` pointing to yet another title.
78 // Per the spec: "A Title element's own parent_heading equals its
79 // own text — it should NOT be its own parent."
80 // So titles are always roots in the current model.
81 // (parent stays None for titles)
82 } else {
83 // Non-title element: look up its parent_heading.
84 if let Some(heading_text) = elements[i].metadata().parent_heading.as_deref() {
85 if let Some(&title_idx) = active_title_for_heading.get(heading_text) {
86 parent[i] = Some(title_idx);
87 children[title_idx].push(i);
88 }
89 }
90 }
91 }
92
93 Self {
94 parent,
95 children,
96 next,
97 prev,
98 is_title,
99 }
100 }
101
102 /// Number of elements in the graph (equals the length of the source slice).
103 pub fn len(&self) -> usize {
104 self.parent.len()
105 }
106
107 /// Returns `true` when the graph was built from an empty slice.
108 pub fn is_empty(&self) -> bool {
109 self.len() == 0
110 }
111
112 /// Returns the index of the parent element of `idx`, or `None` if it is a root.
113 ///
114 /// # Panics
115 ///
116 /// Panics if `idx >= self.len()`.
117 pub fn parent_of(&self, idx: usize) -> Option<usize> {
118 self.parent[idx]
119 }
120
121 /// Returns the indices of the children of element `idx`.
122 ///
123 /// # Panics
124 ///
125 /// Panics if `idx >= self.len()`.
126 pub fn children_of(&self, idx: usize) -> &[usize] {
127 &self.children[idx]
128 }
129
130 /// Returns the index of the element immediately after `idx`, or `None` if `idx`
131 /// is the last element.
132 ///
133 /// # Panics
134 ///
135 /// Panics if `idx >= self.len()`.
136 pub fn next_of(&self, idx: usize) -> Option<usize> {
137 self.next[idx]
138 }
139
140 /// Returns the index of the element immediately before `idx`, or `None` if `idx`
141 /// is the first element.
142 ///
143 /// # Panics
144 ///
145 /// Panics if `idx >= self.len()`.
146 pub fn prev_of(&self, idx: usize) -> Option<usize> {
147 self.prev[idx]
148 }
149
150 /// Returns the child indices of a Title element (i.e., the elements belonging
151 /// to its section).
152 ///
153 /// This is an alias for [`children_of`](Self::children_of) with a semantically
154 /// clearer name for the common case of iterating a document section.
155 pub fn elements_in_section(&self, title_idx: usize) -> Vec<usize> {
156 self.children_of(title_idx).to_vec()
157 }
158
159 /// Returns the indices of all Title elements that have no parent (top-level sections).
160 pub fn top_level_sections(&self) -> Vec<usize> {
161 (0..self.len())
162 .filter(|&i| self.is_title[i] && self.parent[i].is_none())
163 .collect()
164 }
165}