Skip to main content

oxidize_pdf/pipeline/
graph.rs

1use crate::pipeline::Element;
2use std::collections::HashMap;
3
4/// Index-based graph of relationships between document elements.
5///
6/// Built from a `&[Element]` slice, the graph stores parent/child and next/prev
7/// relationships as indices into the original slice.  No ownership is taken and
8/// no lifetimes are introduced in the type — the graph is a standalone value that
9/// can be stored, cloned, or sent across threads independently of the elements.
10///
11/// # Building
12///
13/// ```rust,ignore
14/// let elements = doc.partition()?;
15/// let graph = ElementGraph::build(&elements);
16/// ```
17///
18/// # Parent / child semantics
19///
20/// An element at index `i` is a child of the nearest preceding `Title` element
21/// whose text matches `elements[i].metadata().parent_heading`.  A `Title` element
22/// whose own `parent_heading` equals its own text is treated as a root (no parent)
23/// unless a *different* preceding title has the same text.
24pub struct ElementGraph {
25    parent: Vec<Option<usize>>,
26    children: Vec<Vec<usize>>,
27    next: Vec<Option<usize>>,
28    prev: Vec<Option<usize>>,
29    /// Which indices are Title elements (needed for `top_level_sections`).
30    is_title: Vec<bool>,
31}
32
33impl ElementGraph {
34    /// Build a graph from a slice of elements.
35    pub fn build(elements: &[Element]) -> Self {
36        let n = elements.len();
37
38        let mut parent: Vec<Option<usize>> = vec![None; n];
39        let mut children: Vec<Vec<usize>> = vec![Vec::new(); n];
40        let mut next: Vec<Option<usize>> = vec![None; n];
41        let mut prev: Vec<Option<usize>> = vec![None; n];
42        let mut is_title: Vec<bool> = vec![false; n];
43
44        // ── next / prev ──────────────────────────────────────────────────────
45        for i in 0..n {
46            if i + 1 < n {
47                next[i] = Some(i + 1);
48            }
49            if i > 0 {
50                prev[i] = Some(i - 1);
51            }
52        }
53
54        // ── parent / child ───────────────────────────────────────────────────
55        // Map heading text → most recent Title index with that text.
56        // We update this map as we scan forward so that "most recent" is always correct.
57        let mut latest_title_for_heading: HashMap<String, usize> = HashMap::new();
58
59        for i in 0..n {
60            if matches!(elements[i], Element::Title(_)) {
61                is_title[i] = true;
62                let title_text = elements[i].text().to_string();
63                latest_title_for_heading.insert(title_text, i);
64            }
65        }
66
67        // Second pass: assign parent / child.
68        // We need to process in order so we can handle the "most recent title"
69        // requirement by rebuilding the map incrementally.
70        let mut active_title_for_heading: HashMap<String, usize> = HashMap::new();
71
72        for i in 0..n {
73            if matches!(elements[i], Element::Title(_)) {
74                let title_text = elements[i].text().to_string();
75                active_title_for_heading.insert(title_text, i);
76                // A title is a root unless a *different* preceding title has an
77                // explicit `parent_heading` pointing to yet another title.
78                // Per the spec: "A Title element's own parent_heading equals its
79                // own text — it should NOT be its own parent."
80                // So titles are always roots in the current model.
81                // (parent stays None for titles)
82            } else {
83                // Non-title element: look up its parent_heading.
84                if let Some(heading_text) = elements[i].metadata().parent_heading.as_deref() {
85                    if let Some(&title_idx) = active_title_for_heading.get(heading_text) {
86                        parent[i] = Some(title_idx);
87                        children[title_idx].push(i);
88                    }
89                }
90            }
91        }
92
93        Self {
94            parent,
95            children,
96            next,
97            prev,
98            is_title,
99        }
100    }
101
102    /// Number of elements in the graph (equals the length of the source slice).
103    pub fn len(&self) -> usize {
104        self.parent.len()
105    }
106
107    /// Returns `true` when the graph was built from an empty slice.
108    pub fn is_empty(&self) -> bool {
109        self.len() == 0
110    }
111
112    /// Returns the index of the parent element of `idx`, or `None` if it is a root.
113    ///
114    /// # Panics
115    ///
116    /// Panics if `idx >= self.len()`.
117    pub fn parent_of(&self, idx: usize) -> Option<usize> {
118        self.parent[idx]
119    }
120
121    /// Returns the indices of the children of element `idx`.
122    ///
123    /// # Panics
124    ///
125    /// Panics if `idx >= self.len()`.
126    pub fn children_of(&self, idx: usize) -> &[usize] {
127        &self.children[idx]
128    }
129
130    /// Returns the index of the element immediately after `idx`, or `None` if `idx`
131    /// is the last element.
132    ///
133    /// # Panics
134    ///
135    /// Panics if `idx >= self.len()`.
136    pub fn next_of(&self, idx: usize) -> Option<usize> {
137        self.next[idx]
138    }
139
140    /// Returns the index of the element immediately before `idx`, or `None` if `idx`
141    /// is the first element.
142    ///
143    /// # Panics
144    ///
145    /// Panics if `idx >= self.len()`.
146    pub fn prev_of(&self, idx: usize) -> Option<usize> {
147        self.prev[idx]
148    }
149
150    /// Returns the child indices of a Title element (i.e., the elements belonging
151    /// to its section).
152    ///
153    /// This is an alias for [`children_of`](Self::children_of) with a semantically
154    /// clearer name for the common case of iterating a document section.
155    pub fn elements_in_section(&self, title_idx: usize) -> Vec<usize> {
156        self.children_of(title_idx).to_vec()
157    }
158
159    /// Returns the indices of all Title elements that have no parent (top-level sections).
160    pub fn top_level_sections(&self) -> Vec<usize> {
161        (0..self.len())
162            .filter(|&i| self.is_title[i] && self.parent[i].is_none())
163            .collect()
164    }
165}