Skip to main content

oxidize_pdf/pipeline/
graph.rs

1use crate::pipeline::Element;
2use std::collections::HashMap;
3
4/// Index-based graph of relationships between document elements.
5///
6/// Built from a `&[Element]` slice, the graph stores parent/child and next/prev
7/// relationships as indices into the original slice. No ownership is taken and
8/// no lifetimes are introduced — the graph is a standalone value that can be
9/// stored, cloned, or sent across threads independently of the elements.
10///
11/// # Example
12///
13/// ```rust,no_run
14/// use oxidize_pdf::parser::PdfDocument;
15/// use oxidize_pdf::pipeline::ElementGraph;
16///
17/// let doc = PdfDocument::open("document.pdf")?;
18/// let (elements, graph) = doc.partition_graph(Default::default())?;
19///
20/// for title_idx in graph.top_level_sections() {
21///     println!("Section: {}", elements[title_idx].text());
22///     for &child_idx in graph.children_of(title_idx) {
23///         println!("  {}", elements[child_idx].text());
24///     }
25/// }
26/// # Ok::<(), Box<dyn std::error::Error>>(())
27/// ```
28///
29/// # Parent / child semantics
30///
31/// An element at index `i` is a child of the nearest preceding `Title` element
32/// whose text matches `elements[i].metadata().parent_heading`. A `Title` element
33/// whose own `parent_heading` equals its own text is treated as a root (no parent)
34/// unless a *different* preceding title has the same text.
35pub struct ElementGraph {
36    parent: Vec<Option<usize>>,
37    children: Vec<Vec<usize>>,
38    next: Vec<Option<usize>>,
39    prev: Vec<Option<usize>>,
40    /// Which indices are Title elements (needed for `top_level_sections`).
41    is_title: Vec<bool>,
42}
43
44impl ElementGraph {
45    /// Build a graph from a slice of elements.
46    pub fn build(elements: &[Element]) -> Self {
47        let n = elements.len();
48
49        let mut parent: Vec<Option<usize>> = vec![None; n];
50        let mut children: Vec<Vec<usize>> = vec![Vec::new(); n];
51        let mut next: Vec<Option<usize>> = vec![None; n];
52        let mut prev: Vec<Option<usize>> = vec![None; n];
53        let mut is_title: Vec<bool> = vec![false; n];
54
55        // ── next / prev ──────────────────────────────────────────────────────
56        for i in 0..n {
57            if i + 1 < n {
58                next[i] = Some(i + 1);
59            }
60            if i > 0 {
61                prev[i] = Some(i - 1);
62            }
63        }
64
65        // ── parent / child ───────────────────────────────────────────────────
66        // Map heading text → most recent Title index with that text.
67        // We update this map as we scan forward so that "most recent" is always correct.
68        let mut latest_title_for_heading: HashMap<String, usize> = HashMap::new();
69
70        for i in 0..n {
71            if matches!(elements[i], Element::Title(_)) {
72                is_title[i] = true;
73                let title_text = elements[i].text().to_string();
74                latest_title_for_heading.insert(title_text, i);
75            }
76        }
77
78        // Second pass: assign parent / child.
79        // We need to process in order so we can handle the "most recent title"
80        // requirement by rebuilding the map incrementally.
81        let mut active_title_for_heading: HashMap<String, usize> = HashMap::new();
82
83        for i in 0..n {
84            if matches!(elements[i], Element::Title(_)) {
85                let title_text = elements[i].text().to_string();
86                active_title_for_heading.insert(title_text, i);
87                // A title is a root unless a *different* preceding title has an
88                // explicit `parent_heading` pointing to yet another title.
89                // Per the spec: "A Title element's own parent_heading equals its
90                // own text — it should NOT be its own parent."
91                // So titles are always roots in the current model.
92                // (parent stays None for titles)
93            } else {
94                // Non-title element: look up its parent_heading.
95                if let Some(heading_text) = elements[i].metadata().parent_heading.as_deref() {
96                    if let Some(&title_idx) = active_title_for_heading.get(heading_text) {
97                        parent[i] = Some(title_idx);
98                        children[title_idx].push(i);
99                    }
100                }
101            }
102        }
103
104        Self {
105            parent,
106            children,
107            next,
108            prev,
109            is_title,
110        }
111    }
112
113    /// Number of elements in the graph (equals the length of the source slice).
114    pub fn len(&self) -> usize {
115        self.parent.len()
116    }
117
118    /// Returns `true` when the graph was built from an empty slice.
119    pub fn is_empty(&self) -> bool {
120        self.len() == 0
121    }
122
123    /// Returns the index of the parent element of `idx`, or `None` if it is a root.
124    ///
125    /// # Panics
126    ///
127    /// Panics if `idx >= self.len()`.
128    pub fn parent_of(&self, idx: usize) -> Option<usize> {
129        self.parent[idx]
130    }
131
132    /// Returns the indices of the children of element `idx`.
133    ///
134    /// # Panics
135    ///
136    /// Panics if `idx >= self.len()`.
137    pub fn children_of(&self, idx: usize) -> &[usize] {
138        &self.children[idx]
139    }
140
141    /// Returns the index of the element immediately after `idx`, or `None` if `idx`
142    /// is the last element.
143    ///
144    /// # Panics
145    ///
146    /// Panics if `idx >= self.len()`.
147    pub fn next_of(&self, idx: usize) -> Option<usize> {
148        self.next[idx]
149    }
150
151    /// Returns the index of the element immediately before `idx`, or `None` if `idx`
152    /// is the first element.
153    ///
154    /// # Panics
155    ///
156    /// Panics if `idx >= self.len()`.
157    pub fn prev_of(&self, idx: usize) -> Option<usize> {
158        self.prev[idx]
159    }
160
161    /// Returns the child indices of a Title element (i.e., the elements belonging
162    /// to its section).
163    ///
164    /// This is an alias for [`children_of`](Self::children_of) with a semantically
165    /// clearer name for the common case of iterating a document section.
166    pub fn elements_in_section(&self, title_idx: usize) -> Vec<usize> {
167        self.children_of(title_idx).to_vec()
168    }
169
170    /// Returns the indices of all Title elements that have no parent (top-level sections).
171    pub fn top_level_sections(&self) -> Vec<usize> {
172        (0..self.len())
173            .filter(|&i| self.is_title[i] && self.parent[i].is_none())
174            .collect()
175    }
176}