Skip to main content

edgeparse_core/pdf/
bookmark_extractor.rs

1//! PDF bookmark/outline extraction.
2//!
3//! Reads the document outline (bookmarks) from the /Outlines dictionary,
4//! producing a tree of `Bookmark` nodes with titles and nesting levels.
5
6use lopdf::{Document, Object, ObjectId};
7use serde::{Deserialize, Serialize};
8use std::collections::HashSet;
9
10/// A single bookmark entry.
11#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct Bookmark {
13    /// Display title
14    pub title: String,
15    /// Nesting level (0 = top-level)
16    pub level: u32,
17    /// Destination page number (1-based, if resolvable)
18    pub page_number: Option<u32>,
19    /// Child bookmarks
20    pub children: Vec<Bookmark>,
21}
22
23/// Extract the document outline as a flat list of bookmarks with levels.
24pub fn extract_bookmarks(doc: &Document) -> Vec<Bookmark> {
25    let catalog = match doc.catalog() {
26        Ok(c) => c,
27        Err(_) => return Vec::new(),
28    };
29
30    let outlines_obj = match catalog.get(b"Outlines") {
31        Ok(obj) => resolve(doc, obj),
32        Err(_) => return Vec::new(),
33    };
34
35    let outlines_dict = match outlines_obj.as_dict() {
36        Ok(d) => d,
37        Err(_) => return Vec::new(),
38    };
39
40    // Get the first child
41    let first_ref = match outlines_dict.get(b"First") {
42        Ok(obj) => match obj {
43            Object::Reference(id) => *id,
44            _ => return Vec::new(),
45        },
46        Err(_) => return Vec::new(),
47    };
48
49    // Build page number lookup
50    let page_ids = doc.get_pages();
51
52    let mut visited = HashSet::new();
53    read_outline_items(doc, first_ref, 0, &page_ids, &mut visited)
54}
55
56/// Recursively read outline items following /First, /Next, /Last links.
57fn read_outline_items(
58    doc: &Document,
59    first_id: ObjectId,
60    level: u32,
61    page_ids: &std::collections::BTreeMap<u32, ObjectId>,
62    visited: &mut HashSet<ObjectId>,
63) -> Vec<Bookmark> {
64    let mut bookmarks = Vec::new();
65    let mut current_id = Some(first_id);
66
67    while let Some(obj_id) = current_id {
68        // Prevent infinite loops from malformed PDFs
69        if !visited.insert(obj_id) {
70            break;
71        }
72
73        let dict = match doc.get_object(obj_id).and_then(|o| o.as_dict()) {
74            Ok(d) => d,
75            Err(_) => break,
76        };
77
78        // Get title
79        let title = match dict.get(b"Title") {
80            Ok(Object::String(bytes, _)) => String::from_utf8_lossy(bytes).to_string(),
81            _ => String::new(),
82        };
83
84        // Resolve destination page
85        let page_number = resolve_bookmark_page(doc, dict, page_ids);
86
87        // Recurse into children
88        let children = match dict.get(b"First") {
89            Ok(Object::Reference(child_id)) => {
90                read_outline_items(doc, *child_id, level + 1, page_ids, visited)
91            }
92            _ => Vec::new(),
93        };
94
95        bookmarks.push(Bookmark {
96            title,
97            level,
98            page_number,
99            children,
100        });
101
102        // Move to next sibling
103        current_id = match dict.get(b"Next") {
104            Ok(Object::Reference(next_id)) => Some(*next_id),
105            _ => None,
106        };
107    }
108
109    bookmarks
110}
111
112/// Resolve the page number from a bookmark's /Dest or /A entry.
113fn resolve_bookmark_page(
114    doc: &Document,
115    dict: &lopdf::Dictionary,
116    page_ids: &std::collections::BTreeMap<u32, ObjectId>,
117) -> Option<u32> {
118    // Try /Dest first
119    if let Ok(dest) = dict.get(b"Dest") {
120        return page_from_dest(doc, dest, page_ids);
121    }
122
123    // Try /A (action) → /D (destination)
124    if let Ok(action_obj) = dict.get(b"A") {
125        let action = resolve(doc, action_obj);
126        if let Ok(action_dict) = action.as_dict() {
127            if let Ok(dest) = action_dict.get(b"D") {
128                return page_from_dest(doc, dest, page_ids);
129            }
130        }
131    }
132
133    None
134}
135
136/// Extract page number from a destination object.
137fn page_from_dest(
138    doc: &Document,
139    dest: &Object,
140    page_ids: &std::collections::BTreeMap<u32, ObjectId>,
141) -> Option<u32> {
142    let dest = resolve(doc, dest);
143
144    // Dest can be an array [page_ref, /type, ...] or a name ref
145    let arr = match dest.as_array() {
146        Ok(a) if !a.is_empty() => a,
147        _ => return None,
148    };
149
150    // First element should be a page reference
151    let page_ref = match &arr[0] {
152        Object::Reference(id) => *id,
153        _ => return None,
154    };
155
156    // Find matching page number
157    page_ids
158        .iter()
159        .find(|(_, id)| **id == page_ref)
160        .map(|(num, _)| *num)
161}
162
163/// Resolve an indirect reference.
164fn resolve<'a>(doc: &'a Document, obj: &'a Object) -> &'a Object {
165    match obj {
166        Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
167        _ => obj,
168    }
169}
170
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::dictionary;

    /// A document without a catalog yields no bookmarks.
    #[test]
    fn test_empty_document_no_bookmarks() {
        let doc = Document::new();
        let bookmarks = extract_bookmarks(&doc);
        assert!(bookmarks.is_empty());
    }

    /// `Bookmark` values can be nested and their fields read back.
    #[test]
    fn test_bookmark_struct() {
        let bm = Bookmark {
            title: "Chapter 1".to_string(),
            level: 0,
            page_number: Some(1),
            children: vec![Bookmark {
                title: "Section 1.1".to_string(),
                level: 1,
                page_number: Some(3),
                children: vec![],
            }],
        };
        assert_eq!(bm.title, "Chapter 1");
        assert_eq!(bm.children.len(), 1);
        assert_eq!(bm.children[0].level, 1);
    }

    /// An outline item whose `/First` and `/Next` both point back at itself
    /// must be read exactly once instead of looping forever.
    #[test]
    fn test_visited_prevents_infinite_loop() {
        let mut doc = Document::with_version("1.5");

        // Reserve the id first so the item can reference itself.
        let item_id = doc.new_object_id();
        doc.objects.insert(
            item_id,
            Object::Dictionary(dictionary! {
                "Title" => Object::string_literal("Loop"),
                "First" => item_id,
                "Next" => item_id,
            }),
        );
        let outlines_id = doc.add_object(dictionary! {
            "Type" => "Outlines",
            "First" => item_id,
        });
        let catalog_id = doc.add_object(dictionary! {
            "Type" => "Catalog",
            "Outlines" => outlines_id,
        });
        doc.trailer.set("Root", catalog_id);

        let bookmarks = extract_bookmarks(&doc);
        assert_eq!(bookmarks.len(), 1);
        assert_eq!(bookmarks[0].title, "Loop");
        assert_eq!(bookmarks[0].level, 0);
        assert_eq!(bookmarks[0].page_number, None);
        assert!(bookmarks[0].children.is_empty());
    }
}