// edgeparse_core/pdf/bookmark_extractor.rs
use lopdf::{Document, Object, ObjectId};
use serde::{Deserialize, Serialize};
use std::collections::HashSet;

10#[derive(Debug, Clone, Serialize, Deserialize)]
12pub struct Bookmark {
13 pub title: String,
15 pub level: u32,
17 pub page_number: Option<u32>,
19 pub children: Vec<Bookmark>,
21}
22
23pub fn extract_bookmarks(doc: &Document) -> Vec<Bookmark> {
25 let catalog = match doc.catalog() {
26 Ok(c) => c,
27 Err(_) => return Vec::new(),
28 };
29
30 let outlines_obj = match catalog.get(b"Outlines") {
31 Ok(obj) => resolve(doc, obj),
32 Err(_) => return Vec::new(),
33 };
34
35 let outlines_dict = match outlines_obj.as_dict() {
36 Ok(d) => d,
37 Err(_) => return Vec::new(),
38 };
39
40 let first_ref = match outlines_dict.get(b"First") {
42 Ok(obj) => match obj {
43 Object::Reference(id) => *id,
44 _ => return Vec::new(),
45 },
46 Err(_) => return Vec::new(),
47 };
48
49 let page_ids = doc.get_pages();
51
52 let mut visited = HashSet::new();
53 read_outline_items(doc, first_ref, 0, &page_ids, &mut visited)
54}
55
56fn read_outline_items(
58 doc: &Document,
59 first_id: ObjectId,
60 level: u32,
61 page_ids: &std::collections::BTreeMap<u32, ObjectId>,
62 visited: &mut HashSet<ObjectId>,
63) -> Vec<Bookmark> {
64 let mut bookmarks = Vec::new();
65 let mut current_id = Some(first_id);
66
67 while let Some(obj_id) = current_id {
68 if !visited.insert(obj_id) {
70 break;
71 }
72
73 let dict = match doc.get_object(obj_id).and_then(|o| o.as_dict()) {
74 Ok(d) => d,
75 Err(_) => break,
76 };
77
78 let title = match dict.get(b"Title") {
80 Ok(Object::String(bytes, _)) => String::from_utf8_lossy(bytes).to_string(),
81 _ => String::new(),
82 };
83
84 let page_number = resolve_bookmark_page(doc, dict, page_ids);
86
87 let children = match dict.get(b"First") {
89 Ok(Object::Reference(child_id)) => {
90 read_outline_items(doc, *child_id, level + 1, page_ids, visited)
91 }
92 _ => Vec::new(),
93 };
94
95 bookmarks.push(Bookmark {
96 title,
97 level,
98 page_number,
99 children,
100 });
101
102 current_id = match dict.get(b"Next") {
104 Ok(Object::Reference(next_id)) => Some(*next_id),
105 _ => None,
106 };
107 }
108
109 bookmarks
110}
111
112fn resolve_bookmark_page(
114 doc: &Document,
115 dict: &lopdf::Dictionary,
116 page_ids: &std::collections::BTreeMap<u32, ObjectId>,
117) -> Option<u32> {
118 if let Ok(dest) = dict.get(b"Dest") {
120 return page_from_dest(doc, dest, page_ids);
121 }
122
123 if let Ok(action_obj) = dict.get(b"A") {
125 let action = resolve(doc, action_obj);
126 if let Ok(action_dict) = action.as_dict() {
127 if let Ok(dest) = action_dict.get(b"D") {
128 return page_from_dest(doc, dest, page_ids);
129 }
130 }
131 }
132
133 None
134}
135
136fn page_from_dest(
138 doc: &Document,
139 dest: &Object,
140 page_ids: &std::collections::BTreeMap<u32, ObjectId>,
141) -> Option<u32> {
142 let dest = resolve(doc, dest);
143
144 let arr = match dest.as_array() {
146 Ok(a) if !a.is_empty() => a,
147 _ => return None,
148 };
149
150 let page_ref = match &arr[0] {
152 Object::Reference(id) => *id,
153 _ => return None,
154 };
155
156 page_ids
158 .iter()
159 .find(|(_, id)| **id == page_ref)
160 .map(|(num, _)| *num)
161}
162
163fn resolve<'a>(doc: &'a Document, obj: &'a Object) -> &'a Object {
165 match obj {
166 Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
167 _ => obj,
168 }
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document yields no bookmarks.
    #[test]
    fn test_empty_document_no_bookmarks() {
        let empty = Document::new();
        assert!(extract_bookmarks(&empty).is_empty());
    }

    /// A `Bookmark` can be built by hand and its nested fields read back.
    #[test]
    fn test_bookmark_struct() {
        let section = Bookmark {
            title: String::from("Section 1.1"),
            level: 1,
            page_number: Some(3),
            children: Vec::new(),
        };
        let chapter = Bookmark {
            title: String::from("Chapter 1"),
            level: 0,
            page_number: Some(1),
            children: vec![section],
        };

        assert_eq!(chapter.title, "Chapter 1");
        assert_eq!(chapter.children.len(), 1);
        assert_eq!(chapter.children[0].level, 1);
    }

    /// Checks the `HashSet::insert` contract the traversal's cycle guard
    /// relies on: the second insertion of the same id reports `false`.
    #[test]
    fn test_visited_prevents_infinite_loop() {
        let mut seen = HashSet::new();
        let object_id = (1, 0);

        let first_insert = seen.insert(object_id);
        assert!(first_insert);

        let second_insert = seen.insert(object_id);
        assert!(!second_insert);
    }
}