edgeparse_core/pdf/
hyperlink_extractor.rs1use lopdf::{Document, Object};
7use serde::{Deserialize, Serialize};
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct PdfHyperlink {
12 pub uri: String,
14 pub page_number: u32,
16 pub rect: [f64; 4],
18}
19
20pub fn extract_hyperlinks(doc: &Document) -> Vec<PdfHyperlink> {
22 let mut links = Vec::new();
23
24 let pages = doc.get_pages();
25 let mut page_list: Vec<(u32, lopdf::ObjectId)> = pages.into_iter().collect();
26 page_list.sort_by_key(|(num, _)| *num);
27
28 for (page_num, page_id) in page_list {
29 let page_dict = match doc.get_object(page_id).and_then(|o| o.as_dict()) {
30 Ok(d) => d,
31 Err(_) => continue,
32 };
33
34 let annots_obj = match page_dict.get(b"Annots") {
35 Ok(obj) => resolve(doc, obj),
36 Err(_) => continue,
37 };
38
39 let annots_array = match annots_obj.as_array() {
40 Ok(a) => a,
41 Err(_) => continue,
42 };
43
44 for annot_ref in annots_array {
45 let annot_obj = resolve(doc, annot_ref);
46 if let Ok(dict) = annot_obj.as_dict() {
47 if let Some(link) = parse_link_annotation(doc, dict, page_num) {
48 links.push(link);
49 }
50 }
51 }
52 }
53
54 links
55}
56
57fn parse_link_annotation(
59 doc: &Document,
60 dict: &lopdf::Dictionary,
61 page_number: u32,
62) -> Option<PdfHyperlink> {
63 match dict.get(b"Subtype") {
65 Ok(Object::Name(name)) if name == b"Link" => {}
66 _ => return None,
67 }
68
69 let uri = extract_uri(doc, dict)?;
71
72 let rect = extract_rect(dict)?;
74
75 Some(PdfHyperlink {
76 uri,
77 page_number,
78 rect,
79 })
80}
81
82fn extract_uri(doc: &Document, dict: &lopdf::Dictionary) -> Option<String> {
84 let action_obj = dict.get(b"A").ok()?;
85 let action = resolve(doc, action_obj);
86 let action_dict = action.as_dict().ok()?;
87
88 match action_dict.get(b"S") {
90 Ok(Object::Name(name)) if name == b"URI" => {}
91 _ => return None,
92 }
93
94 match action_dict.get(b"URI") {
96 Ok(Object::String(bytes, _)) => Some(String::from_utf8_lossy(bytes).to_string()),
97 _ => None,
98 }
99}
100
101fn extract_rect(dict: &lopdf::Dictionary) -> Option<[f64; 4]> {
103 let rect_obj = dict.get(b"Rect").ok()?;
104 let arr = rect_obj.as_array().ok()?;
105 if arr.len() < 4 {
106 return None;
107 }
108 let mut result = [0.0f64; 4];
109 for (i, obj) in arr.iter().enumerate().take(4) {
110 result[i] = match obj {
111 Object::Real(f) => *f,
112 Object::Integer(n) => *n as f64,
113 _ => return None,
114 };
115 }
116 Some(result)
117}
118
119fn resolve<'a>(doc: &'a Document, obj: &'a Object) -> &'a Object {
121 match obj {
122 Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
123 _ => obj,
124 }
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document has no pages and therefore no links.
    #[test]
    fn test_empty_document_no_links() {
        assert!(extract_hyperlinks(&Document::new()).is_empty());
    }

    /// The struct stores exactly the values it was built with.
    #[test]
    fn test_hyperlink_struct() {
        let PdfHyperlink {
            uri,
            page_number,
            rect,
        } = PdfHyperlink {
            uri: "https://example.com".to_string(),
            page_number: 1,
            rect: [72.0, 700.0, 200.0, 714.0],
        };

        assert_eq!(uri, "https://example.com");
        assert_eq!(page_number, 1);
        assert_eq!(rect[0], 72.0);
    }

    /// Mixed real and integer entries are all converted to `f64`.
    #[test]
    fn test_extract_rect_from_dict() {
        let entries = vec![
            Object::Real(10.0),
            Object::Integer(20),
            Object::Real(100.0),
            Object::Real(50.0),
        ];
        let mut dict = lopdf::Dictionary::new();
        dict.set("Rect", Object::Array(entries));

        let rect = extract_rect(&dict).unwrap();
        assert_eq!(rect, [10.0, 20.0, 100.0, 50.0]);
    }

    /// A dictionary without a `Rect` entry yields no rectangle.
    #[test]
    fn test_extract_rect_missing() {
        let empty = lopdf::Dictionary::new();
        assert!(extract_rect(&empty).is_none());
    }
}