Skip to main content

edgeparse_core/pdf/
hyperlink_extractor.rs

//! Hyperlink extraction from PDF link annotations.
//!
//! Identifies `/Link` annotations with `/URI` actions and maps them
//! to bounding box regions on each page.
5
6use lopdf::{Document, Object};
7use serde::{Deserialize, Serialize};
8
9/// A hyperlink extracted from a PDF page.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct PdfHyperlink {
12    /// The target URI
13    pub uri: String,
14    /// Page number (1-based)
15    pub page_number: u32,
16    /// Bounding rectangle [left, bottom, right, top]
17    pub rect: [f64; 4],
18}
19
20/// Extract all hyperlinks from the PDF document.
21pub fn extract_hyperlinks(doc: &Document) -> Vec<PdfHyperlink> {
22    let mut links = Vec::new();
23
24    let pages = doc.get_pages();
25    let mut page_list: Vec<(u32, lopdf::ObjectId)> = pages.into_iter().collect();
26    page_list.sort_by_key(|(num, _)| *num);
27
28    for (page_num, page_id) in page_list {
29        let page_dict = match doc.get_object(page_id).and_then(|o| o.as_dict()) {
30            Ok(d) => d,
31            Err(_) => continue,
32        };
33
34        let annots_obj = match page_dict.get(b"Annots") {
35            Ok(obj) => resolve(doc, obj),
36            Err(_) => continue,
37        };
38
39        let annots_array = match annots_obj.as_array() {
40            Ok(a) => a,
41            Err(_) => continue,
42        };
43
44        for annot_ref in annots_array {
45            let annot_obj = resolve(doc, annot_ref);
46            if let Ok(dict) = annot_obj.as_dict() {
47                if let Some(link) = parse_link_annotation(doc, dict, page_num) {
48                    links.push(link);
49                }
50            }
51        }
52    }
53
54    links
55}
56
57/// Parse a link annotation to extract URI and rect.
58fn parse_link_annotation(
59    doc: &Document,
60    dict: &lopdf::Dictionary,
61    page_number: u32,
62) -> Option<PdfHyperlink> {
63    // Must be a /Link annotation
64    match dict.get(b"Subtype") {
65        Ok(Object::Name(name)) if name == b"Link" => {}
66        _ => return None,
67    }
68
69    // Get URI from /A (action) dictionary
70    let uri = extract_uri(doc, dict)?;
71
72    // Get bounding rect
73    let rect = extract_rect(dict)?;
74
75    Some(PdfHyperlink {
76        uri,
77        page_number,
78        rect,
79    })
80}
81
82/// Extract URI from the /A action dictionary.
83fn extract_uri(doc: &Document, dict: &lopdf::Dictionary) -> Option<String> {
84    let action_obj = dict.get(b"A").ok()?;
85    let action = resolve(doc, action_obj);
86    let action_dict = action.as_dict().ok()?;
87
88    // Check action type is /URI
89    match action_dict.get(b"S") {
90        Ok(Object::Name(name)) if name == b"URI" => {}
91        _ => return None,
92    }
93
94    // Get the URI string
95    match action_dict.get(b"URI") {
96        Ok(Object::String(bytes, _)) => Some(String::from_utf8_lossy(bytes).to_string()),
97        _ => None,
98    }
99}
100
101/// Extract bounding rectangle from /Rect.
102fn extract_rect(dict: &lopdf::Dictionary) -> Option<[f64; 4]> {
103    let rect_obj = dict.get(b"Rect").ok()?;
104    let arr = rect_obj.as_array().ok()?;
105    if arr.len() < 4 {
106        return None;
107    }
108    let mut result = [0.0f64; 4];
109    for (i, obj) in arr.iter().enumerate().take(4) {
110        result[i] = match obj {
111            Object::Real(f) => *f,
112            Object::Integer(n) => *n as f64,
113            _ => return None,
114        };
115    }
116    Some(result)
117}
118
119/// Resolve an indirect reference.
120fn resolve<'a>(doc: &'a Document, obj: &'a Object) -> &'a Object {
121    match obj {
122        Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
123        _ => obj,
124    }
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document yields no hyperlinks.
    #[test]
    fn test_empty_document_no_links() {
        assert!(extract_hyperlinks(&Document::new()).is_empty());
    }

    /// Fields of a hand-built `PdfHyperlink` read back as written.
    #[test]
    fn test_hyperlink_struct() {
        let PdfHyperlink {
            uri,
            page_number,
            rect,
        } = PdfHyperlink {
            uri: "https://example.com".to_string(),
            page_number: 1,
            rect: [72.0, 700.0, 200.0, 714.0],
        };
        assert_eq!(uri, "https://example.com");
        assert_eq!(page_number, 1);
        assert_eq!(rect[0], 72.0);
    }

    /// Mixed real and integer entries are all converted to `f64`.
    #[test]
    fn test_extract_rect_from_dict() {
        let corners = vec![
            Object::Real(10.0),
            Object::Integer(20),
            Object::Real(100.0),
            Object::Real(50.0),
        ];
        let mut dict = lopdf::Dictionary::new();
        dict.set("Rect", Object::Array(corners));

        let rect = extract_rect(&dict).unwrap();
        assert_eq!(rect, [10.0, 20.0, 100.0, 50.0]);
    }

    /// A dictionary without `/Rect` produces no rectangle.
    #[test]
    fn test_extract_rect_missing() {
        assert!(extract_rect(&lopdf::Dictionary::new()).is_none());
    }
}