Skip to main content

edgeparse_core/pdf/
annotation_extractor.rs

//! PDF annotation extraction.
//!
//! Reads annotations (highlights, comments, stamps, etc.)
//! from each page's `/Annots` array.
5
6use lopdf::{Document, Object};
7use serde::{Deserialize, Serialize};
8
9/// Annotation type.
10#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
11pub enum AnnotationType {
12    /// Text (sticky note)
13    Text,
14    /// Highlight
15    Highlight,
16    /// Underline
17    Underline,
18    /// Strikeout
19    StrikeOut,
20    /// Free text annotation
21    FreeText,
22    /// Link (see hyperlink extractor)
23    Link,
24    /// Stamp
25    Stamp,
26    /// Ink (freehand drawing)
27    Ink,
28    /// File attachment
29    FileAttachment,
30    /// Pop-up (associated with another annotation)
31    Popup,
32    /// Other/unknown
33    Other(String),
34}
35
36/// A single annotation extracted from the PDF.
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct PdfAnnotation {
39    /// Annotation type
40    pub annotation_type: AnnotationType,
41    /// Content/text of the annotation
42    pub contents: Option<String>,
43    /// Author/title
44    pub author: Option<String>,
45    /// Page number (1-based)
46    pub page_number: u32,
47    /// Bounding rectangle [x1, y1, x2, y2]
48    pub rect: Option<[f64; 4]>,
49    /// Subject line
50    pub subject: Option<String>,
51    /// Creation date string
52    pub creation_date: Option<String>,
53    /// Modification date string
54    pub modification_date: Option<String>,
55}
56
57/// Extract annotations from all pages of a PDF document.
58pub fn extract_annotations(doc: &Document) -> Vec<PdfAnnotation> {
59    let mut annotations = Vec::new();
60
61    let pages = doc.get_pages();
62    let mut page_ids: Vec<(u32, lopdf::ObjectId)> = pages.into_iter().collect();
63    page_ids.sort_by_key(|(num, _)| *num);
64
65    for (page_num, page_id) in page_ids {
66        let page_dict = match doc.get_object(page_id).and_then(|o| o.as_dict()) {
67            Ok(d) => d,
68            Err(_) => continue,
69        };
70
71        let annots_obj = match page_dict.get(b"Annots") {
72            Ok(obj) => resolve(doc, obj),
73            Err(_) => continue,
74        };
75
76        let annots_array = match annots_obj.as_array() {
77            Ok(a) => a,
78            Err(_) => continue,
79        };
80
81        for annot_ref in annots_array {
82            let annot_obj = resolve(doc, annot_ref);
83            if let Ok(dict) = annot_obj.as_dict() {
84                if let Some(annot) = parse_annotation(dict, page_num) {
85                    annotations.push(annot);
86                }
87            }
88        }
89    }
90
91    annotations
92}
93
94/// Parse a single annotation dictionary.
95fn parse_annotation(dict: &lopdf::Dictionary, page_number: u32) -> Option<PdfAnnotation> {
96    let annotation_type = match dict.get(b"Subtype") {
97        Ok(Object::Name(name)) => match name.as_slice() {
98            b"Text" => AnnotationType::Text,
99            b"Highlight" => AnnotationType::Highlight,
100            b"Underline" => AnnotationType::Underline,
101            b"StrikeOut" => AnnotationType::StrikeOut,
102            b"FreeText" => AnnotationType::FreeText,
103            b"Link" => AnnotationType::Link,
104            b"Stamp" => AnnotationType::Stamp,
105            b"Ink" => AnnotationType::Ink,
106            b"FileAttachment" => AnnotationType::FileAttachment,
107            b"Popup" => AnnotationType::Popup,
108            other => AnnotationType::Other(String::from_utf8_lossy(other).to_string()),
109        },
110        _ => return None,
111    };
112
113    let contents = get_string(dict, b"Contents");
114    let author = get_string(dict, b"T");
115    let subject = get_string(dict, b"Subj");
116    let creation_date = get_string(dict, b"CreationDate");
117    let modification_date = get_string(dict, b"M");
118    let rect = get_rect(dict);
119
120    Some(PdfAnnotation {
121        annotation_type,
122        contents,
123        author,
124        page_number,
125        rect,
126        subject,
127        creation_date,
128        modification_date,
129    })
130}
131
132/// Get a string value from a dictionary.
133fn get_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
134    dict.get(key).ok().and_then(|obj| match obj {
135        Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).to_string()),
136        _ => None,
137    })
138}
139
140/// Get a rectangle [x1, y1, x2, y2] from the /Rect entry.
141fn get_rect(dict: &lopdf::Dictionary) -> Option<[f64; 4]> {
142    let rect_obj = dict.get(b"Rect").ok()?;
143    let arr = rect_obj.as_array().ok()?;
144    if arr.len() < 4 {
145        return None;
146    }
147    let mut result = [0.0f64; 4];
148    for (i, obj) in arr.iter().enumerate().take(4) {
149        result[i] = match obj {
150            Object::Real(f) => *f,
151            Object::Integer(i) => *i as f64,
152            _ => return None,
153        };
154    }
155    Some(result)
156}
157
158/// Resolve an indirect object reference.
159fn resolve<'a>(doc: &'a Document, obj: &'a Object) -> &'a Object {
160    match obj {
161        Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
162        _ => obj,
163    }
164}
165
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an annotation dictionary that carries only a `/Subtype` name.
    fn dict_with_subtype(subtype: &[u8]) -> lopdf::Dictionary {
        let mut dict = lopdf::Dictionary::new();
        dict.set("Subtype", Object::Name(subtype.to_vec()));
        dict
    }

    /// A `/Text` annotation exposes its type, contents and page number.
    #[test]
    fn test_parse_annotation_text() {
        let mut dict = dict_with_subtype(b"Text");
        let comment = Object::String(b"A comment".to_vec(), lopdf::StringFormat::Literal);
        dict.set("Contents", comment);

        let parsed = parse_annotation(&dict, 1).expect("Text annotation should parse");

        assert_eq!(parsed.annotation_type, AnnotationType::Text);
        assert_eq!(parsed.contents.as_deref(), Some("A comment"));
        assert_eq!(parsed.page_number, 1);
    }

    /// A `/Highlight` annotation keeps the page number it was parsed with.
    #[test]
    fn test_parse_annotation_highlight() {
        let dict = dict_with_subtype(b"Highlight");

        let parsed = parse_annotation(&dict, 3).expect("Highlight annotation should parse");

        assert_eq!(parsed.annotation_type, AnnotationType::Highlight);
        assert_eq!(parsed.page_number, 3);
    }

    /// A dictionary without `/Subtype` is rejected.
    #[test]
    fn test_parse_annotation_no_subtype() {
        let empty = lopdf::Dictionary::new();
        assert!(parse_annotation(&empty, 1).is_none());
    }

    /// Four real numbers in `/Rect` are returned in order.
    #[test]
    fn test_get_rect() {
        let corners = vec![
            Object::Real(10.0),
            Object::Real(20.0),
            Object::Real(100.0),
            Object::Real(50.0),
        ];
        let mut dict = lopdf::Dictionary::new();
        dict.set("Rect", Object::Array(corners));

        assert_eq!(get_rect(&dict), Some([10.0, 20.0, 100.0, 50.0]));
    }
}