use lopdf::{Document, Object};
use serde::{Deserialize, Serialize};
/// Kind of a PDF annotation, derived from the `Subtype` name of the
/// annotation dictionary.
///
/// Each unit variant corresponds to the subtype name spelled identically
/// (for example `Highlight` for the name `Highlight`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum AnnotationType {
    /// Subtype name `Text`.
    Text,
    /// Subtype name `Highlight`.
    Highlight,
    /// Subtype name `Underline`.
    Underline,
    /// Subtype name `StrikeOut`.
    StrikeOut,
    /// Subtype name `FreeText`.
    FreeText,
    /// Subtype name `Link`.
    Link,
    /// Subtype name `Stamp`.
    Stamp,
    /// Subtype name `Ink`.
    Ink,
    /// Subtype name `FileAttachment`.
    FileAttachment,
    /// Subtype name `Popup`.
    Popup,
    /// Any subtype name not listed above, kept as lossily decoded UTF-8.
    Other(String),
}
/// A single annotation extracted from a PDF page.
///
/// All string fields hold the raw text found in the annotation dictionary;
/// dates are not parsed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PdfAnnotation {
    /// Annotation kind, taken from the `Subtype` entry.
    pub annotation_type: AnnotationType,
    /// Value of the `Contents` entry, if present and a string.
    pub contents: Option<String>,
    /// Value of the `T` entry, if present and a string.
    pub author: Option<String>,
    /// Page number under which the page was listed by `Document::get_pages`.
    // NOTE(review): presumably 1-based — confirm against lopdf.
    pub page_number: u32,
    /// First four numbers of the `Rect` entry, in the order they are stored.
    pub rect: Option<[f64; 4]>,
    /// Value of the `Subj` entry, if present and a string.
    pub subject: Option<String>,
    /// Value of the `CreationDate` entry, if present and a string.
    pub creation_date: Option<String>,
    /// Value of the `M` entry, if present and a string.
    pub modification_date: Option<String>,
}
/// Collects the annotations of every page in `doc`, in ascending page order.
///
/// This is a best-effort scan and never fails. A page is skipped when its
/// object cannot be loaded as a dictionary, when it has no `Annots` entry, or
/// when that entry does not resolve to an array. Within a page, entries that
/// do not resolve to a dictionary, or that `parse_annotation` rejects, are
/// skipped as well.
///
/// Returns the annotations in page order, and in `Annots` array order within
/// each page.
pub fn extract_annotations(doc: &Document) -> Vec<PdfAnnotation> {
    let mut annotations = Vec::new();

    // `get_pages` returns a map keyed by page number that already iterates in
    // ascending key order, so no intermediate vector or sort is required.
    for (page_num, page_id) in doc.get_pages() {
        let page_dict = match doc.get_object(page_id).and_then(|page| page.as_dict()) {
            Ok(dict) => dict,
            Err(_) => continue,
        };

        // `Annots` may be stored inline or behind an indirect reference.
        let annots = match page_dict
            .get(b"Annots")
            .and_then(|obj| resolve(doc, obj).as_array())
        {
            Ok(array) => array,
            Err(_) => continue,
        };

        annotations.extend(
            annots
                .iter()
                .filter_map(|entry| resolve(doc, entry).as_dict().ok())
                .filter_map(|dict| parse_annotation(dict, page_num)),
        );
    }

    annotations
}
/// Builds a [`PdfAnnotation`] from an annotation dictionary.
///
/// `page_number` is copied into the result unchanged.
///
/// Returns `None` when the dictionary has no `Subtype` entry or that entry is
/// not a name object. Every other field is optional and is left as `None`
/// when its key is missing or has an unexpected type.
fn parse_annotation(dict: &lopdf::Dictionary, page_number: u32) -> Option<PdfAnnotation> {
    let subtype: &[u8] = match dict.get(b"Subtype").ok()? {
        Object::Name(bytes) => bytes.as_slice(),
        _ => return None,
    };

    let annotation_type = match subtype {
        b"Text" => AnnotationType::Text,
        b"Highlight" => AnnotationType::Highlight,
        b"Underline" => AnnotationType::Underline,
        b"StrikeOut" => AnnotationType::StrikeOut,
        b"FreeText" => AnnotationType::FreeText,
        b"Link" => AnnotationType::Link,
        b"Stamp" => AnnotationType::Stamp,
        b"Ink" => AnnotationType::Ink,
        b"FileAttachment" => AnnotationType::FileAttachment,
        b"Popup" => AnnotationType::Popup,
        // Unrecognised subtypes are preserved by name rather than dropped.
        unknown => AnnotationType::Other(String::from_utf8_lossy(unknown).into_owned()),
    };

    Some(PdfAnnotation {
        annotation_type,
        contents: get_string(dict, b"Contents"),
        author: get_string(dict, b"T"),
        page_number,
        rect: get_rect(dict),
        subject: get_string(dict, b"Subj"),
        creation_date: get_string(dict, b"CreationDate"),
        modification_date: get_string(dict, b"M"),
    })
}
/// Reads the string object stored under `key` and decodes it to a `String`.
///
/// Returns `None` when the key is missing or its value is not a string
/// object (indirect references are not followed here).
///
/// Decoding is selected by the leading byte-order mark:
/// * `FE FF` — the remainder is decoded as UTF-16BE; a trailing odd byte is
///   ignored and unpaired surrogates become U+FFFD.
/// * `EF BB BF` — the mark is stripped and the remainder is decoded as lossy
///   UTF-8.
/// * otherwise — the bytes are decoded as lossy UTF-8. Bytes above `0x7F`
///   that are not valid UTF-8 become U+FFFD; no PDFDocEncoding table is
///   applied.
fn get_string(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
    let bytes: &[u8] = match dict.get(key).ok()? {
        Object::String(bytes, _) => bytes.as_slice(),
        _ => return None,
    };

    let text = if bytes.starts_with(&[0xFE, 0xFF]) {
        let units: Vec<u16> = bytes[2..]
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&units)
    } else if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
        String::from_utf8_lossy(&bytes[3..]).into_owned()
    } else {
        String::from_utf8_lossy(bytes).into_owned()
    };

    Some(text)
}
/// Reads the `Rect` entry of `dict` as four floating-point numbers.
///
/// Returns `None` when the entry is missing, is not an array, has fewer than
/// four elements, or when any of the first four elements is neither a real
/// nor an integer. Elements past the fourth are ignored.
fn get_rect(dict: &lopdf::Dictionary) -> Option<[f64; 4]> {
    let values = dict.get(b"Rect").ok()?.as_array().ok()?;
    if values.len() < 4 {
        return None;
    }

    let mut rect = [0.0f64; 4];
    // `zip` stops after the four output slots, so extra elements are not read.
    for (slot, value) in rect.iter_mut().zip(values) {
        *slot = match value {
            Object::Integer(n) => *n as f64,
            Object::Real(r) => *r,
            _ => return None,
        };
    }
    Some(rect)
}
/// Returns the object that `obj` refers to when it is an indirect reference.
///
/// If `obj` is an `Object::Reference` and `doc.get_object` finds the target,
/// the target is returned. In every other case (not a reference, or the
/// lookup failed) `obj` itself is returned unchanged.
fn resolve<'a>(doc: &'a Document, obj: &'a Object) -> &'a Object {
    if let Object::Reference(id) = obj {
        if let Ok(target) = doc.get_object(*id) {
            return target;
        }
    }
    obj
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a dictionary containing only a `Subtype` name entry.
    fn dict_with_subtype(subtype: &[u8]) -> lopdf::Dictionary {
        let mut dict = lopdf::Dictionary::new();
        dict.set("Subtype", Object::Name(subtype.to_vec()));
        dict
    }

    /// Builds a dictionary containing only a `Rect` array entry.
    fn dict_with_rect(values: Vec<Object>) -> lopdf::Dictionary {
        let mut dict = lopdf::Dictionary::new();
        dict.set("Rect", Object::Array(values));
        dict
    }

    #[test]
    fn test_parse_annotation_text() {
        let mut dict = dict_with_subtype(b"Text");
        dict.set(
            "Contents",
            Object::String(b"A comment".to_vec(), lopdf::StringFormat::Literal),
        );
        let annot = parse_annotation(&dict, 1).unwrap();
        assert_eq!(annot.annotation_type, AnnotationType::Text);
        assert_eq!(annot.contents, Some("A comment".to_string()));
        assert_eq!(annot.page_number, 1);
    }

    #[test]
    fn test_parse_annotation_highlight() {
        let dict = dict_with_subtype(b"Highlight");
        let annot = parse_annotation(&dict, 3).unwrap();
        assert_eq!(annot.annotation_type, AnnotationType::Highlight);
        assert_eq!(annot.page_number, 3);
        // Optional fields stay empty when their keys are absent.
        assert_eq!(annot.contents, None);
        assert_eq!(annot.rect, None);
    }

    #[test]
    fn test_parse_annotation_no_subtype() {
        let dict = lopdf::Dictionary::new();
        assert!(parse_annotation(&dict, 1).is_none());
    }

    #[test]
    fn test_parse_annotation_unknown_subtype() {
        let dict = dict_with_subtype(b"Squiggly");
        let annot = parse_annotation(&dict, 2).unwrap();
        assert_eq!(
            annot.annotation_type,
            AnnotationType::Other("Squiggly".to_string())
        );
    }

    #[test]
    fn test_get_rect() {
        let dict = dict_with_rect(vec![
            Object::Real(10.0),
            Object::Real(20.0),
            Object::Real(100.0),
            Object::Real(50.0),
        ]);
        let rect = get_rect(&dict).unwrap();
        assert_eq!(rect, [10.0, 20.0, 100.0, 50.0]);
    }

    #[test]
    fn test_get_rect_accepts_integers() {
        let dict = dict_with_rect(vec![
            Object::Integer(10),
            Object::Real(20.5),
            Object::Integer(100),
            Object::Integer(50),
        ]);
        assert_eq!(get_rect(&dict), Some([10.0, 20.5, 100.0, 50.0]));
    }

    #[test]
    fn test_get_rect_too_short() {
        let dict = dict_with_rect(vec![
            Object::Integer(1),
            Object::Integer(2),
            Object::Integer(3),
        ]);
        assert!(get_rect(&dict).is_none());
    }

    #[test]
    fn test_get_rect_non_numeric_entry() {
        let dict = dict_with_rect(vec![
            Object::Integer(1),
            Object::Null,
            Object::Integer(3),
            Object::Integer(4),
        ]);
        assert!(get_rect(&dict).is_none());
    }

    #[test]
    fn test_get_rect_missing() {
        let dict = lopdf::Dictionary::new();
        assert!(get_rect(&dict).is_none());
    }
}