use crate::types::uri::Uri;
use lopdf::{Document, Object, ObjectId};
/// Decodes the raw bytes of a PDF text string into a Rust `String`.
///
/// * Bytes that start with the UTF-16BE byte-order mark (`FE FF`) are decoded
///   as big-endian UTF-16. Invalid code units (for example an unpaired
///   surrogate) are replaced with U+FFFD rather than abandoning the UTF-16
///   interpretation altogether. A trailing odd byte is ignored.
/// * Bytes that start with the UTF-8 byte-order mark (`EF BB BF`, used by
///   PDF 2.0 text strings) are decoded as UTF-8 with the mark removed.
/// * Everything else is treated as UTF-8; invalid sequences are replaced with
///   U+FFFD.
fn decode_pdf_string(bytes: &[u8]) -> String {
    if let Some(payload) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        let units = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
        // Lossy decoding keeps every valid character; re-reading UTF-16 bytes
        // as UTF-8 would only produce NULs and replacement characters.
        return char::decode_utf16(units)
            .map(|unit| unit.unwrap_or(char::REPLACEMENT_CHARACTER))
            .collect();
    }

    // Drop a leading UTF-8 BOM so it does not leak into the text as U+FEFF.
    let text = bytes.strip_prefix(&[0xEF_u8, 0xBB, 0xBF]).unwrap_or(bytes);
    String::from_utf8_lossy(text).into_owned()
}
pub fn extract_bookmarks(document: &Document) -> Vec<Uri> {
let mut uris = Vec::new();
let catalog_id = match document.catalog() {
Ok(id) => id,
Err(_) => return uris,
};
let outlines_ref = match catalog_id.get(b"Outlines") {
Ok(obj) => obj,
Err(_) => return uris,
};
let outlines_id = match outlines_ref.as_reference() {
Ok(id) => id,
Err(_) => return uris,
};
let outlines_dict = match document.get_object(outlines_id) {
Ok(Object::Dictionary(dict)) => dict,
_ => return uris,
};
let first_ref = match outlines_dict.get(b"First") {
Ok(obj) => match obj.as_reference() {
Ok(id) => id,
Err(_) => return uris,
},
Err(_) => return uris,
};
walk_outline_items(document, first_ref, &mut uris);
uris
}
/// Walks the outline starting at `item_id` and appends the bookmarks found to
/// `uris`.
///
/// Delegates to `walk_outline_items_inner` with an initial depth of 0 and an
/// item limit of 500.
fn walk_outline_items(document: &Document, item_id: ObjectId, uris: &mut Vec<Uri>) {
    const INITIAL_DEPTH: usize = 0;
    const ITEM_LIMIT: usize = 500;

    walk_outline_items_inner(document, item_id, uris, INITIAL_DEPTH, ITEM_LIMIT);
}
/// Walks the outline items reachable from `item_id` and appends every bookmark
/// that resolves to a destination or action to `uris`.
///
/// Items are visited in pre-order: an item, then its children (`First`), then
/// its next sibling (`Next`). The walk stops once `uris` holds `max_items`
/// entries, and levels nested deeper than 50 are skipped.
///
/// Each outline object is visited at most once, so malformed or malicious
/// documents whose `First`/`Next` links form a cycle cannot cause unbounded
/// work.
fn walk_outline_items_inner(
    document: &Document,
    item_id: ObjectId,
    uris: &mut Vec<Uri>,
    depth: usize,
    max_items: usize,
) {
    let mut visited = std::collections::HashSet::new();
    walk_outline_level(document, item_id, uris, depth, max_items, &mut visited);
}

/// Processes one sibling chain of outline items, recursing only into children.
///
/// Siblings are followed iteratively so that the recursion depth is bounded by
/// the nesting depth of the outline, not by the number of items on a level.
/// `visited` records every object id already processed during this walk.
fn walk_outline_level(
    document: &Document,
    first_id: ObjectId,
    uris: &mut Vec<Uri>,
    depth: usize,
    max_items: usize,
    visited: &mut std::collections::HashSet<ObjectId>,
) {
    if depth > 50 {
        return;
    }

    let mut current = Some(first_id);
    while let Some(item_id) = current {
        if uris.len() >= max_items {
            return;
        }
        // `insert` returns false for an id seen before, i.e. a link cycle.
        if !visited.insert(item_id) {
            return;
        }
        let Ok(Object::Dictionary(dict)) = document.get_object(item_id) else {
            return;
        };

        let title = dict.get(b"Title").ok().and_then(|obj| match obj {
            Object::String(bytes, _) => Some(decode_pdf_string(bytes)),
            _ => None,
        });

        // A `Dest` entry takes precedence; the action (`A`) is the fallback.
        let uri = extract_destination(document, dict, title.as_deref())
            .or_else(|| extract_action(document, dict, title.as_deref()));
        uris.extend(uri);

        // Reads `key` from the item and returns it only if it is an indirect reference.
        let linked_item = |key: &[u8]| dict.get(key).ok().and_then(|obj| obj.as_reference().ok());

        if let Some(child_id) = linked_item(b"First") {
            walk_outline_level(document, child_id, uris, depth + 1, max_items, visited);
        }
        current = linked_item(b"Next");
    }
}
fn extract_destination(document: &Document, dict: &lopdf::Dictionary, title: Option<&str>) -> Option<Uri> {
let dest = dict.get(b"Dest").ok()?;
let label = title.map(|s| s.to_string());
match dest {
Object::String(name, _) => {
let name_str = String::from_utf8_lossy(name);
Some(Uri::anchor(format!("#{}", name_str), label))
}
Object::Name(name) => {
let name_str = String::from_utf8_lossy(name);
Some(Uri::anchor(format!("#{}", name_str), label))
}
Object::Array(arr) => {
let page_num = resolve_page_number(document, arr);
let url = format!("#page={}", page_num.unwrap_or(1));
let mut uri = Uri::anchor(url, label);
if let Some(p) = page_num {
uri.page = Some(p);
}
Some(uri)
}
Object::Reference(id) => match document.get_object(*id) {
Ok(Object::Array(arr)) => {
let page_num = resolve_page_number(document, arr);
let url = format!("#page={}", page_num.unwrap_or(1));
let mut uri = Uri::anchor(url, label);
if let Some(p) = page_num {
uri.page = Some(p);
}
Some(uri)
}
_ => None,
},
_ => None,
}
}
fn extract_action(document: &Document, dict: &lopdf::Dictionary, title: Option<&str>) -> Option<Uri> {
let action_obj = dict.get(b"A").ok()?;
let action_dict = match action_obj {
Object::Dictionary(d) => d,
Object::Reference(id) => match document.get_object(*id) {
Ok(Object::Dictionary(d)) => d,
_ => return None,
},
_ => return None,
};
let action_type = action_dict
.get(b"S")
.ok()
.and_then(|o| o.as_name().ok())
.map(|n| String::from_utf8_lossy(n).into_owned());
let label = title.map(|s| s.to_string());
match action_type.as_deref() {
Some("URI") => {
let url = action_dict.get(b"URI").ok().and_then(|o| match o {
Object::String(bytes, _) => Some(String::from_utf8_lossy(bytes).into_owned()),
_ => None,
})?;
Some(Uri::hyperlink(url, label))
}
Some("GoTo") => {
let dest = action_dict.get(b"D").ok()?;
match dest {
Object::Array(arr) => {
let page_num = resolve_page_number(document, arr);
let url = format!("#page={}", page_num.unwrap_or(1));
let mut uri = Uri::anchor(url, label);
if let Some(p) = page_num {
uri.page = Some(p);
}
Some(uri)
}
Object::String(name, _) => {
let name_str = String::from_utf8_lossy(name);
Some(Uri::anchor(format!("#{}", name_str), label))
}
_ => None,
}
}
_ => None,
}
}
/// Resolves the page number targeted by a destination array.
///
/// The first element of `arr` must be an indirect reference; it is compared
/// against the object ids returned by `Document::get_pages`, and the number of
/// the matching page is returned. Yields `None` when `arr` is empty, when its
/// first element is not a reference, or when no page has that object id.
fn resolve_page_number(document: &Document, arr: &[Object]) -> Option<u32> {
    let Object::Reference(target) = arr.first()? else {
        return None;
    };

    document
        .get_pages()
        .into_iter()
        .find_map(|(number, id)| (id == *target).then_some(number))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created document has no catalog, hence no outlines, so no
    /// bookmarks can be extracted from it.
    #[test]
    fn test_extract_bookmarks_no_outlines() {
        let document = Document::with_version("1.5");
        assert!(extract_bookmarks(&document).is_empty());
    }

    /// Input that is not a PDF is rejected by the loader.
    #[test]
    fn test_load_rejects_non_pdf_input() {
        let result = Document::load_mem(b"not a pdf");
        assert!(result.is_err());
    }

    /// An empty destination array has no page reference to resolve.
    #[test]
    fn test_resolve_page_number_empty_array() {
        let arr: Vec<Object> = vec![];
        assert_eq!(resolve_page_number(&Document::with_version("1.5"), &arr), None);
    }

    /// Plain ASCII and BOM-prefixed UTF-16BE titles both decode to text.
    #[test]
    fn test_decode_pdf_string_ascii_and_utf16be() {
        assert_eq!(decode_pdf_string(b"Hello"), "Hello");
        assert_eq!(decode_pdf_string(&[0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69]), "Hi");
    }
}