use std::sync::LazyLock;

use lopdf::Document;

use super::*;
/// Fully extracted content of a single PDF document: the file it was loaded
/// from, its information-dictionary metadata, and the text of every page.
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct PDFContent {
// Filesystem path the document was loaded from.
pub path: PathBuf,
// Descriptive metadata read from the document's `/Info` dictionary.
pub metadata: PDFMetadata,
// Extracted text for each page, in document order (1-based page numbers).
pub pages: Vec<PageContent>,
}
/// Builder for [`PDFContent`]: configure a path with `path(..)`, then call
/// `analyze()` to load and extract the document.
#[derive(Default)]
pub struct PDFContentBuilder {
// Path to the PDF to analyze; `analyze()` errors if this is still `None`.
path: Option<PathBuf>,
}
/// Standard descriptive fields from a PDF's document information dictionary.
/// Each field is `None` when the corresponding `/Info` entry is absent or not
/// a string.
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct PDFMetadata {
// `/Title` entry, if present.
pub title: Option<String>,
// `/Author` entry, if present.
pub author: Option<String>,
// `/Subject` entry, if present.
pub subject: Option<String>,
// `/Keywords` entry, if present.
pub keywords: Option<String>,
}
/// Text extracted from one page of a PDF.
#[derive(Debug, Serialize, Deserialize, Default)]
pub struct PageContent {
// 1-based page number within the document.
pub page_number: u32,
// Space-joined literal strings recovered from the page's content stream.
pub text: String,
}
impl PDFContentBuilder {
pub fn new() -> Self { Default::default() }
pub fn path<P: Into<PathBuf>>(mut self, path: P) -> Self {
self.path = Some(path.into());
self
}
pub fn analyze(self) -> Result<PDFContent> {
let path = self.path.ok_or_else(|| {
LearnerError::Path(std::io::Error::new(std::io::ErrorKind::NotFound, "No PDF path specified"))
})?;
debug!("Loading document from: {path:?}");
let doc = Document::load(&path)?;
let metadata = extract_metadata(&doc)?;
let pages = extract_pages(&doc)?;
Ok(PDFContent { path, metadata, pages })
}
}
fn extract_metadata(doc: &Document) -> Result<PDFMetadata> {
let trailer = &doc.trailer;
let info_ref = trailer.get(b"Info").ok().and_then(|o| o.as_reference().ok());
let info = match info_ref {
Some(reference) => doc.get_object(reference).and_then(|obj| obj.as_dict())?,
None =>
return Ok(PDFMetadata { title: None, author: None, subject: None, keywords: None }),
};
Ok(PDFMetadata {
title: get_text_from_dict(info, "Title"),
author: get_text_from_dict(info, "Author"),
subject: get_text_from_dict(info, "Subject"),
keywords: get_text_from_dict(info, "Keywords"),
})
}
/// Looks up `key` in a PDF dictionary and decodes its string value.
///
/// Returns `None` when the key is absent or its value is not a string object.
/// Strings beginning with the `FE FF` byte-order mark are decoded as
/// UTF-16BE; everything else is decoded lossily as UTF-8.
fn get_text_from_dict(dict: &lopdf::Dictionary, key: &str) -> Option<String> {
  let bytes = dict.get(key.as_bytes()).ok()?.as_str().ok()?;
  // A leading FE FF BOM marks a UTF-16BE text string; strip it and decode
  // the remainder accordingly.
  let decoded = match bytes.strip_prefix(&[0xFE, 0xFF]) {
    Some(utf16) => String::from_utf16be_lossy(utf16),
    None => String::from_utf8_lossy(bytes).into_owned(),
  };
  Some(decoded)
}
fn extract_pages(doc: &Document) -> Result<Vec<PageContent>> {
let mut pages = Vec::new();
lazy_static! {
static ref PDF_TEXT_REGEX: Regex = Regex::new(r"\(([^)]+)\)").unwrap();
};
for (page_num, page_id) in doc.page_iter().enumerate() {
debug!("Processing page {}, id: {:?}", page_num + 1, page_id);
let page = doc.get_object(page_id)?;
let page_dict = page.as_dict()?;
match page_dict.get(b"Contents") {
Ok(contents) => {
let mut text = String::new();
let text_ref = contents.as_reference()?;
let plain_content = doc.get_object(text_ref)?.as_stream()?.get_plain_content()?;
for cap in PDF_TEXT_REGEX.captures_iter(&String::from_utf8_lossy(&plain_content)) {
text.push_str(&cap[1]);
text.push(' '); }
trace!("text for page {}: {}", page_num, text);
pages.push(PageContent { page_number: page_num as u32 + 1, text });
},
Err(e) => println!("Failed to get Contents: {:?}", e),
}
}
Ok(pages)
}
#[cfg(test)]
mod tests {
  use super::*;

  /// Loads the shared fixture document exercised by both tests.
  fn load_fixture() -> PDFContent {
    PDFContentBuilder::new().path(PathBuf::from("tests/.data/test_paper.pdf")).analyze().unwrap()
  }

  #[test]
  fn test_pdf_metadata_extraction() {
    let metadata = load_fixture().metadata;
    assert_eq!(metadata.title.as_deref(), Some("Analysis of PDF Extraction Methods"));
    assert_eq!(metadata.author.as_deref(), Some("Alice Researcher and Bob Scholar"));
    assert_eq!(metadata.subject.as_deref(), Some("PDF Content Analysis"));
    assert_eq!(
      metadata.keywords.as_deref(),
      Some("PDF analysis, text extraction, metadata, academic papers")
    );
  }

  #[test]
  fn test_pdf_page_extraction() {
    let content = load_fixture();
    assert!(!content.pages.is_empty(), "Should have at least one page");
    let first_page = &content.pages[0];
    assert!(
      first_page.text.contains("Analysis of PDF Extraction Methods"),
      "First page should contain title"
    );
    assert!(
      first_page.text.contains("Abstract \\227This is a sam ple paper"),
      "First page should contain abstract"
    );
  }
}