use crate::document_loaders::{DocumentLoader, LoadOptions};
use crate::retriever_engine::Document;
use crate::types::Layer3Error;
use crate::types::Layer3Result;
use async_trait::async_trait;
use lopdf::Document as PdfDoc;
use std::collections::HashMap;
use std::path::PathBuf;
use tracing::{debug, warn};
/// Document loader for PDF files.
///
/// Parses a PDF with `lopdf`, collects the document information dictionary
/// into metadata, and extracts text from the page content streams.
pub struct PdfLoader {
/// Options supplied at construction time.
// None of the methods in this file read the field yet, which is why the
// dead-code lint is silenced here.
#[allow(dead_code)]
options: LoadOptions,
}
impl PdfLoader {
/// Creates a loader configured with `LoadOptions::default()`.
pub fn new() -> Self {
    let options = LoadOptions::default();
    Self { options }
}
pub fn with_options(options: LoadOptions) -> Self {
Self { options }
}
fn extract_metadata(&self, pdf: &PdfDoc) -> HashMap<String, serde_json::Value> {
let mut metadata = HashMap::new();
fn get_string_from_dict(dict: &lopdf::Dictionary, key: &[u8]) -> Option<String> {
let obj = dict.get(key).ok()?;
if let lopdf::Object::String(bytes, _) = obj {
PdfLoader::decode_pdf_string(bytes).ok()
} else {
None
}
}
if let Ok(trailer) = pdf.trailer.get(b"Info") {
if let Ok(info_ref) = trailer.as_reference() {
if let Ok(lopdf::Object::Dictionary(dict)) = pdf.get_object(info_ref) {
if let Some(title) = get_string_from_dict(dict, b"Title") {
metadata.insert("title".to_string(), serde_json::json!(title));
}
if let Some(author) = get_string_from_dict(dict, b"Author") {
metadata.insert("author".to_string(), serde_json::json!(author));
}
if let Some(subject) = get_string_from_dict(dict, b"Subject") {
metadata.insert("subject".to_string(), serde_json::json!(subject));
}
if let Some(creator) = get_string_from_dict(dict, b"Creator") {
metadata.insert("creator".to_string(), serde_json::json!(creator));
}
if let Some(producer) = get_string_from_dict(dict, b"Producer") {
metadata.insert("producer".to_string(), serde_json::json!(producer));
}
if let Some(creation_date) = get_string_from_dict(dict, b"CreationDate") {
metadata.insert(
"creation_date".to_string(),
serde_json::json!(creation_date),
);
}
if let Some(mod_date) = get_string_from_dict(dict, b"ModDate") {
metadata
.insert("modification_date".to_string(), serde_json::json!(mod_date));
}
}
}
}
let page_count = pdf.get_pages().len();
metadata.insert("page_count".to_string(), serde_json::json!(page_count));
metadata
}
/// Decodes the raw bytes of a PDF text string into a Rust `String`.
///
/// Encodings are tried in this order:
/// 1. UTF-16BE, when the bytes start with the byte-order mark `FE FF`.
///    Invalid code units become U+FFFD and a dangling trailing byte is
///    ignored.
/// 2. UTF-8, after removing a leading `EF BB BF` byte-order mark if present.
/// 3. One character per byte (Latin-1), used when the bytes are not valid
///    UTF-8. This only approximates PDFDocEncoding, which assigns a few
///    code points differently.
///
/// # Errors
///
/// The current implementation never fails; the `Result` return type is kept
/// for callers that already handle it.
fn decode_pdf_string(bytes: &[u8]) -> Layer3Result<String> {
    // Without this branch a UTF-16BE string fails the UTF-8 check and is
    // turned into one character per byte, i.e. "þÿ" followed by NUL-padded
    // letters.
    if let Some(payload) = bytes.strip_prefix(&[0xFE_u8, 0xFF]) {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return Ok(String::from_utf16_lossy(&units));
    }

    // A UTF-8 byte-order mark is an encoding marker, not part of the text.
    let bytes = bytes.strip_prefix(&[0xEF_u8, 0xBB, 0xBF]).unwrap_or(bytes);

    match std::str::from_utf8(bytes) {
        Ok(text) => Ok(text.to_owned()),
        Err(_) => Ok(bytes.iter().map(|&byte| char::from(byte)).collect()),
    }
}
fn extract_page_text(pdf: &PdfDoc, page_id: (u32, u16)) -> Layer3Result<String> {
let mut text = String::new();
if let Ok(lopdf::Object::Dictionary(dict)) = pdf.get_object(page_id) {
if let Ok(contents) = dict.get(b"Contents") {
match contents {
lopdf::Object::Reference(ref_id) => {
if let Ok(lopdf::Object::Stream(stream_obj)) = pdf.get_object(*ref_id) {
if let Ok(content) = stream_obj.decompressed_content() {
text.push_str(&Self::parse_content_stream(&content));
}
}
}
lopdf::Object::Array(arr) => {
for obj in arr {
if let lopdf::Object::Reference(ref_id) = obj {
if let Ok(lopdf::Object::Stream(stream_obj)) =
pdf.get_object(*ref_id)
{
if let Ok(content) = stream_obj.decompressed_content() {
text.push_str(&Self::parse_content_stream(&content));
}
}
}
}
}
_ => {}
}
}
}
Ok(text)
}
/// Extracts the text of every literal string `( ... )` in a decoded content stream.
///
/// The bytes are read as lossy UTF-8. Everything outside literal strings
/// (operators, operands, names) is ignored. Inside a literal string:
///
/// * balanced, unescaped parentheses nest and do not end the string;
/// * `\n`, `\r`, `\t`, `\b`, `\f` produce the matching control character;
/// * `\ddd` (one to three octal digits) produces the character with that
///   code, keeping only the low eight bits;
/// * a backslash followed by a line break is a line continuation and
///   produces nothing;
/// * a backslash followed by any other character produces that character.
///
/// Each finished string is reduced to letters, digits, whitespace and the
/// punctuation `-`, `.`, `,`; strings are joined with single spaces and all
/// whitespace runs are collapsed. A string still open at the end of the
/// input is discarded.
///
/// Hex strings (`<...>`) are not handled, and every literal string is
/// treated as a separate word, including the pieces of a `TJ` array.
fn parse_content_stream(content: &[u8]) -> String {
    // Characters kept from a literal string; everything else is dropped.
    fn is_text_char(c: &char) -> bool {
        c.is_alphanumeric() || c.is_whitespace() || matches!(*c, '-' | '.' | ',')
    }

    let source = String::from_utf8_lossy(content);
    let mut chars = source.chars().peekable();
    let mut extracted = String::new();
    let mut literal = String::new();
    // Nesting depth of unescaped parentheses; 0 means "outside any string".
    let mut depth = 0usize;

    while let Some(ch) = chars.next() {
        if depth == 0 {
            if ch == '(' {
                depth = 1;
                literal.clear();
            }
            continue;
        }

        match ch {
            '\\' => match chars.next() {
                Some('n') => literal.push('\n'),
                Some('r') => literal.push('\r'),
                Some('t') => literal.push('\t'),
                Some('b') => literal.push('\u{8}'),
                Some('f') => literal.push('\u{c}'),
                Some(first @ '0'..='7') => {
                    let mut code = first.to_digit(8).unwrap_or(0);
                    // Up to two more octal digits belong to the same escape.
                    for _ in 0..2 {
                        match chars.peek().and_then(|next| next.to_digit(8)) {
                            Some(digit) => {
                                code = code * 8 + digit;
                                chars.next();
                            }
                            None => break,
                        }
                    }
                    // Overflow above one byte is ignored, hence the mask
                    // before the deliberate narrowing cast.
                    literal.push(char::from((code & 0xFF) as u8));
                }
                Some('\r') => {
                    // Line continuation; swallow the LF of a CRLF pair too.
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                }
                Some('\n') => {}
                Some(other) => literal.push(other),
                None => {}
            },
            '(' => {
                depth += 1;
                literal.push(ch);
            }
            ')' => {
                depth -= 1;
                if depth > 0 {
                    literal.push(ch);
                } else {
                    let cleaned: String = literal.chars().filter(is_text_char).collect();
                    if !cleaned.trim().is_empty() {
                        extracted.push_str(&cleaned);
                        extracted.push(' ');
                    }
                }
            }
            _ => literal.push(ch),
        }
    }

    extracted.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Extracts the text of every page as `(page number, text)` pairs.
///
/// Page numbers are the keys reported by `get_pages()`. Pages whose text is
/// empty after trimming are left out, and a page whose extraction fails is
/// logged with `warn!` and skipped.
fn extract_all_text(&self, pdf: &PdfDoc) -> Layer3Result<Vec<(usize, String)>> {
    let extracted = pdf
        .get_pages()
        .iter()
        .filter_map(
            |(page_num, object_id)| match Self::extract_page_text(pdf, *object_id) {
                Ok(body) if !body.trim().is_empty() => Some((*page_num as usize, body)),
                Ok(_) => None,
                Err(e) => {
                    warn!("Failed to extract text from page {}: {}", page_num, e);
                    None
                }
            },
        )
        .collect();
    Ok(extracted)
}
}
impl Default for PdfLoader {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl DocumentLoader for PdfLoader {
/// Loads the PDF at `path` as one [`Document`].
///
/// The text of all non-empty pages is joined with blank lines, the path is
/// recorded as the document source, and the metadata map comes from
/// `extract_metadata`.
///
/// # Errors
///
/// Returns `Layer3Error::PersistenceError` when the file cannot be parsed
/// as a PDF.
async fn load(&self, path: PathBuf) -> Layer3Result<Document> {
    debug!("Loading PDF file: {:?}", path);

    // NOTE(review): `PdfDoc::load` is a plain synchronous call inside an
    // async fn, so the file is read on the calling task.
    let pdf = match PdfDoc::load(&path) {
        Ok(parsed) => parsed,
        Err(e) => {
            return Err(Layer3Error::PersistenceError(format!(
                "Failed to load PDF file '{}': {}",
                path.display(),
                e
            )));
        }
    };

    let metadata = self.extract_metadata(&pdf);
    let page_texts: Vec<String> = self
        .extract_all_text(&pdf)?
        .into_iter()
        .map(|(_, text)| text)
        .collect();
    let full_text = page_texts.join("\n\n");

    let mut doc = Document::new(full_text).with_source(path.to_string_lossy().to_string());
    doc.metadata = metadata;
    Ok(doc)
}
/// Loads the PDF at `path` and returns one [`Document`] per non-empty page.
///
/// Every document carries the file-level metadata plus `page` (the page
/// number) and `total_pages`, and records `path` as its source. A PDF
/// without extractable text yields an empty vector.
///
/// # Errors
///
/// Returns `Layer3Error::PersistenceError` when the file cannot be parsed
/// as a PDF.
async fn load_and_split(&self, path: PathBuf) -> Layer3Result<Vec<Document>> {
    debug!("Loading and splitting PDF file: {:?}", path);
    let pdf = PdfDoc::load(&path).map_err(|e| {
        Layer3Error::PersistenceError(format!(
            "Failed to load PDF file '{}': {}",
            path.display(),
            e
        ))
    })?;

    let base_metadata = self.extract_metadata(&pdf);
    let pages = self.extract_all_text(&pdf)?;

    // `get_pages()` builds the page map on every call, so count once here
    // instead of once per page inside the closure below.
    let total_pages = pdf.get_pages().len();
    let source = path.to_string_lossy().into_owned();

    let documents = pages
        .into_iter()
        .map(|(page_num, text)| {
            let mut metadata = base_metadata.clone();
            metadata.insert("page".to_string(), serde_json::json!(page_num));
            metadata.insert("total_pages".to_string(), serde_json::json!(total_pages));
            Document {
                id: None,
                content: text,
                metadata,
                source: Some(source.clone()),
            }
        })
        .collect();
    Ok(documents)
}
/// Returns `true` when `path` has a `pdf` extension, compared without
/// regard to ASCII case. Paths without an extension, or whose extension is
/// not valid UTF-8, are not supported.
fn supports(&self, path: &std::path::Path) -> bool {
    // `eq_ignore_ascii_case` compares in place, so no lowercase copy of the
    // extension is allocated for each call.
    path.extension()
        .and_then(|ext| ext.to_str())
        .map_or(false, |ext| ext.eq_ignore_ascii_case("pdf"))
}
/// Lists the file extensions handled by this loader (lowercase, no dot).
fn extensions(&self) -> &[&str] {
    const PDF_EXTENSIONS: &[&str] = &["pdf"];
    PDF_EXTENSIONS
}
}
#[cfg(test)]
mod tests {
use super::*;
use lopdf::Dictionary;
use lopdf::Object as PdfObject;
use lopdf::Stream;
use tempfile::NamedTempFile;
/// Writes a one-page PDF whose content stream shows "Hello World" to a
/// temporary `.pdf` file and returns the file handle.
///
/// Object ids are taken from the values `lopdf` hands out rather than being
/// hard-coded, and the catalog is registered as the trailer's `/Root` so
/// the page tree is reachable once the file is loaded again.
fn create_minimal_pdf() -> NamedTempFile {
    let mut pdf = lopdf::Document::new();

    // Reserve the id of the page-tree node up front: the page refers to it
    // as its parent before the node itself can be built.
    let pages_id = pdf.new_object_id();

    let content = b"BT /F1 12 Tf 100 700 Td (Hello World) Tj ET";
    let content_id = pdf.add_object(PdfObject::Stream(Stream::new(
        Dictionary::from_iter([("Length", PdfObject::Integer(content.len() as i64))]),
        content.to_vec(),
    )));

    let page_id = pdf.add_object(PdfObject::Dictionary(Dictionary::from_iter([
        ("Type", PdfObject::Name("Page".as_bytes().to_vec())),
        ("Parent", PdfObject::Reference(pages_id)),
        (
            "MediaBox",
            PdfObject::Array(vec![
                PdfObject::Integer(0),
                PdfObject::Integer(0),
                PdfObject::Integer(612),
                PdfObject::Integer(792),
            ]),
        ),
        ("Contents", PdfObject::Reference(content_id)),
    ])));

    pdf.objects.insert(
        pages_id,
        PdfObject::Dictionary(Dictionary::from_iter([
            ("Type", PdfObject::Name("Pages".as_bytes().to_vec())),
            ("Kids", PdfObject::Array(vec![PdfObject::Reference(page_id)])),
            ("Count", PdfObject::Integer(1)),
        ])),
    );

    let catalog_id = pdf.add_object(PdfObject::Dictionary(Dictionary::from_iter([
        ("Type", PdfObject::Name("Catalog".as_bytes().to_vec())),
        ("Pages", PdfObject::Reference(pages_id)),
    ])));
    // Without a /Root entry the catalog, and with it every page, cannot be
    // found from the trailer.
    pdf.trailer.set("Root", PdfObject::Reference(catalog_id));

    let file = NamedTempFile::with_suffix(".pdf").unwrap();
    pdf.save(file.path()).expect("Failed to save PDF");
    file
}
/// The loader advertises the `pdf` extension.
#[test]
fn test_pdf_loader_extensions() {
    let pdf_loader = PdfLoader::new();
    let advertised = pdf_loader.extensions();
    assert!(advertised.contains(&"pdf"));
}
/// `supports` accepts `.pdf` in any letter case and rejects other extensions.
#[test]
fn test_pdf_loader_supports() {
    let pdf_loader = PdfLoader::new();
    let cases = [("test.pdf", true), ("test.PDF", true), ("test.txt", false)];
    for &(file_name, expected) in cases.iter() {
        assert!(pdf_loader.supports(std::path::Path::new(file_name)) == expected);
    }
}
/// Loading the fixture succeeds and yields a source and a page count.
#[tokio::test]
async fn test_pdf_loader_load() {
    let fixture = create_minimal_pdf();
    let pdf_loader = PdfLoader::new();
    let outcome = pdf_loader.load(fixture.path().to_path_buf()).await;

    let doc = match outcome {
        Ok(doc) => doc,
        Err(err) => {
            eprintln!("Error loading PDF: {:?}", err);
            panic!("PDF should load successfully");
        }
    };
    assert!(doc.source.is_some());
    assert!(doc.metadata.contains_key("page_count"));
}
/// Splitting the fixture succeeds and yields at most one page document.
#[tokio::test]
async fn test_pdf_loader_load_and_split() {
    let fixture = create_minimal_pdf();
    let pdf_loader = PdfLoader::new();
    let outcome = pdf_loader
        .load_and_split(fixture.path().to_path_buf())
        .await;

    let docs = match outcome {
        Ok(docs) => docs,
        Err(err) => {
            eprintln!("Error loading PDF: {:?}", err);
            panic!("PDF should load successfully");
        }
    };
    assert!(docs.len() <= 1);
}
/// Plain ASCII (valid UTF-8) bytes are returned unchanged.
#[test]
fn test_decode_pdf_string_utf8() {
    let decoded = PdfLoader::decode_pdf_string(b"Hello World");
    assert!(decoded.is_ok());
    assert_eq!(decoded.unwrap(), "Hello World");
}
/// Bytes that are not valid UTF-8 are decoded one character per byte.
#[test]
fn test_decode_pdf_string_latin1() {
    // 0xE9 on its own is invalid UTF-8; as Latin-1 it is 'é'.
    let bytes = vec![b'C', b'a', b'f', 0xE9];
    let result = PdfLoader::decode_pdf_string(&bytes);
    assert!(result.is_ok());
    // Pin the decoded value: `is_ok()` alone cannot fail, because the
    // decoder returns `Ok` on every path.
    assert_eq!(result.unwrap(), "Caf\u{e9}");
}
}