use super::{Document, DocumentLoader, LoaderError};
use async_trait::async_trait;
use std::path::PathBuf;
/// Document loader that extracts the text of a single PDF file.
///
/// The loader only stores the path; the file is read when
/// `DocumentLoader::load` is called.
#[derive(Debug, Clone)]
pub struct PDFLoader {
    /// Location of the PDF file to load.
    pub path: PathBuf,
}
impl PDFLoader {
pub fn new(path: impl Into<PathBuf>) -> Self {
Self { path: path.into() }
}
}
#[async_trait]
impl DocumentLoader for PDFLoader {
    /// Extracts the text of the PDF at `self.path` into a single [`Document`].
    ///
    /// The returned vector always holds exactly one document, tagged with
    /// `source` (the displayed path) and `format` (`"pdf"`) metadata.
    ///
    /// # Errors
    ///
    /// * [`LoaderError::Other`] when the path does not exist.
    /// * [`LoaderError::PdfError`] when `pdf_extract` fails to extract text.
    async fn load(&self) -> Result<Vec<Document>, LoaderError> {
        let pdf_path = &self.path;

        // Guard: report a missing file explicitly before handing the path to the parser.
        // NOTE(review): `exists()` and `extract_text` are synchronous calls made inside
        // an async fn; presumably acceptable for the callers here, confirm for large PDFs.
        if !pdf_path.exists() {
            let message = format!(
                "PDF 文件不存在: {}",
                pdf_path.display()
            );
            return Err(LoaderError::Other(message));
        }

        let content = match pdf_extract::extract_text(pdf_path) {
            Ok(content) => content,
            Err(cause) => {
                return Err(LoaderError::PdfError(format!("PDF 解析失败: {}", cause)));
            }
        };

        let source = pdf_path.display().to_string();
        let document = Document::new(content)
            .with_metadata("source".to_string(), source)
            .with_metadata("format".to_string(), "pdf".to_string());

        Ok(vec![document])
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A path that does not exist must yield `LoaderError::Other` whose
    /// message contains the "does not exist" text emitted by `load`.
    #[tokio::test]
    async fn test_pdf_loader_nonexistent() {
        let loader = PDFLoader::new("./nonexistent.pdf");
        let result = loader.load().await;
        assert!(result.is_err());
        match result.unwrap_err() {
            LoaderError::Other(msg) => assert!(msg.contains("不存在")),
            _ => panic!("Expected Other error"),
        }
    }

    /// Loads `./sample.pdf` and checks that text was extracted.
    ///
    /// Ignored by default because it needs a sample file on disk. When it is
    /// run explicitly, a failed load fails the test instead of being skipped.
    #[tokio::test]
    #[ignore = "requires a sample PDF file"]
    async fn test_pdf_loader() {
        let loader = PDFLoader::new("./sample.pdf");
        let docs = loader
            .load()
            .await
            .expect("failed to load ./sample.pdf");
        assert!(!docs.is_empty());
        assert!(docs[0].content.contains("PDF"));
    }
}