use std::collections::HashMap;
use std::path::PathBuf;
use async_trait::async_trait;
use cognis_core::document_loaders::BaseLoader;
use cognis_core::document_loaders::DocumentStream;
use cognis_core::documents::Document;
use cognis_core::error::{CognisError, Result};
use futures::stream;
use serde_json::Value;
/// Document loader that extracts the text content of a single PDF file.
///
/// The loader only stores the path; the file is not touched until one of the
/// loading methods is called.
#[derive(Debug, Clone)]
pub struct PdfLoader {
    /// Location of the PDF file to read.
    path: PathBuf,
}

impl PdfLoader {
    /// Creates a loader for the PDF at `path`.
    ///
    /// Accepts anything convertible into a [`PathBuf`] (`&str`, `String`,
    /// `&Path`, `PathBuf`, ...). No I/O is performed here, so construction
    /// cannot fail even if the file does not exist.
    pub fn new(path: impl Into<PathBuf>) -> Self {
        Self { path: path.into() }
    }
}
#[async_trait]
impl BaseLoader for PdfLoader {
    /// Reads the PDF, extracts its text and returns a stream that yields a
    /// single [`Document`].
    ///
    /// The file read and the text extraction both run inside
    /// `tokio::task::spawn_blocking`, so the async executor thread is not
    /// blocked while the PDF is processed. Note that all work is done before
    /// the stream is returned; the stream itself only hands out the already
    /// built document.
    ///
    /// The document carries two metadata entries:
    /// * `"source"` – the loader's path rendered with `Path::display`.
    /// * `"content_type"` – always `"application/pdf"`.
    ///
    /// # Errors
    ///
    /// Returns `CognisError::Other` when
    /// * the file cannot be read (the message includes the path),
    /// * `pdf_extract` fails to extract text from the bytes, or
    /// * the blocking task does not complete (join error).
    async fn lazy_load(&self) -> Result<DocumentStream> {
        // The closure must be 'static, so it gets its own copy of the path.
        let path = self.path.clone();
        let content = tokio::task::spawn_blocking(move || -> Result<String> {
            // Include the path in the message: a bare io::Error such as
            // "No such file or directory" does not say which file was meant.
            let bytes = std::fs::read(&path).map_err(|e| {
                CognisError::Other(format!(
                    "Failed to read PDF file '{}': {e}",
                    path.display()
                ))
            })?;
            pdf_extract::extract_text_from_mem(&bytes)
                .map_err(|e| CognisError::Other(format!("PDF extraction failed: {e}")))
        })
        .await
        // First `?` unwraps the join result, second `?` the closure's own Result.
        .map_err(|e| CognisError::Other(format!("Task join error: {e}")))??;

        let metadata = HashMap::from([
            (
                "source".to_string(),
                Value::String(self.path.display().to_string()),
            ),
            (
                "content_type".to_string(),
                Value::String("application/pdf".to_string()),
            ),
        ]);

        let doc = Document::new(content).with_metadata(metadata);
        Ok(Box::pin(stream::iter(vec![Ok(doc)])))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `new` must store the given path unchanged.
    #[test]
    fn test_pdf_loader_creation() {
        let pdf_loader = PdfLoader::new("/tmp/test.pdf");
        let expected_path = PathBuf::from("/tmp/test.pdf");
        assert_eq!(pdf_loader.path, expected_path);
    }

    /// Loading a path that does not exist must surface an error rather than
    /// an empty result.
    #[tokio::test]
    async fn test_pdf_loader_nonexistent_file() {
        let pdf_loader = PdfLoader::new("/tmp/nonexistent_cognis_test_file.pdf");
        let outcome = pdf_loader.load().await;
        assert!(outcome.is_err());
    }
}