synaptic_pdf/loader.rs
1use std::collections::HashMap;
2use std::path::PathBuf;
3
4use async_trait::async_trait;
5use serde_json::Value;
6use synaptic_core::{Document, Loader, SynapticError};
7
/// Loads documents from a PDF file.
///
/// Uses `pdf_extract` to extract text content from PDF files. Supports two
/// modes of operation:
///
/// - **Single document** (default): All pages are combined into one `Document`.
/// - **Split pages**: Each page becomes a separate `Document`, split on form
///   feed characters (`\x0c`) that `pdf_extract` inserts between pages.
///
/// # Examples
///
/// ```no_run
/// use synaptic_pdf::{PdfLoader, Loader};
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// // Load entire PDF as one document
/// let loader = PdfLoader::new("document.pdf");
/// let docs = loader.load().await?;
/// assert_eq!(docs.len(), 1);
///
/// // Load with one document per page
/// let loader = PdfLoader::with_split_pages("document.pdf");
/// let docs = loader.load().await?;
/// // docs.len() == number of pages
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct PdfLoader {
    /// Filesystem path of the PDF to load.
    path: PathBuf,
    /// When `true`, emit one `Document` per page instead of a single
    /// combined document.
    split_pages: bool,
}
39
40impl PdfLoader {
41 /// Create a new `PdfLoader` that extracts all text as a single document.
42 pub fn new(path: impl Into<PathBuf>) -> Self {
43 Self {
44 path: path.into(),
45 split_pages: false,
46 }
47 }
48
49 /// Create a new `PdfLoader` that splits text into one document per page.
50 ///
51 /// Page boundaries are detected by form feed characters (`\x0c`) inserted
52 /// by the PDF extraction library.
53 pub fn with_split_pages(path: impl Into<PathBuf>) -> Self {
54 Self {
55 path: path.into(),
56 split_pages: true,
57 }
58 }
59}
60
61#[async_trait]
62impl Loader for PdfLoader {
63 async fn load(&self) -> Result<Vec<Document>, SynapticError> {
64 let path = self.path.clone();
65 let split_pages = self.split_pages;
66
67 // pdf_extract::extract_text is synchronous, so run it on a blocking thread
68 let text = tokio::task::spawn_blocking(move || {
69 pdf_extract::extract_text(&path)
70 })
71 .await
72 .map_err(|e| SynapticError::Loader(format!("task join error: {e}")))?
73 .map_err(|e| {
74 SynapticError::Loader(format!(
75 "failed to extract text from {}: {e}",
76 self.path.display()
77 ))
78 })?;
79
80 let path_str = self.path.to_string_lossy().to_string();
81
82 if split_pages {
83 // Split on form feed characters that pdf_extract inserts between pages
84 let pages: Vec<&str> = text.split('\x0c').collect();
85 let total_pages = pages.len();
86
87 let docs = pages
88 .into_iter()
89 .enumerate()
90 .filter(|(_, content)| !content.trim().is_empty())
91 .map(|(i, content)| {
92 let page_num = i + 1;
93 let id = format!("{path_str}:page_{page_num}");
94
95 let mut metadata = HashMap::new();
96 metadata.insert("source".to_string(), Value::String(path_str.clone()));
97 metadata.insert(
98 "page".to_string(),
99 Value::Number(serde_json::Number::from(page_num)),
100 );
101 metadata.insert(
102 "total_pages".to_string(),
103 Value::Number(serde_json::Number::from(total_pages)),
104 );
105
106 Document::with_metadata(id, content.trim(), metadata)
107 })
108 .collect();
109
110 Ok(docs)
111 } else {
112 // Count pages from form feed characters
113 let total_pages = text.split('\x0c').count();
114
115 let mut metadata = HashMap::new();
116 metadata.insert("source".to_string(), Value::String(path_str.clone()));
117 metadata.insert(
118 "total_pages".to_string(),
119 Value::Number(serde_json::Number::from(total_pages)),
120 );
121
122 Ok(vec![Document::with_metadata(path_str, text.trim(), metadata)])
123 }
124 }
125}