// synaptic_pdf/loader.rs
1use std::collections::HashMap;
2use std::path::PathBuf;
3
4use async_trait::async_trait;
5use serde_json::Value;
6use synaptic_core::{Document, Loader, SynapticError};
7
/// Loads documents from a PDF file.
///
/// Uses `pdf_extract` to extract text content from PDF files. Supports two
/// modes of operation:
///
/// - **Single document** (default): All pages are combined into one `Document`.
/// - **Split pages**: Each page becomes a separate `Document`, split on form
///   feed characters (`\x0c`) that `pdf_extract` inserts between pages.
///
/// # Examples
///
/// ```no_run
/// use synaptic_pdf::{PdfLoader, Loader};
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// // Load entire PDF as one document
/// let loader = PdfLoader::new("document.pdf");
/// let docs = loader.load().await?;
/// assert_eq!(docs.len(), 1);
///
/// // Load with one document per page
/// let loader = PdfLoader::with_split_pages("document.pdf");
/// let docs = loader.load().await?;
/// // docs.len() == number of pages
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct PdfLoader {
    /// Filesystem path of the PDF file to load.
    path: PathBuf,
    /// When `true`, emit one `Document` per page instead of one combined document.
    split_pages: bool,
}
39
40impl PdfLoader {
41 /// Create a new `PdfLoader` that extracts all text as a single document.
42 pub fn new(path: impl Into<PathBuf>) -> Self {
43 Self {
44 path: path.into(),
45 split_pages: false,
46 }
47 }
48
49 /// Create a new `PdfLoader` that splits text into one document per page.
50 ///
51 /// Page boundaries are detected by form feed characters (`\x0c`) inserted
52 /// by the PDF extraction library.
53 pub fn with_split_pages(path: impl Into<PathBuf>) -> Self {
54 Self {
55 path: path.into(),
56 split_pages: true,
57 }
58 }
59}
60
61#[async_trait]
62impl Loader for PdfLoader {
63 async fn load(&self) -> Result<Vec<Document>, SynapticError> {
64 let path = self.path.clone();
65 let split_pages = self.split_pages;
66
67 // pdf_extract::extract_text is synchronous, so run it on a blocking thread
68 let text = tokio::task::spawn_blocking(move || pdf_extract::extract_text(&path))
69 .await
70 .map_err(|e| SynapticError::Loader(format!("task join error: {e}")))?
71 .map_err(|e| {
72 SynapticError::Loader(format!(
73 "failed to extract text from {}: {e}",
74 self.path.display()
75 ))
76 })?;
77
78 let path_str = self.path.to_string_lossy().to_string();
79
80 if split_pages {
81 // Split on form feed characters that pdf_extract inserts between pages
82 let pages: Vec<&str> = text.split('\x0c').collect();
83 let total_pages = pages.len();
84
85 let docs = pages
86 .into_iter()
87 .enumerate()
88 .filter(|(_, content)| !content.trim().is_empty())
89 .map(|(i, content)| {
90 let page_num = i + 1;
91 let id = format!("{path_str}:page_{page_num}");
92
93 let mut metadata = HashMap::new();
94 metadata.insert("source".to_string(), Value::String(path_str.clone()));
95 metadata.insert(
96 "page".to_string(),
97 Value::Number(serde_json::Number::from(page_num)),
98 );
99 metadata.insert(
100 "total_pages".to_string(),
101 Value::Number(serde_json::Number::from(total_pages)),
102 );
103
104 Document::with_metadata(id, content.trim(), metadata)
105 })
106 .collect();
107
108 Ok(docs)
109 } else {
110 // Count pages from form feed characters
111 let total_pages = text.split('\x0c').count();
112
113 let mut metadata = HashMap::new();
114 metadata.insert("source".to_string(), Value::String(path_str.clone()));
115 metadata.insert(
116 "total_pages".to_string(),
117 Value::Number(serde_json::Number::from(total_pages)),
118 );
119
120 Ok(vec![Document::with_metadata(
121 path_str,
122 text.trim(),
123 metadata,
124 )])
125 }
126 }
127}