Skip to main content

synaptic_pdf/
loader.rs

1use std::collections::HashMap;
2use std::path::PathBuf;
3
4use async_trait::async_trait;
5use serde_json::Value;
6use synaptic_core::{Document, Loader, SynapticError};
7
/// Loads documents from a PDF file.
///
/// Text is extracted with `pdf_extract`. The loader operates in one of two
/// modes, chosen by the constructor:
///
/// - **Single document** (default, [`PdfLoader::new`]): the whole extracted
///   text becomes one `Document`.
/// - **Split pages** ([`PdfLoader::with_split_pages`]): the extracted text is
///   split on form feed characters (`\x0c`), which are treated as page
///   separators, and every non-blank page becomes its own `Document`.
///
/// # Examples
///
/// ```no_run
/// use synaptic_pdf::{PdfLoader, Loader};
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// // Load entire PDF as one document
/// let loader = PdfLoader::new("document.pdf");
/// let docs = loader.load().await?;
/// assert_eq!(docs.len(), 1);
///
/// // Load with one document per page
/// let loader = PdfLoader::with_split_pages("document.pdf");
/// let docs = loader.load().await?;
/// // docs.len() == number of non-blank pages
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct PdfLoader {
    /// Location of the PDF file to read.
    path: PathBuf,
    /// When `true`, one `Document` is produced per page instead of one for
    /// the whole file.
    split_pages: bool,
}
39
40impl PdfLoader {
41    /// Create a new `PdfLoader` that extracts all text as a single document.
42    pub fn new(path: impl Into<PathBuf>) -> Self {
43        Self {
44            path: path.into(),
45            split_pages: false,
46        }
47    }
48
49    /// Create a new `PdfLoader` that splits text into one document per page.
50    ///
51    /// Page boundaries are detected by form feed characters (`\x0c`) inserted
52    /// by the PDF extraction library.
53    pub fn with_split_pages(path: impl Into<PathBuf>) -> Self {
54        Self {
55            path: path.into(),
56            split_pages: true,
57        }
58    }
59}
60
61#[async_trait]
62impl Loader for PdfLoader {
63    async fn load(&self) -> Result<Vec<Document>, SynapticError> {
64        let path = self.path.clone();
65        let split_pages = self.split_pages;
66
67        // pdf_extract::extract_text is synchronous, so run it on a blocking thread
68        let text = tokio::task::spawn_blocking(move || {
69            pdf_extract::extract_text(&path)
70        })
71        .await
72        .map_err(|e| SynapticError::Loader(format!("task join error: {e}")))?
73        .map_err(|e| {
74            SynapticError::Loader(format!(
75                "failed to extract text from {}: {e}",
76                self.path.display()
77            ))
78        })?;
79
80        let path_str = self.path.to_string_lossy().to_string();
81
82        if split_pages {
83            // Split on form feed characters that pdf_extract inserts between pages
84            let pages: Vec<&str> = text.split('\x0c').collect();
85            let total_pages = pages.len();
86
87            let docs = pages
88                .into_iter()
89                .enumerate()
90                .filter(|(_, content)| !content.trim().is_empty())
91                .map(|(i, content)| {
92                    let page_num = i + 1;
93                    let id = format!("{path_str}:page_{page_num}");
94
95                    let mut metadata = HashMap::new();
96                    metadata.insert("source".to_string(), Value::String(path_str.clone()));
97                    metadata.insert(
98                        "page".to_string(),
99                        Value::Number(serde_json::Number::from(page_num)),
100                    );
101                    metadata.insert(
102                        "total_pages".to_string(),
103                        Value::Number(serde_json::Number::from(total_pages)),
104                    );
105
106                    Document::with_metadata(id, content.trim(), metadata)
107                })
108                .collect();
109
110            Ok(docs)
111        } else {
112            // Count pages from form feed characters
113            let total_pages = text.split('\x0c').count();
114
115            let mut metadata = HashMap::new();
116            metadata.insert("source".to_string(), Value::String(path_str.clone()));
117            metadata.insert(
118                "total_pages".to_string(),
119                Value::Number(serde_json::Number::from(total_pages)),
120            );
121
122            Ok(vec![Document::with_metadata(path_str, text.trim(), metadata)])
123        }
124    }
125}