Skip to main content

synaptic_pdf/
loader.rs

1use std::collections::HashMap;
2use std::path::PathBuf;
3
4use async_trait::async_trait;
5use serde_json::Value;
6use synaptic_core::{Document, Loader, SynapticError};
7
/// Loads documents from a PDF file.
///
/// Text is extracted with `pdf_extract`. The loader works in one of two modes,
/// chosen by the constructor:
///
/// - **Single document** ([`PdfLoader::new`], the default): all extracted text
///   is returned as one `Document` whose id is the file path.
/// - **Split pages** ([`PdfLoader::with_split_pages`]): the extracted text is
///   split on form feed characters (`\x0c`), which are treated as page
///   separators, and each non-blank segment becomes its own `Document` with
///   the id `<path>:page_<n>`. Blank segments are skipped, but page numbers
///   still reflect the segment's original position.
///
/// Every produced `Document` carries `source` (the path) and `total_pages`
/// metadata; split-page documents additionally carry a 1-based `page` number.
///
/// Page detection assumes the extraction library emits a form feed between
/// pages; if it does not, the whole text is seen as a single page.
///
/// # Examples
///
/// ```no_run
/// use synaptic_pdf::{PdfLoader, Loader};
///
/// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
/// // Load entire PDF as one document
/// let loader = PdfLoader::new("document.pdf");
/// let docs = loader.load().await?;
/// assert_eq!(docs.len(), 1);
///
/// // Load with one document per page
/// let loader = PdfLoader::with_split_pages("document.pdf");
/// let docs = loader.load().await?;
/// // docs.len() == number of non-blank pages
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct PdfLoader {
    /// Location of the PDF file to read.
    path: PathBuf,
    /// When `true`, emit one `Document` per page instead of a single one.
    split_pages: bool,
}
39
40impl PdfLoader {
41    /// Create a new `PdfLoader` that extracts all text as a single document.
42    pub fn new(path: impl Into<PathBuf>) -> Self {
43        Self {
44            path: path.into(),
45            split_pages: false,
46        }
47    }
48
49    /// Create a new `PdfLoader` that splits text into one document per page.
50    ///
51    /// Page boundaries are detected by form feed characters (`\x0c`) inserted
52    /// by the PDF extraction library.
53    pub fn with_split_pages(path: impl Into<PathBuf>) -> Self {
54        Self {
55            path: path.into(),
56            split_pages: true,
57        }
58    }
59}
60
61#[async_trait]
62impl Loader for PdfLoader {
63    async fn load(&self) -> Result<Vec<Document>, SynapticError> {
64        let path = self.path.clone();
65        let split_pages = self.split_pages;
66
67        // pdf_extract::extract_text is synchronous, so run it on a blocking thread
68        let text = tokio::task::spawn_blocking(move || pdf_extract::extract_text(&path))
69            .await
70            .map_err(|e| SynapticError::Loader(format!("task join error: {e}")))?
71            .map_err(|e| {
72                SynapticError::Loader(format!(
73                    "failed to extract text from {}: {e}",
74                    self.path.display()
75                ))
76            })?;
77
78        let path_str = self.path.to_string_lossy().to_string();
79
80        if split_pages {
81            // Split on form feed characters that pdf_extract inserts between pages
82            let pages: Vec<&str> = text.split('\x0c').collect();
83            let total_pages = pages.len();
84
85            let docs = pages
86                .into_iter()
87                .enumerate()
88                .filter(|(_, content)| !content.trim().is_empty())
89                .map(|(i, content)| {
90                    let page_num = i + 1;
91                    let id = format!("{path_str}:page_{page_num}");
92
93                    let mut metadata = HashMap::new();
94                    metadata.insert("source".to_string(), Value::String(path_str.clone()));
95                    metadata.insert(
96                        "page".to_string(),
97                        Value::Number(serde_json::Number::from(page_num)),
98                    );
99                    metadata.insert(
100                        "total_pages".to_string(),
101                        Value::Number(serde_json::Number::from(total_pages)),
102                    );
103
104                    Document::with_metadata(id, content.trim(), metadata)
105                })
106                .collect();
107
108            Ok(docs)
109        } else {
110            // Count pages from form feed characters
111            let total_pages = text.split('\x0c').count();
112
113            let mut metadata = HashMap::new();
114            metadata.insert("source".to_string(), Value::String(path_str.clone()));
115            metadata.insert(
116                "total_pages".to_string(),
117                Value::Number(serde_json::Number::from(total_pages)),
118            );
119
120            Ok(vec![Document::with_metadata(
121                path_str,
122                text.trim(),
123                metadata,
124            )])
125        }
126    }
127}