rig/loaders/pdf.rs
1use std::{fs, path::PathBuf};
2
3use glob::glob;
4use lopdf::{Document, Error as LopdfError};
5use thiserror::Error;
6
7use super::file::FileLoaderError;
8
9#[derive(Error, Debug)]
10pub enum PdfLoaderError {
11 #[error("{0}")]
12 FileLoaderError(#[from] FileLoaderError),
13
14 #[error("UTF-8 conversion error: {0}")]
15 FromUtf8Error(#[from] std::string::FromUtf8Error),
16
17 #[error("IO error: {0}")]
18 PdfError(#[from] LopdfError),
19}
20
21// ================================================================
22// Implementing Loadable trait for loading pdfs
23// ================================================================
24
25pub(crate) trait Loadable {
26 fn load(self) -> Result<Document, PdfLoaderError>;
27 fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError>;
28}
29
30impl Loadable for PathBuf {
31 fn load(self) -> Result<Document, PdfLoaderError> {
32 Document::load(self).map_err(PdfLoaderError::PdfError)
33 }
34 fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
35 let contents = Document::load(&self);
36 Ok((self, contents?))
37 }
38}
39
40impl<T> Loadable for Result<T, PdfLoaderError>
41where
42 T: Loadable,
43{
44 fn load(self) -> Result<Document, PdfLoaderError> {
45 self.map(|t| t.load())?
46 }
47 fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
48 self.map(|t| t.load_with_path())?
49 }
50}
51
52impl Loadable for Vec<u8> {
53 fn load(self) -> Result<Document, PdfLoaderError> {
54 Document::load_mem(&self).map_err(PdfLoaderError::PdfError)
55 }
56
57 fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
58 let doc = Document::load_mem(&self).map_err(PdfLoaderError::PdfError)?;
59 Ok((PathBuf::from("<memory>"), doc))
60 }
61}
62
63// ================================================================
64// PdfFileLoader definitions and implementations
65// ================================================================
66
/// [PdfFileLoader] is a utility for loading pdf files from the filesystem using glob patterns or
/// directory paths. It provides methods to read file contents and handle errors gracefully.
///
/// # Errors
///
/// This module defines a custom error type [PdfLoaderError] which can represent various errors
/// that might occur during file loading operations, such as any [FileLoaderError] alongside
/// specific PDF-related errors.
///
/// # Example Usage
///
/// ```rust
/// use rig::loaders::PdfFileLoader;
///
/// fn main() -> Result<(), Box<dyn std::error::Error>> {
///     // Create a PdfFileLoader using a glob pattern
///     let loader = PdfFileLoader::with_glob("tests/data/*.pdf")?;
///
///     // Load pdf file contents by page, ignoring any errors
///     let documents = loader
///         .load_with_path()
///         .ignore_errors()
///         .by_page()
///         .ignore_errors();
///
///     for (path, pages) in documents {
///         for (page_no, content) in pages {
///             println!("{:?} page {}: {}", path, page_no, content);
///         }
///     }
///
///     Ok(())
/// }
/// ```
///
/// [PdfFileLoader] uses strict typing between the iterator methods to ensure that transitions
/// between different implementations of the loaders and its methods are handled properly by
/// the compiler.
pub struct PdfFileLoader<'a, T> {
    /// The current stage of the loading pipeline; each adaptor method wraps it in a new stage.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
105
106impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
107 /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
108 /// or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances that can be
109 /// further processed (by page, etc).
110 ///
111 /// # Example
112 /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
113 ///
114 /// ```rust
115 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load().into_iter();
116 /// for result in content {
117 /// match result {
118 /// Ok((path, doc)) => println!("{:?} {}", path, doc),
119 /// Err(e) => eprintln!("Error reading pdf: {}", e),
120 /// }
121 /// }
122 /// ```
123 pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
124 PdfFileLoader {
125 iterator: Box::new(self.iterator.map(|res| res.load())),
126 }
127 }
128
129 /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
130 /// or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances with their path
131 /// that can be further processed.
132 ///
133 /// # Example
134 /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
135 ///
136 /// ```rust
137 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load_with_path().into_iter();
138 /// for result in content {
139 /// match result {
140 /// Ok((path, doc)) => println!("{:?} {}", path, doc),
141 /// Err(e) => eprintln!("Error reading pdf: {}", e),
142 /// }
143 /// }
144 /// ```
145 pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
146 PdfFileLoader {
147 iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
148 }
149 }
150}
151
152impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
153 /// Directly reads the contents of the pdfs within the iterator returned by
154 /// [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir].
155 ///
156 /// # Example
157 /// Read pdfs in directory "tests/data/*.pdf" and return the contents of the documents.
158 ///
159 /// ```rust
160 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read_with_path().into_iter();
161 /// for result in content {
162 /// match result {
163 /// Ok((path, content)) => println!("{}", content),
164 /// Err(e) => eprintln!("Error reading pdf: {}", e),
165 /// }
166 /// }
167 /// ```
168 pub fn read(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
169 PdfFileLoader {
170 iterator: Box::new(self.iterator.map(|res| {
171 let doc = res.load()?;
172 Ok(doc
173 .page_iter()
174 .enumerate()
175 .map(|(page_no, _)| {
176 doc.extract_text(&[page_no as u32 + 1])
177 .map_err(PdfLoaderError::PdfError)
178 })
179 .collect::<Result<Vec<String>, PdfLoaderError>>()?
180 .into_iter()
181 .collect::<String>())
182 })),
183 }
184 }
185
186 /// Directly reads the contents of the pdfs within the iterator returned by
187 /// [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir] and returns the path along with
188 /// the content.
189 ///
190 /// # Example
191 /// Read pdfs in directory "tests/data/*.pdf" and return the content and paths of the documents.
192 ///
193 /// ```rust
194 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read_with_path().into_iter();
195 /// for result in content {
196 /// match result {
197 /// Ok((path, content)) => println!("{:?} {}", path, content),
198 /// Err(e) => eprintln!("Error reading pdf: {}", e),
199 /// }
200 /// }
201 /// ```
202 pub fn read_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, String), PdfLoaderError>> {
203 PdfFileLoader {
204 iterator: Box::new(self.iterator.map(|res| {
205 let (path, doc) = res.load_with_path()?;
206 println!(
207 "Loaded {:?} PDF: {:?}",
208 path,
209 doc.page_iter().collect::<Vec<_>>()
210 );
211 let content = doc
212 .page_iter()
213 .enumerate()
214 .map(|(page_no, _)| {
215 doc.extract_text(&[page_no as u32 + 1])
216 .map_err(PdfLoaderError::PdfError)
217 })
218 .collect::<Result<Vec<String>, PdfLoaderError>>()?
219 .into_iter()
220 .collect::<String>();
221
222 Ok((path, content))
223 })),
224 }
225 }
226}
227
228impl<'a> PdfFileLoader<'a, Document> {
229 /// Chunks the pages of a loaded document by page, flattened as a single vector.
230 ///
231 /// # Example
232 /// Load pdfs in directory "tests/data/*.pdf" and chunk all document into it's pages.
233 ///
234 /// ```rust
235 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load().by_page().into_iter();
236 /// for result in content {
237 /// match result {
238 /// Ok(page) => println!("{}", page),
239 /// Err(e) => eprintln!("Error reading pdf: {}", e),
240 /// }
241 /// }
242 /// ```
243 pub fn by_page(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
244 PdfFileLoader {
245 iterator: Box::new(self.iterator.flat_map(|doc| {
246 doc.page_iter()
247 .enumerate()
248 .map(|(page_no, _)| {
249 doc.extract_text(&[page_no as u32 + 1])
250 .map_err(PdfLoaderError::PdfError)
251 })
252 .collect::<Vec<_>>()
253 })),
254 }
255 }
256}
257
258type ByPage = (PathBuf, Vec<(usize, Result<String, PdfLoaderError>)>);
259impl<'a> PdfFileLoader<'a, (PathBuf, Document)> {
260 /// Chunks the pages of a loaded document by page, processed as a vector of documents by path
261 /// which each document container an inner vector of pages by page number.
262 ///
263 /// # Example
264 /// Read pdfs in directory "tests/data/*.pdf" and chunk all documents by path by it's pages.
265 ///
266 /// ```rust
267 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
268 /// .load_with_path()
269 /// .ignore_errors()
270 /// .by_page()
271 /// .into_iter();
272 ///
273 /// for result in content {
274 /// match result {
275 /// Ok(documents) => {
276 /// for doc in documents {
277 /// match doc {
278 /// Ok((pageno, content)) => println!("Page {}: {}", pageno, content),
279 /// Err(e) => eprintln!("Error reading page: {}", e),
280 /// }
281 /// }
282 /// },
283 /// Err(e) => eprintln!("Error reading pdf: {}", e),
284 /// }
285 /// }
286 /// ```
287 pub fn by_page(self) -> PdfFileLoader<'a, ByPage> {
288 PdfFileLoader {
289 iterator: Box::new(self.iterator.map(|(path, doc)| {
290 (
291 path,
292 doc.page_iter()
293 .enumerate()
294 .map(|(page_no, _)| {
295 (
296 page_no,
297 doc.extract_text(&[page_no as u32 + 1])
298 .map_err(PdfLoaderError::PdfError),
299 )
300 })
301 .collect::<Vec<_>>(),
302 )
303 })),
304 }
305 }
306}
307
308impl<'a> PdfFileLoader<'a, ByPage> {
309 /// Ignores errors in the iterator, returning only successful results. This can be used on any
310 /// [PdfFileLoader] state of iterator whose items are results.
311 ///
312 /// # Example
313 /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
314 ///
315 /// ```rust
316 /// let content = FileLoader::with_glob("tests/data/*.pdf")?.read().ignore_errors().into_iter();
317 /// for result in content {
318 /// println!("{}", content)
319 /// }
320 /// ```
321 pub fn ignore_errors(self) -> PdfFileLoader<'a, (PathBuf, Vec<(usize, String)>)> {
322 PdfFileLoader {
323 iterator: Box::new(self.iterator.map(|(path, pages)| {
324 let pages = pages
325 .into_iter()
326 .filter_map(|(page_no, res)| res.ok().map(|content| (page_no, content)))
327 .collect::<Vec<_>>();
328 (path, pages)
329 })),
330 }
331 }
332}
333
334impl<'a, T> PdfFileLoader<'a, Result<T, PdfLoaderError>>
335where
336 T: 'a,
337{
338 /// Ignores errors in the iterator, returning only successful results. This can be used on any
339 /// [PdfFileLoader] state of iterator whose items are results.
340 ///
341 /// # Example
342 /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
343 ///
344 /// ```rust
345 /// let content = FileLoader::with_glob("tests/data/*.pdf")?.read().ignore_errors().into_iter();
346 /// for result in content {
347 /// println!("{}", content)
348 /// }
349 /// ```
350 pub fn ignore_errors(self) -> PdfFileLoader<'a, T> {
351 PdfFileLoader {
352 iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
353 }
354 }
355}
356
357impl PdfFileLoader<'_, Result<PathBuf, FileLoaderError>> {
358 /// Creates a new [PdfFileLoader] using a glob pattern to match files.
359 ///
360 /// # Example
361 /// Create a [PdfFileLoader] for all `.pdf` files that match the glob "tests/data/*.pdf".
362 ///
363 /// ```rust
364 /// let loader = FileLoader::with_glob("tests/data/*.txt")?;
365 /// ```
366 pub fn with_glob(
367 pattern: &str,
368 ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
369 let paths = glob(pattern).map_err(FileLoaderError::PatternError)?;
370 Ok(PdfFileLoader {
371 iterator: Box::new(paths.into_iter().map(|path| {
372 path.map_err(FileLoaderError::GlobError)
373 .map_err(PdfLoaderError::FileLoaderError)
374 })),
375 })
376 }
377
378 /// Creates a new [PdfFileLoader] on all files within a directory.
379 ///
380 /// # Example
381 /// Create a [PdfFileLoader] for all files that are in the directory "files".
382 ///
383 /// ```rust
384 /// let loader = PdfFileLoader::with_dir("files")?;
385 /// ```
386 pub fn with_dir(
387 directory: &str,
388 ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
389 Ok(PdfFileLoader {
390 iterator: Box::new(
391 fs::read_dir(directory)
392 .map_err(FileLoaderError::IoError)?
393 .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
394 ),
395 })
396 }
397}
398
399impl<'a> PdfFileLoader<'a, Vec<u8>> {
400 /// Ingest a PDF as a byte array.
401 pub fn from_bytes(bytes: Vec<u8>) -> PdfFileLoader<'a, Vec<u8>> {
402 PdfFileLoader {
403 iterator: Box::new(vec![bytes].into_iter()),
404 }
405 }
406
407 /// Ingest multiple byte arrays.
408 pub fn from_bytes_multi(bytes_vec: Vec<Vec<u8>>) -> PdfFileLoader<'a, Vec<u8>> {
409 PdfFileLoader {
410 iterator: Box::new(bytes_vec.into_iter()),
411 }
412 }
413
414 /// Use this once you've created the loader to load the document in.
415 pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
416 PdfFileLoader {
417 iterator: Box::new(self.iterator.map(|res| res.load())),
418 }
419 }
420
421 /// Use this once you've created the loader to load the document in (and get the path).
422 pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
423 PdfFileLoader {
424 iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
425 }
426 }
427}
428
429// ================================================================
// PdfFileLoader iterator implementations
431// ================================================================
432
433pub struct IntoIter<'a, T> {
434 iterator: Box<dyn Iterator<Item = T> + 'a>,
435}
436
437impl<'a, T> IntoIterator for PdfFileLoader<'a, T> {
438 type Item = T;
439 type IntoIter = IntoIter<'a, T>;
440
441 fn into_iter(self) -> Self::IntoIter {
442 IntoIter {
443 iterator: self.iterator,
444 }
445 }
446}
447
448impl<T> Iterator for IntoIter<'_, T> {
449 type Item = T;
450
451 fn next(&mut self) -> Option<Self::Item> {
452 self.iterator.next()
453 }
454}
455
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::PdfFileLoader;

    /// Reads the fixture at `fixture` into memory and returns the text of each of its pages,
    /// going through the byte-based loader pipeline.
    fn read_pages_from_bytes(fixture: &str) -> Vec<String> {
        // The fixtures are part of the repository, so a read failure is a broken checkout.
        let bytes = std::fs::read(fixture).expect("fixture PDF should be readable");

        PdfFileLoader::from_bytes(bytes)
            .load()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect()
    }

    /// Loads every fixture matched by the glob and checks the per-page text of each file.
    #[test]
    fn test_pdf_loader() {
        let loader = PdfFileLoader::with_glob("tests/data/*.pdf").unwrap();
        let mut actual = loader
            .load_with_path()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        // Shown only when the test fails; helps compare against the expected pages.
        for (path, pages) in &actual {
            for (page_no, content) in pages {
                println!("{path:?} Page {page_no}: {content:?}");
            }
        }

        let mut expected = vec![
            (
                PathBuf::from("tests/data/dummy.pdf"),
                vec![(0, "Test\nPDF\nDocument\n".to_string())],
            ),
            (
                PathBuf::from("tests/data/pages.pdf"),
                vec![
                    (0, "Page\n1\n".to_string()),
                    (1, "Page\n2\n".to_string()),
                    (2, "Page\n3\n".to_string()),
                ],
            ),
        ];

        // Glob order is not part of the contract, so compare in sorted order.
        actual.sort();
        expected.sort();

        assert!(!actual.is_empty());
        // `assert_eq!` prints both sides on failure, unlike `assert!(a == b)`.
        assert_eq!(expected, actual);
    }

    /// Loads the fixtures from in-memory bytes and checks the flattened per-page text.
    #[test]
    fn test_pdf_loader_bytes() {
        let actual = read_pages_from_bytes("tests/data/dummy.pdf");

        assert_eq!(actual.len(), 1);
        assert_eq!(actual, vec!["Test\nPDF\nDocument\n".to_string()]);

        let actual = read_pages_from_bytes("tests/data/pages.pdf");

        assert_eq!(actual.len(), 3);
        assert_eq!(
            actual,
            vec![
                "Page\n1\n".to_string(),
                "Page\n2\n".to_string(),
                "Page\n3\n".to_string(),
            ]
        );
    }
}
547}