rig/loaders/
pdf.rs

1use std::{fs, path::PathBuf};
2
3use glob::glob;
4use lopdf::{Document, Error as LopdfError};
5use thiserror::Error;
6
7use super::file::FileLoaderError;
8
9#[derive(Error, Debug)]
10pub enum PdfLoaderError {
11    #[error("{0}")]
12    FileLoaderError(#[from] FileLoaderError),
13
14    #[error("UTF-8 conversion error: {0}")]
15    FromUtf8Error(#[from] std::string::FromUtf8Error),
16
17    #[error("IO error: {0}")]
18    PdfError(#[from] LopdfError),
19}
20
21// ================================================================
22// Implementing Loadable trait for loading pdfs
23// ================================================================
24
25pub(crate) trait Loadable {
26    fn load(self) -> Result<Document, PdfLoaderError>;
27    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError>;
28}
29
30impl Loadable for PathBuf {
31    fn load(self) -> Result<Document, PdfLoaderError> {
32        Document::load(self).map_err(PdfLoaderError::PdfError)
33    }
34    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
35        let contents = Document::load(&self);
36        Ok((self, contents?))
37    }
38}
39
40impl<T> Loadable for Result<T, PdfLoaderError>
41where
42    T: Loadable,
43{
44    fn load(self) -> Result<Document, PdfLoaderError> {
45        self.map(|t| t.load())?
46    }
47    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
48        self.map(|t| t.load_with_path())?
49    }
50}
51
52impl Loadable for Vec<u8> {
53    fn load(self) -> Result<Document, PdfLoaderError> {
54        Document::load_mem(&self).map_err(PdfLoaderError::PdfError)
55    }
56
57    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
58        let doc = Document::load_mem(&self).map_err(PdfLoaderError::PdfError)?;
59        Ok((PathBuf::from("<memory>"), doc))
60    }
61}
62
63// ================================================================
64// PdfFileLoader definitions and implementations
65// ================================================================
66
/// [PdfFileLoader] is a utility for loading pdf files from the filesystem using glob patterns or
/// directory paths. It provides methods to read file contents and handle errors gracefully.
///
/// # Errors
///
/// This module defines a custom error type [PdfLoaderError] which can represent various errors
/// that might occur during file loading operations, such as any [FileLoaderError] alongside
/// specific PDF-related errors.
///
/// # Example Usage
///
/// ```rust
/// use rig::loaders::PdfFileLoader;
///
/// fn main() -> Result<(), Box<dyn std::error::Error>> {
///     // Create a PdfFileLoader using a glob pattern
///     let loader = PdfFileLoader::with_glob("tests/data/*.pdf")?;
///
///     // Load pdf file contents by page, ignoring any errors
///     let contents = loader
///         .load_with_path()
///         .ignore_errors()
///         .by_page()
///         .ignore_errors();
///
///     for (path, pages) in contents {
///         for (page_no, content) in pages {
///             println!("{:?} page {}: {}", path, page_no, content);
///         }
///     }
///
///     Ok(())
/// }
/// ```
///
/// [PdfFileLoader] uses strict typing between the iterator methods to ensure that transitions
/// between different implementations of the loaders and its methods are handled properly by
/// the compiler.
pub struct PdfFileLoader<'a, T> {
    /// The lazily evaluated stream of items in the loader's current state.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
105
106impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
107    /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
108    ///  or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances that can be
109    ///  further processed (by page, etc).
110    ///
111    /// # Example
112    /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
113    ///
114    /// ```rust
115    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load().into_iter();
116    /// for result in content {
117    ///     match result {
118    ///         Ok((path, doc)) => println!("{:?} {}", path, doc),
119    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
120    ///     }
121    /// }
122    /// ```
123    pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
124        PdfFileLoader {
125            iterator: Box::new(self.iterator.map(|res| res.load())),
126        }
127    }
128
129    /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
130    ///  or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances with their path
131    ///  that can be further processed.
132    ///
133    /// # Example
134    /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
135    ///
136    /// ```rust
137    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load_with_path().into_iter();
138    /// for result in content {
139    ///     match result {
140    ///         Ok((path, doc)) => println!("{:?} {}", path, doc),
141    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
142    ///     }
143    /// }
144    /// ```
145    pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
146        PdfFileLoader {
147            iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
148        }
149    }
150}
151
152impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
153    /// Directly reads the contents of the pdfs within the iterator returned by
154    ///  [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir].
155    ///
156    /// # Example
157    /// Read pdfs in directory "tests/data/*.pdf" and return the contents of the documents.
158    ///
159    /// ```rust
160    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read_with_path().into_iter();
161    /// for result in content {
162    ///     match result {
163    ///         Ok((path, content)) => println!("{}", content),
164    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
165    ///     }
166    /// }
167    /// ```
168    pub fn read(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
169        PdfFileLoader {
170            iterator: Box::new(self.iterator.map(|res| {
171                let doc = res.load()?;
172                Ok(doc
173                    .page_iter()
174                    .enumerate()
175                    .map(|(page_no, _)| {
176                        doc.extract_text(&[page_no as u32 + 1])
177                            .map_err(PdfLoaderError::PdfError)
178                    })
179                    .collect::<Result<Vec<String>, PdfLoaderError>>()?
180                    .into_iter()
181                    .collect::<String>())
182            })),
183        }
184    }
185
186    /// Directly reads the contents of the pdfs within the iterator returned by
187    ///  [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir] and returns the path along with
188    ///  the content.
189    ///
190    /// # Example
191    /// Read pdfs in directory "tests/data/*.pdf" and return the content and paths of the documents.
192    ///
193    /// ```rust
194    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read_with_path().into_iter();
195    /// for result in content {
196    ///     match result {
197    ///         Ok((path, content)) => println!("{:?} {}", path, content),
198    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
199    ///     }
200    /// }
201    /// ```
202    pub fn read_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, String), PdfLoaderError>> {
203        PdfFileLoader {
204            iterator: Box::new(self.iterator.map(|res| {
205                let (path, doc) = res.load_with_path()?;
206                println!(
207                    "Loaded {:?} PDF: {:?}",
208                    path,
209                    doc.page_iter().collect::<Vec<_>>()
210                );
211                let content = doc
212                    .page_iter()
213                    .enumerate()
214                    .map(|(page_no, _)| {
215                        doc.extract_text(&[page_no as u32 + 1])
216                            .map_err(PdfLoaderError::PdfError)
217                    })
218                    .collect::<Result<Vec<String>, PdfLoaderError>>()?
219                    .into_iter()
220                    .collect::<String>();
221
222                Ok((path, content))
223            })),
224        }
225    }
226}
227
228impl<'a> PdfFileLoader<'a, Document> {
229    /// Chunks the pages of a loaded document by page, flattened as a single vector.
230    ///
231    /// # Example
232    /// Load pdfs in directory "tests/data/*.pdf" and chunk all document into it's pages.
233    ///
234    /// ```rust
235    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load().by_page().into_iter();
236    /// for result in content {
237    ///     match result {
238    ///         Ok(page) => println!("{}", page),
239    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
240    ///     }
241    /// }
242    /// ```
243    pub fn by_page(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
244        PdfFileLoader {
245            iterator: Box::new(self.iterator.flat_map(|doc| {
246                doc.page_iter()
247                    .enumerate()
248                    .map(|(page_no, _)| {
249                        doc.extract_text(&[page_no as u32 + 1])
250                            .map_err(PdfLoaderError::PdfError)
251                    })
252                    .collect::<Vec<_>>()
253            })),
254        }
255    }
256}
257
258type ByPage = (PathBuf, Vec<(usize, Result<String, PdfLoaderError>)>);
259impl<'a> PdfFileLoader<'a, (PathBuf, Document)> {
260    /// Chunks the pages of a loaded document by page, processed as a vector of documents by path
261    ///  which each document container an inner vector of pages by page number.
262    ///
263    /// # Example
264    /// Read pdfs in directory "tests/data/*.pdf" and chunk all documents by path by it's pages.
265    ///
266    /// ```rust
267    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
268    ///     .load_with_path()
269    ///     .ignore_errors()
270    ///     .by_page()
271    ///     .into_iter();
272    ///
273    /// for result in content {
274    ///     match result {
275    ///         Ok(documents) => {
276    ///             for doc in documents {
277    ///                 match doc {
278    ///                     Ok((pageno, content)) => println!("Page {}: {}", pageno, content),
279    ///                     Err(e) => eprintln!("Error reading page: {}", e),
280    ///                }
281    ///             }
282    ///         },
283    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
284    ///     }
285    /// }
286    /// ```
287    pub fn by_page(self) -> PdfFileLoader<'a, ByPage> {
288        PdfFileLoader {
289            iterator: Box::new(self.iterator.map(|(path, doc)| {
290                (
291                    path,
292                    doc.page_iter()
293                        .enumerate()
294                        .map(|(page_no, _)| {
295                            (
296                                page_no,
297                                doc.extract_text(&[page_no as u32 + 1])
298                                    .map_err(PdfLoaderError::PdfError),
299                            )
300                        })
301                        .collect::<Vec<_>>(),
302                )
303            })),
304        }
305    }
306}
307
308impl<'a> PdfFileLoader<'a, ByPage> {
309    /// Ignores errors in the iterator, returning only successful results. This can be used on any
310    ///  [PdfFileLoader] state of iterator whose items are results.
311    ///
312    /// # Example
313    /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
314    ///
315    /// ```rust
316    /// let content = FileLoader::with_glob("tests/data/*.pdf")?.read().ignore_errors().into_iter();
317    /// for result in content {
318    ///     println!("{}", content)
319    /// }
320    /// ```
321    pub fn ignore_errors(self) -> PdfFileLoader<'a, (PathBuf, Vec<(usize, String)>)> {
322        PdfFileLoader {
323            iterator: Box::new(self.iterator.map(|(path, pages)| {
324                let pages = pages
325                    .into_iter()
326                    .filter_map(|(page_no, res)| res.ok().map(|content| (page_no, content)))
327                    .collect::<Vec<_>>();
328                (path, pages)
329            })),
330        }
331    }
332}
333
334impl<'a, T> PdfFileLoader<'a, Result<T, PdfLoaderError>>
335where
336    T: 'a,
337{
338    /// Ignores errors in the iterator, returning only successful results. This can be used on any
339    ///  [PdfFileLoader] state of iterator whose items are results.
340    ///
341    /// # Example
342    /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
343    ///
344    /// ```rust
345    /// let content = FileLoader::with_glob("tests/data/*.pdf")?.read().ignore_errors().into_iter();
346    /// for result in content {
347    ///     println!("{}", content)
348    /// }
349    /// ```
350    pub fn ignore_errors(self) -> PdfFileLoader<'a, T> {
351        PdfFileLoader {
352            iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
353        }
354    }
355}
356
357impl PdfFileLoader<'_, Result<PathBuf, FileLoaderError>> {
358    /// Creates a new [PdfFileLoader] using a glob pattern to match files.
359    ///
360    /// # Example
361    /// Create a [PdfFileLoader] for all `.pdf` files that match the glob "tests/data/*.pdf".
362    ///
363    /// ```rust
364    /// let loader = FileLoader::with_glob("tests/data/*.txt")?;
365    /// ```
366    pub fn with_glob(
367        pattern: &str,
368    ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
369        let paths = glob(pattern).map_err(FileLoaderError::PatternError)?;
370        Ok(PdfFileLoader {
371            iterator: Box::new(paths.into_iter().map(|path| {
372                path.map_err(FileLoaderError::GlobError)
373                    .map_err(PdfLoaderError::FileLoaderError)
374            })),
375        })
376    }
377
378    /// Creates a new [PdfFileLoader] on all files within a directory.
379    ///
380    /// # Example
381    /// Create a [PdfFileLoader] for all files that are in the directory "files".
382    ///
383    /// ```rust
384    /// let loader = PdfFileLoader::with_dir("files")?;
385    /// ```
386    pub fn with_dir(
387        directory: &str,
388    ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
389        Ok(PdfFileLoader {
390            iterator: Box::new(
391                fs::read_dir(directory)
392                    .map_err(FileLoaderError::IoError)?
393                    .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
394            ),
395        })
396    }
397}
398
399impl<'a> PdfFileLoader<'a, Vec<u8>> {
400    /// Ingest a PDF as a byte array.
401    pub fn from_bytes(bytes: Vec<u8>) -> PdfFileLoader<'a, Vec<u8>> {
402        PdfFileLoader {
403            iterator: Box::new(vec![bytes].into_iter()),
404        }
405    }
406
407    /// Ingest multiple byte arrays.
408    pub fn from_bytes_multi(bytes_vec: Vec<Vec<u8>>) -> PdfFileLoader<'a, Vec<u8>> {
409        PdfFileLoader {
410            iterator: Box::new(bytes_vec.into_iter()),
411        }
412    }
413
414    /// Use this once you've created the loader to load the document in.
415    pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
416        PdfFileLoader {
417            iterator: Box::new(self.iterator.map(|res| res.load())),
418        }
419    }
420
421    /// Use this once you've created the loader to load the document in (and get the path).
422    pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
423        PdfFileLoader {
424            iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
425        }
426    }
427}
428
429// ================================================================
430// PDFFileLoader iterator implementations
431// ================================================================
432
/// Owning iterator over the items of a `PdfFileLoader`, produced by its `IntoIterator`
/// implementation.
pub struct IntoIter<'a, T> {
    /// The boxed iterator taken over from the loader.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
436
437impl<'a, T> IntoIterator for PdfFileLoader<'a, T> {
438    type Item = T;
439    type IntoIter = IntoIter<'a, T>;
440
441    fn into_iter(self) -> Self::IntoIter {
442        IntoIter {
443            iterator: self.iterator,
444        }
445    }
446}
447
448impl<T> Iterator for IntoIter<'_, T> {
449    type Item = T;
450
451    fn next(&mut self) -> Option<Self::Item> {
452        self.iterator.next()
453    }
454}
455
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::PdfFileLoader;

    /// Reads the PDF at `path` into memory, loads it from those bytes, and returns the text of
    /// every page that could be extracted.
    fn page_texts_from_bytes(path: &str) -> Vec<String> {
        // this should never fail!
        let bytes = std::fs::read(path).unwrap();

        PdfFileLoader::from_bytes(bytes)
            .load()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>()
    }

    #[test]
    fn test_pdf_loader() {
        let mut actual = PdfFileLoader::with_glob("tests/data/*.pdf")
            .unwrap()
            .load_with_path()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        // Dump what was extracted so a failing run is easy to diagnose.
        for (path, pages) in &actual {
            for (page_no, content) in pages {
                println!("{path:?} Page {page_no}: {content:?}");
            }
        }

        let dummy = (
            PathBuf::from("tests/data/dummy.pdf"),
            vec![(0, "Test\nPDF\nDocument\n".to_string())],
        );
        let pages = (
            PathBuf::from("tests/data/pages.pdf"),
            vec![
                (0, "Page\n1\n".to_string()),
                (1, "Page\n2\n".to_string()),
                (2, "Page\n3\n".to_string()),
            ],
        );
        let mut expected = vec![dummy, pages];

        // Glob order is not part of the contract, so compare in sorted order.
        actual.sort();
        expected.sort();

        assert!(!actual.is_empty());
        assert!(expected == actual)
    }

    #[test]
    fn test_pdf_loader_bytes() {
        let actual = page_texts_from_bytes("tests/data/dummy.pdf");

        assert_eq!(actual.len(), 1);
        assert_eq!(actual, vec!["Test\nPDF\nDocument\n".to_string()]);

        let actual = page_texts_from_bytes("tests/data/pages.pdf");

        assert_eq!(actual.len(), 3);
        assert_eq!(
            actual,
            vec![
                "Page\n1\n".to_string(),
                "Page\n2\n".to_string(),
                "Page\n3\n".to_string(),
            ]
        );
    }
}
547}