Skip to main content

rig_core/loaders/
pdf.rs

1use std::{fs, path::PathBuf};
2
3use glob::glob;
4use lopdf::{Document, Error as LopdfError};
5use thiserror::Error;
6
7use super::file::FileLoaderError;
8
9#[derive(Error, Debug)]
10pub enum PdfLoaderError {
11    #[error("{0}")]
12    FileLoaderError(#[from] FileLoaderError),
13
14    #[error("UTF-8 conversion error: {0}")]
15    FromUtf8Error(#[from] std::string::FromUtf8Error),
16
17    #[error("IO error: {0}")]
18    PdfError(#[from] LopdfError),
19}
20
21// ================================================================
22// Implementing Loadable trait for loading pdfs
23// ================================================================
24
/// Crate-internal abstraction over values that can be turned into a parsed PDF [`Document`].
pub(crate) trait Loadable {
    /// Consumes `self` and returns the loaded document, or a [`PdfLoaderError`] on failure.
    fn load(self) -> Result<Document, PdfLoaderError>;
    /// Consumes `self` and returns the loaded document paired with a [`PathBuf`] describing
    /// where it came from, or a [`PdfLoaderError`] on failure.
    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError>;
}
29
30impl Loadable for PathBuf {
31    fn load(self) -> Result<Document, PdfLoaderError> {
32        Document::load(self).map_err(PdfLoaderError::PdfError)
33    }
34    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
35        let contents = Document::load(&self);
36        Ok((self, contents?))
37    }
38}
39
40impl<T> Loadable for Result<T, PdfLoaderError>
41where
42    T: Loadable,
43{
44    fn load(self) -> Result<Document, PdfLoaderError> {
45        self.map(|t| t.load())?
46    }
47    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
48        self.map(|t| t.load_with_path())?
49    }
50}
51
52impl Loadable for Vec<u8> {
53    fn load(self) -> Result<Document, PdfLoaderError> {
54        Document::load_mem(&self).map_err(PdfLoaderError::PdfError)
55    }
56
57    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
58        let doc = Document::load_mem(&self).map_err(PdfLoaderError::PdfError)?;
59        Ok((PathBuf::from("<memory>"), doc))
60    }
61}
62
63// ================================================================
64// PdfFileLoader definitions and implementations
65// ================================================================
66
/// [PdfFileLoader] is a utility for loading pdf files from the filesystem using glob patterns or
///  directory paths. It provides methods to read file contents and handle errors gracefully.
///
/// # Errors
///
/// This module defines a custom error type [PdfLoaderError] which can represent various errors
///  that might occur during file loading operations, such as any [FileLoaderError] alongside
///  specific PDF-related errors.
///
/// # Example Usage
///
/// ```no_run
/// use rig_core::loaders::PdfFileLoader;
///
/// fn main() -> Result<(), Box<dyn std::error::Error>> {
///     // Create a PdfFileLoader using a glob pattern
///     let loader = PdfFileLoader::with_glob("tests/data/*.pdf")?;
///
///     // Load pdf file contents by page, ignoring any errors
///     let contents: Vec<String> = loader
///         .load()
///         .ignore_errors()
///         .by_page()
///         .ignore_errors()
///         .into_iter()
///         .collect();
///
///     for content in contents {
///         println!("{}", content);
///     }
///
///     Ok(())
/// }
/// ```
///
/// [PdfFileLoader] uses strict typing between the iterator methods to ensure that transitions
///  between different implementations of the loaders and its methods are handled properly by
///  the compiler.
pub struct PdfFileLoader<'a, T> {
    // Boxed so that each pipeline stage can wrap the previous stage's iterator regardless of
    // its concrete type; the item type `T` decides which methods are available.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
108
109impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
110    /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
111    ///  or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances that can be
112    ///  further processed (by page, etc).
113    ///
114    /// # Example
115    /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
116    ///
117    /// ```no_run
118    /// # use rig_core::loaders::PdfFileLoader;
119    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
120    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load().into_iter();
121    /// for result in content {
122    ///     match result {
123    ///         Ok(doc) => println!("{:?}", doc),
124    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
125    ///     }
126    /// }
127    /// # Ok(())
128    /// # }
129    /// ```
130    pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
131        PdfFileLoader {
132            iterator: Box::new(self.iterator.map(|res| res.load())),
133        }
134    }
135
136    /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
137    ///  or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances with their path
138    ///  that can be further processed.
139    ///
140    /// # Example
141    /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
142    ///
143    /// ```no_run
144    /// # use rig_core::loaders::PdfFileLoader;
145    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
146    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load_with_path().into_iter();
147    /// for result in content {
148    ///     match result {
149    ///         Ok((path, doc)) => println!("{:?} {:?}", path, doc),
150    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
151    ///     }
152    /// }
153    /// # Ok(())
154    /// # }
155    /// ```
156    pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
157        PdfFileLoader {
158            iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
159        }
160    }
161}
162
163impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
164    /// Directly reads the contents of the pdfs within the iterator returned by
165    ///  [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir].
166    ///
167    /// # Example
168    /// Read pdfs in directory "tests/data/*.pdf" and return the contents of the documents.
169    ///
170    /// ```no_run
171    /// # use rig_core::loaders::PdfFileLoader;
172    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
173    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read().into_iter();
174    /// for result in content {
175    ///     match result {
176    ///         Ok(content) => println!("{}", content),
177    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
178    ///     }
179    /// }
180    /// # Ok(())
181    /// # }
182    /// ```
183    pub fn read(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
184        PdfFileLoader {
185            iterator: Box::new(self.iterator.map(|res| {
186                let doc = res.load()?;
187                Ok(doc
188                    .page_iter()
189                    .enumerate()
190                    .map(|(page_no, _)| {
191                        doc.extract_text(&[page_no as u32 + 1])
192                            .map_err(PdfLoaderError::PdfError)
193                    })
194                    .collect::<Result<Vec<String>, PdfLoaderError>>()?
195                    .into_iter()
196                    .collect::<String>())
197            })),
198        }
199    }
200
201    /// Directly reads the contents of the pdfs within the iterator returned by
202    ///  [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir] and returns the path along with
203    ///  the content.
204    ///
205    /// # Example
206    /// Read pdfs in directory "tests/data/*.pdf" and return the content and paths of the documents.
207    ///
208    /// ```no_run
209    /// # use rig_core::loaders::PdfFileLoader;
210    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
211    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read_with_path().into_iter();
212    /// for result in content {
213    ///     match result {
214    ///         Ok((path, content)) => println!("{:?} {}", path, content),
215    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
216    ///     }
217    /// }
218    /// # Ok(())
219    /// # }
220    /// ```
221    pub fn read_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, String), PdfLoaderError>> {
222        PdfFileLoader {
223            iterator: Box::new(self.iterator.map(|res| {
224                let (path, doc) = res.load_with_path()?;
225                println!(
226                    "Loaded {:?} PDF: {:?}",
227                    path,
228                    doc.page_iter().collect::<Vec<_>>()
229                );
230                let content = doc
231                    .page_iter()
232                    .enumerate()
233                    .map(|(page_no, _)| {
234                        doc.extract_text(&[page_no as u32 + 1])
235                            .map_err(PdfLoaderError::PdfError)
236                    })
237                    .collect::<Result<Vec<String>, PdfLoaderError>>()?
238                    .into_iter()
239                    .collect::<String>();
240
241                Ok((path, content))
242            })),
243        }
244    }
245}
246
247impl<'a> PdfFileLoader<'a, Document> {
248    /// Chunks the pages of a loaded document by page, flattened as a single vector.
249    ///
250    /// # Example
251    /// Load pdfs in directory "tests/data/*.pdf" and chunk all document into it's pages.
252    ///
253    /// ```no_run
254    /// # use rig_core::loaders::PdfFileLoader;
255    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
256    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
257    ///     .load()
258    ///     .ignore_errors()
259    ///     .by_page()
260    ///     .into_iter();
261    /// for result in content {
262    ///     match result {
263    ///         Ok(page) => println!("{}", page),
264    ///         Err(e) => eprintln!("Error reading pdf: {}", e),
265    ///     }
266    /// }
267    /// # Ok(())
268    /// # }
269    /// ```
270    pub fn by_page(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
271        PdfFileLoader {
272            iterator: Box::new(self.iterator.flat_map(|doc| {
273                doc.page_iter()
274                    .enumerate()
275                    .map(|(page_no, _)| {
276                        doc.extract_text(&[page_no as u32 + 1])
277                            .map_err(PdfLoaderError::PdfError)
278                    })
279                    .collect::<Vec<_>>()
280            })),
281        }
282    }
283}
284
285type ByPage = (PathBuf, Vec<(usize, Result<String, PdfLoaderError>)>);
286impl<'a> PdfFileLoader<'a, (PathBuf, Document)> {
287    /// Chunks the pages of a loaded document by page, processed as a vector of documents by path
288    ///  which each document container an inner vector of pages by page number.
289    ///
290    /// # Example
291    /// Read pdfs in directory "tests/data/*.pdf" and chunk all documents by path by it's pages.
292    ///
293    /// ```no_run
294    /// # use rig_core::loaders::PdfFileLoader;
295    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
296    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
297    ///     .load_with_path()
298    ///     .ignore_errors()
299    ///     .by_page()
300    ///     .into_iter();
301    ///
302    /// for (path, pages) in content {
303    ///     println!("{}", path.display());
304    ///     for (pageno, result) in pages {
305    ///         match result {
306    ///             Ok(content) => println!("Page {}: {}", pageno, content),
307    ///             Err(e) => eprintln!("Error reading page: {}", e),
308    ///         }
309    ///     }
310    /// }
311    /// # Ok(())
312    /// # }
313    /// ```
314    pub fn by_page(self) -> PdfFileLoader<'a, ByPage> {
315        PdfFileLoader {
316            iterator: Box::new(self.iterator.map(|(path, doc)| {
317                (
318                    path,
319                    doc.page_iter()
320                        .enumerate()
321                        .map(|(page_no, _)| {
322                            (
323                                page_no,
324                                doc.extract_text(&[page_no as u32 + 1])
325                                    .map_err(PdfLoaderError::PdfError),
326                            )
327                        })
328                        .collect::<Vec<_>>(),
329                )
330            })),
331        }
332    }
333}
334
335impl<'a> PdfFileLoader<'a, ByPage> {
336    /// Ignores errors in the iterator, returning only successful results. This can be used on any
337    ///  [PdfFileLoader] state of iterator whose items are results.
338    ///
339    /// # Example
340    /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
341    ///
342    /// ```no_run
343    /// # use rig_core::loaders::PdfFileLoader;
344    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
345    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
346    ///     .load_with_path()
347    ///     .ignore_errors()
348    ///     .by_page()
349    ///     .ignore_errors();
350    /// for (_path, pages) in content {
351    ///     println!("{}", pages.len())
352    /// }
353    /// # Ok(())
354    /// # }
355    /// ```
356    pub fn ignore_errors(self) -> PdfFileLoader<'a, (PathBuf, Vec<(usize, String)>)> {
357        PdfFileLoader {
358            iterator: Box::new(self.iterator.map(|(path, pages)| {
359                let pages = pages
360                    .into_iter()
361                    .filter_map(|(page_no, res)| res.ok().map(|content| (page_no, content)))
362                    .collect::<Vec<_>>();
363                (path, pages)
364            })),
365        }
366    }
367}
368
369impl<'a, T> PdfFileLoader<'a, Result<T, PdfLoaderError>>
370where
371    T: 'a,
372{
373    /// Ignores errors in the iterator, returning only successful results. This can be used on any
374    ///  [PdfFileLoader] state of iterator whose items are results.
375    ///
376    /// # Example
377    /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
378    ///
379    /// ```no_run
380    /// # use rig_core::loaders::PdfFileLoader;
381    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
382    /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read().ignore_errors();
383    /// for content in content {
384    ///     println!("{}", content)
385    /// }
386    /// # Ok(())
387    /// # }
388    /// ```
389    pub fn ignore_errors(self) -> PdfFileLoader<'a, T> {
390        PdfFileLoader {
391            iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
392        }
393    }
394}
395
396impl PdfFileLoader<'_, Result<PathBuf, FileLoaderError>> {
397    /// Creates a new [PdfFileLoader] using a glob pattern to match files.
398    ///
399    /// # Example
400    /// Create a [PdfFileLoader] for all `.pdf` files that match the glob "tests/data/*.pdf".
401    ///
402    /// ```no_run
403    /// # use rig_core::loaders::PdfFileLoader;
404    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
405    /// let loader = PdfFileLoader::with_glob("tests/data/*.pdf")?;
406    /// # Ok(())
407    /// # }
408    /// ```
409    pub fn with_glob(
410        pattern: &str,
411    ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
412        let paths = glob(pattern).map_err(FileLoaderError::PatternError)?;
413        Ok(PdfFileLoader {
414            iterator: Box::new(paths.into_iter().map(|path| {
415                path.map_err(FileLoaderError::GlobError)
416                    .map_err(PdfLoaderError::FileLoaderError)
417            })),
418        })
419    }
420
421    /// Creates a new [PdfFileLoader] on all files within a directory.
422    ///
423    /// # Example
424    /// Create a [PdfFileLoader] for all files that are in the directory "files".
425    ///
426    /// ```no_run
427    /// # use rig_core::loaders::PdfFileLoader;
428    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
429    /// let loader = PdfFileLoader::with_dir("files")?;
430    /// # Ok(())
431    /// # }
432    /// ```
433    pub fn with_dir(
434        directory: &str,
435    ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
436        Ok(PdfFileLoader {
437            iterator: Box::new(
438                fs::read_dir(directory)
439                    .map_err(FileLoaderError::IoError)?
440                    .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
441            ),
442        })
443    }
444}
445
446impl<'a> PdfFileLoader<'a, Vec<u8>> {
447    /// Ingest a PDF as a byte array.
448    pub fn from_bytes(bytes: Vec<u8>) -> PdfFileLoader<'a, Vec<u8>> {
449        PdfFileLoader {
450            iterator: Box::new(vec![bytes].into_iter()),
451        }
452    }
453
454    /// Ingest multiple byte arrays.
455    pub fn from_bytes_multi(bytes_vec: Vec<Vec<u8>>) -> PdfFileLoader<'a, Vec<u8>> {
456        PdfFileLoader {
457            iterator: Box::new(bytes_vec.into_iter()),
458        }
459    }
460
461    /// Use this once you've created the loader to load the document in.
462    pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
463        PdfFileLoader {
464            iterator: Box::new(self.iterator.map(|res| res.load())),
465        }
466    }
467
468    /// Use this once you've created the loader to load the document in (and get the path).
469    pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
470        PdfFileLoader {
471            iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
472        }
473    }
474}
475
476// ================================================================
// PdfFileLoader iterator implementations
478// ================================================================
479
480pub struct IntoIter<'a, T> {
481    iterator: Box<dyn Iterator<Item = T> + 'a>,
482}
483
484impl<'a, T> IntoIterator for PdfFileLoader<'a, T> {
485    type Item = T;
486    type IntoIter = IntoIter<'a, T>;
487
488    fn into_iter(self) -> Self::IntoIter {
489        IntoIter {
490            iterator: self.iterator,
491        }
492    }
493}
494
495impl<T> Iterator for IntoIter<'_, T> {
496    type Item = T;
497
498    fn next(&mut self) -> Option<Self::Item> {
499        self.iterator.next()
500    }
501}
502
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::PdfFileLoader;

    /// Path of a fixture file inside the shared `tests/data` directory.
    fn fixture_path(filename: &str) -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../../tests/data")
            .join(filename)
    }

    /// Glob pattern matching every PDF fixture inside the shared `tests/data` directory.
    fn fixture_glob() -> String {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../../tests/data/*.pdf")
            .to_string_lossy()
            .into_owned()
    }

    /// Reads a fixture into memory and returns the text of each page, skipping any errors.
    fn pages_from_bytes(filename: &str) -> Vec<String> {
        let bytes =
            std::fs::read(fixture_path(filename)).expect("fixture file should be readable");

        PdfFileLoader::from_bytes(bytes)
            .load()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect()
    }

    #[test]
    fn test_pdf_loader() {
        let glob = fixture_glob();
        let mut actual = PdfFileLoader::with_glob(&glob)
            .unwrap()
            .load_with_path()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        let mut expected = vec![
            (
                fixture_path("dummy.pdf"),
                vec![(0, "Test\nPDF\nDocument\n".to_string())],
            ),
            (
                fixture_path("file-id-verifiers.pdf"),
                vec![
                    (0, "rig-file-id-page-one-verifier-3a91\n".to_string()),
                    (1, "rig-file-id-page-two-verifier-8c27\n".to_string()),
                    (2, "rig-file-id-page-three-verifier-f54e\n".to_string()),
                ],
            ),
            (
                fixture_path("pages.pdf"),
                vec![
                    (0, "Page\n1\n".to_string()),
                    (1, "Page\n2\n".to_string()),
                    (2, "Page\n3\n".to_string()),
                ],
            ),
        ];

        // Glob iteration order is not part of the contract, so compare in sorted order.
        actual.sort();
        expected.sort();

        assert!(!actual.is_empty());
        // `assert_eq!` prints both sides on failure, unlike `assert!(expected == actual)`.
        assert_eq!(expected, actual);
    }

    #[test]
    fn test_pdf_loader_bytes() {
        assert_eq!(
            pages_from_bytes("dummy.pdf"),
            vec!["Test\nPDF\nDocument\n".to_string()]
        );

        assert_eq!(
            pages_from_bytes("pages.pdf"),
            vec![
                "Page\n1\n".to_string(),
                "Page\n2\n".to_string(),
                "Page\n3\n".to_string(),
            ]
        );
    }
}