rig_core/loaders/pdf.rs
1use std::{fs, path::PathBuf};
2
3use glob::glob;
4use lopdf::{Document, Error as LopdfError};
5use thiserror::Error;
6
7use super::file::FileLoaderError;
8
9#[derive(Error, Debug)]
10pub enum PdfLoaderError {
11 #[error("{0}")]
12 FileLoaderError(#[from] FileLoaderError),
13
14 #[error("UTF-8 conversion error: {0}")]
15 FromUtf8Error(#[from] std::string::FromUtf8Error),
16
17 #[error("IO error: {0}")]
18 PdfError(#[from] LopdfError),
19}
20
21// ================================================================
22// Implementing Loadable trait for loading pdfs
23// ================================================================
24
/// Internal abstraction over the sources a PDF [`Document`] can be loaded from.
///
/// In this file it is implemented for [`PathBuf`] (load from disk), `Vec<u8>` (load from
/// memory) and `Result<T, PdfLoaderError>` where `T: Loadable` (propagate an earlier error).
pub(crate) trait Loadable {
    /// Consumes the source and loads it as a [`Document`].
    fn load(self) -> Result<Document, PdfLoaderError>;
    /// Consumes the source and loads it as a [`Document`], returned together with a path
    /// identifying where the document came from.
    fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError>;
}
29
30impl Loadable for PathBuf {
31 fn load(self) -> Result<Document, PdfLoaderError> {
32 Document::load(self).map_err(PdfLoaderError::PdfError)
33 }
34 fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
35 let contents = Document::load(&self);
36 Ok((self, contents?))
37 }
38}
39
40impl<T> Loadable for Result<T, PdfLoaderError>
41where
42 T: Loadable,
43{
44 fn load(self) -> Result<Document, PdfLoaderError> {
45 self.map(|t| t.load())?
46 }
47 fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
48 self.map(|t| t.load_with_path())?
49 }
50}
51
52impl Loadable for Vec<u8> {
53 fn load(self) -> Result<Document, PdfLoaderError> {
54 Document::load_mem(&self).map_err(PdfLoaderError::PdfError)
55 }
56
57 fn load_with_path(self) -> Result<(PathBuf, Document), PdfLoaderError> {
58 let doc = Document::load_mem(&self).map_err(PdfLoaderError::PdfError)?;
59 Ok((PathBuf::from("<memory>"), doc))
60 }
61}
62
63// ================================================================
64// PdfFileLoader definitions and implementations
65// ================================================================
66
/// [PdfFileLoader] is a utility for loading pdf files from the filesystem using glob patterns or
/// directory paths. It provides methods to read file contents and handle errors gracefully.
///
/// # Errors
///
/// This module defines a custom error type [PdfLoaderError] which can represent various errors
/// that might occur during file loading operations, such as any [FileLoaderError] alongside
/// specific PDF-related errors.
///
/// # Example Usage
///
/// ```no_run
/// use rig_core::loaders::PdfFileLoader;
///
/// fn main() -> Result<(), Box<dyn std::error::Error>> {
///     // Create a PdfFileLoader using a glob pattern
///     let loader = PdfFileLoader::with_glob("tests/data/*.pdf")?;
///
///     // Load pdf file contents by page, ignoring any errors
///     let contents: Vec<String> = loader
///         .load()
///         .ignore_errors()
///         .by_page()
///         .ignore_errors()
///         .into_iter()
///         .collect();
///
///     for content in contents {
///         println!("{}", content);
///     }
///
///     Ok(())
/// }
/// ```
///
/// [PdfFileLoader] uses strict typing between the iterator methods to ensure that transitions
/// between different implementations of the loaders and its methods are handled properly by
/// the compiler.
pub struct PdfFileLoader<'a, T> {
    // Boxed trait object: each stage wraps a different iterator adaptor, yet every stage is
    // still expressed as the same `PdfFileLoader<'a, T>` type.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
108
109impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
110 /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
111 /// or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances that can be
112 /// further processed (by page, etc).
113 ///
114 /// # Example
115 /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
116 ///
117 /// ```no_run
118 /// # use rig_core::loaders::PdfFileLoader;
119 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
120 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load().into_iter();
121 /// for result in content {
122 /// match result {
123 /// Ok(doc) => println!("{:?}", doc),
124 /// Err(e) => eprintln!("Error reading pdf: {}", e),
125 /// }
126 /// }
127 /// # Ok(())
128 /// # }
129 /// ```
130 pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
131 PdfFileLoader {
132 iterator: Box::new(self.iterator.map(|res| res.load())),
133 }
134 }
135
136 /// Loads the contents of the pdfs within the iterator returned by [PdfFileLoader::with_glob]
137 /// or [PdfFileLoader::with_dir]. Loaded PDF documents are raw PDF instances with their path
138 /// that can be further processed.
139 ///
140 /// # Example
141 /// Load pdfs in directory "tests/data/*.pdf" and return the loaded documents
142 ///
143 /// ```no_run
144 /// # use rig_core::loaders::PdfFileLoader;
145 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
146 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.load_with_path().into_iter();
147 /// for result in content {
148 /// match result {
149 /// Ok((path, doc)) => println!("{:?} {:?}", path, doc),
150 /// Err(e) => eprintln!("Error reading pdf: {}", e),
151 /// }
152 /// }
153 /// # Ok(())
154 /// # }
155 /// ```
156 pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
157 PdfFileLoader {
158 iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
159 }
160 }
161}
162
163impl<'a> PdfFileLoader<'a, Result<PathBuf, PdfLoaderError>> {
164 /// Directly reads the contents of the pdfs within the iterator returned by
165 /// [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir].
166 ///
167 /// # Example
168 /// Read pdfs in directory "tests/data/*.pdf" and return the contents of the documents.
169 ///
170 /// ```no_run
171 /// # use rig_core::loaders::PdfFileLoader;
172 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
173 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read().into_iter();
174 /// for result in content {
175 /// match result {
176 /// Ok(content) => println!("{}", content),
177 /// Err(e) => eprintln!("Error reading pdf: {}", e),
178 /// }
179 /// }
180 /// # Ok(())
181 /// # }
182 /// ```
183 pub fn read(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
184 PdfFileLoader {
185 iterator: Box::new(self.iterator.map(|res| {
186 let doc = res.load()?;
187 Ok(doc
188 .page_iter()
189 .enumerate()
190 .map(|(page_no, _)| {
191 doc.extract_text(&[page_no as u32 + 1])
192 .map_err(PdfLoaderError::PdfError)
193 })
194 .collect::<Result<Vec<String>, PdfLoaderError>>()?
195 .into_iter()
196 .collect::<String>())
197 })),
198 }
199 }
200
201 /// Directly reads the contents of the pdfs within the iterator returned by
202 /// [PdfFileLoader::with_glob] or [PdfFileLoader::with_dir] and returns the path along with
203 /// the content.
204 ///
205 /// # Example
206 /// Read pdfs in directory "tests/data/*.pdf" and return the content and paths of the documents.
207 ///
208 /// ```no_run
209 /// # use rig_core::loaders::PdfFileLoader;
210 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
211 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read_with_path().into_iter();
212 /// for result in content {
213 /// match result {
214 /// Ok((path, content)) => println!("{:?} {}", path, content),
215 /// Err(e) => eprintln!("Error reading pdf: {}", e),
216 /// }
217 /// }
218 /// # Ok(())
219 /// # }
220 /// ```
221 pub fn read_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, String), PdfLoaderError>> {
222 PdfFileLoader {
223 iterator: Box::new(self.iterator.map(|res| {
224 let (path, doc) = res.load_with_path()?;
225 println!(
226 "Loaded {:?} PDF: {:?}",
227 path,
228 doc.page_iter().collect::<Vec<_>>()
229 );
230 let content = doc
231 .page_iter()
232 .enumerate()
233 .map(|(page_no, _)| {
234 doc.extract_text(&[page_no as u32 + 1])
235 .map_err(PdfLoaderError::PdfError)
236 })
237 .collect::<Result<Vec<String>, PdfLoaderError>>()?
238 .into_iter()
239 .collect::<String>();
240
241 Ok((path, content))
242 })),
243 }
244 }
245}
246
247impl<'a> PdfFileLoader<'a, Document> {
248 /// Chunks the pages of a loaded document by page, flattened as a single vector.
249 ///
250 /// # Example
251 /// Load pdfs in directory "tests/data/*.pdf" and chunk all document into it's pages.
252 ///
253 /// ```no_run
254 /// # use rig_core::loaders::PdfFileLoader;
255 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
256 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
257 /// .load()
258 /// .ignore_errors()
259 /// .by_page()
260 /// .into_iter();
261 /// for result in content {
262 /// match result {
263 /// Ok(page) => println!("{}", page),
264 /// Err(e) => eprintln!("Error reading pdf: {}", e),
265 /// }
266 /// }
267 /// # Ok(())
268 /// # }
269 /// ```
270 pub fn by_page(self) -> PdfFileLoader<'a, Result<String, PdfLoaderError>> {
271 PdfFileLoader {
272 iterator: Box::new(self.iterator.flat_map(|doc| {
273 doc.page_iter()
274 .enumerate()
275 .map(|(page_no, _)| {
276 doc.extract_text(&[page_no as u32 + 1])
277 .map_err(PdfLoaderError::PdfError)
278 })
279 .collect::<Vec<_>>()
280 })),
281 }
282 }
283}
284
/// A document's path paired with one `(page index, extraction result)` entry per page.
type ByPage = (PathBuf, Vec<(usize, Result<String, PdfLoaderError>)>);
286impl<'a> PdfFileLoader<'a, (PathBuf, Document)> {
287 /// Chunks the pages of a loaded document by page, processed as a vector of documents by path
288 /// which each document container an inner vector of pages by page number.
289 ///
290 /// # Example
291 /// Read pdfs in directory "tests/data/*.pdf" and chunk all documents by path by it's pages.
292 ///
293 /// ```no_run
294 /// # use rig_core::loaders::PdfFileLoader;
295 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
296 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
297 /// .load_with_path()
298 /// .ignore_errors()
299 /// .by_page()
300 /// .into_iter();
301 ///
302 /// for (path, pages) in content {
303 /// println!("{}", path.display());
304 /// for (pageno, result) in pages {
305 /// match result {
306 /// Ok(content) => println!("Page {}: {}", pageno, content),
307 /// Err(e) => eprintln!("Error reading page: {}", e),
308 /// }
309 /// }
310 /// }
311 /// # Ok(())
312 /// # }
313 /// ```
314 pub fn by_page(self) -> PdfFileLoader<'a, ByPage> {
315 PdfFileLoader {
316 iterator: Box::new(self.iterator.map(|(path, doc)| {
317 (
318 path,
319 doc.page_iter()
320 .enumerate()
321 .map(|(page_no, _)| {
322 (
323 page_no,
324 doc.extract_text(&[page_no as u32 + 1])
325 .map_err(PdfLoaderError::PdfError),
326 )
327 })
328 .collect::<Vec<_>>(),
329 )
330 })),
331 }
332 }
333}
334
335impl<'a> PdfFileLoader<'a, ByPage> {
336 /// Ignores errors in the iterator, returning only successful results. This can be used on any
337 /// [PdfFileLoader] state of iterator whose items are results.
338 ///
339 /// # Example
340 /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
341 ///
342 /// ```no_run
343 /// # use rig_core::loaders::PdfFileLoader;
344 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
345 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?
346 /// .load_with_path()
347 /// .ignore_errors()
348 /// .by_page()
349 /// .ignore_errors();
350 /// for (_path, pages) in content {
351 /// println!("{}", pages.len())
352 /// }
353 /// # Ok(())
354 /// # }
355 /// ```
356 pub fn ignore_errors(self) -> PdfFileLoader<'a, (PathBuf, Vec<(usize, String)>)> {
357 PdfFileLoader {
358 iterator: Box::new(self.iterator.map(|(path, pages)| {
359 let pages = pages
360 .into_iter()
361 .filter_map(|(page_no, res)| res.ok().map(|content| (page_no, content)))
362 .collect::<Vec<_>>();
363 (path, pages)
364 })),
365 }
366 }
367}
368
369impl<'a, T> PdfFileLoader<'a, Result<T, PdfLoaderError>>
370where
371 T: 'a,
372{
373 /// Ignores errors in the iterator, returning only successful results. This can be used on any
374 /// [PdfFileLoader] state of iterator whose items are results.
375 ///
376 /// # Example
377 /// Read files in directory "tests/data/*.pdf" and ignore errors from unreadable files.
378 ///
379 /// ```no_run
380 /// # use rig_core::loaders::PdfFileLoader;
381 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
382 /// let content = PdfFileLoader::with_glob("tests/data/*.pdf")?.read().ignore_errors();
383 /// for content in content {
384 /// println!("{}", content)
385 /// }
386 /// # Ok(())
387 /// # }
388 /// ```
389 pub fn ignore_errors(self) -> PdfFileLoader<'a, T> {
390 PdfFileLoader {
391 iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
392 }
393 }
394}
395
396impl PdfFileLoader<'_, Result<PathBuf, FileLoaderError>> {
397 /// Creates a new [PdfFileLoader] using a glob pattern to match files.
398 ///
399 /// # Example
400 /// Create a [PdfFileLoader] for all `.pdf` files that match the glob "tests/data/*.pdf".
401 ///
402 /// ```no_run
403 /// # use rig_core::loaders::PdfFileLoader;
404 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
405 /// let loader = PdfFileLoader::with_glob("tests/data/*.pdf")?;
406 /// # Ok(())
407 /// # }
408 /// ```
409 pub fn with_glob(
410 pattern: &str,
411 ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
412 let paths = glob(pattern).map_err(FileLoaderError::PatternError)?;
413 Ok(PdfFileLoader {
414 iterator: Box::new(paths.into_iter().map(|path| {
415 path.map_err(FileLoaderError::GlobError)
416 .map_err(PdfLoaderError::FileLoaderError)
417 })),
418 })
419 }
420
421 /// Creates a new [PdfFileLoader] on all files within a directory.
422 ///
423 /// # Example
424 /// Create a [PdfFileLoader] for all files that are in the directory "files".
425 ///
426 /// ```no_run
427 /// # use rig_core::loaders::PdfFileLoader;
428 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
429 /// let loader = PdfFileLoader::with_dir("files")?;
430 /// # Ok(())
431 /// # }
432 /// ```
433 pub fn with_dir(
434 directory: &str,
435 ) -> Result<PdfFileLoader<'_, Result<PathBuf, PdfLoaderError>>, PdfLoaderError> {
436 Ok(PdfFileLoader {
437 iterator: Box::new(
438 fs::read_dir(directory)
439 .map_err(FileLoaderError::IoError)?
440 .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
441 ),
442 })
443 }
444}
445
446impl<'a> PdfFileLoader<'a, Vec<u8>> {
447 /// Ingest a PDF as a byte array.
448 pub fn from_bytes(bytes: Vec<u8>) -> PdfFileLoader<'a, Vec<u8>> {
449 PdfFileLoader {
450 iterator: Box::new(vec![bytes].into_iter()),
451 }
452 }
453
454 /// Ingest multiple byte arrays.
455 pub fn from_bytes_multi(bytes_vec: Vec<Vec<u8>>) -> PdfFileLoader<'a, Vec<u8>> {
456 PdfFileLoader {
457 iterator: Box::new(bytes_vec.into_iter()),
458 }
459 }
460
461 /// Use this once you've created the loader to load the document in.
462 pub fn load(self) -> PdfFileLoader<'a, Result<Document, PdfLoaderError>> {
463 PdfFileLoader {
464 iterator: Box::new(self.iterator.map(|res| res.load())),
465 }
466 }
467
468 /// Use this once you've created the loader to load the document in (and get the path).
469 pub fn load_with_path(self) -> PdfFileLoader<'a, Result<(PathBuf, Document), PdfLoaderError>> {
470 PdfFileLoader {
471 iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
472 }
473 }
474}
475
476// ================================================================
// PdfFileLoader iterator implementations
478// ================================================================
479
/// Owning iterator over the items of a [PdfFileLoader], produced by its [IntoIterator]
/// implementation.
pub struct IntoIter<'a, T> {
    // The boxed iterator taken over from the consumed loader.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
483
484impl<'a, T> IntoIterator for PdfFileLoader<'a, T> {
485 type Item = T;
486 type IntoIter = IntoIter<'a, T>;
487
488 fn into_iter(self) -> Self::IntoIter {
489 IntoIter {
490 iterator: self.iterator,
491 }
492 }
493}
494
495impl<T> Iterator for IntoIter<'_, T> {
496 type Item = T;
497
498 fn next(&mut self) -> Option<Self::Item> {
499 self.iterator.next()
500 }
501}
502
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use super::PdfFileLoader;

    /// Absolute path of a PDF fixture, resolved relative to this crate's manifest directory.
    fn fixture_path(filename: &str) -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../../tests/data")
            .join(filename)
    }

    /// Glob pattern matching every PDF fixture.
    fn fixture_glob() -> String {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../../tests/data/*.pdf")
            .to_string_lossy()
            .into_owned()
    }

    /// Loads every fixture through the glob loader and checks the per-document, per-page text.
    #[test]
    fn test_pdf_loader() {
        let glob = fixture_glob();
        let loader = PdfFileLoader::with_glob(&glob).unwrap();
        let mut actual = loader
            .load_with_path()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        // Dump what was extracted; only visible when the test fails or runs with --nocapture.
        for (path, pages) in &actual {
            for (page_no, content) in pages {
                println!("{path:?} Page {page_no}: {content:?}");
            }
        }

        let mut expected = vec![
            (
                fixture_path("dummy.pdf"),
                vec![(0, "Test\nPDF\nDocument\n".to_string())],
            ),
            (
                fixture_path("file-id-verifiers.pdf"),
                vec![
                    (0, "rig-file-id-page-one-verifier-3a91\n".to_string()),
                    (1, "rig-file-id-page-two-verifier-8c27\n".to_string()),
                    (2, "rig-file-id-page-three-verifier-f54e\n".to_string()),
                ],
            ),
            (
                fixture_path("pages.pdf"),
                vec![
                    (0, "Page\n1\n".to_string()),
                    (1, "Page\n2\n".to_string()),
                    (2, "Page\n3\n".to_string()),
                ],
            ),
        ];

        // Glob order is not what the assertion is about, so compare in sorted order.
        actual.sort();
        expected.sort();

        assert!(!actual.is_empty());
        // `assert_eq!` prints both sides on failure, unlike `assert!(a == b)`.
        assert_eq!(expected, actual);
    }

    /// Loads fixtures from in-memory bytes and checks the flattened per-page text.
    #[test]
    fn test_pdf_loader_bytes() {
        // this should never fail!
        let bytes = std::fs::read(fixture_path("dummy.pdf")).unwrap();

        let loader = PdfFileLoader::from_bytes(bytes);

        let actual = loader
            .load()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        assert_eq!(actual.len(), 1);
        assert_eq!(actual, vec!["Test\nPDF\nDocument\n".to_string()]);

        // this should never fail!
        let bytes = std::fs::read(fixture_path("pages.pdf")).unwrap();

        let loader = PdfFileLoader::from_bytes(bytes);

        let actual = loader
            .load()
            .ignore_errors()
            .by_page()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        assert_eq!(actual.len(), 3);
        assert_eq!(
            actual,
            vec![
                "Page\n1\n".to_string(),
                "Page\n2\n".to_string(),
                "Page\n3\n".to_string(),
            ]
        );
    }
}