Skip to main content

rig_core/loaders/epub/
loader.rs

1use crate::loaders::file::FileLoaderError;
2use epub::doc::EpubDoc;
3
4use std::fs::File;
5use std::io::BufReader;
6use std::marker::PhantomData;
7use std::path::PathBuf;
8
9use super::RawTextProcessor;
10use super::errors::EpubLoaderError;
11use super::text_processors::TextProcessor;
12
13// ================================================================
14// Implementing Loadable trait for loading epubs
15// ================================================================
16
17pub(crate) trait Loadable {
18    fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError>;
19    fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
20}
21
22impl Loadable for PathBuf {
23    fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
24        EpubDoc::new(self).map_err(EpubLoaderError::EpubError)
25    }
26
27    fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
28        let contents = EpubDoc::new(&self).map_err(EpubLoaderError::EpubError);
29        Ok((self, contents?))
30    }
31}
32
33impl<T: Loadable> Loadable for Result<T, EpubLoaderError> {
34    fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
35        self.map(|t| t.load())?
36    }
37
38    fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
39        self.map(|t| t.load_with_path())?
40    }
41}
42
43// ================================================================
44// EpubFileLoader definitions and implementations
45// ================================================================
46
47/// [EpubFileLoader] is a utility for loading epub files from the filesystem using glob patterns or
48///  directory paths. It provides methods to read file contents and handle errors gracefully.
49///
50/// # Errors
51///
52/// This module defines a custom error type [EpubLoaderError] which can represent various errors
53///  that might occur during file loading operations, such as any [FileLoaderError] alongside
54///  specific EPUB-related errors.
55///
56/// # Example Usage
57///
58/// ```no_run
59/// use rig_core::loaders::{EpubFileLoader, RawTextProcessor, StripXmlProcessor};
60///
61/// fn main() -> Result<(), Box<dyn std::error::Error>> {
62///     // Create a FileLoader using a glob pattern
63///     let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
64///
65///     // Load epub file contents by chapter, ignoring any errors
66///     let contents = loader
67///         .load_with_path()
68///         .ignore_errors()
69///         .by_chapter()
70///         .ignore_errors();
71///
72///     for (path, chapters) in contents {
73///         println!("{}", path.display());
74///         for (idx, chapter) in chapters {
75///             println!("Chapter {} begins", idx);
76///             println!("{}", chapter);
77///             println!("Chapter {} ends", idx);
78///         }
79///     }
80///
81///     // Create a FileLoader using a glob pattern with stripping xml
82///     let loader = EpubFileLoader::<_, StripXmlProcessor>::with_glob("tests/data/*.epub")?;
83///
84///     // Load epub file contents by chapter, ignoring any errors
85///     let contents = loader
86///         .load_with_path()
87///         .ignore_errors()
88///         .by_chapter()
89///         .ignore_errors();
90///
91///     for (path, chapters) in contents {
92///         println!("{}", path.display());
93///         for (idx, chapter) in chapters {
94///             println!("Chapter {} begins", idx);
95///             println!("{}", chapter);
96///             println!("Chapter {} ends", idx);
97///         }
98///     }
99///
100///     Ok(())
101/// }
102/// ```
103///
104/// [EpubFileLoader] uses strict typing between the iterator methods to ensure that transitions
105///  between different implementations of the loaders and it's methods are handled properly by
106///  the compiler.
107pub struct EpubFileLoader<'a, T, P = RawTextProcessor> {
108    iterator: Box<dyn Iterator<Item = T> + 'a>,
109    _processor: PhantomData<P>,
110}
111
112type EpubLoaded = Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
113
114impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P> {
115    /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
116    ///  or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances that can be
117    ///  further processed (by chapter, etc).
118    ///
119    /// # Example
120    /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
121    ///
122    /// ```no_run
123    /// use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
124    ///
125    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
126    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load().into_iter();
127    /// for result in content {
128    ///     match result {
129    ///         Ok(doc) => println!("{:?}", doc),
130    ///         Err(e) => eprintln!("Error reading epub: {}", e),
131    ///     }
132    /// }
133    /// # Ok(())
134    /// # }
135    /// ```
136    pub fn load(self) -> EpubFileLoader<'a, Result<EpubDoc<BufReader<File>>, EpubLoaderError>, P> {
137        EpubFileLoader {
138            iterator: Box::new(self.iterator.map(|res| res.load())),
139            _processor: PhantomData,
140        }
141    }
142
143    /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
144    ///  or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances with their path
145    ///  that can be further processed.
146    ///
147    /// # Example
148    /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
149    ///
150    /// ```no_run
151    /// use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
152    ///
153    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
154    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load_with_path().into_iter();
155    /// for result in content {
156    ///     match result {
157    ///         Ok((path, doc)) => println!("{:?} {:?}", path, doc),
158    ///         Err(e) => eprintln!("Error reading epub: {}", e),
159    ///     }
160    /// }
161    /// # Ok(())
162    /// # }
163    /// ```
164    pub fn load_with_path(self) -> EpubFileLoader<'a, EpubLoaded, P> {
165        EpubFileLoader {
166            iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
167            _processor: PhantomData,
168        }
169    }
170}
171
172impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P>
173where
174    P: TextProcessor,
175{
176    /// Directly reads the contents of the epub files within the iterator returned by
177    ///  [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir].
178    ///
179    /// # Example
180    /// Read epub files in directory "tests/data/*.epub" and return the contents of the documents.
181    ///
182    /// ```no_run
183    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
184    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
185    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().into_iter();
186    /// for result in content {
187    ///     match result {
188    ///         Ok(content) => println!("{}", content),
189    ///         Err(e) => eprintln!("Error reading epub: {}", e),
190    ///     }
191    /// }
192    /// # Ok(())
193    /// # }
194    /// ```
195    pub fn read(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
196        EpubFileLoader {
197            iterator: Box::new(self.iterator.map(|res| {
198                let doc = res.load().map(EpubChapterIterator::<P>::from)?;
199
200                Ok(doc
201                    .into_iter()
202                    .collect::<Result<Vec<String>, EpubLoaderError>>()?
203                    .into_iter()
204                    .collect::<String>())
205            })),
206            _processor: PhantomData,
207        }
208    }
209
210    /// Directly reads the contents of the epub files within the iterator returned by
211    ///  [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir] and returns the path along with
212    ///  the content.
213    ///
214    /// # Example
215    /// Read epub files in directory "tests/data/*.epub" and return the content and paths of the documents.
216    ///
217    /// ```no_run
218    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
219    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
220    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read_with_path().into_iter();
221    /// for result in content {
222    ///     match result {
223    ///         Ok((path, content)) => println!("{:?} {}", path, content),
224    ///         Err(e) => eprintln!("Error reading epub: {}", e),
225    ///     }
226    /// }
227    /// # Ok(())
228    /// # }
229    /// ```
230    pub fn read_with_path(
231        self,
232    ) -> EpubFileLoader<'a, Result<(PathBuf, String), EpubLoaderError>, P> {
233        EpubFileLoader {
234            iterator: Box::new(self.iterator.map(|res| {
235                let (path, doc) = res.load_with_path()?;
236
237                let content = EpubChapterIterator::<P>::from(doc)
238                    .collect::<Result<Vec<String>, EpubLoaderError>>()?
239                    .into_iter()
240                    .collect::<String>();
241                Ok((path, content))
242            })),
243            _processor: PhantomData,
244        }
245    }
246}
247
248impl<'a, P> EpubFileLoader<'a, EpubDoc<BufReader<File>>, P>
249where
250    P: TextProcessor + 'a,
251{
252    /// Chunks the chapters of a loaded document by chapter, flattened as a single vector.
253    ///
254    /// # Example
255    /// Load epub files in directory "tests/data/*.epub" and chunk all document into it's chapters.
256    ///
257    /// ```no_run
258    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
259    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
260    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
261    ///     .load()
262    ///     .ignore_errors()
263    ///     .by_chapter()
264    ///     .into_iter();
265    /// for result in content {
266    ///     match result {
267    ///         Ok(chapter) => println!("{}", chapter),
268    ///         Err(e) => eprintln!("Error reading chapter: {}", e),
269    ///     }
270    /// }
271    /// # Ok(())
272    /// # }
273    /// ```
274    pub fn by_chapter(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
275        EpubFileLoader {
276            iterator: Box::new(self.iterator.flat_map(EpubChapterIterator::<P>::from)),
277            _processor: PhantomData,
278        }
279    }
280}
281
282type ByChapter = (PathBuf, Vec<(usize, Result<String, EpubLoaderError>)>);
283impl<'a, P: TextProcessor> EpubFileLoader<'a, (PathBuf, EpubDoc<BufReader<File>>), P> {
284    /// Chunks the chapters of a loaded document by chapter, processed as a vector of documents by path
285    ///  which each document container an inner vector of chapters by chapter number.
286    ///
287    /// # Example
288    /// Read epub files in directory "tests/data/*.epub" and chunk all documents by path by it's chapters.
289    ///
290    /// ```no_run
291    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
292    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
293    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
294    ///     .load_with_path()
295    ///     .ignore_errors()
296    ///     .by_chapter()
297    ///     .ignore_errors()
298    ///     .into_iter();
299    ///
300    /// for result in content {
301    ///     println!("{:?}", result);
302    /// }
303    /// # Ok(())
304    /// # }
305    /// ```
306    pub fn by_chapter(self) -> EpubFileLoader<'a, ByChapter, P> {
307        EpubFileLoader {
308            iterator: Box::new(self.iterator.map(|doc| {
309                let (path, doc) = doc;
310
311                (
312                    path,
313                    EpubChapterIterator::<P>::from(doc)
314                        .enumerate()
315                        .collect::<Vec<_>>(),
316                )
317            })),
318            _processor: PhantomData,
319        }
320    }
321}
322
323impl<'a, P> EpubFileLoader<'a, ByChapter, P>
324where
325    P: TextProcessor,
326{
327    /// Ignores errors in the iterator, returning only successful results. This can be used on any
328    ///  [EpubFileLoader] state of iterator whose items are results.
329    ///
330    /// # Example
331    /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
332    ///
333    /// ```no_run
334    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
335    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
336    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
337    ///     .load_with_path()
338    ///     .ignore_errors()
339    ///     .by_chapter()
340    ///     .ignore_errors();
341    /// for (_path, chapters) in content {
342    ///     println!("{}", chapters.len())
343    /// }
344    /// # Ok(())
345    /// # }
346    /// ```
347    pub fn ignore_errors(self) -> EpubFileLoader<'a, (PathBuf, Vec<(usize, String)>), P> {
348        EpubFileLoader {
349            iterator: Box::new(self.iterator.map(|(path, chapters)| {
350                let chapters = chapters
351                    .into_iter()
352                    .filter_map(|(idx, res)| res.ok().map(|content| (idx, content)))
353                    .collect::<Vec<_>>();
354                (path, chapters)
355            })),
356            _processor: PhantomData,
357        }
358    }
359}
360
361impl<'a, P, T: 'a> EpubFileLoader<'a, Result<T, EpubLoaderError>, P> {
362    /// Ignores errors in the iterator, returning only successful results. This can be used on any
363    ///  [EpubFileLoader] state of iterator whose items are results.
364    ///
365    /// # Example
366    /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
367    ///
368    /// ```no_run
369    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
370    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
371    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().ignore_errors();
372    /// for content in content {
373    ///     println!("{}", content)
374    /// }
375    /// # Ok(())
376    /// # }
377    /// ```
378    pub fn ignore_errors(self) -> EpubFileLoader<'a, T, P> {
379        EpubFileLoader {
380            iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
381            _processor: PhantomData,
382        }
383    }
384}
385
386impl<P> EpubFileLoader<'_, Result<PathBuf, FileLoaderError>, P> {
387    /// Creates a new [EpubFileLoader] using a glob pattern to match files.
388    ///
389    /// # Example
390    /// Create a [EpubFileLoader] for all `.epub` files that match the glob "tests/data/*.epub".
391    ///
392    /// ```no_run
393    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
394    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
395    /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
396    /// # Ok(())
397    /// # }
398    /// ```
399    pub fn with_glob(
400        pattern: &str,
401    ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
402        let paths = glob::glob(pattern).map_err(FileLoaderError::PatternError)?;
403
404        Ok(EpubFileLoader {
405            iterator: Box::new(paths.into_iter().map(|path| {
406                path.map_err(FileLoaderError::GlobError)
407                    .map_err(EpubLoaderError::FileLoaderError)
408            })),
409            _processor: PhantomData,
410        })
411    }
412
413    /// Creates a new [EpubFileLoader] on all files within a directory.
414    ///
415    /// # Example
416    /// Create a [EpubFileLoader] for all files that are in the directory "files".
417    ///
418    /// ```no_run
419    /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
420    /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
421    /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_dir("files")?;
422    /// # Ok(())
423    /// # }
424    /// ```
425    pub fn with_dir(
426        directory: &str,
427    ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
428        let paths = std::fs::read_dir(directory).map_err(FileLoaderError::IoError)?;
429
430        Ok(EpubFileLoader {
431            iterator: Box::new(
432                paths
433                    .into_iter()
434                    .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
435            ),
436            _processor: PhantomData,
437        })
438    }
439}
440
441// ================================================================
442// EpubFileLoader iterator implementations
443// ================================================================
/// Owning iterator over the items of an [EpubFileLoader], obtained through its
///  [IntoIterator] implementation.
pub struct IntoIter<'a, T> {
    // The loader's boxed pipeline, taken over unchanged.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
447
448impl<'a, T, P> IntoIterator for EpubFileLoader<'a, T, P> {
449    type Item = T;
450    type IntoIter = IntoIter<'a, T>;
451
452    fn into_iter(self) -> Self::IntoIter {
453        IntoIter {
454            iterator: self.iterator,
455        }
456    }
457}
458
459impl<T> Iterator for IntoIter<'_, T> {
460    type Item = T;
461
462    fn next(&mut self) -> Option<Self::Item> {
463        self.iterator.next()
464    }
465}
466
467// ================================================================
468// EpubChapterIterator definitions and implementations
469// ================================================================
470
471struct EpubChapterIterator<P> {
472    epub: EpubDoc<BufReader<File>>,
473    finished: bool,
474    _processor: PhantomData<P>,
475}
476
477impl<P> From<EpubDoc<BufReader<File>>> for EpubChapterIterator<P> {
478    fn from(epub: EpubDoc<BufReader<File>>) -> Self {
479        Self::new(epub)
480    }
481}
482
483impl<P> EpubChapterIterator<P> {
484    fn new(epub: EpubDoc<BufReader<File>>) -> Self {
485        Self {
486            epub,
487            finished: false,
488            _processor: PhantomData,
489        }
490    }
491}
492
493impl<P> Iterator for EpubChapterIterator<P>
494where
495    P: TextProcessor,
496{
497    type Item = Result<String, EpubLoaderError>;
498
499    fn next(&mut self) -> Option<Self::Item> {
500        if self.finished {
501            return None;
502        }
503
504        // ignore empty chapters if they exist
505        while !self.finished {
506            let chapter = self.epub.get_current_str();
507
508            if !self.epub.go_next() {
509                self.finished = true;
510            }
511
512            if let Some((text, _)) = chapter {
513                return Some(
514                    P::process(&text)
515                        .map_err(|err| EpubLoaderError::TextProcessorError(Box::new(err))),
516                );
517            }
518        }
519
520        None
521    }
522}
523
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use crate::loaders::epub::RawTextProcessor;

    use super::EpubFileLoader;

    /// Chapters stay wrapped in `Result` when only load errors are ignored; every chapter of
    /// the fixture is expected to process successfully.
    #[test]
    fn test_epub_loader_with_errors() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();
        let documents: Vec<_> = loader
            .load_with_path()
            .ignore_errors()
            .by_chapter()
            .into_iter()
            .collect();

        assert_eq!(documents.len(), 1);

        let chapters = &documents[0].1;
        assert_eq!(chapters.len(), 3);
        assert!(chapters.iter().all(|(_, outcome)| outcome.is_ok()));
    }

    /// Ignoring chapter errors as well leaves all three chapters of the fixture in place.
    #[test]
    fn test_epub_loader_with_ignoring_errors() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();
        let documents: Vec<_> = loader
            .load_with_path()
            .ignore_errors()
            .by_chapter()
            .ignore_errors()
            .into_iter()
            .collect();

        assert_eq!(documents.len(), 1);

        let chapters = &documents[0].1;
        assert_eq!(chapters.len(), 3);
    }

    /// `read` produces exactly one concatenated content item for the single fixture file.
    #[test]
    fn test_single_file() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();

        let contents: Vec<_> = loader.read().ignore_errors().into_iter().collect();

        assert_eq!(contents.len(), 1);
    }

    /// `read_with_path` reports the fixture's path next to its content.
    #[test]
    fn test_single_file_with_path() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();

        let contents: Vec<_> = loader
            .read_with_path()
            .ignore_errors()
            .into_iter()
            .collect();

        assert_eq!(contents.len(), 1);

        let path = &contents[0].0;
        assert_eq!(path, &PathBuf::from("tests/data/dummy.epub"));
    }
}
593
594        let (path, _) = &actual[0];
595        assert_eq!(path, &PathBuf::from("tests/data/dummy.epub"));
596    }
597}