rig/loaders/epub/
loader.rs

1use crate::loaders::file::FileLoaderError;
2use epub::doc::EpubDoc;
3
4use std::fs::File;
5use std::io::BufReader;
6use std::marker::PhantomData;
7use std::path::PathBuf;
8
9use super::RawTextProcessor;
10use super::errors::EpubLoaderError;
11use super::text_processors::TextProcessor;
12
13// ================================================================
14// Implementing Loadable trait for loading epubs
15// ================================================================
16
/// Internal abstraction over values that can be opened as an EPUB document.
///
/// It is implemented in this file for [PathBuf] and for `Result<T, EpubLoaderError>` where
/// `T: Loadable`, so a fallible path can be loaded without unwrapping it first.
pub(crate) trait Loadable {
    /// Consumes `self` and opens it as an [EpubDoc].
    fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError>;
    /// Consumes `self` and opens it as an [EpubDoc], returning the originating path with it.
    fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
}
21
22impl Loadable for PathBuf {
23    fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
24        EpubDoc::new(self).map_err(EpubLoaderError::EpubError)
25    }
26
27    fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
28        let contents = EpubDoc::new(&self).map_err(EpubLoaderError::EpubError);
29        Ok((self, contents?))
30    }
31}
32
33impl<T: Loadable> Loadable for Result<T, EpubLoaderError> {
34    fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
35        self.map(|t| t.load())?
36    }
37
38    fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
39        self.map(|t| t.load_with_path())?
40    }
41}
42
43// ================================================================
44// EpubFileLoader definitions and implementations
45// ================================================================
46
/// [EpubFileLoader] is a utility for loading epub files from the filesystem using glob patterns or
///  directory paths. It provides methods to read file contents and handle errors gracefully.
///
/// # Errors
///
/// This module defines a custom error type [EpubLoaderError] which can represent various errors
///  that might occur during file loading operations, such as any [FileLoaderError] alongside
///  specific EPUB-related errors.
///
/// # Example Usage
///
/// ```rust
/// use rig::loaders::{EpubFileLoader, RawTextProcessor, StripXmlProcessor};
///
/// fn main() -> Result<(), Box<dyn std::error::Error>> {
///     // Create a FileLoader using a glob pattern
///     let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
///
///     // Load epub file contents by chapter, ignoring any errors
///     let contents = loader
///         .load_with_path()
///         .ignore_errors()
///         .by_chapter()
///         .ignore_errors();
///
///     for (path, chapters) in contents {
///         println!("{}", path.display());
///         for (idx, chapter) in chapters {
///             println!("Chapter {} begins", idx);
///             println!("{}", chapter);
///             println!("Chapter {} ends", idx);
///         }
///     }
///
///     // Create a FileLoader using a glob pattern with stripping xml
///     let loader = EpubFileLoader::<_, StripXmlProcessor>::with_glob("tests/data/*.epub")?;
///
///     // Load epub file contents by chapter, ignoring any errors
///     let contents = loader
///         .load_with_path()
///         .ignore_errors()
///         .by_chapter()
///         .ignore_errors();
///
///     for (path, chapters) in contents {
///         println!("{}", path.display());
///         for (idx, chapter) in chapters {
///             println!("Chapter {} begins", idx);
///             println!("{}", chapter);
///             println!("Chapter {} ends", idx);
///         }
///     }
///
///     Ok(())
/// }
/// ```
///
/// [EpubFileLoader] uses strict typing between the iterator methods to ensure that transitions
///  between different implementations of the loaders and their methods are handled properly by
///  the compiler.
pub struct EpubFileLoader<'a, T, P = RawTextProcessor> {
    // Boxed iterator yielding the loader's current item type `T`; each adapter method in this
    // file wraps it and returns a loader with a new `T`.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
    // Zero-sized marker recording the text-processor type `P`; no `P` value is stored.
    _processor: PhantomData<P>,
}
111
/// Item type yielded after `load_with_path`: the file's path paired with its opened document,
/// or the error that prevented loading.
type EpubLoaded = Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
113
114impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P> {
115    /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
116    ///  or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances that can be
117    ///  further processed (by chapter, etc).
118    ///
119    /// # Example
120    /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
121    ///
122    /// ```rust
123    /// use rig::loaders::EpubFileLoader;
124    ///
125    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load().into_iter();
126    /// for result in content {
127    ///     match result {
128    ///         Ok(doc) => println!("{:?}", doc),
129    ///         Err(e) => eprintln!("Error reading epub: {}", e),
130    ///     }
131    /// }
132    /// ```
133    pub fn load(self) -> EpubFileLoader<'a, Result<EpubDoc<BufReader<File>>, EpubLoaderError>, P> {
134        EpubFileLoader {
135            iterator: Box::new(self.iterator.map(|res| res.load())),
136            _processor: PhantomData,
137        }
138    }
139
140    /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
141    ///  or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances with their path
142    ///  that can be further processed.
143    ///
144    /// # Example
145    /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
146    ///
147    /// ```rust
148    /// use rig::loaders::EpubFileLoader;
149    ///
150    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap().load_with_path().into_iter();
151    /// for result in content {
152    ///     match result {
153    ///         Ok((path, doc)) => println!("{:?} {:?}", path, doc),
154    ///         Err(e) => eprintln!("Error reading epub: {}", e),
155    ///     }
156    /// }
157    /// ```
158    pub fn load_with_path(self) -> EpubFileLoader<'a, EpubLoaded, P> {
159        EpubFileLoader {
160            iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
161            _processor: PhantomData,
162        }
163    }
164}
165
166impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P>
167where
168    P: TextProcessor,
169{
170    /// Directly reads the contents of the epub files within the iterator returned by
171    ///  [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir].
172    ///
173    /// # Example
174    /// Read epub files in directory "tests/data/*.epub" and return the contents of the documents.
175    ///
176    /// ```rust
177    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().into_iter();
178    /// for result in content {
179    ///     match result {
180    ///         Ok(content) => println!("{}", content),
181    ///         Err(e) => eprintln!("Error reading epub: {}", e),
182    ///     }
183    /// }
184    /// ```
185    pub fn read(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
186        EpubFileLoader {
187            iterator: Box::new(self.iterator.map(|res| {
188                let doc = res.load().map(EpubChapterIterator::<P>::from)?;
189
190                Ok(doc
191                    .into_iter()
192                    .collect::<Result<Vec<String>, EpubLoaderError>>()?
193                    .into_iter()
194                    .collect::<String>())
195            })),
196            _processor: PhantomData,
197        }
198    }
199
200    /// Directly reads the contents of the epub files within the iterator returned by
201    ///  [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir] and returns the path along with
202    ///  the content.
203    ///
204    /// # Example
205    /// Read epub files in directory "tests/data/*.epub" and return the content and paths of the documents.
206    ///
207    /// ```rust
208    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read_with_path().into_iter();
209    /// for result in content {
210    ///     match result {
211    ///         Ok((path, content)) => println!("{:?} {}", path, content),
212    ///         Err(e) => eprintln!("Error reading epub: {}", e),
213    ///     }
214    /// }
215    /// ```
216    pub fn read_with_path(
217        self,
218    ) -> EpubFileLoader<'a, Result<(PathBuf, String), EpubLoaderError>, P> {
219        EpubFileLoader {
220            iterator: Box::new(self.iterator.map(|res| {
221                let (path, doc) = res.load_with_path()?;
222
223                let content = EpubChapterIterator::<P>::from(doc)
224                    .collect::<Result<Vec<String>, EpubLoaderError>>()?
225                    .into_iter()
226                    .collect::<String>();
227                Ok((path, content))
228            })),
229            _processor: PhantomData,
230        }
231    }
232}
233
234impl<'a, P> EpubFileLoader<'a, EpubDoc<BufReader<File>>, P>
235where
236    P: TextProcessor + 'a,
237{
238    /// Chunks the chapters of a loaded document by chapter, flattened as a single vector.
239    ///
240    /// # Example
241    /// Load epub files in directory "tests/data/*.epub" and chunk all document into it's chapters.
242    ///
243    /// ```rust
244    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load().by_chapter().into_iter();
245    /// for result in content {
246    ///     println!("{}", result);
247    /// }
248    /// ```
249    pub fn by_chapter(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
250        EpubFileLoader {
251            iterator: Box::new(self.iterator.flat_map(EpubChapterIterator::<P>::from)),
252            _processor: PhantomData,
253        }
254    }
255}
256
/// Item type yielded by the path-preserving `by_chapter`: a document's path together with its
/// chapters, each tagged with its position in iteration order and carrying either the processed
/// text or the processing error.
type ByChapter = (PathBuf, Vec<(usize, Result<String, EpubLoaderError>)>);
258impl<'a, P: TextProcessor> EpubFileLoader<'a, (PathBuf, EpubDoc<BufReader<File>>), P> {
259    /// Chunks the chapters of a loaded document by chapter, processed as a vector of documents by path
260    ///  which each document container an inner vector of chapters by chapter number.
261    ///
262    /// # Example
263    /// Read epub files in directory "tests/data/*.epub" and chunk all documents by path by it's chapters.
264    ///
265    /// ```rust
266    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
267    ///     .load_with_path()
268    ///     .ignore_errors()
269    ///     .by_chapter()
270    ///     .ignore_errors()
271    ///     .into_iter();
272    ///
273    /// for result in content {
274    ///     println!("{:?}", result);
275    /// }
276    /// ```
277    pub fn by_chapter(self) -> EpubFileLoader<'a, ByChapter, P> {
278        EpubFileLoader {
279            iterator: Box::new(self.iterator.map(|doc| {
280                let (path, doc) = doc;
281
282                (
283                    path,
284                    EpubChapterIterator::<P>::from(doc)
285                        .enumerate()
286                        .collect::<Vec<_>>(),
287                )
288            })),
289            _processor: PhantomData,
290        }
291    }
292}
293
294impl<'a, P> EpubFileLoader<'a, ByChapter, P>
295where
296    P: TextProcessor,
297{
298    /// Ignores errors in the iterator, returning only successful results. This can be used on any
299    ///  [EpubFileLoader] state of iterator whose items are results.
300    ///
301    /// # Example
302    /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
303    ///
304    /// ```rust
305    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().ignore_errors().into_iter();
306    /// for result in content {
307    ///     println!("{}", content)
308    /// }
309    /// ```
310    pub fn ignore_errors(self) -> EpubFileLoader<'a, (PathBuf, Vec<(usize, String)>), P> {
311        EpubFileLoader {
312            iterator: Box::new(self.iterator.map(|(path, chapters)| {
313                let chapters = chapters
314                    .into_iter()
315                    .filter_map(|(idx, res)| res.ok().map(|content| (idx, content)))
316                    .collect::<Vec<_>>();
317                (path, chapters)
318            })),
319            _processor: PhantomData,
320        }
321    }
322}
323
324impl<'a, P, T: 'a> EpubFileLoader<'a, Result<T, EpubLoaderError>, P> {
325    /// Ignores errors in the iterator, returning only successful results. This can be used on any
326    ///  [EpubFileLoader] state of iterator whose items are results.
327    ///
328    /// # Example
329    /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
330    ///
331    /// ```rust
332    /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().ignore_errors().into_iter();
333    /// for result in content {
334    ///     println!("{}", content)
335    /// }
336    /// ```
337    pub fn ignore_errors(self) -> EpubFileLoader<'a, T, P> {
338        EpubFileLoader {
339            iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
340            _processor: PhantomData,
341        }
342    }
343}
344
345impl<P> EpubFileLoader<'_, Result<PathBuf, FileLoaderError>, P> {
346    /// Creates a new [EpubFileLoader] using a glob pattern to match files.
347    ///
348    /// # Example
349    /// Create a [EpubFileLoader] for all `.epub` files that match the glob "tests/data/*.epub".
350    ///
351    /// ```rust
352    /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
353    /// ```
354    pub fn with_glob(
355        pattern: &str,
356    ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
357        let paths = glob::glob(pattern).map_err(FileLoaderError::PatternError)?;
358
359        Ok(EpubFileLoader {
360            iterator: Box::new(paths.into_iter().map(|path| {
361                path.map_err(FileLoaderError::GlobError)
362                    .map_err(EpubLoaderError::FileLoaderError)
363            })),
364            _processor: PhantomData,
365        })
366    }
367
368    /// Creates a new [EpubFileLoader] on all files within a directory.
369    ///
370    /// # Example
371    /// Create a [EpubFileLoader] for all files that are in the directory "files".
372    ///
373    /// ```rust
374    /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_dir("files")?;
375    /// ```
376    pub fn with_dir(
377        directory: &str,
378    ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
379        let paths = std::fs::read_dir(directory).map_err(FileLoaderError::IoError)?;
380
381        Ok(EpubFileLoader {
382            iterator: Box::new(
383                paths
384                    .into_iter()
385                    .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
386            ),
387            _processor: PhantomData,
388        })
389    }
390}
391
392// ================================================================
393// EpubFileLoader iterator implementations
394// ================================================================
/// Owning iterator returned by `EpubFileLoader::into_iter`; it yields the loader's items by
/// delegating to the boxed iterator it took over from the loader.
pub struct IntoIter<'a, T> {
    // The loader's underlying boxed iterator, moved in unchanged.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
398
399impl<'a, T, P> IntoIterator for EpubFileLoader<'a, T, P> {
400    type Item = T;
401    type IntoIter = IntoIter<'a, T>;
402
403    fn into_iter(self) -> Self::IntoIter {
404        IntoIter {
405            iterator: self.iterator,
406        }
407    }
408}
409
410impl<T> Iterator for IntoIter<'_, T> {
411    type Item = T;
412
413    fn next(&mut self) -> Option<Self::Item> {
414        self.iterator.next()
415    }
416}
417
418// ================================================================
419// EpubChapterIterator definitions and implementations
420// ================================================================
421
/// Iterator over the chapters of one EPUB document, running each chapter's text through the
/// text processor `P`.
struct EpubChapterIterator<P> {
    // The opened document; its internal cursor is advanced as chapters are read.
    epub: EpubDoc<BufReader<File>>,
    // Set once the document reports that it cannot advance any further.
    finished: bool,
    // Zero-sized marker selecting the text processor; no `P` value is stored.
    _processor: PhantomData<P>,
}
427
428impl<P> From<EpubDoc<BufReader<File>>> for EpubChapterIterator<P> {
429    fn from(epub: EpubDoc<BufReader<File>>) -> Self {
430        Self::new(epub)
431    }
432}
433
impl<P> EpubChapterIterator<P> {
    /// Wraps an opened document in a chapter iterator that has not yet yielded anything.
    fn new(epub: EpubDoc<BufReader<File>>) -> Self {
        Self {
            epub,
            // Nothing has been read yet, so iteration is not finished.
            finished: false,
            _processor: PhantomData,
        }
    }
}
443
444impl<P> Iterator for EpubChapterIterator<P>
445where
446    P: TextProcessor,
447{
448    type Item = Result<String, EpubLoaderError>;
449
450    fn next(&mut self) -> Option<Self::Item> {
451        if self.finished {
452            return None;
453        }
454
455        // ignore empty chapters if they exist
456        while !self.finished {
457            let chapter = self.epub.get_current_str();
458
459            if !self.epub.go_next() {
460                self.finished = true;
461            }
462
463            if let Some((text, _)) = chapter {
464                return Some(
465                    P::process(&text)
466                        .map_err(|err| EpubLoaderError::TextProcessorError(Box::new(err))),
467                );
468            }
469        }
470
471        None
472    }
473}
474
// Integration-style tests that run the loader against the on-disk fixture matched by
// "tests/data/*.epub". The assertions expect that glob to match exactly one file,
// "tests/data/dummy.epub", which yields three chapters.
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use crate::loaders::epub::RawTextProcessor;

    use super::EpubFileLoader;

    // Chapters are kept as `Result`s (no second `ignore_errors`), and all of them must be `Ok`.
    #[test]
    fn test_epub_loader_with_errors() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();
        let actual = loader
            .load_with_path()
            .ignore_errors()
            .by_chapter()
            .into_iter()
            .collect::<Vec<_>>();

        assert_eq!(actual.len(), 1);

        let (_, chapters) = &actual[0];
        assert_eq!(chapters.len(), 3);

        for chapter in chapters {
            assert!(chapter.1.is_ok());
        }
    }

    // Same pipeline with chapter-level errors filtered out; the chapter count must not change.
    #[test]
    fn test_epub_loader_with_ignoring_errors() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();
        let actual = loader
            .load_with_path()
            .ignore_errors()
            .by_chapter()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        assert_eq!(actual.len(), 1);

        let (_, chapters) = &actual[0];
        assert_eq!(chapters.len(), 3);
    }

    // `read` produces one concatenated string per file.
    #[test]
    fn test_single_file() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();

        let actual = loader
            .read()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        assert_eq!(actual.len(), 1);
    }

    // `read_with_path` additionally reports the path the content came from.
    #[test]
    fn test_single_file_with_path() {
        let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap();

        let actual = loader
            .read_with_path()
            .ignore_errors()
            .into_iter()
            .collect::<Vec<_>>();

        assert_eq!(actual.len(), 1);

        let (path, _) = &actual[0];
        assert_eq!(path, &PathBuf::from("tests/data/dummy.epub"));
    }
}