rig/loaders/epub/loader.rs
1use crate::loaders::file::FileLoaderError;
2use epub::doc::EpubDoc;
3
4use std::fs::File;
5use std::io::BufReader;
6use std::marker::PhantomData;
7use std::path::PathBuf;
8
9use super::RawTextProcessor;
10use super::errors::EpubLoaderError;
11use super::text_processors::TextProcessor;
12
13// ================================================================
14// Implementing Loadable trait for loading epubs
15// ================================================================
16
17pub(crate) trait Loadable {
18 fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError>;
19 fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
20}
21
22impl Loadable for PathBuf {
23 fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
24 EpubDoc::new(self).map_err(EpubLoaderError::EpubError)
25 }
26
27 fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
28 let contents = EpubDoc::new(&self).map_err(EpubLoaderError::EpubError);
29 Ok((self, contents?))
30 }
31}
32
33impl<T: Loadable> Loadable for Result<T, EpubLoaderError> {
34 fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
35 self.map(|t| t.load())?
36 }
37
38 fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
39 self.map(|t| t.load_with_path())?
40 }
41}
42
43// ================================================================
44// EpubFileLoader definitions and implementations
45// ================================================================
46
47/// [EpubFileLoader] is a utility for loading epub files from the filesystem using glob patterns or
48/// directory paths. It provides methods to read file contents and handle errors gracefully.
49///
50/// # Errors
51///
52/// This module defines a custom error type [EpubLoaderError] which can represent various errors
53/// that might occur during file loading operations, such as any [FileLoaderError] alongside
54/// specific EPUB-related errors.
55///
56/// # Example Usage
57///
58/// ```rust
59/// use rig::loaders::{EpubFileLoader, RawTextProcessor, StripXmlProcessor};
60///
61/// fn main() -> Result<(), Box<dyn std::error::Error>> {
62/// // Create a FileLoader using a glob pattern
63/// let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
64///
65/// // Load epub file contents by chapter, ignoring any errors
66/// let contents = loader
67/// .load_with_path()
68/// .ignore_errors()
69/// .by_chapter()
70/// .ignore_errors();
71///
72/// for (path, chapters) in contents {
73/// println!("{}", path.display());
74/// for (idx, chapter) in chapters {
75/// println!("Chapter {} begins", idx);
76/// println!("{}", chapter);
77/// println!("Chapter {} ends", idx);
78/// }
79/// }
80///
81/// // Create a FileLoader using a glob pattern with stripping xml
82/// let loader = EpubFileLoader::<_, StripXmlProcessor>::with_glob("tests/data/*.epub")?;
83///
84/// // Load epub file contents by chapter, ignoring any errors
85/// let contents = loader
86/// .load_with_path()
87/// .ignore_errors()
88/// .by_chapter()
89/// .ignore_errors();
90///
91/// for (path, chapters) in contents {
92/// println!("{}", path.display());
93/// for (idx, chapter) in chapters {
94/// println!("Chapter {} begins", idx);
95/// println!("{}", chapter);
96/// println!("Chapter {} ends", idx);
97/// }
98/// }
99///
100/// Ok(())
101/// }
102/// ```
103///
104/// [EpubFileLoader] uses strict typing between the iterator methods to ensure that transitions
105/// between different implementations of the loaders and it's methods are handled properly by
106/// the compiler.
107pub struct EpubFileLoader<'a, T, P = RawTextProcessor> {
108 iterator: Box<dyn Iterator<Item = T> + 'a>,
109 _processor: PhantomData<P>,
110}
111
112type EpubLoaded = Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
113
114impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P> {
115 /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
116 /// or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances that can be
117 /// further processed (by chapter, etc).
118 ///
119 /// # Example
120 /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
121 ///
122 /// ```rust
123 /// use rig::loaders::EpubFileLoader;
124 ///
125 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load().into_iter();
126 /// for result in content {
127 /// match result {
128 /// Ok(doc) => println!("{:?}", doc),
129 /// Err(e) => eprintln!("Error reading epub: {}", e),
130 /// }
131 /// }
132 /// ```
133 pub fn load(self) -> EpubFileLoader<'a, Result<EpubDoc<BufReader<File>>, EpubLoaderError>, P> {
134 EpubFileLoader {
135 iterator: Box::new(self.iterator.map(|res| res.load())),
136 _processor: PhantomData,
137 }
138 }
139
140 /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
141 /// or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances with their path
142 /// that can be further processed.
143 ///
144 /// # Example
145 /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
146 ///
147 /// ```rust
148 /// use rig::loaders::EpubFileLoader;
149 ///
150 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub").unwrap().load_with_path().into_iter();
151 /// for result in content {
152 /// match result {
153 /// Ok((path, doc)) => println!("{:?} {:?}", path, doc),
154 /// Err(e) => eprintln!("Error reading epub: {}", e),
155 /// }
156 /// }
157 /// ```
158 pub fn load_with_path(self) -> EpubFileLoader<'a, EpubLoaded, P> {
159 EpubFileLoader {
160 iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
161 _processor: PhantomData,
162 }
163 }
164}
165
166impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P>
167where
168 P: TextProcessor,
169{
170 /// Directly reads the contents of the epub files within the iterator returned by
171 /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir].
172 ///
173 /// # Example
174 /// Read epub files in directory "tests/data/*.epub" and return the contents of the documents.
175 ///
176 /// ```rust
177 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().into_iter();
178 /// for result in content {
179 /// match result {
180 /// Ok(content) => println!("{}", content),
181 /// Err(e) => eprintln!("Error reading epub: {}", e),
182 /// }
183 /// }
184 /// ```
185 pub fn read(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
186 EpubFileLoader {
187 iterator: Box::new(self.iterator.map(|res| {
188 let doc = res.load().map(EpubChapterIterator::<P>::from)?;
189
190 Ok(doc
191 .into_iter()
192 .collect::<Result<Vec<String>, EpubLoaderError>>()?
193 .into_iter()
194 .collect::<String>())
195 })),
196 _processor: PhantomData,
197 }
198 }
199
200 /// Directly reads the contents of the epub files within the iterator returned by
201 /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir] and returns the path along with
202 /// the content.
203 ///
204 /// # Example
205 /// Read epub files in directory "tests/data/*.epub" and return the content and paths of the documents.
206 ///
207 /// ```rust
208 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read_with_path().into_iter();
209 /// for result in content {
210 /// match result {
211 /// Ok((path, content)) => println!("{:?} {}", path, content),
212 /// Err(e) => eprintln!("Error reading epub: {}", e),
213 /// }
214 /// }
215 /// ```
216 pub fn read_with_path(
217 self,
218 ) -> EpubFileLoader<'a, Result<(PathBuf, String), EpubLoaderError>, P> {
219 EpubFileLoader {
220 iterator: Box::new(self.iterator.map(|res| {
221 let (path, doc) = res.load_with_path()?;
222
223 let content = EpubChapterIterator::<P>::from(doc)
224 .collect::<Result<Vec<String>, EpubLoaderError>>()?
225 .into_iter()
226 .collect::<String>();
227 Ok((path, content))
228 })),
229 _processor: PhantomData,
230 }
231 }
232}
233
234impl<'a, P> EpubFileLoader<'a, EpubDoc<BufReader<File>>, P>
235where
236 P: TextProcessor + 'a,
237{
238 /// Chunks the chapters of a loaded document by chapter, flattened as a single vector.
239 ///
240 /// # Example
241 /// Load epub files in directory "tests/data/*.epub" and chunk all document into it's chapters.
242 ///
243 /// ```rust
244 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load().by_chapter().into_iter();
245 /// for result in content {
246 /// println!("{}", result);
247 /// }
248 /// ```
249 pub fn by_chapter(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
250 EpubFileLoader {
251 iterator: Box::new(self.iterator.flat_map(EpubChapterIterator::<P>::from)),
252 _processor: PhantomData,
253 }
254 }
255}
256
257type ByChapter = (PathBuf, Vec<(usize, Result<String, EpubLoaderError>)>);
258impl<'a, P: TextProcessor> EpubFileLoader<'a, (PathBuf, EpubDoc<BufReader<File>>), P> {
259 /// Chunks the chapters of a loaded document by chapter, processed as a vector of documents by path
260 /// which each document container an inner vector of chapters by chapter number.
261 ///
262 /// # Example
263 /// Read epub files in directory "tests/data/*.epub" and chunk all documents by path by it's chapters.
264 ///
265 /// ```rust
266 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
267 /// .load_with_path()
268 /// .ignore_errors()
269 /// .by_chapter()
270 /// .ignore_errors()
271 /// .into_iter();
272 ///
273 /// for result in content {
274 /// println!("{:?}", result);
275 /// }
276 /// ```
277 pub fn by_chapter(self) -> EpubFileLoader<'a, ByChapter, P> {
278 EpubFileLoader {
279 iterator: Box::new(self.iterator.map(|doc| {
280 let (path, doc) = doc;
281
282 (
283 path,
284 EpubChapterIterator::<P>::from(doc)
285 .enumerate()
286 .collect::<Vec<_>>(),
287 )
288 })),
289 _processor: PhantomData,
290 }
291 }
292}
293
294impl<'a, P> EpubFileLoader<'a, ByChapter, P>
295where
296 P: TextProcessor,
297{
298 /// Ignores errors in the iterator, returning only successful results. This can be used on any
299 /// [EpubFileLoader] state of iterator whose items are results.
300 ///
301 /// # Example
302 /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
303 ///
304 /// ```rust
305 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().ignore_errors().into_iter();
306 /// for result in content {
307 /// println!("{}", content)
308 /// }
309 /// ```
310 pub fn ignore_errors(self) -> EpubFileLoader<'a, (PathBuf, Vec<(usize, String)>), P> {
311 EpubFileLoader {
312 iterator: Box::new(self.iterator.map(|(path, chapters)| {
313 let chapters = chapters
314 .into_iter()
315 .filter_map(|(idx, res)| res.ok().map(|content| (idx, content)))
316 .collect::<Vec<_>>();
317 (path, chapters)
318 })),
319 _processor: PhantomData,
320 }
321 }
322}
323
324impl<'a, P, T: 'a> EpubFileLoader<'a, Result<T, EpubLoaderError>, P> {
325 /// Ignores errors in the iterator, returning only successful results. This can be used on any
326 /// [EpubFileLoader] state of iterator whose items are results.
327 ///
328 /// # Example
329 /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
330 ///
331 /// ```rust
332 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().ignore_errors().into_iter();
333 /// for result in content {
334 /// println!("{}", content)
335 /// }
336 /// ```
337 pub fn ignore_errors(self) -> EpubFileLoader<'a, T, P> {
338 EpubFileLoader {
339 iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
340 _processor: PhantomData,
341 }
342 }
343}
344
345impl<P> EpubFileLoader<'_, Result<PathBuf, FileLoaderError>, P> {
346 /// Creates a new [EpubFileLoader] using a glob pattern to match files.
347 ///
348 /// # Example
349 /// Create a [EpubFileLoader] for all `.epub` files that match the glob "tests/data/*.epub".
350 ///
351 /// ```rust
352 /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
353 /// ```
354 pub fn with_glob(
355 pattern: &str,
356 ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
357 let paths = glob::glob(pattern).map_err(FileLoaderError::PatternError)?;
358
359 Ok(EpubFileLoader {
360 iterator: Box::new(paths.into_iter().map(|path| {
361 path.map_err(FileLoaderError::GlobError)
362 .map_err(EpubLoaderError::FileLoaderError)
363 })),
364 _processor: PhantomData,
365 })
366 }
367
368 /// Creates a new [EpubFileLoader] on all files within a directory.
369 ///
370 /// # Example
371 /// Create a [EpubFileLoader] for all files that are in the directory "files".
372 ///
373 /// ```rust
374 /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_dir("files")?;
375 /// ```
376 pub fn with_dir(
377 directory: &str,
378 ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
379 let paths = std::fs::read_dir(directory).map_err(FileLoaderError::IoError)?;
380
381 Ok(EpubFileLoader {
382 iterator: Box::new(
383 paths
384 .into_iter()
385 .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
386 ),
387 _processor: PhantomData,
388 })
389 }
390}
391
392// ================================================================
393// EpubFileLoader iterator implementations
394// ================================================================
/// Owning iterator over the items of an [EpubFileLoader], created by the loader's
/// [IntoIterator] implementation.
pub struct IntoIter<'a, T> {
    /// Boxed iterator taken over from the loader.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
398
399impl<'a, T, P> IntoIterator for EpubFileLoader<'a, T, P> {
400 type Item = T;
401 type IntoIter = IntoIter<'a, T>;
402
403 fn into_iter(self) -> Self::IntoIter {
404 IntoIter {
405 iterator: self.iterator,
406 }
407 }
408}
409
410impl<T> Iterator for IntoIter<'_, T> {
411 type Item = T;
412
413 fn next(&mut self) -> Option<Self::Item> {
414 self.iterator.next()
415 }
416}
417
418// ================================================================
419// EpubChapterIterator definitions and implementations
420// ================================================================
421
422struct EpubChapterIterator<P> {
423 epub: EpubDoc<BufReader<File>>,
424 finished: bool,
425 _processor: PhantomData<P>,
426}
427
428impl<P> From<EpubDoc<BufReader<File>>> for EpubChapterIterator<P> {
429 fn from(epub: EpubDoc<BufReader<File>>) -> Self {
430 Self::new(epub)
431 }
432}
433
434impl<P> EpubChapterIterator<P> {
435 fn new(epub: EpubDoc<BufReader<File>>) -> Self {
436 Self {
437 epub,
438 finished: false,
439 _processor: PhantomData,
440 }
441 }
442}
443
444impl<P> Iterator for EpubChapterIterator<P>
445where
446 P: TextProcessor,
447{
448 type Item = Result<String, EpubLoaderError>;
449
450 fn next(&mut self) -> Option<Self::Item> {
451 if self.finished {
452 return None;
453 }
454
455 // ignore empty chapters if they exist
456 while !self.finished {
457 let chapter = self.epub.get_current_str();
458
459 if !self.epub.go_next() {
460 self.finished = true;
461 }
462
463 if let Some((text, _)) = chapter {
464 return Some(
465 P::process(&text)
466 .map_err(|err| EpubLoaderError::TextProcessorError(Box::new(err))),
467 );
468 }
469 }
470
471 None
472 }
473}
474
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use crate::loaders::epub::RawTextProcessor;

    use super::EpubFileLoader;

    /// Glob matching the EPUB fixture used by every test in this module.
    const EPUB_GLOB: &str = "tests/data/*.epub";

    /// Chapter results are kept as `Result`s when chapter errors are not ignored.
    #[test]
    fn test_epub_loader_with_errors() {
        let documents: Vec<_> = EpubFileLoader::<_, RawTextProcessor>::with_glob(EPUB_GLOB)
            .unwrap()
            .load_with_path()
            .ignore_errors()
            .by_chapter()
            .into_iter()
            .collect();

        assert_eq!(documents.len(), 1);

        let (_, chapters) = &documents[0];
        assert_eq!(chapters.len(), 3);
        assert!(chapters.iter().all(|(_, chapter)| chapter.is_ok()));
    }

    /// Ignoring chapter errors keeps all chapters of the fixture.
    #[test]
    fn test_epub_loader_with_ignoring_errors() {
        let documents: Vec<_> = EpubFileLoader::<_, RawTextProcessor>::with_glob(EPUB_GLOB)
            .unwrap()
            .load_with_path()
            .ignore_errors()
            .by_chapter()
            .ignore_errors()
            .into_iter()
            .collect();

        assert_eq!(documents.len(), 1);

        let (_, chapters) = &documents[0];
        assert_eq!(chapters.len(), 3);
    }

    /// Reading the fixture yields exactly one document body.
    #[test]
    fn test_single_file() {
        let bodies: Vec<_> = EpubFileLoader::<_, RawTextProcessor>::with_glob(EPUB_GLOB)
            .unwrap()
            .read()
            .ignore_errors()
            .into_iter()
            .collect();

        assert_eq!(bodies.len(), 1);
    }

    /// Reading with paths reports the fixture's path next to its content.
    #[test]
    fn test_single_file_with_path() {
        let bodies: Vec<_> = EpubFileLoader::<_, RawTextProcessor>::with_glob(EPUB_GLOB)
            .unwrap()
            .read_with_path()
            .ignore_errors()
            .into_iter()
            .collect();

        assert_eq!(bodies.len(), 1);

        let (path, _) = &bodies[0];
        assert_eq!(path, &PathBuf::from("tests/data/dummy.epub"));
    }
}
548}