rig_core/loaders/epub/loader.rs
1use crate::loaders::file::FileLoaderError;
2use epub::doc::EpubDoc;
3
4use std::fs::File;
5use std::io::BufReader;
6use std::marker::PhantomData;
7use std::path::PathBuf;
8
9use super::RawTextProcessor;
10use super::errors::EpubLoaderError;
11use super::text_processors::TextProcessor;
12
13// ================================================================
14// Implementing Loadable trait for loading epubs
15// ================================================================
16
17pub(crate) trait Loadable {
18 fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError>;
19 fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
20}
21
22impl Loadable for PathBuf {
23 fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
24 EpubDoc::new(self).map_err(EpubLoaderError::EpubError)
25 }
26
27 fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
28 let contents = EpubDoc::new(&self).map_err(EpubLoaderError::EpubError);
29 Ok((self, contents?))
30 }
31}
32
33impl<T: Loadable> Loadable for Result<T, EpubLoaderError> {
34 fn load(self) -> Result<EpubDoc<BufReader<File>>, EpubLoaderError> {
35 self.map(|t| t.load())?
36 }
37
38 fn load_with_path(self) -> Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError> {
39 self.map(|t| t.load_with_path())?
40 }
41}
42
43// ================================================================
44// EpubFileLoader definitions and implementations
45// ================================================================
46
47/// [EpubFileLoader] is a utility for loading epub files from the filesystem using glob patterns or
48/// directory paths. It provides methods to read file contents and handle errors gracefully.
49///
50/// # Errors
51///
52/// This module defines a custom error type [EpubLoaderError] which can represent various errors
53/// that might occur during file loading operations, such as any [FileLoaderError] alongside
54/// specific EPUB-related errors.
55///
56/// # Example Usage
57///
58/// ```no_run
59/// use rig_core::loaders::{EpubFileLoader, RawTextProcessor, StripXmlProcessor};
60///
61/// fn main() -> Result<(), Box<dyn std::error::Error>> {
62/// // Create a FileLoader using a glob pattern
63/// let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
64///
65/// // Load epub file contents by chapter, ignoring any errors
66/// let contents = loader
67/// .load_with_path()
68/// .ignore_errors()
69/// .by_chapter()
70/// .ignore_errors();
71///
72/// for (path, chapters) in contents {
73/// println!("{}", path.display());
74/// for (idx, chapter) in chapters {
75/// println!("Chapter {} begins", idx);
76/// println!("{}", chapter);
77/// println!("Chapter {} ends", idx);
78/// }
79/// }
80///
81/// // Create a FileLoader using a glob pattern with stripping xml
82/// let loader = EpubFileLoader::<_, StripXmlProcessor>::with_glob("tests/data/*.epub")?;
83///
84/// // Load epub file contents by chapter, ignoring any errors
85/// let contents = loader
86/// .load_with_path()
87/// .ignore_errors()
88/// .by_chapter()
89/// .ignore_errors();
90///
91/// for (path, chapters) in contents {
92/// println!("{}", path.display());
93/// for (idx, chapter) in chapters {
94/// println!("Chapter {} begins", idx);
95/// println!("{}", chapter);
96/// println!("Chapter {} ends", idx);
97/// }
98/// }
99///
100/// Ok(())
101/// }
102/// ```
103///
104/// [EpubFileLoader] uses strict typing between the iterator methods to ensure that transitions
105/// between different implementations of the loaders and it's methods are handled properly by
106/// the compiler.
107pub struct EpubFileLoader<'a, T, P = RawTextProcessor> {
108 iterator: Box<dyn Iterator<Item = T> + 'a>,
109 _processor: PhantomData<P>,
110}
111
112type EpubLoaded = Result<(PathBuf, EpubDoc<BufReader<File>>), EpubLoaderError>;
113
114impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P> {
115 /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
116 /// or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances that can be
117 /// further processed (by chapter, etc).
118 ///
119 /// # Example
120 /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
121 ///
122 /// ```no_run
123 /// use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
124 ///
125 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
126 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load().into_iter();
127 /// for result in content {
128 /// match result {
129 /// Ok(doc) => println!("{:?}", doc),
130 /// Err(e) => eprintln!("Error reading epub: {}", e),
131 /// }
132 /// }
133 /// # Ok(())
134 /// # }
135 /// ```
136 pub fn load(self) -> EpubFileLoader<'a, Result<EpubDoc<BufReader<File>>, EpubLoaderError>, P> {
137 EpubFileLoader {
138 iterator: Box::new(self.iterator.map(|res| res.load())),
139 _processor: PhantomData,
140 }
141 }
142
143 /// Loads the contents of the epub files within the iterator returned by [EpubFileLoader::with_glob]
144 /// or [EpubFileLoader::with_dir]. Loaded EPUB documents are raw EPUB instances with their path
145 /// that can be further processed.
146 ///
147 /// # Example
148 /// Load epub files in directory "tests/data/*.epub" and return the loaded documents
149 ///
150 /// ```no_run
151 /// use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
152 ///
153 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
154 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.load_with_path().into_iter();
155 /// for result in content {
156 /// match result {
157 /// Ok((path, doc)) => println!("{:?} {:?}", path, doc),
158 /// Err(e) => eprintln!("Error reading epub: {}", e),
159 /// }
160 /// }
161 /// # Ok(())
162 /// # }
163 /// ```
164 pub fn load_with_path(self) -> EpubFileLoader<'a, EpubLoaded, P> {
165 EpubFileLoader {
166 iterator: Box::new(self.iterator.map(|res| res.load_with_path())),
167 _processor: PhantomData,
168 }
169 }
170}
171
172impl<'a, P> EpubFileLoader<'a, Result<PathBuf, EpubLoaderError>, P>
173where
174 P: TextProcessor,
175{
176 /// Directly reads the contents of the epub files within the iterator returned by
177 /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir].
178 ///
179 /// # Example
180 /// Read epub files in directory "tests/data/*.epub" and return the contents of the documents.
181 ///
182 /// ```no_run
183 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
184 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
185 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().into_iter();
186 /// for result in content {
187 /// match result {
188 /// Ok(content) => println!("{}", content),
189 /// Err(e) => eprintln!("Error reading epub: {}", e),
190 /// }
191 /// }
192 /// # Ok(())
193 /// # }
194 /// ```
195 pub fn read(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
196 EpubFileLoader {
197 iterator: Box::new(self.iterator.map(|res| {
198 let doc = res.load().map(EpubChapterIterator::<P>::from)?;
199
200 Ok(doc
201 .into_iter()
202 .collect::<Result<Vec<String>, EpubLoaderError>>()?
203 .into_iter()
204 .collect::<String>())
205 })),
206 _processor: PhantomData,
207 }
208 }
209
210 /// Directly reads the contents of the epub files within the iterator returned by
211 /// [EpubFileLoader::with_glob] or [EpubFileLoader::with_dir] and returns the path along with
212 /// the content.
213 ///
214 /// # Example
215 /// Read epub files in directory "tests/data/*.epub" and return the content and paths of the documents.
216 ///
217 /// ```no_run
218 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
219 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
220 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read_with_path().into_iter();
221 /// for result in content {
222 /// match result {
223 /// Ok((path, content)) => println!("{:?} {}", path, content),
224 /// Err(e) => eprintln!("Error reading epub: {}", e),
225 /// }
226 /// }
227 /// # Ok(())
228 /// # }
229 /// ```
230 pub fn read_with_path(
231 self,
232 ) -> EpubFileLoader<'a, Result<(PathBuf, String), EpubLoaderError>, P> {
233 EpubFileLoader {
234 iterator: Box::new(self.iterator.map(|res| {
235 let (path, doc) = res.load_with_path()?;
236
237 let content = EpubChapterIterator::<P>::from(doc)
238 .collect::<Result<Vec<String>, EpubLoaderError>>()?
239 .into_iter()
240 .collect::<String>();
241 Ok((path, content))
242 })),
243 _processor: PhantomData,
244 }
245 }
246}
247
248impl<'a, P> EpubFileLoader<'a, EpubDoc<BufReader<File>>, P>
249where
250 P: TextProcessor + 'a,
251{
252 /// Chunks the chapters of a loaded document by chapter, flattened as a single vector.
253 ///
254 /// # Example
255 /// Load epub files in directory "tests/data/*.epub" and chunk all document into it's chapters.
256 ///
257 /// ```no_run
258 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
259 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
260 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
261 /// .load()
262 /// .ignore_errors()
263 /// .by_chapter()
264 /// .into_iter();
265 /// for result in content {
266 /// match result {
267 /// Ok(chapter) => println!("{}", chapter),
268 /// Err(e) => eprintln!("Error reading chapter: {}", e),
269 /// }
270 /// }
271 /// # Ok(())
272 /// # }
273 /// ```
274 pub fn by_chapter(self) -> EpubFileLoader<'a, Result<String, EpubLoaderError>, P> {
275 EpubFileLoader {
276 iterator: Box::new(self.iterator.flat_map(EpubChapterIterator::<P>::from)),
277 _processor: PhantomData,
278 }
279 }
280}
281
282type ByChapter = (PathBuf, Vec<(usize, Result<String, EpubLoaderError>)>);
283impl<'a, P: TextProcessor> EpubFileLoader<'a, (PathBuf, EpubDoc<BufReader<File>>), P> {
284 /// Chunks the chapters of a loaded document by chapter, processed as a vector of documents by path
285 /// which each document container an inner vector of chapters by chapter number.
286 ///
287 /// # Example
288 /// Read epub files in directory "tests/data/*.epub" and chunk all documents by path by it's chapters.
289 ///
290 /// ```no_run
291 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
292 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
293 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
294 /// .load_with_path()
295 /// .ignore_errors()
296 /// .by_chapter()
297 /// .ignore_errors()
298 /// .into_iter();
299 ///
300 /// for result in content {
301 /// println!("{:?}", result);
302 /// }
303 /// # Ok(())
304 /// # }
305 /// ```
306 pub fn by_chapter(self) -> EpubFileLoader<'a, ByChapter, P> {
307 EpubFileLoader {
308 iterator: Box::new(self.iterator.map(|doc| {
309 let (path, doc) = doc;
310
311 (
312 path,
313 EpubChapterIterator::<P>::from(doc)
314 .enumerate()
315 .collect::<Vec<_>>(),
316 )
317 })),
318 _processor: PhantomData,
319 }
320 }
321}
322
323impl<'a, P> EpubFileLoader<'a, ByChapter, P>
324where
325 P: TextProcessor,
326{
327 /// Ignores errors in the iterator, returning only successful results. This can be used on any
328 /// [EpubFileLoader] state of iterator whose items are results.
329 ///
330 /// # Example
331 /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
332 ///
333 /// ```no_run
334 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
335 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
336 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?
337 /// .load_with_path()
338 /// .ignore_errors()
339 /// .by_chapter()
340 /// .ignore_errors();
341 /// for (_path, chapters) in content {
342 /// println!("{}", chapters.len())
343 /// }
344 /// # Ok(())
345 /// # }
346 /// ```
347 pub fn ignore_errors(self) -> EpubFileLoader<'a, (PathBuf, Vec<(usize, String)>), P> {
348 EpubFileLoader {
349 iterator: Box::new(self.iterator.map(|(path, chapters)| {
350 let chapters = chapters
351 .into_iter()
352 .filter_map(|(idx, res)| res.ok().map(|content| (idx, content)))
353 .collect::<Vec<_>>();
354 (path, chapters)
355 })),
356 _processor: PhantomData,
357 }
358 }
359}
360
361impl<'a, P, T: 'a> EpubFileLoader<'a, Result<T, EpubLoaderError>, P> {
362 /// Ignores errors in the iterator, returning only successful results. This can be used on any
363 /// [EpubFileLoader] state of iterator whose items are results.
364 ///
365 /// # Example
366 /// Read files in directory "tests/data/*.epub" and ignore errors from unreadable files.
367 ///
368 /// ```no_run
369 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
370 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
371 /// let content = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?.read().ignore_errors();
372 /// for content in content {
373 /// println!("{}", content)
374 /// }
375 /// # Ok(())
376 /// # }
377 /// ```
378 pub fn ignore_errors(self) -> EpubFileLoader<'a, T, P> {
379 EpubFileLoader {
380 iterator: Box::new(self.iterator.filter_map(|res| res.ok())),
381 _processor: PhantomData,
382 }
383 }
384}
385
386impl<P> EpubFileLoader<'_, Result<PathBuf, FileLoaderError>, P> {
387 /// Creates a new [EpubFileLoader] using a glob pattern to match files.
388 ///
389 /// # Example
390 /// Create a [EpubFileLoader] for all `.epub` files that match the glob "tests/data/*.epub".
391 ///
392 /// ```no_run
393 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
394 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
395 /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")?;
396 /// # Ok(())
397 /// # }
398 /// ```
399 pub fn with_glob(
400 pattern: &str,
401 ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
402 let paths = glob::glob(pattern).map_err(FileLoaderError::PatternError)?;
403
404 Ok(EpubFileLoader {
405 iterator: Box::new(paths.into_iter().map(|path| {
406 path.map_err(FileLoaderError::GlobError)
407 .map_err(EpubLoaderError::FileLoaderError)
408 })),
409 _processor: PhantomData,
410 })
411 }
412
413 /// Creates a new [EpubFileLoader] on all files within a directory.
414 ///
415 /// # Example
416 /// Create a [EpubFileLoader] for all files that are in the directory "files".
417 ///
418 /// ```no_run
419 /// # use rig_core::loaders::{EpubFileLoader, RawTextProcessor};
420 /// # fn run() -> Result<(), Box<dyn std::error::Error>> {
421 /// let loader = EpubFileLoader::<_, RawTextProcessor>::with_dir("files")?;
422 /// # Ok(())
423 /// # }
424 /// ```
425 pub fn with_dir(
426 directory: &str,
427 ) -> Result<EpubFileLoader<'_, Result<PathBuf, EpubLoaderError>, P>, EpubLoaderError> {
428 let paths = std::fs::read_dir(directory).map_err(FileLoaderError::IoError)?;
429
430 Ok(EpubFileLoader {
431 iterator: Box::new(
432 paths
433 .into_iter()
434 .map(|entry| Ok(entry.map_err(FileLoaderError::IoError)?.path())),
435 ),
436 _processor: PhantomData,
437 })
438 }
439}
440
441// ================================================================
442// EpubFileLoader iterator implementations
443// ================================================================
/// Owning iterator returned by `EpubFileLoader::into_iter`.
///
/// It wraps the loader's boxed iterator without the processor marker, so the items can be
/// consumed with a plain `for` loop or iterator adaptors.
pub struct IntoIter<'a, T> {
    /// The boxed iterator taken over from the loader.
    iterator: Box<dyn Iterator<Item = T> + 'a>,
}
447
448impl<'a, T, P> IntoIterator for EpubFileLoader<'a, T, P> {
449 type Item = T;
450 type IntoIter = IntoIter<'a, T>;
451
452 fn into_iter(self) -> Self::IntoIter {
453 IntoIter {
454 iterator: self.iterator,
455 }
456 }
457}
458
459impl<T> Iterator for IntoIter<'_, T> {
460 type Item = T;
461
462 fn next(&mut self) -> Option<Self::Item> {
463 self.iterator.next()
464 }
465}
466
467// ================================================================
468// EpubChapterIterator definitions and implementations
469// ================================================================
470
471struct EpubChapterIterator<P> {
472 epub: EpubDoc<BufReader<File>>,
473 finished: bool,
474 _processor: PhantomData<P>,
475}
476
477impl<P> From<EpubDoc<BufReader<File>>> for EpubChapterIterator<P> {
478 fn from(epub: EpubDoc<BufReader<File>>) -> Self {
479 Self::new(epub)
480 }
481}
482
483impl<P> EpubChapterIterator<P> {
484 fn new(epub: EpubDoc<BufReader<File>>) -> Self {
485 Self {
486 epub,
487 finished: false,
488 _processor: PhantomData,
489 }
490 }
491}
492
493impl<P> Iterator for EpubChapterIterator<P>
494where
495 P: TextProcessor,
496{
497 type Item = Result<String, EpubLoaderError>;
498
499 fn next(&mut self) -> Option<Self::Item> {
500 if self.finished {
501 return None;
502 }
503
504 // ignore empty chapters if they exist
505 while !self.finished {
506 let chapter = self.epub.get_current_str();
507
508 if !self.epub.go_next() {
509 self.finished = true;
510 }
511
512 if let Some((text, _)) = chapter {
513 return Some(
514 P::process(&text)
515 .map_err(|err| EpubLoaderError::TextProcessorError(Box::new(err))),
516 );
517 }
518 }
519
520 None
521 }
522}
523
#[cfg(test)]
mod tests {
    use std::path::PathBuf;

    use crate::loaders::epub::RawTextProcessor;

    use super::EpubFileLoader;

    /// Splitting by chapter while keeping per-chapter results: the fixture directory is expected
    /// to hold one epub with three chapters, all of which process successfully.
    #[test]
    fn test_epub_loader_with_errors() {
        let documents: Vec<_> =
            EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")
                .unwrap()
                .load_with_path()
                .ignore_errors()
                .by_chapter()
                .into_iter()
                .collect();

        assert_eq!(documents.len(), 1);

        let (_, chapters) = &documents[0];
        assert_eq!(chapters.len(), 3);
        assert!(chapters.iter().all(|(_, outcome)| outcome.is_ok()));
    }

    /// Same pipeline, but with chapter errors filtered out: all three chapters must remain.
    #[test]
    fn test_epub_loader_with_ignoring_errors() {
        let documents: Vec<_> =
            EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")
                .unwrap()
                .load_with_path()
                .ignore_errors()
                .by_chapter()
                .ignore_errors()
                .into_iter()
                .collect();

        assert_eq!(documents.len(), 1);

        let (_, chapters) = &documents[0];
        assert_eq!(chapters.len(), 3);
    }

    /// Reading whole documents yields exactly one string for the single fixture file.
    #[test]
    fn test_single_file() {
        let contents: Vec<_> =
            EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")
                .unwrap()
                .read()
                .ignore_errors()
                .into_iter()
                .collect();

        assert_eq!(contents.len(), 1);
    }

    /// Reading with paths reports the fixture's path alongside its content.
    #[test]
    fn test_single_file_with_path() {
        let contents: Vec<_> =
            EpubFileLoader::<_, RawTextProcessor>::with_glob("tests/data/*.epub")
                .unwrap()
                .read_with_path()
                .ignore_errors()
                .into_iter()
                .collect();

        assert_eq!(contents.len(), 1);

        let (path, _) = &contents[0];
        assert_eq!(path, &PathBuf::from("tests/data/dummy.epub"));
    }
}