rig/loaders/epub/
text_processors.rs

1use std::{convert::Infallible, error::Error};
2
3use quick_xml::Reader;
4use quick_xml::events::Event;
5
6// ================================================================
7// Implementing TextProcessor trait for post-processing epubs
8// ================================================================
9
/// Post-processing step applied to text extracted from an EPUB.
///
/// Implementors are stateless: [`TextProcessor::process`] is an associated
/// function (it takes no `self`), so a processor is selected purely by type.
pub trait TextProcessor {
    /// Error produced when processing fails.
    type Error: 'static + Error;

    /// Converts `text` into its processed form.
    ///
    /// # Errors
    ///
    /// Returns `Self::Error` when the implementor cannot process `text`.
    fn process(text: &str) -> Result<String, Self::Error>;
}
15
16pub struct RawTextProcessor;
17
18impl TextProcessor for RawTextProcessor {
19    type Error = Infallible;
20
21    fn process(text: &str) -> Result<String, Self::Error> {
22        Ok(text.to_string())
23    }
24}
25
26#[derive(thiserror::Error, Debug)]
27pub enum XmlProcessingError {
28    #[error("XML parsing error: {0}")]
29    Xml(#[from] quick_xml::Error),
30
31    #[error("Failed to unescape XML entity: {0}")]
32    Encoding(#[from] quick_xml::encoding::EncodingError),
33
34    #[error("Invalid UTF-8 sequence: {0}")]
35    Utf8(#[from] std::string::FromUtf8Error),
36}
37
38pub struct StripXmlProcessor;
39
40impl TextProcessor for StripXmlProcessor {
41    type Error = XmlProcessingError;
42
43    fn process(xml: &str) -> Result<String, Self::Error> {
44        let mut reader = Reader::from_str(xml.trim());
45
46        let mut result = String::with_capacity(xml.len() / 2); // Rough estimate
47        let mut last_was_text = false;
48
49        loop {
50            match reader.read_event()? {
51                Event::Text(e) => {
52                    let text = e.decode()?;
53                    if !text.trim().is_empty() {
54                        if last_was_text {
55                            result.push(' ');
56                        }
57                        result.push_str(&text);
58                        last_was_text = true;
59                    }
60                }
61                Event::CData(e) => {
62                    let text = String::from_utf8(e.into_inner().into_owned())?;
63                    if !text.trim().is_empty() {
64                        if last_was_text {
65                            result.push(' ');
66                        }
67                        result.push_str(&text);
68                        last_was_text = true;
69                    }
70                }
71                Event::Eof => break,
72                _ => {
73                    last_was_text = false;
74                }
75            }
76        }
77
78        Ok(result)
79    }
80}