Skip to main content

epub_parser/utils/
xml.rs

1//! XML parsing utilities for the EPUB parser.
2//!
3//! This module provides helper functions for common XML parsing tasks,
4//! particularly for extracting text content from XML elements.
5
6use quick_xml::events::Event;
7
/// Pre-process XML content to handle common HTML entities.
///
/// EPUB files often contain named HTML entities such as `&nbsp;` that are
/// not defined in XML and are therefore rejected by strict XML parsers.
/// This function rewrites the most common ones as numeric character
/// references (for example `&#160;`), which every XML parser understands.
///
/// The entities handled are `&nbsp;`, `&copy;`, `&mdash;`, `&ndash;`,
/// `&ldquo;`, `&rdquo;`, `&lsquo;`, `&rsquo;` and `&hellip;`. Anything else,
/// including the predefined XML entities (`&amp;`, `&lt;`, ...) and
/// entities that are already numeric, is copied through unchanged.
///
/// # Arguments
///
/// * `content` - The raw XML/XHTML text to pre-process.
///
/// # Returns
///
/// A new `String` in which every recognized named entity has been replaced
/// by its numeric character reference.
pub fn preprocess_html_entities(content: &str) -> String {
    // Named entity -> numeric character reference for the same code point.
    const ENTITIES: [(&str, &str); 9] = [
        ("&nbsp;", "&#160;"),
        ("&copy;", "&#169;"),
        ("&mdash;", "&#8212;"),
        ("&ndash;", "&#8211;"),
        ("&ldquo;", "&#8220;"),
        ("&rdquo;", "&#8221;"),
        ("&lsquo;", "&#8216;"),
        ("&rsquo;", "&#8217;"),
        ("&hellip;", "&#8230;"),
    ];

    // Single pass over the input: jump from ampersand to ampersand instead of
    // re-scanning and re-allocating the whole string once per entity.
    let mut output = String::with_capacity(content.len());
    let mut rest = content;

    while let Some(amp) = rest.find('&') {
        // `&` is ASCII, so splitting at its byte offset is always a valid
        // UTF-8 boundary.
        let (before, tail) = rest.split_at(amp);
        output.push_str(before);

        let hit = ENTITIES
            .iter()
            .find_map(|&(name, numeric)| tail.strip_prefix(name).map(|after| (numeric, after)));

        match hit {
            Some((numeric, after)) => {
                output.push_str(numeric);
                rest = after;
            }
            None => {
                // Not one of ours: keep the ampersand and continue after it.
                output.push('&');
                rest = &tail[1..];
            }
        }
    }

    output.push_str(rest);
    output
}
25
/// A utility for parsing XML content.
///
/// This is a stateless unit struct that serves as a namespace for the helper
/// methods used when extracting data from EPUB files (OPF, NCX, and HTML
/// content). All helpers are associated functions, so no instance is needed
/// to call them.
#[derive(Debug, Clone, Copy, Default)]
pub struct XmlParser;
31
32impl XmlParser {
33    /// Extracts text content from an XML reader.
34    ///
35    /// Reads events from the XML reader until a text event is found or
36    /// the element ends. This is useful for extracting the text content
37    /// of XML elements like `<title>`, `<creator>`, etc.
38    ///
39    /// # Arguments
40    ///
41    /// * `reader` - The XML reader to read events from.
42    /// * `buf` - A buffer for reading events (will be cleared automatically).
43    ///
44    /// # Returns
45    ///
46    /// Returns `Ok(Some(String))` if text was found, `Ok(None)` if no
47    /// text was found before the element ended, or an error if parsing fails.
48    ///
49    /// # Errors
50    ///
51    /// Returns an error if there is an XML parsing error.
52    ///
53    /// # Example
54    ///
55    /// ```
56    /// use quick_xml::Reader;
57    /// use epub_parser::utils::XmlParser;
58    ///
59    /// let xml = r#"<title>My Book</title>"#;
60    /// let mut reader = Reader::from_str(xml);
61    /// let mut buf = Vec::new();
62    ///
63    /// // Skip the Start event
64    /// reader.read_event_into(&mut buf).unwrap();
65    ///
66    /// let text = XmlParser::extract_text(&mut reader, &mut buf).unwrap();
67    /// assert_eq!(text, Some("My Book".to_string()));
68    /// ```
69    pub fn extract_text<R: std::io::BufRead>(
70        reader: &mut quick_xml::Reader<R>,
71        buf: &mut Vec<u8>,
72    ) -> Result<Option<String>, Box<dyn std::error::Error>> {
73        let mut text = String::new();
74
75        loop {
76            match reader.read_event_into(buf) {
77                Ok(Event::Text(e)) => {
78                    text = e.unescape()?.into_owned();
79                    text = text.trim().to_string();
80                    if !text.is_empty() {
81                        break;
82                    }
83                }
84                Ok(Event::End(_)) => break,
85                Ok(Event::Eof) => break,
86                Err(e) => return Err(e.into()),
87                _ => {}
88            }
89            buf.clear();
90        }
91
92        Ok(if text.is_empty() { None } else { Some(text) })
93    }
94}