epub_parser/utils/xml.rs
1//! XML parsing utilities for the EPUB parser.
2//!
3//! This module provides helper functions for common XML parsing tasks,
4//! particularly for extracting text content from XML elements.
5
6use quick_xml::events::Event;
7
/// Pre-process XML content to handle common HTML entities.
///
/// EPUB files often contain named HTML entities such as `&nbsp;` that are
/// not recognized by strict XML parsers. This function rewrites the
/// following named entities as numeric character references, which any XML
/// parser can understand:
///
/// | Named entity | Replacement |
/// |--------------|-------------|
/// | `&nbsp;`     | `&#160;`    |
/// | `&copy;`     | `&#169;`    |
/// | `&mdash;`    | `&#8212;`   |
/// | `&ndash;`    | `&#8211;`   |
/// | `&ldquo;`    | `&#8220;`   |
/// | `&rdquo;`    | `&#8221;`   |
/// | `&lsquo;`    | `&#8216;`   |
/// | `&rsquo;`    | `&#8217;`   |
/// | `&hellip;`   | `&#8230;`   |
///
/// Everything else, including the predefined XML entities (`&amp;`,
/// `&lt;`, ...), unknown entities and bare ampersands, is copied through
/// unchanged.
///
/// # Arguments
///
/// * `content` - The raw document text to pre-process.
///
/// # Returns
///
/// A new `String` with the entities listed above replaced.
pub fn preprocess_html_entities(content: &str) -> String {
    // Entity name (without the surrounding `&` and `;`) paired with the
    // decimal code point used in the numeric character reference.
    const ENTITIES: [(&str, &str); 9] = [
        ("nbsp", "160"),
        ("copy", "169"),
        ("mdash", "8212"),
        ("ndash", "8211"),
        ("ldquo", "8220"),
        ("rdquo", "8221"),
        ("lsquo", "8216"),
        ("rsquo", "8217"),
        ("hellip", "8230"),
    ];

    // Single pass over the input: one output allocation instead of one
    // intermediate `String` per entity.
    let mut out = String::with_capacity(content.len());
    let mut rest = content;

    while let Some(pos) = rest.find('&') {
        out.push_str(&rest[..pos]);
        // `&` is a single byte, so `pos + 1` is always a char boundary.
        let after_amp = &rest[pos + 1..];

        let hit = ENTITIES.iter().find_map(|&(name, code)| {
            // The terminating `;` is required, so `&nbspx;` is not a match.
            let tail = after_amp.strip_prefix(name)?.strip_prefix(';')?;
            Some((code, tail))
        });

        match hit {
            Some((code, tail)) => {
                out.push_str("&#");
                out.push_str(code);
                out.push(';');
                rest = tail;
            }
            None => {
                // Not one of the known entities: keep the ampersand as-is.
                out.push('&');
                rest = after_amp;
            }
        }
    }

    out.push_str(rest);
    out
}
25
/// A utility for parsing XML content.
///
/// This is a field-less marker type: it holds no state and only serves as
/// a namespace for helper functions used when extracting data from EPUB
/// files (OPF, NCX, and HTML content).
///
/// The standard traits are derived so the type can be freely copied,
/// default-constructed, and printed in diagnostics.
#[derive(Debug, Clone, Copy, Default)]
pub struct XmlParser;
31
32impl XmlParser {
33 /// Extracts text content from an XML reader.
34 ///
35 /// Reads events from the XML reader until a text event is found or
36 /// the element ends. This is useful for extracting the text content
37 /// of XML elements like `<title>`, `<creator>`, etc.
38 ///
39 /// # Arguments
40 ///
41 /// * `reader` - The XML reader to read events from.
42 /// * `buf` - A buffer for reading events (will be cleared automatically).
43 ///
44 /// # Returns
45 ///
46 /// Returns `Ok(Some(String))` if text was found, `Ok(None)` if no
47 /// text was found before the element ended, or an error if parsing fails.
48 ///
49 /// # Errors
50 ///
51 /// Returns an error if there is an XML parsing error.
52 ///
53 /// # Example
54 ///
55 /// ```
56 /// use quick_xml::Reader;
57 /// use epub_parser::utils::XmlParser;
58 ///
59 /// let xml = r#"<title>My Book</title>"#;
60 /// let mut reader = Reader::from_str(xml);
61 /// let mut buf = Vec::new();
62 ///
63 /// // Skip the Start event
64 /// reader.read_event_into(&mut buf).unwrap();
65 ///
66 /// let text = XmlParser::extract_text(&mut reader, &mut buf).unwrap();
67 /// assert_eq!(text, Some("My Book".to_string()));
68 /// ```
69 pub fn extract_text<R: std::io::BufRead>(
70 reader: &mut quick_xml::Reader<R>,
71 buf: &mut Vec<u8>,
72 ) -> Result<Option<String>, Box<dyn std::error::Error>> {
73 let mut text = String::new();
74
75 loop {
76 match reader.read_event_into(buf) {
77 Ok(Event::Text(e)) => {
78 text = e.unescape()?.into_owned();
79 text = text.trim().to_string();
80 if !text.is_empty() {
81 break;
82 }
83 }
84 Ok(Event::End(_)) => break,
85 Ok(Event::Eof) => break,
86 Err(e) => return Err(e.into()),
87 _ => {}
88 }
89 buf.clear();
90 }
91
92 Ok(if text.is_empty() { None } else { Some(text) })
93 }
94}