parse_blogger_backup_xml/
xml_tools.rs

/// Tools for exploring an XML document on top of quick-xml.  For example, it contains:
///
/// - Functions to convert different references and byte slices into strings.
/// - An XPath struct with a nice string representation that you can push to and pop from.
/// - Functions to selectively print out different aspects of an XML file.
6use quick_xml::events::{BytesEnd, BytesStart, BytesText};
7use quick_xml::{events::Event, Reader};
8use std::borrow::Cow;
9use std::collections::HashSet;
10use std::fmt;
11
12pub fn string_from_bytes_text(bytes_text: BytesText) -> Result<String, Box<dyn std::error::Error>> {
13    let bytes = bytes_text.unescaped()?.into_owned();
14    Ok(String::from_utf8(bytes)?)
15}
16
17pub fn start_tag_string(bytes_start: &BytesStart) -> Result<String, Box<dyn std::error::Error>> {
18    let tag = bytes_start.name();
19    let tag = tag.to_owned();
20    let tag = String::from_utf8(tag)?;
21    Ok(tag)
22}
23pub fn end_tag_string(bytes_end: &BytesEnd) -> Result<String, Box<dyn std::error::Error>> {
24    let tag = bytes_end.name();
25    let tag = tag.to_owned();
26    let tag = String::from_utf8(tag)?;
27    Ok(tag)
28}
29
/// Convert a copy-on-write byte slice into an owned `String`.
///
/// An owned buffer is reused as-is; a borrowed slice is copied first.
///
/// # Errors
/// Fails if the bytes are not valid UTF-8.
pub fn string_from_cow(cow: Cow<[u8]>) -> Result<String, Box<dyn std::error::Error>> {
    // `into_owned` hands back the inner `Vec` for `Owned` and clones for `Borrowed`.
    let bytes = cow.into_owned();
    Ok(String::from_utf8(bytes)?)
}
37
/// Thin newtype around a `String` that can be displayed.
struct Stringable(String);

impl fmt::Display for Stringable {
    /// Writes the wrapped string verbatim; width and fill options are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Stringable(inner) = self;
        f.write_str(inner)
    }
}
45
46impl From<&BytesStart<'_>> for Stringable {
47    fn from(start: &BytesStart) -> Self {
48        let tag = start.name();
49        let tag = tag.to_owned();
50        let tag = String::from_utf8(tag).expect("Tag not in utf8");
51        Self(tag)
52    }
53}
54
/// A stack of element names describing the current position in a document.
///
/// Its string form joins the names with `=>`, outermost element first.
pub struct XPath(Vec<String>);

impl XPath {
    /// Create an empty path.
    pub fn new() -> Self {
        Self(Vec::new())
    }

    /// Descend into the element named `tag`.
    pub fn push(&mut self, tag: String) {
        self.0.push(tag);
    }

    /// Leave the innermost element, returning its name (`None` if the path is empty).
    pub fn pop(&mut self) -> Option<String> {
        self.0.pop()
    }

    /// Leave the innermost element and verify that it is named `tag`.
    ///
    /// # Panics
    /// Panics if the path is empty or if the innermost element has a different name.
    pub fn pop_checked(&mut self, tag: String) {
        let innermost = self.pop().expect("can't end without starting.");
        assert_eq!(innermost, tag);
    }

    /// Render the path as `outer=>...=>inner`; an empty path renders as `""`.
    pub fn as_string(&self) -> String {
        self.0.join("=>")
    }
}

impl Default for XPath {
    /// Same as [`XPath::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl fmt::Debug for XPath {
    /// Debug output is the same `=>`-joined form as `Display`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.as_string())
    }
}

impl fmt::Display for XPath {
    /// Writes the `=>`-joined path; width and fill options are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.as_string())
    }
}
92
93/// Get a list of all tags in an xml file.
94pub fn tag_names(path: &str) -> Result<HashSet<String>, Box<dyn std::error::Error>> {
95    let mut reader = Reader::from_file(path)?;
96    let mut buf = Vec::new();
97    let mut tag_names: HashSet<String> = HashSet::new();
98    loop {
99        match reader.read_event(&mut buf) {
100            Ok(Event::Start(ref e)) => {
101                let tag = start_tag_string(e)?;
102                tag_names.insert(tag);
103            }
104            Ok(Event::Eof) => break,
105            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
106            _ => (),
107        }
108    }
109    Ok(tag_names)
110}
111
112/// Get a list of all text children of all elements in your xml document.
113pub fn all_text(path: &str) -> Result<Vec<String>, Box<dyn std::error::Error>> {
114    let mut reader = Reader::from_file(path)?;
115    let mut buf = Vec::new();
116    let mut txt = Vec::new();
117    loop {
118        match reader.read_event(&mut buf) {
119            Ok(Event::Text(e)) => txt.push(e.unescape_and_decode(&reader).unwrap()),
120            Ok(Event::Eof) => break,
121            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
122            _ => (),
123        }
124    }
125    Ok(txt)
126}
127
128/// List all xml paths (as a set, where siblings have the same path if they have the same element type).
129pub fn paths(path: &str) -> Result<(), Box<dyn std::error::Error>> {
130    let mut reader = Reader::from_file(path)?;
131    let mut xpath: XPath = XPath::new();
132    let mut buf = Vec::new();
133    let mut xpath_strings = HashSet::new();
134    loop {
135        match reader.read_event(&mut buf) {
136            Ok(Event::Start(ref e)) => {
137                xpath.push(start_tag_string(e)?);
138            }
139            Ok(Event::End(ref e)) => {
140                xpath.pop_checked(end_tag_string(e)?);
141            }
142            Ok(Event::Eof) => break,
143            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
144            _ => (),
145        }
146        xpath_strings.insert(xpath.as_string());
147    }
148    let mut xpath_strings: Vec<String> = xpath_strings.into_iter().collect();
149    xpath_strings.sort();
150    for xpath_string in xpath_strings {
151        println!("{}", xpath_string);
152    }
153    Ok(())
154}
155
156/// All attributes in all the elements
157pub fn all_attributes(file_path: &str) -> Result<Vec<String>, Box<dyn std::error::Error>> {
158    let mut reader = Reader::from_file(file_path)?;
159    let mut xpath = XPath::new();
160    let mut buf = Vec::new();
161    let mut attributes = HashSet::new();
162    loop {
163        match reader.read_event(&mut buf) {
164            Ok(Event::Start(ref e)) => {
165                xpath.push(start_tag_string(e)?);
166                for attr in e.attributes() {
167                    let attr_string = format!("{:?}", attr.unwrap());
168                    attributes.insert(attr_string);
169                }
170            }
171            Ok(Event::End(ref e)) => xpath.pop_checked(end_tag_string(e)?),
172            Ok(Event::Eof) => break,
173            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
174            Ok(_event) => {}
175        }
176    }
177    let mut attributes: Vec<String> = attributes.into_iter().collect();
178    attributes.sort();
179    for attr in &attributes {
180        println!("{}", attr);
181    }
182    Ok(attributes)
183}
184
185/// Print out all events found under a specific xpath leaf type.
186pub fn path_contents(
187    file_path: &str,
188    x_path: &str,
189    first: u32,
190    last: u32,
191) -> Result<(), Box<dyn std::error::Error>> {
192    let mut reader = Reader::from_file(file_path)?;
193    let mut xpath = XPath::new();
194    let mut xpath_string = "".to_owned();
195    let mut buf = Vec::new();
196    let mut index = 0;
197    loop {
198        match reader.read_event(&mut buf) {
199            Ok(Event::Start(ref e)) => {
200                xpath.push(start_tag_string(e)?);
201                xpath_string = xpath.as_string();
202            }
203            Ok(Event::End(ref e)) => {
204                if x_path == xpath_string {
205                    index += 1;
206                    if first <= index && index <= last {
207                        println!()
208                    };
209                }
210                xpath.pop_checked(end_tag_string(e)?);
211                xpath_string = xpath.as_string();
212            }
213            Ok(Event::Eof) => break,
214            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
215            Ok(event) => {
216                if x_path == xpath_string && first <= index && index <= last {
217                    println!("{:?}", event);
218                }
219            }
220        }
221    }
222    Ok(())
223}
224
#[cfg(test)]
mod tests {
    use super::*;

    /// Blogger backup that every exploratory test below reads.
    const BACKUP_XML: &str = "data/harris_backup.xml";

    /// Result type shared by the tests so that `?` can be used inside them.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Prints the contents of one leaf path for the first entries of the backup.
    #[test]
    fn run_path_contents() -> TestResult {
        // Paths reported by `paths` for the backup file:
        //
        // feed
        // feed=>author
        // feed=>author=>email
        // feed=>author=>name
        // feed=>entry
        // feed=>entry=>app:control
        // feed=>entry=>app:control=>app:draft
        // feed=>entry=>author
        // feed=>entry=>author=>email
        // feed=>entry=>author=>name
        // feed=>entry=>author=>uri
        // feed=>entry=>content
        // feed=>entry=>id
        // feed=>entry=>published
        // feed=>entry=>thr:total
        // feed=>entry=>title
        // feed=>entry=>updated
        // feed=>generator
        // feed=>id
        // feed=>title
        // feed=>updated
        let entry_number = 102;
        path_contents(
            BACKUP_XML,
            "feed=>entry=>app:control=>app:draft",
            0,
            entry_number,
        )
    }

    /// Prints every distinct attribute found in the backup.
    #[test]
    fn run_all_attributes() -> TestResult {
        all_attributes(BACKUP_XML).map(|_| ())
    }

    /// Prints every distinct element path found in the backup.
    #[test]
    fn run_paths() -> TestResult {
        paths(BACKUP_XML)
    }

    /// Dumps the set of tag names found in the backup.
    #[test]
    fn print_tag_names() -> TestResult {
        let tags = tag_names(BACKUP_XML)?;
        dbg!(tags);
        Ok(())
    }

    /// Dumps every text node found in the backup.
    #[test]
    fn print_all_text() -> TestResult {
        let tags = all_text(BACKUP_XML)?;
        dbg!(tags);
        Ok(())
    }
}