parse_blogger_backup_xml/
parse_backup.rs

1/// A blogger backup has a schema with the following path types:
2/// - feed
3/// - feed=>author
4/// - feed=>author=>email
5/// - feed=>author=>name
6/// - feed=>entry
7/// - feed=>entry=>app:control
8/// - feed=>entry=>app:control=>app:draft
9/// - feed=>entry=>author
10/// - feed=>entry=>author=>email
11/// - feed=>entry=>author=>name
12/// - feed=>entry=>author=>uri
13/// - feed=>entry=>content
14/// - feed=>entry=>id
15/// - feed=>entry=>published
16/// - feed=>entry=>thr:total
17/// - feed=>entry=>title
18/// - feed=>entry=>updated
19/// - feed=>generator
20/// - feed=>id
21/// - feed=>title
22/// - feed=>updated
23///
/// In other words, there are three main entity types: feed, author, and entry.
/// Blog posts are stored as entries, but so are comments, settings, and templates,
/// so an entry has to be classified before it can be used.
/// `get_posts` performs that classification and attaches each comment to its post.
29use std::collections::HashMap;
30use std::str::FromStr;
31
32use quick_xml::events::Event;
33use quick_xml::Reader;
34
35use crate::models::Entry;
36use crate::models::EntryKind;
37use crate::models::Post;
38use crate::xml_tools::end_tag_string;
39use crate::xml_tools::start_tag_string;
40use crate::xml_tools::string_from_bytes_text;
41use crate::xml_tools::string_from_cow;
42use crate::xml_tools::XPath;
43
44// const COMMENT_KIND: &[u8] = b"http://schemas.google.com/blogger/2008/kind#comment";
45const POST_ID_PREFIX: &[u8] = b"tag:blogger.com,1999:blog";
46const POST_KIND: &[u8] = b"http://schemas.google.com/blogger/2008/kind#post";
47const SETTINGS_KIND: &[u8] = b"http://schemas.google.com/blogger/2008/kind#settings";
48const TEMPLATE_KIND: &[u8] = b"http://schemas.google.com/blogger/2008/kind#template";
49
50/// Logic in this function:
51/// - finds entries,
52/// - determines whether they are posts or comments, and
53/// - assigns comments to their posts
54pub fn get_posts(file_path: &str) -> Result<Vec<Post>, Box<dyn std::error::Error>> {
55    let mut buf = Vec::new();
56    let mut comments = Vec::new();
57    let mut entry = Entry::new();
58    let mut posts = HashMap::new();
59    let mut reader = Reader::from_file(file_path)?;
60    let mut xpath = XPath::new();
61    loop {
62        match reader.read_event(&mut buf) {
63            Ok(Event::Start(ref bytes_start)) => {
64                xpath.push(start_tag_string(bytes_start)?);
65            }
66            Ok(Event::End(ref bytes_end)) => {
67                if xpath.as_string() == "feed=>entry" {
68                    match entry.kind {
69                        Some(EntryKind::Comment) => comments.push(entry.to_comment().unwrap()),
70                        Some(EntryKind::Post) => {
71                            let post = entry.to_post().unwrap();
72                            posts.insert(post.id.to_owned(), post);
73                        }
74                        _ => (),
75                    }
76                    entry.clear();
77                }
78                xpath.pop_checked(end_tag_string(bytes_end)?);
79            }
80            Ok(Event::Empty(byte_start)) => {
81                for attribute in byte_start.attributes().flatten() {
82                    match attribute.value {
83                        value if value == POST_KIND => entry.kind = Some(EntryKind::Post),
84                        value if value == SETTINGS_KIND => entry.kind = Some(EntryKind::Settings),
85                        value if value == TEMPLATE_KIND => entry.kind = Some(EntryKind::Template),
86                        value if value.starts_with(POST_ID_PREFIX) => {
87                            entry.kind = Some(EntryKind::Comment);
88                            entry.post_id = Some(string_from_cow(value)?);
89                        }
90                        _value => {
91                            // let value = string_from_cow(value)?;
92                            // dbg!(value);
93                        }
94                    }
95                }
96            }
97            Ok(Event::Text(bytes_text)) => {
98                let text = Some(string_from_bytes_text(bytes_text)?);
99                match xpath.as_string().as_str() {
100                    "feed=>entry=>author=>name" => entry.author_name = text,
101                    "feed=>entry=>published" => {
102                        let text = text.unwrap();
103                        let published = parse_published(&text)?;
104                        entry.published = Some(published);
105                    }
106                    "feed=>entry=>id" => entry.id = text,
107                    "feed=>entry=>title" => entry.title = text,
108                    "feed=>entry=>content" => entry.content = text,
109                    "feed=>entry=>app:control=>app:draft" => {
110                        if text.unwrap() == "yes" {
111                            entry.draft = true;
112                            println!("This post is a draft")
113                        }
114                    }
115                    "feed=>entry" => println!("{}", text.unwrap()),
116                    _ => (),
117                }
118            }
119            Ok(Event::Eof) => break,
120            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
121            Ok(_event) => {}
122        }
123    }
124    for comment in comments {
125        if let Some(post) = posts.get_mut(&comment.post_id) {
126            post.comments.push(comment);
127        } else {
128            println!("missing post for comment {:?}", comment);
129        }
130    }
131    let mut posts: Vec<Post> = posts.into_iter().map(|(_, post)| post).collect();
132    posts.sort_by(|a, b| a.published.cmp(&b.published));
133    Ok(posts)
134}
135
136pub fn parse_published(
137    published: &str,
138) -> Result<chrono::DateTime<chrono::FixedOffset>, Box<dyn std::error::Error>> {
139    let dt = chrono::DateTime::from_str(published)?;
140    Ok(dt)
141}
142
143#[cfg(test)]
144mod tests {
145    use super::get_posts;
146
147    #[test]
148    fn test_get_posts() {
149        let posts = get_posts("data/backup.xml").unwrap();
150        dbg!(posts);
151    }
152}