feed_parser/parsers/atom/
mod.rs

1use crate::parsers::{
2    errors::{ParseError, ParseResult},
3    Feed,
4};
5use core::str;
6use quick_xml::de::from_str;
7use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
8use quick_xml::Reader;
9use quick_xml::Writer;
10use regex::Regex;
11use std::io::Cursor;
12
13#[cfg(test)]
14mod tests;
15
16fn preprocess(text: &str) -> String {
17    let tags = vec!["title", "description", "summary", "content"];
18    let mut text = text.to_string();
19    for tag in tags {
20        let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
21        let m = re.replace_all(&text, |caps: &regex::Captures| {
22            let content = &caps["content"];
23            let content = html_escape::encode_text(content);
24            format!("<{}>{}</{}>", tag, content, tag)
25        });
26        text = m.to_string();
27    }
28    return text.to_string();
29}
30
31pub fn parse(text: &str) -> ParseResult<Vec<Feed>> {
32    let text = preprocess(text);
33
34    let mut reader = Reader::from_str(&text);
35    reader.config_mut().trim_text(true);
36
37    let mut feeds = Vec::new();
38    let mut writer = Writer::new(Cursor::new(Vec::new()));
39    let mut parsing = false;
40    loop {
41        match reader.read_event() {
42            Ok(Event::Start(e)) => {
43                if parsing {
44                    if e.name().as_ref() == b"dc:creator" {
45                        assert!(writer
46                            .write_event(Event::Start(BytesStart::new("creator")))
47                            .is_ok());
48                    } else if e.name().as_ref() == b"dc:date" {
49                        assert!(writer
50                            .write_event(Event::Start(BytesStart::new("date")))
51                            .is_ok());
52                    } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
53                        assert!(writer
54                            .write_event(Event::Start(BytesStart::new("publish_date")))
55                            .is_ok());
56                    } else if e.name().as_ref() == b"description" {
57                        assert!(writer
58                            .write_event(Event::Start(BytesStart::new("description")))
59                            .is_ok());
60                    } else if e.name().as_ref() == b"link" {
61                        continue;
62                    } else {
63                        assert!(writer.write_event(Event::Start(e.clone())).is_ok());
64                    }
65                }
66                if e.name().as_ref() == b"entry" {
67                    assert!(writer
68                        .write_event(Event::Start(BytesStart::new("entry")))
69                        .is_ok());
70                    parsing = true;
71                }
72            }
73            Ok(Event::Empty(e)) => {
74                if parsing {
75                    if e.name().as_ref() == b"link" {
76                        let mut is_link = true;
77                        for attr in e.attributes() {
78                            let attr = attr.unwrap();
79                            if attr.key.0 == b"type" {
80                                let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
81                                if attr_text != "text/html" {
82                                    is_link = false;
83                                }
84                            } else if attr.key.0 == b"rel" {
85                                let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
86                                if attr_text != "alternate" {
87                                    is_link = false;
88                                }
89                            }
90                        }
91                        if is_link == false {
92                            continue;
93                        }
94                        for attr in e.attributes() {
95                            let attr = attr.unwrap();
96                            if attr.key.0 == b"href" {
97                                assert!(writer
98                                    .write_event(Event::Start(BytesStart::new("link")))
99                                    .is_ok());
100                                let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
101                                assert!(writer
102                                    .write_event(Event::Text(BytesText::new(attr_text)))
103                                    .is_ok());
104                                assert!(writer
105                                    .write_event(Event::End(BytesEnd::new("link")))
106                                    .is_ok());
107                            }
108                        }
109                    } else {
110                        assert!(writer.write_event(Event::Empty(e)).is_ok());
111                    }
112                }
113            }
114            Ok(Event::End(e)) => {
115                if e.name().as_ref() == b"entry" {
116                    assert!(writer
117                        .write_event(Event::End(BytesEnd::new("entry")))
118                        .is_ok());
119                    let feed_text = writer.into_inner().into_inner();
120                    let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
121                    feeds.push(feed);
122
123                    writer = Writer::new(Cursor::new(Vec::new()));
124                    parsing = false;
125                }
126                if parsing {
127                    if e.name().as_ref() == b"dc:creator" {
128                        assert!(writer
129                            .write_event(Event::End(BytesEnd::new("creator")))
130                            .is_ok());
131                    } else if e.name().as_ref() == b"dc:date" {
132                        assert!(writer
133                            .write_event(Event::End(BytesEnd::new("date")))
134                            .is_ok());
135                    } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
136                        assert!(writer
137                            .write_event(Event::End(BytesEnd::new("publish_date")))
138                            .is_ok());
139                    } else if e.name().as_ref() == b"link" {
140                        continue;
141                    } else {
142                        assert!(writer.write_event(Event::End(e)).is_ok());
143                    }
144                }
145            }
146            Ok(Event::Text(e)) => {
147                if parsing {
148                    let text = str::from_utf8(&e as &[u8]).unwrap();
149                    let text = html_escape::decode_html_entities(text);
150                    let e = BytesText::new(&text);
151                    assert!(writer.write_event(Event::Text(e)).is_ok());
152                }
153            }
154            Ok(Event::CData(e)) => {
155                if parsing {
156                    assert!(writer.write_event(Event::CData(e)).is_ok());
157                }
158            }
159            Ok(Event::Eof) => break,
160            Ok(_e) => {}
161            Err(e) => {
162                return Err(ParseError::XmlParseError(e));
163            }
164        }
165    }
166    return Ok(feeds);
167}