feed_parser/parsers/atom/
mod.rs

1use crate::parsers::Feed;
2use core::str;
3use quick_xml::de::from_str;
4use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
5use quick_xml::Reader;
6use quick_xml::Writer;
7use regex::Regex;
8use std::io::Cursor;
9
10#[cfg(test)]
11mod tests;
12
13fn preprocess(text: &str) -> String {
14    let tags = vec!["title", "description", "summary", "content"];
15    let mut text = text.to_string();
16    for tag in tags {
17        let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
18        let m = re.replace_all(&text, |caps: &regex::Captures| {
19            let content = &caps["content"];
20            let content = html_escape::encode_text(content);
21            format!("<{}>{}</{}>", tag, content, tag)
22        });
23        text = m.to_string();
24    }
25    return text.to_string();
26}
27
28pub fn parse(text: &str) -> Result<Vec<Feed>, String> {
29    let text = preprocess(text);
30
31    let mut reader = Reader::from_str(&text);
32    reader.config_mut().trim_text(true);
33
34    let mut feeds = Vec::new();
35    let mut writer = Writer::new(Cursor::new(Vec::new()));
36    let mut parsing = false;
37    loop {
38        match reader.read_event() {
39            Ok(Event::Start(e)) => {
40                if parsing {
41                    if e.name().as_ref() == b"dc:creator" {
42                        assert!(writer
43                            .write_event(Event::Start(BytesStart::new("creator")))
44                            .is_ok());
45                    } else if e.name().as_ref() == b"dc:date" {
46                        assert!(writer
47                            .write_event(Event::Start(BytesStart::new("date")))
48                            .is_ok());
49                    } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
50                        assert!(writer
51                            .write_event(Event::Start(BytesStart::new("publish_date")))
52                            .is_ok());
53                    } else if e.name().as_ref() == b"description" {
54                        assert!(writer
55                            .write_event(Event::Start(BytesStart::new("description")))
56                            .is_ok());
57                    } else if e.name().as_ref() == b"link" {
58                        continue;
59                    } else {
60                        assert!(writer.write_event(Event::Start(e.clone())).is_ok());
61                    }
62                }
63                if e.name().as_ref() == b"entry" {
64                    assert!(writer
65                        .write_event(Event::Start(BytesStart::new("entry")))
66                        .is_ok());
67                    parsing = true;
68                }
69            }
70            Ok(Event::Empty(e)) => {
71                if parsing {
72                    if e.name().as_ref() == b"link" {
73                        let mut is_link = true;
74                        for attr in e.attributes() {
75                            let attr = attr.unwrap();
76                            if attr.key.0 == b"type" {
77                                let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
78                                if attr_text != "text/html" {
79                                    is_link = false;
80                                }
81                            } else if attr.key.0 == b"rel" {
82                                let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
83                                if attr_text != "alternate" {
84                                    is_link = false;
85                                }
86                            }
87                        }
88                        if is_link == false {
89                            continue;
90                        }
91                        for attr in e.attributes() {
92                            let attr = attr.unwrap();
93                            if attr.key.0 == b"href" {
94                                assert!(writer
95                                    .write_event(Event::Start(BytesStart::new("link")))
96                                    .is_ok());
97                                let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
98                                assert!(writer
99                                    .write_event(Event::Text(BytesText::new(attr_text)))
100                                    .is_ok());
101                                assert!(writer
102                                    .write_event(Event::End(BytesEnd::new("link")))
103                                    .is_ok());
104                            }
105                        }
106                    } else {
107                        assert!(writer.write_event(Event::Empty(e)).is_ok());
108                    }
109                }
110            }
111            Ok(Event::End(e)) => {
112                if e.name().as_ref() == b"entry" {
113                    assert!(writer
114                        .write_event(Event::End(BytesEnd::new("entry")))
115                        .is_ok());
116                    let feed_text = writer.into_inner().into_inner();
117                    let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
118                    feeds.push(feed);
119
120                    writer = Writer::new(Cursor::new(Vec::new()));
121                    parsing = false;
122                }
123                if parsing {
124                    if e.name().as_ref() == b"dc:creator" {
125                        assert!(writer
126                            .write_event(Event::End(BytesEnd::new("creator")))
127                            .is_ok());
128                    } else if e.name().as_ref() == b"dc:date" {
129                        assert!(writer
130                            .write_event(Event::End(BytesEnd::new("date")))
131                            .is_ok());
132                    } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
133                        assert!(writer
134                            .write_event(Event::End(BytesEnd::new("publish_date")))
135                            .is_ok());
136                    } else if e.name().as_ref() == b"link" {
137                        continue;
138                    } else {
139                        assert!(writer.write_event(Event::End(e)).is_ok());
140                    }
141                }
142            }
143            Ok(Event::Text(e)) => {
144                if parsing {
145                    let text = str::from_utf8(&e as &[u8]).unwrap();
146                    let text = html_escape::decode_html_entities(text);
147                    let e = BytesText::new(&text);
148                    assert!(writer.write_event(Event::Text(e)).is_ok());
149                }
150            }
151            Ok(Event::CData(e)) => {
152                if parsing {
153                    assert!(writer.write_event(Event::CData(e)).is_ok());
154                }
155            }
156            Ok(Event::Eof) => break,
157            Ok(_e) => {}
158            Err(e) => {
159                println!("Error at position {}: {:?}", reader.error_position(), e);
160            }
161        }
162    }
163    return Ok(feeds);
164}