feed_parser/parsers/rss2/
mod.rs

1use crate::parsers::Feed;
2use core::str;
3use quick_xml::de::from_str;
4use quick_xml::events::{BytesEnd, BytesStart, Event};
5use quick_xml::Reader;
6use quick_xml::Writer;
7use regex::Regex;
8use std::io::Cursor;
9
10#[cfg(test)]
11mod tests;
12
13fn preprocess(text: &str) -> String {
14    let tags = vec!["title", "description", "summary", "content", "other"];
15    let mut text = text.to_string();
16    for tag in tags {
17        let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
18        let m = re.replace_all(&text, |caps: &regex::Captures| {
19            let content = &caps["content"];
20            let content = html_escape::encode_text(content);
21            format!("<{}>{}</{}>", tag, content, tag)
22        });
23        text = m.to_string();
24    }
25    return text.to_string();
26}
27
28pub fn parse(text: &str) -> Result<Vec<Feed>, String> {
29    let text = preprocess(text);
30
31    let mut reader = Reader::from_str(&text);
32    reader.config_mut().trim_text(true);
33
34    let mut feeds = Vec::new();
35    let mut writer = Writer::new(Cursor::new(Vec::new()));
36    let mut parsing = false;
37    loop {
38        match reader.read_event() {
39            Ok(Event::Start(e)) => {
40                if parsing {
41                    if e.name().as_ref() == b"dc:creator" {
42                        assert!(writer
43                            .write_event(Event::Start(BytesStart::new("creator")))
44                            .is_ok());
45                    } else if e.name().as_ref() == b"dc:date" {
46                        assert!(writer
47                            .write_event(Event::Start(BytesStart::new("date")))
48                            .is_ok());
49                    } else if e.name().as_ref() == b"pubDate" {
50                        assert!(writer
51                            .write_event(Event::Start(BytesStart::new("publish_date")))
52                            .is_ok());
53                    } else {
54                        assert!(writer.write_event(Event::Start(e.clone())).is_ok());
55                    }
56                }
57                if e.name().as_ref() == b"item" {
58                    assert!(writer
59                        .write_event(Event::Start(BytesStart::new("item")))
60                        .is_ok());
61                    parsing = true;
62                }
63            }
64            Ok(Event::End(e)) => {
65                if e.name().as_ref() == b"item" {
66                    assert!(writer
67                        .write_event(Event::End(BytesEnd::new("item")))
68                        .is_ok());
69                    let feed_text = writer.into_inner().into_inner();
70                    let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
71                    feeds.push(feed);
72
73                    writer = Writer::new(Cursor::new(Vec::new()));
74                    parsing = false;
75                }
76                if parsing {
77                    if e.name().as_ref() == b"dc:creator" {
78                        assert!(writer
79                            .write_event(Event::End(BytesEnd::new("creator")))
80                            .is_ok());
81                    } else if e.name().as_ref() == b"dc:date" {
82                        assert!(writer
83                            .write_event(Event::End(BytesEnd::new("date")))
84                            .is_ok());
85                    } else if e.name().as_ref() == b"pubDate" {
86                        assert!(writer
87                            .write_event(Event::End(BytesEnd::new("publish_date")))
88                            .is_ok());
89                    } else {
90                        assert!(writer.write_event(Event::End(e)).is_ok());
91                    }
92                }
93            }
94            Ok(Event::Text(e)) => {
95                if parsing {
96                    assert!(writer.write_event(Event::Text(e)).is_ok());
97                }
98            }
99            Ok(Event::CData(e)) => {
100                if parsing {
101                    assert!(writer.write_event(Event::CData(e)).is_ok());
102                }
103            }
104            Ok(Event::Eof) => break,
105            Ok(_e) => {}
106            Err(e) => {
107                return Err(format!(
108                    "Error at position {}: {:?}",
109                    reader.error_position(),
110                    e
111                ))
112            }
113        }
114    }
115    return Ok(feeds);
116}