feed_parser/parsers/rss2/
mod.rs

1use crate::parsers::{
2    errors::{ParseError, ParseResult},
3    Feed,
4};
5use core::str;
6use quick_xml::de::from_str;
7use quick_xml::events::{BytesEnd, BytesStart, Event};
8use quick_xml::Reader;
9use quick_xml::Writer;
10use regex::Regex;
11use std::io::Cursor;
12
13#[cfg(test)]
14mod tests;
15
16fn preprocess(text: &str) -> String {
17    let tags = vec!["title", "description", "summary", "content", "other"];
18    let mut text = text.to_string();
19    for tag in tags {
20        let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
21        let m = re.replace_all(&text, |caps: &regex::Captures| {
22            let content = &caps["content"];
23            let content = html_escape::encode_text(content);
24            format!("<{}>{}</{}>", tag, content, tag)
25        });
26        text = m.to_string();
27    }
28    return text.to_string();
29}
30
31pub fn parse(text: &str) -> ParseResult<Vec<Feed>> {
32    let text = preprocess(text);
33
34    let mut reader = Reader::from_str(&text);
35    reader.config_mut().trim_text(true);
36
37    let mut feeds = Vec::new();
38    let mut writer = Writer::new(Cursor::new(Vec::new()));
39    let mut parsing = false;
40    loop {
41        match reader.read_event() {
42            Ok(Event::Start(e)) => {
43                if parsing {
44                    if e.name().as_ref() == b"dc:creator" {
45                        assert!(writer
46                            .write_event(Event::Start(BytesStart::new("creator")))
47                            .is_ok());
48                    } else if e.name().as_ref() == b"dc:date" {
49                        assert!(writer
50                            .write_event(Event::Start(BytesStart::new("date")))
51                            .is_ok());
52                    } else if e.name().as_ref() == b"pubDate" {
53                        assert!(writer
54                            .write_event(Event::Start(BytesStart::new("publish_date")))
55                            .is_ok());
56                    } else {
57                        assert!(writer.write_event(Event::Start(e.clone())).is_ok());
58                    }
59                }
60                if e.name().as_ref() == b"item" {
61                    assert!(writer
62                        .write_event(Event::Start(BytesStart::new("item")))
63                        .is_ok());
64                    parsing = true;
65                }
66            }
67            Ok(Event::End(e)) => {
68                if e.name().as_ref() == b"item" {
69                    assert!(writer
70                        .write_event(Event::End(BytesEnd::new("item")))
71                        .is_ok());
72                    let feed_text = writer.into_inner().into_inner();
73                    let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
74                    feeds.push(feed);
75
76                    writer = Writer::new(Cursor::new(Vec::new()));
77                    parsing = false;
78                }
79                if parsing {
80                    if e.name().as_ref() == b"dc:creator" {
81                        assert!(writer
82                            .write_event(Event::End(BytesEnd::new("creator")))
83                            .is_ok());
84                    } else if e.name().as_ref() == b"dc:date" {
85                        assert!(writer
86                            .write_event(Event::End(BytesEnd::new("date")))
87                            .is_ok());
88                    } else if e.name().as_ref() == b"pubDate" {
89                        assert!(writer
90                            .write_event(Event::End(BytesEnd::new("publish_date")))
91                            .is_ok());
92                    } else {
93                        assert!(writer.write_event(Event::End(e)).is_ok());
94                    }
95                }
96            }
97            Ok(Event::Text(e)) => {
98                if parsing {
99                    assert!(writer.write_event(Event::Text(e)).is_ok());
100                }
101            }
102            Ok(Event::CData(e)) => {
103                if parsing {
104                    assert!(writer.write_event(Event::CData(e)).is_ok());
105                }
106            }
107            Ok(Event::Eof) => break,
108            Ok(_e) => {}
109            Err(e) => {
110                return Err(ParseError::XmlParseError(e));
111            }
112        }
113    }
114    return Ok(feeds);
115}