feed_parser/parsers/rss1/
mod.rs1use crate::parsers::{
2 errors::{ParseError, ParseResult},
3 Feed,
4};
5use core::str;
6use quick_xml::de::from_str;
7use quick_xml::events::{BytesEnd, BytesStart, Event};
8use quick_xml::Reader;
9use quick_xml::Writer;
10use regex::Regex;
11use std::io::Cursor;
12
13#[cfg(test)]
14mod tests;
15
16fn preprocess(text: &str) -> String {
17 let tags = vec!["title", "description", "summary", "content"];
18 let mut text = text.to_string();
19 for tag in tags {
20 let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
21 let m = re.replace_all(&text, |caps: ®ex::Captures| {
22 let content = &caps["content"];
23 let content = html_escape::encode_text(content);
24 format!("<{}>{}</{}>", tag, content, tag)
25 });
26 text = m.to_string();
27 }
28 return text.to_string();
29}
30
31pub fn parse(text: &str) -> ParseResult<Vec<Feed>> {
32 let text = preprocess(text);
33
34 let mut reader = Reader::from_str(&text);
35 reader.config_mut().trim_text(true);
36
37 let mut feeds = Vec::new();
38 let mut writer = Writer::new(Cursor::new(Vec::new()));
39 let mut parsing = false;
40 loop {
41 match reader.read_event() {
42 Ok(Event::Start(e)) => {
43 if parsing {
44 if e.name().as_ref() == b"dc:creator" {
45 assert!(writer
46 .write_event(Event::Start(BytesStart::new("creator")))
47 .is_ok());
48 } else if e.name().as_ref() == b"dc:date" {
49 assert!(writer
50 .write_event(Event::Start(BytesStart::new("date")))
51 .is_ok());
52 } else if e.name().as_ref() == b"pubDate" {
53 assert!(writer
54 .write_event(Event::Start(BytesStart::new("publish_date")))
55 .is_ok());
56 } else {
57 assert!(writer.write_event(Event::Start(e.clone())).is_ok());
58 }
59 }
60 if e.name().as_ref() == b"item" {
61 assert!(writer
62 .write_event(Event::Start(BytesStart::new("item")))
63 .is_ok());
64 parsing = true;
65 }
66 }
67 Ok(Event::End(e)) => {
68 if e.name().as_ref() == b"item" {
69 assert!(writer
70 .write_event(Event::End(BytesEnd::new("item")))
71 .is_ok());
72 let feed_text = writer.into_inner().into_inner();
73 let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
74 feeds.push(feed);
75
76 writer = Writer::new(Cursor::new(Vec::new()));
77 parsing = false;
78 }
79 if parsing {
80 if e.name().as_ref() == b"dc:creator" {
81 assert!(writer
82 .write_event(Event::End(BytesEnd::new("creator")))
83 .is_ok());
84 } else if e.name().as_ref() == b"dc:date" {
85 assert!(writer
86 .write_event(Event::End(BytesEnd::new("date")))
87 .is_ok());
88 } else if e.name().as_ref() == b"pubDate" {
89 assert!(writer
90 .write_event(Event::End(BytesEnd::new("publish_date")))
91 .is_ok());
92 } else {
93 assert!(writer.write_event(Event::End(e)).is_ok());
94 }
95 }
96 }
97 Ok(Event::Text(e)) => {
98 if parsing {
99 assert!(writer.write_event(Event::Text(e)).is_ok());
100 }
101 }
102 Ok(Event::CData(e)) => {
103 if parsing {
104 assert!(writer.write_event(Event::CData(e)).is_ok());
105 }
106 }
107 Ok(Event::Eof) => break,
108 Ok(_e) => {}
109 Err(e) => {
110 return Err(ParseError::XmlParseError(e));
111 }
112 }
113 }
114 return Ok(feeds);
115}