feed_parser/parsers/rss2/
mod.rs1use crate::parsers::Feed;
2use core::str;
3use quick_xml::de::from_str;
4use quick_xml::events::{BytesEnd, BytesStart, Event};
5use quick_xml::Reader;
6use quick_xml::Writer;
7use regex::Regex;
8use std::io::Cursor;
9
10#[cfg(test)]
11mod tests;
12
13fn preprocess(text: &str) -> String {
14 let tags = vec!["title", "description", "summary", "content", "other"];
15 let mut text = text.to_string();
16 for tag in tags {
17 let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
18 let m = re.replace_all(&text, |caps: ®ex::Captures| {
19 let content = &caps["content"];
20 let content = html_escape::encode_text(content);
21 format!("<{}>{}</{}>", tag, content, tag)
22 });
23 text = m.to_string();
24 }
25 return text.to_string();
26}
27
28pub fn parse(text: &str) -> Result<Vec<Feed>, String> {
29 let text = preprocess(text);
30
31 let mut reader = Reader::from_str(&text);
32 reader.config_mut().trim_text(true);
33
34 let mut feeds = Vec::new();
35 let mut writer = Writer::new(Cursor::new(Vec::new()));
36 let mut parsing = false;
37 loop {
38 match reader.read_event() {
39 Ok(Event::Start(e)) => {
40 if parsing {
41 if e.name().as_ref() == b"dc:creator" {
42 assert!(writer
43 .write_event(Event::Start(BytesStart::new("creator")))
44 .is_ok());
45 } else if e.name().as_ref() == b"dc:date" {
46 assert!(writer
47 .write_event(Event::Start(BytesStart::new("date")))
48 .is_ok());
49 } else if e.name().as_ref() == b"pubDate" {
50 assert!(writer
51 .write_event(Event::Start(BytesStart::new("publish_date")))
52 .is_ok());
53 } else {
54 assert!(writer.write_event(Event::Start(e.clone())).is_ok());
55 }
56 }
57 if e.name().as_ref() == b"item" {
58 assert!(writer
59 .write_event(Event::Start(BytesStart::new("item")))
60 .is_ok());
61 parsing = true;
62 }
63 }
64 Ok(Event::End(e)) => {
65 if e.name().as_ref() == b"item" {
66 assert!(writer
67 .write_event(Event::End(BytesEnd::new("item")))
68 .is_ok());
69 let feed_text = writer.into_inner().into_inner();
70 let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
71 feeds.push(feed);
72
73 writer = Writer::new(Cursor::new(Vec::new()));
74 parsing = false;
75 }
76 if parsing {
77 if e.name().as_ref() == b"dc:creator" {
78 assert!(writer
79 .write_event(Event::End(BytesEnd::new("creator")))
80 .is_ok());
81 } else if e.name().as_ref() == b"dc:date" {
82 assert!(writer
83 .write_event(Event::End(BytesEnd::new("date")))
84 .is_ok());
85 } else if e.name().as_ref() == b"pubDate" {
86 assert!(writer
87 .write_event(Event::End(BytesEnd::new("publish_date")))
88 .is_ok());
89 } else {
90 assert!(writer.write_event(Event::End(e)).is_ok());
91 }
92 }
93 }
94 Ok(Event::Text(e)) => {
95 if parsing {
96 assert!(writer.write_event(Event::Text(e)).is_ok());
97 }
98 }
99 Ok(Event::CData(e)) => {
100 if parsing {
101 assert!(writer.write_event(Event::CData(e)).is_ok());
102 }
103 }
104 Ok(Event::Eof) => break,
105 Ok(_e) => {}
106 Err(e) => {
107 return Err(format!(
108 "Error at position {}: {:?}",
109 reader.error_position(),
110 e
111 ))
112 }
113 }
114 }
115 return Ok(feeds);
116}