feed_parser/parsers/atom/
mod.rs1use crate::parsers::{
2 errors::{ParseError, ParseResult},
3 Feed,
4};
5use core::str;
6use quick_xml::de::from_str;
7use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
8use quick_xml::Reader;
9use quick_xml::Writer;
10use regex::Regex;
11use std::io::Cursor;
12
13#[cfg(test)]
14mod tests;
15
16fn preprocess(text: &str) -> String {
17 let tags = vec!["title", "description", "summary", "content"];
18 let mut text = text.to_string();
19 for tag in tags {
20 let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
21 let m = re.replace_all(&text, |caps: ®ex::Captures| {
22 let content = &caps["content"];
23 let content = html_escape::encode_text(content);
24 format!("<{}>{}</{}>", tag, content, tag)
25 });
26 text = m.to_string();
27 }
28 return text.to_string();
29}
30
31pub fn parse(text: &str) -> ParseResult<Vec<Feed>> {
32 let text = preprocess(text);
33
34 let mut reader = Reader::from_str(&text);
35 reader.config_mut().trim_text(true);
36
37 let mut feeds = Vec::new();
38 let mut writer = Writer::new(Cursor::new(Vec::new()));
39 let mut parsing = false;
40 loop {
41 match reader.read_event() {
42 Ok(Event::Start(e)) => {
43 if parsing {
44 if e.name().as_ref() == b"dc:creator" {
45 assert!(writer
46 .write_event(Event::Start(BytesStart::new("creator")))
47 .is_ok());
48 } else if e.name().as_ref() == b"dc:date" {
49 assert!(writer
50 .write_event(Event::Start(BytesStart::new("date")))
51 .is_ok());
52 } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
53 assert!(writer
54 .write_event(Event::Start(BytesStart::new("publish_date")))
55 .is_ok());
56 } else if e.name().as_ref() == b"description" {
57 assert!(writer
58 .write_event(Event::Start(BytesStart::new("description")))
59 .is_ok());
60 } else if e.name().as_ref() == b"link" {
61 continue;
62 } else {
63 assert!(writer.write_event(Event::Start(e.clone())).is_ok());
64 }
65 }
66 if e.name().as_ref() == b"entry" {
67 assert!(writer
68 .write_event(Event::Start(BytesStart::new("entry")))
69 .is_ok());
70 parsing = true;
71 }
72 }
73 Ok(Event::Empty(e)) => {
74 if parsing {
75 if e.name().as_ref() == b"link" {
76 let mut is_link = true;
77 for attr in e.attributes() {
78 let attr = attr.unwrap();
79 if attr.key.0 == b"type" {
80 let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
81 if attr_text != "text/html" {
82 is_link = false;
83 }
84 } else if attr.key.0 == b"rel" {
85 let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
86 if attr_text != "alternate" {
87 is_link = false;
88 }
89 }
90 }
91 if is_link == false {
92 continue;
93 }
94 for attr in e.attributes() {
95 let attr = attr.unwrap();
96 if attr.key.0 == b"href" {
97 assert!(writer
98 .write_event(Event::Start(BytesStart::new("link")))
99 .is_ok());
100 let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
101 assert!(writer
102 .write_event(Event::Text(BytesText::new(attr_text)))
103 .is_ok());
104 assert!(writer
105 .write_event(Event::End(BytesEnd::new("link")))
106 .is_ok());
107 }
108 }
109 } else {
110 assert!(writer.write_event(Event::Empty(e)).is_ok());
111 }
112 }
113 }
114 Ok(Event::End(e)) => {
115 if e.name().as_ref() == b"entry" {
116 assert!(writer
117 .write_event(Event::End(BytesEnd::new("entry")))
118 .is_ok());
119 let feed_text = writer.into_inner().into_inner();
120 let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
121 feeds.push(feed);
122
123 writer = Writer::new(Cursor::new(Vec::new()));
124 parsing = false;
125 }
126 if parsing {
127 if e.name().as_ref() == b"dc:creator" {
128 assert!(writer
129 .write_event(Event::End(BytesEnd::new("creator")))
130 .is_ok());
131 } else if e.name().as_ref() == b"dc:date" {
132 assert!(writer
133 .write_event(Event::End(BytesEnd::new("date")))
134 .is_ok());
135 } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
136 assert!(writer
137 .write_event(Event::End(BytesEnd::new("publish_date")))
138 .is_ok());
139 } else if e.name().as_ref() == b"link" {
140 continue;
141 } else {
142 assert!(writer.write_event(Event::End(e)).is_ok());
143 }
144 }
145 }
146 Ok(Event::Text(e)) => {
147 if parsing {
148 let text = str::from_utf8(&e as &[u8]).unwrap();
149 let text = html_escape::decode_html_entities(text);
150 let e = BytesText::new(&text);
151 assert!(writer.write_event(Event::Text(e)).is_ok());
152 }
153 }
154 Ok(Event::CData(e)) => {
155 if parsing {
156 assert!(writer.write_event(Event::CData(e)).is_ok());
157 }
158 }
159 Ok(Event::Eof) => break,
160 Ok(_e) => {}
161 Err(e) => {
162 return Err(ParseError::XmlParseError(e));
163 }
164 }
165 }
166 return Ok(feeds);
167}