feed_parser/parsers/atom/
mod.rs1use crate::parsers::Feed;
2use core::str;
3use quick_xml::de::from_str;
4use quick_xml::events::{BytesEnd, BytesStart, BytesText, Event};
5use quick_xml::Reader;
6use quick_xml::Writer;
7use regex::Regex;
8use std::io::Cursor;
9
10#[cfg(test)]
11mod tests;
12
13fn preprocess(text: &str) -> String {
14 let tags = vec!["title", "description", "summary", "content"];
15 let mut text = text.to_string();
16 for tag in tags {
17 let re = Regex::new(&format!(r#"<{}>(?<content>.*?)</{}>"#, tag, tag)).unwrap();
18 let m = re.replace_all(&text, |caps: ®ex::Captures| {
19 let content = &caps["content"];
20 let content = html_escape::encode_text(content);
21 format!("<{}>{}</{}>", tag, content, tag)
22 });
23 text = m.to_string();
24 }
25 return text.to_string();
26}
27
28pub fn parse(text: &str) -> Result<Vec<Feed>, String> {
29 let text = preprocess(text);
30
31 let mut reader = Reader::from_str(&text);
32 reader.config_mut().trim_text(true);
33
34 let mut feeds = Vec::new();
35 let mut writer = Writer::new(Cursor::new(Vec::new()));
36 let mut parsing = false;
37 loop {
38 match reader.read_event() {
39 Ok(Event::Start(e)) => {
40 if parsing {
41 if e.name().as_ref() == b"dc:creator" {
42 assert!(writer
43 .write_event(Event::Start(BytesStart::new("creator")))
44 .is_ok());
45 } else if e.name().as_ref() == b"dc:date" {
46 assert!(writer
47 .write_event(Event::Start(BytesStart::new("date")))
48 .is_ok());
49 } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
50 assert!(writer
51 .write_event(Event::Start(BytesStart::new("publish_date")))
52 .is_ok());
53 } else if e.name().as_ref() == b"description" {
54 assert!(writer
55 .write_event(Event::Start(BytesStart::new("description")))
56 .is_ok());
57 } else if e.name().as_ref() == b"link" {
58 continue;
59 } else {
60 assert!(writer.write_event(Event::Start(e.clone())).is_ok());
61 }
62 }
63 if e.name().as_ref() == b"entry" {
64 assert!(writer
65 .write_event(Event::Start(BytesStart::new("entry")))
66 .is_ok());
67 parsing = true;
68 }
69 }
70 Ok(Event::Empty(e)) => {
71 if parsing {
72 if e.name().as_ref() == b"link" {
73 let mut is_link = true;
74 for attr in e.attributes() {
75 let attr = attr.unwrap();
76 if attr.key.0 == b"type" {
77 let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
78 if attr_text != "text/html" {
79 is_link = false;
80 }
81 } else if attr.key.0 == b"rel" {
82 let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
83 if attr_text != "alternate" {
84 is_link = false;
85 }
86 }
87 }
88 if is_link == false {
89 continue;
90 }
91 for attr in e.attributes() {
92 let attr = attr.unwrap();
93 if attr.key.0 == b"href" {
94 assert!(writer
95 .write_event(Event::Start(BytesStart::new("link")))
96 .is_ok());
97 let attr_text: &str = str::from_utf8(attr.value.as_ref()).unwrap();
98 assert!(writer
99 .write_event(Event::Text(BytesText::new(attr_text)))
100 .is_ok());
101 assert!(writer
102 .write_event(Event::End(BytesEnd::new("link")))
103 .is_ok());
104 }
105 }
106 } else {
107 assert!(writer.write_event(Event::Empty(e)).is_ok());
108 }
109 }
110 }
111 Ok(Event::End(e)) => {
112 if e.name().as_ref() == b"entry" {
113 assert!(writer
114 .write_event(Event::End(BytesEnd::new("entry")))
115 .is_ok());
116 let feed_text = writer.into_inner().into_inner();
117 let feed = from_str::<Feed>(str::from_utf8(&feed_text).unwrap()).unwrap();
118 feeds.push(feed);
119
120 writer = Writer::new(Cursor::new(Vec::new()));
121 parsing = false;
122 }
123 if parsing {
124 if e.name().as_ref() == b"dc:creator" {
125 assert!(writer
126 .write_event(Event::End(BytesEnd::new("creator")))
127 .is_ok());
128 } else if e.name().as_ref() == b"dc:date" {
129 assert!(writer
130 .write_event(Event::End(BytesEnd::new("date")))
131 .is_ok());
132 } else if e.name().as_ref() == b"pubDate" || e.name().as_ref() == b"published" {
133 assert!(writer
134 .write_event(Event::End(BytesEnd::new("publish_date")))
135 .is_ok());
136 } else if e.name().as_ref() == b"link" {
137 continue;
138 } else {
139 assert!(writer.write_event(Event::End(e)).is_ok());
140 }
141 }
142 }
143 Ok(Event::Text(e)) => {
144 if parsing {
145 let text = str::from_utf8(&e as &[u8]).unwrap();
146 let text = html_escape::decode_html_entities(text);
147 let e = BytesText::new(&text);
148 assert!(writer.write_event(Event::Text(e)).is_ok());
149 }
150 }
151 Ok(Event::CData(e)) => {
152 if parsing {
153 assert!(writer.write_event(Event::CData(e)).is_ok());
154 }
155 }
156 Ok(Event::Eof) => break,
157 Ok(_e) => {}
158 Err(e) => {
159 println!("Error at position {}: {:?}", reader.error_position(), e);
160 }
161 }
162 }
163 return Ok(feeds);
164}