feed_rs/parser/
mod.rs

1use std::error::Error;
2use std::fmt;
3use std::fmt::Debug;
4use std::hash::Hasher;
5use std::io::{BufRead, BufReader, Read};
6
7use chrono::{DateTime, Utc};
8use siphasher::sip128::{Hasher128, SipHasher};
9
10use crate::model;
11use crate::parser::util::{IdGenerator, TimestampParser};
12use crate::xml;
13use crate::xml::NS;
14
15mod atom;
16mod json;
17mod rss0;
18mod rss1;
19mod rss2;
20
21pub(crate) mod itunes;
22pub(crate) mod mediarss;
23pub(crate) mod util;
24
/// Result type used by the parsing functions in this module, with [`ParseFeedError`] as the error type
pub type ParseFeedResult<T> = Result<T, ParseFeedError>;
26
/// An error returned when parsing a feed from a source fails
#[derive(Debug)]
pub enum ParseFeedError {
    // TODO add line number/position
    /// The content could not be interpreted as a feed; the wrapped [`ParseErrorKind`] carries the specific cause
    ParseError(ParseErrorKind),
    /// IO error raised while reading the feed
    IoError(std::io::Error),
    /// Underlying issue with JSON (poorly formatted etc.)
    JsonSerde(serde_json::error::Error),
    /// Unsupported version of the JSON feed; the wrapped string is the version reported in the error message
    JsonUnsupportedVersion(String),
    /// Underlying issue with XML (poorly formatted etc.)
    XmlReader(xml::XmlError),
}
41
42impl From<serde_json::error::Error> for ParseFeedError {
43    fn from(err: serde_json::error::Error) -> Self {
44        ParseFeedError::JsonSerde(err)
45    }
46}
47
48impl From<std::io::Error> for ParseFeedError {
49    fn from(err: std::io::Error) -> Self {
50        ParseFeedError::IoError(err)
51    }
52}
53
54impl From<xml::XmlError> for ParseFeedError {
55    fn from(err: xml::XmlError) -> Self {
56        ParseFeedError::XmlReader(err)
57    }
58}
59
60impl fmt::Display for ParseFeedError {
61    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
62        match self {
63            ParseFeedError::ParseError(pe) => write!(f, "unable to parse feed: {}", pe),
64            ParseFeedError::IoError(ie) => write!(f, "unable to read feed: {}", ie),
65            ParseFeedError::JsonSerde(je) => write!(f, "unable to parse JSON: {}", je),
66            ParseFeedError::JsonUnsupportedVersion(version) => write!(f, "unsupported version: {}", version),
67            ParseFeedError::XmlReader(xe) => write!(f, "unable to parse XML: {}", xe),
68        }
69    }
70}
71
72impl Error for ParseFeedError {
73    fn source(&self) -> Option<&(dyn Error + 'static)> {
74        match self {
75            ParseFeedError::IoError(ie) => Some(ie),
76            ParseFeedError::JsonSerde(je) => Some(je),
77            ParseFeedError::XmlReader(xe) => Some(xe),
78            _ => None,
79        }
80    }
81}
82
/// Underlying cause of the parse failure
#[derive(Debug)]
pub enum ParseErrorKind {
    /// Could not find the expected root element (e.g. "channel" for RSS 2, a JSON node etc.)
    NoFeedRoot,
    /// The content type is unsupported, and we cannot parse the value into a known representation.
    /// The wrapped string is reported in the error message as the unsupported content type.
    UnknownMimeType(String),
    /// Required content within the source was not found e.g. the XML child text element for a "content" element.
    /// The wrapped string is reported in the error message as the missing content element.
    MissingContent(&'static str),
}
93
94impl fmt::Display for ParseErrorKind {
95    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
96        match self {
97            ParseErrorKind::NoFeedRoot => f.write_str("no root element"),
98            ParseErrorKind::UnknownMimeType(mime) => write!(f, "unsupported content type {}", mime),
99            ParseErrorKind::MissingContent(elem) => write!(f, "missing content element {}", elem),
100        }
101    }
102}
103
/// Parser for various feed formats
pub struct Parser {
    /// Optional URI handed to the XML element source and to the ID generator (see `Builder::base_uri`)
    base_uri: Option<String>,
    /// Called after a successful parse to create an ID for the feed, or for any entry, whose ID is empty
    id_generator: Box<IdGenerator>,
    /// Flag set via `Builder::sanitize_content`; not read in this file (presumably consulted by the format-specific parsers — TODO confirm)
    sanitize_content: bool,
    /// Function invoked by `parse_timestamp` to convert timestamp text into a `DateTime<Utc>`
    timestamp_parser: Box<TimestampParser>,
}
111
112impl Parser {
113    /// Parse the input (Atom, a flavour of RSS or JSON Feed) into our model
114    ///
115    /// # Arguments
116    ///
117    /// * `input` - A source of content such as a string, file etc.
118    ///
119    /// NOTE: feed-rs uses the encoding attribute in the XML prolog to decode content.
120    /// HTTP libraries (such as reqwest) provide a `text()` method which applies the content-encoding header and decodes the source into UTF-8.
121    /// This then causes feed-rs to fail when it attempts to interpret the UTF-8 stream as a different character set.
122    /// Instead, pass the raw, encoded source to feed-rs e.g. the `.bytes()` method if using reqwest.
123    ///
124    /// # Examples
125    ///
126    /// ```
127    /// use feed_rs::parser;
128    /// let xml = r#"
129    /// <feed>
130    ///    <title type="text">sample feed</title>
131    ///    <updated>2005-07-31T12:29:29Z</updated>
132    ///    <id>feed1</id>
133    ///    <entry>
134    ///        <title>sample entry</title>
135    ///        <id>entry1</id>
136    ///    </entry>
137    /// </feed>
138    /// "#;
139    /// let feed_from_xml = parser::parse(xml.as_bytes()).unwrap();
140    ///
141    ///
142    /// ```
143    pub fn parse<R: Read>(&self, source: R) -> ParseFeedResult<model::Feed> {
144        // Buffer the reader for performance (e.g. when streaming from a network) and so we can peek to determine the type of content
145        let mut input = BufReader::new(source);
146
147        // Determine whether this is XML or JSON and call the appropriate parser
148        input.fill_buf()?;
149        let first_char = input.buffer().iter().find(|b| **b == b'<' || **b == b'{').map(|b| *b as char);
150        let result = match first_char {
151            Some('<') => self.parse_xml(input),
152
153            Some('{') => self.parse_json(input),
154
155            _ => Err(ParseFeedError::ParseError(ParseErrorKind::NoFeedRoot)),
156        };
157
158        // Post processing as required
159        if let Ok(mut feed) = result {
160            assign_missing_ids(&self.id_generator, &mut feed, self.base_uri.as_deref());
161
162            Ok(feed)
163        } else {
164            result
165        }
166    }
167
168    // Handles JSON content
169    fn parse_json<R: BufRead>(&self, source: R) -> ParseFeedResult<model::Feed> {
170        json::parse(self, source)
171    }
172
173    // Parses timestamps with the configured parser (internal, or supplied via the builder)
174    fn parse_timestamp(&self, text: &str) -> Option<DateTime<Utc>> {
175        (self.timestamp_parser)(text)
176    }
177
178    // Handles XML content
179    fn parse_xml<R: BufRead>(&self, source: R) -> ParseFeedResult<model::Feed> {
180        // Set up the source of XML elements from the input
181        let element_source = xml::ElementSource::new(source, self.base_uri.as_deref())?;
182        if let Ok(Some(root)) = element_source.root() {
183            // Dispatch to the correct parser
184            let version = root.attr_value("version");
185            match (root.name.as_str(), version.as_deref()) {
186                ("feed", _) => {
187                    element_source.set_default_default_namespace(NS::Atom);
188                    return atom::parse_feed(self, root);
189                }
190                ("entry", _) => {
191                    element_source.set_default_default_namespace(NS::Atom);
192                    return atom::parse_entry(self, root);
193                }
194                ("rss", Some("2.0")) => {
195                    element_source.set_default_default_namespace(NS::RSS);
196                    return rss2::parse(self, root);
197                }
198                ("rss", Some("0.91")) | ("rss", Some("0.92")) => {
199                    element_source.set_default_default_namespace(NS::RSS);
200                    return rss0::parse(self, root);
201                }
202                ("RDF", _) => {
203                    element_source.set_default_default_namespace(NS::RSS);
204                    return rss1::parse(self, root);
205                }
206                _ => {}
207            };
208        }
209
210        // Couldn't find a recognised feed within the provided XML stream
211        Err(ParseFeedError::ParseError(ParseErrorKind::NoFeedRoot))
212    }
213}
214
215/// Parses the provided source with the defaults
216///
217/// Customisation of the parser (e.g. base URI, custom timestamp parsers etc. can be configured through the builder.
218pub fn parse<R: Read>(source: R) -> ParseFeedResult<model::Feed> {
219    Builder::new().build().parse(source)
220}
221
/// Builder to create instances of [`Parser`]
pub struct Builder {
    /// Optional URI of the content, set via `base_uri`
    base_uri: Option<String>,
    /// Function that creates IDs, set via `id_generator` or `id_generator_v0_2`
    id_generator: Box<IdGenerator>,
    /// Whether content should be sanitized, set via `sanitize_content`
    sanitize_content: bool,
    /// Function that parses timestamps, set via `timestamp_parser`
    timestamp_parser: Box<TimestampParser>,
}
229
230impl Builder {
231    /// Create a new instance of the builder
232    pub fn new() -> Builder {
233        Builder::default()
234    }
235
236    /// Source of the content, used to resolve relative URLs in XML based feeds
237    pub fn base_uri<S: AsRef<str>>(mut self, uri: Option<S>) -> Self {
238        self.base_uri = uri.map(|s| s.as_ref().to_string());
239        self
240    }
241
242    /// Create a new instance of the parser
243    pub fn build(self) -> Parser {
244        Parser {
245            base_uri: self.base_uri,
246            id_generator: self.id_generator,
247            sanitize_content: self.sanitize_content,
248            timestamp_parser: self.timestamp_parser,
249        }
250    }
251
252    /// Registers an ID generator
253    pub fn id_generator<F>(mut self, generator: F) -> Self
254    where
255        F: Fn(&[model::Link], &Option<model::Text>, Option<&str>) -> String + 'static,
256    {
257        self.id_generator = Box::new(generator);
258        self
259    }
260
261    /// Registers an ID generator compatible with v0.2 of feed-rs
262    pub fn id_generator_v0_2(self) -> Self {
263        self.id_generator(|links, title, _uri| {
264            // If we have a link without relative components, use that
265            if let Some(link) = links.iter().find(|l| l.rel.is_none()) {
266                // Trim the trailing slash if it exists
267                let mut link = model::Link::new(link.href.clone(), None);
268                if link.href.ends_with('/') {
269                    link.href.pop();
270                }
271
272                generate_id_from_link_and_title(&link, title)
273            } else {
274                util::uuid_gen()
275            }
276        })
277    }
278
279    /// Registers the flag for sanitizing content when the "sanitize" feature
280    /// is available
281    pub fn sanitize_content(mut self, flag: bool) -> Self {
282        self.sanitize_content = flag;
283        self
284    }
285
286    /// Registers a custom timestamp parser
287    pub fn timestamp_parser<F>(mut self, ts_parser: F) -> Self
288    where
289        F: Fn(&str) -> Option<DateTime<Utc>> + 'static,
290    {
291        self.timestamp_parser = Box::new(ts_parser);
292        self
293    }
294}
295
296/// Creates a parser instance with sensible defaults
297impl Default for Builder {
298    fn default() -> Self {
299        Builder {
300            base_uri: None,
301            id_generator: Box::new(generate_id),
302            sanitize_content: true,
303            timestamp_parser: Box::new(util::parse_timestamp_lenient),
304        }
305    }
306}
307
308// Assigns IDs to missing feed + entries as required
309fn assign_missing_ids(id_generator: &IdGenerator, feed: &mut model::Feed, uri: Option<&str>) {
310    if feed.id.is_empty() {
311        feed.id = id_generator(&feed.links, &feed.title, uri);
312    }
313
314    for entry in feed.entries.iter_mut() {
315        if entry.id.is_empty() {
316            entry.id = id_generator(&entry.links, &entry.title, uri);
317        }
318    }
319}
320
// Fixed keys passed to `SipHasher::new_with_keys` by the ID generation functions in this file
const LINK_HASH_KEY1: u64 = 0x5d78_4074_2887_2d60;
const LINK_HASH_KEY2: u64 = 0x90ee_ca4c_90a5_e228;
323
324// Creates a unique ID by trying the following in order:
325// 1) the first link + optional title
326// 2) the uri + title provided
327// 3) a UUID
328pub fn generate_id(links: &[model::Link], title: &Option<model::Text>, uri: Option<&str>) -> String {
329    if let Some(link) = links.first() {
330        generate_id_from_link_and_title(link, title)
331    } else if let (Some(uri), Some(title)) = (uri, title) {
332        generate_id_from_uri_and_title(uri, title)
333    } else {
334        // Generate a UUID as last resort
335        util::uuid_gen()
336    }
337}
338
339// Generate an ID from the link + title
340pub fn generate_id_from_link_and_title(link: &model::Link, title: &Option<model::Text>) -> String {
341    let mut hasher = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);
342    hasher.write(link.href.as_bytes());
343    if let Some(title) = title {
344        hasher.write(title.content.as_bytes());
345    }
346    let hash = hasher.finish128();
347    format!("{:x}{:x}", hash.h1, hash.h2)
348}
349
350// Generate an ID from the URI and title
351pub fn generate_id_from_uri_and_title(uri: &str, title: &model::Text) -> String {
352    let mut hasher = SipHasher::new_with_keys(LINK_HASH_KEY1, LINK_HASH_KEY2);
353    hasher.write(uri.as_bytes());
354    hasher.write(title.content.as_bytes());
355    let hash = hasher.finish128();
356    format!("{:x}{:x}", hash.h1, hash.h2)
357}
358
359#[cfg(test)]
360mod tests;