Skip to main content

feedparser_rs/parser/
mod.rs

1pub mod atom;
2mod common;
3mod detect;
4pub mod json;
5pub mod namespace_detection;
6pub mod rss;
7pub mod rss10;
8
9use crate::{error::Result, types::ParsedFeed};
10
11pub use common::skip_element;
12pub use detect::detect_format;
13
14/// Parse feed from raw bytes
15///
16/// This is the main entry point for parsing feeds. It automatically detects
17/// the feed format (RSS, Atom, JSON) and parses accordingly.
18///
19/// # Errors
20///
21/// Returns a `FeedError` if the feed cannot be parsed. However, in most cases,
22/// the parser will set the `bozo` flag and return partial results rather than
23/// returning an error.
24///
25/// # Examples
26///
27/// ```
28/// use feedparser_rs::parse;
29///
30/// let xml = r#"
31///     <?xml version="1.0"?>
32///     <rss version="2.0">
33///         <channel>
34///             <title>Example Feed</title>
35///         </channel>
36///     </rss>
37/// "#;
38///
39/// let feed = parse(xml.as_bytes()).unwrap();
40/// assert_eq!(feed.feed.title.as_deref(), Some("Example Feed"));
41/// ```
42pub fn parse(data: &[u8]) -> Result<ParsedFeed> {
43    parse_with_limits(data, crate::ParserLimits::default())
44}
45
46/// Parse feed with custom parser limits
47///
48/// This allows controlling resource usage when parsing untrusted feeds.
49///
50/// # Examples
51///
52/// ```
53/// use feedparser_rs::{parse_with_limits, ParserLimits};
54///
55/// let xml = b"<rss version=\"2.0\"><channel><title>Test</title></channel></rss>";
56/// let limits = ParserLimits::strict();
57/// let feed = parse_with_limits(xml, limits).unwrap();
58/// ```
59///
60/// # Errors
61///
62/// Returns an error if:
63/// - Feed size exceeds limits
64/// - Format is unknown or unsupported
65/// - Fatal parsing error occurs
66pub fn parse_with_limits(data: &[u8], limits: crate::ParserLimits) -> Result<ParsedFeed> {
67    use crate::types::FeedVersion;
68    use crate::util::encoding::detect_and_convert;
69
70    // Detect encoding and convert to UTF-8 before parsing.
71    // This handles ISO-8859-1, Windows-1252, UTF-16, and BOM-prefixed feeds.
72    let (utf8_string, detected_encoding) = detect_and_convert(data)
73        .unwrap_or_else(|_| (String::from_utf8_lossy(data).into_owned(), "UTF-8"));
74
75    let utf8_bytes = utf8_string.as_bytes();
76    let encoding_label = detected_encoding.to_lowercase();
77
78    // Detect format on UTF-8 data (required for correct UTF-16 detection)
79    let version = detect_format(utf8_bytes);
80
81    // Parse based on detected format, then update the encoding field
82    let mut feed = match version {
83        // RSS variants (all use RSS 2.0 parser; overwrite version after parsing)
84        FeedVersion::Rss20
85        | FeedVersion::Rss092
86        | FeedVersion::Rss091Netscape
87        | FeedVersion::Rss091Userland
88        | FeedVersion::Rss090 => {
89            let mut parsed = rss::parse_rss20_with_limits(utf8_bytes, limits)?;
90            parsed.version = version;
91            Ok(parsed)
92        }
93
94        // Atom variants
95        FeedVersion::Atom10 | FeedVersion::Atom03 => {
96            atom::parse_atom10_with_limits(utf8_bytes, limits)
97        }
98
99        // RSS 1.0 (RDF)
100        FeedVersion::Rss10 => rss10::parse_rss10_with_limits(utf8_bytes, limits),
101
102        // JSON Feed
103        FeedVersion::JsonFeed10 | FeedVersion::JsonFeed11 => {
104            json::parse_json_feed_with_limits(utf8_bytes, limits)
105        }
106
107        // Unknown format - return a bozo feed since the format is unrecognized.
108        // The bozo pattern requires we never panic and always return partial data,
109        // but unrecognizable input must signal the caller via bozo=true.
110        FeedVersion::Unknown => {
111            let mut feed = crate::types::ParsedFeed::new();
112            feed.version = FeedVersion::Unknown;
113            feed.bozo = true;
114            feed.bozo_exception = Some("Feed format not recognized".to_string());
115            Ok(feed)
116        }
117    }?;
118
119    feed.encoding = encoding_label;
120    Ok(feed)
121}
122
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::FeedVersion;

    /// Parses `xml` with the default limits and checks the version string
    /// reported on the resulting feed.
    fn assert_version_str(xml: &[u8], expected: &str) {
        let parsed = parse(xml).unwrap();
        assert_eq!(parsed.version.as_str(), expected);
    }

    /// Unrecognizable input yields `Ok` with an empty, bozo-flagged feed.
    #[test]
    fn test_parse_returns_ok_bozo_for_garbage() {
        let parsed = parse(b"test").unwrap();
        assert!(parsed.bozo, "unrecognized input must set bozo");
        assert_eq!(parsed.version, FeedVersion::Unknown);
        assert!(parsed.entries.is_empty());
    }

    /// #283: RSS 0.91 with the Netscape DOCTYPE must report "rss091n".
    #[test]
    fn test_rss091n_version_string() {
        assert_version_str(
            br#"<?xml version="1.0"?>
<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"
    "http://my.netscape.com/publish/formats/rss-0.91.dtd">
<rss version="0.91">
<channel><title>T</title><link>http://example.com</link><description>D</description>
<language>en</language></channel></rss>"#,
            "rss091n",
        );
    }

    /// #283: RSS 0.91 without the Netscape DOCTYPE must report "rss091u".
    #[test]
    fn test_rss091u_version_string() {
        assert_version_str(
            br#"<?xml version="1.0"?>
<rss version="0.91">
<channel><title>T</title><link>http://example.com</link><description>D</description>
<language>en</language></channel></rss>"#,
            "rss091u",
        );
    }

    /// #283: RSS 0.92 feeds must report version "rss092", not "rss20".
    #[test]
    fn test_rss092_version_string() {
        assert_version_str(
            br#"<?xml version="1.0"?>
<rss version="0.92">
<channel><title>T</title><link>http://example.com</link><description>D</description>
</channel></rss>"#,
            "rss092",
        );
    }

    /// #283: RSS 2.0 feeds must still report version "rss20".
    #[test]
    fn test_rss20_version_string_unchanged() {
        assert_version_str(
            br#"<?xml version="1.0"?>
<rss version="2.0">
<channel><title>T</title><link>http://example.com</link><description>D</description>
</channel></rss>"#,
            "rss20",
        );
    }
}