Skip to main content

feedparser_rs/parser/
detect.rs

1//! Feed format detection from XML/JSON content
2
3use crate::types::FeedVersion;
4use quick_xml::{Reader, events::Event};
5
/// H1: Upper bound, in bytes, on the amount of JSON that is fully parsed during detection.
///
/// Inputs longer than this are not handed to the JSON parser; only their first
/// `MAX_JSON_DETECTION_SIZE` bytes are scanned for a version marker instead.
const MAX_JSON_DETECTION_SIZE: usize = 1 << 20; // 1 MiB
9
10/// Auto-detect feed format from raw data
11///
12/// Examines the input data to determine the feed format by analyzing:
13/// 1. Whether it's JSON (starts with `{`) → JSON Feed
14/// 2. Root XML element name and attributes → RSS or Atom
15///
16/// # Arguments
17///
18/// * `data` - Raw feed data (XML or JSON)
19///
20/// # Returns
21///
22/// * `FeedVersion` - Detected format, or `Unknown` if unrecognized
23///
24/// # Examples
25///
26/// ```
27/// use feedparser_rs::{detect_format, FeedVersion};
28///
29/// let rss = br#"<?xml version="1.0"?><rss version="2.0"></rss>"#;
30/// assert_eq!(detect_format(rss), FeedVersion::Rss20);
31///
32/// let atom = br#"<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
33/// assert_eq!(detect_format(atom), FeedVersion::Atom10);
34/// ```
35#[must_use]
36pub fn detect_format(data: &[u8]) -> FeedVersion {
37    // Check for JSON Feed (starts with '{')
38    let first_non_whitespace = data.iter().find(|&&b| !b.is_ascii_whitespace()).copied();
39
40    if first_non_whitespace == Some(b'{') {
41        return detect_json_feed_version(data);
42    }
43
44    // Parse XML to find root element
45    detect_xml_format(data)
46}
47
48/// Detect JSON Feed version from JSON data
49///
50/// H1: Uses size limit to prevent memory exhaustion from large JSON files.
51fn detect_json_feed_version(data: &[u8]) -> FeedVersion {
52    // H1: Check size limit before parsing to prevent memory exhaustion
53    if data.len() > MAX_JSON_DETECTION_SIZE {
54        // For detection, we only need to find the "version" field
55        // If file is too large, try to parse just the first chunk
56        let truncated = &data[..MAX_JSON_DETECTION_SIZE];
57        // Try to find version in truncated data using simple search
58        // This is a fallback - if we can't detect, return Unknown
59        return detect_json_version_from_partial(truncated);
60    }
61
62    // Try to parse as JSON and check version field
63    if let Ok(json) = serde_json::from_slice::<serde_json::Value>(data)
64        && let Some(version) = json.get("version").and_then(|v| v.as_str())
65    {
66        return match version {
67            "https://jsonfeed.org/version/1" => FeedVersion::JsonFeed10,
68            "https://jsonfeed.org/version/1.1" => FeedVersion::JsonFeed11,
69            _ => FeedVersion::Unknown,
70        };
71    }
72    FeedVersion::Unknown
73}
74
75/// Fallback detection for large JSON files using string search
76fn detect_json_version_from_partial(data: &[u8]) -> FeedVersion {
77    // Simple byte search for version field patterns
78    // This is a heuristic fallback for oversized JSON
79    let data_str = std::str::from_utf8(data).unwrap_or("");
80
81    if data_str.contains("https://jsonfeed.org/version/1.1") {
82        FeedVersion::JsonFeed11
83    } else if data_str.contains("https://jsonfeed.org/version/1") {
84        FeedVersion::JsonFeed10
85    } else {
86        FeedVersion::Unknown
87    }
88}
89
/// Reports whether the Netscape RSS 0.91 DOCTYPE marker occurs near the start of `data`.
///
/// Python feedparser uses this to distinguish `rss091n` (Netscape) from `rss091u` (Userland).
///
/// Only the first 512 bytes are inspected, and the marker must lie entirely
/// within that window to be found.
fn has_netscape_rss091_doctype(data: &[u8]) -> bool {
    /// Text taken from the Netscape DOCTYPE public identifier.
    const MARKER: &[u8] = b"Netscape Communications";
    /// Number of leading bytes that are searched.
    const PROBE_LEN: usize = 512;

    // Shorter inputs are searched in full.
    let head = data.get(..PROBE_LEN).unwrap_or(data);

    head.windows(MARKER.len()).any(|chunk| chunk == MARKER)
}
101
102/// Detect XML-based feed format (RSS or Atom)
103fn detect_xml_format(data: &[u8]) -> FeedVersion {
104    let has_netscape_doctype = has_netscape_rss091_doctype(data);
105
106    let mut reader = Reader::from_reader(data);
107    reader.config_mut().trim_text(true);
108
109    let mut buf = Vec::new();
110
111    // Read events until we find the root element
112    loop {
113        match reader.read_event_into(&mut buf) {
114            Ok(Event::Start(e) | Event::Empty(e)) => {
115                let name = e.local_name();
116
117                match name.as_ref() {
118                    b"rss" => {
119                        // Check version attribute
120                        for attr in e.attributes().flatten() {
121                            if attr.key.as_ref() == b"version" {
122                                return match attr.value.as_ref() {
123                                    b"0.90" => FeedVersion::Rss090,
124                                    b"0.91" => {
125                                        if has_netscape_doctype {
126                                            FeedVersion::Rss091Netscape
127                                        } else {
128                                            FeedVersion::Rss091Userland
129                                        }
130                                    }
131                                    b"0.92" => FeedVersion::Rss092,
132                                    b"2.0" => FeedVersion::Rss20,
133                                    _ => FeedVersion::Unknown,
134                                };
135                            }
136                        }
137                        // No version attribute, assume 2.0
138                        return FeedVersion::Rss20;
139                    }
140                    b"rdf:RDF" | b"RDF" => {
141                        // RSS 1.0 uses RDF
142                        return FeedVersion::Rss10;
143                    }
144                    b"feed" => {
145                        // Atom - check xmlns attribute
146                        for attr in e.attributes().flatten() {
147                            if attr.key.as_ref() == b"xmlns" {
148                                let ns = attr.value.as_ref();
149                                if ns == b"http://www.w3.org/2005/Atom" {
150                                    return FeedVersion::Atom10;
151                                } else if ns == b"http://purl.org/atom/ns#" {
152                                    return FeedVersion::Atom03;
153                                }
154                            }
155                        }
156                        // No xmlns or unknown, assume Atom 1.0
157                        return FeedVersion::Atom10;
158                    }
159                    _ => {
160                        // Unknown root element
161                        return FeedVersion::Unknown;
162                    }
163                }
164            }
165            Ok(Event::Eof) => break,
166            Err(_) => {
167                // XML parsing error, can't detect
168                break;
169            }
170            _ => {}
171        }
172        buf.clear();
173    }
174
175    FeedVersion::Unknown
176}
177
/// Unit tests for feed format detection.
///
/// Covers every RSS/RDF/Atom/JSON Feed branch of `detect_format`, the
/// oversized-JSON fallback path, and inputs that must be reported as `Unknown`.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- RSS ----

    #[test]
    fn test_detect_rss20() {
        let xml = br#"<?xml version="1.0"?><rss version="2.0"></rss>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss20);
    }

    #[test]
    fn test_detect_rss20_no_version() {
        let xml = br#"<?xml version="1.0"?><rss></rss>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss20);
    }

    #[test]
    fn test_detect_rss20_self_closing_root() {
        // A self-closing root is reported as an `Empty` event rather than `Start`.
        let xml = br#"<rss version="2.0"/>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss20);
    }

    #[test]
    fn test_detect_rss_unrecognized_version() {
        let xml = br#"<rss version="3.0"></rss>"#;
        assert_eq!(detect_format(xml), FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_rss091_userland() {
        let xml = br#"<rss version="0.91"></rss>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss091Userland);
    }

    #[test]
    fn test_detect_rss091_netscape() {
        let xml = br#"<?xml version="1.0"?>
<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"
    "http://my.netscape.com/publish/formats/rss-0.91.dtd">
<rss version="0.91"></rss>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss091Netscape);
    }

    #[test]
    fn test_detect_rss092() {
        let xml = br#"<rss version="0.92"></rss>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss092);
    }

    #[test]
    fn test_detect_rss090() {
        let xml = br#"<rss version="0.90"></rss>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss090);
    }

    #[test]
    fn test_detect_rss10_rdf() {
        let xml = br#"<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss10);
    }

    #[test]
    fn test_detect_rss10_rdf_uppercase() {
        let xml = br#"<RDF xmlns="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></RDF>"#;
        assert_eq!(detect_format(xml), FeedVersion::Rss10);
    }

    // ---- Atom ----

    #[test]
    fn test_detect_atom10() {
        let xml = br#"<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
        assert_eq!(detect_format(xml), FeedVersion::Atom10);
    }

    #[test]
    fn test_detect_atom10_no_xmlns() {
        let xml = br"<feed></feed>";
        assert_eq!(detect_format(xml), FeedVersion::Atom10);
    }

    #[test]
    fn test_detect_atom03() {
        let xml = br#"<feed xmlns="http://purl.org/atom/ns#"></feed>"#;
        assert_eq!(detect_format(xml), FeedVersion::Atom03);
    }

    // ---- JSON Feed ----

    #[test]
    fn test_detect_json_feed_10() {
        let json = br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#;
        assert_eq!(detect_format(json), FeedVersion::JsonFeed10);
    }

    #[test]
    fn test_detect_json_feed_11() {
        let json = br#"{"version": "https://jsonfeed.org/version/1.1"}"#;
        assert_eq!(detect_format(json), FeedVersion::JsonFeed11);
    }

    #[test]
    fn test_detect_json_unrecognized_version() {
        let json = br#"{"version": "https://jsonfeed.org/version/2"}"#;
        assert_eq!(detect_format(json), FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_json_missing_version() {
        let json = br#"{"title": "No version field"}"#;
        assert_eq!(detect_format(json), FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_json_malformed() {
        let json = b"{not valid json";
        assert_eq!(detect_format(json), FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_json_oversized_uses_partial_scan() {
        // Build a document just past the size limit so the full parse is skipped
        // and only the leading chunk is searched for a version URL.
        let mut json = br#"{"version": "https://jsonfeed.org/version/1.1", "title": ""#.to_vec();
        json.resize(MAX_JSON_DETECTION_SIZE + 1, b'a');
        json.extend_from_slice(br#""}"#);
        assert_eq!(detect_format(&json), FeedVersion::JsonFeed11);

        let mut no_version = br#"{"title": ""#.to_vec();
        no_version.resize(MAX_JSON_DETECTION_SIZE + 1, b'a');
        no_version.extend_from_slice(br#""}"#);
        assert_eq!(detect_format(&no_version), FeedVersion::Unknown);
    }

    // ---- Unknown / edge cases ----

    #[test]
    fn test_detect_unknown_xml() {
        let xml = br"<unknown></unknown>";
        assert_eq!(detect_format(xml), FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_invalid_xml() {
        let xml = b"not xml at all";
        assert_eq!(detect_format(xml), FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_whitespace_before_json() {
        let json = b"  \n  {\"version\": \"https://jsonfeed.org/version/1.1\"}";
        assert_eq!(detect_format(json), FeedVersion::JsonFeed11);
    }

    #[test]
    fn test_detect_whitespace_before_xml() {
        let xml = b"  \n  <?xml version=\"1.0\"?><rss version=\"2.0\"></rss>";
        assert_eq!(detect_format(xml), FeedVersion::Rss20);
    }

    #[test]
    fn test_detect_empty_data() {
        let data = b"";
        assert_eq!(detect_format(data), FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_json_version_from_partial() {
        // Exercise the substring-based fallback directly.
        let json_11 = br#"{"version": "https://jsonfeed.org/version/1.1", "title": "Test"}"#;
        assert_eq!(
            detect_json_version_from_partial(json_11),
            FeedVersion::JsonFeed11
        );

        let json_10 = br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#;
        assert_eq!(
            detect_json_version_from_partial(json_10),
            FeedVersion::JsonFeed10
        );

        let unknown = br#"{"title": "No version field"}"#;
        assert_eq!(
            detect_json_version_from_partial(unknown),
            FeedVersion::Unknown
        );
    }
}
316}