feedparser_rs/parser/
detect.rs

1//! Feed format detection from XML/JSON content
2
3use crate::types::FeedVersion;
4use quick_xml::{Reader, events::Event};
5
/// H1: Size threshold (in bytes) for JSON version detection, to prevent memory exhaustion
///
/// Inputs up to this size are parsed in full with `serde_json`. For larger inputs only
/// the first `MAX_JSON_DETECTION_SIZE` bytes are scanned for a JSON Feed version URL.
const MAX_JSON_DETECTION_SIZE: usize = 1024 * 1024; // 1 MiB
9
10/// Auto-detect feed format from raw data
11///
12/// Examines the input data to determine the feed format by analyzing:
13/// 1. Whether it's JSON (starts with `{`) → JSON Feed
14/// 2. Root XML element name and attributes → RSS or Atom
15///
16/// # Arguments
17///
18/// * `data` - Raw feed data (XML or JSON)
19///
20/// # Returns
21///
22/// * `FeedVersion` - Detected format, or `Unknown` if unrecognized
23///
24/// # Examples
25///
26/// ```
27/// use feedparser_rs::{detect_format, FeedVersion};
28///
29/// let rss = br#"<?xml version="1.0"?><rss version="2.0"></rss>"#;
30/// assert_eq!(detect_format(rss), FeedVersion::Rss20);
31///
32/// let atom = br#"<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
33/// assert_eq!(detect_format(atom), FeedVersion::Atom10);
34/// ```
35#[must_use]
36pub fn detect_format(data: &[u8]) -> FeedVersion {
37    // Check for JSON Feed (starts with '{')
38    let first_non_whitespace = data.iter().find(|&&b| !b.is_ascii_whitespace()).copied();
39
40    if first_non_whitespace == Some(b'{') {
41        return detect_json_feed_version(data);
42    }
43
44    // Parse XML to find root element
45    detect_xml_format(data)
46}
47
48/// Detect JSON Feed version from JSON data
49///
50/// H1: Uses size limit to prevent memory exhaustion from large JSON files.
51fn detect_json_feed_version(data: &[u8]) -> FeedVersion {
52    // H1: Check size limit before parsing to prevent memory exhaustion
53    if data.len() > MAX_JSON_DETECTION_SIZE {
54        // For detection, we only need to find the "version" field
55        // If file is too large, try to parse just the first chunk
56        let truncated = &data[..MAX_JSON_DETECTION_SIZE];
57        // Try to find version in truncated data using simple search
58        // This is a fallback - if we can't detect, return Unknown
59        return detect_json_version_from_partial(truncated);
60    }
61
62    // Try to parse as JSON and check version field
63    if let Ok(json) = serde_json::from_slice::<serde_json::Value>(data)
64        && let Some(version) = json.get("version").and_then(|v| v.as_str())
65    {
66        return match version {
67            "https://jsonfeed.org/version/1" => FeedVersion::JsonFeed10,
68            "https://jsonfeed.org/version/1.1" => FeedVersion::JsonFeed11,
69            _ => FeedVersion::Unknown,
70        };
71    }
72    FeedVersion::Unknown
73}
74
75/// Fallback detection for large JSON files using string search
76fn detect_json_version_from_partial(data: &[u8]) -> FeedVersion {
77    // Simple byte search for version field patterns
78    // This is a heuristic fallback for oversized JSON
79    let data_str = std::str::from_utf8(data).unwrap_or("");
80
81    if data_str.contains("https://jsonfeed.org/version/1.1") {
82        FeedVersion::JsonFeed11
83    } else if data_str.contains("https://jsonfeed.org/version/1") {
84        FeedVersion::JsonFeed10
85    } else {
86        FeedVersion::Unknown
87    }
88}
89
90/// Detect XML-based feed format (RSS or Atom)
91fn detect_xml_format(data: &[u8]) -> FeedVersion {
92    let mut reader = Reader::from_reader(data);
93    reader.config_mut().trim_text(true);
94
95    let mut buf = Vec::new();
96
97    // Read events until we find the root element
98    loop {
99        match reader.read_event_into(&mut buf) {
100            Ok(Event::Start(e) | Event::Empty(e)) => {
101                let name = e.local_name();
102
103                match name.as_ref() {
104                    b"rss" => {
105                        // Check version attribute
106                        for attr in e.attributes().flatten() {
107                            if attr.key.as_ref() == b"version" {
108                                return match attr.value.as_ref() {
109                                    b"0.90" => FeedVersion::Rss090,
110                                    b"0.91" => FeedVersion::Rss091,
111                                    b"0.92" => FeedVersion::Rss092,
112                                    b"2.0" => FeedVersion::Rss20,
113                                    _ => FeedVersion::Unknown,
114                                };
115                            }
116                        }
117                        // No version attribute, assume 2.0
118                        return FeedVersion::Rss20;
119                    }
120                    b"rdf:RDF" | b"RDF" => {
121                        // RSS 1.0 uses RDF
122                        return FeedVersion::Rss10;
123                    }
124                    b"feed" => {
125                        // Atom - check xmlns attribute
126                        for attr in e.attributes().flatten() {
127                            if attr.key.as_ref() == b"xmlns" {
128                                let ns = attr.value.as_ref();
129                                if ns == b"http://www.w3.org/2005/Atom" {
130                                    return FeedVersion::Atom10;
131                                } else if ns == b"http://purl.org/atom/ns#" {
132                                    return FeedVersion::Atom03;
133                                }
134                            }
135                        }
136                        // No xmlns or unknown, assume Atom 1.0
137                        return FeedVersion::Atom10;
138                    }
139                    _ => {
140                        // Unknown root element
141                        return FeedVersion::Unknown;
142                    }
143                }
144            }
145            Ok(Event::Eof) => break,
146            Err(_) => {
147                // XML parsing error, can't detect
148                break;
149            }
150            _ => {}
151        }
152        buf.clear();
153    }
154
155    FeedVersion::Unknown
156}
157
#[cfg(test)]
mod tests {
    use super::*;

    /// An explicit `version="2.0"` on `<rss>` yields RSS 2.0.
    #[test]
    fn test_detect_rss20() {
        assert_eq!(
            detect_format(br#"<?xml version="1.0"?><rss version="2.0"></rss>"#),
            FeedVersion::Rss20
        );
    }

    /// An `<rss>` root without a version attribute defaults to RSS 2.0.
    #[test]
    fn test_detect_rss20_no_version() {
        assert_eq!(
            detect_format(br#"<?xml version="1.0"?><rss></rss>"#),
            FeedVersion::Rss20
        );
    }

    /// `version="0.91"` yields RSS 0.91.
    #[test]
    fn test_detect_rss091() {
        assert_eq!(
            detect_format(br#"<rss version="0.91"></rss>"#),
            FeedVersion::Rss091
        );
    }

    /// `version="0.92"` yields RSS 0.92.
    #[test]
    fn test_detect_rss092() {
        assert_eq!(
            detect_format(br#"<rss version="0.92"></rss>"#),
            FeedVersion::Rss092
        );
    }

    /// `version="0.90"` yields RSS 0.90.
    #[test]
    fn test_detect_rss090() {
        assert_eq!(
            detect_format(br#"<rss version="0.90"></rss>"#),
            FeedVersion::Rss090
        );
    }

    /// A prefixed `<rdf:RDF>` root yields RSS 1.0.
    #[test]
    fn test_detect_rss10_rdf() {
        let input = br#"<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>"#;
        assert_eq!(detect_format(input), FeedVersion::Rss10);
    }

    /// An unprefixed `<RDF>` root yields RSS 1.0 as well.
    #[test]
    fn test_detect_rss10_rdf_uppercase() {
        let input = br#"<RDF xmlns="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></RDF>"#;
        assert_eq!(detect_format(input), FeedVersion::Rss10);
    }

    /// The Atom 1.0 namespace on `<feed>` yields Atom 1.0.
    #[test]
    fn test_detect_atom10() {
        assert_eq!(
            detect_format(br#"<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#),
            FeedVersion::Atom10
        );
    }

    /// A `<feed>` root without any namespace defaults to Atom 1.0.
    #[test]
    fn test_detect_atom10_no_xmlns() {
        assert_eq!(detect_format(br"<feed></feed>"), FeedVersion::Atom10);
    }

    /// The Atom 0.3 namespace on `<feed>` yields Atom 0.3.
    #[test]
    fn test_detect_atom03() {
        assert_eq!(
            detect_format(br#"<feed xmlns="http://purl.org/atom/ns#"></feed>"#),
            FeedVersion::Atom03
        );
    }

    /// The version 1 URL yields JSON Feed 1.0.
    #[test]
    fn test_detect_json_feed_10() {
        assert_eq!(
            detect_format(br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#),
            FeedVersion::JsonFeed10
        );
    }

    /// The version 1.1 URL yields JSON Feed 1.1.
    #[test]
    fn test_detect_json_feed_11() {
        assert_eq!(
            detect_format(br#"{"version": "https://jsonfeed.org/version/1.1"}"#),
            FeedVersion::JsonFeed11
        );
    }

    /// A well-formed document with an unrelated root is unknown.
    #[test]
    fn test_detect_unknown_xml() {
        assert_eq!(detect_format(br"<unknown></unknown>"), FeedVersion::Unknown);
    }

    /// Plain text that is neither XML nor JSON is unknown.
    #[test]
    fn test_detect_invalid_xml() {
        assert_eq!(detect_format(b"not xml at all"), FeedVersion::Unknown);
    }

    /// Leading whitespace does not prevent JSON detection.
    #[test]
    fn test_detect_whitespace_before_json() {
        assert_eq!(
            detect_format(b"  \n  {\"version\": \"https://jsonfeed.org/version/1.1\"}"),
            FeedVersion::JsonFeed11
        );
    }

    /// Leading whitespace does not prevent XML detection.
    #[test]
    fn test_detect_whitespace_before_xml() {
        assert_eq!(
            detect_format(b"  \n  <?xml version=\"1.0\"?><rss version=\"2.0\"></rss>"),
            FeedVersion::Rss20
        );
    }

    /// Empty input is unknown.
    #[test]
    fn test_detect_empty_data() {
        assert_eq!(detect_format(b""), FeedVersion::Unknown);
    }

    /// The fallback search used for oversized JSON recognizes both version URLs
    /// and reports unknown when neither is present.
    #[test]
    fn test_detect_json_version_from_partial() {
        assert_eq!(
            detect_json_version_from_partial(
                br#"{"version": "https://jsonfeed.org/version/1.1", "title": "Test"}"#
            ),
            FeedVersion::JsonFeed11
        );

        assert_eq!(
            detect_json_version_from_partial(
                br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#
            ),
            FeedVersion::JsonFeed10
        );

        assert_eq!(
            detect_json_version_from_partial(br#"{"title": "No version field"}"#),
            FeedVersion::Unknown
        );
    }
}