feedparser_rs/parser/
detect.rs1use crate::types::FeedVersion;
4use quick_xml::{Reader, events::Event};
5
6const MAX_JSON_DETECTION_SIZE: usize = 1024 * 1024; #[must_use]
36pub fn detect_format(data: &[u8]) -> FeedVersion {
37 let first_non_whitespace = data.iter().find(|&&b| !b.is_ascii_whitespace()).copied();
39
40 if first_non_whitespace == Some(b'{') {
41 return detect_json_feed_version(data);
42 }
43
44 detect_xml_format(data)
46}
47
48fn detect_json_feed_version(data: &[u8]) -> FeedVersion {
52 if data.len() > MAX_JSON_DETECTION_SIZE {
54 let truncated = &data[..MAX_JSON_DETECTION_SIZE];
57 return detect_json_version_from_partial(truncated);
60 }
61
62 if let Ok(json) = serde_json::from_slice::<serde_json::Value>(data)
64 && let Some(version) = json.get("version").and_then(|v| v.as_str())
65 {
66 return match version {
67 "https://jsonfeed.org/version/1" => FeedVersion::JsonFeed10,
68 "https://jsonfeed.org/version/1.1" => FeedVersion::JsonFeed11,
69 _ => FeedVersion::Unknown,
70 };
71 }
72 FeedVersion::Unknown
73}
74
75fn detect_json_version_from_partial(data: &[u8]) -> FeedVersion {
77 let data_str = std::str::from_utf8(data).unwrap_or("");
80
81 if data_str.contains("https://jsonfeed.org/version/1.1") {
82 FeedVersion::JsonFeed11
83 } else if data_str.contains("https://jsonfeed.org/version/1") {
84 FeedVersion::JsonFeed10
85 } else {
86 FeedVersion::Unknown
87 }
88}
89
/// Reports whether the byte sequence `Netscape Communications` occurs entirely
/// within the first 512 bytes of `data`.
///
/// Inputs shorter than 512 bytes are scanned in full; an occurrence that
/// starts or ends beyond the 512-byte probe is not seen.
fn has_netscape_rss091_doctype(data: &[u8]) -> bool {
    const MARKER: &[u8] = b"Netscape Communications";
    const PROBE_LEN: usize = 512;

    // `get` yields `None` when the input is shorter than the probe, in which
    // case the whole input is scanned.
    let head = data.get(..PROBE_LEN).unwrap_or(data);

    head.windows(MARKER.len()).any(|chunk| chunk == MARKER)
}
101
102fn detect_xml_format(data: &[u8]) -> FeedVersion {
104 let has_netscape_doctype = has_netscape_rss091_doctype(data);
105
106 let mut reader = Reader::from_reader(data);
107 reader.config_mut().trim_text(true);
108
109 let mut buf = Vec::new();
110
111 loop {
113 match reader.read_event_into(&mut buf) {
114 Ok(Event::Start(e) | Event::Empty(e)) => {
115 let name = e.local_name();
116
117 match name.as_ref() {
118 b"rss" => {
119 for attr in e.attributes().flatten() {
121 if attr.key.as_ref() == b"version" {
122 return match attr.value.as_ref() {
123 b"0.90" => FeedVersion::Rss090,
124 b"0.91" => {
125 if has_netscape_doctype {
126 FeedVersion::Rss091Netscape
127 } else {
128 FeedVersion::Rss091Userland
129 }
130 }
131 b"0.92" => FeedVersion::Rss092,
132 b"2.0" => FeedVersion::Rss20,
133 _ => FeedVersion::Unknown,
134 };
135 }
136 }
137 return FeedVersion::Rss20;
139 }
140 b"rdf:RDF" | b"RDF" => {
141 return FeedVersion::Rss10;
143 }
144 b"feed" => {
145 for attr in e.attributes().flatten() {
147 if attr.key.as_ref() == b"xmlns" {
148 let ns = attr.value.as_ref();
149 if ns == b"http://www.w3.org/2005/Atom" {
150 return FeedVersion::Atom10;
151 } else if ns == b"http://purl.org/atom/ns#" {
152 return FeedVersion::Atom03;
153 }
154 }
155 }
156 return FeedVersion::Atom10;
158 }
159 _ => {
160 return FeedVersion::Unknown;
162 }
163 }
164 }
165 Ok(Event::Eof) => break,
166 Err(_) => {
167 break;
169 }
170 _ => {}
171 }
172 buf.clear();
173 }
174
175 FeedVersion::Unknown
176}
177
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `detect_format` classifies `input` as `expected`.
    #[track_caller]
    fn assert_detects(input: &[u8], expected: FeedVersion) {
        assert_eq!(detect_format(input), expected);
    }

    /// Asserts that the partial-document JSON scanner classifies `input` as `expected`.
    #[track_caller]
    fn assert_partial(input: &[u8], expected: FeedVersion) {
        assert_eq!(detect_json_version_from_partial(input), expected);
    }

    // --- RSS -----------------------------------------------------------

    #[test]
    fn test_detect_rss20() {
        assert_detects(
            br#"<?xml version="1.0"?><rss version="2.0"></rss>"#,
            FeedVersion::Rss20,
        );
    }

    #[test]
    fn test_detect_rss20_no_version() {
        assert_detects(br#"<?xml version="1.0"?><rss></rss>"#, FeedVersion::Rss20);
    }

    #[test]
    fn test_detect_rss091_userland() {
        assert_detects(br#"<rss version="0.91"></rss>"#, FeedVersion::Rss091Userland);
    }

    #[test]
    fn test_detect_rss091_netscape() {
        let document = br#"<?xml version="1.0"?>
<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"
 "http://my.netscape.com/publish/formats/rss-0.91.dtd">
<rss version="0.91"></rss>"#;
        assert_detects(document, FeedVersion::Rss091Netscape);
    }

    #[test]
    fn test_detect_rss092() {
        assert_detects(br#"<rss version="0.92"></rss>"#, FeedVersion::Rss092);
    }

    #[test]
    fn test_detect_rss090() {
        assert_detects(br#"<rss version="0.90"></rss>"#, FeedVersion::Rss090);
    }

    #[test]
    fn test_detect_rss10_rdf() {
        assert_detects(
            br#"<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>"#,
            FeedVersion::Rss10,
        );
    }

    #[test]
    fn test_detect_rss10_rdf_uppercase() {
        assert_detects(
            br#"<RDF xmlns="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></RDF>"#,
            FeedVersion::Rss10,
        );
    }

    // --- Atom ----------------------------------------------------------

    #[test]
    fn test_detect_atom10() {
        assert_detects(
            br#"<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#,
            FeedVersion::Atom10,
        );
    }

    #[test]
    fn test_detect_atom10_no_xmlns() {
        assert_detects(br"<feed></feed>", FeedVersion::Atom10);
    }

    #[test]
    fn test_detect_atom03() {
        assert_detects(
            br#"<feed xmlns="http://purl.org/atom/ns#"></feed>"#,
            FeedVersion::Atom03,
        );
    }

    // --- JSON Feed -----------------------------------------------------

    #[test]
    fn test_detect_json_feed_10() {
        assert_detects(
            br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#,
            FeedVersion::JsonFeed10,
        );
    }

    #[test]
    fn test_detect_json_feed_11() {
        assert_detects(
            br#"{"version": "https://jsonfeed.org/version/1.1"}"#,
            FeedVersion::JsonFeed11,
        );
    }

    // --- Unrecognised / degenerate input -------------------------------

    #[test]
    fn test_detect_unknown_xml() {
        assert_detects(br"<unknown></unknown>", FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_invalid_xml() {
        assert_detects(b"not xml at all", FeedVersion::Unknown);
    }

    #[test]
    fn test_detect_whitespace_before_json() {
        assert_detects(
            b" \n {\"version\": \"https://jsonfeed.org/version/1.1\"}",
            FeedVersion::JsonFeed11,
        );
    }

    #[test]
    fn test_detect_whitespace_before_xml() {
        assert_detects(
            b" \n <?xml version=\"1.0\"?><rss version=\"2.0\"></rss>",
            FeedVersion::Rss20,
        );
    }

    #[test]
    fn test_detect_empty_data() {
        assert_detects(b"", FeedVersion::Unknown);
    }

    // --- Partial JSON scanning -----------------------------------------

    #[test]
    fn test_detect_json_version_from_partial() {
        assert_partial(
            br#"{"version": "https://jsonfeed.org/version/1.1", "title": "Test"}"#,
            FeedVersion::JsonFeed11,
        );
        assert_partial(
            br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#,
            FeedVersion::JsonFeed10,
        );
        assert_partial(br#"{"title": "No version field"}"#, FeedVersion::Unknown);
    }
}