use crate::types::FeedVersion;
use quick_xml::{Reader, events::Event};
/// Largest input, in bytes (1 MiB), that JSON Feed detection parses as a complete JSON document.
///
/// Inputs longer than this are cut to their first `MAX_JSON_DETECTION_SIZE` bytes and only
/// scanned for the version URLs instead of being parsed.
const MAX_JSON_DETECTION_SIZE: usize = 1024 * 1024;
/// Determines which feed format `data` holds.
///
/// Leading ASCII whitespace is skipped. If the first remaining byte is `{`, the input is handed
/// to JSON Feed detection; any other input (including empty or all-whitespace data) goes through
/// XML root-element detection.
#[must_use]
pub fn detect_format(data: &[u8]) -> FeedVersion {
    let first_significant = data.iter().find(|byte| !byte.is_ascii_whitespace());

    match first_significant {
        // An opening brace can only start a JSON object, never an XML document.
        Some(b'{') => detect_json_feed_version(data),
        _ => detect_xml_format(data),
    }
}
/// Determines the JSON Feed version of `data`.
///
/// Inputs longer than `MAX_JSON_DETECTION_SIZE` are not parsed: only their first
/// `MAX_JSON_DETECTION_SIZE` bytes are passed to `detect_json_version_from_partial`. Smaller
/// inputs are parsed as JSON and the top-level `"version"` string is compared against the two
/// JSON Feed version URLs.
///
/// Returns `FeedVersion::Unknown` when the input is not valid JSON, has no string `"version"`
/// member, or carries an unrecognized version URL.
fn detect_json_feed_version(data: &[u8]) -> FeedVersion {
    if data.len() > MAX_JSON_DETECTION_SIZE {
        return detect_json_version_from_partial(&data[..MAX_JSON_DETECTION_SIZE]);
    }

    let Ok(document) = serde_json::from_slice::<serde_json::Value>(data) else {
        return FeedVersion::Unknown;
    };

    match document.get("version").and_then(serde_json::Value::as_str) {
        Some("https://jsonfeed.org/version/1") => FeedVersion::JsonFeed10,
        Some("https://jsonfeed.org/version/1.1") => FeedVersion::JsonFeed11,
        _ => FeedVersion::Unknown,
    }
}
/// Guesses the JSON Feed version from a byte prefix of a document.
///
/// The prefix is searched for the JSON Feed version URLs rather than parsed. The bytes are
/// decoded lossily: a prefix cut at a fixed byte offset may end in the middle of a multi-byte
/// UTF-8 character (or contain other invalid bytes), and that must not hide the version URLs,
/// which are plain ASCII and therefore survive lossy decoding unchanged.
///
/// Returns `FeedVersion::JsonFeed11` if the 1.1 URL occurs anywhere in `data`,
/// `FeedVersion::JsonFeed10` if only the 1.0 URL occurs, and `FeedVersion::Unknown` otherwise.
fn detect_json_version_from_partial(data: &[u8]) -> FeedVersion {
    // Borrows when `data` is valid UTF-8; allocates only if invalid sequences must be replaced.
    let text = String::from_utf8_lossy(data);

    // The 1.1 URL must be tested first because the 1.0 URL is a prefix of it.
    if text.contains("https://jsonfeed.org/version/1.1") {
        FeedVersion::JsonFeed11
    } else if text.contains("https://jsonfeed.org/version/1") {
        FeedVersion::JsonFeed10
    } else {
        FeedVersion::Unknown
    }
}
/// Reports whether the byte string `Netscape Communications` occurs within the first 512 bytes
/// of `data`.
///
/// The comparison is case-sensitive, and an occurrence that extends past byte 512 is not
/// counted.
fn has_netscape_rss091_doctype(data: &[u8]) -> bool {
    const MARKER: &[u8] = b"Netscape Communications";
    const PROBE_LEN: usize = 512;

    // Inputs shorter than the probe length are examined in full.
    let head = data.get(..PROBE_LEN).unwrap_or(data);
    (0..head.len()).any(|start| head[start..].starts_with(MARKER))
}
/// Classifies an XML feed by inspecting the first element encountered in `data`.
///
/// Events that precede the first element are skipped. The first start or empty-element tag
/// decides the result from its local name:
///
/// * `rss` — the first `version` attribute selects the RSS flavour (`0.90`, `0.91`, `0.92`,
///   `2.0`); any other value yields `FeedVersion::Unknown`, and a missing attribute is treated
///   as RSS 2.0. For `0.91`, the Netscape variant is reported when
///   `has_netscape_rss091_doctype` finds its marker in `data`, the Userland variant otherwise.
/// * `RDF` — always reported as RSS 1.0.
/// * `feed` — the first `xmlns` attribute holding a known Atom namespace selects Atom 1.0 or
///   Atom 0.3; without one, Atom 1.0 is assumed.
/// * anything else — `FeedVersion::Unknown`.
///
/// Reaching end of input, or hitting a reader error, before any element also yields
/// `FeedVersion::Unknown`.
fn detect_xml_format(data: &[u8]) -> FeedVersion {
    let mut reader = Reader::from_reader(data);
    reader.config_mut().trim_text(true);
    let mut scratch = Vec::new();

    loop {
        match reader.read_event_into(&mut scratch) {
            Ok(Event::Start(root) | Event::Empty(root)) => {
                let name = root.local_name();
                return match name.as_ref() {
                    b"rss" => {
                        let declared = root
                            .attributes()
                            .flatten()
                            .find(|attr| attr.key.as_ref() == b"version");
                        match declared {
                            // No `version` attribute at all: assume RSS 2.0.
                            None => FeedVersion::Rss20,
                            Some(attr) => match attr.value.as_ref() {
                                b"0.90" => FeedVersion::Rss090,
                                // Both 0.91 dialects share the version string; the marker
                                // found near the start of the input tells them apart.
                                b"0.91" if has_netscape_rss091_doctype(data) => {
                                    FeedVersion::Rss091Netscape
                                }
                                b"0.91" => FeedVersion::Rss091Userland,
                                b"0.92" => FeedVersion::Rss092,
                                b"2.0" => FeedVersion::Rss20,
                                _ => FeedVersion::Unknown,
                            },
                        }
                    }
                    b"rdf:RDF" | b"RDF" => FeedVersion::Rss10,
                    b"feed" => root
                        .attributes()
                        .flatten()
                        .filter(|attr| attr.key.as_ref() == b"xmlns")
                        .find_map(|attr| match attr.value.as_ref() {
                            b"http://www.w3.org/2005/Atom" => Some(FeedVersion::Atom10),
                            b"http://purl.org/atom/ns#" => Some(FeedVersion::Atom03),
                            _ => None,
                        })
                        // No recognized namespace declaration: assume Atom 1.0.
                        .unwrap_or(FeedVersion::Atom10),
                    _ => FeedVersion::Unknown,
                };
            }
            Ok(Event::Eof) | Err(_) => return FeedVersion::Unknown,
            Ok(_) => {}
        }
        // The buffer is reused for the next event, so drop the bytes of the one just skipped.
        scratch.clear();
    }
}
// Unit tests for format detection. Each test feeds a minimal document to `detect_format`
// (or, for the last test, directly to `detect_json_version_from_partial`) and checks the
// reported `FeedVersion`.
#[cfg(test)]
mod tests {
use super::*;
// --- RSS: version taken from the `version` attribute of `<rss>` ---
#[test]
fn test_detect_rss20() {
let xml = br#"<?xml version="1.0"?><rss version="2.0"></rss>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss20);
}
// An `<rss>` root without a `version` attribute is reported as RSS 2.0.
#[test]
fn test_detect_rss20_no_version() {
let xml = br#"<?xml version="1.0"?><rss></rss>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss20);
}
// Version 0.91 without the Netscape marker is reported as the Userland variant.
#[test]
fn test_detect_rss091_userland() {
let xml = br#"<rss version="0.91"></rss>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss091Userland);
}
// Version 0.91 preceded by the Netscape DOCTYPE is reported as the Netscape variant.
#[test]
fn test_detect_rss091_netscape() {
let xml = br#"<?xml version="1.0"?>
<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"
"http://my.netscape.com/publish/formats/rss-0.91.dtd">
<rss version="0.91"></rss>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss091Netscape);
}
#[test]
fn test_detect_rss092() {
let xml = br#"<rss version="0.92"></rss>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss092);
}
#[test]
fn test_detect_rss090() {
let xml = br#"<rss version="0.90"></rss>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss090);
}
// --- RDF: an `RDF` root, with or without a namespace prefix, is reported as RSS 1.0 ---
#[test]
fn test_detect_rss10_rdf() {
let xml = br#"<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></rdf:RDF>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss10);
}
#[test]
fn test_detect_rss10_rdf_uppercase() {
let xml = br#"<RDF xmlns="http://www.w3.org/1999/02/22-rdf-syntax-ns#"></RDF>"#;
assert_eq!(detect_format(xml), FeedVersion::Rss10);
}
// --- Atom: version taken from the `xmlns` attribute of `<feed>` ---
#[test]
fn test_detect_atom10() {
let xml = br#"<feed xmlns="http://www.w3.org/2005/Atom"></feed>"#;
assert_eq!(detect_format(xml), FeedVersion::Atom10);
}
// A `<feed>` root without an `xmlns` attribute is reported as Atom 1.0.
#[test]
fn test_detect_atom10_no_xmlns() {
let xml = br"<feed></feed>";
assert_eq!(detect_format(xml), FeedVersion::Atom10);
}
#[test]
fn test_detect_atom03() {
let xml = br#"<feed xmlns="http://purl.org/atom/ns#"></feed>"#;
assert_eq!(detect_format(xml), FeedVersion::Atom03);
}
// --- JSON Feed: version taken from the top-level "version" URL ---
#[test]
fn test_detect_json_feed_10() {
let json = br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#;
assert_eq!(detect_format(json), FeedVersion::JsonFeed10);
}
#[test]
fn test_detect_json_feed_11() {
let json = br#"{"version": "https://jsonfeed.org/version/1.1"}"#;
assert_eq!(detect_format(json), FeedVersion::JsonFeed11);
}
// --- Inputs that are not a recognized feed ---
#[test]
fn test_detect_unknown_xml() {
let xml = br"<unknown></unknown>";
assert_eq!(detect_format(xml), FeedVersion::Unknown);
}
#[test]
fn test_detect_invalid_xml() {
let xml = b"not xml at all";
assert_eq!(detect_format(xml), FeedVersion::Unknown);
}
// --- Leading whitespace must not affect detection ---
#[test]
fn test_detect_whitespace_before_json() {
let json = b" \n {\"version\": \"https://jsonfeed.org/version/1.1\"}";
assert_eq!(detect_format(json), FeedVersion::JsonFeed11);
}
#[test]
fn test_detect_whitespace_before_xml() {
let xml = b" \n <?xml version=\"1.0\"?><rss version=\"2.0\"></rss>";
assert_eq!(detect_format(xml), FeedVersion::Rss20);
}
#[test]
fn test_detect_empty_data() {
let data = b"";
assert_eq!(detect_format(data), FeedVersion::Unknown);
}
// Exercises the substring-based fallback directly, using small inputs rather than
// documents that exceed the size limit.
#[test]
fn test_detect_json_version_from_partial() {
use super::detect_json_version_from_partial;
let json_11 = br#"{"version": "https://jsonfeed.org/version/1.1", "title": "Test"}"#;
assert_eq!(
detect_json_version_from_partial(json_11),
FeedVersion::JsonFeed11
);
let json_10 = br#"{"version": "https://jsonfeed.org/version/1", "title": "Test"}"#;
assert_eq!(
detect_json_version_from_partial(json_10),
FeedVersion::JsonFeed10
);
let unknown = br#"{"title": "No version field"}"#;
assert_eq!(
detect_json_version_from_partial(unknown),
FeedVersion::Unknown
);
}
}