use crate::{
ParserLimits,
error::{FeedError, Result},
namespace::namespaces as ns_uris,
types::{FeedVersion, ParsedFeed},
};
use quick_xml::{
Reader,
events::{BytesRef, Event},
};
use std::{collections::HashMap, io::Write as _};
pub use crate::types::{FromAttributes, LimitedCollectionExt};
pub use crate::util::text::bytes_to_string;
/// Initial capacity, in bytes, of the `Vec<u8>` buffers that hold XML events.
pub const EVENT_BUFFER_CAPACITY: usize = 1024;
/// Initial capacity, in bytes, of the buffers that accumulate text output.
pub const TEXT_BUFFER_CAPACITY: usize = 256;
/// Creates an empty byte buffer pre-sized to [`EVENT_BUFFER_CAPACITY`].
///
/// The returned vector has length zero; only its capacity is reserved.
#[inline]
#[must_use]
#[allow(dead_code)]
pub fn new_event_buffer() -> Vec<u8> {
    Vec::<u8>::with_capacity(EVENT_BUFFER_CAPACITY)
}
/// Creates an empty `String` pre-sized to [`TEXT_BUFFER_CAPACITY`].
///
/// The returned string has length zero; only its capacity is reserved.
#[inline]
#[must_use]
#[allow(dead_code)]
pub fn new_text_buffer() -> String {
    let reserved = TEXT_BUFFER_CAPACITY;
    String::with_capacity(reserved)
}
/// State bundle for parsing one XML document held in memory.
#[allow(dead_code)]
pub struct ParseContext<'a> {
/// XML reader over the borrowed input bytes.
pub reader: Reader<&'a [u8]>,
/// Scratch byte buffer; created with `EVENT_BUFFER_CAPACITY` and emptied by `clear_buf`.
pub buf: Vec<u8>,
/// Limits consulted when validating the input size and the nesting depth.
pub limits: ParserLimits,
/// Nesting-depth counter; starts at 1 and is adjusted by `check_depth` / `decrement_depth`.
pub depth: usize,
}
impl<'a> ParseContext<'a> {
    /// Builds a context over `data` after validating its size against `limits`.
    ///
    /// The depth counter starts at 1 (presumably accounting for the root
    /// element — confirm against callers).
    ///
    /// # Errors
    /// Returns [`FeedError::InvalidFormat`] carrying the size-check message when
    /// `limits.check_feed_size` rejects `data.len()`.
    #[allow(dead_code)]
    pub fn new(data: &'a [u8], limits: ParserLimits) -> Result<Self> {
        if let Err(size_err) = limits.check_feed_size(data.len()) {
            return Err(FeedError::InvalidFormat(size_err.to_string()));
        }
        Ok(Self {
            reader: Reader::from_reader(data),
            buf: Vec::with_capacity(EVENT_BUFFER_CAPACITY),
            limits,
            depth: 1,
        })
    }

    /// Enters one nesting level and verifies the configured maximum.
    ///
    /// The counter is incremented before the comparison and is left
    /// incremented even when the check fails.
    ///
    /// # Errors
    /// Returns [`FeedError::InvalidFormat`] when the new depth exceeds
    /// `limits.max_nesting_depth`.
    #[inline]
    #[allow(dead_code)]
    pub fn check_depth(&mut self) -> Result<()> {
        self.depth += 1;
        let max_depth = self.limits.max_nesting_depth;
        if self.depth <= max_depth {
            return Ok(());
        }
        Err(FeedError::InvalidFormat(format!(
            "XML nesting depth {} exceeds maximum {}",
            self.depth, max_depth
        )))
    }

    /// Leaves one nesting level; the counter never goes below zero.
    #[inline]
    #[allow(dead_code)]
    pub const fn decrement_depth(&mut self) {
        if self.depth > 0 {
            self.depth -= 1;
        }
    }

    /// Empties the scratch buffer while keeping its allocation.
    #[inline]
    #[allow(dead_code)]
    pub fn clear_buf(&mut self) {
        self.buf.clear();
    }
}
/// Creates a [`ParsedFeed`] pre-sized via `ParsedFeed::with_capacity(max_entries)`.
///
/// The feed is tagged with `version` and its encoding is set to `"utf-8"`.
#[inline]
pub fn init_feed(version: FeedVersion, max_entries: usize) -> ParsedFeed {
    let mut parsed = ParsedFeed::with_capacity(max_entries);
    parsed.encoding = "utf-8".to_owned();
    parsed.version = version;
    parsed
}
/// Verifies that `depth` does not exceed `max_depth`.
///
/// A depth equal to the maximum is accepted.
///
/// # Errors
/// Returns [`FeedError::InvalidFormat`] naming both values when
/// `depth > max_depth`.
#[inline]
pub fn check_depth(depth: usize, max_depth: usize) -> Result<()> {
    if depth <= max_depth {
        Ok(())
    } else {
        Err(FeedError::InvalidFormat(format!(
            "XML nesting depth {depth} exceeds maximum {max_depth}"
        )))
    }
}
/// Strips `prefix` from `name` and returns the remaining local name.
///
/// Returns `None` unless `name` starts with `prefix` and the remainder is
/// non-empty valid UTF-8 consisting only of alphanumeric characters, `-`
/// and `_`.
#[inline]
pub fn extract_ns_local_name<'a>(name: &'a [u8], prefix: &[u8]) -> Option<&'a str> {
    let remainder = name.strip_prefix(prefix)?;
    let local = std::str::from_utf8(remainder).ok()?;
    let is_allowed = |c: char| c.is_alphanumeric() || matches!(c, '-' | '_');
    (!local.is_empty() && local.chars().all(is_allowed)).then_some(local)
}
/// Splits a qualified tag name at its first colon.
///
/// Returns `(prefix_including_colon, local_part)`, or `None` when there is no
/// colon or nothing follows it.
#[inline]
fn split_ns_tag(name: &[u8]) -> Option<(&[u8], &[u8])> {
    let colon_idx = name.iter().position(|&byte| byte == b':')?;
    // The split point sits just after the colon, so the prefix keeps it.
    let (prefix_with_colon, local) = name.split_at(colon_idx + 1);
    if local.is_empty() {
        None
    } else {
        Some((prefix_with_colon, local))
    }
}
/// Resolves `name` to a local name within the namespace identified by `target_uri`.
///
/// The conventional `canonical_prefix` (including its trailing colon) is tried
/// first. Otherwise the tag's own prefix is looked up in `namespaces` and
/// accepted only when it maps to exactly `target_uri`. In both cases the local
/// part must pass the checks in `extract_ns_local_name`.
#[inline]
fn match_ns_tag_by_uri<'a>(
    name: &'a [u8],
    canonical_prefix: &[u8],
    target_uri: &str,
    namespaces: &HashMap<String, String>,
) -> Option<&'a str> {
    if let hit @ Some(_) = extract_ns_local_name(name, canonical_prefix) {
        return hit;
    }
    let (declared_prefix, _local) = split_ns_tag(name)?;
    // Drop the trailing colon to obtain the key used in the namespace map.
    let (_colon, prefix_bytes) = declared_prefix.split_last()?;
    let prefix = std::str::from_utf8(prefix_bytes).ok()?;
    let bound_to_target = namespaces
        .get(prefix)
        .is_some_and(|uri| uri.as_str() == target_uri);
    if bound_to_target {
        extract_ns_local_name(name, declared_prefix)
    } else {
        None
    }
}
/// Returns the local name of `name` when it is a Dublin Core element.
///
/// A tag qualifies if it uses the conventional `dc:` prefix, or any prefix that
/// `namespaces` maps to `ns_uris::DUBLIN_CORE`; otherwise `None` is returned.
#[inline]
pub fn is_dc_tag<'a>(name: &'a [u8], namespaces: &HashMap<String, String>) -> Option<&'a str> {
match_ns_tag_by_uri(name, b"dc:", ns_uris::DUBLIN_CORE, namespaces)
}
/// Returns the local name of `name` when it starts with the literal `content:` prefix.
///
/// Only the fixed prefix is recognised; declared namespace mappings are not consulted.
#[inline]
pub fn is_content_tag(name: &[u8]) -> Option<&str> {
extract_ns_local_name(name, b"content:")
}
#[inline]
pub fn is_syn_tag(name: &[u8]) -> Option<&str> {
extract_ns_local_name(name, b"sy:").or_else(|| extract_ns_local_name(name, b"syn:"))
}
/// Returns the local name of `name` when it is a Media RSS element.
///
/// A tag qualifies if it uses the conventional `media:` prefix, or any prefix that
/// `namespaces` maps to `ns_uris::MEDIA`; otherwise `None` is returned.
#[inline]
pub fn is_media_tag<'a>(name: &'a [u8], namespaces: &HashMap<String, String>) -> Option<&'a str> {
match_ns_tag_by_uri(name, b"media:", ns_uris::MEDIA, namespaces)
}
/// Returns the local name of `name` when it starts with the literal `slash:` prefix.
///
/// Only the fixed prefix is recognised; declared namespace mappings are not consulted.
#[inline]
pub fn is_slash_tag(name: &[u8]) -> Option<&str> {
extract_ns_local_name(name, b"slash:")
}
/// Returns the local name of `name` when it starts with the literal `wfw:` prefix.
///
/// Only the fixed prefix is recognised; declared namespace mappings are not consulted.
#[inline]
pub fn is_wfw_tag(name: &[u8]) -> Option<&str> {
extract_ns_local_name(name, b"wfw:")
}
/// Returns the local name of `name` when it starts with the literal `georss:` prefix.
///
/// Only the fixed prefix is recognised; declared namespace mappings are not consulted.
#[inline]
pub fn is_georss_tag(name: &[u8]) -> Option<&str> {
extract_ns_local_name(name, b"georss:")
}
/// Returns the local name of `name` when it starts with the literal `geo:` prefix.
///
/// Only the fixed prefix is recognised; declared namespace mappings are not consulted.
#[inline]
pub fn is_geo_tag(name: &[u8]) -> Option<&str> {
extract_ns_local_name(name, b"geo:")
}
/// Returns the local name of `name` when it starts with the literal `thr:` prefix.
///
/// Only the fixed prefix is recognised; declared namespace mappings are not consulted.
#[inline]
pub fn is_thr_tag(name: &[u8]) -> Option<&str> {
extract_ns_local_name(name, b"thr:")
}
/// Reports whether `name` is the iTunes element whose local name is `tag`.
///
/// Prefixed names match when the prefix is `itunes:` or is mapped in
/// `namespaces` to `ns_uris::ITUNES`, and the local part equals `tag`.
/// Names without any colon match when they equal `tag` outright. Every other
/// prefixed name is rejected.
#[inline]
pub fn is_itunes_tag(name: &[u8], tag: &[u8], namespaces: &HashMap<String, String>) -> bool {
    match match_ns_tag_by_uri(name, b"itunes:", ns_uris::ITUNES, namespaces) {
        Some(local) => local.as_bytes() == tag,
        None => !name.contains(&b':') && name == tag,
    }
}
/// Returns the unescaped value of the element's `xml:base` (or bare `base`) attribute.
///
/// Only the first attribute with a matching key is considered. `None` is
/// returned when no such attribute exists, when its raw value is longer than
/// `max_attr_length` bytes, or when unescaping fails. Attributes that fail to
/// parse are skipped.
pub fn extract_xml_base(
    element: &quick_xml::events::BytesStart,
    max_attr_length: usize,
) -> Option<String> {
    let base_attr = element.attributes().flatten().find(|attr| {
        let key: &[u8] = attr.key.as_ref();
        matches!(key, b"xml:base" | b"base")
    })?;
    if base_attr.value.len() > max_attr_length {
        return None;
    }
    base_attr
        .unescape_value()
        .ok()
        .map(std::borrow::Cow::into_owned)
}
/// Returns the unescaped value of the element's `xml:lang` (or bare `lang`) attribute.
///
/// Only the first attribute with a matching key is considered. `None` is
/// returned when no such attribute exists, when its raw value is longer than
/// `max_attr_length` bytes, or when unescaping fails. Attributes that fail to
/// parse are skipped.
pub fn extract_xml_lang(
    element: &quick_xml::events::BytesStart,
    max_attr_length: usize,
) -> Option<String> {
    let lang_attr = element.attributes().flatten().find(|attr| {
        let key: &[u8] = attr.key.as_ref();
        matches!(key, b"xml:lang" | b"lang")
    })?;
    if lang_attr.value.len() > max_attr_length {
        return None;
    }
    lang_attr
        .unescape_value()
        .ok()
        .map(std::borrow::Cow::into_owned)
}
/// Records the namespace declarations found on `element` into `feed.namespaces`.
///
/// `xmlns="…"` is stored under the empty prefix and `xmlns:p="…"` under `p`.
/// Attributes that fail to parse and non-namespace attributes are skipped.
///
/// Problems never abort parsing; they set `feed.bozo` and overwrite
/// `feed.bozo_exception`:
/// - a prefix that is not valid UTF-8 (declaration skipped),
/// - a raw URI longer than `limits.max_attribute_length` (declaration skipped),
/// - a URI that cannot be unescaped (declaration skipped),
/// - the map already holding `limits.max_namespaces` entries (processing stops).
pub fn extract_namespaces(
    element: &quick_xml::events::BytesStart,
    feed: &mut ParsedFeed,
    limits: &ParserLimits,
) {
    /// Marks the feed as malformed and records the reason.
    fn flag_bozo(feed: &mut ParsedFeed, reason: String) {
        feed.bozo = true;
        feed.bozo_exception = Some(reason);
    }

    for attr in element.attributes().flatten() {
        let key = attr.key.as_ref();
        let prefix = if key == b"xmlns" {
            String::new()
        } else {
            let Some(suffix) = key.strip_prefix(b"xmlns:") else {
                continue;
            };
            match std::str::from_utf8(suffix) {
                Ok(valid) => valid.to_owned(),
                Err(_) => {
                    flag_bozo(feed, "Namespace prefix contains invalid UTF-8".to_string());
                    continue;
                }
            }
        };

        if attr.value.len() > limits.max_attribute_length {
            flag_bozo(
                feed,
                format!(
                    "Namespace URI exceeds maximum attribute length of {} bytes",
                    limits.max_attribute_length
                ),
            );
            continue;
        }

        let Ok(unescaped) = attr.unescape_value() else {
            flag_bozo(feed, "Malformed namespace URI value".to_string());
            continue;
        };

        if feed.namespaces.len() >= limits.max_namespaces {
            flag_bozo(
                feed,
                format!("Namespace limit exceeded: {}", limits.max_namespaces),
            );
            break;
        }
        feed.namespaces.insert(prefix, unescaped.into_owned());
    }
}
/// Collects the text content that follows the current reader position.
///
/// Text and CDATA chunks are appended as-is and entity references are resolved
/// through `resolve_entity`. Reading stops at the first `End` event or at end
/// of input. `Start` events are ignored rather than tracked, so the end tag of
/// a nested child element also stops reading.
///
/// Returns the accumulated text, trimmed and then stripped of NUL characters,
/// together with a flag that is `true` when at least one entity reference
/// could not be resolved.
///
/// # Errors
/// Propagates reader errors, and the error from `append_bytes` when the text
/// would exceed `limits.max_text_length`.
pub fn read_text(
    reader: &mut Reader<&[u8]>,
    buf: &mut Vec<u8>,
    limits: &ParserLimits,
) -> Result<(String, bool)> {
    let max_len = limits.max_text_length;
    let mut collected = String::with_capacity(TEXT_BUFFER_CAPACITY);
    let mut saw_unresolved_entity = false;
    loop {
        match reader.read_event_into(buf)? {
            Event::End(_) | Event::Eof => break,
            Event::Text(chunk) => append_bytes(&mut collected, chunk.as_ref(), max_len)?,
            Event::CData(chunk) => append_bytes(&mut collected, chunk.as_ref(), max_len)?,
            Event::GeneralRef(entity) => {
                let (resolved, is_bozo) = resolve_entity(&entity);
                saw_unresolved_entity |= is_bozo;
                append_bytes(&mut collected, resolved.as_bytes(), max_len)?;
            }
            _ => {}
        }
        buf.clear();
    }
    Ok((collected.trim().replace('\0', ""), saw_unresolved_entity))
}
/// Resolves one entity reference to its replacement text.
///
/// Character references and the five predefined XML entities (`amp`, `lt`,
/// `gt`, `quot`, `apos`) resolve to their character with the flag `false`.
/// Anything else, including a malformed character reference, is passed
/// through verbatim as `&name;` with the flag `true`.
fn resolve_entity(e: &BytesRef<'_>) -> (String, bool) {
    let raw: &[u8] = e.as_ref();
    let passthrough = || {
        let name = String::from_utf8_lossy(raw);
        (format!("&{name};"), true)
    };
    match e.resolve_char_ref() {
        Ok(Some(ch)) => (ch.to_string(), false),
        Err(_) => passthrough(),
        // Not a character reference: try the predefined named entities.
        Ok(None) => {
            let replacement = match raw {
                b"amp" => "&",
                b"lt" => "<",
                b"gt" => ">",
                b"quot" => "\"",
                b"apos" => "'",
                _ => return passthrough(),
            };
            (replacement.to_string(), false)
        }
    }
}
/// Appends `bytes` to `text` without letting `text` grow beyond `max_len` bytes.
///
/// Valid UTF-8 is appended unchanged. Invalid sequences are replaced with
/// U+FFFD. Each replacement is three bytes long, so the limit is enforced on
/// the decoded length rather than only on the raw input length.
///
/// # Errors
/// Returns [`FeedError::InvalidFormat`] when appending would push `text` past
/// `max_len` bytes; `text` is left unmodified in that case.
#[inline]
fn append_bytes(text: &mut String, bytes: &[u8], max_len: usize) -> Result<()> {
    let too_long = || {
        FeedError::InvalidFormat(format!(
            "Text field exceeds maximum length of {max_len} bytes"
        ))
    };
    // Cheap pre-check on the raw size so oversized chunks are rejected before decoding.
    if text.len().saturating_add(bytes.len()) > max_len {
        return Err(too_long());
    }
    // Borrows when `bytes` is valid UTF-8; allocates only if replacements are needed.
    let decoded = String::from_utf8_lossy(bytes);
    // Lossy decoding can expand the data, so the limit is re-checked on what is appended.
    if text.len().saturating_add(decoded.len()) > max_len {
        return Err(too_long());
    }
    text.push_str(&decoded);
    Ok(())
}
/// Consumes events until the element that was just opened is closed.
///
/// A local counter starts at 1 for the element being skipped, grows with each
/// nested start tag and shrinks with each end tag. Skipping ends when the
/// counter reaches zero or the input ends.
///
/// `current_depth` is added to the local counter when enforcing the nesting
/// limit (presumably the caller's depth at the skipped element — confirm
/// against callers).
///
/// # Errors
/// Returns [`FeedError::InvalidFormat`] when `current_depth` plus the local
/// depth exceeds `limits.max_nesting_depth`, and propagates reader errors.
pub fn skip_element(
    reader: &mut Reader<&[u8]>,
    buf: &mut Vec<u8>,
    limits: &ParserLimits,
    current_depth: usize,
) -> Result<()> {
    let max_depth = limits.max_nesting_depth;
    let mut open_elements: usize = 1;
    loop {
        match reader.read_event_into(buf)? {
            Event::Start(_) => {
                open_elements += 1;
                if current_depth + open_elements > max_depth {
                    return Err(FeedError::InvalidFormat(format!(
                        "XML nesting depth exceeds maximum of {}",
                        max_depth
                    )));
                }
            }
            Event::End(_) => {
                open_elements = open_elements.saturating_sub(1);
                if open_elements == 0 {
                    break;
                }
            }
            Event::Eof => break,
            _ => {}
        }
        buf.clear();
    }
    Ok(())
}
/// Reads XHTML content, returning the serialized markup inside its wrapper element.
///
/// Events are skipped until the first start tag, which is treated as the
/// wrapper and is not included in the output. Everything inside it is then
/// serialized by `serialize_inner_xml`, and the flag is `false`.
///
/// If an end tag, end of input or a reader error is met before any start tag,
/// the result is an empty string with the flag set to `true`.
///
/// # Errors
/// Propagates the error from `serialize_inner_xml`.
pub fn read_xhtml_content(
    reader: &mut Reader<&[u8]>,
    buf: &mut Vec<u8>,
    limits: &ParserLimits,
) -> Result<(String, bool)> {
    loop {
        match reader.read_event_into(buf) {
            Ok(Event::Start(_)) => break,
            Ok(Event::End(_)) | Ok(Event::Eof) | Err(_) => {
                return Ok((String::new(), true));
            }
            Ok(_) => {}
        }
        buf.clear();
    }
    buf.clear();
    let inner_markup = serialize_inner_xml(reader, buf, limits)?;
    Ok((inner_markup, false))
}
/// Same as `read_xhtml_content`, but returns only the markup and drops the flag.
///
/// # Errors
/// Propagates the error from `read_xhtml_content`.
#[inline]
pub fn read_xhtml_content_str(
    reader: &mut Reader<&[u8]>,
    buf: &mut Vec<u8>,
    limits: &ParserLimits,
) -> Result<String> {
    let (markup, _had_bozo) = read_xhtml_content(reader, buf, limits)?;
    Ok(markup)
}
/// Re-serializes the events inside the current element into a string.
///
/// Start, end, empty, text, CDATA and comment events are written back through
/// a `quick_xml::Writer`. `&apos;` and `&quot;` references are emitted as the
/// literal characters; every other entity reference is written back as
/// `&name;`. Serialization stops at the end tag that closes the enclosing
/// element (which is not written), at end of input, or on a reader error.
/// Write errors are ignored.
///
/// The result is decoded lossily as UTF-8 and trimmed.
///
/// # Errors
/// Returns [`FeedError::InvalidFormat`] when the serialized output is longer
/// than `limits.max_text_length` bytes. The check runs once, after
/// serialization has finished.
fn serialize_inner_xml(
    reader: &mut Reader<&[u8]>,
    buf: &mut Vec<u8>,
    limits: &ParserLimits,
) -> Result<String> {
    let max_len = limits.max_text_length;
    let mut output: Vec<u8> = Vec::with_capacity(TEXT_BUFFER_CAPACITY);
    {
        let mut writer = quick_xml::Writer::new(&mut output);
        // Number of child elements currently open below the enclosing element.
        let mut open_children: usize = 0;
        loop {
            match reader.read_event_into(buf) {
                Ok(Event::Eof) | Err(_) => break,
                // An end tag with no open child closes the enclosing element.
                Ok(Event::End(_)) if open_children == 0 => break,
                Ok(event @ Event::Start(_)) => {
                    open_children += 1;
                    let _ = writer.write_event(event);
                }
                Ok(event @ Event::End(_)) => {
                    open_children -= 1;
                    let _ = writer.write_event(event);
                }
                Ok(Event::GeneralRef(entity)) => {
                    let sink = writer.get_mut();
                    match entity.as_ref() {
                        b"apos" => {
                            let _ = sink.write_all(b"'");
                        }
                        b"quot" => {
                            let _ = sink.write_all(b"\"");
                        }
                        _ => {
                            let _ = sink.write_all(b"&");
                            let _ = sink.write_all(entity.as_ref());
                            let _ = sink.write_all(b";");
                        }
                    }
                }
                Ok(
                    event @ (Event::Text(_)
                    | Event::CData(_)
                    | Event::Empty(_)
                    | Event::Comment(_)),
                ) => {
                    let _ = writer.write_event(event);
                }
                Ok(_) => {}
            }
            buf.clear();
        }
    }
    if output.len() > max_len {
        return Err(FeedError::InvalidFormat(format!(
            "XHTML content exceeds maximum length of {max_len} bytes"
        )));
    }
    Ok(String::from_utf8_lossy(&output).trim().to_owned())
}
/// Same as `read_text`, but returns only the text and drops the flag.
///
/// # Errors
/// Propagates the error from `read_text`.
#[inline]
pub fn read_text_str(
    reader: &mut Reader<&[u8]>,
    buf: &mut Vec<u8>,
    limits: &ParserLimits,
) -> Result<String> {
    let (text, _had_bozo) = read_text(reader, buf, limits)?;
    Ok(text)
}
/// Consumes events until an end tag whose local name equals `tag`, or end of input.
///
/// Nesting is not tracked: the first matching end tag stops the scan, even if
/// it belongs to a nested element with the same name.
///
/// # Errors
/// Propagates reader errors.
pub fn skip_to_end(reader: &mut Reader<&[u8]>, buf: &mut Vec<u8>, tag: &[u8]) -> Result<()> {
    loop {
        match reader.read_event_into(buf)? {
            Event::Eof => break,
            Event::End(end) if end.local_name().as_ref() == tag => break,
            _ => {}
        }
        buf.clear();
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// XHTML fixture shared by the markup-preservation and wrapper-stripping tests.
    const XHTML_PARAGRAPH: &[u8] = b"<content type=\"xhtml\"><div xmlns=\"http://www.w3.org/1999/xhtml\"><p>Hello <b>world</b></p></div></content>";

    /// Advances `reader` just past the first start tag and leaves `buf` empty.
    fn advance_past_start(reader: &mut Reader<&[u8]>, buf: &mut Vec<u8>) {
        loop {
            match reader.read_event_into(buf) {
                Ok(Event::Start(_)) => break,
                Ok(Event::Eof) => panic!("Unexpected EOF"),
                _ => {}
            }
            buf.clear();
        }
        buf.clear();
    }

    /// Runs `read_text` on the content of the first element in `xml`.
    fn text_of(xml: &[u8], limits: &ParserLimits) -> Result<(String, bool)> {
        let mut reader = Reader::from_reader(xml);
        let mut buf = Vec::new();
        advance_past_start(&mut reader, &mut buf);
        read_text(&mut reader, &mut buf, limits)
    }

    /// Runs `read_text` with default limits and unwraps the result.
    fn default_text_of(xml: &[u8]) -> (String, bool) {
        text_of(xml, &ParserLimits::default()).unwrap()
    }

    /// Runs `read_xhtml_content` on the content of the first element in `xml`.
    fn xhtml_of(xml: &[u8]) -> (String, bool) {
        let mut reader = Reader::from_reader(xml);
        let mut buf = Vec::new();
        advance_past_start(&mut reader, &mut buf);
        read_xhtml_content(&mut reader, &mut buf, &ParserLimits::default()).unwrap()
    }

    /// Extracts namespaces from the first start or empty element in `xml`.
    fn parse_namespaces_from_element(xml: &[u8], limits: &ParserLimits) -> ParsedFeed {
        let mut reader = Reader::from_reader(xml);
        let mut buf = Vec::new();
        let mut feed = init_feed(crate::types::FeedVersion::Rss20, limits.max_entries);
        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(e) | Event::Empty(e)) => {
                    extract_namespaces(&e, &mut feed, limits);
                    break;
                }
                Ok(Event::Eof) => break,
                _ => {}
            }
            buf.clear();
        }
        feed
    }

    #[test]
    fn test_bytes_to_string_valid_utf8() {
        let bytes = b"Hello, World!";
        assert_eq!(bytes_to_string(bytes), "Hello, World!");
    }

    #[test]
    fn test_bytes_to_string_invalid_utf8() {
        let bytes = &[0xff, 0xfe, 0x48, 0x65, 0x6c, 0x6c, 0x6f];
        let result = bytes_to_string(bytes);
        assert!(result.contains("Hello"));
    }

    #[test]
    fn test_read_text_basic() {
        let (text, had_bozo) = default_text_of(b"<title>Test Title</title>");
        assert_eq!(text, "Test Title");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_text_exceeds_limit() {
        let limits = ParserLimits {
            max_text_length: 10,
            ..ParserLimits::default()
        };
        let result = text_of(b"<title>This is a very long title</title>", &limits);
        assert!(result.is_err());
    }

    #[test]
    fn test_read_text_numeric_char_ref() {
        let (text, had_bozo) = default_text_of(
            b"<guid>https://example.com/?post_type=webcomic1&#38;p=3172</guid>",
        );
        assert_eq!(text, "https://example.com/?post_type=webcomic1&p=3172");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_text_amp_entity() {
        let (text, had_bozo) = default_text_of(b"<guid>https://example.com/?a=1&amp;b=2</guid>");
        assert_eq!(text, "https://example.com/?a=1&b=2");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_text_hex_char_ref() {
        let (text, had_bozo) = default_text_of(b"<guid>https://example.com/?a=1&#x26;b=2</guid>");
        assert_eq!(text, "https://example.com/?a=1&b=2");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_text_multiple_entities() {
        let (text, had_bozo) =
            default_text_of(b"<guid>https://example.com/?a=1&amp;b=2&#38;c=3</guid>");
        assert_eq!(text, "https://example.com/?a=1&b=2&c=3");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_text_unknown_entity_preserved() {
        let (text, had_bozo) =
            default_text_of(b"<guid>https://example.com/?a=1&customEntity;b=2</guid>");
        assert_eq!(text, "https://example.com/?a=1&customEntity;b=2");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_text_mixed_valid_and_unknown_entities() {
        let (text, had_bozo) = default_text_of(b"<title>AT&amp;T&unknown;rocks</title>");
        assert_eq!(text, "AT&T&unknown;rocks");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_text_malformed_hex_char_ref() {
        let (text, had_bozo) = default_text_of(b"<guid>pre&#x;suf</guid>");
        assert_eq!(text, "pre&#x;suf");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_text_malformed_decimal_char_ref() {
        let (text, had_bozo) = default_text_of(b"<guid>pre&#;suf</guid>");
        assert_eq!(text, "pre&#;suf");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_text_empty_entity_name() {
        let (text, had_bozo) = default_text_of(b"<guid>pre&;suf</guid>");
        assert_eq!(text, "pre&;suf");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_xhtml_content_preserves_markup() {
        let (result, had_bozo) = xhtml_of(XHTML_PARAGRAPH);
        assert_eq!(result, "<p>Hello <b>world</b></p>");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_xhtml_content_no_outer_div() {
        let (result, had_bozo) = xhtml_of(XHTML_PARAGRAPH);
        assert!(!result.contains("<div"), "outer <div> must be stripped");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_xhtml_content_empty() {
        let (result, had_bozo) = xhtml_of(
            b"<content type=\"xhtml\"><div xmlns=\"http://www.w3.org/1999/xhtml\"></div></content>",
        );
        assert_eq!(result, "");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_xhtml_content_no_div_wrapper_no_panic() {
        let (result, had_bozo) = xhtml_of(b"<content type=\"xhtml\"></content>");
        assert_eq!(result, "");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_xhtml_content_nested_elements() {
        let (result, had_bozo) = xhtml_of(
            b"<content type=\"xhtml\"><div xmlns=\"http://www.w3.org/1999/xhtml\"><ul><li>A</li><li>B</li></ul></div></content>",
        );
        assert!(result.contains("<ul>"));
        assert!(result.contains("<li>A</li>"));
        assert!(result.contains("<li>B</li>"));
        assert!(!result.contains("<div"));
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_xhtml_content_preserves_entities() {
        let (result, had_bozo) = xhtml_of(
            b"<content type=\"xhtml\"><div xmlns=\"http://www.w3.org/1999/xhtml\"><p>Tom &amp; Jerry &lt;rocks&gt;</p></div></content>",
        );
        assert!(!had_bozo);
        assert!(
            result.contains("&amp;"),
            "& must be escaped as &amp; in output, got: {result}"
        );
        assert!(
            result.contains("&lt;"),
            "< must be escaped as &lt; in output, got: {result}"
        );
        assert!(
            !result.contains("Tom & Jerry"),
            "bare & must not appear in output, got: {result}"
        );
    }

    #[test]
    fn test_read_xhtml_content_apos_and_quot_decoded() {
        let (result, had_bozo) = xhtml_of(
            b"<content type=\"xhtml\"><div xmlns=\"http://www.w3.org/1999/xhtml\"><p>it&apos;s a &quot;test&quot;</p></div></content>",
        );
        assert!(!had_bozo);
        assert!(
            result.contains("it's a"),
            "&apos; must decode to literal apostrophe, got: {result}"
        );
        assert!(
            result.contains("\"test\""),
            "&quot; must decode to literal quote, got: {result}"
        );
        assert!(
            !result.contains("&apos;"),
            "&apos; must not remain escaped in output, got: {result}"
        );
        assert!(
            !result.contains("&quot;"),
            "&quot; must not remain escaped in output, got: {result}"
        );
    }

    #[test]
    fn test_skip_element_basic() {
        let xml = b"<parent><child>content</child></parent>";
        let mut reader = Reader::from_reader(&xml[..]);
        let mut buf = Vec::new();
        let limits = ParserLimits::default();
        let depth = 1;
        advance_past_start(&mut reader, &mut buf);
        let result = skip_element(&mut reader, &mut buf, &limits, depth);
        assert!(result.is_ok());
    }

    #[test]
    fn test_resolve_entity_valid_named() {
        let (text, had_bozo) = default_text_of(b"<t>&amp;</t>");
        assert_eq!(text, "&");
        assert!(!had_bozo);
    }

    #[test]
    fn test_resolve_entity_valid_numeric() {
        let (text, had_bozo) = default_text_of(b"<t>&#38;</t>");
        assert_eq!(text, "&");
        assert!(!had_bozo);
    }

    #[test]
    fn test_resolve_entity_unknown_named() {
        let (text, had_bozo) = default_text_of(b"<t>&foo;</t>");
        assert_eq!(text, "&foo;");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_text_returns_bozo_on_unknown_entity() {
        let (text, had_bozo) = default_text_of(b"<t>hello &custom; world</t>");
        assert_eq!(text, "hello &custom; world");
        assert!(had_bozo);
    }

    #[test]
    fn test_read_text_no_bozo_on_standard_entities() {
        let (text, had_bozo) = default_text_of(b"<t>a&amp;b&lt;c&gt;</t>");
        assert_eq!(text, "a&b<c>");
        assert!(!had_bozo);
    }

    #[test]
    fn test_read_text_mixed_entities_bozo() {
        let (text, had_bozo) = default_text_of(b"<t>&amp;&unknown;</t>");
        assert_eq!(text, "&&unknown;");
        assert!(had_bozo);
    }

    #[test]
    fn test_extract_namespaces_default_and_prefixed() {
        let xml = b"<rss xmlns=\"http://default.example.com/\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\"/>";
        let limits = ParserLimits::default();
        let feed = parse_namespaces_from_element(xml, &limits);
        assert!(!feed.bozo);
        assert_eq!(
            feed.namespaces.get("").map(String::as_str),
            Some("http://default.example.com/")
        );
        assert_eq!(
            feed.namespaces.get("dc").map(String::as_str),
            Some("http://purl.org/dc/elements/1.1/")
        );
    }

    #[test]
    fn test_extract_namespaces_empty_default() {
        let xml = b"<rss xmlns=\"\"/>";
        let limits = ParserLimits::default();
        let feed = parse_namespaces_from_element(xml, &limits);
        assert!(!feed.bozo);
        assert_eq!(feed.namespaces.get("").map(String::as_str), Some(""));
    }

    #[test]
    fn test_extract_namespaces_no_xmlns() {
        let xml = b"<rss version=\"2.0\"/>";
        let limits = ParserLimits::default();
        let feed = parse_namespaces_from_element(xml, &limits);
        assert!(!feed.bozo);
        assert!(feed.namespaces.is_empty());
    }

    #[test]
    fn test_extract_namespaces_limit_exceeded_sets_bozo() {
        let xml =
            b"<rss xmlns:a=\"http://a.com/\" xmlns:b=\"http://b.com/\" xmlns:c=\"http://c.com/\"/>";
        let limits = ParserLimits {
            max_namespaces: 2,
            ..ParserLimits::default()
        };
        let feed = parse_namespaces_from_element(xml, &limits);
        assert!(feed.bozo);
        assert_eq!(feed.namespaces.len(), 2);
    }

    #[test]
    fn test_extract_namespaces_uri_too_long_sets_bozo() {
        let long_uri = "http://".to_string() + &"a".repeat(200);
        let xml = format!("<rss xmlns:dc=\"{long_uri}\"/>");
        let limits = ParserLimits {
            max_attribute_length: 100,
            ..ParserLimits::default()
        };
        let feed = parse_namespaces_from_element(xml.as_bytes(), &limits);
        assert!(feed.bozo);
        assert!(feed.namespaces.is_empty());
    }

    #[test]
    fn test_is_dc_tag_custom_prefix() {
        let mut ns = HashMap::new();
        ns.insert(
            "dublin".to_string(),
            "http://purl.org/dc/elements/1.1/".to_string(),
        );
        assert_eq!(is_dc_tag(b"dublin:creator", &ns), Some("creator"));
        assert_eq!(is_dc_tag(b"dublin:date", &ns), Some("date"));
        let empty = HashMap::new();
        assert_eq!(is_dc_tag(b"dc:creator", &empty), Some("creator"));
        assert!(is_dc_tag(b"foo:creator", &ns).is_none());
    }

    #[test]
    fn test_is_media_tag_custom_prefix() {
        let mut ns = HashMap::new();
        ns.insert(
            "mrss".to_string(),
            "http://search.yahoo.com/mrss/".to_string(),
        );
        assert_eq!(is_media_tag(b"mrss:thumbnail", &ns), Some("thumbnail"));
        assert_eq!(is_media_tag(b"mrss:content", &ns), Some("content"));
        let empty = HashMap::new();
        assert_eq!(is_media_tag(b"media:thumbnail", &empty), Some("thumbnail"));
        assert!(is_media_tag(b"foo:thumbnail", &ns).is_none());
    }

    #[test]
    fn test_is_itunes_tag_custom_prefix() {
        let mut ns = HashMap::new();
        ns.insert(
            "podcast".to_string(),
            "http://www.itunes.com/dtds/podcast-1.0.dtd".to_string(),
        );
        assert!(is_itunes_tag(b"podcast:author", b"author", &ns));
        assert!(is_itunes_tag(b"podcast:explicit", b"explicit", &ns));
        let empty = HashMap::new();
        assert!(is_itunes_tag(b"itunes:author", b"author", &empty));
        assert!(!is_itunes_tag(b"foo:author", b"author", &ns));
    }

    #[test]
    fn test_ns_tag_with_invalid_security_chars() {
        let mut ns = HashMap::new();
        ns.insert(
            "dublin".to_string(),
            "http://purl.org/dc/elements/1.1/".to_string(),
        );
        assert!(is_dc_tag(b"dublin:../../etc/passwd", &ns).is_none());
        assert!(is_dc_tag(b"dublin:tag<script>", &ns).is_none());
    }

    #[test]
    fn test_is_itunes_tag_security() {
        let empty = HashMap::new();
        assert!(!is_itunes_tag(
            b"itunes:../../etc/passwd",
            b"../../etc/passwd",
            &empty
        ));
        let mut ns = HashMap::new();
        ns.insert(
            "it".to_string(),
            "http://www.itunes.com/dtds/podcast-1.0.dtd".to_string(),
        );
        assert!(!is_itunes_tag(
            b"it:../../etc/passwd",
            b"../../etc/passwd",
            &ns
        ));
        assert!(!is_itunes_tag(b"../../etc/passwd", b"author", &empty));
    }

    #[test]
    fn test_read_text_strips_null_bytes() {
        let (text, _bozo) = default_text_of(b"<title>Hello\x00World</title>");
        assert_eq!(text, "HelloWorld");
    }
}