use jiff::Timestamp;
use quick_xml::XmlVersion;
use quick_xml::escape::resolve_predefined_entity;
use quick_xml::events::{BytesEnd, BytesRef, BytesText};
use rama_core::telemetry::tracing;
use rama_net::uri::Uri;
use super::atom::{AtomCategory, AtomLink, AtomText};
use super::error::FeedParseError;
use super::feed_ext::names::attr;
use super::rss2::Rss2Enclosure;
/// Alias for quick-xml's `BytesStart` event; the helpers in this module use it
/// as the source of an element's attributes.
pub(super) type Attrs<'a> = quick_xml::events::BytesStart<'a>;
/// Splits an end-tag event into its local name and the text accumulated so far.
///
/// The local name is copied into `name_buf` (at most `name_buf.len()` bytes) so
/// that the event can be dropped before returning; the returned `&str` borrows
/// from `name_buf`. If the copied bytes are not valid UTF-8 the name is `""`.
///
/// `text_buf` is taken (left empty) and returned with surrounding whitespace
/// removed.
pub(in crate::protocols::rss) fn end_event_parts<'b>(
    e: BytesEnd<'_>,
    name_buf: &'b mut [u8; 64],
    text_buf: &mut String,
) -> (&'b str, String) {
    // Copy the (possibly truncated) local name out of the event.
    let copied = {
        let qname_local = e.local_name();
        let src: &[u8] = qname_local.as_ref();
        let len = src.len().min(name_buf.len());
        name_buf[..len].copy_from_slice(&src[..len]);
        len
    };
    drop(e);

    let name = std::str::from_utf8(&name_buf[..copied]).unwrap_or("");

    // Only allocate a new string when trimming actually removes something.
    let raw = std::mem::take(text_buf);
    let needs_trim = raw.trim().len() != raw.len();
    let text = if needs_trim {
        raw.trim().to_owned()
    } else {
        raw
    };

    (name, text)
}
/// Returns the normalized value of the first attribute on `e` whose key is
/// byte-for-byte equal to `name`.
///
/// Attributes that fail to parse are skipped. `None` is returned when no
/// attribute matches or when the matching attribute's value cannot be
/// normalized.
pub(super) fn attr_value(e: &Attrs<'_>, name: &str) -> Option<String> {
    let wanted = name.as_bytes();
    for attribute in e.attributes().filter_map(|parsed| parsed.ok()) {
        if attribute.key.as_ref() != wanted {
            continue;
        }
        // First match wins, even if its value turns out to be unusable.
        let value = attribute
            .normalized_value(XmlVersion::Implicit1_0)
            .ok()
            .map(|normalized| normalized.into_owned());
        return value;
    }
    None
}
/// Parses `s` (with surrounding whitespace removed) via `Uri::parse`,
/// returning `None` on any parse error.
pub(super) fn parse_uri(s: &str) -> Option<Uri> {
    let candidate = s.trim();
    match Uri::parse(candidate) {
        Ok(uri) => Some(uri),
        Err(_) => None,
    }
}
/// Parses `s` (with surrounding whitespace removed) via `Uri::parse_reference`,
/// returning `None` on any parse error.
pub(super) fn parse_uri_reference(s: &str) -> Option<Uri> {
    let candidate = s.trim();
    match Uri::parse_reference(candidate) {
        Ok(uri) => Some(uri),
        Err(_) => None,
    }
}
/// Looks up attribute `name` on `e` and parses its value with [`parse_uri`].
///
/// Returns `None` when the attribute is absent or its value does not parse.
pub(super) fn attr_uri(e: &Attrs<'_>, name: &str) -> Option<Uri> {
    let raw = attr_value(e, name)?;
    parse_uri(&raw)
}
/// Looks up attribute `name` on `e` and parses its value with
/// [`parse_uri_reference`].
///
/// Returns `None` when the attribute is absent or its value does not parse.
pub(super) fn attr_uri_reference(e: &Attrs<'_>, name: &str) -> Option<Uri> {
    let raw = attr_value(e, name)?;
    parse_uri_reference(&raw)
}
/// Parses an RSS 2.0 date string into a [`Timestamp`].
///
/// The trimmed input is first handed to jiff's RFC 2822 parser; if that fails,
/// `Timestamp`'s own `FromStr` parsing is tried as a fallback. Returns `None`
/// when both attempts fail.
pub(super) fn parse_rss2_date(s: &str) -> Option<Timestamp> {
    let input = s.trim();
    match jiff::fmt::rfc2822::parse(input) {
        Ok(zoned) => Some(zoned.timestamp()),
        Err(_) => input.parse::<Timestamp>().ok(),
    }
}
/// Parses the trimmed input with `Timestamp`'s `FromStr` implementation,
/// returning `None` on failure.
pub(super) fn parse_rfc3339_lax(s: &str) -> Option<Timestamp> {
    match s.trim().parse::<Timestamp>() {
        Ok(timestamp) => Some(timestamp),
        Err(_) => None,
    }
}
/// Builds an [`AtomText`] from `value`, choosing the constructor by `type_attr`:
///
/// * `"html"` or `"text/html"` → `AtomText::html_raw`
/// * `"xhtml"` → `AtomText::xhtml`
/// * anything else → `AtomText::text`
///
/// The comparison is exact (case-sensitive, no trimming).
pub(super) fn make_atom_text(type_attr: &str, value: String) -> AtomText {
    if type_attr == "xhtml" {
        AtomText::xhtml(value)
    } else if matches!(type_attr, "html" | "text/html") {
        AtomText::html_raw(value)
    } else {
        AtomText::text(value)
    }
}
/// Builds an [`Rss2Enclosure`] from the attributes of `e`.
///
/// Returns `None` when the URL attribute is missing or does not parse. A
/// missing or non-numeric length becomes `0`, and a missing type becomes the
/// empty string.
pub(super) fn enclosure_from_attrs(e: &Attrs<'_>) -> Option<Rss2Enclosure> {
    // The URL is mandatory; bail out before looking at anything else.
    let url = attr_uri(e, attr::URL)?;

    let length = match attr_value(e, attr::LENGTH) {
        Some(raw) => raw.parse::<u64>().unwrap_or_default(),
        None => u64::default(),
    };

    let type_ = attr_value(e, attr::TYPE).unwrap_or_default();

    Some(Rss2Enclosure { url, length, type_ })
}
/// Builds an [`AtomLink`] from the attributes of `e`.
///
/// Returns `None` when the href attribute is missing or is not a parseable
/// URI reference. All other attributes are optional; a length that does not
/// parse is treated as absent.
pub(super) fn atom_link_from_attrs(e: &Attrs<'_>) -> Option<AtomLink> {
    // `href` is the only required attribute.
    let href = attr_uri_reference(e, attr::HREF)?;

    let rel = attr_value(e, attr::REL);
    let type_ = attr_value(e, attr::TYPE);
    let hreflang = attr_value(e, attr::HREFLANG);
    let title = attr_value(e, attr::TITLE);
    let length = match attr_value(e, attr::LENGTH) {
        Some(raw) => raw.parse().ok(),
        None => None,
    };

    Some(AtomLink {
        href,
        rel,
        type_,
        hreflang,
        title,
        length,
    })
}
/// Builds an [`AtomCategory`] from the attributes of `e`.
///
/// A missing term attribute becomes the empty string; scheme and label stay
/// optional.
pub(super) fn atom_category_from_attrs(e: &Attrs<'_>) -> AtomCategory {
    let term = match attr_value(e, attr::TERM) {
        Some(term) => term,
        None => String::new(),
    };
    let scheme = attr_value(e, attr::SCHEME);
    let label = attr_value(e, attr::LABEL);

    AtomCategory {
        term,
        scheme,
        label,
    }
}
/// Appends the decoded content of a text event to `buf`.
///
/// # Errors
///
/// When decoding fails and `strict` is set, a [`FeedParseError`] is returned
/// and `buf` is left untouched. In lenient mode the failure is logged at debug
/// level and the raw bytes are appended with lossy UTF-8 conversion instead.
pub(super) fn push_text(
    buf: &mut String,
    e: &BytesText<'_>,
    strict: bool,
) -> Result<(), FeedParseError> {
    let err = match e.decode() {
        Ok(decoded) => {
            buf.push_str(&decoded);
            return Ok(());
        }
        Err(err) => err,
    };

    if strict {
        return Err(FeedParseError::new(format!("invalid text content: {err}")));
    }

    // Lenient mode: keep whatever can be salvaged from the raw bytes.
    tracing::debug!("rss feed text decode error (lenient): {err}");
    buf.push_str(&String::from_utf8_lossy(e.as_ref()));
    Ok(())
}
/// Resolves a general reference event (`&...;`) and appends the result to `buf`.
///
/// Resolution order:
/// 1. character reference — the resolved character is appended;
/// 2. predefined entity (via `resolve_predefined_entity`) — its replacement
///    text is appended;
/// 3. anything else — in lenient mode the reference is logged at debug level
///    and appended verbatim as `&name;`.
///
/// # Errors
///
/// With `strict` set, an invalid character reference or an entity that is not
/// predefined yields a [`FeedParseError`].
pub(super) fn push_general_ref(
    buf: &mut String,
    e: &BytesRef<'_>,
    strict: bool,
) -> Result<(), FeedParseError> {
    match e.resolve_char_ref() {
        Ok(Some(ch)) => {
            buf.push(ch);
            return Ok(());
        }
        Err(err) if strict => {
            return Err(FeedParseError::new(format!(
                "invalid character reference: {err}"
            )));
        }
        // Not a character reference, or a bad one in lenient mode: fall
        // through to entity handling below.
        Ok(None) | Err(_) => {}
    }

    // A name that fails to decode is treated as empty.
    let name = e.decode().unwrap_or_default();
    match resolve_predefined_entity(&name) {
        Some(replacement) => buf.push_str(replacement),
        None if strict => {
            return Err(FeedParseError::new(format!(
                "unresolvable entity reference: &{name};"
            )));
        }
        None => {
            tracing::debug!("rss feed unknown entity (lenient): &{name};");
            // Preserve the reference as written.
            buf.push('&');
            buf.push_str(&name);
            buf.push(';');
        }
    }
    Ok(())
}
/// Returns `true` when the first element found within the leading 2048 bytes
/// of `s` has the local name `feed`.
pub(super) fn detect_atom(s: &str) -> bool {
    let head = probe_prefix(s, 2048);
    matches!(first_element_local_name(head), Some("feed"))
}
/// Returns `true` when the first element found within the leading 1024 bytes
/// of `s` has the local name `rss` or `channel`.
pub(super) fn detect_rss(s: &str) -> bool {
    let head = probe_prefix(s, 1024);
    match first_element_local_name(head) {
        Some(root) => root == "rss" || root == "channel",
        None => false,
    }
}
/// Returns the local name (namespace prefix stripped) of the first element
/// start tag in `s`, skipping everything that may legally precede the root
/// element: the XML declaration, processing instructions, comments and a
/// `<!DOCTYPE ...>` declaration.
///
/// This is a lightweight sniffing scan, not a validating parser. `None` is
/// returned when no `<` is found or when a skipped construct is not
/// terminated within `s` (for example because `s` is a truncated prefix).
/// A `<` that is not followed by a name (e.g. `</x>` or `< `) yields
/// `Some("")`.
fn first_element_local_name(s: &str) -> Option<&str> {
    /// Given the text right after `<!`, returns what follows the `>` that
    /// closes the declaration.
    ///
    /// Quoted literals, comments and processing instructions are skipped as
    /// units and `[` ... `]` nesting is tracked, so a `>` inside an internal
    /// DTD subset or inside an entity value does not end the declaration
    /// early.
    fn skip_declaration(decl: &str) -> Option<&str> {
        let bytes = decl.as_bytes();
        let mut depth = 0usize;
        let mut i = 0;
        while i < bytes.len() {
            // All bytes matched below are ASCII, so every index used for
            // slicing `decl` is a valid char boundary.
            match bytes[i] {
                b'<' if decl[i..].starts_with("<!--") => {
                    i += 4 + decl[i + 4..].find("-->")? + 3;
                    continue;
                }
                b'<' if decl[i..].starts_with("<?") => {
                    i += 2 + decl[i + 2..].find("?>")? + 2;
                    continue;
                }
                quote @ (b'"' | b'\'') => {
                    i += 1 + bytes[i + 1..].iter().position(|&b| b == quote)? + 1;
                    continue;
                }
                b'[' => depth += 1,
                b']' => depth = depth.saturating_sub(1),
                b'>' if depth == 0 => return Some(&decl[i + 1..]),
                _ => {}
            }
            i += 1;
        }
        None
    }

    let mut rest = s;
    loop {
        let lt = rest.find('<')?;
        rest = &rest[lt + 1..];

        // XML declaration or any other processing instruction: these end at
        // `?>`, and their content may itself contain `>`.
        if let Some(after) = rest.strip_prefix('?') {
            let end = after.find("?>")?;
            rest = &after[end + 2..];
            continue;
        }
        // Comment.
        if let Some(after) = rest.strip_prefix("!--") {
            let end = after.find("-->")?;
            rest = &after[end + 3..];
            continue;
        }
        // DOCTYPE or any other `<!...>` declaration.
        if let Some(after) = rest.strip_prefix('!') {
            rest = skip_declaration(after)?;
            continue;
        }

        // First real tag: the qualified name runs up to whitespace, `>` or `/`.
        let qname_end = rest
            .find(|c: char| c.is_whitespace() || c == '>' || c == '/')
            .unwrap_or(rest.len());
        let qname = &rest[..qname_end];
        // Drop an optional `prefix:`; `rsplit` always yields at least one item.
        return Some(qname.rsplit(':').next().unwrap_or_default());
    }
}
/// Returns the longest prefix of `s` that is at most `max` bytes long and ends
/// on a UTF-8 character boundary.
///
/// When `s` already fits within `max` bytes it is returned unchanged.
fn probe_prefix(s: &str, max: usize) -> &str {
    if max >= s.len() {
        return s;
    }
    // Walk back from `max` to the nearest boundary; index 0 always qualifies.
    let cut = (0..=max)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);
    &s[..cut]
}