use std::error::Error;
use std::io::BufRead;
use std::ops::Add;
use std::sync::OnceLock;
use std::time::Duration;
use chrono::{DateTime, Utc};
use regex::{Captures, Regex};
use url::Url;
use uuid::Uuid;
use fixes::PatSub;
use model::{Link, Text};
use crate::model;
use crate::parser::{ParseFeedResult, Parser};
use crate::xml::Element;
/// Ordered regex substitutions that nudge almost-valid timestamps into a shape the strict
/// parsers accept. Each accessor compiles its rules once and then returns the cached slice.
mod fixes {
    use super::{OnceLock, Regex};

    /// One substitution rule: the pattern to search for and the text that replaces a match.
    pub struct PatSub(pub Regex, pub &'static str);

    /// Compiles `(pattern, replacement)` pairs into rules, keeping the order of the table.
    ///
    /// Panics if a pattern is not a valid regex; callers only pass literals from this module.
    fn compile(table: &[(&str, &'static str)]) -> Vec<PatSub> {
        table
            .iter()
            .map(|&(pattern, replacement)| PatSub(Regex::new(pattern).unwrap(), replacement))
            .collect()
    }

    /// Rules applied before parsing with the RFC 1123 style format string.
    pub fn rfc1123() -> &'static [PatSub] {
        static RULES: OnceLock<Vec<PatSub>> = OnceLock::new();
        RULES.get_or_init(|| {
            compile(&[
                // A trailing " Z" zone becomes an explicit numeric offset
                (" Z$", " +0000"),
                // Drop a leading three-letter token followed by a comma (e.g. a day name)
                ("^[[:alpha:]]{3}, ", ""),
            ])
        })
    }

    /// Rules applied before parsing as RFC 2822.
    pub fn rfc2822() -> &'static [PatSub] {
        static RULES: OnceLock<Vec<PatSub>> = OnceLock::new();
        RULES.get_or_init(|| {
            compile(&[
                // "UTC" or a trailing "-0000" becomes "+0000"
                ("(UTC|-0000$)", "+0000"),
                // Remove the day name (abbreviated or spelled out) and its comma
                ("(Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*, ", ""),
                // Shorten month names to their three-letter form
                ("(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*", "$1"),
                // Rewrite an hour of 24 as 00
                (" 24:", " 00:"),
                // Zero-pad a single-digit hour
                (" ([0-9]):", " 0${1}:"),
            ])
        })
    }

    /// Rules applied before parsing as RFC 3339.
    pub fn rfc3339() -> &'static [PatSub] {
        static RULES: OnceLock<Vec<PatSub>> = OnceLock::new();
        RULES.get_or_init(|| {
            compile(&[
                // Insert the colon into a four-digit offset such as "+0200"
                (r"(\+|-)(\d{2})(\d{2})", "${1}${2}:${3}"),
                // Text ending in "-NN" (a bare date) gets a midnight UTC time appended
                (r"-\d{2}$", "${0}T00:00:00+00:00"),
            ])
        })
    }
}
/// chrono format string used by `try_parse_timestamp_rfc1123_lenient` once the `fixes::rfc1123`
/// substitutions have been applied: day, abbreviated month, year, time and numeric offset.
static RFC1123_FORMAT_STR: &str = "%d %b %Y %H:%M:%S %z";
/// Signature of a pluggable timestamp parser: text in, UTC instant out (`None` when unparseable).
pub(crate) type TimestampParser = dyn Fn(&str) -> Option<DateTime<Utc>> + 'static;
/// Signature of a pluggable ID generator producing a `String` from a slice of links, an optional
/// text and an optional string.
// NOTE(review): the meaning of the three arguments is decided by the callers, which are not
// visible in this file; confirm against the parser before relying on it.
pub(crate) type IdGenerator = dyn Fn(&[Link], &Option<Text>, Option<&str>) -> String;
/// Reads the element's children as a string and, when there is content, wraps it as HTML text.
///
/// Any error reported by `children_as_string` is propagated to the caller.
pub(crate) fn handle_encoded<R: BufRead>(element: Element<R>) -> ParseFeedResult<Option<Text>> {
    let content = element.children_as_string()?;
    Ok(content.map(Text::html))
}
/// Returns the value of the element's `xml:lang` attribute, if it has one.
pub(crate) fn handle_language_attr<R: BufRead>(element: &Element<R>) -> Option<String> {
    const LANGUAGE_ATTR: &str = "xml:lang";
    element.attr_value(LANGUAGE_ATTR)
}
/// Returns the value of the element's `xml:base` attribute, if it has one.
pub(crate) fn handle_base_attr<R: BufRead>(element: &Element<R>) -> Option<String> {
    const BASE_ATTR: &str = "xml:base";
    element.attr_value(BASE_ATTR)
}
/// Builds a `Link` from the element's text content, passing along the element's `xml_base`.
///
/// Returns `None` when the element has no text.
pub(crate) fn handle_link<R: BufRead>(element: Element<R>) -> Option<Link> {
    let href = element.child_as_text()?;
    Some(Link::new(href, element.xml_base.as_ref()))
}
/// Reads the element's children as a string and wraps the result in a `Text`.
///
/// This is best-effort: both "no content" and a read error yield `None`.
pub(crate) fn handle_text<R: BufRead>(element: Element<R>) -> Option<Text> {
    element.children_as_string().ok().flatten().map(Text::new)
}
/// Parses the element's text content as a timestamp using the parser's `parse_timestamp`.
///
/// Returns `None` when the element has no text or the parser cannot interpret it.
pub(crate) fn handle_timestamp<R: BufRead>(parser: &Parser, element: Element<R>) -> Option<DateTime<Utc>> {
    element.child_as_text().and_then(|raw| parser.parse_timestamp(&raw))
}
/// Invokes `func` with the contained value when `v` is `Some`; does nothing for `None`.
pub(crate) fn if_some_then<T, F: FnOnce(T)>(v: Option<T>, func: F) {
    match v {
        Some(inner) => func(inner),
        None => {}
    }
}
/// Invokes `func` with `Some(value)` when `v` is `Ok(value)`; an `Err` is silently dropped and
/// `func` is not called at all.
pub(crate) fn if_ok_then_some<T, F: FnOnce(Option<T>)>(v: Result<T, impl Error>, func: F) {
    match v {
        Ok(value) => func(Some(value)),
        Err(_) => {}
    }
}
pub(crate) fn parse_timestamp_lenient(original: &str) -> Option<DateTime<Utc>> {
try_parse_timestamp_rfc3339_lenient(original)
.or_else(|| try_parse_timestamp_rfc2822_lenient(original))
.or_else(|| try_parse_timestamp_rfc1123_lenient(original))
}
/// Parses `uri` as a URL, resolving it against `base` when it turns out to be relative.
///
/// Returns `None` when the text is not a valid URL, when it is relative and no base was
/// supplied, or when joining it onto the base fails.
pub(crate) fn parse_uri(uri: &str, base: Option<&Url>) -> Option<Url> {
    match Url::parse(uri) {
        Ok(absolute) => Some(absolute),
        // Only a missing base is recoverable, and only if the caller gave us one
        Err(url::ParseError::RelativeUrlWithoutBase) => base.and_then(|root| root.join(uri).ok()),
        Err(_) => None,
    }
}
/// Applies the `fixes::rfc1123` substitutions (first match of each, in order) to the trimmed
/// input, then parses the result with `RFC1123_FORMAT_STR` and converts it to UTC.
fn try_parse_timestamp_rfc1123_lenient(original: &str) -> Option<DateTime<Utc>> {
    let cleaned = fixes::rfc1123()
        .iter()
        .fold(original.trim().to_string(), |text, PatSub(regex, replacement)| {
            regex.replace(&text, *replacement).into_owned()
        });

    DateTime::parse_from_str(&cleaned, RFC1123_FORMAT_STR)
        .ok()
        .map(|parsed| parsed.with_timezone(&Utc))
}
/// Applies the `fixes::rfc2822` substitutions (first match of each, in order) to the trimmed
/// input, then parses the result as RFC 2822 and converts it to UTC.
fn try_parse_timestamp_rfc2822_lenient(original: &str) -> Option<DateTime<Utc>> {
    let cleaned = fixes::rfc2822()
        .iter()
        .fold(original.trim().to_string(), |text, PatSub(regex, replacement)| {
            regex.replace(&text, *replacement).into_owned()
        });

    DateTime::parse_from_rfc2822(&cleaned)
        .ok()
        .map(|parsed| parsed.with_timezone(&Utc))
}
/// Applies the `fixes::rfc3339` substitutions (first match of each, in order) to the trimmed
/// input, then parses the result as RFC 3339 and converts it to UTC.
fn try_parse_timestamp_rfc3339_lenient(original: &str) -> Option<DateTime<Utc>> {
    let cleaned = fixes::rfc3339()
        .iter()
        .fold(original.trim().to_string(), |text, PatSub(regex, replacement)| {
            regex.replace(&text, *replacement).into_owned()
        });

    DateTime::parse_from_rfc3339(cleaned.trim())
        .ok()
        .map(|parsed| parsed.with_timezone(&Utc))
}
/// Generates a fresh version 4 UUID and returns its string form.
pub(crate) fn uuid_gen() -> String {
    let id = Uuid::new_v4();
    id.to_string()
}
/// Parses a normal-play-time style value into a `Duration`.
///
/// Two shapes are recognised, tried in this order:
/// * `h:mm:ss` with an optional `.fraction`
/// * plain seconds with an optional `.fraction`
///
/// Both patterns are unanchored, so the first match anywhere in `text` is used.
///
/// Returns `None` when neither shape is found, or when a numeric component cannot be
/// represented: a digit run too large for `u64`, a total that overflows `u64` seconds, or
/// non-ASCII digits (the regex `\d` class is Unicode-aware while integer parsing is not).
/// The input comes from feed content, so these cases must not panic.
pub(crate) fn parse_npt(text: &str) -> Option<Duration> {
    static NPT_HHMMSS: OnceLock<Regex> = OnceLock::new();
    let npt_hhmmss = NPT_HHMMSS.get_or_init(|| {
        Regex::new(r"(?P<h>\d+):(?P<m>\d{2}):(?P<s>\d{2})(\.(?P<f>\d+))?").unwrap()
    });
    if let Some(captures) = npt_hhmmss.captures(text) {
        let components = (captures.name("h"), captures.name("m"), captures.name("s"));
        if let (Some(h), Some(m), Some(s)) = components {
            let hours = h.as_str().parse::<u64>().ok()?;
            let minutes = m.as_str().parse::<u64>().ok()?;
            let secs = s.as_str().parse::<u64>().ok()?;

            // Minutes and seconds are two digits each, so only the hours term can grow large
            // enough to overflow; checked arithmetic covers every step regardless.
            let total = hours
                .checked_mul(3600)?
                .checked_add(minutes.checked_mul(60)?)?
                .checked_add(secs)?;

            return Some(parse_npt_add_frac_sec(Duration::from_secs(total), captures));
        }
    }

    static NPT_SEC: OnceLock<Regex> = OnceLock::new();
    let npt_sec = NPT_SEC.get_or_init(|| {
        Regex::new(r"(?P<s>\d+)(\.(?P<f>\d+))?").unwrap()
    });
    if let Some(captures) = npt_sec.captures(text) {
        if let Some(s) = captures.name("s") {
            let secs = s.as_str().parse::<u64>().ok()?;
            return Some(parse_npt_add_frac_sec(Duration::from_secs(secs), captures));
        }
    }

    None
}
/// Adds the fractional-second capture (the group named `f`), if present, to `duration`.
///
/// The fraction is truncated to millisecond precision. When the capture is absent, or its
/// leading digits are not ASCII digits, `duration` is returned unchanged.
fn parse_npt_add_frac_sec(duration: Duration, captures: Captures) -> Duration {
    match captures.name("f").and_then(|frac| npt_frac_to_millis(frac.as_str())) {
        Some(millis) => duration.add(Duration::from_millis(millis)),
        None => duration,
    }
}

/// Converts the digits after the decimal point into whole milliseconds.
///
/// Only the first three digits matter; shorter fractions are right-padded with zeros, so
/// "5" is 500 and "4567" is 456. Integer arithmetic keeps the result exact for any length,
/// whereas floating point rounds values such as "99999999" up to a full second and loses
/// very long fractions entirely. Returns `None` if one of the digits used is not `0`-`9`.
fn npt_frac_to_millis(frac: &str) -> Option<u64> {
    frac.chars()
        .chain(std::iter::repeat('0'))
        .take(3)
        .try_fold(0u64, |millis, digit| Some(millis * 10 + u64::from(digit.to_digit(10)?)))
}
#[cfg(test)]
mod tests {
    use chrono::{TimeZone, Utc};

    use super::*;

    // Shorthand for an unambiguous UTC instant
    fn utc(year: i32, month: u32, day: u32, hour: u32, min: u32, sec: u32) -> DateTime<Utc> {
        Utc.with_ymd_and_hms(year, month, day, hour, min, sec).unwrap()
    }

    // Leniently parses every source string and compares it with the expected instant
    fn assert_all_parse(cases: &[(&str, DateTime<Utc>)]) {
        for (source, expected) in cases {
            let parsed = parse_timestamp_lenient(source).unwrap_or_else(|| panic!("failed to parse {}", source));
            assert_eq!(parsed, *expected);
        }
    }

    // Timestamps in the various shapes seen in RSS 2 feeds
    #[test]
    fn test_timestamp_rss2() {
        assert_all_parse(&[
            ("26 August 2019 10:00:00 +0000", utc(2019, 8, 26, 10, 0, 0)),
            ("Mon, 01 Jan 0001 00:00:00 UTC", utc(1, 1, 1, 0, 0, 0)),
            ("Wed, 22 Jan 2020 10:58:02 -0000", utc(2020, 1, 22, 10, 58, 2)),
            ("Wed, 25 Aug 2012 03:25:42 GMT", utc(2012, 8, 25, 3, 25, 42)),
            ("2 September 2019 20:00:00 +0000", utc(2019, 9, 2, 20, 0, 0)),
            ("2016-10-01T00:00:00+10:00", utc(2016, 9, 30, 14, 0, 0)),
            ("24 Sep 2013 1:27 PDT", utc(2013, 9, 24, 8, 27, 0)),
            ("5 Jun 2017 24:05 PDT", utc(2017, 6, 5, 7, 5, 0)),
            ("Tue, 15 Nov 2022 20:15:04 Z", utc(2022, 11, 15, 20, 15, 4)),
            ("mer, 16 nov 2022 00:38:15 +0100", utc(2022, 11, 15, 23, 38, 15)),
        ]);
    }

    // Atom timestamps, with and without the colon in the offset
    #[test]
    fn test_timestamp_atom() {
        assert_all_parse(&[
            ("2014-12-29T14:53:35+02:00", utc(2014, 12, 29, 12, 53, 35)),
            ("2014-12-29T14:53:35+0200", utc(2014, 12, 29, 12, 53, 35)),
        ]);
    }

    // Normal play time in h:mm:ss and plain-seconds form, with and without a fraction
    #[test]
    fn test_parse_npt() {
        let cases = [
            ("12:05:35", Duration::from_secs(12 * 3600 + 5 * 60 + 35)),
            ("12:05:35.123", Duration::from_millis(12 * 3600000 + 5 * 60000 + 35 * 1000 + 123)),
            ("123.45", Duration::from_millis(123450)),
        ];
        for (input, expected) in cases {
            assert_eq!(parse_npt(input).unwrap(), expected);
        }
    }
}