use std::io::BufRead;
use bytes::Bytes;
use countio::Counter;
use quick_xml::{events, Reader};
use time::format_description::well_known::Iso8601;
use time::OffsetDateTime;
use url::Url;
use crate::{
attribute as attr,
attribute::{Frequency, Priority},
parse::Parser,
record::{EntryRecord, IndexRecord, BYTE_LIMIT, RECORD_LIMIT},
Error,
};
/// Streaming XML parser that assembles records of type `D` from a reader `R`.
///
/// The parser keeps track of the currently open elements and of the record
/// that is being assembled while events are pulled from the XML reader.
pub struct XmlParser<R, D> {
    /// Record currently being assembled; set when the record's start tag is
    /// seen and taken again on its end tag or at end of input.
    record: Option<D>,
    /// XML event reader over a byte-counting wrapper around `R`.
    pub(crate) reader: Reader<Counter<R>>,
    /// Number of record start tags encountered so far.
    pub(crate) records: usize,
    /// Names of the currently open elements, outermost first.
    path: Vec<Bytes>,
}
impl<R, D> XmlParser<R, D> {
    /// Creates a parser over `reader` with an empty element path and a
    /// zeroed record counter.
    pub(crate) fn from_reader(reader: R) -> Self {
        Self {
            record: None,
            reader: Reader::from_reader(Counter::new(reader)),
            records: 0,
            path: Vec::default(),
        }
    }

    /// Creates a parser from an already constructed XML reader.
    ///
    /// `path` becomes the first (outermost) entry of the element path.
    /// NOTE(review): presumably the caller has already consumed that opening
    /// tag from `wrapped` — confirm against the call site.
    pub(crate) fn from_wrapper(wrapped: Reader<Counter<R>>, path: &str) -> Self {
        let bytes = Bytes::from(path.as_bytes().to_vec());
        Self {
            record: None,
            reader: wrapped,
            records: 0,
            path: Vec::from([bytes]),
        }
    }

    /// Returns a shared reference to the underlying reader.
    pub fn get_ref(&self) -> &R {
        self.reader.get_ref().get_ref()
    }

    /// Returns a mutable reference to the underlying reader.
    pub fn get_mut(&mut self) -> &mut R {
        self.reader.get_mut().get_mut()
    }

    /// Consumes the parser and returns the underlying reader.
    pub fn into_inner(self) -> R {
        self.reader.into_inner().into_inner()
    }

    /// Checks that the record and byte limits still allow reading.
    ///
    /// # Errors
    ///
    /// Returns [`Error::EntryLimit`] once more than `RECORD_LIMIT` records
    /// have been started, and [`Error::ByteLimit`] once the counting reader
    /// reports more than `BYTE_LIMIT` bytes.
    pub(crate) fn try_if_readable(&mut self) -> Result<(), Error> {
        // The read loops call this before *every* XML event, including the
        // events inside a record whose start tag has already been counted.
        // Testing `records + 1` here would therefore reject the
        // `RECORD_LIMIT`-th record before it could ever be returned.
        if self.records > RECORD_LIMIT {
            return Err(Error::EntryLimit { over: 1 });
        }

        let consumed = self.reader.get_ref().reader_bytes();
        if consumed > BYTE_LIMIT {
            let over = consumed - BYTE_LIMIT;
            return Err(Error::ByteLimit { over });
        }

        Ok(())
    }

    /// Feeds a single XML event into the parser state.
    ///
    /// * `tag` – name of the element that delimits one record (compared
    ///   ASCII case-insensitively).
    /// * `create` – builds an empty record when the start tag is seen.
    /// * `handle` – receives the unescaped text of every text event.
    /// * `check` – post-processes the record that is handed back.
    ///
    /// Returns `Ok(None)` when the event did not finish a record. Returns
    /// `Ok(Some(..))` when the closing `tag` or the end of input was
    /// reached; the inner value is whatever `check` made of the record in
    /// progress.
    ///
    /// # Errors
    ///
    /// Fails when the text of a text event cannot be unescaped.
    pub(crate) fn write_event(
        &mut self,
        event: events::Event,
        tag: &[u8],
        create: impl FnOnce() -> D,
        handle: impl FnOnce(&mut Self, &str),
        check: impl FnOnce(Option<D>) -> Option<D>,
    ) -> Result<Option<Option<D>>, Error> {
        match event {
            events::Event::Start(bytes) => {
                let name = bytes.name().into_inner();
                if name.eq_ignore_ascii_case(tag) {
                    self.records += 1;
                    self.record = Some(create());
                }
                self.path.push(name.to_vec().into());
            }
            events::Event::Text(bytes) => {
                let text = bytes.unescape()?;
                handle(self, &text);
            }
            events::Event::End(bytes) => {
                let name = bytes.name().into_inner();
                // Leave the element. The popped name is deliberately not
                // compared with the closing tag: a mismatch is not reported
                // here. NOTE(review): quick-xml is expected to reject
                // mismatched end tags itself — confirm the reader config.
                let _ = self.path.pop();
                if name.eq_ignore_ascii_case(tag) {
                    return Ok(Some(check(self.record.take())));
                }
            }
            events::Event::Eof => {
                return Ok(Some(check(self.record.take())));
            }
            _ => {}
        }

        Ok(None)
    }
}
impl<R> XmlParser<R, EntryRecord> {
    /// Stores `text` in the field of the record in progress that corresponds
    /// to the current element path; text at any other path is ignored.
    ///
    /// Values that fail to parse leave the field as `None`.
    fn handle_entry_text(&mut self, text: &str) {
        static LOC: [&str; 3] = [attr::URL_SET, attr::URL, attr::LOCATION];
        static MOD: [&str; 3] = [attr::URL_SET, attr::URL, attr::LAST_MODIFIED];
        static FRQ: [&str; 3] = [attr::URL_SET, attr::URL, attr::CHANGE_FREQUENCY];
        static PRI: [&str; 3] = [attr::URL_SET, attr::URL, attr::PRIORITY];

        // Nothing to fill in while no record is open.
        let Some(entry) = self.record.as_mut() else {
            return;
        };

        let current = self.path.as_slice();
        if current == LOC {
            entry.location = Url::parse(text).ok();
        } else if current == MOD {
            entry.modified = OffsetDateTime::parse(text, &Iso8601::PARSING).ok();
        } else if current == FRQ {
            entry.frequency = Frequency::parse(text).ok();
        } else if current == PRI {
            entry.priority = Priority::parse(text).ok();
        }
    }

    /// Keeps a record only if it carries a location.
    fn is_entry_good(record: Option<EntryRecord>) -> Option<EntryRecord> {
        record.filter(|entry| entry.location.is_some())
    }

    /// Feeds one XML event into the parser, using the `attr::URL` element as
    /// the record delimiter.
    ///
    /// See [`XmlParser::write_event`] for the meaning of the return value.
    pub(crate) fn write_entry_event(
        &mut self,
        event: events::Event,
    ) -> Result<Option<Option<EntryRecord>>, Error> {
        let record_tag = attr::URL.as_bytes();
        self.write_event(
            event,
            record_tag,
            EntryRecord::clean,
            Self::handle_entry_text,
            Self::is_entry_good,
        )
    }
}
impl<R> XmlParser<R, IndexRecord> {
    /// Stores `text` in the field of the record in progress that corresponds
    /// to the current element path; text at any other path is ignored.
    ///
    /// Values that fail to parse leave the field as `None`.
    fn handle_index_text(&mut self, text: &str) {
        static LOC: [&str; 3] = [attr::SITEMAP_INDEX, attr::SITEMAP, attr::LOCATION];
        static MOD: [&str; 3] = [attr::SITEMAP_INDEX, attr::SITEMAP, attr::LAST_MODIFIED];

        // Nothing to fill in while no record is open.
        let Some(index) = self.record.as_mut() else {
            return;
        };

        let current = self.path.as_slice();
        if current == LOC {
            index.location = Url::parse(text).ok();
        } else if current == MOD {
            index.modified = OffsetDateTime::parse(text, &Iso8601::PARSING).ok();
        }
    }

    /// Keeps a record only if it carries a location.
    fn is_index_good(record: Option<IndexRecord>) -> Option<IndexRecord> {
        record.filter(|index| index.location.is_some())
    }

    /// Feeds one XML event into the parser, using the `attr::SITEMAP` element
    /// as the record delimiter.
    ///
    /// See [`XmlParser::write_event`] for the meaning of the return value.
    pub(crate) fn write_index_event(
        &mut self,
        event: events::Event,
    ) -> Result<Option<Option<IndexRecord>>, Error> {
        let record_tag = attr::SITEMAP.as_bytes();
        self.write_event(
            event,
            record_tag,
            IndexRecord::clean,
            Self::handle_index_text,
            Self::is_index_good,
        )
    }
}
impl<R: BufRead> Parser<R, EntryRecord> for XmlParser<R, EntryRecord> {
    type Error = Error;

    /// Wraps `reader` in a new parser. This implementation never fails.
    fn new(reader: R) -> Result<Self, Self::Error> {
        Ok(Self::from_reader(reader))
    }

    /// Pulls XML events until one of them finishes a record.
    ///
    /// Returns `Ok(Some(record))` when a record element closes with a
    /// location. Returns `Ok(None)` when the end of input is reached or
    /// when a record element closes without a usable location.
    ///
    /// # Errors
    ///
    /// Fails when a limit checked by `try_if_readable` is exceeded or when
    /// the XML reader reports an error.
    fn read(&mut self) -> Result<Option<EntryRecord>, Self::Error> {
        let mut buf = Vec::new();
        loop {
            self.try_if_readable()?;
            let event = self.reader.read_event_into(&mut buf)?;
            if let Some(record) = self.write_entry_event(event)? {
                return Ok(record);
            }

            // The event has been consumed; drop its bytes so the buffer does
            // not keep growing with every event until a record completes.
            buf.clear();
        }
    }

    /// Consumes the parser and hands back the underlying reader.
    fn close(self) -> Result<R, Self::Error> {
        Ok(self.into_inner())
    }
}
impl<R: BufRead> Parser<R, IndexRecord> for XmlParser<R, IndexRecord> {
    type Error = Error;

    /// Wraps `reader` in a new parser. This implementation never fails.
    fn new(reader: R) -> Result<Self, Self::Error> {
        Ok(Self::from_reader(reader))
    }

    /// Pulls XML events until one of them finishes a record.
    ///
    /// Returns `Ok(Some(record))` when a record element closes with a
    /// location. Returns `Ok(None)` when the end of input is reached or
    /// when a record element closes without a usable location.
    ///
    /// # Errors
    ///
    /// Fails when a limit checked by `try_if_readable` is exceeded or when
    /// the XML reader reports an error.
    fn read(&mut self) -> Result<Option<IndexRecord>, Self::Error> {
        let mut buf = Vec::new();
        loop {
            self.try_if_readable()?;
            let event = self.reader.read_event_into(&mut buf)?;
            if let Some(record) = self.write_index_event(event)? {
                return Ok(record);
            }

            // The event has been consumed; drop its bytes so the buffer does
            // not keep growing with every event until a record completes.
            buf.clear();
        }
    }

    /// Consumes the parser and hands back the underlying reader.
    fn close(self) -> Result<R, Self::Error> {
        Ok(self.into_inner())
    }
}
impl<R, D> std::fmt::Debug for XmlParser<R, D> {
    /// Formats the parser as `XmlParser { bytes, records }`, showing only
    /// the progress counters.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let consumed = self.reader.get_ref().reader_bytes();

        let mut out = f.debug_struct("XmlParser");
        out.field("bytes", &consumed);
        out.field("records", &self.records);
        out.finish()
    }
}
#[cfg(feature = "tokio")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))]
mod tokio {
    //! Asynchronous counterparts of the blocking parser implementations.

    use async_trait::async_trait;
    use tokio::io::AsyncBufRead;

    use crate::{
        parse::{AsyncParser, XmlParser},
        record::{EntryRecord, IndexRecord},
        Error,
    };

    #[async_trait]
    impl<R: AsyncBufRead + Unpin + Send> AsyncParser<R, EntryRecord> for XmlParser<R, EntryRecord> {
        type Error = Error;

        /// Wraps `reader` in a new parser. This implementation never fails.
        async fn new(reader: R) -> Result<Self, Self::Error> {
            Ok(Self::from_reader(reader))
        }

        /// Pulls XML events until one of them finishes a record.
        ///
        /// Returns `Ok(None)` at the end of input or when a record element
        /// closes without a usable location.
        async fn read(&mut self) -> Result<Option<EntryRecord>, Self::Error> {
            let mut buf = Vec::new();
            loop {
                self.try_if_readable()?;
                let event = self.reader.read_event_into_async(&mut buf).await?;
                if let Some(record) = self.write_entry_event(event)? {
                    return Ok(record);
                }

                // The event has been consumed; drop its bytes so the buffer
                // does not grow with every event until a record completes.
                buf.clear();
            }
        }

        /// Consumes the parser and hands back the underlying reader.
        async fn close(self) -> Result<R, Self::Error> {
            Ok(self.into_inner())
        }
    }

    #[async_trait]
    impl<R: AsyncBufRead + Unpin + Send> AsyncParser<R, IndexRecord> for XmlParser<R, IndexRecord> {
        type Error = Error;

        /// Wraps `reader` in a new parser. This implementation never fails.
        async fn new(reader: R) -> Result<Self, Self::Error> {
            Ok(Self::from_reader(reader))
        }

        /// Pulls XML events until one of them finishes a record.
        ///
        /// Returns `Ok(None)` at the end of input or when a record element
        /// closes without a usable location.
        async fn read(&mut self) -> Result<Option<IndexRecord>, Self::Error> {
            let mut buf = Vec::new();
            loop {
                self.try_if_readable()?;
                let event = self.reader.read_event_into_async(&mut buf).await?;
                if let Some(record) = self.write_index_event(event)? {
                    return Ok(record);
                }

                // The event has been consumed; drop its bytes so the buffer
                // does not grow with every event until a record completes.
                buf.clear();
            }
        }

        /// Consumes the parser and hands back the underlying reader.
        async fn close(self) -> Result<R, Self::Error> {
            Ok(self.into_inner())
        }
    }
}
#[cfg(test)]
mod test {
    use url::Url;

    use crate::Error;
    use crate::{parse::XmlParser, record::EntryRecord};

    /// Minimal document holding a single fully populated entry.
    const EXAMPLE: &str = r#"
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>https://www.example.com/file1.html</loc>
<lastmod>2022-09-08T10:43:13.000-04:00</lastmod>
<changefreq>daily</changefreq>
<priority>0.6</priority>
</url>
</urlset>"#;

    /// The blocking parser yields the entry contained in [`EXAMPLE`].
    #[test]
    fn synk() -> Result<(), Error> {
        use crate::parse::Parser;

        let mut parser = XmlParser::new(EXAMPLE.as_bytes())?;
        let record: EntryRecord = parser.read()?.unwrap();
        parser.close()?;

        let expected = Url::parse("https://www.example.com/file1.html").unwrap();
        assert_eq!(record.location(), &expected);
        Ok(())
    }

    /// The asynchronous parser yields the entry contained in [`EXAMPLE`].
    #[cfg(feature = "tokio")]
    #[tokio::test]
    async fn asynk() -> Result<(), Error> {
        use crate::parse::AsyncParser;

        let mut parser = XmlParser::new(EXAMPLE.as_bytes()).await?;
        let record: EntryRecord = parser.read().await?.unwrap();
        parser.close().await?;

        let expected = Url::parse("https://www.example.com/file1.html").unwrap();
        assert_eq!(record.location(), &expected);
        Ok(())
    }
}