sitemapo 0.2.0

An implementation of the Sitemap.xml (or URL inclusion) protocol with support for the TXT and XML formats and the video, image, and news extensions.
Documentation
use std::io::BufRead;
use std::marker::PhantomData;

use countio::Counter;
use url::Url;

use crate::{
    parse::Parser,
    record::{BYTE_LIMIT, RECORD_LIMIT},
    Error,
};

/// Sitemap parser for the plain-text (TXT) format, which lists one URL per line.
///
/// For example:
///
/// ```txt
/// https://www.example.com/file1.html
/// https://www.example.com/file2.html
/// ```
///
/// Enforces the [total read bytes](BYTE_LIMIT) and [total URL count](RECORD_LIMIT) limits;
/// exceeding either one is reported as an [Error].
///
/// ```rust
/// use sitemapo::{
///     parse::{Parser, TxtParser},
///     Error,
/// };
///
/// fn main() -> Result<(), Error> {
///     let buf = "https://example.com/file1.html".as_bytes();
///
///     let mut parser = TxtParser::new(buf)?;
///     let _rec = parser.read()?;
///     let _buf = parser.close()?;
///     Ok(())
/// }
/// ```
pub struct TxtParser<R, D> {
    /// Zero-sized marker tying the parser to its record type `D`.
    record_type: PhantomData<D>,
    /// The wrapped reader; the `Counter` tracks how many bytes were read through it.
    pub(crate) reader: Counter<R>,
    /// Number of lines read so far, checked against [`RECORD_LIMIT`].
    pub(crate) records: usize,
}

impl<R, D> TxtParser<R, D> {
    /// Creates a new instance with a provided reader.
    pub(crate) fn from_reader(reader: R) -> Self {
        Self {
            record_type: PhantomData,
            reader: Counter::new(reader),
            records: 0,
        }
    }

    /// Returns a reference to the underlying reader.
    pub fn get_ref(&self) -> &R {
        self.reader.get_ref()
    }

    /// Returns a mutable reference to the underlying reader.
    pub fn get_mut(&mut self) -> &mut R {
        self.reader.get_mut()
    }

    /// Returns an underlying reader.
    pub fn into_inner(self) -> R {
        self.reader.into_inner()
    }

    pub(crate) fn try_if_readable(&mut self) -> Result<(), Error> {
        if self.records + 1 > RECORD_LIMIT {
            return Err(Error::EntryLimit { over: 1 });
        }

        if self.reader.reader_bytes() > BYTE_LIMIT {
            let over = self.reader.reader_bytes() - BYTE_LIMIT;
            return Err(Error::ByteLimit { over });
        }

        Ok(())
    }
}

impl<R: BufRead> Parser<R, Url> for TxtParser<R, Url> {
    type Error = Error;

    fn new(reader: R) -> Result<Self, Self::Error> {
        Ok(Self::from_reader(reader))
    }

    fn read(&mut self) -> Result<Option<Url>, Self::Error> {
        loop {
            self.try_if_readable()?;

            let mut buf = String::new();
            if self.reader.read_line(&mut buf)? == 0 {
                return Ok(None);
            }

            self.records += 1;
            match Url::parse(buf.as_str()) {
                Ok(address) => return Ok(Some(address)),
                Err(_) => continue,
            }
        }
    }

    fn close(self) -> Result<R, Self::Error> {
        Ok(self.into_inner())
    }
}

impl<R, D> std::fmt::Debug for TxtParser<R, D> {
    /// Formats the parser as `TxtParser { bytes, records }`; the reader itself is omitted,
    /// so `R` does not need to implement `Debug`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let consumed = self.reader.reader_bytes();

        let mut out = f.debug_struct("TxtParser");
        out.field("bytes", &consumed);
        out.field("records", &self.records);
        out.finish()
    }
}

#[cfg(feature = "tokio")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))]
mod tokio {
    use async_trait::async_trait;
    use tokio::io::{AsyncBufRead, AsyncBufReadExt};
    use url::Url;

    use crate::{
        parse::{AsyncParser, TxtParser},
        Error,
    };

    #[async_trait]
    impl<R: AsyncBufRead + Unpin + Send> AsyncParser<R, Url> for TxtParser<R, Url> {
        type Error = Error;

        /// Wraps `reader` in a new parser. This constructor itself never fails.
        async fn new(reader: R) -> Result<Self, Self::Error> {
            Ok(Self::from_reader(reader))
        }

        /// Reads lines until one of them parses as a [`Url`] and returns it.
        ///
        /// Lines that fail to parse are skipped, but still count towards the record
        /// limit. Returns `Ok(None)` once the reader is exhausted.
        ///
        /// # Errors
        ///
        /// - [`Error::EntryLimit`] when the record limit has been reached and the reader
        ///   still holds unread data.
        /// - [`Error::ByteLimit`] when the byte limit has been exceeded.
        /// - Any I/O error raised by the underlying reader.
        async fn read(&mut self) -> Result<Option<Url>, Self::Error> {
            // A single buffer is reused for every line instead of allocating per iteration.
            let mut line = String::new();

            loop {
                // Only the `over` count is kept, so no `Error` value is held across the
                // `.await` below.
                let spent_quota = match self.try_if_readable() {
                    Ok(()) => None,
                    Err(Error::EntryLimit { over }) => Some(over),
                    Err(limit) => return Err(limit),
                };

                if let Some(over) = spent_quota {
                    // A source holding exactly the maximum number of records is valid, so
                    // a spent quota is only an error while unread data remains.
                    // `fill_buf` peeks at the reader without consuming anything.
                    if self.reader.fill_buf().await?.is_empty() {
                        return Ok(None);
                    }
                    return Err(Error::EntryLimit { over });
                }

                // `read_line` appends, so the previous line has to be dropped first.
                line.clear();
                if self.reader.read_line(&mut line).await? == 0 {
                    return Ok(None);
                }

                self.records += 1;
                if let Ok(address) = Url::parse(line.as_str()) {
                    return Ok(Some(address));
                }
            }
        }

        /// Consumes the parser and returns the underlying reader.
        async fn close(self) -> Result<R, Self::Error> {
            Ok(self.into_inner())
        }
    }
}

#[cfg(test)]
mod test {
    use url::Url;

    use crate::{parse::TxtParser, Error};

    /// The blocking parser yields the first URL of a two-line source.
    #[test]
    fn synk() -> Result<(), Error> {
        use crate::parse::Parser;

        let source = r#"https://www.example.com/file1.html
        https://www.example.com/file2.html"#;
        let expected = Url::parse("https://www.example.com/file1.html").ok();

        let mut parser = TxtParser::new(source.as_bytes())?;
        let first = parser.read()?;
        parser.close()?;

        assert_eq!(first, expected);
        Ok(())
    }

    /// The async parser yields the first URL of a two-line source.
    #[cfg(feature = "tokio")]
    #[tokio::test]
    async fn asynk() -> Result<(), Error> {
        use crate::parse::AsyncParser;

        let source = r#"https://www.example.com/file1.html
        https://www.example.com/file2.html"#;
        let expected = Url::parse("https://www.example.com/file1.html").ok();

        let mut parser = TxtParser::new(source.as_bytes()).await?;
        let first = parser.read().await?;
        parser.close().await?;

        assert_eq!(first, expected);
        Ok(())
    }
}