//! crawn 0.3.0
//!
//! A utility for web crawling and scraping.
use crate::error::Res;
use std::collections::{HashSet, VecDeque};
use std::future::Future;

/// Asynchronous store of URLs: a pending queue plus a way to record URLs as seen.
///
/// Every method takes `&mut self` and returns a future (required to be
/// `Send + Sync`) that resolves to a [`Res`]. The per-method notes below
/// describe what [`InMemoryRepo`] does; other implementations are expected to
/// follow the same contract, but that is not enforced by the trait itself.
pub trait UrlRepo {
    /// Offers `url` to the repository.
    ///
    /// [`InMemoryRepo`] appends it to the back of its queue, silently ignoring
    /// empty URLs and URLs it has already recorded as visited.
    fn add(&mut self, url: String) -> impl Future<Output = Res<()>> + Send + Sync;

    /// Removes and returns the next queued URL.
    ///
    /// [`InMemoryRepo`] takes it from the front of its queue and resolves to
    /// `Ok(None)` when the queue is empty.
    fn pop(&mut self) -> impl Future<Output = Res<Option<String>>> + Send + Sync;

    /// Puts `url` back so that it is the next one handed out.
    ///
    /// [`InMemoryRepo`] pushes it onto the front of its queue without
    /// consulting the visited set.
    // NOTE(review): presumably used to re-queue a URL after a failed attempt — confirm with callers.
    fn kick(&mut self, url: String) -> impl Future<Output = Res<()>> + Send + Sync;

    /// Records `url` as visited.
    ///
    /// In [`InMemoryRepo`] a later `add` of the same URL is then ignored.
    fn mark(&mut self, url: String) -> impl Future<Output = Res<()>> + Send + Sync;
}

/// URL repository kept entirely in process memory.
///
/// Holds a double-ended queue of pending URLs together with the set of URLs
/// already recorded as visited. `Default` yields an empty repository; `Debug`
/// is derived so the state can be inspected in logs and failed assertions.
#[derive(Debug, Default)]
pub struct InMemoryRepo {
    /// Pending URLs in hand-out order (front is returned first).
    urls: VecDeque<String>,
    /// URLs already recorded as visited; consulted to avoid queueing them again.
    visited: HashSet<String>,
}

impl UrlRepo for InMemoryRepo {
    /// Queues `url` at the back unless it is empty or already recorded as visited.
    ///
    /// A queued URL is recorded as visited immediately, so adding it a second
    /// time is a no-op. Always resolves to `Ok(())`.
    async fn add(&mut self, url: String) -> Res<()> {
        // Nothing to do for blank input or for a URL that was seen before.
        if url.is_empty() || self.visited.contains(&url) {
            return Ok(());
        }

        // The literal "M" is queued but deliberately never recorded as visited,
        // so it can be added any number of times.
        // NOTE(review): "M" looks like a sentinel used by the caller — confirm its meaning.
        if url != "M" {
            self.visited.insert(url.clone());
        }

        self.urls.push_back(url);
        Ok(())
    }

    /// Removes and returns the URL at the front of the queue, or `None` if the queue is empty.
    async fn pop(&mut self) -> Res<Option<String>> {
        let next = self.urls.pop_front();
        Ok(next)
    }

    /// Pushes `url` onto the front of the queue so the next `pop` returns it.
    ///
    /// The visited set is neither checked nor updated.
    async fn kick(&mut self, url: String) -> Res<()> {
        let queue = &mut self.urls;
        queue.push_front(url);
        Ok(())
    }

    /// Records `url` in the visited set; recording it again has no further effect.
    async fn mark(&mut self, url: String) -> Res<()> {
        let seen = &mut self.visited;
        seen.insert(url);
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use crate::{
        InMemoryRepo, UrlRepo,
        error::{Res, ResExt},
    };

    /// Exercises `InMemoryRepo` end to end: queue order, duplicate suppression,
    /// front re-insertion via `kick`, and suppression of marked URLs.
    #[tokio::test]
    async fn test_inmemoryrepo() -> Res<()> {
        let mut repo = InMemoryRepo::default();

        let expected: Vec<String> = (0..50)
            .map(|i| format!("https://example.com/index{}.html", i))
            .collect();

        for url in &expected {
            repo.add(url.clone())
                .await
                .context("Failed to add URL to repo")?;
        }

        // Re-adding a URL that is already known must not queue it a second time.
        repo.add(expected[0].clone())
            .await
            .context("Failed to add URL to repo")?;

        let mut popped = Vec::with_capacity(expected.len());
        while let Some(url) = repo.pop().await.context("Failed to pop URL from repo")? {
            popped.push(url);
        }

        // URLs come back in insertion order, each exactly once.
        assert_eq!(popped, expected);

        // A kicked URL jumps ahead of anything already queued.
        repo.add("https://example.com/last.html".to_string())
            .await
            .context("Failed to add URL to repo")?;
        repo.kick("https://example.com/retry.html".to_string())
            .await
            .context("Failed to kick URL back into repo")?;

        let first = repo.pop().await.context("Failed to pop URL from repo")?;
        assert_eq!(first.as_deref(), Some("https://example.com/retry.html"));
        let second = repo.pop().await.context("Failed to pop URL from repo")?;
        assert_eq!(second.as_deref(), Some("https://example.com/last.html"));

        // A URL marked as visited is ignored by a later add.
        repo.mark("https://example.com/seen.html".to_string())
            .await
            .context("Failed to mark URL as visited")?;
        repo.add("https://example.com/seen.html".to_string())
            .await
            .context("Failed to add URL to repo")?;

        let none = repo.pop().await.context("Failed to pop URL from repo")?;
        assert_eq!(none, None);

        Ok(())
    }
}