turboscraper 0.1.1

A high-performance, concurrent web scraping framework for Rust with built-in support for retries, storage backends, and concurrent request handling
Documentation
use crate::core::spider::SpiderConfig;
use crate::http::request::HttpRequest;
use crate::{HttpResponse, ScraperError, ScraperResult, StatsTracker};
use async_trait::async_trait;
use log::{debug, info, warn};
use std::sync::Arc;
use tokio::time::sleep;

#[async_trait]
pub trait Scraper: Send + Sync {
    async fn fetch_single(
        &self,
        request: HttpRequest,
        config: &SpiderConfig,
    ) -> ScraperResult<HttpResponse>;
    fn box_clone(&self) -> Box<dyn Scraper>;
    fn stats(&self) -> &StatsTracker;
    fn set_stats(&mut self, stats: Arc<StatsTracker>);

    async fn fetch(
        &self,
        request: HttpRequest,
        config: &SpiderConfig,
    ) -> ScraperResult<HttpResponse> {
        let url = request.url.clone();

        loop {
            info!("Fetching URL: {} [{}]", url, request.method);
            let response = self.fetch_single(request.clone(), config).await?;
            debug!(
                "Received response: status={}, body_length={}",
                response.status,
                response.decoded_body.len()
            );

            if let Some((category, delay)) = config.retry_config.should_retry_request(
                &url,
                response.status,
                &response.decoded_body,
            ) {
                self.stats().record_retry(format!("{:?}", category));
                let state = config.retry_config.get_retry_state(&url);
                let attempt = state.counts.get(&category).unwrap();
                let max_retries = config
                    .retry_config
                    .categories
                    .get(&category)
                    .map(|c| c.max_retries)
                    .unwrap_or(0);

                if attempt >= &max_retries {
                    return Err((
                        ScraperError::MaxRetriesReached {
                            category: category.clone(),
                            retry_count: *attempt,
                            url: Box::new(url.clone()),
                        },
                        Box::new(request),
                    ));
                }

                warn!(
                    "Retry triggered for URL: {} (category={:?}, attempt={}/{}, delay={:?})",
                    url, category, attempt, max_retries, delay
                );

                sleep(delay).await;
                continue;
            }

            let state = config.retry_config.get_retry_state(&url);
            info!(
                "Request completed for URL: {} (total_retries={}, status={})",
                url, state.total_retries, response.status
            );
            debug!("Retry history for {}: {:?}", url, state.counts);

            return Ok(HttpResponse {
                retry_count: state.total_retries,
                retry_history: state.counts,
                ..response
            });
        }
    }
}