//! spider-lib 3.0.4
//!
//! A Rust-based web scraping framework inspired by Scrapy (Python).
//!
//! Documentation
#[path = "showcase/support.rs"]
mod showcase;

use showcase::{ShowcaseItem, ShowcaseSpider, prepare_output_dir};
use spider_lib::prelude::*;

/// Creates the JSON output pipeline targeting `output/showcase.json`.
fn build_json_pipeline() -> Result<JsonPipeline<ShowcaseItem>, PipelineError> {
    let path = "output/showcase.json";
    JsonPipeline::new(path)
}

/// Creates the JSON-lines output pipeline targeting `output/showcase.jsonl`.
fn build_jsonl_pipeline() -> Result<JsonlPipeline<ShowcaseItem>, PipelineError> {
    let path = "output/showcase.jsonl";
    JsonlPipeline::new(path)
}

/// Creates the CSV output pipeline targeting `output/showcase.csv`.
fn build_csv_pipeline() -> Result<CsvPipeline<ShowcaseItem>, PipelineError> {
    let path = "output/showcase.csv";
    CsvPipeline::new(path)
}

/// Creates the SQLite output pipeline writing to the `showcase_items`
/// table inside `output/showcase.sqlite`.
fn build_sqlite_pipeline() -> Result<SqlitePipeline<ShowcaseItem>, PipelineError> {
    let db_path = "output/showcase.sqlite";
    let table = "showcase_items";
    SqlitePipeline::new(db_path, table)
}

/// Creates the streaming JSON output pipeline targeting
/// `output/showcase-stream.json`.
fn build_stream_json_pipeline() -> Result<StreamJsonPipeline<ShowcaseItem>, PipelineError> {
    let path = "output/showcase-stream.json";
    StreamJsonPipeline::new(path)
}

/// Entry point for the pipeline showcase example.
///
/// Prepares the output directory, builds a crawler wired with every
/// bundled pipeline kind (transform, validation, deduplication, console,
/// and five file-backed formats), runs the crawl (capped via `.limit(1)`),
/// and prints a summary of the final crawl state.
///
/// # Errors
///
/// Propagates any `SpiderError` from output-dir preparation, pipeline
/// construction, crawler building, or the crawl itself.
#[tokio::main]
async fn main() -> Result<(), SpiderError> {
    prepare_output_dir()?;

    // Item normalization: trim the title and fill in a default note.
    let transform = TransformPipeline::new()
        .with_operation(TransformOperation::Trim {
            field: "title".into(),
        })
        .with_operation(TransformOperation::SetDefault {
            field: "note".into(),
            value: serde_json::json!("generated by showcase_pipelines example"),
        });

    // Item validation: title must be a non-empty string, status numeric,
    // and the body must contain at least one byte.
    let validation = ValidationPipeline::new()
        .with_rule("title", ValidationRule::Required)
        .with_rule("title", ValidationRule::NonEmptyString)
        .with_rule("status", ValidationRule::Type(JsonType::Number))
        .with_rule("body_bytes", ValidationRule::MinNumber(1.0));

    // Pipelines run in registration order: transform -> validate ->
    // dedupe -> console -> the five file-format sinks.
    let crawler = CrawlerBuilder::new(ShowcaseSpider)
        .limit(1)
        .log_level(log::LevelFilter::Info)
        .add_pipeline(transform)
        .add_pipeline(validation)
        .add_pipeline(DeduplicationPipeline::new(["url"]))
        .add_pipeline(ConsolePipeline::new())
        .add_pipeline(build_json_pipeline()?)
        .add_pipeline(build_jsonl_pipeline()?)
        .add_pipeline(build_csv_pipeline()?)
        .add_pipeline(build_sqlite_pipeline()?)
        .add_pipeline(build_stream_json_pipeline()?)
        .build()
        .await?;

    // Grab a handle to the shared state before the crawl consumes fewer
    // borrows of `crawler`, then run to completion.
    let state = crawler.state_arc();
    crawler.start_crawl().await?;

    println!("showcase pipelines summary: {}", state.summary());

    Ok(())
}