#[path = "showcase/support.rs"]
mod showcase;
use showcase::{ShowcaseItem, ShowcaseSpider, prepare_output_dir};
use spider_lib::prelude::*;
/// Constructs the `JsonPipeline` for `ShowcaseItem` values, targeting
/// `output/showcase.json`.
///
/// # Errors
///
/// Returns whatever `PipelineError` `JsonPipeline::new` reports.
fn build_json_pipeline() -> Result<JsonPipeline<ShowcaseItem>, PipelineError> {
    let destination = "output/showcase.json";
    JsonPipeline::new(destination)
}
/// Constructs the `JsonlPipeline` for `ShowcaseItem` values, targeting
/// `output/showcase.jsonl`.
///
/// # Errors
///
/// Returns whatever `PipelineError` `JsonlPipeline::new` reports.
fn build_jsonl_pipeline() -> Result<JsonlPipeline<ShowcaseItem>, PipelineError> {
    let destination = "output/showcase.jsonl";
    JsonlPipeline::new(destination)
}
/// Constructs the `CsvPipeline` for `ShowcaseItem` values, targeting
/// `output/showcase.csv`.
///
/// # Errors
///
/// Returns whatever `PipelineError` `CsvPipeline::new` reports.
fn build_csv_pipeline() -> Result<CsvPipeline<ShowcaseItem>, PipelineError> {
    let destination = "output/showcase.csv";
    CsvPipeline::new(destination)
}
/// Constructs the `SqlitePipeline` for `ShowcaseItem` values, targeting
/// `output/showcase.sqlite`.
///
/// The second argument, `"showcase_items"`, is presumably the table name
/// the pipeline writes into (confirm against the `SqlitePipeline::new` docs).
///
/// # Errors
///
/// Returns whatever `PipelineError` `SqlitePipeline::new` reports.
fn build_sqlite_pipeline() -> Result<SqlitePipeline<ShowcaseItem>, PipelineError> {
    let database = "output/showcase.sqlite";
    let table = "showcase_items";
    SqlitePipeline::new(database, table)
}
/// Constructs the `StreamJsonPipeline` for `ShowcaseItem` values, targeting
/// `output/showcase-stream.json`.
///
/// # Errors
///
/// Returns whatever `PipelineError` `StreamJsonPipeline::new` reports.
fn build_stream_json_pipeline() -> Result<StreamJsonPipeline<ShowcaseItem>, PipelineError> {
    let destination = "output/showcase-stream.json";
    StreamJsonPipeline::new(destination)
}
/// Entry point of the pipelines showcase.
///
/// Prepares the output directory, assembles a crawler around
/// `ShowcaseSpider` with the transform, validation, deduplication, console
/// and file/database pipelines registered in that order, runs the crawl and
/// prints the crawler state summary.
///
/// # Errors
///
/// Propagates (via `?`) any failure from preparing the output directory,
/// constructing one of the output pipelines, building the crawler, or
/// running the crawl.
#[tokio::main]
async fn main() -> Result<(), SpiderError> {
    prepare_output_dir()?;

    // Item clean-up stage: trims `title` and supplies a default for `note`.
    let transform_stage = TransformPipeline::new()
        .with_operation(TransformOperation::Trim {
            field: "title".into(),
        })
        .with_operation(TransformOperation::SetDefault {
            field: "note".into(),
            value: serde_json::json!("generated by showcase_pipelines example"),
        });

    // Validation stage: rules for the `title`, `status` and `body_bytes` fields.
    let validation_stage = ValidationPipeline::new()
        .with_rule("title", ValidationRule::Required)
        .with_rule("title", ValidationRule::NonEmptyString)
        .with_rule("status", ValidationRule::Type(JsonType::Number))
        .with_rule("body_bytes", ValidationRule::MinNumber(1.0));

    // The fallible output pipelines are constructed inline, in registration
    // order, so the first construction failure aborts before any later one runs.
    let crawler = CrawlerBuilder::new(ShowcaseSpider)
        .limit(1)
        .log_level(log::LevelFilter::Info)
        .add_pipeline(transform_stage)
        .add_pipeline(validation_stage)
        .add_pipeline(DeduplicationPipeline::new(["url"]))
        .add_pipeline(ConsolePipeline::new())
        .add_pipeline(build_json_pipeline()?)
        .add_pipeline(build_jsonl_pipeline()?)
        .add_pipeline(build_csv_pipeline()?)
        .add_pipeline(build_sqlite_pipeline()?)
        .add_pipeline(build_stream_json_pipeline()?)
        .build()
        .await?;

    // The state handle is taken before the crawl starts so the summary can be
    // read afterwards (presumably `start_crawl` consumes the crawler — confirm).
    let state = crawler.state_arc();
    crawler.start_crawl().await?;

    let summary = state.summary();
    println!("showcase pipelines summary: {}", summary);
    Ok(())
}