use kumo::llm::anthropic::models;
use kumo::prelude::*;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
// One scraped record. `QuotesSpider::parse` asks `res.extract(..)` for a
// `Vec<Quote>`, and this type is also the spider's `Item`.
//
// NOTE(review): plain `//` comments are used on purpose. This type derives
// `JsonSchema`, and `///` doc comments are presumably picked up by that derive
// as schema descriptions — confirm before converting these to doc comments.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
struct Quote {
// Presumably the quote body itself — inferred from the field name.
text: String,
// Presumably who the quote is attributed to — inferred from the field name.
author: String,
// Presumably the labels listed with the quote — inferred from the field name.
tags: Vec<String>,
}
/// Spider whose `parse` step hands each response to an `AnthropicClient`.
struct QuotesSpider {
/// Shared handle to the client. `main` keeps a second `Arc` to the same
/// client so it can call `total_usage()` once the crawl has finished.
client: Arc<AnthropicClient>,
}
#[async_trait::async_trait]
impl Spider for QuotesSpider {
    type Item = Quote;

    /// Identifier reported for this spider.
    fn name(&self) -> &str {
        "quotes-llm"
    }

    /// Seed URLs for the crawl: a single entry, the site root.
    fn start_urls(&self) -> Vec<String> {
        vec!["https://quotes.toscrape.com".into()]
    }

    /// Handles one fetched page.
    ///
    /// Asks `res.extract` (driven by the stored client) for a `Vec<Quote>`,
    /// then looks up the `li.next a` link. When that link has an `href`, it is
    /// resolved with `res.urljoin` and queued via `Output::follow`.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `res.extract(..)` yields.
    async fn parse(&self, res: &Response) -> Result<Output<Self::Item>, KumoError> {
        let extracted: Vec<Quote> = res.extract(self.client.as_ref()).await?;

        // Kept as one expression so every temporary produced by the selector
        // chain stays alive until the joined URL has been computed.
        let next_page = res
            .css("li.next a")
            .first()
            .and_then(|anchor| anchor.attr("href"))
            .map(|href| res.urljoin(&href));

        let page_output = Output::new().items(extracted);

        // No "next" link means only the items are emitted.
        Ok(match next_page {
            Some(url) => page_output.follow(url),
            None => page_output,
        })
    }
}
/// Entry point: configures logging, builds the client, runs the crawl with
/// `QuotesSpider`, and prints crawl statistics followed by token usage.
///
/// Exits the process with status 1 when `ANTHROPIC_API_KEY` cannot be read.
///
/// # Errors
///
/// Returns the `KumoError` produced by the crawl run, if any.
#[tokio::main]
async fn main() -> Result<(), KumoError> {
    tracing_subscriber::fmt().with_env_filter("kumo=info").init();

    // Any failure to read the variable is reported the same way.
    let api_key = match std::env::var("ANTHROPIC_API_KEY") {
        Ok(key) => key,
        Err(_) => {
            eprintln!("ANTHROPIC_API_KEY not set.");
            std::process::exit(1);
        }
    };

    let llm = AnthropicClient::new(api_key)
        .model(models::CLAUDE_HAIKU_4_5)
        .system_prompt("Extract all quotes from this quotes listing page. Each page contains multiple quotes in .quote elements.")
        .strip_scripts_and_styles(true);
    let client = Arc::new(llm);

    // The spider receives its own handle; `client` stays here so usage can be
    // read after the crawl.
    let spider = QuotesSpider {
        client: Arc::clone(&client),
    };
    let headers = DefaultHeaders::new().user_agent("kumo/0.2 (+https://github.com/wihlarkop/kumo)");

    let stats = CrawlEngine::builder()
        .concurrency(1)
        .middleware(headers)
        .store(StdoutStore)
        .run(spider)
        .await?;

    let usage = client.total_usage();

    println!(
        "Done — scraped {} items from {} pages ({} errors)",
        stats.items_scraped, stats.pages_crawled, stats.errors
    );
    println!(
        "Tokens — {} in / {} out / {} total ({} cached)",
        usage.input_tokens, usage.output_tokens, usage.total_tokens, usage.cached_input_tokens
    );

    Ok(())
}