use std::time::Duration;
use clap::{Parser, Subcommand, ValueEnum};
use kreuzcrawl::{
BatchCrawlResults, BatchScrapeResults, BrowserConfig, BrowserMode, CrawlConfig, PageAction, ProxyConfig,
batch_crawl, batch_scrape, crawl, create_engine, generate_citations, interact, map_urls, scrape,
};
#[derive(Clone, Copy, Debug, Eq, PartialEq, ValueEnum)]
enum CliBrowserMode {
Auto,
Always,
Never,
}
impl From<CliBrowserMode> for BrowserMode {
fn from(value: CliBrowserMode) -> Self {
match value {
CliBrowserMode::Auto => BrowserMode::Auto,
CliBrowserMode::Always => BrowserMode::Always,
CliBrowserMode::Never => BrowserMode::Never,
}
}
}
fn parse_browser_endpoint(value: &str) -> Result<String, String> {
if value.starts_with("ws://") || value.starts_with("wss://") {
Ok(value.to_owned())
} else {
Err(format!(
"browser endpoint must be a WebSocket URL starting with ws:// or wss://, got: {value:?}"
))
}
}
fn build_browser_config(
browser_mode: CliBrowserMode,
browser_endpoint: Option<String>,
timeout: Duration,
) -> BrowserConfig {
BrowserConfig {
mode: browser_mode.into(),
endpoint: browser_endpoint,
timeout,
..Default::default()
}
}
fn merge_json_config(config: &mut CrawlConfig, config_str: &str) -> Result<(), Box<dyn std::error::Error>> {
let json_text = if let Some(path) = config_str.strip_prefix('@') {
std::fs::read_to_string(path)?
} else {
config_str.to_string()
};
let json: serde_json::Value = serde_json::from_str(&json_text)?;
let partial: CrawlConfig = serde_json::from_value(json)?;
let mut config_json = serde_json::to_value(config.clone())?;
let partial_json = serde_json::to_value(partial)?;
if let (serde_json::Value::Object(config_map), serde_json::Value::Object(partial_map)) =
(&mut config_json, partial_json)
{
for (k, v) in partial_map {
if !v.is_null() {
config_map.insert(k, v);
}
}
}
*config = serde_json::from_value(config_json)?;
Ok(())
}
fn print_batch_scrape(results: &BatchScrapeResults, format: &str) {
if format == "markdown" {
for entry in &results.results {
if let Some(ref r) = entry.result
&& let Some(ref md) = r.markdown
{
println!("---\nURL: {}\n---\n{}\n", entry.url, md.content);
}
if let Some(ref e) = entry.error {
eprintln!("Error scraping {}: {e}", entry.url);
}
}
} else {
println!(
"{}",
serde_json::to_string_pretty(results).expect("results are serializable")
);
}
}
fn print_batch_crawl(results: &BatchCrawlResults, format: &str) {
if format == "markdown" {
for entry in &results.results {
if let Some(ref r) = entry.result {
for page in &r.pages {
if let Some(ref md) = page.markdown {
println!("---\nSeed: {}\nURL: {}\n---\n{}\n", entry.url, page.url, md.content);
}
}
}
if let Some(ref e) = entry.error {
eprintln!("Error crawling {}: {e}", entry.url);
}
}
} else {
println!(
"{}",
serde_json::to_string_pretty(results).expect("results are serializable")
);
}
}
#[derive(Parser)]
#[command(name = "kreuzcrawl", about = "High-performance web crawler and scraper", version)]
struct Cli {
#[command(subcommand)]
command: Commands,
}
#[derive(Subcommand)]
enum Commands {
Scrape {
url: String,
#[arg(long, default_value = "json")]
format: String,
#[arg(long)]
proxy: Option<String>,
#[arg(long)]
user_agent: Option<String>,
#[arg(long, default_value = "30000")]
timeout: u64,
#[arg(long)]
respect_robots_txt: bool,
#[arg(long, value_enum, default_value_t = CliBrowserMode::Auto)]
browser_mode: CliBrowserMode,
#[arg(long, value_parser = parse_browser_endpoint)]
browser_endpoint: Option<String>,
#[arg(long, value_name = "JSON")]
config: Option<String>,
},
Crawl {
#[arg(required = true)]
urls: Vec<String>,
#[arg(long, short = 'd', default_value = "2")]
depth: usize,
#[arg(long, short = 'n')]
max_pages: Option<usize>,
#[arg(long, short = 'c', default_value = "10")]
concurrent: usize,
#[arg(long, default_value = "200")]
rate_limit: u64,
#[arg(long, default_value = "json")]
format: String,
#[arg(long)]
proxy: Option<String>,
#[arg(long)]
user_agent: Option<String>,
#[arg(long, default_value = "30000")]
timeout: u64,
#[arg(long)]
respect_robots_txt: bool,
#[arg(long)]
stay_on_domain: bool,
#[arg(long, value_enum, default_value_t = CliBrowserMode::Auto)]
browser_mode: CliBrowserMode,
#[arg(long, value_parser = parse_browser_endpoint)]
browser_endpoint: Option<String>,
#[arg(long, value_name = "JSON")]
config: Option<String>,
},
Map {
url: String,
#[arg(long)]
limit: Option<usize>,
#[arg(long)]
search: Option<String>,
#[arg(long)]
respect_robots_txt: bool,
#[arg(long, default_value = "json")]
format: String,
#[arg(long, default_value = "30000")]
timeout: u64,
#[arg(long, value_enum, default_value_t = CliBrowserMode::Auto)]
browser_mode: CliBrowserMode,
#[arg(long, value_parser = parse_browser_endpoint)]
browser_endpoint: Option<String>,
#[arg(long, value_name = "JSON")]
config: Option<String>,
},
Interact {
url: String,
#[arg(long, value_name = "JSON")]
actions: String,
#[arg(long, default_value = "json")]
format: String,
#[arg(long, default_value = "30000")]
timeout: u64,
#[arg(long, value_enum, default_value_t = CliBrowserMode::Auto)]
browser_mode: CliBrowserMode,
#[arg(long, value_parser = parse_browser_endpoint)]
browser_endpoint: Option<String>,
#[arg(long, value_name = "JSON")]
config: Option<String>,
},
BatchScrape {
#[arg(required = true)]
urls: Vec<String>,
#[arg(long, short = 'c', default_value = "10")]
concurrent: usize,
#[arg(long, default_value = "json")]
format: String,
#[arg(long)]
proxy: Option<String>,
#[arg(long)]
user_agent: Option<String>,
#[arg(long, default_value = "30000")]
timeout: u64,
#[arg(long)]
respect_robots_txt: bool,
#[arg(long, value_enum, default_value_t = CliBrowserMode::Auto)]
browser_mode: CliBrowserMode,
#[arg(long, value_parser = parse_browser_endpoint)]
browser_endpoint: Option<String>,
#[arg(long, value_name = "JSON")]
config: Option<String>,
},
BatchCrawl {
#[arg(required = true)]
urls: Vec<String>,
#[arg(long, short = 'd', default_value = "2")]
depth: usize,
#[arg(long, short = 'n')]
max_pages: Option<usize>,
#[arg(long, short = 'c', default_value = "10")]
concurrent: usize,
#[arg(long, default_value = "200")]
rate_limit: u64,
#[arg(long, default_value = "json")]
format: String,
#[arg(long)]
proxy: Option<String>,
#[arg(long)]
user_agent: Option<String>,
#[arg(long, default_value = "30000")]
timeout: u64,
#[arg(long)]
respect_robots_txt: bool,
#[arg(long)]
stay_on_domain: bool,
#[arg(long, value_enum, default_value_t = CliBrowserMode::Auto)]
browser_mode: CliBrowserMode,
#[arg(long, value_parser = parse_browser_endpoint)]
browser_endpoint: Option<String>,
#[arg(long, value_name = "JSON")]
config: Option<String>,
},
Download {
url: String,
#[arg(long)]
max_size: Option<usize>,
#[arg(long, default_value = "30000")]
timeout: u64,
#[arg(long, value_enum, default_value_t = CliBrowserMode::Auto)]
browser_mode: CliBrowserMode,
#[arg(long, value_parser = parse_browser_endpoint)]
browser_endpoint: Option<String>,
#[arg(long, value_name = "JSON")]
config: Option<String>,
},
Citations {
input: String,
},
Version {},
#[cfg(feature = "api")]
Serve {
#[arg(long, default_value = "0.0.0.0")]
host: String,
#[arg(long, default_value = "3000")]
port: u16,
},
#[cfg(feature = "mcp")]
Mcp {},
}
#[tokio::main]
async fn main() {
let cli = Cli::parse();
match cli.command {
Commands::Scrape {
url,
format,
proxy,
user_agent,
timeout,
respect_robots_txt,
browser_mode,
browser_endpoint,
config: config_str,
} => {
let timeout_duration = Duration::from_millis(timeout);
let mut config = CrawlConfig {
user_agent,
request_timeout: timeout_duration,
respect_robots_txt,
proxy: proxy.map(|url| ProxyConfig {
url,
username: None,
password: None,
}),
browser: build_browser_config(browser_mode, browser_endpoint, timeout_duration),
..Default::default()
};
if let Some(config_json) = config_str
&& let Err(e) = merge_json_config(&mut config, &config_json)
{
eprintln!("Error: invalid config: {e}");
std::process::exit(1);
}
let handle = create_engine(Some(config)).expect("failed to create crawl engine");
match scrape(&handle, &url).await {
Ok(result) => {
if format == "markdown" {
if let Some(ref md) = result.markdown {
println!("{}", md.content);
} else {
eprintln!("No markdown content available");
}
} else {
println!(
"{}",
serde_json::to_string_pretty(&result).expect("result is serializable")
);
}
}
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
}
Commands::Crawl {
urls,
depth,
max_pages,
concurrent,
rate_limit,
format,
proxy,
user_agent,
timeout,
respect_robots_txt,
stay_on_domain,
browser_mode,
browser_endpoint,
config: config_str,
} => {
let timeout_duration = Duration::from_millis(timeout);
let mut config = CrawlConfig {
max_depth: Some(depth),
max_pages,
max_concurrent: Some(concurrent),
rate_limit_ms: Some(rate_limit),
user_agent,
request_timeout: timeout_duration,
respect_robots_txt,
stay_on_domain,
proxy: proxy.map(|url| ProxyConfig {
url,
username: None,
password: None,
}),
browser: build_browser_config(browser_mode, browser_endpoint, timeout_duration),
..Default::default()
};
if let Some(config_json) = config_str
&& let Err(e) = merge_json_config(&mut config, &config_json)
{
eprintln!("Error: invalid config: {e}");
std::process::exit(1);
}
let handle = create_engine(Some(config)).expect("failed to create crawl engine");
if urls.len() == 1 {
match crawl(&handle, &urls[0]).await {
Ok(result) => {
if format == "markdown" {
for page in &result.pages {
if let Some(ref md) = page.markdown {
println!("---\nURL: {}\n---\n{}\n", page.url, md.content);
}
}
} else {
println!(
"{}",
serde_json::to_string_pretty(&result).expect("result is serializable")
);
}
}
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
} else {
match batch_crawl(&handle, urls).await {
Ok(results) => print_batch_crawl(&results, &format),
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
}
}
Commands::Map {
url,
limit,
search,
respect_robots_txt,
format,
timeout,
browser_mode,
browser_endpoint,
config: config_str,
} => {
let timeout_duration = Duration::from_millis(timeout);
let mut config = CrawlConfig {
respect_robots_txt,
map_limit: limit,
map_search: search,
request_timeout: timeout_duration,
browser: build_browser_config(browser_mode, browser_endpoint, timeout_duration),
..Default::default()
};
if let Some(config_json) = config_str
&& let Err(e) = merge_json_config(&mut config, &config_json)
{
eprintln!("Error: invalid config: {e}");
std::process::exit(1);
}
let handle = create_engine(Some(config)).expect("failed to create crawl engine");
match map_urls(&handle, &url).await {
Ok(result) => {
if format == "markdown" {
for url_entry in &result.urls {
println!("{}", url_entry.url);
}
} else {
println!(
"{}",
serde_json::to_string_pretty(&result).expect("result is serializable")
);
}
}
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
}
Commands::Interact {
url,
actions,
format,
timeout,
browser_mode,
browser_endpoint,
config: config_str,
} => {
let timeout_duration = Duration::from_millis(timeout);
let mut config = CrawlConfig {
request_timeout: timeout_duration,
browser: build_browser_config(browser_mode, browser_endpoint, timeout_duration),
..Default::default()
};
if let Some(config_json) = config_str
&& let Err(e) = merge_json_config(&mut config, &config_json)
{
eprintln!("Error: invalid config: {e}");
std::process::exit(1);
}
let parsed_actions: Vec<PageAction> = match serde_json::from_str(&actions) {
Ok(value) => value,
Err(e) => {
eprintln!("Error: invalid actions JSON: {e}");
std::process::exit(1);
}
};
let handle = create_engine(Some(config)).expect("failed to create crawl engine");
match interact(&handle, &url, parsed_actions).await {
Ok(result) => {
if format == "markdown" {
println!("{}", result.final_html);
} else {
let wrapped = serde_json::json!({ "interaction": result });
println!(
"{}",
serde_json::to_string_pretty(&wrapped).expect("result is serializable")
);
}
}
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
}
Commands::BatchScrape {
urls,
concurrent,
format,
proxy,
user_agent,
timeout,
respect_robots_txt,
browser_mode,
browser_endpoint,
config: config_str,
} => {
let timeout_duration = Duration::from_millis(timeout);
let mut config = CrawlConfig {
max_concurrent: Some(concurrent),
user_agent,
request_timeout: timeout_duration,
respect_robots_txt,
proxy: proxy.map(|url| ProxyConfig {
url,
username: None,
password: None,
}),
browser: build_browser_config(browser_mode, browser_endpoint, timeout_duration),
..Default::default()
};
if let Some(config_json) = config_str
&& let Err(e) = merge_json_config(&mut config, &config_json)
{
eprintln!("Error: invalid config: {e}");
std::process::exit(1);
}
let handle = create_engine(Some(config)).expect("failed to create crawl engine");
match batch_scrape(&handle, urls).await {
Ok(results) => print_batch_scrape(&results, &format),
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
}
Commands::BatchCrawl {
urls,
depth,
max_pages,
concurrent,
rate_limit,
format,
proxy,
user_agent,
timeout,
respect_robots_txt,
stay_on_domain,
browser_mode,
browser_endpoint,
config: config_str,
} => {
let timeout_duration = Duration::from_millis(timeout);
let mut config = CrawlConfig {
max_depth: Some(depth),
max_pages,
max_concurrent: Some(concurrent),
rate_limit_ms: Some(rate_limit),
user_agent,
request_timeout: timeout_duration,
respect_robots_txt,
stay_on_domain,
proxy: proxy.map(|url| ProxyConfig {
url,
username: None,
password: None,
}),
browser: build_browser_config(browser_mode, browser_endpoint, timeout_duration),
..Default::default()
};
if let Some(config_json) = config_str
&& let Err(e) = merge_json_config(&mut config, &config_json)
{
eprintln!("Error: invalid config: {e}");
std::process::exit(1);
}
let handle = create_engine(Some(config)).expect("failed to create crawl engine");
match batch_crawl(&handle, urls).await {
Ok(results) => print_batch_crawl(&results, &format),
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
}
Commands::Download {
url,
max_size,
timeout,
browser_mode,
browser_endpoint,
config: config_str,
} => {
let timeout_duration = Duration::from_millis(timeout);
let mut config = CrawlConfig {
request_timeout: timeout_duration,
download_documents: true,
document_max_size: max_size,
browser: build_browser_config(browser_mode, browser_endpoint, timeout_duration),
..Default::default()
};
if let Some(config_json) = config_str
&& let Err(e) = merge_json_config(&mut config, &config_json)
{
eprintln!("Error: invalid config: {e}");
std::process::exit(1);
}
let handle = create_engine(Some(config)).expect("failed to create crawl engine");
match scrape(&handle, &url).await {
Ok(result) => {
let output = if let Some(ref doc) = result.downloaded_document {
serde_json::json!({
"url": doc.url,
"mime_type": doc.mime_type,
"size": doc.size,
"filename": doc.filename,
"content_hash": doc.content_hash,
})
} else {
serde_json::json!({
"url": url,
"content_type": result.content_type,
"status_code": result.status_code,
"body_size": result.body_size,
"note": "URL returned HTML content, not a downloadable document",
})
};
println!(
"{}",
serde_json::to_string_pretty(&output).expect("output is serializable")
);
}
Err(e) => {
eprintln!("Error: {e}");
std::process::exit(1);
}
}
}
Commands::Citations { input } => {
let markdown = if let Some(path) = input.strip_prefix('@') {
match std::fs::read_to_string(path) {
Ok(text) => text,
Err(e) => {
eprintln!("Error: cannot read {path}: {e}");
std::process::exit(1);
}
}
} else {
input
};
let result = generate_citations(&markdown);
println!(
"{}",
serde_json::to_string_pretty(&result).expect("result is serializable")
);
}
Commands::Version {} => {
let response = serde_json::json!({ "version": env!("CARGO_PKG_VERSION") });
println!(
"{}",
serde_json::to_string_pretty(&response).expect("version is serializable")
);
}
#[cfg(feature = "api")]
Commands::Serve { host, port } => {
eprintln!("Starting REST API server on {host}:{port}");
#[cfg(feature = "mcp")]
eprintln!("MCP Streamable HTTP transport available at http://{host}:{port}/mcp");
if let Err(e) = kreuzcrawl::serve_api(&host, port, CrawlConfig::default()).await {
eprintln!("Server error: {e}");
std::process::exit(1);
}
}
#[cfg(feature = "mcp")]
Commands::Mcp {} => {
eprintln!("Starting MCP server (stdio transport)");
if let Err(e) = kreuzcrawl::start_mcp_server().await {
eprintln!("MCP server error: {e}");
std::process::exit(1);
}
}
}
}
#[cfg(test)]
mod tests {
use std::time::Duration;
use super::{CliBrowserMode, build_browser_config, parse_browser_endpoint};
use kreuzcrawl::BrowserMode;
const DEFAULT_TIMEOUT: Duration = Duration::from_millis(30_000);
#[test]
fn maps_cli_browser_mode_to_engine_mode() {
assert_eq!(
build_browser_config(CliBrowserMode::Auto, None, DEFAULT_TIMEOUT).mode,
BrowserMode::Auto
);
assert_eq!(
build_browser_config(CliBrowserMode::Always, None, DEFAULT_TIMEOUT).mode,
BrowserMode::Always
);
assert_eq!(
build_browser_config(CliBrowserMode::Never, None, DEFAULT_TIMEOUT).mode,
BrowserMode::Never
);
}
#[test]
fn preserves_browser_endpoint() {
let endpoint = Some("ws://127.0.0.1:9222/devtools/browser/test".to_string());
let config = build_browser_config(CliBrowserMode::Auto, endpoint.clone(), DEFAULT_TIMEOUT);
assert_eq!(config.endpoint, endpoint);
}
#[test]
fn timeout_is_propagated_to_browser_config() {
let timeout = Duration::from_millis(5_000);
let config = build_browser_config(CliBrowserMode::Auto, None, timeout);
assert_eq!(config.timeout, timeout);
}
#[test]
fn parse_browser_endpoint_accepts_ws_urls() {
assert!(parse_browser_endpoint("ws://127.0.0.1:9222/devtools/browser/abc").is_ok());
assert!(parse_browser_endpoint("wss://remote.host/devtools/browser/abc").is_ok());
}
#[test]
fn parse_browser_endpoint_rejects_non_ws_urls() {
assert!(parse_browser_endpoint("http://127.0.0.1:9222").is_err());
assert!(parse_browser_endpoint("https://remote.host").is_err());
assert!(parse_browser_endpoint("127.0.0.1:9222").is_err());
}
use clap::Parser;
use super::{Cli, Commands};
#[test]
fn parses_batch_scrape_subcommand() {
let cli = Cli::try_parse_from(["kreuzcrawl", "batch-scrape", "https://a.com", "https://b.com"]).unwrap();
match cli.command {
Commands::BatchScrape { urls, concurrent, .. } => {
assert_eq!(urls, vec!["https://a.com".to_string(), "https://b.com".to_string()]);
assert_eq!(concurrent, 10);
}
_ => panic!("expected BatchScrape"),
}
}
#[test]
fn parses_batch_crawl_subcommand_with_depth() {
let cli = Cli::try_parse_from([
"kreuzcrawl",
"batch-crawl",
"https://a.com",
"https://b.com",
"--depth",
"3",
])
.unwrap();
match cli.command {
Commands::BatchCrawl { urls, depth, .. } => {
assert_eq!(urls.len(), 2);
assert_eq!(depth, 3);
}
_ => panic!("expected BatchCrawl"),
}
}
#[test]
fn parses_download_subcommand() {
let cli =
Cli::try_parse_from(["kreuzcrawl", "download", "https://a.com/doc.pdf", "--max-size", "1024"]).unwrap();
match cli.command {
Commands::Download { url, max_size, .. } => {
assert_eq!(url, "https://a.com/doc.pdf");
assert_eq!(max_size, Some(1024));
}
_ => panic!("expected Download"),
}
}
#[test]
fn parses_citations_subcommand() {
let cli = Cli::try_parse_from(["kreuzcrawl", "citations", "@notes.md"]).unwrap();
match cli.command {
Commands::Citations { input } => assert_eq!(input, "@notes.md"),
_ => panic!("expected Citations"),
}
}
#[test]
fn parses_version_subcommand() {
let cli = Cli::try_parse_from(["kreuzcrawl", "version"]).unwrap();
assert!(matches!(cli.command, Commands::Version {}));
}
#[test]
fn batch_scrape_requires_at_least_one_url() {
assert!(Cli::try_parse_from(["kreuzcrawl", "batch-scrape"]).is_err());
}
}