use clap::{Parser, ValueEnum};
use crw_core::config::{RendererConfig, RendererMode, StealthConfig};
use crw_core::types::{OutputFormat, ScrapeRequest};
use crw_crawl::single::scrape_url;
use crw_renderer::FallbackRenderer;
use std::collections::HashMap;
use std::sync::Arc;
// Command-line arguments for the `crw` binary.
//
// A plain `//` comment is used for this struct-level note so that it cannot
// interfere with the explicit `about` / `long_about` attributes below. The
// `///` comments on the fields are intentional: clap's derive turns them into
// the per-argument text shown by `--help`.
#[derive(Parser)]
#[command(
    name = "crw",
    about = "Scrape a URL and output markdown, JSON, HTML, or plain text",
    long_about = "Lightweight web scraper. Fetches a URL and outputs clean content to stdout.\n\nExamples:\n crw https://example.com\n crw https://example.com --format json\n crw https://example.com --raw -o page.md\n crw https://example.com --css 'article' --format html"
)]
struct Cli {
    /// URL to scrape; https:// is prepended when no scheme is given
    url: String,

    /// Output format
    #[arg(short, long, value_enum, default_value = "markdown")]
    format: Format,

    /// Write the result to FILE instead of stdout
    #[arg(short, long, value_name = "FILE")]
    output: Option<String>,

    /// Keep the whole page instead of extracting only the main content
    #[arg(long)]
    raw: bool,

    /// Enable JavaScript rendering through a browser (connects to CRW_CDP_URL when set, otherwise launches a local headless browser)
    #[arg(long)]
    js: bool,

    /// CSS selector used to select the content to extract
    #[arg(long, value_name = "SELECTOR")]
    css: Option<String>,

    /// XPath expression used to select the content to extract
    #[arg(long, value_name = "EXPR")]
    xpath: Option<String>,

    /// Proxy URL to send requests through
    #[arg(long, value_name = "URL")]
    proxy: Option<String>,

    /// Enable stealth mode with header injection
    #[arg(long)]
    stealth: bool,
}
// Output formats selectable with `--format`.
//
// The enum is fieldless, so it derives `Copy` (plus `Debug`/`PartialEq`/`Eq`
// for diagnostics and comparisons). The `///` comments on the variants are
// picked up by clap's `ValueEnum` derive as help text for each possible value.
#[derive(Clone, Copy, Debug, PartialEq, Eq, ValueEnum)]
enum Format {
    /// Page content as Markdown
    Markdown,
    /// Full scrape result as pretty-printed JSON
    Json,
    /// Page content as HTML
    Html,
    /// Raw HTML of the page
    Rawhtml,
    /// Page content as plain text
    Text,
    /// Links from the page, one per line
    Links,
}
#[tokio::main]
async fn main() {
let mut cli = Cli::parse();
if !cli.url.contains("://") {
cli.url = format!("https://{}", cli.url);
}
let mut renderer_config = RendererConfig::default();
let _browser_guards = if cli.js {
if let Ok(ws_url) = std::env::var("CRW_CDP_URL") {
renderer_config.lightpanda = Some(crw_core::config::CdpEndpoint { ws_url });
Vec::new()
} else {
let browsers = crw_renderer::browser::spawn_all_headless().await;
if browsers.is_empty() {
eprintln!(
"warning: --js requested but no browser found. \
Install LightPanda or Chrome for JS rendering. \
Falling back to HTTP."
);
}
let mut guards = Vec::new();
for (guard, ws_url, kind) in browsers {
match kind {
crw_renderer::browser::RendererKind::LightPanda => {
renderer_config.lightpanda = Some(crw_core::config::CdpEndpoint { ws_url });
}
crw_renderer::browser::RendererKind::Chrome => {
renderer_config.chrome = Some(crw_core::config::CdpEndpoint { ws_url });
}
}
guards.push(guard);
}
guards
}
} else {
renderer_config.mode = RendererMode::None;
Vec::new()
};
let stealth_config = StealthConfig {
enabled: cli.stealth,
inject_headers: cli.stealth,
..Default::default()
};
let renderer = match FallbackRenderer::new(
&renderer_config,
"crw/0.0.3",
cli.proxy.as_deref(),
&stealth_config,
) {
Ok(r) => Arc::new(r),
Err(e) => {
eprintln!("error: failed to build renderer: {e}");
std::process::exit(1);
}
};
let output_format = match cli.format {
Format::Markdown => OutputFormat::Markdown,
Format::Json => OutputFormat::Json,
Format::Html => OutputFormat::Html,
Format::Rawhtml => OutputFormat::RawHtml,
Format::Text => OutputFormat::PlainText,
Format::Links => OutputFormat::Links,
};
let req = ScrapeRequest {
url: cli.url,
formats: vec![output_format],
only_main_content: !cli.raw,
render_js: if cli.js { Some(true) } else { None },
wait_for: None,
include_tags: vec![],
exclude_tags: vec![],
json_schema: None,
headers: HashMap::new(),
css_selector: cli.css,
xpath: cli.xpath,
chunk_strategy: None,
query: None,
filter_mode: None,
top_k: None,
proxy: cli.proxy,
stealth: if cli.stealth { Some(true) } else { None },
actions: None,
extract: None,
llm_api_key: None,
llm_provider: None,
llm_model: None,
base_url: None,
summary_prompt: None,
max_content_chars: None,
renderer: None,
deadline_ms: None,
debug: None,
};
let cli_deadline = crw_core::Deadline::from_request_ms(req.deadline_ms.unwrap_or(8000));
let cli_extraction_cfg = crw_core::config::ExtractionConfig::default();
let data = match scrape_url(
&req,
&renderer,
None,
&cli_extraction_cfg,
"crw/0.0.3",
cli.stealth,
None,
cli_deadline,
)
.await
{
Ok(d) => d,
Err(e) => {
eprintln!("error: {e}");
std::process::exit(1);
}
};
let content = match cli.format {
Format::Markdown => data.markdown.unwrap_or_default(),
Format::Json => match serde_json::to_string_pretty(&data) {
Ok(s) => s,
Err(e) => {
eprintln!("error: failed to serialize JSON: {e}");
std::process::exit(1);
}
},
Format::Html => data.html.unwrap_or_default(),
Format::Rawhtml => data.raw_html.unwrap_or_default(),
Format::Text => data.plain_text.unwrap_or_default(),
Format::Links => data.links.unwrap_or_default().join("\n"),
};
match cli.output {
Some(path) => {
if let Err(e) = std::fs::write(&path, &content) {
eprintln!("error: failed to write to {path}: {e}");
std::process::exit(1);
}
}
None => print!("{content}"),
}
}