use clap::{Parser, Subcommand};
use crw_core::config::AppConfig;
use crw_server::state::AppState;
use tracing_subscriber::EnvFilter;
// Command-line interface of the `crw-server` binary, parsed with clap's derive API.
// NOTE: plain `//` comments are used here on purpose — clap's derive macros pick up
// `///` doc comments as help text, which would change the generated CLI output.
#[derive(Parser)]
#[command(name = "crw-server", about = "CRW web scraper API server")]
struct Cli {
// Optional subcommand; when it is absent, `main` starts the API server.
#[command(subcommand)]
command: Option<Commands>,
}
// Subcommands accepted by the binary.
// NOTE: kept as `//` comments so clap does not turn them into subcommand help text.
#[derive(Subcommand)]
enum Commands {
// Handled in `main` by awaiting `crw_server::setup::run_setup()`.
Setup,
}
#[tokio::main]
async fn main() {
crw_crawl::pdf::run_sandbox_worker_if_invoked();
let cli = Cli::parse();
match cli.command {
Some(Commands::Setup) => {
crw_server::setup::run_setup().await;
}
None => {
run_server().await;
}
}
}
/// Boots the HTTP API server and returns once it has shut down.
///
/// Sequence:
/// 1. Install the tracing subscriber and initialise metrics.
/// 2. Load [`AppConfig`], apply the PDF limits and log the renderer / extraction /
///    search configuration.
/// 3. Refuse to boot when `CRW_DISABLE_SERVER_LLM_KEY` is truthy while an
///    `[extraction.llm].api_key` is configured.
/// 4. Build [`AppState`], spawn a best-effort SearXNG reachability probe and log the
///    effective request deadlines.
/// 5. Bind the listener, serve with graceful shutdown, then drain the Chrome pool.
///
/// Every fatal failure (configuration, state construction, bind, serve error) is
/// logged and ends the process with exit code 1. A serve error still drains the
/// Chrome pool before exiting.
async fn run_server() {
    // Fall back to the `info` level when no valid filter can be read from the environment.
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")),
        )
        .init();
    crw_core::metrics::init();

    let config = match AppConfig::load() {
        Ok(c) => c,
        Err(e) => {
            tracing::error!("Failed to load configuration: {e}");
            std::process::exit(1);
        }
    };
    crw_crawl::pdf::configure_limits(&config.document);

    let addr = format!("{}:{}", config.server.host, config.server.port);
    tracing::info!("Starting CRW on {addr}");

    // --- Renderer configuration summary -------------------------------------------
    tracing::info!("Renderer mode: {:?}", config.renderer.mode);
    tracing::info!(
        "Renderer render_js_default: {:?}",
        config.renderer.render_js_default
    );
    if let Some(lp) = &config.renderer.lightpanda {
        tracing::info!("Lightpanda CDP: {}", lp.ws_url);
    }
    if let Some(ch) = &config.renderer.chrome {
        tracing::info!("Chrome CDP: {}", ch.ws_url);
    }
    // `var_os` detects the variable even when its value is not valid UTF-8, so the
    // warning is not silently skipped for an oddly-encoded value.
    if std::env::var_os("CRW_CDP_URL").is_some() {
        tracing::warn!(
            "CRW_CDP_URL is set but is only honored by `crw` (CLI). \
             In server/MCP mode use [renderer.lightpanda.ws_url] / [renderer.chrome.ws_url] \
             or CRW_RENDERER__LIGHTPANDA__WS_URL / CRW_RENDERER__CHROME__WS_URL."
        );
    }

    // --- Extraction / search summary ----------------------------------------------
    if config.extraction.llm.is_some() {
        tracing::info!("LLM structured extraction: enabled");
    }
    let (search_level, search_msg) = crw_server::diagnostics::search_startup_status(&config.search);
    match search_level {
        tracing::Level::WARN => tracing::warn!("{search_msg}"),
        _ => tracing::info!("{search_msg}"),
    }

    // --- Guard: server-side LLM key must not coexist with the disable flag --------
    // The flag counts as set for "1" or any ASCII-case variant of "true".
    let disable_server_key = std::env::var("CRW_DISABLE_SERVER_LLM_KEY")
        .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
        .unwrap_or(false);
    if disable_server_key
        && config
            .extraction
            .llm
            .as_ref()
            .is_some_and(|c| !c.api_key.is_empty())
    {
        tracing::error!(
            "CRW_DISABLE_SERVER_LLM_KEY=1 but [extraction.llm].api_key is also configured. \
             This is forbidden in SaaS-fronted deploys (refusing to boot)."
        );
        std::process::exit(1);
    }

    let state = match AppState::new(config) {
        Ok(s) => s,
        Err(e) => {
            tracing::error!("Failed to build application state: {e}");
            std::process::exit(1);
        }
    };

    // Query the renderer list once and reuse it for both the log line and the check.
    let js_renderers = state.renderer.js_renderer_names();
    tracing::info!("JS renderers in fallback order: {:?}", js_renderers);
    if js_renderers.is_empty() {
        tracing::warn!("No CDP renderer active — JS rendering disabled");
    }

    // --- Best-effort SearXNG reachability probe -----------------------------------
    // Runs in a detached task so a slow or dead search backend never delays startup;
    // the outcome is only logged.
    if state.config.search.enabled
        && let Some(raw_url) = state.config.search.searxng_url.clone()
    {
        let origin = crw_server::diagnostics::sanitize_url_origin(&raw_url);
        tokio::spawn(async move {
            let probe = reqwest::Client::builder()
                .timeout(std::time::Duration::from_secs(3))
                .build();
            match probe {
                Ok(client) => {
                    let healthz = format!("{origin}/healthz");
                    match client.get(&healthz).send().await {
                        Ok(resp) if resp.status().is_success() => {
                            tracing::info!("search: SearXNG reachable at {origin}");
                        }
                        Ok(resp) => {
                            tracing::warn!(
                                "search: SearXNG at {origin} answered /healthz with {} — \
                                 search calls may fail until it is healthy",
                                resp.status()
                            );
                        }
                        Err(e) => {
                            // `without_url` keeps the probed URL out of the error text.
                            tracing::warn!(
                                "search: configured host {origin} UNREACHABLE at startup \
                                 ({}) — search calls will fail until it resolves",
                                e.without_url()
                            );
                        }
                    }
                }
                Err(e) => tracing::warn!("search: could not build startup probe client: {e}"),
            }
        });
    }

    // --- Effective request deadline report ----------------------------------------
    // Local names double as tracing field names below, so they are part of the output.
    let baseline_default_ms = state.config.request.deadline_ms_default;
    let ladder_min_ms = state.config.renderer.min_deadline_for_full_ladder_ms();
    let effective_default_ms = state.config.effective_deadline_ms(None, None);
    let baseline_outer_secs = state.config.server.request_timeout_secs;
    let effective_outer_secs = state.config.effective_request_timeout_secs();
    if state.config.request.auto_extend_deadline_for_ladder
        && effective_default_ms > baseline_default_ms
    {
        tracing::info!(
            deadline_ms_default = baseline_default_ms,
            ladder_min_ms,
            effective_default_ms,
            outer_timeout_secs_baseline = baseline_outer_secs,
            outer_timeout_secs_effective = effective_outer_secs,
            "request.auto_extend_deadline_for_ladder is on; default request \
             deadline auto-raised so the configured renderer ladder \
             (http+lightpanda+chrome+overhead) can run uncrushed. Set \
             request.auto_extend_deadline_for_ladder = false to enforce the \
             baseline cap."
        );
    }

    // Keep what the post-serve cleanup needs: `state` is moved into the router below.
    let renderer = std::sync::Arc::clone(&state.renderer);
    let pool_drain =
        std::time::Duration::from_secs(state.config.renderer.chrome_pool.shutdown_drain_secs);
    let app = crw_server::app::create_app(state);

    let listener = match tokio::net::TcpListener::bind(&addr).await {
        Ok(l) => l,
        Err(e) => {
            tracing::error!("Failed to bind to {addr}: {e}");
            std::process::exit(1);
        }
    };
    tracing::info!("CRW ready at http://{addr}");

    let server = axum::serve(listener, app).with_graceful_shutdown(shutdown_signal());
    let serve_result = server.await;
    if let Err(e) = &serve_result {
        tracing::error!("Server error: {e}");
    }

    // Drain the Chrome pool on BOTH outcomes: exiting straight from the error branch
    // would skip this cleanup entirely.
    renderer.shutdown_chrome_pool(pool_drain).await;

    if serve_result.is_err() {
        std::process::exit(1);
    }
    tracing::info!("Server shut down gracefully");
}
/// Completes when the process is asked to stop.
///
/// Waits for whichever comes first: Ctrl+C on any platform, or SIGTERM on Unix.
/// On non-Unix targets the SIGTERM side is a future that never completes.
///
/// # Panics
///
/// Panics if either signal handler cannot be installed.
async fn shutdown_signal() {
    #[cfg(unix)]
    let sigterm = async {
        let mut stream =
            tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())
                .expect("failed to install SIGTERM handler");
        stream.recv().await;
    };

    // No SIGTERM off Unix: park this branch forever so only Ctrl+C can win.
    #[cfg(not(unix))]
    let sigterm = std::future::pending::<()>();

    let interrupt = async {
        let installed = tokio::signal::ctrl_c().await;
        installed.expect("failed to install Ctrl+C handler");
    };

    tokio::select! {
        _ = interrupt => tracing::info!("Received Ctrl+C, shutting down..."),
        _ = sigterm => tracing::info!("Received SIGTERM, shutting down..."),
    }
}