use clap::{Parser, Subcommand};
use std::io::{self, Read};
use std::sync::Arc;
use crate::{runtime, templates, tools, types};
// Top-level command-line arguments for the `qrawl` binary, parsed via clap's derive API.
// NOTE: plain `//` comments are used deliberately; clap's derive turns `///` doc
// comments on these items into help text, which would change `--help` output.
#[derive(Parser)]
#[command(
name = "qrawl",
version,
about = "Rust toolkit to crawl web data for AI agents"
)]
struct Cli {
// `--fast` switch, declared `global` so it is accepted alongside any subcommand.
// `run` uses it to pick `types::Context::fast()` instead of `types::Context::auto()`.
#[arg(long, global = true)]
fast: bool,
// The subcommand selected by the user; dispatched in `run`.
#[command(subcommand)]
command: Commands,
}
// Subcommands of the CLI. Every variant carries a single `url` string and is
// dispatched in `run`. Plain `//` comments are used so clap's generated help
// text stays unchanged.
#[derive(Subcommand)]
enum Commands {
// Handled inline in `run`: requires an `https://` URL, calls
// `tools::fetch::fetch_auto_with_result`, and reports profile, attempts and
// duration on stderr.
Fetch {
url: String,
},
// Runs `templates::qrawl_children` for the URL and prints the result
// (URLs only, the paired HTML is dropped) through `print_json`.
Children {
url: String,
},
// Dispatched through `run!` with `tools::map::map_page`.
Page {
url: String,
},
// Dispatched through `run!` with `tools::scrape::scrape_body`.
Body {
url: String,
},
// Dispatched through `run!` with `tools::scrape::scrape_jsonld`.
Jsonld {
url: String,
},
// Dispatched through `run!` with `tools::scrape::scrape_metadata`.
Metadata {
url: String,
},
// Dispatched through `run!` with `scrape_metadata` followed by
// `tools::extract::extract_og_preview`.
Preview {
url: String,
},
// Dispatched through `run!` with `scrape_jsonld` followed by
// `tools::extract::extract_schema_types`.
Schemas {
url: String,
},
// Dispatched through `run!` with `templates::qrawl_emails`.
Emails {
url: String,
},
// Dispatched through `run!` with `tools::extract::extract_phones` followed by
// `tools::clean::clean_phones`.
Phones {
url: String,
},
}
/// Resolves a CLI `input` argument to its textual content.
///
/// * `"-"` reads all of standard input.
/// * A value starting with `http://` or `https://` is fetched with [`fetch_url`]
///   using `ctx`.
/// * Anything else is treated as a file path and read from disk.
///
/// Every failure path reports the error on stderr and terminates the process
/// with exit status 1, so this function only returns on success.
pub fn read_input(input: &str, ctx: Arc<types::Context>) -> String {
    if input == "-" {
        let mut buffer = String::new();
        // Report stdin failures the same way as file and fetch failures
        // (message + exit code 1) instead of panicking.
        if let Err(e) = io::stdin().read_to_string(&mut buffer) {
            eprintln!("Error reading from stdin: {}", e);
            std::process::exit(1);
        }
        buffer
    } else if input.starts_with("http://") || input.starts_with("https://") {
        fetch_url(input, ctx)
    } else {
        std::fs::read_to_string(input).unwrap_or_else(|e| {
            eprintln!("Error reading file '{}': {}", input, e);
            std::process::exit(1);
        })
    }
}
/// Fetches `url` and returns the fetched content as a `String`.
///
/// The fetch (`tools::fetch::fetch_strategy`) is executed inside
/// `types::CTX.scope(ctx, ..)` and driven to completion with
/// `runtime::block_on`. On error the failure is printed to stderr and the
/// process exits with status 1.
pub fn fetch_url(url: &str, ctx: Arc<types::Context>) -> String {
    let outcome = runtime::block_on(async move {
        let fetch = async { tools::fetch::fetch_strategy(url).await };
        types::CTX.scope(ctx, fetch).await
    });

    match outcome {
        Ok(content) => content,
        Err(e) => {
            eprintln!("Failed to fetch {}: {}", url, e);
            std::process::exit(1)
        }
    }
}
/// Writes `value` to stdout as pretty-printed JSON.
///
/// If serialization fails, the error is reported on stderr and nothing is
/// written to stdout; the function returns normally in both cases.
pub fn print_json<T: serde::Serialize>(value: &T) {
    let json = match serde_json::to_string_pretty(value) {
        Ok(json) => json,
        Err(e) => {
            eprintln!("Error serializing to JSON: {}", e);
            return;
        }
    };
    println!("{}", json);
}
/// CLI entry point: parses the command line and executes the chosen subcommand.
///
/// A shared `types::Context` is built first (`fast()` when `--fast` is given,
/// `auto()` otherwise). `Fetch` and `Children` are handled inline; the
/// remaining subcommands are delegated to the `run!` macro.
pub fn run() {
    let cli = Cli::parse();

    let context = if cli.fast {
        types::Context::fast()
    } else {
        types::Context::auto()
    };
    let ctx_arc = Arc::new(context);

    match cli.command {
        Commands::Fetch { url } => {
            // This subcommand only accepts https URLs.
            if !url.starts_with("https://") {
                eprintln!("Error: URL must start with https://");
                std::process::exit(1);
            }
            eprintln!("Fetching {}...", url);

            let fetched = runtime::block_on(tools::fetch::fetch_auto_with_result(&url))
                .unwrap_or_else(|e| {
                    eprintln!("Error: {}", e);
                    std::process::exit(1);
                });
            // The summary goes to stderr; nothing is written to stdout here.
            eprintln!(
                "✓ Success\n Profile: {:?}\n Attempts: {}\n Duration: {}ms",
                fetched.profile_used, fetched.attempts, fetched.duration_ms
            );
        }
        Commands::Children { url } => {
            let crawl = templates::qrawl_children(vec![url], (*ctx_arc).clone());
            let crawled = runtime::block_on(crawl);
            // Keep only the URL of each (url, html) pair before printing.
            let urls = crawled.map(|pairs| {
                pairs
                    .into_iter()
                    .map(|(child_url, _)| child_url)
                    .collect::<Vec<String>>()
            });
            print_json(&urls);
        }
        // The `run!` invocations below are kept token-for-token (including
        // whether they sit in a block or in expression position).
        Commands::Page { url } => {
            run!(@async ctx_arc.clone(), url.clone(), tools::map::map_page, &url)
        }
        Commands::Body { url } => {
            run!(@async ctx_arc.clone(), url, tools::scrape::scrape_body)
        }
        Commands::Jsonld { url } => {
            run!(@async ctx_arc.clone(), url, tools::scrape::scrape_jsonld)
        }
        Commands::Metadata { url } => {
            run!(@async ctx_arc.clone(), url, tools::scrape::scrape_metadata)
        }
        Commands::Preview { url } => run!(
            @async ctx_arc.clone(), url,
            [
                tools::scrape::scrape_metadata,
                tools::extract::extract_og_preview
            ]
        ),
        Commands::Schemas { url } => run!(
            @async ctx_arc.clone(), url,
            [
                tools::scrape::scrape_jsonld,
                tools::extract::extract_schema_types
            ]
        ),
        Commands::Emails { url } => {
            run!(@template url, templates::qrawl_emails, (*ctx_arc).clone())
        }
        Commands::Phones { url } => run!(
            @async ctx_arc.clone(), url,
            [tools::extract::extract_phones, tools::clean::clean_phones]
        ),
    }
}