use clap::Parser;
use fantoccini::wd::TimeoutConfiguration;
use mq_crawler::crawler::Crawler;
use url::Url;
// Output format selectable on the command line through `--format` / `-f`.
// `main` maps each variant onto the matching `mq_crawler::crawler::OutputFormat`.
// NOTE: plain `//` comments are used on purpose: clap's derive macros can turn
// `///` doc comments into help text, which would change the generated CLI.
#[derive(Clone, Debug, Default, clap::ValueEnum)]
enum OutputFormat {
// Default variant; rendered as "text" by the `Display` impl.
#[default]
Text,
// Rendered as "json" by the `Display` impl.
Json,
}
// Renders the format as the lowercase name used on the command line
// ("text" / "json"). clap uses this to print the `default_value_t` of `--format`.
impl std::fmt::Display for OutputFormat {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the label first, then emit it verbatim; width/padding flags are
        // intentionally not applied, matching a plain `write!` of a literal.
        let label = match self {
            OutputFormat::Text => "text",
            OutputFormat::Json => "json",
        };
        f.write_str(label)
    }
}
// Command-line arguments of the crawler binary, parsed by clap in `main`.
// NOTE: plain `//` comments are used on purpose: clap's derive macros can turn
// `///` doc comments into help/about text, which would change the generated CLI.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct CliArgs {
// `-d` / `--crawl-delay` (default 1.0). Forwarded unchanged to `Crawler::new`.
// Presumably a delay in seconds between requests; confirm against the crawler.
#[clap(short = 'd', long, default_value_t = 1.0)]
crawl_delay: f64,
// `-c` / `--concurrency` (default 1). Forwarded to `Crawler::new`; `main` also
// passes `max(concurrency, 5)` to the multi-domain reqwest client constructor.
#[clap(short = 'c', long, default_value_t = 1)]
concurrency: usize,
// `--depth`, optional. Forwarded to `Crawler::new`.
// Presumably the maximum link depth to follow; confirm against the crawler.
#[clap(long)]
depth: Option<usize>,
// `--implicit-timeout` in seconds (default 5.0). Only read when a WebDriver URL
// is given: it becomes the third argument of `TimeoutConfiguration::new`.
#[clap(long, default_value_t = 5.0)]
implicit_timeout: f64,
// `-q` / `--mq-query`, optional. Forwarded unchanged to `Crawler::new`.
#[clap(short = 'q', long)]
mq_query: Option<String>,
// `--page-load-timeout` (default 30.0). Interpreted as seconds for the WebDriver
// timeouts (second argument of `TimeoutConfiguration::new`) and also handed to
// the reqwest-based client constructors.
#[clap(long, default_value_t = 30.0)]
page_load_timeout: f64,
// `-o` / `--output`, optional. Forwarded unchanged to `Crawler::new`.
// Presumably an output location; confirm against the crawler.
#[clap(short, long)]
output: Option<String>,
// `--robots-path`, optional. Forwarded unchanged to `Crawler::new`.
#[clap(long)]
robots_path: Option<String>,
// `--script-timeout` in seconds (default 10.0). Only read when a WebDriver URL
// is given: it becomes the first argument of `TimeoutConfiguration::new`.
#[clap(long, default_value_t = 10.0)]
script_timeout: f64,
// Required positional argument: the URL the crawler is initialized with.
#[clap(required = true)]
url: Url,
// `-U` / `--webdriver-url`. When present, `main` connects to this WebDriver
// endpoint through fantoccini instead of using the other client kinds.
#[clap(short = 'U', long, value_name = "WEBDRIVER_URL")]
webdriver_url: Option<Url>,
// `--headless`. Selects the Chromium-backed client in `main`; cannot be combined
// with `--webdriver-url`.
#[clap(long, conflicts_with = "webdriver_url")]
headless: bool,
// `--chrome-path`, only accepted together with `--headless`. Passed as the first
// argument of `HttpClient::new_chromium`.
#[clap(long, value_name = "PATH", requires = "headless")]
chrome_path: Option<std::path::PathBuf>,
// `--headless-wait` in seconds (default 0.0), only accepted together with
// `--headless`. Converted to a `Duration` and passed to `HttpClient::new_chromium`.
#[clap(long, default_value_t = 0.0, requires = "headless")]
headless_wait: f64,
// `--allowed-domains`, a comma-separated list. `main` trims and lowercases each
// entry and appends the start URL's domain when it is missing from the list.
#[clap(long, value_delimiter = ',', value_name = "DOMAIN")]
allowed_domains: Option<Vec<String>>,
// `-f` / `--format` (default `text`). Mapped in `main` onto the crawler's own
// output-format enum.
#[clap(short = 'f', long, default_value_t = OutputFormat::Text)]
format: OutputFormat,
// Markdown conversion flags, flattened into this argument set.
#[clap(flatten)]
pub conversion: ConversionArgs,
}
// Boolean switches controlling HTML-to-Markdown conversion. `main` copies each
// field into the same-named field of `mq_markdown::ConversionOptions`.
// NOTE: plain `//` comments are used on purpose: clap's derive macros can turn
// `///` doc comments into help text, and these flags already set `help` explicitly.
#[derive(Debug, Clone, clap::Args)]
pub struct ConversionArgs {
// `--extract-scripts-as-code-blocks`; off unless the flag is given.
#[clap(
long,
help = "Extract <script> tags as code blocks in Markdown",
default_value_t = false
)]
pub extract_scripts_as_code_blocks: bool,
// `--generate-front-matter`; off unless the flag is given.
#[clap(
long,
help = "Generate YAML front matter from page metadata",
default_value_t = false
)]
pub generate_front_matter: bool,
// `--use-title-as-h1`; off unless the flag is given.
#[clap(
long,
help = "Use the HTML <title> as the first H1 in Markdown",
default_value_t = false
)]
pub use_title_as_h1: bool,
}
#[tokio::main]
async fn main() {
tracing_subscriber::fmt().with_writer(std::io::stderr).init();
let args = CliArgs::parse();
tracing::info!("Initializing crawler for URL: {}", args.url);
let effective_allowed = args.allowed_domains.map(|v| {
let mut v: Vec<String> = v.into_iter().map(|d| d.trim().to_lowercase()).collect();
if let Some(start_domain) = args.url.domain() {
let start_domain = start_domain.to_lowercase();
if !v.contains(&start_domain) {
v.push(start_domain);
}
}
v
});
let client = if let Some(url) = args.webdriver_url {
mq_crawler::http_client::HttpClient::Fantoccini({
let fantoccini_client = fantoccini::ClientBuilder::rustls()
.expect("Failed to create rustls client builder")
.connect(url.as_ref())
.await
.expect("Failed to connect to WebDriver");
fantoccini_client
.update_timeouts(TimeoutConfiguration::new(
Some(std::time::Duration::from_secs_f64(args.script_timeout)),
Some(std::time::Duration::from_secs_f64(args.page_load_timeout)),
Some(std::time::Duration::from_secs_f64(args.implicit_timeout)),
))
.await
.expect("Failed to set timeouts on Fantoccini client");
fantoccini_client
})
} else if args.headless {
let headless_wait_secs = if !args.headless_wait.is_finite() || args.headless_wait < 0.0 {
tracing::warn!(
"Invalid value for --headless-wait ({}). Falling back to 0 seconds.",
args.headless_wait
);
0.0
} else {
args.headless_wait
};
mq_crawler::http_client::HttpClient::new_chromium(
args.chrome_path,
std::time::Duration::from_secs_f64(headless_wait_secs),
)
.await
.expect("Failed to launch headless Chrome. Ensure Chrome or Chromium is installed.")
} else if effective_allowed.is_some() {
mq_crawler::http_client::HttpClient::new_reqwest_multi_domain(args.page_load_timeout, args.concurrency.max(5))
.unwrap()
} else {
mq_crawler::http_client::HttpClient::new_reqwest(args.page_load_timeout).unwrap()
};
let format = match args.format {
OutputFormat::Text => mq_crawler::crawler::OutputFormat::Text,
OutputFormat::Json => mq_crawler::crawler::OutputFormat::Json,
};
match Crawler::new(
client,
args.url.clone(),
args.crawl_delay,
args.robots_path.clone(),
args.mq_query.clone(),
args.output,
args.concurrency,
format,
mq_markdown::ConversionOptions {
extract_scripts_as_code_blocks: args.conversion.extract_scripts_as_code_blocks,
generate_front_matter: args.conversion.generate_front_matter,
use_title_as_h1: args.conversion.use_title_as_h1,
},
args.depth,
effective_allowed,
)
.await
{
Ok(mut crawler) => {
if let Err(e) = crawler.run().await {
tracing::error!("Crawler run failed: {}", e);
} else {
tracing::info!("Crawling complete.");
}
}
Err(e) => {
tracing::error!("Failed to initialize crawler: {}", e);
}
}
}