1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
use clap::Parser;
use fantoccini::wd::TimeoutConfiguration;
use mq_crawler::crawler::Crawler;
use url::Url;
#[derive(Clone, Debug, Default, clap::ValueEnum)]
enum OutputFormat {
#[default]
Text,
Json,
}
impl std::fmt::Display for OutputFormat {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
OutputFormat::Text => write!(f, "text"),
OutputFormat::Json => write!(f, "json"),
}
}
}
/// A simple web crawler that fetches HTML, converts it to Markdown,
/// and optionally processes it with an mq_lang script.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct CliArgs {
/// Delay (in seconds) between crawl requests to avoid overloading servers.
#[clap(short = 'd', long, default_value_t = 1.0)]
crawl_delay: f64,
/// Number of concurrent workers for parallel processing.
#[clap(short = 'c', long, default_value_t = 1)]
concurrency: usize,
/// Maximum crawl depth. 0 means only the specified URL, 1 means specified URL and its direct links, etc.
/// If not specified, crawling depth is unlimited.
#[clap(long)]
depth: Option<usize>,
/// Timeout (in seconds) for implicit waits (element finding).
#[clap(long, default_value_t = 5.0)]
implicit_timeout: f64,
/// Optional mq_lang query to process the crawled Markdown content.
#[clap(short = 'q', long)]
mq_query: Option<String>,
/// Timeout (in seconds) for loading a single page.
#[clap(long, default_value_t = 30.0)]
page_load_timeout: f64,
/// Optional path to an output DIRECTORY where markdown files will be saved.
/// If not provided, output is printed to stdout.
#[clap(short, long)]
output: Option<String>,
/// Optional path to a custom robots.txt file. If not provided, robots.txt will be fetched from the site.
#[clap(long)]
robots_path: Option<String>,
/// Timeout (in seconds) for executing scripts on the page.
#[clap(long, default_value_t = 10.0)]
script_timeout: f64,
/// The initial URL to start crawling from.
#[clap(required = true)]
url: Url,
/// Optional WebDriver URL for browser-based crawling (e.g., http://localhost:4444).
/// When specified, uses a headless browser to render JavaScript before extracting content.
#[clap(short = 'U', long, value_name = "WEBDRIVER_URL")]
webdriver_url: Option<Url>,
/// Use a built-in headless Chrome to render JavaScript without an external WebDriver server.
/// Requires Chrome or Chromium to be installed on the system.
/// Cannot be used together with --webdriver-url.
#[clap(long, conflicts_with = "webdriver_url")]
headless: bool,
/// Path to the Chrome/Chromium executable for headless crawling.
/// If not specified, Chrome is auto-detected from standard installation paths.
/// Only used when --headless is set.
#[clap(long, value_name = "PATH", requires = "headless")]
chrome_path: Option<std::path::PathBuf>,
/// Wait time (in seconds) after page load in headless mode.
/// When --headless-network-idle or --headless-wait-for-selector is used,
/// this value also acts as the maximum timeout for those strategies (default 30 s).
/// Only used when --headless is set.
#[clap(long, default_value_t = 0.0, requires = "headless")]
headless_wait: f64,
/// Wait for the browser's networkIdle CDP lifecycle event after page load.
/// Effective for SPAs that issue XHR/fetch requests after the load event.
/// The wait is bounded by --headless-wait (or 30 s if not set).
/// Only used when --headless is set.
#[clap(long, default_value_t = false, requires = "headless")]
headless_network_idle: bool,
/// Wait until the given CSS selector is present in the DOM after page load.
/// Useful when the page's content is injected by JavaScript.
/// Example: --headless-wait-for-selector "main"
/// The wait is bounded by --headless-wait (or 30 s if not set).
/// Only used when --headless is set.
#[clap(long, value_name = "SELECTOR", requires = "headless")]
headless_wait_for_selector: Option<String>,
/// Comma-separated list of domains to crawl in addition to the start URL's domain.
/// If not specified, only the start URL's domain is crawled.
/// If specified, the start URL's domain is always included automatically.
/// Example: --allowed-domains example.com,docs.example.com
#[clap(long, value_delimiter = ',', value_name = "DOMAIN")]
allowed_domains: Option<Vec<String>>,
/// Output format for results and statistics
#[clap(short = 'f', long, default_value_t = OutputFormat::Text)]
format: OutputFormat,
#[clap(flatten)]
pub conversion: ConversionArgs,
}
/// Options for Markdown conversion.
#[derive(Debug, Clone, clap::Args)]
pub struct ConversionArgs {
/// Extract <script> tags as code blocks in Markdown
#[clap(
long,
help = "Extract <script> tags as code blocks in Markdown",
default_value_t = false
)]
pub extract_scripts_as_code_blocks: bool,
/// Generate YAML front matter from page metadata
#[clap(
long,
help = "Generate YAML front matter from page metadata",
default_value_t = false
)]
pub generate_front_matter: bool,
/// Use the HTML <title> as the first H1 in Markdown
#[clap(
long,
help = "Use the HTML <title> as the first H1 in Markdown",
default_value_t = false
)]
pub use_title_as_h1: bool,
}
#[tokio::main]
async fn main() {
tracing_subscriber::fmt().with_writer(std::io::stderr).init();
let args = CliArgs::parse();
tracing::info!("Initializing crawler for URL: {}", args.url);
// Build the effective allowed domains list.
// When --allowed-domains is provided, always include the start URL's domain as well.
let effective_allowed = args.allowed_domains.map(|v| {
let mut v: Vec<String> = v.into_iter().map(|d| d.trim().to_lowercase()).collect();
if let Some(start_domain) = args.url.domain() {
let start_domain = start_domain.to_lowercase();
if !v.contains(&start_domain) {
v.push(start_domain);
}
}
v
});
let client = if let Some(url) = args.webdriver_url {
mq_crawler::http_client::HttpClient::Fantoccini({
let fantoccini_client = fantoccini::ClientBuilder::rustls()
.expect("Failed to create rustls client builder")
.connect(url.as_ref())
.await
.expect("Failed to connect to WebDriver");
fantoccini_client
.update_timeouts(TimeoutConfiguration::new(
Some(std::time::Duration::from_secs_f64(args.script_timeout)),
Some(std::time::Duration::from_secs_f64(args.page_load_timeout)),
Some(std::time::Duration::from_secs_f64(args.implicit_timeout)),
))
.await
.expect("Failed to set timeouts on Fantoccini client");
fantoccini_client
})
} else if args.headless {
let headless_wait_secs = if !args.headless_wait.is_finite() || args.headless_wait < 0.0 {
tracing::warn!(
"Invalid value for --headless-wait ({}). Falling back to 0 seconds.",
args.headless_wait
);
0.0
} else {
args.headless_wait
};
// strategy_timeout: use --headless-wait if > 0, otherwise 30 s.
let strategy_timeout = if headless_wait_secs > 0.0 {
std::time::Duration::from_secs_f64(headless_wait_secs)
} else {
std::time::Duration::from_secs(30)
};
// fixed_delay: only apply when no other strategy is active.
let fixed_delay = if args.headless_network_idle || args.headless_wait_for_selector.is_some() {
std::time::Duration::ZERO
} else {
std::time::Duration::from_secs_f64(headless_wait_secs)
};
let wait_config = mq_crawler::http_client::ChromiumWaitConfig {
fixed_delay,
wait_for_selector: args.headless_wait_for_selector.clone(),
network_idle: args.headless_network_idle,
strategy_timeout,
};
mq_crawler::http_client::HttpClient::new_chromium(args.chrome_path, wait_config)
.await
.expect("Failed to launch headless Chrome. Ensure Chrome or Chromium is installed.")
} else if effective_allowed.is_some() {
mq_crawler::http_client::HttpClient::new_reqwest_multi_domain(args.page_load_timeout, args.concurrency.max(5))
.unwrap()
} else {
mq_crawler::http_client::HttpClient::new_reqwest(args.page_load_timeout).unwrap()
};
let format = match args.format {
OutputFormat::Text => mq_crawler::crawler::OutputFormat::Text,
OutputFormat::Json => mq_crawler::crawler::OutputFormat::Json,
};
match Crawler::new(
client,
args.url.clone(),
args.crawl_delay,
args.robots_path.clone(),
args.mq_query.clone(),
args.output,
args.concurrency,
format,
mq_markdown::ConversionOptions {
extract_scripts_as_code_blocks: args.conversion.extract_scripts_as_code_blocks,
generate_front_matter: args.conversion.generate_front_matter,
use_title_as_h1: args.conversion.use_title_as_h1,
},
args.depth,
effective_allowed,
)
.await
{
Ok(mut crawler) => {
if let Err(e) = crawler.run().await {
// robots_path no longer passed here
tracing::error!("Crawler run failed: {}", e);
} else {
tracing::info!("Crawling complete.");
}
}
Err(e) => {
tracing::error!("Failed to initialize crawler: {}", e);
}
}
}