use axum::{
extract::Query,
http::StatusCode,
response::{IntoResponse, Response},
routing::get,
Router,
};
use clap::Parser;
use serde::Deserialize;
use std::net::SocketAddr;
use std::path::PathBuf;
use tokio::fs;
use tower_http::trace::TraceLayer;
use tracing::{error, info};
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
use url::Url;
use web_capture::{
capture_screenshot, convert_html_to_markdown, convert_html_to_markdown_enhanced,
convert_relative_urls, convert_to_utf8, fetch_html, html, render_html, EnhancedOptions,
};
// Command-line arguments for web-capture. Note: field comments below are
// plain `//` (not `///`) on purpose — clap turns doc comments into --help
// text, and we must not change the generated help output.
#[derive(Parser, Debug)]
#[command(
name = "web-capture",
about = "Capture web pages as HTML, Markdown, or PNG",
version
)]
#[allow(clippy::struct_excessive_bools)]
struct Args {
// Positional URL to capture (optional when --serve is given).
#[arg(index = 1)]
url: Option<String>,
// Run the HTTP server instead of a one-shot capture.
#[arg(short, long)]
serve: bool,
// Server port; also settable via the PORT environment variable.
#[arg(short, long, default_value = "3000", env = "PORT")]
port: u16,
// Output format: "html" (default), "markdown"/"md", or "png"/"image"/"screenshot".
#[arg(short, long, default_value = "html")]
format: String,
// Destination file; when absent, output goes to stdout (or a generated
// filename for screenshots).
#[arg(short, long)]
output: Option<PathBuf>,
// Use the enhanced Markdown conversion pipeline (EnhancedOptions below).
#[arg(long, default_value_t = false)]
enhanced: bool,
// The next four flags feed EnhancedOptions and are only consulted when
// --enhanced is set (see capture_url).
#[arg(long, default_value_t = true)]
extract_latex: bool,
#[arg(long, default_value_t = true)]
extract_metadata: bool,
#[arg(long, default_value_t = true)]
post_process: bool,
#[arg(long, default_value_t = true)]
detect_code_language: bool,
// NOTE(review): the flags below (dual_theme, config_file, all, dry_run,
// verbose, capture) are parsed but not referenced anywhere in this file —
// presumably consumed elsewhere or reserved; confirm before removing.
#[arg(long, default_value_t = false)]
dual_theme: bool,
#[arg(long)]
config_file: Option<PathBuf>,
#[arg(long, default_value_t = false)]
all: bool,
#[arg(long, default_value_t = false)]
dry_run: bool,
#[arg(long, default_value_t = false)]
verbose: bool,
// Token for Google Docs capture; also settable via API_TOKEN env var.
#[arg(long, env = "API_TOKEN")]
api_token: Option<String>,
#[arg(long, default_value = "browser")]
capture: String,
}
// Query parameters shared by most endpoints: `?url=<target>`.
#[derive(Debug, Deserialize)]
struct UrlQuery {
// Target page URL; passed through normalize_url before any use.
url: String,
}
// Query parameters for the /gdocs endpoint.
#[derive(Debug, Deserialize)]
struct GDocsQuery {
// Google Docs document URL.
url: String,
// Requested export format (markdown/md, html, txt, pdf, docx, epub,
// archive/zip); the handler defaults to "markdown" when absent.
format: Option<String>,
// Optional token from the `apiToken` query param; the handler also
// accepts Authorization: Bearer and x-api-token headers as fallbacks.
#[serde(rename = "apiToken")]
api_token: Option<String>,
}
/// Entry point: initialize tracing, parse CLI arguments, then either run
/// the HTTP server (--serve) or perform a one-shot capture of the URL.
/// Exits with status 1 when neither a URL nor --serve is provided.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Honor RUST_LOG when set; otherwise use a sensible default filter.
    let filter = tracing_subscriber::EnvFilter::try_from_default_env()
        .unwrap_or_else(|_| "web_capture=info,tower_http=info".into());
    tracing_subscriber::registry()
        .with(filter)
        .with(tracing_subscriber::fmt::layer())
        .init();

    let args = Args::parse();

    if args.serve {
        start_server(args.port).await?;
        return Ok(());
    }

    // Guard clause: without --serve, a URL is mandatory.
    let Some(ref url) = args.url else {
        eprintln!("Error: Missing URL or --serve flag");
        eprintln!("Run with --help for usage information");
        std::process::exit(1);
    };
    capture_url(url, &args.format, args.output.as_ref(), &args).await?;
    Ok(())
}
/// Build the router, bind to 0.0.0.0:<port>, print the endpoint list, and
/// serve until a Ctrl+C shutdown signal arrives.
async fn start_server(port: u16) -> anyhow::Result<()> {
    // Register every capture endpoint and attach per-request HTTP tracing.
    let router = Router::new()
        .route("/html", get(html_handler))
        .route("/markdown", get(markdown_handler))
        .route("/image", get(image_handler))
        .route("/fetch", get(fetch_handler))
        .route("/stream", get(stream_handler))
        .route("/animation", get(animation_handler))
        .route("/figures", get(figures_handler))
        .route("/themed-image", get(themed_image_handler))
        .route("/gdocs", get(gdocs_handler))
        .layer(TraceLayer::new_for_http());

    let bind_addr = SocketAddr::from(([0, 0, 0, 0], port));
    info!("web-capture server listening on http://{}", bind_addr);
    info!("");
    info!("Available endpoints:");
    info!(" GET /html?url=<URL> - Render page as HTML");
    info!(" GET /markdown?url=<URL> - Convert page to Markdown");
    info!(" GET /image?url=<URL> - Screenshot page as PNG");
    info!(" GET /fetch?url=<URL> - Proxy fetch content");
    info!(" GET /stream?url=<URL> - Stream content");
    info!(" GET /animation?url=<URL> - Capture animation frames");
    info!(" GET /figures?url=<URL> - Extract figure images");
    info!(" GET /themed-image?url=<URL> - Dual-theme screenshots");
    info!(" GET /gdocs?url=<URL>&format=markdown|html|txt - Google Docs capture");
    info!("");
    info!("Press Ctrl+C to stop the server");

    let tcp = tokio::net::TcpListener::bind(bind_addr).await?;
    axum::serve(tcp, router)
        .with_graceful_shutdown(shutdown_signal())
        .await?;
    Ok(())
}
/// Resolves once Ctrl+C is received; used by axum for graceful shutdown.
/// Panics only if the signal handler cannot be installed (a startup bug).
async fn shutdown_signal() {
    let ctrl_c = tokio::signal::ctrl_c();
    ctrl_c.await.expect("Failed to install Ctrl+C handler");
    info!("Shutdown signal received, closing server...");
}
/// GET /html?url=<URL> — return the page as UTF-8 HTML with relative URLs
/// rewritten to absolute ones. Falls back to a full render when the fetched
/// body is not HTML or appears to depend on JavaScript.
async fn html_handler(Query(params): Query<UrlQuery>) -> Response {
    // Fix: this reference was garbled as `¶ms.url` (mojibake for
    // `&params.url`), which did not compile.
    let url = match normalize_url(&params.url) {
        Ok(url) => url,
        Err(e) => return (StatusCode::BAD_REQUEST, e).into_response(),
    };
    let html_result = match fetch_html(&url).await {
        Ok(html) => html,
        Err(e) => {
            error!("Failed to fetch HTML: {}", e);
            return (StatusCode::INTERNAL_SERVER_ERROR, "Error fetching HTML").into_response();
        }
    };
    // Re-render in a browser when the raw fetch is insufficient.
    let needs_render = !html::is_html(&html_result) || html::has_javascript(&html_result);
    let final_html = if needs_render {
        match render_html(&url).await {
            Ok(rendered) => {
                let utf8_html = convert_to_utf8(&rendered);
                convert_relative_urls(&utf8_html, &url)
            }
            Err(e) => {
                error!("Failed to render HTML: {}", e);
                return (StatusCode::INTERNAL_SERVER_ERROR, "Error rendering HTML").into_response();
            }
        }
    } else {
        let utf8_html = convert_to_utf8(&html_result);
        convert_relative_urls(&utf8_html, &url)
    };
    (
        StatusCode::OK,
        [("Content-Type", "text/html; charset=utf-8")],
        final_html,
    )
        .into_response()
}
/// GET /markdown?url=<URL> — fetch the page and convert it to Markdown.
async fn markdown_handler(Query(params): Query<UrlQuery>) -> Response {
    // Fix: was garbled as `¶ms.url` (mojibake for `&params.url`).
    let url = match normalize_url(&params.url) {
        Ok(url) => url,
        Err(e) => return (StatusCode::BAD_REQUEST, e).into_response(),
    };
    let html = match fetch_html(&url).await {
        Ok(html) => html,
        Err(e) => {
            error!("Failed to fetch HTML: {}", e);
            return (StatusCode::INTERNAL_SERVER_ERROR, "Error fetching HTML").into_response();
        }
    };
    // The URL is passed so the converter can resolve relative links.
    let markdown = match convert_html_to_markdown(&html, Some(&url)) {
        Ok(md) => md,
        Err(e) => {
            error!("Failed to convert to Markdown: {}", e);
            return (
                StatusCode::INTERNAL_SERVER_ERROR,
                "Error converting to Markdown",
            )
                .into_response();
        }
    };
    (
        StatusCode::OK,
        [("Content-Type", "text/markdown")],
        markdown,
    )
        .into_response()
}
/// GET /image?url=<URL> — capture a full-page screenshot and return it as
/// an inline PNG download named "screenshot.png".
async fn image_handler(Query(params): Query<UrlQuery>) -> Response {
    // Fix: was garbled as `¶ms.url` (mojibake for `&params.url`).
    let url = match normalize_url(&params.url) {
        Ok(url) => url,
        Err(e) => return (StatusCode::BAD_REQUEST, e).into_response(),
    };
    let screenshot = match capture_screenshot(&url).await {
        Ok(data) => data,
        Err(e) => {
            error!("Failed to capture screenshot: {}", e);
            return (
                StatusCode::INTERNAL_SERVER_ERROR,
                "Error capturing screenshot",
            )
                .into_response();
        }
    };
    (
        StatusCode::OK,
        [
            ("Content-Type", "image/png"),
            ("Content-Disposition", "inline; filename=\"screenshot.png\""),
        ],
        screenshot,
    )
        .into_response()
}
/// GET /fetch?url=<URL> — transparent proxy: fetch the target and relay
/// its status code, Content-Type, and raw body to the client.
async fn fetch_handler(Query(params): Query<UrlQuery>) -> Response {
    // Fix: was garbled as `¶ms.url` (mojibake for `&params.url`).
    let url = match normalize_url(&params.url) {
        Ok(url) => url,
        Err(e) => return (StatusCode::BAD_REQUEST, e).into_response(),
    };
    let client = match reqwest::Client::builder().build() {
        Ok(client) => client,
        Err(e) => {
            error!("Failed to create HTTP client: {}", e);
            return (StatusCode::INTERNAL_SERVER_ERROR, "Error creating client").into_response();
        }
    };
    let response = match client.get(&url).send().await {
        Ok(resp) => resp,
        Err(e) => {
            error!("Failed to fetch content: {}", e);
            return (StatusCode::INTERNAL_SERVER_ERROR, "Error fetching content").into_response();
        }
    };
    // Mirror the upstream status; the unwrap_or is a defensive fallback —
    // reqwest already guarantees a valid status code.
    let status = StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::OK);
    let content_type = response
        .headers()
        .get("content-type")
        .and_then(|v| v.to_str().ok())
        .unwrap_or("text/plain")
        .to_string();
    let body = match response.bytes().await {
        Ok(bytes) => bytes.to_vec(),
        Err(e) => {
            error!("Failed to read response body: {}", e);
            return (StatusCode::INTERNAL_SERVER_ERROR, "Error reading content").into_response();
        }
    };
    (status, [("Content-Type", content_type.as_str())], body).into_response()
}
/// GET /stream?url=<URL> — currently an alias for the /fetch proxy; no
/// separate streaming implementation exists in this file.
async fn stream_handler(request: Query<UrlQuery>) -> Response {
    fetch_handler(request).await
}
/// GET /animation?url=<URL> — capture animation frames with default options
/// and return a JSON summary (frame counts, loop detection, duration).
async fn animation_handler(Query(params): Query<UrlQuery>) -> Response {
    // Fix: was garbled as `¶ms.url` (mojibake for `&params.url`).
    let url = match normalize_url(&params.url) {
        Ok(url) => url,
        Err(e) => return (StatusCode::BAD_REQUEST, e).into_response(),
    };
    let options = web_capture::animation::AnimationOptions::default();
    match web_capture::animation::capture_animation_frames(&url, &options).await {
        Ok(result) => {
            let json = serde_json::json!({
                "frameCount": result.frames.len(),
                "loopDetected": result.loop_detected,
                "loopFrame": result.loop_frame,
                "duration": result.duration,
                "totalFrames": result.total_frames,
            });
            axum::Json(json).into_response()
        }
        Err(e) => {
            error!("Animation capture error: {}", e);
            (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response()
        }
    }
}
/// GET /figures?url=<URL> — extract figure images from the page, attempt to
/// download each, and return a JSON report of what was found/downloaded.
async fn figures_handler(Query(params): Query<UrlQuery>) -> Response {
    // Fix: was garbled as `¶ms.url` (mojibake for `&params.url`).
    let url = match normalize_url(&params.url) {
        Ok(url) => url,
        Err(e) => return (StatusCode::BAD_REQUEST, e).into_response(),
    };
    let html_content = match fetch_html(&url).await {
        Ok(html) => html,
        Err(e) => {
            error!("Failed to fetch HTML for figures: {}", e);
            return (StatusCode::INTERNAL_SERVER_ERROR, "Error fetching HTML").into_response();
        }
    };
    let figures = web_capture::figures::extract_figures(&html_content, &url);
    let downloaded = web_capture::figures::download_figures(&figures).await;
    // A figure counts as downloaded when its buffer is populated.
    let json = serde_json::json!({
        "url": url,
        "totalFound": figures.len(),
        "totalDownloaded": downloaded.iter().filter(|d| d.buffer.is_some()).count(),
        "figures": downloaded.iter().map(|f| serde_json::json!({
            "figureNum": f.figure_num,
            "filename": f.filename,
            "caption": f.caption,
            "originalUrl": f.original_url,
            "downloaded": f.buffer.is_some(),
            "error": f.error,
        })).collect::<Vec<_>>(),
    });
    axum::Json(json).into_response()
}
/// GET /themed-image?url=<URL> — capture light/dark screenshots with default
/// options and return a JSON summary of dimensions and sizes.
async fn themed_image_handler(Query(params): Query<UrlQuery>) -> Response {
    // Fix: was garbled as `¶ms.url` (mojibake for `&params.url`).
    let url = match normalize_url(&params.url) {
        Ok(url) => url,
        Err(e) => return (StatusCode::BAD_REQUEST, e).into_response(),
    };
    let options = web_capture::themed_image::ThemedImageOptions::default();
    match web_capture::themed_image::capture_dual_theme_screenshots(&url, &options).await {
        Ok(result) => {
            let json = serde_json::json!({
                "url": result.url,
                "width": result.width,
                "height": result.height,
                "lightSize": result.light_size,
                "darkSize": result.dark_size,
            });
            axum::Json(json).into_response()
        }
        Err(e) => {
            error!("Themed image capture error: {}", e);
            (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response()
        }
    }
}
/// GET /gdocs?url=<URL>&format=... — export a Google Docs document.
///
/// The API token is resolved in priority order: `apiToken` query parameter,
/// `Authorization: Bearer ...` header, then `x-api-token` header.
/// Supported formats: archive/zip, markdown/md (default), html, txt, pdf,
/// docx, epub; anything else is a 400.
async fn gdocs_handler(
    Query(params): Query<GDocsQuery>,
    headers: axum::http::HeaderMap,
) -> Response {
    // Fix: `params` references were garbled as `¶ms` (mojibake for
    // `&params`) throughout this handler.
    if !web_capture::gdocs::is_google_docs_url(&params.url) {
        return (
            StatusCode::BAD_REQUEST,
            "URL is not a Google Docs document URL",
        )
            .into_response();
    }
    let api_token = params
        .api_token
        .as_deref()
        .or_else(|| {
            headers
                .get("authorization")
                .and_then(|v| v.to_str().ok())
                .and_then(web_capture::gdocs::extract_bearer_token)
        })
        .or_else(|| headers.get("x-api-token").and_then(|v| v.to_str().ok()));
    let format = params.format.as_deref().unwrap_or("markdown");
    match format {
        "archive" | "zip" => {
            match web_capture::gdocs::fetch_google_doc_as_archive(&params.url, api_token).await {
                Ok(archive) => match web_capture::gdocs::create_archive_zip(&archive) {
                    Ok(zip_data) => {
                        let filename = format!("gdoc-{}.zip", archive.document_id);
                        (
                            StatusCode::OK,
                            [
                                ("Content-Type", "application/zip".to_string()),
                                (
                                    "Content-Disposition",
                                    // Fix: `filename` was built but unused —
                                    // the header embedded the literal text
                                    // "(unknown)" instead of the document id.
                                    format!("attachment; filename=\"{filename}\""),
                                ),
                            ],
                            zip_data,
                        )
                            .into_response()
                    }
                    Err(e) => {
                        error!("Google Docs archive error: {}", e);
                        (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response()
                    }
                },
                Err(e) => {
                    error!("Google Docs capture error: {}", e);
                    (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response()
                }
            }
        }
        "markdown" | "md" => {
            match web_capture::gdocs::fetch_google_doc_as_markdown(&params.url, api_token).await {
                Ok(result) => (
                    StatusCode::OK,
                    [("Content-Type", "text/markdown")],
                    result.content,
                )
                    .into_response(),
                Err(e) => {
                    error!("Google Docs capture error: {}", e);
                    (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response()
                }
            }
        }
        "html" | "txt" | "pdf" | "docx" | "epub" => {
            match web_capture::gdocs::fetch_google_doc(&params.url, format, api_token).await {
                Ok(result) => {
                    let content_type = match format {
                        "txt" => "text/plain",
                        "pdf" => "application/pdf",
                        "docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                        "epub" => "application/epub+zip",
                        _ => "text/html",
                    };
                    (
                        StatusCode::OK,
                        [("Content-Type", content_type)],
                        result.content,
                    )
                        .into_response()
                }
                Err(e) => {
                    error!("Google Docs capture error: {}", e);
                    (StatusCode::INTERNAL_SERVER_ERROR, e.to_string()).into_response()
                }
            }
        }
        _ => (
            StatusCode::BAD_REQUEST,
            format!("Unsupported format: {format}"),
        )
            .into_response(),
    }
}
/// One-shot CLI capture of `url` in the requested `format`.
///
/// Google Docs URLs are routed through the gdocs export path; everything
/// else is fetched directly (with a browser render fallback for HTML).
/// Output goes to `output` when given, otherwise stdout — except for
/// screenshots, which fall back to a generated `<host>_<millis>.png` file.
///
/// # Errors
/// Propagates normalization, fetch/render, conversion, and I/O failures.
#[allow(clippy::too_many_lines)]
async fn capture_url(
    url: &str,
    format: &str,
    output: Option<&PathBuf>,
    args: &Args,
) -> anyhow::Result<()> {
    let absolute_url = normalize_url(url).map_err(|e| anyhow::anyhow!(e))?;
    // Normalize the requested format once; the original recomputed
    // `format.to_lowercase()` in three separate places.
    let format_lower = format.to_lowercase();
    if web_capture::gdocs::is_google_docs_url(&absolute_url) {
        let api_token = args.api_token.as_deref();
        match format_lower.as_str() {
            "markdown" | "md" => {
                let result =
                    web_capture::gdocs::fetch_google_doc_as_markdown(&absolute_url, api_token)
                        .await?;
                if let Some(path) = output {
                    fs::write(path, &result.content).await?;
                    eprintln!("Google Doc Markdown saved to: {}", path.display());
                } else {
                    print!("{}", result.content);
                }
            }
            _ => {
                // Screenshot formats are not supported for Google Docs;
                // fall back to the HTML export.
                let gdocs_format = match format_lower.as_str() {
                    "png" | "image" | "screenshot" => "html",
                    other => other,
                };
                let result =
                    web_capture::gdocs::fetch_google_doc(&absolute_url, gdocs_format, api_token)
                        .await?;
                if let Some(path) = output {
                    fs::write(path, &result.content).await?;
                    eprintln!("Google Doc ({}) saved to: {}", gdocs_format, path.display());
                } else {
                    print!("{}", result.content);
                }
            }
        }
        return Ok(());
    }
    match format_lower.as_str() {
        "markdown" | "md" => {
            let html = fetch_html(&absolute_url).await?;
            let markdown = if args.enhanced {
                let options = EnhancedOptions {
                    extract_latex: args.extract_latex,
                    extract_metadata: args.extract_metadata,
                    post_process: args.post_process,
                    detect_code_language: args.detect_code_language,
                };
                let result =
                    convert_html_to_markdown_enhanced(&html, Some(&absolute_url), &options)?;
                result.markdown
            } else {
                convert_html_to_markdown(&html, Some(&absolute_url))?
            };
            if let Some(path) = output {
                fs::write(path, &markdown).await?;
                eprintln!("Markdown saved to: {}", path.display());
            } else {
                print!("{markdown}");
            }
        }
        "image" | "png" | "screenshot" => {
            let screenshot = capture_screenshot(&absolute_url).await?;
            if let Some(path) = output {
                fs::write(path, &screenshot).await?;
                eprintln!("Screenshot saved to: {}", path.display());
            } else {
                // No output path: derive a filename from the hostname and a
                // millisecond timestamp so repeated runs don't clobber files.
                let parsed_url = Url::parse(&absolute_url)?;
                let hostname = parsed_url.host_str().unwrap_or("unknown");
                let default_filename = format!(
                    "{}_{}.png",
                    hostname.replace('.', "_"),
                    std::time::SystemTime::now()
                        .duration_since(std::time::UNIX_EPOCH)?
                        .as_millis()
                );
                fs::write(&default_filename, &screenshot).await?;
                eprintln!("Screenshot saved to: {default_filename}");
            }
        }
        _ => {
            // Default: HTML output, with a browser render when the raw
            // fetch is not HTML or needs JavaScript.
            let html_content = fetch_html(&absolute_url).await?;
            let needs_render = !html::is_html(&html_content) || html::has_javascript(&html_content);
            let result = if needs_render {
                let rendered = render_html(&absolute_url).await?;
                let utf8_html = convert_to_utf8(&rendered);
                convert_relative_urls(&utf8_html, &absolute_url)
            } else {
                let utf8_html = convert_to_utf8(&html_content);
                convert_relative_urls(&utf8_html, &absolute_url)
            };
            if let Some(path) = output {
                fs::write(path, &result).await?;
                eprintln!("HTML saved to: {}", path.display());
            } else {
                print!("{result}");
            }
        }
    }
    Ok(())
}
// Thin local alias for the library's URL normalizer. The Err string is
// returned directly as a 400 Bad Request body by the HTTP handlers above.
fn normalize_url(url: &str) -> Result<String, String> {
web_capture::html::normalize_url(url)
}