#![allow(clippy::print_stderr)]
use std::io::Read;
use clap::Parser;
// Command-line arguments for the `decruft` binary.
//
// The `///` comments on the fields below are picked up by clap's derive macro
// and shown as per-argument help in `--help`. This type-level note is a plain
// `//` comment on purpose so it does not interfere with the `about` text.
#[derive(Parser, Debug)]
#[command(
    name = "decruft",
    version,
    about,
    after_help = "Advanced options:\n \
        --no-exact-selectors Disable exact CSS selector removal\n \
        --no-partial-selectors Disable partial class/id pattern removal\n \
        --no-hidden Disable hidden element removal\n \
        --no-scoring Disable content scoring removal\n \
        --no-patterns Disable content pattern removal\n \
        --no-standardize Disable content standardization"
)]
#[expect(clippy::struct_excessive_bools)]
struct Cli {
    /// Input to process: a file path, an http:// or https:// URL to fetch, or - for stdin
    #[arg(default_value = "-")]
    input: String,

    /// URL of the page, passed to the extractor when the input is a file or stdin
    #[arg(short, long)]
    url: Option<String>,

    /// Output format
    #[arg(short, long, default_value = "json")]
    format: OutputFormat,

    /// Selector used to pick the main content element
    #[arg(short = 's', long)]
    selector: Option<String>,

    /// Enable the extractor's debug mode
    #[arg(short, long)]
    debug: bool,

    /// Remove images from the extracted content
    #[arg(long)]
    no_images: bool,

    /// Do not include replies in the extracted content
    #[arg(long = "no-replies")]
    no_replies: bool,

    // The remaining switches are hidden from the regular option list; they are
    // described in the "Advanced options" section of `after_help` instead.
    /// Disable exact CSS selector removal
    #[arg(long, hide = true)]
    no_exact_selectors: bool,

    /// Disable partial class/id pattern removal
    #[arg(long, hide = true)]
    no_partial_selectors: bool,

    /// Disable hidden element removal
    #[arg(long, hide = true)]
    no_hidden: bool,

    /// Disable content scoring removal
    #[arg(long, hide = true)]
    no_scoring: bool,

    /// Disable content pattern removal
    #[arg(long, hide = true)]
    no_patterns: bool,

    /// Disable content standardization
    #[arg(long, hide = true)]
    no_standardize: bool,
}
// Output formats selectable through `--format`; `main` dispatches on this value.
//
// NOTE: plain `//` comments are used here deliberately. clap's `ValueEnum`
// derive turns `///` doc comments on variants into help text for the possible
// values, which would change the generated `--help` output.
#[derive(Debug, Clone, clap::ValueEnum)]
enum OutputFormat {
// Print the extracted content (`result.content`) as-is.
Html,
// Print the whole parse result, pretty-printed via `serde_json`.
Json,
// Print the extracted content passed through `decruft::strip_html_tags`, trimmed.
Text,
// Print `result.content_markdown`, falling back to `result.content` when it is absent.
Markdown,
}
/// Entry point: obtains the HTML (URL, stdin, or file), runs `decruft::parse`
/// with options derived from the command line, and prints the result in the
/// requested format.
///
/// Read and serialization failures are reported on stderr and terminate the
/// process with exit status 1.
fn main() {
    let cli = Cli::parse();

    let (html, effective_url) = load_input(&cli.input, cli.url.as_deref());
    let wants_markdown = matches!(cli.format, OutputFormat::Markdown);

    // Fields are assigned one by one on top of the library defaults.
    let mut options = decruft::DecruftOptions::default();

    // Inputs and diagnostics.
    options.url = effective_url;
    options.content_selector = cli.selector;
    options.debug = cli.debug;

    // Removal passes: each `--no-*` switch turns the matching pass off.
    options.remove_exact_selectors = !cli.no_exact_selectors;
    options.remove_partial_selectors = !cli.no_partial_selectors;
    options.remove_hidden_elements = !cli.no_hidden;
    options.remove_low_scoring = !cli.no_scoring;
    options.remove_content_patterns = !cli.no_patterns;
    options.standardize = !cli.no_standardize;

    // Content toggles.
    options.remove_images = cli.no_images;
    options.include_replies = !cli.no_replies;

    // Markdown is only requested when it is the chosen output format.
    options.markdown = wants_markdown;
    options.separate_markdown = wants_markdown;

    let result = decruft::parse(&html, &options);

    match cli.format {
        OutputFormat::Html => write_stdout(&result.content),
        OutputFormat::Text => {
            let stripped = decruft::strip_html_tags(&result.content);
            write_stdout(stripped.trim());
        }
        OutputFormat::Markdown => {
            // Fall back to the regular content when no markdown was produced.
            let rendered = result
                .content_markdown
                .as_deref()
                .unwrap_or(&result.content);
            write_stdout(rendered);
        }
        OutputFormat::Json => {
            let json = serde_json::to_string_pretty(&result).unwrap_or_else(|e| {
                eprintln!("Error serializing result: {e}");
                std::process::exit(1)
            });
            write_stdout(&json);
        }
    }
}

/// Resolves the positional `input` argument into `(html, page_url)`.
///
/// * An `http://` / `https://` input is fetched with `fetch_url` and also used
///   as the page URL; `page_url` is not consulted in that case.
/// * `-` reads all of stdin.
/// * Anything else is read as a file path.
///
/// For stdin and files the returned URL is an owned copy of `page_url`.
/// Read failures print a message to stderr and exit with status 1.
fn load_input(input: &str, page_url: Option<&str>) -> (String, Option<String>) {
    if input.starts_with("http://") || input.starts_with("https://") {
        return (fetch_url(input), Some(input.to_owned()));
    }

    let html = if input == "-" {
        let mut buf = String::new();
        if let Err(e) = std::io::stdin().read_to_string(&mut buf) {
            eprintln!("Error reading stdin: {e}");
            std::process::exit(1);
        }
        buf
    } else {
        std::fs::read_to_string(input).unwrap_or_else(|e| {
            eprintln!("Error reading {}: {e}", input);
            std::process::exit(1)
        })
    };

    (html, page_url.map(str::to_owned))
}
/// Writes `s` followed by a newline to standard output.
///
/// A broken pipe (for example when the reader of a pipeline has gone away) is
/// treated as a normal end of output and exits with status 0. Any other write
/// error is reported on stderr and exits with status 1.
fn write_stdout(s: &str) {
    use std::io::{ErrorKind, Write};

    let mut out = std::io::stdout().lock();
    let Err(err) = writeln!(out, "{s}") else {
        return;
    };

    if err.kind() == ErrorKind::BrokenPipe {
        std::process::exit(0);
    }
    eprintln!("Error writing output: {err}");
    std::process::exit(1);
}
/// Downloads `url` and returns the response body as a string.
///
/// The request uses a 30-second global timeout and a browser-like
/// `User-Agent` header. Only a response with status 200 is accepted; a request
/// error, any other status, or a failure while reading the body prints a
/// message to stderr and exits the process with status 1.
fn fetch_url(url: &str) -> String {
    const USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36";

    let agent = ureq::Agent::new_with_config(
        ureq::config::Config::builder()
            .timeout_global(Some(std::time::Duration::from_secs(30)))
            .build(),
    );

    let resp = match agent.get(url).header("User-Agent", USER_AGENT).call() {
        Ok(resp) => resp,
        Err(e) => {
            eprintln!("Error fetching {url}: {e}");
            std::process::exit(1);
        }
    };

    if resp.status() != 200 {
        eprintln!("Error fetching {url}: HTTP {}", resp.status());
        std::process::exit(1);
    }

    resp.into_body().read_to_string().unwrap_or_else(|e| {
        eprintln!("Error reading response for {url}: {e}");
        std::process::exit(1)
    })
}