use std::io::{self, Read, Write};
use clap::{Parser, ValueEnum};
use serde::Serialize;
use trafilatura::{
create_readable_document, extract, ExtractResult, ExtractionFocus, Metadata, Options,
};
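
/// Default User-Agent header sent with outgoing HTTP requests.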
const DEFAULT_USER_AGENT: &str =
"Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0";
#[derive(Parser)]
#[command(
name = "trafilatura",
about = "Extract readable content from a HTML file or URL",
long_about = "Extract readable content, comments and metadata from a specified source \
which can be either a HTML file or URL. Reads from stdin when no source \
is given."
)]
struct Cli {
    /// HTML file or URL to extract from; stdin is used when omitted
    source: Option<String>,
    /// Output format
    #[arg(short, long, value_enum, default_value = "html")]
    format: OutputFormat,
    /// Target language for extraction
    #[arg(short, long)]
    language: Option<String>,
    /// Disable fallback extraction
    #[arg(long)]
    no_fallback: bool,
    /// Do not extract comments
    #[arg(long)]
    no_comments: bool,
    /// Do not extract tables
    #[arg(long)]
    no_tables: bool,
    /// Include images in the output
    #[arg(long)]
    images: bool,
    /// Include links in the output
    #[arg(long)]
    links: bool,
    /// Favor extraction precision over recall
    #[arg(long)]
    precision: bool,
    /// Favor extraction recall over precision
    #[arg(long)]
    recall: bool,
    /// Filter out duplicate content
    #[arg(long)]
    deduplicate: bool,
    /// Require essential metadata to be present
    #[arg(long)]
    has_metadata: bool,
    /// Print debug information to stderr
    #[arg(short, long)]
    verbose: bool,
    /// HTTP request timeout in seconds
    #[arg(short, long, default_value = "30")]
    timeout: u64,
    /// Skip TLS certificate verification when downloading
    #[arg(long)]
    skip_tls: bool,
    /// User-Agent header sent with HTTP requests
    #[arg(short, long, default_value = DEFAULT_USER_AGENT)]
    user_agent: String,
}
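
/// Supported output formats.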
#[derive(ValueEnum, Clone)]
enum OutputFormat {
Html,
Txt,
Json,
}
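
/// Parse the arguments, run extraction on the chosen source and print the result.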
fn main() {
let cli = Cli::parse();
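    // --verbose enables DEBUG-level tracing output on stderr.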
if cli.verbose {
tracing_subscriber::fmt()
.with_max_level(tracing::Level::DEBUG)
.with_writer(std::io::stderr)
.init();
}
let opts = build_options(&cli);
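    // Pick the input source: an existing file path, an http(s) URL, or stdin.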
let result = match cli.source.as_deref() {
Some(src) if std::path::Path::new(src).exists() => process_file(src, opts),
Some(src) if is_valid_url(src) => {
let url = url::Url::parse(src).expect("already validated");
process_url(&cli, url, opts)
}
Some(src) => {
eprintln!("error: '{}' is neither a file nor a valid URL", src);
std::process::exit(1);
}
None => process_stdin(opts),
};
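    // An empty extraction result or an error is reported on stderr and exits non-zero.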
let result = match result {
Ok(r) if !r.content_text.is_empty() => r,
Ok(_) => {
eprintln!("error: no readable content found");
std::process::exit(1);
}
Err(e) => {
eprintln!("error: {}", e);
std::process::exit(1);
}
};
if let Err(e) = write_output(&mut io::stdout(), &result, &cli.format) {
eprintln!("error writing output: {}", e);
std::process::exit(1);
}
}
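
/// Read an HTML file from disk and run extraction on its contents.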
fn process_file(path: &str, opts: Options) -> Result<ExtractResult, String> {
let html =
std::fs::read_to_string(path).map_err(|e| format!("cannot read '{}': {}", path, e))?;
extract(&html, &opts).map_err(|e| e.to_string())
}
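
/// Download the URL and run extraction on the HTML response body.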
fn process_url(cli: &Cli, url: url::Url, mut opts: Options) -> Result<ExtractResult, String> {
if cli.verbose {
eprintln!("downloading {:?}", url.as_str());
}
let client = reqwest::blocking::Client::builder()
.timeout(std::time::Duration::from_secs(cli.timeout))
.danger_accept_invalid_certs(cli.skip_tls)
.user_agent(&cli.user_agent)
.build()
.map_err(|e| format!("failed to build HTTP client: {}", e))?;
let resp = client
.get(url.as_str())
.send()
.map_err(|e| format!("failed to download '{}': {}", url, e))?;
let content_type = resp
.headers()
.get(reqwest::header::CONTENT_TYPE)
.and_then(|v| v.to_str().ok())
.unwrap_or("")
.to_owned();
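    // Refuse to process responses that are not declared as HTML.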
if !content_type.contains("text/html") {
return Err(format!("page is not HTML: \"{}\"", content_type));
}
let html = resp
.text()
.map_err(|e| format!("failed to read response body: {}", e))?;
opts.original_url = Some(url);
extract(&html, &opts).map_err(|e| e.to_string())
}
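
/// Read HTML from standard input and run extraction on it.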
fn process_stdin(opts: Options) -> Result<ExtractResult, String> {
let mut html = String::new();
io::stdin()
.read_to_string(&mut html)
.map_err(|e| format!("failed to read stdin: {}", e))?;
extract(&html, &opts).map_err(|e| e.to_string())
}
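
/// Translate the CLI flags into trafilatura extraction `Options`.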
fn build_options(cli: &Cli) -> Options {
let focus = if cli.precision {
ExtractionFocus::FavorPrecision
} else if cli.recall {
ExtractionFocus::FavorRecall
} else {
ExtractionFocus::Balanced
};
let mut o = Options::default();
o.enable_fallback = !cli.no_fallback;
o.target_language = cli.language.clone();
o.exclude_comments = cli.no_comments;
o.exclude_tables = cli.no_tables;
o.include_images = cli.images;
o.include_links = cli.links;
o.focus = focus;
o.deduplicate = cli.deduplicate;
o.has_essential_metadata = cli.has_metadata;
o
}
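
/// Write the extraction result to `w` in the requested output format.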
fn write_output(
w: &mut dyn Write,
result: &ExtractResult,
format: &OutputFormat,
) -> io::Result<()> {
match format {
OutputFormat::Html => write_html(w, result),
OutputFormat::Txt => write_text(w, result),
OutputFormat::Json => write_json(w, result),
}
}
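
/// Render the result as a readable HTML document.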
fn write_html(w: &mut dyn Write, result: &ExtractResult) -> io::Result<()> {
let html = create_readable_document(result);
writeln!(w, "{}", html)
}
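
/// Write the extracted text and, when present, the comment text, ending with a newline.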
fn write_text(w: &mut dyn Write, result: &ExtractResult) -> io::Result<()> {
let mut out = result.content_text.clone();
if !result.comments_text.is_empty() {
if !out.is_empty() {
out.push(' ');
}
out.push_str(&result.comments_text);
}
if !out.is_empty() {
out.push('\n');
}
w.write_all(out.as_bytes())
}
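
/// Serialize the result as a single JSON object followed by a trailing newline.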
fn write_json(w: &mut dyn Write, result: &ExtractResult) -> io::Result<()> {
let output = JsonOutput::from(result);
serde_json::to_writer(&mut *w, &output).map_err(io::Error::other)?;
writeln!(w)
}
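
/// Borrowed JSON view of an `ExtractResult`; empty comment fields are omitted.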
#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct JsonOutput<'a> {
content_html: &'a str,
content_text: &'a str,
#[serde(skip_serializing_if = "str::is_empty")]
comments_html: &'a str,
#[serde(skip_serializing_if = "str::is_empty")]
comments_text: &'a str,
metadata: &'a Metadata,
}
impl<'a> From<&'a ExtractResult> for JsonOutput<'a> {
fn from(r: &'a ExtractResult) -> Self {
Self {
content_html: &r.content_html,
content_text: &r.content_text,
comments_html: &r.comments_html,
comments_text: &r.comments_text,
metadata: &r.metadata,
}
}
}
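
/// A source string only counts as a URL if it parses and uses an http(s) scheme.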
fn is_valid_url(s: &str) -> bool {
match url::Url::parse(s) {
Ok(u) => u.scheme() == "http" || u.scheme() == "https",
Err(_) => false,
}
}