use kawat_output::OutputFormat;
use std::collections::HashSet;
/// Extraction focus selector stored in `ExtractorOptions::focus`.
///
/// The type is a plain, field-less enum: it is `Copy`, comparable with `==`,
/// and printable with `{:?}`.
///
/// NOTE(review): the names suggest a precision/recall trade-off for content
/// extraction; the code that interprets each variant is not visible here —
/// confirm against the consumer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Focus {
    /// The value used by `ExtractorOptions::default()`.
    Balanced,
    /// Presumably favours precision (less noise, possibly less text) — confirm.
    Precision,
    /// Presumably favours recall (more text, possibly more noise) — confirm.
    Recall,
}
/// Option bundle for the extractor (going by the type name).
///
/// Every field is public, so callers can start from [`Default`] and override
/// individual values with struct-update syntax. The "Default:" values quoted
/// below are the ones assigned by the `Default` impl for this type.
///
/// NOTE(review): how each option is interpreted is decided by the code that
/// reads it, which is not part of this definition. Semantic remarks marked
/// "presumably" are inferred from field names only and must be confirmed
/// against the consumers.
#[derive(Debug, Clone)]
pub struct ExtractorOptions {
    // ---- output and strategy ------------------------------------------------
    /// Output format. Default: `OutputFormat::Txt`.
    pub format: OutputFormat,
    /// Presumably selects a faster extraction path — confirm. Default: `false`.
    pub fast: bool,
    /// Extraction focus. Default: `Focus::Balanced`.
    pub focus: Focus,

    // ---- content toggles ----------------------------------------------------
    /// Presumably controls whether comments are extracted. Default: `true`.
    pub comments: bool,
    /// Presumably controls whether text formatting is kept. Default: `false`.
    pub formatting: bool,
    /// Presumably controls whether links are kept. Default: `false`.
    pub links: bool,
    /// Presumably controls whether images are kept. Default: `false`.
    pub images: bool,
    /// Presumably controls whether tables are kept. Default: `true`.
    pub tables: bool,
    /// Presumably enables duplicate removal. Default: `false`.
    pub dedup: bool,

    // ---- language and metadata ----------------------------------------------
    /// Optional target language; `None` by default.
    /// NOTE(review): expected string format (e.g. ISO 639-1 code) is not
    /// established here — confirm.
    pub target_language: Option<String>,
    /// Presumably requests metadata alongside the content. Default: `false`.
    pub with_metadata: bool,
    /// Presumably restricts output to documents that have metadata — confirm.
    /// Default: `false`.
    pub only_with_metadata: bool,

    // ---- filters ------------------------------------------------------------
    /// Set of URL strings to exclude (by name). Default: empty.
    pub url_blacklist: HashSet<String>,
    /// Set of author strings to exclude (by name). Default: empty.
    pub author_blacklist: HashSet<String>,
    /// Selector strings for pruning (by name). Default: empty.
    /// NOTE(review): selector syntax (CSS vs. XPath) is not established here.
    pub prune_selectors: Vec<String>,

    // ---- date extraction ----------------------------------------------------
    /// Options of type `htmldate_rs::DateOptions`.
    /// Default: `htmldate_rs::DateOptions::default()`.
    pub date_params: htmldate_rs::DateOptions,

    // ---- size thresholds and limits -----------------------------------------
    // NOTE(review): units (bytes vs. characters) are not established here.
    /// Default: `250`.
    pub min_extracted_size: usize,
    /// Default: `1`.
    pub min_output_size: usize,
    /// Default: `1`.
    pub min_output_comm_size: usize,
    /// Default: `1`.
    pub min_extracted_comm_size: usize,
    /// Default: `100`.
    pub min_duplcheck_size: usize,
    /// Default: `2`.
    pub max_repetitions: usize,
}
impl Default for ExtractorOptions {
fn default() -> Self {
Self {
format: OutputFormat::Txt,
fast: false,
focus: Focus::Balanced,
comments: true,
formatting: false,
links: false,
images: false,
tables: true,
dedup: false,
target_language: None,
with_metadata: false,
only_with_metadata: false,
url_blacklist: HashSet::new(),
author_blacklist: HashSet::new(),
prune_selectors: Vec::new(),
date_params: htmldate_rs::DateOptions::default(),
min_extracted_size: 250,
min_output_size: 1,
min_output_comm_size: 1,
min_extracted_comm_size: 1,
min_duplcheck_size: 100,
max_repetitions: 2,
}
}
}