pub mod cascade;
pub mod compare;
pub mod config;
pub mod document;
pub use config::ExtractorOptions;
pub use document::Document;
pub use kawat_output::OutputFormat;
use thiserror::Error;
/// Error type returned by this module's extraction entry points
/// ([`bare_extraction`] and [`extract`]).
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
/// The exact conditions under which each variant is produced are decided by the
/// extraction pipeline and are not visible in this file.
#[derive(Debug, Error)]
pub enum ExtractionError {
/// The HTML input could not be parsed.
#[error("HTML parsing failed")]
ParseError,
/// The document was too short.
///
/// The first field is rendered as `len` and the second as `min` in the
/// message. NOTE(review): the unit of both values (bytes, chars, words)
/// is not visible here; confirm against the code that builds this variant.
#[error("document too short (len={0}, min={1})")]
TooShort(usize, usize),
/// The document was reported as a duplicate.
#[error("duplicate document")]
Duplicate,
/// The document's language did not match the expected one.
#[error("wrong language: expected {expected}, got {got:?}")]
WrongLanguage {
/// The language that was expected.
expected: String,
/// The language that was found instead. It is formatted with `{:?}`, so the
/// message shows `Some("..")` or `None`. Presumably `None` means no language
/// could be determined; verify against the producer.
got: Option<String>,
},
/// The URL was rejected as blacklisted; the payload is rendered in the
/// message as that URL.
#[error("blacklisted URL: {0}")]
BlacklistedUrl(String),
/// Required metadata was missing.
#[error("missing required metadata")]
MissingMetadata,
}
/// Runs the extraction cascade on `html` and returns the resulting [`Document`].
///
/// This is a thin wrapper: it forwards both arguments unchanged to
/// `cascade::run` and returns its result as-is.
///
/// # Errors
///
/// Returns whatever [`ExtractionError`] `cascade::run` reports.
pub fn bare_extraction(
html: &str,
options: &ExtractorOptions,
) -> Result<Document, ExtractionError> {
cascade::run(html, options)
}
/// Extracts a document from `html` and renders it as a `String`.
///
/// The document is obtained from [`bare_extraction`] and then turned into text
/// by `Document::to_formatted_string`, which receives the same `options`.
///
/// # Errors
///
/// Returns the [`ExtractionError`] produced by [`bare_extraction`]; in that
/// case no formatting is attempted.
pub fn extract(html: &str, options: &ExtractorOptions) -> Result<String, ExtractionError> {
    // `map` only touches the `Ok` value, so a failed extraction passes through unchanged.
    bare_extraction(html, options).map(|document| document.to_formatted_string(options))
}