1pub mod cascade;
15pub mod compare;
16pub mod config;
17pub mod document;
18
19pub use config::ExtractorOptions;
20pub use document::Document;
21pub use kawat_output::OutputFormat;
22
23use thiserror::Error;
24
25#[derive(Debug, Error)]
26pub enum ExtractionError {
27 #[error("HTML parsing failed")]
28 ParseError,
29 #[error("document too short (len={0}, min={1})")]
30 TooShort(usize, usize),
31 #[error("duplicate document")]
32 Duplicate,
33 #[error("wrong language: expected {expected}, got {got:?}")]
34 WrongLanguage {
35 expected: String,
36 got: Option<String>,
37 },
38 #[error("blacklisted URL: {0}")]
39 BlacklistedUrl(String),
40 #[error("missing required metadata")]
41 MissingMetadata,
42}
43
44pub fn bare_extraction(
48 html: &str,
49 options: &ExtractorOptions,
50) -> Result<Document, ExtractionError> {
51 cascade::run(html, options)
52}
53
54pub fn extract(html: &str, options: &ExtractorOptions) -> Result<String, ExtractionError> {
56 let doc = bare_extraction(html, options)?;
57 Ok(doc.to_formatted_string(options))
58}