Skip to main content

kawat_core/
lib.rs

//! Core extraction orchestrator.
//!
//! Implements the full trafilatura extraction cascade:
//!
//! ```text
//! HTML → parse → metadata → clean → convert tags → extract comments
//!   → trafilatura_sequence:
//!       extract_content (BODY_XPATH)
//!       → if not fast: compare_extraction (readability + justext)
//!       → if still short: baseline
//!   → size checks → dedup → language filter → output format
//! ```
13
14pub mod cascade;
15pub mod compare;
16pub mod config;
17pub mod document;
18
19pub use config::ExtractorOptions;
20pub use document::Document;
21pub use kawat_output::OutputFormat;
22
23use thiserror::Error;
24
25#[derive(Debug, Error)]
26pub enum ExtractionError {
27    #[error("HTML parsing failed")]
28    ParseError,
29    #[error("document too short (len={0}, min={1})")]
30    TooShort(usize, usize),
31    #[error("duplicate document")]
32    Duplicate,
33    #[error("wrong language: expected {expected}, got {got:?}")]
34    WrongLanguage {
35        expected: String,
36        got: Option<String>,
37    },
38    #[error("blacklisted URL: {0}")]
39    BlacklistedUrl(String),
40    #[error("missing required metadata")]
41    MissingMetadata,
42}
43
44/// Extract content from an HTML document.
45///
46/// This is the main entry point, equivalent to trafilatura's `bare_extraction()`.
47pub fn bare_extraction(
48    html: &str,
49    options: &ExtractorOptions,
50) -> Result<Document, ExtractionError> {
51    cascade::run(html, options)
52}
53
54/// Extract and format content, equivalent to trafilatura's `extract()`.
55pub fn extract(html: &str, options: &ExtractorOptions) -> Result<String, ExtractionError> {
56    let doc = bare_extraction(html, options)?;
57    Ok(doc.to_formatted_string(options))
58}