use crate::ExtractionError;
use crate::config::ExtractorOptions;
use crate::document::Document;
/// Parses `html` as a complete document and builds a [`Document`] from its text.
///
/// The body is the whitespace-normalized text returned by `extract_body_text`
/// for the parsed tree. Metadata is not derived from the page here: it is
/// simply `DocumentMetadata::default()`. The `comments`, `raw_text` and `text`
/// fields are always `None`.
///
/// # Errors
///
/// Returns [`ExtractionError::TooShort`] when the body's length in bytes
/// (`String::len`) is strictly less than `options.min_extracted_size`. The
/// variant carries the body length and the configured minimum, in that order.
pub fn run(html: &str, options: &ExtractorOptions) -> Result<Document, ExtractionError> {
    let parsed = scraper::Html::parse_document(html);
    let metadata = kawat_metadata::DocumentMetadata::default();
    let body = extract_body_text(&parsed);

    // `String::len` counts UTF-8 bytes, not characters.
    let extracted_len = body.len();
    let required_len = options.min_extracted_size;

    if extracted_len >= required_len {
        Ok(Document {
            metadata,
            body,
            comments: None,
            raw_text: None,
            text: None,
        })
    } else {
        Err(ExtractionError::TooShort(extracted_len, required_len))
    }
}
/// Collects the text of every text node under the document's root element
/// into one string, with words separated by exactly one space.
///
/// Each text node is split on Unicode whitespace and the resulting words are
/// appended in document order, so runs of whitespace (including those at node
/// boundaries) collapse to a single space and there is no leading or trailing
/// space. Returns an empty string when the tree holds no non-whitespace text.
///
/// No element is filtered out here, so the result contains whatever text nodes
/// the parser produced.
/// NOTE(review): that likely includes `<script>`/`<style>` contents — confirm.
fn extract_body_text(document: &scraper::Html) -> String {
    let root_element = document.root_element();
    let mut normalized = String::new();

    for word in root_element.text().flat_map(str::split_whitespace) {
        // `split_whitespace` never yields empty words, so an empty buffer
        // means this is the first word and needs no separator.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized
}