mod classify;
mod error;
mod paragraph;
mod paragraph_maker;
mod preprocess;
mod revise;
pub mod stoplists;
pub use error::JustextError;
pub use paragraph::{ClassType, Paragraph};
pub use stoplists::{available_languages, get_all_stoplists, get_stoplist};
use std::collections::HashSet;
/// Tunable thresholds for the paragraph classification pipeline.
///
/// The struct is `#[non_exhaustive]`, so code outside this crate cannot build
/// it with a struct literal; start from [`Config::default`] and adjust
/// individual fields with the `with_*` builder methods.
///
/// `PartialEq` is derived so two configurations can be compared directly
/// (`Eq` is not available because several fields are `f64`).
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Config {
    /// Lower length threshold.
    /// NOTE(review): presumably a paragraph text length in characters — confirm in `classify`.
    pub length_low: usize,
    /// Upper length threshold.
    /// NOTE(review): presumably a paragraph text length in characters — confirm in `classify`.
    pub length_high: usize,
    /// Lower stop-word threshold.
    /// NOTE(review): presumably a ratio in `0.0..=1.0` of stop words per paragraph — confirm.
    pub stopwords_low: f64,
    /// Upper stop-word threshold.
    /// NOTE(review): presumably a ratio in `0.0..=1.0` of stop words per paragraph — confirm.
    pub stopwords_high: f64,
    /// Maximum link density tolerated for a paragraph.
    /// NOTE(review): presumably the fraction of paragraph text inside links — confirm.
    pub max_link_density: f64,
    /// Maximum heading distance; this is the only value handed to the revision pass.
    /// NOTE(review): unit (characters vs. paragraphs) is not visible here — confirm in `revise`.
    pub max_heading_distance: usize,
    /// Flag controlling heading handling.
    /// NOTE(review): presumably disables special treatment of headings when `true` — confirm.
    pub no_headings: bool,
}
impl Default for Config {
fn default() -> Self {
Self {
length_low: 70,
length_high: 200,
stopwords_low: 0.30,
stopwords_high: 0.32,
max_link_density: 0.2,
max_heading_distance: 200,
no_headings: false,
}
}
}
/// Chainable, by-value setters for [`Config`].
///
/// Each method consumes the configuration and returns the modified value, so
/// the result has to be used (typically by chaining or rebinding). They are
/// marked `#[must_use]` because discarding the return value silently loses
/// the change.
impl Config {
    /// Returns the configuration with `length_low` replaced by `n`.
    #[must_use]
    pub fn with_length_low(mut self, n: usize) -> Self {
        self.length_low = n;
        self
    }

    /// Returns the configuration with `length_high` replaced by `n`.
    #[must_use]
    pub fn with_length_high(mut self, n: usize) -> Self {
        self.length_high = n;
        self
    }

    /// Returns the configuration with `stopwords_low` replaced by `v`.
    #[must_use]
    pub fn with_stopwords_low(mut self, v: f64) -> Self {
        self.stopwords_low = v;
        self
    }

    /// Returns the configuration with `stopwords_high` replaced by `v`.
    #[must_use]
    pub fn with_stopwords_high(mut self, v: f64) -> Self {
        self.stopwords_high = v;
        self
    }

    /// Returns the configuration with `max_link_density` replaced by `v`.
    #[must_use]
    pub fn with_max_link_density(mut self, v: f64) -> Self {
        self.max_link_density = v;
        self
    }

    /// Returns the configuration with `max_heading_distance` replaced by `n`.
    #[must_use]
    pub fn with_max_heading_distance(mut self, n: usize) -> Self {
        self.max_heading_distance = n;
        self
    }

    /// Returns the configuration with `no_headings` replaced by `v`.
    #[must_use]
    pub fn with_no_headings(mut self, v: bool) -> Self {
        self.no_headings = v;
        self
    }
}
/// Runs the full pipeline on `html` and returns every paragraph it produced.
///
/// The stages run in a fixed order:
/// 1. `preprocess::preprocess` turns the raw HTML into a document,
/// 2. `paragraph_maker::make_paragraphs` derives paragraphs from it,
/// 3. `classify::classify_paragraphs` updates them in place using `stoplist`
///    and `config`,
/// 4. `revise::revise_paragraph_classification` makes a second in-place pass
///    that receives only `config.max_heading_distance`.
///
/// Nothing is filtered out here; callers decide what to keep (for example
/// via `Paragraph::is_boilerplate`).
pub fn justext(html: &str, stoplist: &HashSet<String>, config: &Config) -> Vec<Paragraph> {
    let heading_distance = config.max_heading_distance;

    let document = preprocess::preprocess(html);
    let mut result = paragraph_maker::make_paragraphs(&document);

    classify::classify_paragraphs(&mut result, stoplist, config);
    revise::revise_paragraph_classification(&mut result, heading_distance);

    result
}
/// Returns the text of every paragraph of `html` that is not boilerplate,
/// joined with a single `"\n"` between paragraphs.
///
/// Paragraphs come from [`justext`] and keep their original order. The
/// result is an empty string when no paragraph survives the filter.
pub fn extract_text(html: &str, stoplist: &HashSet<String>, config: &Config) -> String {
    let mut kept = Vec::new();

    for paragraph in justext(html, stoplist, config) {
        if paragraph.is_boilerplate() {
            continue;
        }
        // Move the text out of the paragraph; the rest of it is no longer needed.
        kept.push(paragraph.text);
    }

    kept.join("\n")
}
/// Same as [`justext`], but looks the stoplist up by `language` first.
///
/// # Errors
///
/// Propagates the error from `get_stoplist` when the stoplist for `language`
/// cannot be obtained; in that case `html` is never processed.
pub fn justext_lang(
    html: &str,
    language: &str,
    config: &Config,
) -> Result<Vec<Paragraph>, JustextError> {
    let paragraphs = justext(html, &get_stoplist(language)?, config);
    Ok(paragraphs)
}
/// Same as [`extract_text`], but looks the stoplist up by `language` first.
///
/// # Errors
///
/// Propagates the error from `get_stoplist` when the stoplist for `language`
/// cannot be obtained; in that case `html` is never processed.
pub fn extract_text_lang(
    html: &str,
    language: &str,
    config: &Config,
) -> Result<String, JustextError> {
    let words = get_stoplist(language)?;
    let text = extract_text(html, &words, config);
    Ok(text)
}