mod cleaner;
mod scorer;
mod utils;
use crate::logging::logger::*;
use crate::logging::logging_defs::*;
use crate::models::ExtractOptions;
use crate::node_ext::NodeScoreStore;
use crate::parser::{NodeExt, NodeRef, new_html_element, parse_html};
use crate::utils::*;
use std::collections::HashSet;
use std::sync::LazyLock;
use utils::*;
/// Tag names that are kept unchanged when a sibling of the top candidate is
/// copied into the article container; any other tag is cloned and renamed to
/// `div` by `ContentParser::parse`.
pub static ALTER_TO_DIV_EXCEPTIONS: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    ["div", "article", "section", "p", "ol", "ul"]
        .iter()
        .copied()
        .collect()
});
/// Value written to the `id` attribute of the element that wraps the
/// extracted article content.
const DEFAULT_CONTENT_ID: &str = "readability-page-1";
/// Value written to the `class` attribute of the element that wraps the
/// extracted article content.
const DEFAULT_CONTENT_CLASS: &str = "page";
/// Result of a successful content extraction.
pub struct ContentData {
/// Root node of the extracted article content.
pub content: NodeRef,
/// By-line gathered during extraction; empty when none was found.
pub by_line: String,
/// Value of the first `dir` attribute found on the top candidate, its parent
/// or the parent's ancestors; empty when none of them carries one.
pub article_dir: String,
}
/// State for one content-extraction run over a parsed document.
struct ContentParser<'a> {
/// Document the article is extracted from.
document: NodeRef,
/// Performance logger that receives spans and points recorded while parsing.
logger: &'a PerfLogger,
/// Extraction options; `parse` switches individual flags off between retries.
options: ExtractOptions,
/// Copy of `options.debug` taken at construction time.
debug: bool,
/// Article title, handed to the cleaner and used in failure diagnostics.
article_title: String,
/// By-line known before parsing starts, used as the starting by-line value.
initial_by_line: Option<String>,
}
impl<'a> ContentParser<'a> {
/// Creates a parser for `document`.
///
/// The `debug` flag is copied out of `options` so it can be consulted
/// without going through the options struct.
fn new(
    document: NodeRef,
    article_title: String,
    options: ExtractOptions,
    initial_by_line: Option<String>,
    logger: &PerfLogger,
) -> ContentParser<'_> {
    ContentParser {
        // Must be read before `options` is moved into the struct below.
        debug: options.debug,
        document,
        logger,
        options,
        article_title,
        initial_by_line,
    }
}
/// Removes every node with one of `tags_to_remove` under `article_content`
/// for which `is_safe_to_remove_node` returns `true`.
///
/// Does nothing when the `clean_conditionally` option is off.
fn remove_nodes_conditionally(
    &self,
    article_content: &NodeRef,
    tags_to_remove: &[&str],
    store: &NodeScoreStore,
) {
    if !self.options.clean_conditionally {
        return;
    }
    let density_modifier = self.options.link_density_modifier;
    for tag_name in tags_to_remove {
        // Lists are flagged so the removal predicate can treat them specially.
        let is_list = matches!(*tag_name, "ul" | "ol");
        remove_nodes(article_content, tag_name, |node, tag| {
            is_safe_to_remove_node(node, tag, is_list, density_modifier, self.logger, store)
        });
    }
}
/// Cleans the assembled article subtree in place before it is measured and
/// returned.
///
/// The passes run in a fixed order: presentation styles, data-table marking
/// and lazy images first, then a series of tag-based removals (forms, embeds,
/// galleries, footers, share widgets, form controls, negatively weighted
/// headings, conditionally removable tables/lists/divs, ad/loading
/// placeholders), followed by structural fix-ups (`h1` -> `h2`, empty
/// paragraphs, `<br>` before `<p>`, single-cell tables) and finally a sweep
/// over empty nodes.
///
/// * `article_content` - root of the subtree to clean; mutated in place.
/// * `logger` - forwarded to the text helpers.
/// * `store` - node score store used by data-table marking and the
///   conditional removals.
fn prep_article(
    &self,
    article_content: &NodeRef,
    logger: &PerfLogger,
    store: &mut NodeScoreStore,
) {
    clean_presentation_styles(article_content);
    mark_data_tables_in_node(article_content, store);
    fix_lazy_images(article_content);

    self.remove_nodes_conditionally(article_content, &["form", "fieldset"], store);

    // Embedded objects are dropped unless they look like a useful video.
    for t in &["object", "embed", "iframe"] {
        remove_nodes(article_content, t, |node, tag| {
            !is_possibly_useful_video_node(node, tag)
        });
    }
    remove_nodes(article_content, "div", |node, _| {
        node.attr_value("data-scald-gallery").is_some()
    });
    for t in &["footer", "link", "aside"] {
        remove_nodes(article_content, t, |_, _| true);
    }

    // Share widgets: only removed when their text is shorter than the
    // configured threshold. The regex is checked first so the text length is
    // only computed for nodes that actually match.
    let char_threshold = self.options.char_threshold as usize;
    for n in article_content.children() {
        remove_matched_nodes(&n, |node, match_str| {
            SHARE_ELEMENTS_REGEX.is_match(match_str)
                && node.text_contents().chars().count() < char_threshold
        });
    }

    for t in &["input", "textarea", "select", "button"] {
        remove_nodes(article_content, t, |_, _| true);
    }
    for t in &["h1", "h2"] {
        remove_nodes(article_content, t, |node, _| {
            get_class_and_id_weight(node) < 0
        });
    }

    self.remove_nodes_conditionally(article_content, &["table", "ul"], store);
    self.remove_nodes_conditionally(article_content, &["div"], store);

    for t in &["div", "section"] {
        remove_nodes(article_content, t, |node, _| {
            let inner = get_normalized_text_content(node, logger);
            matches_ad_or_loading(inner.as_str())
        });
    }

    rename_tags_with_selector(article_content, "h1", "h2");

    // Paragraphs with neither text nor embedded media are removed.
    remove_nodes(article_content, "p", |node, _| {
        let img = select_descendants(node, "img").len();
        let embed = select_descendants(node, "embed").len();
        let object = select_descendants(node, "object").len();
        let iframe = select_descendants(node, "iframe").len();
        let total_count = img + embed + object + iframe;
        total_count == 0 && node.text_contents().is_empty()
    });

    // A <br> whose next element is a <p> is removed.
    apply(article_content, &["br"], |br, _| {
        if let Some(next) = next_element(Some(br.clone())) {
            if next.element_name() == Some("p") {
                br.detach();
            }
        }
    });

    // Tables that wrap a single cell are replaced by that cell, renamed to
    // <p> when it only holds phrasing content and to <div> otherwise.
    apply(article_content, &["table"], |table, _| {
        let tbody = if contains_single_tag_in_element(table, "tbody") {
            let Some(tbody) = table.first_element_child() else {
                return;
            };
            tbody
        } else {
            table.clone()
        };
        if !contains_single_tag_in_element(&tbody, "tr") {
            return;
        }
        let Some(row) = tbody.first_element_child() else {
            return;
        };
        if !contains_single_tag_in_element(&row, "td") {
            return;
        }
        let Some(cell) = row.first_element_child() else {
            return;
        };
        let new_tag_name = if test_all_siblings(cell.first_child(), is_phrasing_content) {
            "p"
        } else {
            "div"
        };
        let cell = cell.clone_and_rename_element(new_tag_name);
        table.insert_after(cell);
        table.detach();
    });

    // Snapshot the empty nodes before touching the tree: detaching a node
    // while a lazy `descendants()` walk is positioned on it can cut the
    // traversal short, leaving later empty nodes in place.
    let empty_nodes: Vec<NodeRef> = article_content
        .descendants()
        .filter(|n| is_empty_node(n, logger))
        .collect();
    for n in empty_nodes {
        match n.attr_value("id") {
            Some(id) => {
                // Nodes carrying an id are swapped for an empty <span> with
                // the same id; <a> and <span> are left untouched.
                if let Some(name) = n.element_name() {
                    if name != "a" && name != "span" {
                        let span = new_html_element("span");
                        let e = span.as_element().unwrap();
                        e.attributes.borrow_mut().insert("id", id);
                        n.insert_after(span);
                        n.detach();
                    }
                }
            }
            None => n.detach(),
        }
    }
}
/// Returns the document's `body` node, or the document itself when no
/// `body` can be selected.
///
/// In debug mode the missing body is recorded as a `missing_body` point on
/// the `PARSE_CONTENT` span.
fn get_content_node(&self) -> NodeRef {
    match self.document.select_first("body") {
        Ok(body) => body.as_node().clone(),
        Err(_) => {
            if self.debug {
                let logger = self.logger;
                add_point_to_span_str!(logger, PARSE_CONTENT, "missing_body");
            }
            self.document.clone()
        }
    }
}
/// Marks the page container with the default content `id` and `class`.
///
/// When the top candidate was created by the scorer, the attributes are set
/// directly on it. Otherwise a new `div` carrying the attributes is created,
/// all children of `article_content` are moved into it, and it is appended
/// to `article_content`.
fn update_or_create_top_candidate_node(
    &self,
    article_content: &NodeRef,
    top_candidate: &NodeRef,
    had_to_create_top_candidate_node: bool,
    _logger: &PerfLogger,
) {
    // Writes the default id/class pair onto an element node.
    let mark_as_page = |node: &NodeRef| {
        let element = node.as_element().unwrap();
        let mut attrs = element.attributes.borrow_mut();
        attrs.insert("id", String::from(DEFAULT_CONTENT_ID));
        attrs.insert("class", String::from(DEFAULT_CONTENT_CLASS));
    };

    if had_to_create_top_candidate_node {
        mark_as_page(top_candidate);
        return;
    }

    let wrapper = new_html_element("div");
    mark_as_page(&wrapper);
    move_children(article_content, &wrapper);
    article_content.append(wrapper);
}
/// Runs the extraction loop and returns the extracted article, or `None`
/// when no attempt produced any text.
///
/// Each pass cleans the document, scores the resulting elements, gathers the
/// top candidate and its qualifying siblings into a fresh `div`, runs
/// `prep_article` on it and compares its normalized text length with
/// `options.char_threshold`. When the text is too short the document is
/// re-parsed from the HTML serialized before the first pass and one option
/// is switched off per retry, in the order `strip_unlikelys`,
/// `weight_classes`, `clean_conditionally`. Once all three are off, the
/// longest recorded attempt is used.
///
/// Note that this mutates `self.options`: flags switched off during retries
/// stay off after the call returns.
pub fn parse(&mut self, store: &mut NodeScoreStore) -> Option<ContentData> {
let logger = &self.logger;
start_span!(logger, PARSE_CONTENT);
// NOTE(review): the body is looked up once on `self.document` and passed to
// every scoring pass, although `doc_node` is replaced by a fresh parse on
// retries — confirm the scorer does not need the body of the current tree.
let body_content = self.get_content_node();
add_point_to_span_str!(
logger,
PARSE_CONTENT,
"get_content_node_and_its_serialized_html_begin"
);
// Keep the serialized HTML so every retry can start from a fresh parse.
let (mut doc_node, content_cache) = (self.document.clone(), self.document.to_string());
let mut by_line: Option<String> = self.initial_by_line.clone();
// Every too-short result, paired with its normalized text length.
let mut attempts: Vec<(NodeRef, usize)> = vec![];
let mut article_dir: Option<String> = None;
loop {
let elements_to_score = cleaner::clean(
&doc_node,
&mut by_line,
self.options.strip_unlikelys,
self.article_title.as_str(),
logger,
);
let scoring_res = scorer::score_elements(
&elements_to_score,
store,
self.options.weight_classes,
self.options.n_top_candidates as usize,
&body_content,
logger,
);
let mut article_content = new_html_element("div");
let top_candidate = scoring_res.top_candidate;
// Captured before the loop below may move the top candidate into
// `article_content`.
let parent_of_top_candidate = top_candidate.parent();
if let Some(p) = parent_of_top_candidate.clone() {
for sibling in p.element_children() {
// The top candidate itself is always taken; other siblings must pass
// `should_append_sibling`.
let append = if sibling == top_candidate {
true
} else {
should_append_sibling(&sibling, &top_candidate, logger, store)
};
if append {
if let Some(sibling_name) = sibling.element_name() {
let sibling_name = sibling_name.to_lowercase();
// Tags outside the exception set are copied as `div`; the rest are
// appended as they are.
if !ALTER_TO_DIV_EXCEPTIONS.contains(sibling_name.as_str()) {
article_content.append(sibling.clone_and_rename_element("div"));
} else {
article_content.append(sibling);
}
} else {
article_content.append(sibling);
}
}
}
}
self.prep_article(&article_content, logger, store);
self.update_or_create_top_candidate_node(
&article_content,
&top_candidate,
scoring_res.had_to_create_top_candidate_node,
logger,
);
let mut parse_successfully = true;
let normalized_text = get_normalized_text_content(&article_content, logger);
let text_length = normalized_text.chars().count();
if text_length < self.options.char_threshold as usize {
// Too little text: remember this attempt and retry on a fresh parse with
// one more option switched off.
parse_successfully = false;
doc_node = parse_html(content_cache.as_str());
attempts.push((article_content.clone(), text_length));
if self.options.strip_unlikelys {
self.options.strip_unlikelys = false;
} else if self.options.weight_classes {
self.options.weight_classes = false;
} else if self.options.clean_conditionally {
self.options.clean_conditionally = false;
} else {
// Every option is already off: pick the longest attempt (descending sort
// by text length).
attempts.sort_by(|lhs, rhs| rhs.1.cmp(&lhs.1));
if let Some(attmpt) = attempts.first() {
if attmpt.1 == 0 {
// Even the best attempt has no normalized text; accept the current
// content only if its raw text is not blank.
let fallback_text = article_content.text_contents();
if !fallback_text.trim().is_empty() {
parse_successfully = true;
} else {
d!({
eprintln!(
"Failed to get readable article content with all flags turned off: {}",
self.article_title
)
});
return None;
}
} else {
article_content = attmpt.0.clone();
parse_successfully = true;
}
} else {
// NOTE(review): `attempts` received an entry just above, so this branch
// looks unreachable — confirm before relying on it.
d!({
eprintln!(
"Failed to get readable article content with all flags turned off: {}",
self.article_title
)
});
return None;
}
}
}
if parse_successfully {
// Text direction: first `dir` attribute found on the top candidate, its
// parent, or the parent's ancestors.
let mut ancestors = vec![top_candidate];
if let Some(p) = parent_of_top_candidate {
ancestors.push(p.clone());
ancestors.append(&mut get_node_ancestors(&p, DEFAULT_MAX_ANCESTORS_DEPTH));
}
for n in ancestors {
if let (Some(_), Some(dir)) = (n.element_name(), n.attr_value("dir")) {
article_dir = Some(dir);
break;
}
}
return Some(ContentData {
content: article_content,
by_line: by_line.unwrap_or_default(),
article_dir: article_dir.unwrap_or_default(),
});
}
}
}
}
/// Extracts the main article content from `document`.
///
/// Builds a `ContentParser` from the arguments and runs it once. Returns
/// `None` when the parser could not produce any readable content.
pub fn get_content(
    document: NodeRef,
    options: ExtractOptions,
    article_title: String,
    initial_by_line: Option<String>,
    store: &mut NodeScoreStore,
    logger: &PerfLogger,
) -> Option<ContentData> {
    ContentParser::new(document, article_title, options, initial_by_line, logger).parse(store)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::logging::logger::PerfLogger;
    use std::panic::{AssertUnwindSafe, catch_unwind};

    fn new_logger() -> PerfLogger {
        PerfLogger::new(vec![])
    }

    /// Parses a document that has no `body` with the given debug setting and
    /// reports whether `parse` returned without panicking.
    fn parse_without_body_completes(debug: bool) -> bool {
        let doc = new_html_element("div");
        let logger = new_logger();
        let mut options = ExtractOptions::default();
        options.debug = debug;
        let mut parser = ContentParser::new(doc, "title".to_string(), options, None, &logger);
        let mut store = NodeScoreStore::default();
        catch_unwind(AssertUnwindSafe(|| parser.parse(&mut store))).is_ok()
    }

    #[test]
    fn parse_no_body_does_not_panic_with_debug_false() {
        assert!(parse_without_body_completes(false));
    }

    #[test]
    fn parse_no_body_does_not_panic_with_debug_true() {
        assert!(parse_without_body_completes(true));
    }
}