use crate::constants::regexps;
use crate::dom::{build_match_string, get_tag_name};
use crate::options::ReaderableOptions;
use crate::scoring::is_probably_visible;
use crate::selectors::SELECTORS;
use dom_query::{Document, Node, NodeId};
use hashbrown::HashSet;
/// Parses `html` into a [`Document`] and reports whether it is probably
/// readerable.
///
/// This is a convenience wrapper: the decision itself is made by
/// `is_probably_readerable_doc`, which receives the parsed document and the
/// `options` unchanged (`None` is forwarded as-is).
pub fn is_probably_readerable(html: &str, options: Option<ReaderableOptions>) -> bool {
    is_probably_readerable_doc(&Document::from(html), options)
}
/// Reports whether an already-parsed document probably contains enough
/// readable content.
///
/// Candidate nodes are everything matched by `SELECTORS.p_pre_article` plus
/// the parents of everything matched by `SELECTORS.div_br`, de-duplicated by
/// node id. Each candidate that survives the filters below adds
/// `sqrt(text_length - options.min_content_length)` to a running score, and
/// the function returns `true` as soon as that score exceeds
/// `options.min_score`. If no prefix of the candidates gets there, it returns
/// `false`.
///
/// A candidate is skipped when:
/// * `is_probably_visible` rejects it,
/// * its match string hits pattern 0 of `CANDIDATE_FILTER_SET` without also
///   hitting pattern 1,
/// * it is a `P` element nested (at any depth) inside an `LI`,
/// * its trimmed text has fewer than `options.min_content_length` characters.
///
/// `options` falls back to `ReaderableOptions::default()` when `None`.
pub(crate) fn is_probably_readerable_doc(
    doc: &Document,
    options: Option<ReaderableOptions>,
) -> bool {
    let options = options.unwrap_or_default();

    let mut nodes: Vec<Node<'_>> = doc
        .select_matcher(&SELECTORS.p_pre_article)
        .nodes()
        .to_vec();

    // Track ids so a parent is added once, even if it holds several matches
    // or was already selected directly above.
    let mut seen_ids: HashSet<NodeId> = nodes.iter().map(|n| n.id).collect();
    for br in doc.select_matcher(&SELECTORS.div_br).nodes().iter() {
        if let Some(parent) = br.parent()
            && seen_ids.insert(parent.id)
        {
            nodes.push(parent);
        }
    }

    let mut score = 0.0;
    // One buffer reused for every candidate's match string.
    let mut match_string_buf = String::with_capacity(128);

    for node in &nodes {
        if !is_probably_visible(node) {
            continue;
        }

        build_match_string(node, &mut match_string_buf);
        let candidate_matches = regexps::CANDIDATE_FILTER_SET.matches(&match_string_buf);
        // NOTE(review): pattern 0 is presumably the "unlikely candidate" regex
        // and pattern 1 the "ok, maybe a candidate" override — confirm against
        // the definition of CANDIDATE_FILTER_SET.
        if candidate_matches.matched(0) && !candidate_matches.matched(1) {
            continue;
        }

        // Equivalent of the CSS test `node.matches("li p")`: only paragraphs
        // inside list items are ignored. Other candidates (`pre`, `article`,
        // `div` parents of `br`) still count when they sit inside a list item.
        if is_paragraph_in_list_item(node) {
            continue;
        }

        let text = node.text();
        let text_length = text.trim().chars().count();
        if text_length < options.min_content_length {
            continue;
        }

        // The guard above guarantees the subtraction cannot underflow.
        score += ((text_length - options.min_content_length) as f64).sqrt();
        if score > options.min_score {
            return true;
        }
    }

    false
}

/// Returns `true` when `node` is a `P` element with an `LI` ancestor.
fn is_paragraph_in_list_item(node: &Node<'_>) -> bool {
    if let Some(tag) = get_tag_name(node)
        && tag == "P"
    {
        has_list_item_ancestor(node)
    } else {
        false
    }
}

/// Walks up from `node`'s parent and returns `true` at the first `LI` ancestor.
fn has_list_item_ancestor(node: &Node<'_>) -> bool {
    let mut ancestor = node.parent();
    while let Some(current) = ancestor {
        if let Some(tag) = get_tag_name(&current)
            && tag == "LI"
        {
            return true;
        }
        ancestor = current.parent();
    }
    false
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Wraps `body` markup in a minimal `<html><body>` page.
    fn page(body: &str) -> String {
        format!("<html><body>{body}</body></html>")
    }

    /// Filler text of 600 `a` characters shared by the long-content tests.
    fn filler() -> String {
        "a".repeat(600)
    }

    /// A paragraph holding a single short word must be rejected.
    #[test]
    fn test_short_content_not_readerable() {
        let html = page("<p>Short</p>");
        assert!(!is_probably_readerable(&html, None));
    }

    /// One long paragraph must be accepted with the default options.
    #[test]
    fn test_long_content_is_readerable() {
        let html = page(&format!("<p>{}</p>", filler()));
        assert!(is_probably_readerable(&html, None));
    }

    /// A long paragraph carrying the `sidebar` class must not be counted.
    #[test]
    fn test_unlikely_candidates_ignored() {
        let html = page(&format!("<p class=\"sidebar\">{}</p>", filler()));
        assert!(!is_probably_readerable(&html, None));
    }

    /// Long text directly inside an `<article>` must be accepted.
    #[test]
    fn test_article_tag_helps() {
        let html = page(&format!("<article>{}</article>", filler()));
        assert!(is_probably_readerable(&html, None));
    }
}