use std::collections::{HashMap, HashSet};
use std::ops::Deref;
use select::document::Document;
use select::node::Node;
use select::predicate::{Attr, Name, Predicate};
use crate::clean::{DefaultDocumentCleaner, DocumentCleaner};
use crate::video::VideoNode;
use crate::Language;
use url::Url;
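/// Attribute `(key, value)` pairs that commonly mark the container holding an
/// article's body text.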
pub const ARTICLE_BODY_ATTR: &[(&str, &str); 3] = &[
("itemprop", "articleBody"),
("data-testid", "article-body"),
("name", "articleBody"),
];
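/// Characters treated as punctuation when splitting text into words.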
pub const PUNCTUATION: &str = r###",."'!?&-/:;()#$%*+<=>@[\]^_`{|}~"###;
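/// Convenience access to the first direct text child of a node.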
pub trait TextContainer<'a> {
fn first_children_text(&self) -> Option<&'a str>;
}
impl<'a> TextContainer<'a> for Node<'a> {
fn first_children_text(&self) -> Option<&'a str> {
self.children().find_map(|n| n.as_text())
}
}
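/// Iterator over a document's text-bearing nodes (`p`, `pre`, `td`), skipping
/// everything nested inside `figure`, `media` or `aside` elements.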
pub struct TextNodeFind<'a> {
document: &'a Document,
next: usize,
}
impl<'a> TextNodeFind<'a> {
    /// Nodes that typically hold article text.
    fn is_text_node(node: &Node<'a>) -> bool {
        Name("p").or(Name("pre")).or(Name("td")).matches(node)
    }

    /// Nodes whose entire subtree should be skipped.
    fn is_bad(node: &Node<'a>) -> bool {
        Name("figure")
            .or(Name("media"))
            .or(Name("aside"))
            .matches(node)
    }
fn new(document: &'a Document) -> Self {
Self { document, next: 0 }
}
}
impl<'a> Iterator for TextNodeFind<'a> {
type Item = Node<'a>;
    fn next(&mut self) -> Option<Node<'a>> {
        while self.next < self.document.nodes.len() {
            let node = self.document.nth(self.next).unwrap();
            self.next += 1;
            if Self::is_bad(&node) {
                // Nodes are stored in document (pre-)order, so a bad node's
                // whole subtree is skipped by jumping over its descendants.
                self.next += node.descendants().count();
                continue;
            }
            if Self::is_text_node(&node) {
                return Some(node);
            }
        }
        None
    }
}
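/// Wrapper around the DOM node identified as holding the main article text.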
#[derive(Debug, Clone)]
pub struct ArticleTextNode<'a> {
inner: Node<'a>,
}
impl<'a> ArticleTextNode<'a> {
pub fn new(inner: Node<'a>) -> Self {
Self { inner }
}
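    /// Returns the node's text after running it through the default document
    /// cleaner.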
pub fn clean_text(&self) -> String {
DefaultDocumentCleaner.clean_node_text(self.inner)
}
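    /// Collects all image URLs in the article body; relative URLs are
    /// resolved against `base_url` if one is provided.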
pub fn images(&self, base_url: Option<&Url>) -> Vec<Url> {
let options = Url::options().base_url(base_url);
self.inner
.find(Name("img"))
.filter_map(|n| n.attr("href").map(str::trim))
.filter_map(|url| options.parse(url).ok())
.collect()
}
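    /// Collects every unique, absolute link inside the cleaned article body.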
pub fn references(&self) -> Vec<Url> {
let mut uniques = HashSet::new();
DefaultDocumentCleaner
.iter_clean_nodes(self.inner)
.filter(|n| Name("a").matches(n))
.filter_map(|n| n.attr("href").map(str::trim))
.filter(|href| uniques.insert(*href))
.filter_map(|url| Url::parse(url).ok())
.collect()
}
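    /// Collects all embedded videos, including bare `embed` tags that are not
    /// wrapped in an `object` element.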
pub fn videos(&self) -> Vec<VideoNode<'a>> {
let mut videos: Vec<_> = self
.inner
.find(VideoNode::node_predicate())
.map(VideoNode::new)
.collect();
videos.extend(
self.inner
.find(Name("embed"))
                // Keep `embed` tags only when their parent exists and is not
                // an `object` element.
                .filter(|n| n.parent().map_or(false, |p| p.name() != Some("object")))
.map(VideoNode::new),
);
videos
}
}
impl<'a> Deref for ArticleTextNode<'a> {
type Target = Node<'a>;
fn deref(&self) -> &Self::Target {
&self.inner
}
}
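/// Heuristics for locating the node that holds an article's main text.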
pub struct ArticleTextNodeExtractor;
impl ArticleTextNodeExtractor {
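    /// A preceding paragraph must contain more than this many stopwords to
    /// make a node boostable.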
pub const MINIMUM_STOPWORD_COUNT: usize = 5;
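    /// Maximum number of preceding `p` siblings examined when checking
    /// whether a node is boostable.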
pub const MAX_STEPSAWAY_FROM_NODE: usize = 3;
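    /// Predicate matching nodes that advertise themselves as the article body
    /// via one of the attributes in [`ARTICLE_BODY_ATTR`].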
    pub fn article_body_predicate() -> for<'r, 's> fn(&'r Node<'s>) -> bool {
        |node| {
            ARTICLE_BODY_ATTR
                .iter()
                .any(|&(k, v)| Attr(k, v).matches(node))
        }
    }
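    /// Scores every candidate text node by its stopword count, boosts nodes
    /// that follow meaningful paragraphs, damps the bottom quarter of long
    /// documents, propagates each score to the nearest three ancestors and
    /// returns the ancestor with the highest total.
    ///
    /// A minimal sketch of a call site (marked `ignore`: how a `Language`
    /// value is obtained depends on the rest of this crate and is assumed
    /// here):
    ///
    /// ```ignore
    /// use select::document::Document;
    ///
    /// let doc = Document::from("<html><body><p>Some article text…</p></body></html>");
    /// // Hypothetical constructor; substitute the crate's actual `Language` API.
    /// let lang = Language::default();
    /// if let Some(best) = ArticleTextNodeExtractor::calculate_best_node(&doc, lang) {
    ///     println!("{}", best.clean_text());
    /// }
    /// ```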
pub fn calculate_best_node(doc: &Document, lang: Language) -> Option<ArticleTextNode> {
let mut starting_boost = 1.0;
let txt_nodes: Vec<_> = ArticleTextNodeExtractor::nodes_to_check(doc)
.filter(|n| !ArticleTextNodeExtractor::is_high_link_density(n))
.filter_map(|node| {
if let Some(stats) = node
.first_children_text()
.and_then(|txt| lang.stopword_count(txt))
{
if stats.stopword_count > 2 {
return Some((node, stats));
}
}
None
})
.collect();
let mut nodes_scores = HashMap::with_capacity(txt_nodes.len());
let nodes_number = txt_nodes.len();
        // Never increased at the moment; it only feeds the `negscore` cap
        // check below.
        let negative_scoring = 0.0;
let bottom_negativescore_nodes = nodes_number as f64 * 0.25;
for (i, (node, stats)) in txt_nodes.iter().enumerate() {
let mut boost_score = 0.0;
if ArticleTextNodeExtractor::is_boostable(node, lang.clone()) {
boost_score = (1.0 / starting_boost) * 50.0;
starting_boost += 1.0;
}
if nodes_number > 15 {
let score = (nodes_number - i) as f64;
if score <= bottom_negativescore_nodes {
let booster = bottom_negativescore_nodes - score;
boost_score = booster.powf(2.0) * -1.0;
let negscore = boost_score.abs() + negative_scoring;
if negscore > 40.0 {
boost_score = 5.0;
}
}
}
            // Add the boost in floating point first: casting a negative f64
            // straight to usize saturates at 0, which would silently discard
            // negative boosts.
            let upscore = (stats.stopword_count as f64 + boost_score).max(0.0) as usize;
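            // Propagate the paragraph's score upwards: the parent receives
            // the full upscore, the grandparent half and the
            // great-grandparent a third of it.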
if let Some(parent) = node.parent() {
let (score, cnt) = nodes_scores
.entry(parent.index())
.or_insert((0usize, 0usize));
*score += upscore;
*cnt += 1;
if let Some(parent_parent) = parent.parent() {
let (score, cnt) = nodes_scores
.entry(parent_parent.index())
.or_insert((0usize, 0usize));
*cnt += 1;
*score += upscore / 2;
if let Some(parent_2) = parent_parent.parent() {
let (score, cnt) = nodes_scores
.entry(parent_2.index())
.or_insert((0usize, 0usize));
*cnt += 1;
*score += upscore / 3;
}
}
}
}
        // The best node is the ancestor that accumulated the highest score.
        let index = nodes_scores
            .into_iter()
            .max_by_key(|&(_, (score, _))| score)
            .map(|(index, _)| index);
        index.map(|i| ArticleTextNode::new(Node::new(doc, i).unwrap()))
}
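    /// All candidate text nodes of the document, in document order.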
fn nodes_to_check(doc: &Document) -> impl Iterator<Item = Node> {
TextNodeFind::new(doc)
}
    /// A node is boostable if, within `MAX_STEPSAWAY_FROM_NODE` preceding
    /// `p` siblings, one contains more than `MINIMUM_STOPWORD_COUNT`
    /// stopwords.
    fn is_boostable(node: &Node, lang: Language) -> bool {
        let mut steps_away = 0;
        // Walk backwards through the siblings; calling `node.prev()` in the
        // loop condition would inspect the same sibling on every iteration.
        let mut sibling = node.prev();
        while let Some(current) = sibling {
            if current.is(Name("p")) {
                if steps_away >= ArticleTextNodeExtractor::MAX_STEPSAWAY_FROM_NODE {
                    return false;
                }
                if let Some(stats) = current
                    .first_children_text()
                    .and_then(|txt| lang.stopword_count(txt))
                {
                    if stats.stopword_count > ArticleTextNodeExtractor::MINIMUM_STOPWORD_COUNT {
                        return true;
                    }
                }
                steps_away += 1;
            }
            sibling = current.prev();
        }
        false
    }
    /// Checks whether the node's text is dominated by the text of its links.
    fn is_high_link_density(node: &Node) -> bool {
        let links = node.find(Name("a")).filter_map(|n| n.first_children_text());
        // `Node::as_text` only yields text for raw text nodes and is always
        // `None` for the element nodes checked here, so aggregate the
        // descendant text instead to count this node's words.
        let text = node.text();
        let words_number = text.split_whitespace().count();
        if words_number == 0 {
            return true;
        }
        let (num_links, num_link_words) = links.fold((0usize, 0usize), |(links, sum), n| {
            (links + 1, sum + n.split_whitespace().count())
        });
        if num_links == 0 {
            return false;
        }
        // Fraction of the words that sit inside links, weighted by the
        // number of links.
        let link_divisor = num_link_words as f64 / words_number as f64;
        let score = link_divisor * num_links as f64;
        score >= 1.0
    }
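    /// Splits `txt` on whitespace and punctuation, yielding only non-empty
    /// words; e.g. `words("a, b!")` yields `"a"` and `"b"`.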
pub fn words(txt: &str) -> impl Iterator<Item = &str> {
txt.split(|c: char| c.is_whitespace() || is_punctuation(c))
.filter(|s| !s.is_empty())
}
}
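/// Returns `true` if `c` is contained in [`PUNCTUATION`].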
pub fn is_punctuation(c: char) -> bool {
PUNCTUATION.contains(c)
}
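/// Word statistics for a text snippet: total word count and how many of those
/// words are stopwords for a given [`Language`].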
#[derive(Debug, Clone)]
pub struct WordsStats {
pub word_count: usize,
pub stopword_count: usize,
}