use std::borrow::Cow;
use std::collections::HashSet;
use std::ops::Deref;
use std::str::FromStr;
use regex::Regex;
use reqwest::Url;
use select::document::Document;
use select::node::Node;
use select::predicate::{Attr, Name, Predicate};
use url::Host;
use lazy_static::lazy_static;
use crate::article::{
ArticleContent, ArticleUrl, ALLOWED_FILE_EXT, BAD_DOMAINS, BAD_SEGMENTS, GOOD_SEGMENTS,
};
use crate::clean::{DefaultDocumentCleaner, DocumentCleaner};
use crate::date::{ArticleDate, DateExtractor, RE_DATE_SEGMENTS_M_D_Y, RE_DATE_SEGMENTS_Y_M_D};
use crate::category::Category;
use crate::nlp::CATEGORY_STOPWORDS;
use crate::text::{ArticleTextNode, ArticleTextNodeExtractor};
use crate::video::VideoNode;
use crate::Language;
lazy_static! {
    /// Matches an author name, optionally preceded by "By" and optionally
    /// wrapped in an `<a>` tag.
    ///
    /// The tag delimiters may appear either literally (`<`, `>`) or as the
    /// HTML entities `&lt;`/`&gt;` (common in scraped, partially escaped
    /// markup), and the closing tag may use an escaped slash (`<\/a>`), hence
    /// the optional `\\?`. The author name itself is captured in the `name`
    /// group; matching is case-insensitive and multi-line (`(?mi)`).
    static ref RE_AUTHOR_NAME: Regex = Regex::new(
        r"(?mi)(By)?\s*((<|(&lt;))a([^>]*)(>|(&gt;)))?(?P<name>[a-z ,.'-]+)((<|(&lt;))\\?/a(>|(&gt;)))?"
    )
    .unwrap();
}
/// Describes how to locate an element and which of its attributes holds the
/// value of interest.
pub(crate) struct NodeValueQuery<'a> {
    // Element name to match, e.g. `meta`.
    pub name: Name<&'a str>,
    // Attribute key/value pair the element must carry.
    pub attr: Attr<&'a str, &'a str>,
    // Name of the attribute that stores the target value, e.g. `content`.
    pub content_name: &'a str,
}
impl<'a> NodeValueQuery<'a> {
pub fn new(
name: Name<&'a str>,
attr: Attr<&'a str, &'a str>,
content_name: &'a str,
) -> NodeValueQuery<'a> {
NodeValueQuery {
name,
attr,
content_name,
}
}
}
/// Wrapper around a `<meta>` element node providing convenient access to its
/// key (`property`/`name`) and value (`content`/`value`) attributes.
pub struct MetaNode<'a> {
    // The underlying node from the parsed document.
    inner: Node<'a>,
}
impl<'a> MetaNode<'a> {
    /// Looks up `attr` on the wrapped `<meta>` element.
    pub fn attr<'b>(&'a self, attr: &'b str) -> Option<&'a str> {
        self.inner.attr(attr)
    }

    /// The element's `name` attribute, if present.
    pub fn name_attr(&self) -> Option<&str> {
        self.attr("name")
    }

    /// The element's `property` attribute, if present.
    pub fn property_attr(&self) -> Option<&str> {
        self.attr("property")
    }

    /// The element's `content` attribute, if present.
    pub fn content_attr(&self) -> Option<&str> {
        self.attr("content")
    }

    /// The element's `value` attribute, if present.
    pub fn value_attr(&self) -> Option<&str> {
        self.attr("value")
    }

    /// The tag's key; `property` takes precedence over `name`.
    pub fn key(&self) -> Option<&str> {
        self.property_attr().or_else(|| self.name_attr())
    }

    /// The tag's value; `content` takes precedence over `value`.
    pub fn value(&self) -> Option<&str> {
        self.content_attr().or_else(|| self.value_attr())
    }

    /// Whether the tag carries both a key and a value.
    pub fn is_key_value(&self) -> bool {
        self.key().is_some() && self.value().is_some()
    }
}
/// Lets a `MetaNode` be used anywhere a `select` [`Node`] is expected.
impl<'a> Deref for MetaNode<'a> {
    type Target = Node<'a>;
    fn deref(&self) -> &Self::Target {
        &self.inner
    }
}
pub trait Extractor {
fn title<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> {
if let Some(title) = doc
.find(Name("h1"))
.filter_map(|node| node.as_text().map(str::trim))
.next()
{
return Some(Cow::Borrowed(title));
}
if let Some(title) = self.meta_content(doc, Attr("property", "og:title")) {
return Some(title);
}
if let Some(title) = self.meta_content(doc, Attr("name", "og:title")) {
return Some(title);
}
if let Some(title) = doc.find(Name("title")).next() {
return title.as_text().map(str::trim).map(Cow::Borrowed);
}
None
}
fn authors<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> {
let mut authors = HashSet::new();
for &key in &["name", "rel", "itemprop", "class", "id"] {
for &value in &["author", "byline", "dc.creator", "byl"] {
for node in doc.find(Attr(key, value)) {
let txt = node.text();
let t = txt.trim();
if t.is_empty() {
continue;
}
if let Some(cap) = RE_AUTHOR_NAME.captures(t) {
if let Some(m) = cap.name("name") {
for author in m.as_str().trim().split(" and ") {
authors.insert(author.to_string());
}
}
}
}
}
}
authors.into_iter().map(Cow::Owned).collect()
}
fn publishing_date(&self, doc: &Document, base_url: Option<&Url>) -> Option<ArticleDate> {
if let Some(date) = DateExtractor::extract_from_doc(doc) {
return Some(date);
}
if let Some(url) = base_url {
return DateExtractor::extract_from_str(url.path());
}
None
}
fn favicon(&self, doc: &Document, base_url: &Url) -> Option<Url> {
let options = Url::options().base_url(Some(base_url));
doc.find(Name("link").and(Attr("rel", "icon")))
.filter_map(|node| node.attr("href"))
.filter_map(|href| options.parse(href).ok())
.next()
}
fn base_url(&self, doc: &Document) -> Option<Url> {
doc.find(Name("base"))
.filter_map(|n| n.attr("href"))
.filter_map(|href| Url::parse(href).ok())
.next()
}
fn meta_language(&self, doc: &Document) -> Option<Language> {
let mut unknown_lang = None;
if let Some(meta) = self.meta_content(doc, Attr("http-equiv", "Content-Language")) {
match Language::from_str(&*meta) {
Ok(lang) => return Some(lang),
Err(lang) => {
unknown_lang = Some(lang);
}
}
}
if let Some(meta) = self.meta_content(doc, Attr("name", "lang")) {
match Language::from_str(&*meta) {
Ok(lang) => return Some(lang),
Err(lang) => {
unknown_lang = Some(lang);
}
}
}
unknown_lang
}
fn meta_data<'a>(&self, doc: &'a Document) -> Vec<MetaNode<'a>> {
doc.find(Name("head").descendant(Name("meta")))
.map(|node| MetaNode { inner: node })
.filter(MetaNode::is_key_value)
.collect()
}
fn meta_content<'a, 'b>(
&self,
doc: &'a Document,
attr: Attr<&'b str, &'b str>,
) -> Option<Cow<'a, str>> {
doc.find(Name("head").descendant(Name("meta").and(attr)))
.filter_map(|node| node.attr("content").map(str::trim).map(Cow::Borrowed))
.next()
}
fn meta_thumbnail_url(&self, doc: &Document, base_url: Option<&Url>) -> Option<Url> {
let options = Url::options().base_url(base_url);
[("name", "thumbnail"), ("name", "thumbnailUrl")]
.iter()
.filter_map(|(k, v)| self.meta_content(doc, Attr(k, v)))
.filter_map(|url| options.parse(&*url).ok())
.next()
}
fn meta_img_url(&self, doc: &Document, base_url: Option<&Url>) -> Option<Url> {
let options = Url::options().base_url(base_url);
if let Some(meta) = self.meta_content(doc, Attr("property", "og:image")) {
if let Ok(url) = options.parse(&*meta) {
return Some(url);
}
}
doc.find(
Name("link").and(
Attr("rel", "img_src")
.or(Attr("rel", "image_src"))
.or(Attr("rel", "icon")),
),
)
.filter_map(|node| node.attr("href"))
.filter_map(|href| options.parse(href).ok())
.next()
}
fn meta_type<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> {
self.meta_content(doc, Attr("property", "og:type"))
}
fn meta_site_name<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> {
self.meta_content(doc, Attr("property", "og:site_name"))
}
fn meta_description<'a>(&self, doc: &'a Document) -> Option<Cow<'a, str>> {
[("property", "description"), ("name", "description")]
.iter()
.filter_map(|(k, v)| self.meta_content(doc, Attr(k, v)))
.next()
}
fn meta_keywords<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> {
for (k, v) in &[
("property", "keywords"),
("name", "news_keywords"),
("name", "keywords"),
] {
if let Some(keywords) = self.meta_content(doc, Attr(k, v)) {
return match keywords {
Cow::Owned(s) => s
.split(',')
.map(str::trim)
.map(ToString::to_string)
.map(Cow::Owned)
.collect(),
Cow::Borrowed(s) => s.split(',').map(str::trim).map(Cow::Borrowed).collect(),
};
}
}
Vec::new()
}
fn text<'a>(&self, doc: &'a Document, lang: Language) -> Option<Cow<'a, str>> {
self.text_with_cleaner(doc, lang, DefaultDocumentCleaner)
}
fn text_with_cleaner<'a, T: DocumentCleaner>(
&self,
doc: &'a Document,
lang: Language,
cleaner: T,
) -> Option<Cow<'a, str>> {
self.article_node(doc, lang)
.map(|n| cleaner.clean_node_text(*n).into())
}
fn article_node<'a>(&self, doc: &'a Document, lang: Language) -> Option<ArticleTextNode<'a>> {
let mut iter =
doc.find(Name("body").descendant(ArticleTextNodeExtractor::article_body_predicate()));
if let Some(node) = iter.next() {
if iter.next().is_none() {
return Some(ArticleTextNode::new(node));
}
}
ArticleTextNodeExtractor::calculate_best_node(doc, lang)
}
fn all_urls<'a>(&self, doc: &'a Document) -> Vec<Cow<'a, str>> {
let mut uniques = HashSet::new();
doc.find(Name("a"))
.filter_map(|n| n.attr("href").map(str::trim))
.filter(|href| uniques.insert(*href))
.map(Cow::Borrowed)
.collect()
}
fn article_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<ArticleUrl> {
let options = Url::options().base_url(base_url);
let mut uniques = HashSet::new();
let q = doc
.find(Name("a"))
.filter_map(|n| {
if let Some(href) = n.attr("href").map(str::trim) {
Some((href, n.as_text().map(str::trim)))
} else {
None
}
})
.filter(|(href, _)| uniques.insert(*href))
.filter_map(|(link, title)| {
options
.parse(link)
.map(|url| ArticleUrl::new_with_title(url, title))
.ok()
});
if let Some(base_url) = base_url {
q.filter(|article| Self::is_article(article, base_url))
.collect()
} else {
q.collect()
}
}
fn image_urls(&self, doc: &Document, base_url: Option<&Url>) -> Vec<Url> {
let options = Url::options().base_url(base_url);
doc.find(Name("img"))
.filter_map(|n| n.attr("href").map(str::trim))
.filter_map(|url| options.parse(url).ok())
.collect()
}
fn is_article(article: &ArticleUrl, base_url: &Url) -> bool {
if article.url.path().starts_with('#') {
return false;
}
if !is_valid_domain(&article.url, base_url) {
return false;
}
let mut path_segments = Vec::new();
if let Some(segments) = article.url.path_segments() {
for segment in segments {
let segment = segment.to_lowercase().to_lowercase();
if BAD_SEGMENTS.contains(&segment.as_str()) {
return false;
}
if !segment.is_empty() {
path_segments.push(segment);
}
}
if path_segments.is_empty() {
return false;
}
let last_segment = path_segments.remove(path_segments.len() - 1);
let mut iter = last_segment.rsplitn(2, '.');
let after = iter.next();
if let Some(segment) = iter.next() {
let extension = after.unwrap().to_lowercase();
if ALLOWED_FILE_EXT.contains(&extension.as_str()) {
if segment.len() > 10 {
path_segments.push(segment.to_string());
}
} else if extension.chars().all(char::is_numeric) {
path_segments.push(last_segment);
} else {
return false;
}
} else {
path_segments.push(last_segment);
}
} else {
return false;
}
if let Some(last_segment) = path_segments.last() {
let (dash_count, underscore_count) = count_dashes_and_underscores(last_segment);
if dash_count > 4 || underscore_count > 4 {
if let Some(Host::Domain(domain)) = article.url.host() {
if let Some(domain) = domain.split('.').rev().nth(1) {
let delim = if underscore_count > dash_count {
'_'
} else {
'-'
};
if last_segment.split(delim).all(|s| s != domain) {
return true;
}
} else {
return true;
}
} else {
return true;
}
}
}
if RE_DATE_SEGMENTS_Y_M_D.is_match(article.url.path()) {
return true;
}
if RE_DATE_SEGMENTS_M_D_Y.is_match(article.url.path()) {
return true;
}
if path_segments.len() > 1 {
for segment in path_segments.iter() {
if GOOD_SEGMENTS.contains(&segment.as_str()) {
return true;
}
}
}
false
}
fn is_category(category: &Category, base_url: &Url) -> bool {
if category.url.path().starts_with("/#") {
return false;
}
if let Some(segments) = category.url.path_segments() {
for (i, segment) in segments.enumerate() {
if i > 0 {
return false;
}
if CATEGORY_STOPWORDS.contains(&segment) || segment == "index.html" {
return false;
}
}
}
if category.url.scheme() != base_url.scheme() {
return false;
}
is_valid_domain(&category.url, base_url)
}
fn categories(&self, doc: &Document, base_url: &Url) -> Vec<Category> {
let options = Url::options().base_url(Some(base_url));
let category_urls: HashSet<_> = self
.all_urls(doc)
.into_iter()
.filter_map(|url| options.parse(&*url).ok())
.map(|mut url| {
url.set_query(None);
url
})
.collect();
category_urls
.into_iter()
.map(Category::new)
.filter(|cat| Self::is_category(cat, base_url))
.collect()
}
fn article_content<'a>(
&self,
doc: &'a Document,
base_url: Option<&Url>,
lang: Option<Language>,
) -> ArticleContent<'a> {
let mut builder = ArticleContent::builder()
.authors(self.authors(doc))
.keywords(self.meta_keywords(doc));
let lang = if let Some(meta_lang) = self.meta_language(doc) {
builder = builder.language(meta_lang.clone());
meta_lang
} else {
lang.unwrap_or_default()
};
if let Some(txt_node) = self.article_node(doc, lang) {
builder = builder
.videos(
txt_node
.videos()
.into_iter()
.filter_map(|x| x.get_src_url(base_url))
.filter_map(|url| url.ok())
.collect(),
)
.references(txt_node.references())
.text(txt_node.clean_text().into())
.images(txt_node.images(base_url));
}
if let Some(description) = self.meta_description(doc) {
builder = builder.description(description);
}
if let Some(thumbnail) = self.meta_thumbnail_url(doc, base_url) {
builder = builder.thumbnail(thumbnail);
}
if let Some(title) = self.title(doc) {
builder = builder.title(title);
}
if let Some(date) = self.publishing_date(doc, base_url) {
builder = builder.publishing_date(date);
}
if let Some(img) = self.meta_img_url(doc, base_url) {
builder = builder.top_image(img);
}
builder.build()
}
fn canonical_link(&self, doc: &Document) -> Option<Url> {
if let Some(link) = doc
.find(Name("link").and(Attr("rel", "canonical")))
.filter_map(|node| node.attr("href"))
.next()
{
return Url::parse(link).ok();
}
if let Some(meta) = self.meta_content(doc, Attr("property", "og:url")) {
return Url::parse(&*meta).ok();
}
None
}
fn videos<'a>(&self, doc: &'a Document, lang: Option<Language>) -> Vec<VideoNode<'a>> {
if let Some(node) = self.article_node(doc, lang.unwrap_or_default()) {
node.videos()
} else {
Vec::new()
}
}
}
/// Counts the `'-'` and `'_'` characters in `s`, returned as
/// `(dashes, underscores)`.
fn count_dashes_and_underscores<T: AsRef<str>>(s: T) -> (usize, usize) {
    let mut dashes = 0;
    let mut underscores = 0;
    for c in s.as_ref().chars() {
        match c {
            '-' => dashes += 1,
            '_' => underscores += 1,
            _ => {}
        }
    }
    (dashes, underscores)
}
/// Checks whether `url` belongs to the same site as `base_url`.
///
/// For named hosts, every part of the base domain must occur in the
/// candidate's domain parts, mobile subdomains (`m`, `i`) are rejected, and no
/// part may be a category stopword or a known bad domain. For non-domain
/// hosts (IP addresses or none), the hosts must simply be equal.
fn is_valid_domain(url: &Url, base_url: &Url) -> bool {
    let domain = match url.host() {
        Some(Host::Domain(d)) => d,
        // IP-address or missing host: fall back to strict host equality.
        _ => return base_url.host() == url.host(),
    };
    let parent_domains: Vec<_> = match base_url.domain() {
        Some(d) => d.split('.').collect(),
        None => return false,
    };
    let candidate_domains: Vec<_> = domain.split('.').collect();
    if !parent_domains.iter().all(|d| candidate_domains.contains(d)) {
        return false;
    }
    if candidate_domains.iter().any(|d| *d == "m" || *d == "i") {
        return false;
    }
    candidate_domains
        .iter()
        .all(|d| !CATEGORY_STOPWORDS.contains(d) && !BAD_DOMAINS.contains(d))
}
/// [`Extractor`] implementation that relies entirely on the trait's default
/// methods.
#[derive(Debug, Default)]
pub struct DefaultExtractor;
impl Extractor for DefaultExtractor {}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn author_regex() {
        // Anchor-wrapped author, prefixed with "By". The inner quotes must be
        // escaped for the literal to compile.
        let m = RE_AUTHOR_NAME
            .captures("By <a href=\"/profiles/meg-wagner\">Joseph Kelley</a>")
            .unwrap()
            .name("name")
            .unwrap();
        assert_eq!(m.as_str(), "Joseph Kelley");

        // Anchor-wrapped author without the "By" prefix.
        let m = RE_AUTHOR_NAME
            .captures("<a href=\"/profiles/meg-wagner\">Joseph Kelley</a>")
            .unwrap()
            .name("name")
            .unwrap();
        assert_eq!(m.as_str(), "Joseph Kelley");

        // Bare name, no markup.
        let m = RE_AUTHOR_NAME
            .captures("Joseph Kelley")
            .unwrap()
            .name("name")
            .unwrap();
        assert_eq!(m.as_str(), "Joseph Kelley");

        // "By" prefix without markup.
        let m = RE_AUTHOR_NAME
            .captures("By Joseph Kelley")
            .unwrap()
            .name("name")
            .unwrap();
        assert_eq!(m.as_str(), "Joseph Kelley");

        // Apostrophes and dashes are part of a name.
        let m = RE_AUTHOR_NAME
            .captures("J'oseph-Kelley")
            .unwrap()
            .name("name")
            .unwrap();
        assert_eq!(m.as_str(), "J'oseph-Kelley");
    }

    #[test]
    fn detect_articles() {
        // Asserts that every URL is classified as an article of `$base`.
        macro_rules! assert_articles {
            ($base:expr => $($url:expr,)*) => {
                let base_url = Url::parse($base).unwrap();
                $(
                    let article = ArticleUrl::new(Url::parse($url).unwrap());
                    assert!(DefaultExtractor::is_article(&article, &base_url));
                )*
            };
        }
        assert_articles!(
            "https://extrablatt.com" =>
            "https://extrablatt.com/politics/live-news/some-title-12-05-2019/index.html",
            "https://extrablatt.com/2019/12/04/us/politics/some-title.html",
            "https://www.extrablatt.com/graphics/2019/investigations/some-title/",
            "https://extrablatt.com/2019/12/06/uk/some-longer-title-with-dashes/index.html",
            "https://www.extrablatt.com/politik/some-longer-title-with-dashes-interview-1.347823?reduced=true",
            "https://www.extrablatt.com/auto/some_longer_title_with_underscores_1300105.html",
            "https://extrablatt.com/hmm-some-very-long-title-speparated",
        );
    }
}