use std::collections::HashMap;
use chrono::{NaiveDate, NaiveDateTime, NaiveTime};
use regex::Regex;
use select::document::Document;
use select::predicate::{Attr, Name, Predicate};
#[cfg(feature = "serde0")]
use serde::{Deserialize, Serialize};
use lazy_static::lazy_static;
use crate::extract::NodeValueQuery;
lazy_static! {
    /// Matches a `year<sep>month<sep>day` date segment.
    ///
    /// The year must start with `19` or `20`, the separator is one of `-`, `\`, `/` or `.`,
    /// the month is either two digits (`01`-`12`) or a word starting with one of the letters
    /// `j f m a s o n d` followed by 2 to 7 word characters, and the day is `01`-`31`.
    /// Matching is case-insensitive and multi-line.
    pub(crate) static ref RE_DATE_SEGMENTS_Y_M_D: Regex = Regex::new(r"(?mi)(19|20)\d\d[-\\/\.](0[1-9]|1[012]|([jfmasond]\w{2,7}))[-\\/\.](0[1-9]|[12][0-9]|3[01])").unwrap();

    /// Matches a `month<sep>day<sep>year` date segment, using the same month, day, year and
    /// separator rules as [`RE_DATE_SEGMENTS_Y_M_D`].
    pub(crate) static ref RE_DATE_SEGMENTS_M_D_Y: Regex = Regex::new(r"(?mi)(0[1-9]|1[012]|([jfmasond]\w{2,7}))[-\\/\.](0[1-9]|[12][0-9]|3[01])[-\\/\.](19|20)\d\d").unwrap();

    /// Matches a quoted key/value pair whose key ends in a "publish date" style name
    /// (`datePublished`, `date created`, `pubDate`, `publishDate`, `publicationDate`, with an
    /// optional `-`, `_` or whitespace between the words) and whose value is a quoted string.
    ///
    /// Key and value may be separated by `:` or `=`. The value, up to the next `"`, is
    /// available as the named capture group `date`.
    pub(crate) static ref RE_KEY_VALUE_PUBLISH_DATE: Regex = Regex::new(r#"(?mi)"\s*(([^"]|\w)*)?(date[-_\s]?(Published|created)|Pub(lish|lication)?[-_\s]?Date)\s*"\s*[:=]\s*"\s*(?P<date>[^"]*)\s*""#).unwrap();

    /// Matches a quoted key/value pair whose key ends in `dateModified` or `modifiedDate`
    /// (with an optional `-`, `_` or whitespace between the words) and whose value is a quoted
    /// string.
    ///
    /// Key and value may be separated by `:` or `=`. The value, up to the next `"`, is
    /// available as the named capture group `date`.
    pub(crate) static ref RE_KEY_VALUE_MODIFIED_DATE: Regex = Regex::new(r#"(?mi)"\s*(([^"]|\w)*)?((date[\s_-]?modified|modified[\s_-]?date))\s*"\s*[:=]\s*"\s*(?P<date>[^"]*)\s*""#).unwrap();

    /// Node queries that may carry an article's modification date, in the order they are
    /// tried.
    ///
    /// Each query is built from an element name, an attribute the element must have, and the
    /// name of the attribute that is read as the date value.
    pub(crate) static ref MODIFIED_DATE_NODES: Vec<NodeValueQuery<'static>> = vec![
        NodeValueQuery::new(Name("meta"), Attr("property", "article:modified"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("property", "modified"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "ModificationDate"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "modification_date"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "lastmod"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("itemprop", "dateModified"), "datetime"),
        NodeValueQuery::new(Name("meta"), Attr("name", "dateModified"), "content"),
    ];

    /// Node queries that may carry an article's publication date, in the order they are
    /// tried.
    ///
    /// Each query is built from an element name, an attribute the element must have, and the
    /// name of the attribute that is read as the date value.
    pub(crate) static ref PUBLISH_DATE_NODES: Vec<NodeValueQuery<'static>> = vec![
        NodeValueQuery::new(Name("meta"), Attr("property", "rnews:datePublished"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("property", "article:published_time"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("property", "article:published"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "OriginalPublicationDate"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("itemprop", "datePublished"), "datetime"),
        NodeValueQuery::new(Name("meta"), Attr("property", "og:published_time"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "article_date_original"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "publication_date"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "sailthru.date"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "PublishDate"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("name", "pubdate"), "content"),
        NodeValueQuery::new(Name("meta"), Attr("pubdate", "pubdate"), "datetime"),
        NodeValueQuery::new(Name("meta"), Attr("name", "publish_date"), "content"),
        NodeValueQuery::new(
            Name("div"),
            Attr("id", "taboola-feed-below-article"),
            "data-publishdate",
        ),
    ];
}
/// The point in time an article was published, at either day or date-and-time precision.
///
/// Neither variant carries timezone information.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde0", derive(Serialize, Deserialize))]
pub enum Date {
/// A calendar date without a time of day.
Date(NaiveDate),
/// A calendar date together with a time of day.
DateTime(NaiveDateTime),
}
/// The point in time an article was last updated.
///
/// Besides a date or a date with time, an update may be known only as a time of day.
/// None of the variants carries timezone information.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde0", derive(Serialize, Deserialize))]
pub enum Update {
/// A calendar date without a time of day.
Date(NaiveDate),
/// A calendar date together with a time of day.
DateTime(NaiveDateTime),
/// A time of day without a calendar date.
Time(NaiveTime),
}
/// The dates associated with an article: when it was published and, if known, when it was
/// last updated.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde0", derive(Serialize, Deserialize))]
pub struct ArticleDate {
/// When the article was published.
pub published: Date,
/// When the article was last updated, or `None` if no update date is known.
pub last_updated: Option<Update>,
}
/// Stateless entry point for extracting article dates from HTML documents or plain strings.
///
/// The type has no fields; its functionality is exposed through associated functions.
pub struct DateExtractor;
impl DateExtractor {
    /// Extracts the publication date, and the modification date if one can be found, from a
    /// parsed HTML document.
    ///
    /// Returns `None` when no publication date can be determined; the modification date is
    /// not looked up in that case. Both dates are reported as `DateTime` variants because the
    /// underlying parser always yields a `NaiveDateTime`.
    pub fn extract_from_doc(doc: &Document) -> Option<ArticleDate> {
        let published =
            Self::extract_date(doc, &PUBLISH_DATE_NODES, &RE_KEY_VALUE_PUBLISH_DATE)?;
        let last_updated =
            Self::extract_date(doc, &MODIFIED_DATE_NODES, &RE_KEY_VALUE_MODIFIED_DATE)
                .map(Update::DateTime);

        Some(ArticleDate {
            published: Date::DateTime(published),
            last_updated,
        })
    }

    /// Looks for a date in `doc`, first via the given node queries and then via `regex`.
    ///
    /// The queries in `nodes` are tried in order. For each query the first matching element
    /// that has the requested attribute is considered, and the first attribute value that
    /// parses as a date wins. If no query yields a date, the text content of the first
    /// `<head>` element is searched with `regex`, and its `date` capture group is parsed.
    fn extract_date<'a>(
        doc: &Document,
        nodes: &[NodeValueQuery<'a>],
        regex: &Regex,
    ) -> Option<NaiveDateTime> {
        nodes
            .iter()
            .find_map(|node| {
                doc.find(node.name.and(node.attr))
                    .filter_map(|n| n.attr(node.content_name))
                    .next()
                    .and_then(Self::fuzzy_dtparse)
            })
            .or_else(|| {
                // `find(Name("head"))` yields element nodes, for which `as_text()` is always
                // `None`; `text()` gathers the text of all descendants (e.g. inline scripts),
                // which is what the key/value regex needs to look at.
                let head_text = doc.find(Name("head")).next()?.text();
                let captures = regex.captures(&head_text)?;
                captures
                    .name("date")
                    .and_then(|m| Self::fuzzy_dtparse(m.as_str()))
            })
    }

    /// Parses `s` into a `NaiveDateTime` using `dtparse` in fuzzy mode.
    ///
    /// Returns `None` if the parser reports an error. Any timezone offset the parser
    /// determines is discarded.
    fn fuzzy_dtparse(s: &str) -> Option<NaiveDateTime> {
        let mut tzinfod = HashMap::new();
        // NOTE(review): offsets are in seconds; a positive value looks like it denotes a zone
        // east of UTC, which would be the wrong sign for US Eastern Time. The offset is thrown
        // away below, so this currently has no effect on the result - confirm before relying
        // on it.
        tzinfod.insert("ET".to_string(), 14400);

        dtparse::Parser::default()
            .parse(
                s,
                None,     // dayfirst: use the parser default
                None,     // yearfirst: use the parser default
                true,     // fuzzy
                true,     // fuzzy_with_tokens
                None,     // default date for missing fields
                false,    // ignoretz
                &tzinfod, // additional timezone names
            )
            .ok()
            .map(|(date, _, _)| date)
    }

    /// Parses a free-form date string into an `ArticleDate`.
    ///
    /// The result has no `last_updated` value. Returns `None` if `s` cannot be parsed as a
    /// date.
    pub fn extract_from_str(s: &str) -> Option<ArticleDate> {
        let published = Self::fuzzy_dtparse(s)?;
        Some(ArticleDate {
            published: Date::DateTime(published),
            last_updated: None,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The modified-date pattern captures the quoted value that follows a `datemodified` key.
    #[test]
    fn date_modified() {
        let input = r#""datemodified":"2019-12-05T15:34:34+0100""#;
        let captured = RE_KEY_VALUE_MODIFIED_DATE
            .captures(input)
            .and_then(|caps| caps.name("date"))
            .map(|m| m.as_str());
        assert_eq!(captured, Some("2019-12-05T15:34:34+0100"));
    }

    /// The publish-date pattern captures the quoted value that follows a `datePublished` key.
    #[test]
    fn publish_modified() {
        let input = r#""datePublished":"2019-12-05T15:34:34+0100""#;
        let captured = RE_KEY_VALUE_PUBLISH_DATE
            .captures(input)
            .and_then(|caps| caps.name("date"))
            .map(|m| m.as_str());
        assert_eq!(captured, Some("2019-12-05T15:34:34+0100"));
    }
}