article_date_extractor/extract_date.rs

use regex::Regex;
use chrono::NaiveDate;
use select::document::Document;
use select::predicate::{Name, Attr};
use serde_json;
use serde_json::Value;
use errors::*;

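/// Date formats tried, in order, when parsing an extracted date string.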
static FMTS: &[&str] = &["%A, %B %e, %Y",
                         "%Y-%m-%dT%H:%M:%S%:z",
                         "/%Y/%m/%d/",
                         "/%Y/%d/%m/",
                         "%Y-%m-%d",
                         "%B %e, %Y",
                         "%Y-%m-%d %H:%M:%S",
                         "%Y-%m-%dT%H:%M:%SZ",
                         "%B %k, %Y, %H:%M %p",
                         "%Y-%m-%d %H:%M:%S.000000"];

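// Matches date fragments such as "/2015/11/28/" embedded in article and image URLs.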
lazy_static! {
    static ref RE: Regex =
        Regex::new(r"([\./\-_]{0,1}(19|20)\d{2})[\./\-_]{0,1}(([0-3]{0,1}[0-9][\./\-_])|(\w{3,5}[\./\-_]))([0-3]{0,1}[0-9][\./\-]{0,1})").unwrap();
}

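/// Tries each format in `FMTS` in turn until one parses `input` as a date.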
fn parse_date(input: &str) -> Result<NaiveDate> {
    FMTS.iter()
        .flat_map(|fmt| NaiveDate::parse_from_str(input, fmt))
        .next()
        .ok_or("None of the formats matched the date".into())
}

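/// Looks for a date fragment directly in the URL.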
fn extract_from_url(url: &str) -> Option<String> {
    RE.find(url).map(|val| val.as_str().to_string())
}

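/// Reads `datePublished` (or, failing that, `dateCreated`) from the first
/// embedded JSON-LD (`application/ld+json`) block, if any.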
fn extract_from_ldjson(html: &Document) -> Option<String> {
    html.find(Attr("type", "application/ld+json"))
        .next()
        .map(|ldj| ldj.text())
        .and_then(|ldjson| serde_json::from_str(&ldjson).ok())
        .and_then(|decoded_ldjson: Value| {
            let published = decoded_ldjson
                .get("datePublished")
                .and_then(|date| date.as_str())
                .map(|date| date.to_string());

            let created = decoded_ldjson
                .get("dateCreated")
                .and_then(|date| date.as_str())
                .map(|date| date.to_string());

            published.or(created)
        })
}

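// The next four helpers recognize attribute values that conventionally mark a
// date-bearing <meta> tag; matching is case-insensitive except for `property`.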
fn meta_name_denotes_date(meta_name: &str) -> bool {
    match meta_name.to_lowercase().as_str() {
        "pubdate" |
        "publishdate" |
        "timestamp" |
        "dc.date.issued" |
        "date" |
        "sailthru.date" |
        "article.published" |
        "published-date" |
        "article.created" |
        "article_date_original" |
        "cxenseparse:recs:publishtime" |
        "date_published" => true,
        _ => false,
    }
}

fn meta_itemprop_denotes_date(item_prop: &str) -> bool {
    match item_prop.to_lowercase().as_str() {
        "datepublished" | "datecreated" => true,
        _ => false,
    }
}

fn meta_http_equiv_denotes_date(http_equiv: &str) -> bool {
    match http_equiv.to_lowercase().as_str() {
        "date" => true,
        _ => false,
    }
}

fn meta_property_denotes_date(meta_property: &str) -> bool {
    match meta_property {
        "article:published_time" |
        "bt:pubdate" => true,
        _ => false,
    }
}

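/// Scans the `<meta>` tags for a date, falling back to a date embedded in the
/// `og:image` URL.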
fn extract_from_meta(html: &Document) -> Option<String> {
    html.find(Name("meta")).flat_map(|meta| {
        let content = match meta.attr("content") {
            Some(c) => c,
            None => return None,
        };
        let content = content.trim();

        let meta_name = meta.attr("name");
        let item_prop = meta.attr("itemprop");
        let http_equiv = meta.attr("http-equiv");
        let meta_property = meta.attr("property");

        // A tag denotes a date if any of its attributes does, not just the
        // first attribute that happens to be present.
        let content_has_date = meta_name.map_or(false, meta_name_denotes_date)
            || item_prop.map_or(false, meta_itemprop_denotes_date)
            || http_equiv.map_or(false, meta_http_equiv_denotes_date)
            || meta_property.map_or(false, meta_property_denotes_date);

        if content_has_date {
            Some(content.to_string())
        } else if meta_property == Some("og:image") {
            extract_from_url(content)
        } else {
            None
        }
    }).next()
}

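/// Extracts a date from a `<time>` tag: the visible text for
/// `class="timestamp"` tags, otherwise the `datetime` attribute.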
fn extract_time_tag(html: &Document) -> Option<String> {
    html.find(Name("time")).flat_map(|time| {
        if time.attr("class") == Some("timestamp") {
            Some(time.text().trim_matches('\n').to_string())
        } else {
            time.attr("datetime").map(|dt| dt.to_string())
        }
    }).next()
}

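/// Extracts a date from a `<span itemprop="datePublished">` tag, preferring
/// its `content` attribute over its text.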
fn extract_span_date_published(html: &Document) -> Option<String> {
    html.find(Name("span")).flat_map(|tag| {
        if tag.attr("itemprop") != Some("datePublished") {
            return None;
        }
        // Prefer the content attribute; fall back to the tag's text.
        tag.attr("content")
            .map(|v| v.to_string())
            .or_else(|| {
                let text = tag.text();
                if text.is_empty() {
                    None
                } else {
                    Some(text.trim_matches('\n').to_string())
                }
            })
    }).next()
}

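/// Returns the text of the first `tag` element whose `class` attribute matches `reg`.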
fn extract_from_tag_with_regex(html: &Document, reg: &Regex, tag: &str) -> Option<String> {
    html.find(Name(tag)).flat_map(|t| {
        t.attr("class").and_then(|v| {
            if reg.is_match(v) {
                Some(t.text().trim_matches('\n').to_string())
            } else {
                None
            }
        })
    }).next()
}

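/// Last-resort extraction from the markup itself: `<time>` tags, then
/// `<span itemprop="datePublished">`, then any `<span>`, `<p>` or `<div>`
/// whose class name looks date-related.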
fn extract_from_html_tag(html: &Document) -> Option<String> {
    lazy_static! {
        static ref TAG_RE: Regex =
            Regex::new(r"(?i)publishdate|pubdate|timestamp|article_date|articledate|date").unwrap();
    }

    extract_time_tag(html)
        .or_else(|| extract_span_date_published(html))
        .or_else(|| extract_from_tag_with_regex(html, &TAG_RE, "span"))
        .or_else(|| extract_from_tag_with_regex(html, &TAG_RE, "p"))
        .or_else(|| extract_from_tag_with_regex(html, &TAG_RE, "div"))
}

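/// Extracts an article's publication date from its URL and HTML body.
///
/// Candidates are taken from the URL first, then from JSON-LD, `<meta>` tags
/// and finally other date-bearing HTML tags; the first candidate found is
/// parsed against the formats in `FMTS`.
///
/// ```ignore
/// // A minimal usage sketch, assuming the crate is named `article_date_extractor`
/// // and `url` and `body` were already fetched by the caller.
/// let date = article_date_extractor::extract_date::extract_article_published_date(url, &body)?;
/// println!("published on {}", date);
/// ```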
pub fn extract_article_published_date(link: &str, html: &str) -> Result<NaiveDate> {
    let doc = Document::from(html);

    extract_from_url(link)
        .or_else(|| extract_from_ldjson(&doc))
        .or_else(|| extract_from_meta(&doc))
        .or_else(|| extract_from_html_tag(&doc))
        .ok_or("Couldn't find the date to parse".into())
        .and_then(|v| parse_date(&v))
}

#[cfg(test)]
mod test {
    use super::extract_from_url;
    use super::parse_date;
    use super::extract_from_meta;
    use super::extract_from_ldjson;
    use super::extract_from_html_tag;
    use chrono::NaiveDate;
    use select::document::Document;

    #[test]
    fn parsing_date() {
        assert_eq!(NaiveDate::from_ymd(2015, 11, 30),
                   parse_date("/2015/11/30/").unwrap());
        assert_eq!(NaiveDate::from_ymd(2015, 11, 30),
                   parse_date("/2015/30/11/").unwrap());

        assert!(parse_date("bad_format").is_err());
    }

    #[test]
    fn extracting_from_url() {
        let link = "http://edition.cnn.\
                    com/2015/11/28/opinions/sutter-cop21-paris-preview-two-degrees/index.\
                    html";
        assert_eq!(Some("/2015/11/28/".to_string()), extract_from_url(link));

        let link = "";
        assert_eq!(None, extract_from_url(link));
    }

    #[test]
    fn extracting_from_ldjson() {
        let document = Document::from(include_str!("../tests/fixtures/techcrunch.html"));

        assert_eq!(Some("2015-12-01T07:50:48Z".to_string()),
                   extract_from_ldjson(&document));
    }

    #[test]
    fn extracting_from_meta() {
        let document = Document::from(include_str!("../tests/fixtures/techcrunch.html"));

        assert_eq!(Some("2015-11-30 23:50:48".to_string()),
                   extract_from_meta(&document));
    }

    #[test]
    fn extracting_from_html_tag() {
        let document = Document::from(include_str!("../tests/fixtures/google_blog.html"));

        assert_eq!(Some("Thursday, March 16, 2017".to_string()),
                   extract_from_html_tag(&document));
    }
}
269}