1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
use html5ever::local_name;
use kuchiki::NodeRef;
/// `<meta>` type values recognised as carrying the article title.
///
/// Compared against a `<meta>` element's `name`, `property` or `itemprop`
/// attribute by `extract_meta_content`.
const TITLE_KEYS: [&str; 6] = [
    "og:title",
    "twitter:title",
    "dc:title",
    "dcterm:title",
    "weibo:article:title",
    "weibo:webpage:title",
];
/// `<meta>` type values recognised as carrying the article author (byline).
///
/// Compared against a `<meta>` element's `name`, `property` or `itemprop`
/// attribute by `extract_meta_content`.
const BYLINE_KEYS: [&str; 6] = [
    "author",
    "dc:creator",
    "dcterm:creator",
    "og:article:author",
    "article:author",
    "byl",
];
/// `<meta>` type values recognised as carrying the article description.
///
/// Compared against a `<meta>` element's `name`, `property` or `itemprop`
/// attribute by `extract_meta_content`.
const DESCRIPTION_KEYS: [&str; 7] = [
    "description",
    "dc:description",
    "dcterm:description",
    "og:description",
    "weibo:article:description",
    "weibo:webpage:description",
    "twitter:description",
];
/// Metadata gathered from an HTML document by [`extract`].
///
/// Each field is `None` when no source for that piece of information was
/// found in the document.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Metadata {
    /// Text of the document's `<title>` element; falls back to the article
    /// title when the document has no `<title>`.
    pub page_title: Option<String>,
    /// Title of the article itself (from `<meta>` tags or a unique heading);
    /// falls back to the page title when none is found.
    pub article_title: Option<String>,
    /// Author information taken from an author-like `<meta>` tag.
    pub byline: Option<String>,
    /// Description taken from a description-like `<meta>` tag, or otherwise
    /// the text of the first `<p>` element.
    pub description: Option<String>,
}
/// Extracts title, byline and description metadata from a parsed document.
///
/// The page title is the text of the first `<title>` element and the article
/// title comes from `get_article_title`. When only one of the two titles is
/// available, its value is used for both fields.
pub fn extract(root: &NodeRef) -> Metadata {
    let title_tag_text = root
        .select_first("title")
        .ok()
        .map(|title_node| title_node.text_contents());
    let found_article_title = get_article_title(root);

    // The two titles back each other up: a missing one takes the other's value.
    let page_title = title_tag_text
        .clone()
        .or_else(|| found_article_title.clone());
    let article_title = found_article_title.or(title_tag_text);

    Metadata {
        page_title,
        article_title,
        byline: extract_meta_content(root, &BYLINE_KEYS),
        description: get_article_description(root),
    }
}
/// Determines the article's title.
///
/// Sources are tried in this order:
/// 1. a `<meta>` tag whose type is listed in `TITLE_KEYS`;
/// 2. the document's `<h1>`, but only if there is exactly one — several
///    `<h1>` elements yield `None` without looking at `<h2>`;
/// 3. the document's `<h2>`, but only if there are no `<h1>` elements and
///    exactly one `<h2>`.
fn get_article_title(root: &NodeRef) -> Option<String> {
    if let Some(title_from_meta) = extract_meta_content(root, &TITLE_KEYS) {
        return Some(title_from_meta);
    }

    let mut h1_elements = root.select("h1").unwrap();
    if let Some(first_h1) = h1_elements.next() {
        // A lone <h1> is taken as the title; more than one is ambiguous.
        return if h1_elements.next().is_none() {
            Some(first_h1.text_contents())
        } else {
            None
        };
    }

    // No <h1> at all: apply the same "exactly one" rule to <h2>.
    let mut h2_elements = root.select("h2").unwrap();
    let first_h2 = h2_elements.next()?;
    if h2_elements.next().is_none() {
        Some(first_h2.text_contents())
    } else {
        None
    }
}
/// Determines the article's description.
///
/// Prefers a `<meta>` tag whose type is listed in `DESCRIPTION_KEYS`; if none
/// is found, uses the text of the first `<p>` element in the document.
/// Returns `None` when neither exists.
fn get_article_description(root: &NodeRef) -> Option<String> {
    extract_meta_content(root, &DESCRIPTION_KEYS).or_else(|| {
        root.select_first("p")
            .ok()
            .map(|first_paragraph| first_paragraph.text_contents())
    })
}
/// Returns the `content` of the first matching `<meta>` element.
///
/// `<meta>` elements are visited in the order yielded by `root.select("meta")`.
/// An element matches when the value of its `name`, `property` or `itemprop`
/// attribute is exactly equal to one of `expected_types`. A matching element
/// without a `content` attribute is skipped and the search continues.
///
/// Returns `None` when no matching element with a `content` attribute exists.
fn extract_meta_content(root: &NodeRef, expected_types: &[&str]) -> Option<String> {
    let meta_type_attrs = [
        local_name!("name"),
        local_name!("property"),
        local_name!("itemprop"),
    ];
    let meta_nodes = root
        .select("meta")
        .expect("\"meta\" is a valid CSS selector");
    for meta_node in meta_nodes {
        // Borrow the attribute map once per element rather than once per
        // candidate attribute name.
        let attributes = meta_node.attributes.borrow();
        let has_expected_type = meta_type_attrs.iter().any(|attr_name| {
            attributes
                .get(attr_name)
                .map_or(false, |meta_type| expected_types.contains(&meta_type))
        });
        if has_expected_type {
            if let Some(content) = attributes.get(local_name!("content")) {
                return Some(content.to_string());
            }
        }
    }
    None
}
#[allow(unused_imports)]
use kuchiki::{parse_html, traits::TendrilSink};
/// Checks that `extract` reads the page title from `<title>` and the article
/// title, byline and description from `<meta>` tags, covering each of the
/// `name`, `property` and `itemprop` attributes.
#[test]
fn test_extract() {
    const DOC: &str = "<!doctype html>
<head>
<title>Some Article - Some Site</title>
<meta name=\"og:title\" content=\"Some Article\">
<meta property=\"author\" content=\"Joe Schmoe\">
<meta itemprop=\"dcterm:description\" content=\"A test article for test cases.\">
</head>
<body>
</body>";
    let root = kuchiki::parse_html().one(DOC);
    let metadata = extract(&root);
    assert_eq!(metadata.page_title, Some("Some Article - Some Site".into()));
    assert_eq!(metadata.article_title, Some("Some Article".into()));
    // The fixture also declares an author and a description; verify them too.
    assert_eq!(metadata.byline, Some("Joe Schmoe".into()));
    assert_eq!(
        metadata.description,
        Some("A test article for test cases.".into())
    );
}