readable_readability/
metadata.rs1use html5ever::local_name;
2use kuchiki::NodeRef;
3
4
/// `<meta>` type values whose `content` is accepted as the article title.
///
/// The position of a key in this list is not a priority: the lookup returns
/// the first matching `<meta>` element in document order.
const TITLE_KEYS: [&str; 6] = [
    "og:title",
    "twitter:title",
    "dc:title",
    "dcterm:title",
    "weibo:article:title",
    "weibo:webpage:title",
];
/// `<meta>` type values whose `content` is accepted as the article byline.
///
/// The position of a key in this list is not a priority: the lookup returns
/// the first matching `<meta>` element in document order.
const BYLINE_KEYS: [&str; 6] = [
    "author",
    "dc:creator",
    "dcterm:creator",
    "og:article:author",
    "article:author",
    "byl",
];
/// `<meta>` type values whose `content` is accepted as the article description.
///
/// The position of a key in this list is not a priority: the lookup returns
/// the first matching `<meta>` element in document order.
const DESCRIPTION_KEYS: [&str; 7] = [
    "description",
    "dc:description",
    "dcterm:description",
    "og:description",
    "weibo:article:description",
    "weibo:webpage:description",
    "twitter:description",
];
/// `<meta>` type values whose `content` is accepted as the article image URL.
///
/// The position of a key in this list is not a priority: the lookup returns
/// the first matching `<meta>` element in document order.
const IMAGE_KEYS: [&str; 3] = [
    "og:image",
    "og:image:url",
    "twitter:image",
];
20
21
/// Page-level metadata collected from an HTML document by [`extract`].
///
/// Every field is `None` when the document offers no usable source for it.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Metadata {
    /// Text of the `<title>` element; falls back to `article_title` when the
    /// document has no `<title>`.
    pub page_title: Option<String>,
    /// Title taken from a title `<meta>` tag or, failing that, from the sole
    /// `<h1>` (or sole `<h2>` when there is no `<h1>`); falls back to
    /// `page_title` when none of those yields a value.
    pub article_title: Option<String>,
    /// `content` of the first image `<meta>` tag (e.g. `og:image`).
    pub image_url: Option<String>,
    /// `content` of the first author `<meta>` tag (e.g. `author`).
    pub byline: Option<String>,
    /// `content` of the first description `<meta>` tag, or the text of the
    /// first `<p>` element when no such tag exists.
    pub description: Option<String>,
}
29
30
31pub fn extract(root: &NodeRef) -> Metadata {
32 let mut page_title = root.select_first("title")
33 .map(|node| node.text_contents())
34 .ok();
35
36 let mut article_title = get_article_title(root);
37
38 match (&page_title, &article_title) {
39 (None, Some(at)) => {page_title = Some(at.clone());},
40 (Some(pt), None) => {article_title = Some(pt.clone());},
41 _ => (),
42 }
43
44 let image_url = extract_meta_content(root, &IMAGE_KEYS);
45 let byline = extract_meta_content(root, &BYLINE_KEYS);
46 let description = get_article_description(root);
47 Metadata {page_title, article_title, image_url, byline, description}
48}
49
50
51fn get_article_title(root: &NodeRef) -> Option<String> {
52 let meta_title = extract_meta_content(root, &TITLE_KEYS);
53 if meta_title.is_some() {
54 return meta_title;
55 }
56
57 let mut h1s = root.select("h1").unwrap();
60 match (h1s.next(), h1s.next()) {
61 (Some(h), None) => return Some(h.text_contents()),
62 (Some(_), Some(_)) => return None,
64 _ => (),
65 }
66
67 let mut h2s = root.select("h2").unwrap();
69 if let (Some(h), None) = (h2s.next(), h2s.next()) {
70 return Some(h.text_contents())
71 }
72 None
73}
74
75
76fn get_article_description(root: &NodeRef) -> Option<String> {
77 let meta_desc = extract_meta_content(root, &DESCRIPTION_KEYS);
78 if meta_desc.is_some() {
79 return meta_desc;
80 }
81
82 root.select_first("p")
84 .map(|p| p.text_contents())
85 .ok()
86}
87
88
89fn extract_meta_content(root: &NodeRef, expected_types: &[&str]) -> Option<String> {
92 let meta_type_attrs = [
93 local_name!("name"),
94 local_name!("property"),
95 local_name!("itemprop"),
96 ];
97 for meta_node in root.select("meta").unwrap() {
99 for attr_name in &meta_type_attrs {
100 let attributes = meta_node.attributes.borrow();
101 if let Some(meta_type) = attributes.get(attr_name) {
102 if expected_types.contains(&meta_type) {
103 if let Some(content) = attributes.get(local_name!("content")) {
104 return Some(content.to_string());
105 }
106 }
107 }
108 }
109 }
110 None
111}
112
113
/// Unit tests for metadata extraction.
#[cfg(test)]
mod tests {
    use super::*;
    use kuchiki::{parse_html, traits::TendrilSink};

    /// Every field is filled from `<title>` and `<meta>` tags, whichever of
    /// the `name`, `property` or `itemprop` attributes carries the type.
    #[test]
    fn test_extract() {
        const DOC: &str =
            "<!doctype html>
            <head>
                <title>Some Article - Some Site</title>
                <meta name=\"og:title\" content=\"Some Article\">
                <meta name=\"og:image\" content=\"https://somesite.com/image.png\">
                <meta property=\"author\" content=\"Joe Schmoe\">
                <meta itemprop=\"dcterm:description\" content=\"A test article for test cases.\">
            </head>
            <body>
            </body>";

        let root = parse_html().one(DOC);
        let metadata = extract(&root);
        assert_eq!(metadata.page_title, Some("Some Article - Some Site".into()));
        assert_eq!(metadata.article_title, Some("Some Article".into()));
        assert_eq!(metadata.image_url, Some("https://somesite.com/image.png".into()));
        assert_eq!(metadata.byline, Some("Joe Schmoe".into()));
        assert_eq!(metadata.description, Some("A test article for test cases.".into()));
    }

    /// Without `<title>` or `<meta>` tags, the sole `<h1>` supplies both
    /// titles and the first `<p>` supplies the description.
    #[test]
    fn test_extract_fallbacks() {
        const DOC: &str =
            "<!doctype html><body><h1>Only Heading</h1><p>First paragraph.</p></body>";

        let root = parse_html().one(DOC);
        let metadata = extract(&root);
        assert_eq!(metadata.page_title, Some("Only Heading".into()));
        assert_eq!(metadata.article_title, Some("Only Heading".into()));
        assert_eq!(metadata.image_url, None);
        assert_eq!(metadata.byline, None);
        assert_eq!(metadata.description, Some("First paragraph.".into()));
    }
}