readable_readability/
metadata.rs

1use html5ever::local_name;
2use kuchiki::NodeRef;
3
4
/// Meta tag keys whose `content` is accepted as the article title.
const TITLE_KEYS: [&str; 6] = [
    "og:title",
    "twitter:title",
    "dc:title",
    "dcterm:title",
    "weibo:article:title",
    "weibo:webpage:title",
];
/// Meta tag keys whose `content` is accepted as the article byline (author).
const BYLINE_KEYS: [&str; 6] = [
    "author",
    "dc:creator",
    "dcterm:creator",
    "og:article:author",
    "article:author",
    "byl",
];
/// Meta tag keys whose `content` is accepted as the article description.
const DESCRIPTION_KEYS: [&str; 7] = [
    "description",
    "dc:description",
    "dcterm:description",
    "og:description",
    "weibo:article:description",
    "weibo:webpage:description",
    "twitter:description",
];
/// Meta tag keys whose `content` is accepted as the article's lead image URL.
const IMAGE_KEYS: [&str; 3] = [
    "og:image",
    "og:image:url",
    "twitter:image",
];
20
21
/// Page-level metadata gathered by [`extract`].
///
/// Every field is optional: a field is `None` when the document offers no
/// source for it.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Metadata {
    /// Text of the document's `<title>` element; falls back to the article
    /// title when there is no `<title>`.
    pub page_title: Option<String>,
    /// Title taken from a meta tag or a lone `<h1>`/`<h2>`; falls back to the
    /// page title when neither is found.
    pub article_title: Option<String>,
    /// Content of the first meta tag matching one of the image keys.
    pub image_url: Option<String>,
    /// Content of the first meta tag matching one of the byline keys.
    pub byline: Option<String>,
    /// Description from a meta tag, or the text of the first `<p>` element.
    pub description: Option<String>,
}
29
30
31pub fn extract(root: &NodeRef) -> Metadata {
32    let mut page_title = root.select_first("title")
33        .map(|node| node.text_contents())
34        .ok();
35
36    let mut article_title = get_article_title(root);
37
38    match (&page_title, &article_title) {
39        (None, Some(at)) => {page_title = Some(at.clone());},
40        (Some(pt), None) => {article_title = Some(pt.clone());},
41        _ => (),
42    }
43
44    let image_url = extract_meta_content(root, &IMAGE_KEYS);
45    let byline = extract_meta_content(root, &BYLINE_KEYS);
46    let description = get_article_description(root);
47    Metadata {page_title, article_title, image_url, byline, description}
48}
49
50
51fn get_article_title(root: &NodeRef) -> Option<String> {
52    let meta_title = extract_meta_content(root, &TITLE_KEYS);
53    if meta_title.is_some() {
54        return meta_title;
55    }
56
57    // if no qualifying meta tag is found, look for a single h1
58    // if there are multiple h1s, give up
59    let mut h1s = root.select("h1").unwrap();
60    match (h1s.next(), h1s.next()) {
61        (Some(h), None) => return Some(h.text_contents()),
62        // we don't want to accept an h2 below if there are multiple h1s
63        (Some(_), Some(_)) => return None,
64        _ => (),
65    }
66
67    // same deal for h2's
68    let mut h2s = root.select("h2").unwrap();
69    if let (Some(h), None) = (h2s.next(), h2s.next()) {
70        return Some(h.text_contents())
71    }
72    None
73}
74
75
76fn get_article_description(root: &NodeRef) -> Option<String> {
77    let meta_desc = extract_meta_content(root, &DESCRIPTION_KEYS);
78    if meta_desc.is_some() {
79        return meta_desc;
80    }
81
82    // if the description isn't specified in a meta tag, use the text of the first <p>
83    root.select_first("p")
84        .map(|p| p.text_contents())
85        .ok()
86}
87
88
89// Given a root node and a list of meta keys, return the content of the first meta tag
90// with its `name`, `property`, or `itemprop` attribute set to one of the expected types.
91fn extract_meta_content(root: &NodeRef, expected_types: &[&str]) -> Option<String> {
92    let meta_type_attrs = [
93        local_name!("name"),
94        local_name!("property"),
95        local_name!("itemprop"),
96    ];
97    // unwrap is safe here because select() only errors when you give it an invalid CSS selector
98    for meta_node in root.select("meta").unwrap() {
99        for attr_name in &meta_type_attrs {
100            let attributes = meta_node.attributes.borrow();
101            if let Some(meta_type) = attributes.get(attr_name) {
102                if expected_types.contains(&meta_type) {
103                    if let Some(content) = attributes.get(local_name!("content")) {
104                        return Some(content.to_string());
105                    }
106                }
107            }
108        }
109    }
110    None
111}
112
113
#[cfg(test)]
mod tests {
    use super::*;
    use kuchiki::{parse_html, traits::TendrilSink};

    /// Every field is populated from a meta tag, one per supported key attribute
    /// (`name`, `property`, `itemprop`).
    #[test]
    fn test_extract() {
        const DOC: &str =
            "<!doctype html>
            <head>
                <title>Some Article - Some Site</title>
                <meta name=\"og:title\" content=\"Some Article\">
                <meta name=\"og:image\" content=\"https://somesite.com/image.png\">
                <meta property=\"author\" content=\"Joe Schmoe\">
                <meta itemprop=\"dcterm:description\" content=\"A test article for test cases.\">
            </head>
            <body>
            </body>";

        let root = parse_html().one(DOC);
        let metadata = extract(&root);
        assert_eq!(metadata.page_title, Some("Some Article - Some Site".into()));
        assert_eq!(metadata.article_title, Some("Some Article".into()));
        assert_eq!(metadata.image_url, Some("https://somesite.com/image.png".into()));
        assert_eq!(metadata.byline, Some("Joe Schmoe".into()));
        assert_eq!(metadata.description, Some("A test article for test cases.".into()));
    }

    /// Without a `<title>` or meta tags, a lone `<h1>` supplies both titles and
    /// the first `<p>` supplies the description.
    #[test]
    fn test_extract_falls_back_to_heading_and_paragraph() {
        const DOC: &str =
            "<!doctype html>
            <body>
                <h1>Only Heading</h1>
                <p>First paragraph.</p>
                <p>Second paragraph.</p>
            </body>";

        let root = parse_html().one(DOC);
        let metadata = extract(&root);
        assert_eq!(metadata.page_title, Some("Only Heading".into()));
        assert_eq!(metadata.article_title, Some("Only Heading".into()));
        assert_eq!(metadata.image_url, None);
        assert_eq!(metadata.byline, None);
        assert_eq!(metadata.description, Some("First paragraph.".into()));
    }

    /// Several `<h1>`s are ambiguous, so no title is chosen, not even from a lone `<h2>`.
    #[test]
    fn test_extract_gives_up_on_multiple_h1s() {
        const DOC: &str =
            "<!doctype html>
            <body>
                <h1>First</h1>
                <h1>Second</h1>
                <h2>Subheading</h2>
            </body>";

        let root = parse_html().one(DOC);
        let metadata = extract(&root);
        assert_eq!(metadata.page_title, None);
        assert_eq!(metadata.article_title, None);
        assert_eq!(metadata.description, None);
    }
}