use html5ever::local_name;
use kuchiki::NodeRef;
/// `<meta>` type values that are accepted as carrying the article title.
///
/// The position of a key in this array does not express a priority: the
/// lookup returns the first matching `<meta>` element in document order.
const TITLE_KEYS: [&str; 6] = [
    "og:title",
    "twitter:title",
    "dc:title",
    "dcterm:title",
    "weibo:article:title",
    "weibo:webpage:title",
];
/// `<meta>` type values that are accepted as carrying the author / byline.
///
/// The position of a key in this array does not express a priority: the
/// lookup returns the first matching `<meta>` element in document order.
const BYLINE_KEYS: [&str; 6] = [
    "author",
    "dc:creator",
    "dcterm:creator",
    "og:article:author",
    "article:author",
    "byl",
];
/// `<meta>` type values that are accepted as carrying the article description.
///
/// The position of a key in this array does not express a priority: the
/// lookup returns the first matching `<meta>` element in document order.
const DESCRIPTION_KEYS: [&str; 7] = [
    "description",
    "dc:description",
    "dcterm:description",
    "og:description",
    "weibo:article:description",
    "weibo:webpage:description",
    "twitter:description",
];
/// `<meta>` type values that are accepted as carrying the lead image URL.
///
/// The position of a key in this array does not express a priority: the
/// lookup returns the first matching `<meta>` element in document order.
const IMAGE_KEYS: [&str; 3] = [
    "og:image",
    "og:image:url",
    "twitter:image",
];
/// Document-level metadata collected by [`extract`].
///
/// Every field is optional; a field is `None` when none of its sources is
/// present in the document.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Metadata {
    /// Text of the first `<title>` element, or the article title when the
    /// document has no `<title>`.
    pub page_title: Option<String>,
    /// Title taken from a title `<meta>` tag, else from the document's only
    /// `<h1>` (or only `<h2>` when there is no `<h1>`), else the page title.
    pub article_title: Option<String>,
    /// `content` of the first `<meta>` tag whose type is an image key.
    pub image_url: Option<String>,
    /// `content` of the first `<meta>` tag whose type is a byline key.
    pub byline: Option<String>,
    /// `content` of the first description `<meta>` tag, else the text of the
    /// first `<p>` element.
    pub description: Option<String>,
}
/// Collects title, image, byline and description metadata from the document
/// rooted at `root`.
///
/// The page title is the text of the first `<title>` element and the article
/// title comes from `get_article_title`. When exactly one of the two is
/// available it is used for both fields.
pub fn extract(root: &NodeRef) -> Metadata {
    let title_tag = root
        .select_first("title")
        .ok()
        .map(|node| node.text_contents());

    // Let each title stand in for the other when only one of them was found.
    let (page_title, article_title) = match (title_tag, get_article_title(root)) {
        (None, Some(article)) => (Some(article.clone()), Some(article)),
        (Some(page), None) => (Some(page.clone()), Some(page)),
        both => both,
    };

    Metadata {
        page_title,
        article_title,
        image_url: extract_meta_content(root, &IMAGE_KEYS),
        byline: extract_meta_content(root, &BYLINE_KEYS),
        description: get_article_description(root),
    }
}
/// Determines the article title for the document rooted at `root`.
///
/// Sources, in order:
/// 1. the first `<meta>` tag whose type is one of `TITLE_KEYS`;
/// 2. the `<h1>` element, if the document has exactly one;
/// 3. the `<h2>` element, if the document has no `<h1>` and exactly one `<h2>`.
///
/// A document with more than one `<h1>` yields `None` without looking at
/// `<h2>` elements.
fn get_article_title(root: &NodeRef) -> Option<String> {
    if let Some(title) = extract_meta_content(root, &TITLE_KEYS) {
        return Some(title);
    }

    // The selector is a fixed literal, so the `unwrap` is only reached with a
    // selector that is known to parse.
    let mut headings = root.select("h1").unwrap();
    if let Some(first) = headings.next() {
        // Several <h1> elements are ambiguous: give up instead of guessing.
        return if headings.next().is_none() {
            Some(first.text_contents())
        } else {
            None
        };
    }

    let mut subheadings = root.select("h2").unwrap();
    let only = subheadings.next()?;
    match subheadings.next() {
        None => Some(only.text_contents()),
        Some(_) => None,
    }
}
/// Determines the article description for the document rooted at `root`.
///
/// Uses the first `<meta>` tag whose type is one of `DESCRIPTION_KEYS`; when
/// there is none, falls back to the text of the first `<p>` element.
fn get_article_description(root: &NodeRef) -> Option<String> {
    extract_meta_content(root, &DESCRIPTION_KEYS).or_else(|| {
        root.select_first("p")
            .ok()
            .map(|paragraph| paragraph.text_contents())
    })
}
/// Returns the `content` of the first `<meta>` element, in document order,
/// whose `name`, `property` or `itemprop` attribute equals one of
/// `expected_types`.
///
/// The comparison is exact (case-sensitive, no trimming of the type value).
/// Elements that match but have no `content` attribute, or whose `content`
/// is empty or whitespace-only, are skipped so that a blank tag does not hide
/// a later usable tag or the caller's fallback. The returned string is the
/// attribute value as written, without trimming.
///
/// Returns `None` when no element qualifies.
fn extract_meta_content(root: &NodeRef, expected_types: &[&str]) -> Option<String> {
    let meta_type_attrs = [
        local_name!("name"),
        local_name!("property"),
        local_name!("itemprop"),
    ];
    for meta_node in root
        .select("meta")
        .expect("`meta` is a valid CSS selector")
    {
        // Borrow the attribute map once per element rather than once per
        // candidate attribute name.
        let attributes = meta_node.attributes.borrow();
        let is_expected = meta_type_attrs.iter().any(|attr_name| {
            attributes
                .get(attr_name)
                .map_or(false, |meta_type| expected_types.contains(&meta_type))
        });
        if !is_expected {
            continue;
        }
        if let Some(content) = attributes.get(local_name!("content")) {
            // A blank value carries no information; keep searching.
            if !content.trim().is_empty() {
                return Some(content.to_string());
            }
        }
    }
    None
}
// Unit tests for metadata extraction; compiled only for test builds.
#[cfg(test)]
mod tests {
    use super::*;
    use kuchiki::{parse_html, traits::TendrilSink};

    /// Every field is filled from a `<meta>` tag (or `<title>`), with one tag
    /// for each of the `name`, `property` and `itemprop` attributes.
    #[test]
    fn test_extract() {
        const DOC: &str =
"<!doctype html>
<head>
<title>Some Article - Some Site</title>
<meta name=\"og:title\" content=\"Some Article\">
<meta name=\"og:image\" content=\"https://somesite.com/image.png\">
<meta property=\"author\" content=\"Joe Schmoe\">
<meta itemprop=\"dcterm:description\" content=\"A test article for test cases.\">
</head>
<body>
</body>";
        let root = parse_html().one(DOC);
        let metadata = extract(&root);
        assert_eq!(metadata.page_title, Some("Some Article - Some Site".into()));
        assert_eq!(metadata.article_title, Some("Some Article".into()));
        assert_eq!(metadata.image_url, Some("https://somesite.com/image.png".into()));
        assert_eq!(metadata.byline, Some("Joe Schmoe".into()));
        assert_eq!(metadata.description, Some("A test article for test cases.".into()));
    }

    /// Without any `<meta>` or `<title>`, the single `<h1>` supplies both
    /// titles and the first `<p>` supplies the description.
    #[test]
    fn test_extract_fallbacks() {
        const DOC: &str =
            "<!doctype html><body><h1>Heading</h1><p>First paragraph.</p></body>";
        let root = parse_html().one(DOC);
        let metadata = extract(&root);
        assert_eq!(metadata.page_title, Some("Heading".into()));
        assert_eq!(metadata.article_title, Some("Heading".into()));
        assert_eq!(metadata.image_url, None);
        assert_eq!(metadata.byline, None);
        assert_eq!(metadata.description, Some("First paragraph.".into()));
    }
}