use scraper::{Html, Selector};
/// Stateless extractor of page-level metadata from HTML documents.
///
/// The type carries no data, so every instance is interchangeable; it is
/// `Copy` and can be created with either `MetadataExtractor::default()` or
/// its `new` constructor.
#[derive(Debug, Clone, Copy, Default)]
pub struct MetadataExtractor;
impl MetadataExtractor {
    /// Creates a new extractor.
    pub fn new() -> Self {
        Self
    }

    /// Parses `html` as a complete document and gathers its page-level metadata.
    ///
    /// Each field is looked up independently. A field whose source tag or
    /// attribute is absent is left as `None` (an empty list for `keywords`).
    /// Timestamps are returned as the raw attribute text; they are not parsed.
    pub fn extract(&self, html: &str) -> PageMetadata {
        let document = Html::parse_document(html);
        PageMetadata {
            title: self.extract_title(&document),
            description: self.extract_meta(&document, "description"),
            author: self.extract_meta(&document, "author"),
            keywords: self.extract_keywords(&document),
            published_time: self
                .extract_time(&document, "article:published_time")
                .or_else(|| self.extract_time(&document, "datePublished")),
            modified_time: self
                .extract_time(&document, "article:modified_time")
                .or_else(|| self.extract_time(&document, "dateModified")),
            robots: self.extract_meta(&document, "robots"),
            viewport: self.extract_meta(&document, "viewport"),
            charset: self.extract_charset(&document),
            canonical: self.extract_canonical(&document),
            language: self.extract_language(&document),
        }
    }

    /// Returns attribute `attr` of the first element matching the CSS selector `css`.
    ///
    /// Yields `None` when the selector does not parse, when nothing matches,
    /// or when the first match lacks the attribute.
    fn first_attr(document: &Html, css: &str, attr: &str) -> Option<String> {
        let selector = Selector::parse(css).ok()?;
        let element = document.select(&selector).next()?;
        element.value().attr(attr).map(String::from)
    }

    /// Pulls the `charset=` parameter out of a `Content-Type` style value
    /// such as `text/html; charset=utf-8`.
    ///
    /// The key is matched ASCII-case-insensitively. Quotes and whitespace
    /// directly after the `=` are skipped, and the value ends at the next
    /// `;`, quote, or whitespace. Returns `None` when the parameter is missing
    /// or its value is empty.
    fn charset_from_content_type(content: &str) -> Option<String> {
        const KEY: &str = "charset=";
        // ASCII-only lowercasing never changes byte lengths, so an offset found
        // in the lowered copy is a valid char boundary in `content`. Full
        // Unicode lowercasing can shift offsets and make the slice below panic.
        let lowered = content.to_ascii_lowercase();
        let start = lowered.find(KEY)? + KEY.len();
        let value: String = content[start..]
            .trim_start_matches(|c: char| c == '"' || c == '\'' || c.is_whitespace())
            .chars()
            .take_while(|&c| c != ';' && c != '"' && c != '\'' && !c.is_whitespace())
            .collect();
        if value.is_empty() {
            None
        } else {
            Some(value)
        }
    }

    /// Returns the trimmed text of the first `<title>` element, if any.
    fn extract_title(&self, document: &Html) -> Option<String> {
        let selector = Selector::parse("title").ok()?;
        let element = document.select(&selector).next()?;
        let text: String = element.text().collect();
        Some(text.trim().to_string())
    }

    /// Returns the `content` attribute of the first `<meta name="{name}">` tag.
    fn extract_meta(&self, document: &Html, name: &str) -> Option<String> {
        Self::first_attr(document, &format!(r#"meta[name="{}"]"#, name), "content")
    }

    /// Splits the `keywords` meta tag on commas, trimming each entry and
    /// dropping empty ones. Returns an empty list when the tag is absent.
    fn extract_keywords(&self, document: &Html) -> Vec<String> {
        self.extract_meta(document, "keywords")
            .map(|raw| {
                raw.split(',')
                    .map(str::trim)
                    .filter(|keyword| !keyword.is_empty())
                    .map(String::from)
                    .collect()
            })
            .unwrap_or_default()
    }

    /// Looks up a timestamp published under `property`.
    ///
    /// Tries `<meta property="{property}" content="...">` first. Otherwise it
    /// inspects the first element carrying `itemprop="{property}"`, preferring
    /// its `datetime` attribute over `content`.
    fn extract_time(&self, document: &Html, property: &str) -> Option<String> {
        let meta_css = format!(r#"meta[property="{}"]"#, property);
        Self::first_attr(document, &meta_css, "content").or_else(|| {
            let selector = Selector::parse(&format!(r#"[itemprop="{}"]"#, property)).ok()?;
            let element = document.select(&selector).next()?;
            let node = element.value();
            node.attr("datetime")
                .or_else(|| node.attr("content"))
                .map(String::from)
        })
    }

    /// Returns the declared character encoding.
    ///
    /// Prefers `<meta charset="...">`. Otherwise it scans
    /// `<meta http-equiv="Content-Type">` tags and returns the first
    /// `charset=` parameter found in a `content` attribute.
    fn extract_charset(&self, document: &Html) -> Option<String> {
        if let Some(charset) = Self::first_attr(document, "meta[charset]", "charset") {
            return Some(charset);
        }
        let selector = Selector::parse(r#"meta[http-equiv="Content-Type"]"#).ok()?;
        for element in document.select(&selector) {
            let parsed = element
                .value()
                .attr("content")
                .and_then(Self::charset_from_content_type);
            if parsed.is_some() {
                return parsed;
            }
        }
        None
    }

    /// Returns the `href` of the first `<link rel="canonical">` element.
    fn extract_canonical(&self, document: &Html) -> Option<String> {
        Self::first_attr(document, r#"link[rel="canonical"]"#, "href")
    }

    /// Returns the `lang` attribute of the `<html>` element, falling back to
    /// the `language` meta tag.
    fn extract_language(&self, document: &Html) -> Option<String> {
        Self::first_attr(document, "html", "lang")
            .or_else(|| self.extract_meta(document, "language"))
    }
}
/// Page-level metadata collected from an HTML document.
///
/// The field notes describe how `MetadataExtractor::extract` fills them. All
/// values are the raw text found in the page; nothing is parsed or validated.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct PageMetadata {
    /// Trimmed text of the first `<title>` element.
    pub title: Option<String>,
    /// `content` of the `description` meta tag.
    pub description: Option<String>,
    /// `content` of the `author` meta tag.
    pub author: Option<String>,
    /// Non-empty, trimmed entries of the comma-separated `keywords` meta tag.
    pub keywords: Vec<String>,
    /// Publication timestamp, looked up under `article:published_time` and
    /// then `datePublished`.
    pub published_time: Option<String>,
    /// Modification timestamp, looked up under `article:modified_time` and
    /// then `dateModified`.
    pub modified_time: Option<String>,
    /// `content` of the `robots` meta tag.
    pub robots: Option<String>,
    /// `content` of the `viewport` meta tag.
    pub viewport: Option<String>,
    /// Character encoding from `<meta charset>` or a `Content-Type`
    /// `http-equiv` meta tag.
    pub charset: Option<String>,
    /// `href` of the `<link rel="canonical">` element.
    pub canonical: Option<String>,
    /// `lang` attribute of `<html>`, or else the `language` meta tag.
    pub language: Option<String>,
}