use std::{fmt, io};
use html5gum::{HtmlString, IoReader, Tokenizer};
use log::trace;
use minreq::URL;
#[derive(Default)]
pub struct WebPage {
pub title: Option<String>,
pub description: Option<String>,
pub author: Option<String>,
}
#[derive(Debug)]
pub enum WebPageError {
Http(minreq::Error),
Io(io::Error),
Unsuccessful {
status_code: i32,
reason_phrase: String,
},
NotHtml,
}
pub fn fetch<U: Into<URL>>(url: U) -> Result<WebPage, WebPageError> {
let resp = minreq::get(url)
.with_timeout(15)
.with_max_redirects(10)
.with_max_headers_size(8192) .with_max_status_line_length(1024)
.with_header(
"User-Agent",
format!(
"{}/{}; (+{})",
env!("CARGO_PKG_NAME"),
env!("CARGO_PKG_VERSION"),
env!("CARGO_PKG_HOMEPAGE"),
),
)
.send_lazy()?;
if resp.status_code != 200 {
return Err(WebPageError::Unsuccessful {
status_code: resp.status_code,
reason_phrase: resp.reason_phrase,
});
}
if !resp
.headers
.get("content-type")
.map_or(false, |content_type| content_type.contains("text/html"))
{
return Err(WebPageError::NotHtml);
}
let tokenizer = Tokenizer::new(IoReader::new(resp));
extract_meta_data(tokenizer)
}
fn extract_meta_data(
tokenizer: Tokenizer<IoReader<minreq::ResponseLazy>>,
) -> Result<WebPage, WebPageError> {
let mut title = None;
let mut description = None;
let mut author = None;
let property_attr = HtmlString(b"property".to_vec());
let content_attr = HtmlString(b"content".to_vec());
let name_attr = HtmlString(b"name".to_vec());
let mut title_tag = String::new();
let mut in_title = false;
for token in tokenizer {
let token = token?;
match token {
html5gum::Token::StartTag(tag) if *tag.name == b"meta" => {
trace!("Tag {:?}", tag);
let content = tag
.attributes
.get(&content_attr)
.and_then(|v| std::str::from_utf8(v).ok());
let Some(content) = content.map(str::trim) else {
trace!("content missing or invalid");
continue;
};
let property = tag.attributes.get(&property_attr);
match property.map(|v| v.as_slice()) {
Some(b"og:title") => set_if_longer(&mut title, content),
Some(b"og:description") => set_if_longer(&mut description, content),
Some(_) => {}
None => {
let name = tag.attributes.get(&name_attr);
match name.map(|s| s.as_slice()) {
Some(b"author") => set_if_longer(&mut author, content),
Some(b"description") => set_if_longer(&mut description, content),
_ => {}
}
}
}
}
html5gum::Token::StartTag(tag) if *tag.name == b"title" => in_title = true,
html5gum::Token::EndTag(tag) if *tag.name == b"title" => {
in_title = false;
}
html5gum::Token::String(text) if in_title => {
if let Ok(text) = std::str::from_utf8(&text) {
title_tag.push_str(text);
}
}
_ => {}
}
}
if title.is_none() && !title_tag.trim().is_empty() {
set_if_longer(&mut title, &title_tag)
}
Ok(WebPage {
title,
description,
author,
})
}
pub(crate) fn set_if_longer(value: &mut Option<String>, candidate: &str) {
match value {
Some(existing) if candidate.len() > existing.len() => {
value.replace(candidate.to_string());
}
Some(_) => {}
None => *value = Some(candidate.to_string()),
}
}
impl From<minreq::Error> for WebPageError {
fn from(err: minreq::Error) -> Self {
WebPageError::Http(err)
}
}
impl From<io::Error> for WebPageError {
fn from(err: io::Error) -> Self {
WebPageError::Io(err)
}
}
impl fmt::Display for WebPageError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
WebPageError::Http(err) => write!(f, "HTTP error: {err}"),
WebPageError::Io(err) => write!(f, "I/O error: {err}"),
WebPageError::Unsuccessful {
status_code,
reason_phrase,
} => write!(
f,
"HTTP request was unsuccessful: {reason_phrase} ({status_code})"
),
WebPageError::NotHtml => f.write_str("Response was not HTML"),
}
}
}
impl std::error::Error for WebPageError {}