use chrono::{
DateTime,
Utc,
};
use scraper::{
ElementRef,
Html,
Selector,
};
use url::Url;
use crate::{
article::{
Article,
Content::{
Heading,
Paragraph,
self,
},
},
author::Author,
image::Image,
};
pub fn process<'a>(html: &'a str) -> Vec<Article> {
let document = Html::parse_document(html);
let kind = Selector::parse("html > head > meta[name=type]").ok();
let kind = match kind {
Some(kind) => match document.select(&kind).next() {
Some(kind) => match kind.value().attr("content") {
Some(kind) => kind,
None => "unknown",
}
None => "unknown",
}
None => "unknown",
};
let mut out = vec![];
match kind {
"live-story" => {
let post = Selector::parse("article.live-story-post").ok();
if let Some(post) = post {
for el in document.select(&post) {
if let Ok (title) = Selector::parse("h2.live-story-post__headline") &&
let Some(title) = el.select(&title).next() &&
let Some(href) = canonical_url(&document)
{
let title = title.text().collect::<Vec<_>>().join(" ");
if title.len() > 0 {
if let Some(at) = el.value().attr("data-last-updated") {
let published = match DateTime::parse_from_rfc2822(at) {
Ok(at) => Some(at.to_utc()),
Err(_) => None,
};
out.push(Article {
alternate: alternates_urls(&document),
authors: authors (&document),
canonical: href,
content: content(&el),
description: None,
hero_image: hero (&document),
images: images(&el),
published,
title,
})
}
}
}
}
}
}
"article" => {
match Selector::parse("main.article__main") {
Ok(article) => {
if let Some(article) = document.select(&article).next() &&
let Some(href) = canonical_url(&document) &&
let Some(title) = headline(&document)
{
out.push(Article {
alternate: alternates_urls(&document),
authors: authors (&document),
canonical: href,
content: content (&article),
description: description (&document),
hero_image: hero (&document),
images: images (&article),
published: published (&document),
title,
});
}
}
Err(_) => eprintln!("Couldn't find article to parse")
}
}
_ => eprintln!("{kind} is an unsupported content type for the CNN source")
}
out
}
/// Extracts the headline text from `div.headline h1.headline__text`.
///
/// The element's text nodes are split on Unicode whitespace and re-joined with
/// single spaces, so the result has no leading/trailing blanks and no runs of
/// whitespace (the previous `replace(" ", " ")` was a no-op and left them in).
///
/// Returns `None` when the element is not present in `context`.
fn headline(context: &Html) -> Option<String> {
    let selector = Selector::parse("div.headline h1.headline__text").ok()?;
    let head = context.select(&selector).next()?;
    let words: Vec<&str> = head.text().flat_map(str::split_whitespace).collect();
    Some(words.join(" "))
}
/// Collects the authors linked from the byline (`a.byline__link`).
///
/// A link contributes an [`Author`] only when it has an `href` attribute that
/// parses as an absolute URL; the author's name is the link's text nodes joined
/// with single spaces. Links that do not qualify are skipped.
fn authors(context: &Html) -> Vec<Author> {
    let Ok(byline) = Selector::parse("a.byline__link") else {
        return Vec::new();
    };
    context
        .select(&byline)
        .filter_map(|link| {
            let raw = link.value().attr("href")?;
            let href = Url::parse(raw).ok()?;
            let name = link.text().collect::<Vec<_>>().join(" ");
            Some(Author { href, name })
        })
        .collect()
}
/// Returns the `content` attribute of the first `<meta name="description">`
/// in the document head, or `None` when the tag or the attribute is missing.
fn description(context: &Html) -> Option<String> {
    let selector = Selector::parse("html > head > meta[name=description]").ok()?;
    let meta = context.select(&selector).next()?;
    meta.value().attr("content").map(|text| text.to_string())
}
/// Lists the alternate versions of the page declared through
/// `<link rel="alternate">` tags in the document head.
///
/// Each entry pairs the tag's `hreflang` value with its parsed `href`. Tags
/// missing either attribute, or whose `href` is not a valid URL, are ignored.
/// Returns `None` rather than an empty list when no entry qualifies.
fn alternates_urls(context: &Html) -> Option<Vec<(String, Url)>> {
    let selector = Selector::parse("html > head > link[rel=alternate]").ok()?;
    let alternates: Vec<(String, Url)> = context
        .select(&selector)
        .filter_map(|link| {
            let attrs = link.value();
            let lang = attrs.attr("hreflang")?;
            let href = attrs.attr("href")?;
            let url = Url::parse(href).ok()?;
            Some((lang.to_string(), url))
        })
        .collect();
    (!alternates.is_empty()).then_some(alternates)
}
/// Returns the hero image URL taken from the first
/// `<meta property="og:image">` tag in the document head.
///
/// Yields `None` when the tag or its `content` attribute is missing, or when
/// the value does not parse as a URL.
fn hero(context: &Html) -> Option<Url> {
    let selector = Selector::parse("html > head > meta[property=\"og:image\"]").ok()?;
    let meta = context.select(&selector).next()?;
    let raw = meta.value().attr("content")?;
    Url::parse(raw).ok()
}
/// Returns the page's canonical URL from the first `<link rel="canonical">`
/// tag in the document head.
///
/// Yields `None` when the tag or its `href` attribute is missing, or when the
/// value does not parse as a URL.
fn canonical_url(context: &Html) -> Option<Url> {
    let selector = Selector::parse("html > head > link[rel=canonical]").ok()?;
    let link = context.select(&selector).next()?;
    link.value()
        .attr("href")
        .and_then(|href| Url::parse(href).ok())
}
/// Reads the publication time from the first
/// `<meta property="article:published_time">` tag in the document head.
///
/// The `content` attribute is converted with `str::parse::<DateTime<Utc>>`;
/// `None` is returned when the tag or attribute is missing or the value does
/// not parse.
fn published(context: &Html) -> Option<DateTime<Utc>> {
    let selector =
        Selector::parse("html > head > meta[property=\"article:published_time\"]").ok()?;
    let meta = context.select(&selector).next()?;
    let stamp = meta.value().attr("content")?;
    stamp.parse::<DateTime<Utc>>().ok()
}
/// Reads the last-modification time from the first
/// `<meta property="article:modified_time">` tag in the document head.
///
/// The `content` attribute is converted with `str::parse::<DateTime<Utc>>`;
/// `None` is returned when the tag or attribute is missing or the value does
/// not parse.
fn _modified(context: &Html) -> Option<DateTime<Utc>> {
    let selector =
        Selector::parse("html > head > meta[property=\"article:modified_time\"]").ok()?;
    let meta = context.select(&selector).next()?;
    let stamp = meta.value().attr("content")?;
    stamp.parse::<DateTime<Utc>>().ok()
}
/// Extracts the textual body found beneath `context`.
///
/// Every `p.paragraph`, `p.pull-quote_block-quote__text` and `h2.subheader`
/// element is visited in the order the selection yields them: `h2` elements
/// become [`Heading`], everything else becomes [`Paragraph`].
///
/// Each element's text nodes are split on Unicode whitespace and re-joined
/// with single spaces, so blocks carry no padding and no runs of whitespace
/// (the previous `replace(" ", " ")` was a no-op and left them in).
///
/// Returns `Some` — possibly with an empty vector — whenever the selector
/// compiles.
fn content(context: &ElementRef) -> Option<Vec<Content>> {
    let selector =
        Selector::parse("p.paragraph, p.pull-quote_block-quote__text, h2.subheader").ok()?;
    let blocks: Vec<Content> = context
        .select(&selector)
        .map(|node| {
            let text = node
                .text()
                .flat_map(str::split_whitespace)
                .collect::<Vec<_>>()
                .join(" ");
            if node.value().name() == "h2" {
                Heading(text)
            } else {
                Paragraph(text)
            }
        })
        .collect();
    Some(blocks)
}
/// Collects the images found in `div.image` containers beneath `context`.
///
/// A container yields an [`Image`] only when all of the following hold:
/// * it holds a `picture.image__picture > img` element carrying both `src`
///   and `alt` attributes,
/// * it holds a `figcaption.image__credit` element, whose text nodes (joined
///   with single spaces) become the credit,
/// * `src` parses as an absolute URL.
///
/// `alt` is stored as the caption. Returns `None` rather than an empty list
/// when no container qualifies.
fn images(context: &ElementRef) -> Option<Vec<Image>> {
    // Compile every selector once instead of once per container.
    let container = Selector::parse("div.image").ok()?;
    let picture = Selector::parse("picture.image__picture > img").ok()?;
    let credit_line = Selector::parse("figcaption.image__credit").ok()?;

    let found: Vec<Image> = context
        .select(&container)
        .filter_map(|block| {
            let img = block.select(&picture).next()?;
            let src = img.value().attr("src")?;
            let alt = img.value().attr("alt")?;
            let caption_node = block.select(&credit_line).next()?;
            let credit = caption_node.text().collect::<Vec<_>>().join(" ");
            let href = Url::parse(src).ok()?;
            Some(Image {
                href,
                caption: alt.to_string(),
                credit,
            })
        })
        .collect();
    (!found.is_empty()).then_some(found)
}