use {
crate::{
converter,
models::{Chapter, Details, Language, Rating, State, Story},
select,
utils::{req, sleep, word_count},
Error, ScrapeError,
},
awc::http::uri::Uri,
bytes::Bytes,
chrono::{DateTime, NaiveDate, Utc},
html5ever::{
driver::{self, ParseOpts},
tendril::stream::TendrilSink,
},
scraper::{Html, Selector},
};
// CSS selectors used by the scraping functions in this file. Each one is
// compiled on first use; the selector strings are literals, so a failure to
// parse one is a programming error and the `unwrap` is intentional.
lazy_static::lazy_static! {
    /// `h3` heading inside a chapter's `role="complementary"` block.
    static ref CHAPTER_NAME: Selector =
        Selector::parse(r#"#chapters > .chapter > div[role="complementary"] > h3"#).unwrap();

    /// `p` elements directly inside a chapter's `role="article"` block.
    static ref CHAPTER_TEXT: Selector =
        Selector::parse(r#"#chapters > .chapter > div[role="article"] > p"#).unwrap();

    /// Author links (`rel="author"`) in the work's byline heading.
    static ref STORY_AUTHOR: Selector =
        Selector::parse(r#"#workskin > .preface > .byline.heading > a[rel="author"]"#).unwrap();

    /// Paragraphs of the summary blockquote in the work's preface.
    static ref STORY_SUMMARY: Selector =
        Selector::parse("#workskin > .preface > .summary > blockquote > p").unwrap();

    /// Title element in the work's preface.
    static ref STORY_NAME: Selector =
        Selector::parse("#workskin > .preface > .title").unwrap();

    /// Tags listed under the work's rating entry.
    static ref STORY_RATING: Selector =
        Selector::parse(".work > .rating.tags > ul > li > .tag").unwrap();

    /// Tags listed under the work's fandom entry.
    static ref STORY_ORIGINS: Selector =
        Selector::parse(".work > .fandom.tags > ul > li > .tag").unwrap();

    /// `dd.chapters` entry of the stats list (parsed as `current/expected`).
    static ref STORY_STATS_CHAPTERS: Selector =
        Selector::parse("dl.work > dd.stats > dl.stats > dd.chapters").unwrap();

    /// `dd.language` entry of the work metadata list.
    static ref STORY_STATS_LANGUAGE: Selector =
        Selector::parse("dl.work > dd.language").unwrap();

    /// `dd.published` entry of the stats list (parsed as a `%Y-%m-%d` date).
    static ref STORY_STATS_CREATED: Selector =
        Selector::parse("dl.work > dd.stats > dl.stats > dd.published").unwrap();

    /// `dd.status` entry of the stats list (parsed as a `%Y-%m-%d` date).
    static ref STORY_STATS_UPDATED: Selector =
        Selector::parse("dl.work > dd.stats > dl.stats > dd.status").unwrap();

    /// Links in the feedback action list; searched for the "Next Chapter →" link.
    static ref STORY_ACTIONS: Selector =
        Selector::parse("#feedback > .actions > li > a").unwrap();
}
/// Scrapes a whole story, starting from the URL of one of its chapters.
///
/// The second non-empty path segment of `url` is used as the story ID and the
/// fourth as the ID of the first chapter to fetch, i.e. the path is expected
/// to look like `/works/<story>/chapters/<chapter>`. The story details are
/// read from `url` itself; every chapter is then fetched from
/// `https://archiveofourown.org/works/<story>/chapters/<chapter>`, following
/// the "next chapter" ID reported by [`get_chapter`] until the chapter count
/// from the details page has been reached.
///
/// # Errors
///
/// Propagates any failure from the HTTP requests, the delay between requests,
/// the thread-pool tasks running [`get_details`] / [`get_chapter`], and from
/// parsing the generated chapter URLs.
///
/// # Panics
///
/// Panics if the URL path does not contain a story ID or a chapter ID at the
/// positions described above. Both are checked before any request is made.
pub async fn scrape(url: &Uri) -> Result<Story, Error> {
    // Pull both IDs out of the path up front so a malformed URL fails before
    // any network traffic happens.
    let (id, first) = {
        let segments: Vec<&str> = url.path().split('/').filter(|s| !s.is_empty()).collect();

        let id = segments
            .get(1)
            .copied()
            .expect("No story ID found in URL");

        let first = segments
            .get(3)
            .copied()
            .and_then(|id_hash| id_hash.split('#').next().map(String::from))
            .expect("No chapter ID found in URL");

        (id, first)
    };

    log::info!("[{}] Scraping initial details", url);

    let html = req(url).await?;
    let details = actix_threadpool::run(|| get_details(html)).await?;

    // Read the count before `details` is moved into the story.
    let chapters = details.chapters;
    let mut story = Story::new(details);

    log::info!("[{}] Beginning chapter scraping", url);

    // A single-chapter story is simply a one-iteration run of this loop.
    let mut next = Some(first);

    for num in 1..=chapters {
        let ch = match next.take() {
            Some(ch) => ch,
            None => {
                log::error!("[error] The scraper is trying to access a chapter that doesn't exist, this isn't good");

                // Without a next-chapter ID no further chapter can be
                // fetched, so stop instead of logging once per remaining
                // chapter.
                break;
            }
        };

        log::info!("[{}] Scraping chapter {} [{}]", url, num, ch);

        sleep().await?;

        let uri = format!("https://archiveofourown.org/works/{}/chapters/{}", id, &ch)
            .parse::<Uri>()?;

        let bytes = req(&uri).await?;
        let (n, chapter) = actix_threadpool::run(|| get_chapter(bytes)).await?;

        next = n;
        story.chapters.push(chapter);
    }

    story.words = story.chapters.iter().map(|c| word_count(&c.main)).sum();

    Ok(story)
}
/// Extracts the story-level metadata from the HTML of a work page.
///
/// Reads authors, fandoms (`origins`), title, summary, chapter counts,
/// language, rating and the published/updated dates through the selectors
/// defined at the top of this file. `tags` is always returned empty.
///
/// The story is `Completed` when both sides of the `current/expected`
/// chapter stat are equal, otherwise `InProgress`. The updated date is only
/// looked up when the story is not a completed single-chapter work. A date
/// that is missing or not in `YYYY-MM-DD` form falls back to the current time.
///
/// # Errors
///
/// Returns an error if `bytes` is not valid UTF-8.
///
/// # Panics
///
/// Panics when the chapter stat is not of the form `<number>/<expected>`, or
/// when the language or rating label is not one of those handled below.
/// NOTE(review): `select!` is defined elsewhere; whether it can also panic or
/// return early on a missing node is not visible from here.
pub fn get_details(bytes: Bytes) -> Result<Details, ScrapeError> {
    let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
    let html = parser.one(std::str::from_utf8(bytes.as_ref())?);

    let authors: Vec<String> = select!(string[] <> html => &STORY_AUTHOR);
    let origins: Vec<String> = select!(string[] <> html => &STORY_ORIGINS);
    let name: String = select!(string <> html => &STORY_NAME).trim().to_string();
    let summary: String = select!(string <> html => &STORY_SUMMARY);

    // The stat reads `current/expected`; split it once and reuse both halves.
    // Surrounding whitespace is dropped so it cannot break the number parse
    // or the completed check.
    let chapter_expected: String = select!(string <> html => &STORY_STATS_CHAPTERS);
    let mut counts = chapter_expected.split('/').map(str::trim);

    let current: &str = counts
        .next()
        .expect("[story_chapters] splitting a string always yields at least one part");

    let chapters: u32 = current.parse::<u32>().unwrap_or_else(|err| {
        panic!(
            "[story_chapters] Could not parse the chapter count {:?}: {}",
            current, err
        )
    });

    // These labels come from the page, so an unknown one is reachable; say
    // which value was seen instead of claiming the branch is unreachable.
    let language_label: String = select!(string <> html => &STORY_STATS_LANGUAGE);
    let language: Language = match language_label.trim() {
        "English" => Language::English,
        other => panic!("[story_language] Unsupported language {:?}", other),
    };

    let rating_label: String = select!(string <> html => &STORY_RATING);
    let rating: Rating = match rating_label.trim() {
        "Explicit" => Rating::Explicit,
        "Mature" => Rating::Mature,
        "Teen And Up Audiences" => Rating::Teen,
        "General Audiences" => Rating::General,
        other => panic!("[story_rating] Unsupported rating {:?}", other),
    };

    let state = {
        let expected: &str = counts.next().expect(
            "[story_chapters] Chapter stat is missing its expected count, did the html change?",
        );

        if current == expected {
            State::Completed
        } else {
            State::InProgress
        }
    };

    let created: Option<DateTime<Utc>> =
        parse_stats_date(&select!(string <> html => &STORY_STATS_CREATED));

    // Skipped only for a completed single-chapter story.
    let updated: Option<DateTime<Utc>> = if state != State::Completed || chapters != 1 {
        parse_stats_date(&select!(string <> html => &STORY_STATS_UPDATED))
    } else {
        None
    };

    Ok(Details {
        name,
        summary,
        chapters,
        language,
        rating,
        state,
        authors,
        origins,
        tags: Vec::new(),
        created: created.unwrap_or_else(Utc::now),
        updated: updated.unwrap_or_else(Utc::now),
    })
}

/// Parses a `YYYY-MM-DD` date as midnight UTC of that day, or `None` when the
/// text does not match that format.
fn parse_stats_date(raw: &str) -> Option<DateTime<Utc>> {
    NaiveDate::parse_from_str(raw, "%Y-%m-%d")
        .map(|date| date.and_hms(0, 0, 0))
        .map(|dt| DateTime::from_utc(dt, Utc))
        .ok()
}
/// Parses one chapter page.
///
/// Returns the ID of the following chapter (taken from the "Next Chapter →"
/// link in the feedback actions, if there is one) together with the parsed
/// [`Chapter`]. `pre` and `post` are always left empty.
///
/// # Errors
///
/// Returns an error if `bytes` is not valid UTF-8 or if `converter::parse`
/// rejects the chapter HTML.
///
/// # Panics
///
/// Panics if the page contains no node matching `CHAPTER_TEXT`.
pub fn get_chapter(bytes: Bytes) -> Result<(Option<String>, Chapter), ScrapeError> {
    let sink = driver::parse_document(Html::new_document(), ParseOpts::default());
    let html = sink.one(std::str::from_utf8(bytes.as_ref())?);

    // The chapter ID is the fifth `/`-separated piece of the link target,
    // with anything from a `#` onwards cut off.
    let next: Option<String> = html
        .select(&STORY_ACTIONS)
        .find(node_filter)
        .and_then(|link| {
            let href = link.value().attr("href")?;
            let id_hash = href.split('/').nth(4)?;

            id_hash.split('#').next().map(String::from)
        });

    // NOTE(review): only the first element matched by `CHAPTER_TEXT` is
    // converted; confirm that later `<p>` siblings are meant to be skipped.
    let text_node = html
        .select(&CHAPTER_TEXT)
        .next()
        .expect("[chapter_text] HTML is missing the chapter text node, did the html change?");
    let main = converter::parse(text_node.inner_html())?;

    let name: String = select!(string <> html => &CHAPTER_NAME);

    // Count the words before `main` is moved into the chapter.
    let words = word_count(&main);

    let chapter = Chapter {
        name,
        words,
        pre: String::new(),
        post: String::new(),
        main,
    };

    Ok((next, chapter))
}
/// Tells whether `node` is the "next chapter" link: its `href` starts with
/// `/works/` and its first text node is exactly `Next Chapter →`.
fn node_filter(node: &scraper::element_ref::ElementRef<'_>) -> bool {
    let points_at_work = match node.value().attr("href") {
        Some(href) => href.starts_with("/works/"),
        None => false,
    };

    // The label is only inspected once the link target has been accepted.
    points_at_work && node.text().next() == Some("Next Chapter →")
}