use {
crate::{
converter,
models::{Chapter, Details, Language, Rating, State, Story},
select,
utils::{req, sleep, word_count},
Error, ScrapeError,
},
awc::http::uri::Uri,
bytes::Bytes,
chrono::{DateTime, Datelike, NaiveTime, TimeZone, Utc},
html5ever::{
driver::{self, ParseOpts},
tendril::stream::TendrilSink,
},
scraper::{Html, Selector},
std::str,
};
lazy_static::lazy_static! {
    // ---- Story header: everything below is a direct child of `#profile_top` ----

    // Bold element carrying the `xcontrast_txt` class; read as the story name.
    static ref STORY_NAME: Selector =
        Selector::parse("#profile_top > b.xcontrast_txt").unwrap();

    // `div` carrying the `xcontrast_txt` class; read as the story summary.
    static ref STORY_SUMMARY: Selector =
        Selector::parse("#profile_top > div.xcontrast_txt").unwrap();

    // Gray `span` whose text is split on `-` to extract the story metadata.
    static ref STORY_DETAILS: Selector =
        Selector::parse("#profile_top > span.xgray.xcontrast_txt").unwrap();

    // Anchors with the `xcontrast_txt` class that have no `title` attribute;
    // each match is collected as an author.
    static ref STORY_AUTHOR: Selector =
        Selector::parse("#profile_top > a.xcontrast_txt:not([title])").unwrap();

    // ---- Breadcrumb links above the story ----

    // Last anchor inside `#pre_story_links > span.lc-left`; collected as origins.
    static ref STORY_ORIGINS: Selector =
        Selector::parse("#pre_story_links > span.lc-left > a.xcontrast_txt:last-child").unwrap();

    // ---- Chapter page ----

    // Currently selected `<option>` of the `#chap_select` dropdown.
    static ref CHAPTER_NAME: Selector =
        Selector::parse("select#chap_select > option[selected]").unwrap();

    // Container whose inner HTML is handed to the converter as chapter text.
    static ref CHAPTER_TEXT: Selector = Selector::parse("#storytext").unwrap();
}
/// Scrapes a whole story, starting from the story page at `url`.
///
/// The story ID is the second non-empty segment of the URL path. The page at
/// `url` is requested once and parsed with [`get_details`]; afterwards every
/// chapter page `https://www.fanfiction.net/s/<id>/<n>` for `n` in
/// `1..=details.chapters` is requested in order and parsed with
/// [`get_chapter`]. `sleep()` is awaited before each chapter request.
///
/// The returned story's `words` field is the sum of `word_count` over the
/// `main` text of all scraped chapters.
///
/// # Errors
///
/// Propagates (via `?`) any failure of the requests, the `sleep()` helper,
/// parsing of the generated chapter URL, the blocking thread pool, or the
/// detail/chapter parsers.
///
/// # Panics
///
/// Panics if the path of `url` has fewer than two non-empty segments.
pub async fn scrape(url: &Uri) -> Result<Story, Error> {
    let id = url
        .path()
        .split('/')
        .filter(|s| !s.is_empty())
        .nth(1)
        .expect("No story ID found in URL");

    log::info!("[{}] Scraping initial details", url);
    let bytes = req(&url).await?;
    // HTML parsing is CPU-bound, so it is run on the blocking thread pool.
    let details = actix_threadpool::run(|| get_details(bytes)).await?;
    let chapters = details.chapters;
    let mut story = Story::new(details);

    log::info!("[{}] Beginning chapter scraping", url);
    // A single-chapter story is simply the range `1..=1`, so it needs no
    // special case: one loop covers every chapter count.
    // NOTE(review): chapter 1 is always requested here, even though `url` may
    // already have pointed at that same page — confirm whether the first
    // response could be reused.
    for page in 1..=chapters {
        log::info!("[{}] Scraping chapter {}", url, page);
        sleep().await?;

        let chapter_url = format!("https://www.fanfiction.net/s/{}/{}", id, page).parse()?;
        let bytes = req(&chapter_url).await?;
        let chapter = actix_threadpool::run(|| get_chapter(bytes)).await?;
        story.chapters.push(chapter);
    }

    story.words = story.chapters.iter().map(|c| word_count(&c.main)).sum();
    Ok(story)
}
/// Parses the story-level metadata out of the raw bytes of a story page.
///
/// The name, summary, authors and origins are read straight from the document
/// through the module's selectors. The remaining fields come from the details
/// line, which is split on `-` into trimmed segments:
///
/// * `Chapters: <n>` sets the chapter count (default `1`).
/// * `Rated: <word> <code>` sets the rating from the second word of the value
///   (default [`Rating::Teen`]).
/// * `Status: Complete` sets [`State::Completed`] (default
///   [`State::InProgress`]).
/// * `Published: <date>` sets `created`, and also `updated` if nothing has set
///   it yet.
/// * `Updated: <date>` sets `updated`.
/// * The second segment (counted from the front) is treated as the language.
///
/// Dates that are missing or cannot be parsed fall back to `Utc::now()`.
/// `tags` is always returned empty.
///
/// # Errors
///
/// Returns an error if `bytes` is not valid UTF-8. The `select!` invocations
/// may presumably fail as well — their expansion is not visible from this
/// function.
///
/// # Panics
///
/// Panics via `unreachable!` if the language segment is anything other than
/// `"English"`, if the rating code is not one of `MA`, `M`, `T`, `K`, `K+`, or
/// if a `Status:` segment has a value other than `Complete`.
pub fn get_details(bytes: Bytes) -> Result<Details, ScrapeError> {
    /// Returns the trimmed text between the first and second `:` of `segment`,
    /// or `None` when `segment` contains no `:` at all.
    fn field_value(segment: &str) -> Option<&str> {
        segment.split(':').map(str::trim).nth(1)
    }

    let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
    let html = parser.one(std::str::from_utf8(bytes.as_ref())?);

    let name: String = select!(string <> html => &STORY_NAME);
    let summary: String = select!(string <> html => &STORY_SUMMARY);
    let details: String = select!(string <> html => &STORY_DETAILS);
    let authors: Vec<String> = select!(string[] <> html => &STORY_AUTHOR);
    let origins: Vec<String> = select!(string[] <> html => &STORY_ORIGINS);

    // Defaults used when the details line lacks the corresponding segment.
    let mut chapters = 1u32;
    let mut language = Language::English;
    let mut rating = Rating::Teen;
    let mut state = State::InProgress;
    let mut created: Option<DateTime<Utc>> = None;
    let mut updated: Option<DateTime<Utc>> = None;

    let segments = details.split('-').count();

    // Segments are visited back to front, so `i` counts from the end.
    for (i, s) in details.split('-').map(str::trim).rev().enumerate() {
        if s.starts_with("Chapters: ") {
            if let Some(ch) = field_value(s).and_then(|v| v.parse::<u32>().ok()) {
                chapters = ch;
            }
        }

        // Reverse index `segments - 2` is the second segment from the front.
        // Written as an addition so that a details line without any `-`
        // (a single segment) cannot underflow the unsigned subtraction.
        if i + 2 == segments {
            language = match s {
                "English" => Language::English,
                _ => unreachable!(),
            };
        }

        if s.starts_with("Rated: ") {
            // The value has the shape `<word> <code>`; only the code matters.
            let code = field_value(s).and_then(|v| {
                v.split(' ')
                    .map(str::trim)
                    .filter(|part| !part.is_empty())
                    .nth(1)
            });
            if let Some(code) = code {
                rating = match code {
                    "MA" => Rating::Explicit,
                    "M" => Rating::Mature,
                    "T" => Rating::Teen,
                    "K" | "K+" => Rating::General,
                    _ => unreachable!("Unknown rating found, please report this"),
                };
            }
        }

        if s.starts_with("Status: ") {
            if let Some(status) = field_value(s) {
                state = match status {
                    "Complete" => State::Completed,
                    _ => unreachable!(),
                };
            }
        }

        if s.starts_with("Published: ") {
            if let Some(published) = field_value(s).and_then(time) {
                // A story that was never updated reports its publication date
                // as the last update; a later `Updated:` segment overrides it.
                if updated.is_none() {
                    updated = Some(published);
                }
                created = Some(published);
            }
        }

        if s.starts_with("Updated: ") {
            if let Some(last_update) = field_value(s).and_then(time) {
                updated = Some(last_update);
            }
        }
    }

    Ok(Details {
        name,
        summary,
        chapters,
        language,
        rating,
        state,
        authors,
        origins,
        tags: Vec::new(),
        created: created.unwrap_or_else(Utc::now),
        updated: updated.unwrap_or_else(Utc::now),
    })
}
/// Parses the raw bytes of a chapter page into a [`Chapter`].
///
/// The inner HTML of the chapter text node is passed through
/// `converter::parse` to produce `main`. The chapter name is the text of the
/// selected chapter option with everything up to and including its first
/// space removed. `pre` and `post` are left empty and `words` is
/// `word_count` of `main`.
///
/// # Errors
///
/// Returns an error if `bytes` is not valid UTF-8 or if `converter::parse`
/// fails.
///
/// # Panics
///
/// Panics if the document has no chapter text node, or if no selected chapter
/// option with text is found.
pub fn get_chapter(bytes: Bytes) -> Result<Chapter, ScrapeError> {
    let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
    let document = parser.one(std::str::from_utf8(bytes.as_ref())?);

    let text_node = document
        .select(&CHAPTER_TEXT)
        .next()
        .expect("[chapter_text] HTML is missing the chapter text node, did the html change?");
    let main = converter::parse(text_node.inner_html())?;

    let label = document
        .select(&CHAPTER_NAME)
        .next()
        .and_then(|option| option.text().next())
        .expect("[chapter_name] No text in selected element");
    // Keep only what follows the first space; a label without any space
    // yields an empty name.
    let name = label.splitn(2, ' ').nth(1).unwrap_or("").to_owned();

    let words = word_count(&main);
    Ok(Chapter {
        name,
        words,
        pre: String::new(),
        post: String::new(),
        main,
    })
}
/// Parses a `month/day[/year]` date into midnight UTC of that day.
///
/// The month and day are required and must parse as unsigned integers. When
/// the year component is missing or does not parse as an integer, the current
/// UTC year is used instead.
///
/// Returns `None` if the month or day is missing or unparsable, or if the
/// three components do not form a valid calendar date (for example month `13`,
/// or `2/29` in a year that is not a leap year).
fn time(text: &str) -> Option<DateTime<Utc>> {
    let mut parts = text.split('/');

    let month = parts.next()?.parse::<u32>().ok()?;
    let day = parts.next()?.parse::<u32>().ok()?;
    let year = parts
        .next()
        .and_then(|t| t.parse::<i32>().ok())
        .unwrap_or_else(|| Utc::today().year());

    // `ymd` panics on out-of-range dates; the `_opt` variant lets invalid
    // scraped input surface as `None` instead.
    Utc.ymd_opt(year, month, day)
        .single()?
        .and_time(NaiveTime::from_hms(0, 0, 0))
}