use {
crate::{
converter,
models::{Chapter, Details, Language, Rating, State, Story},
select,
utils::{request, sleep, word_count, RequestError},
},
chrono::{DateTime, NaiveDate, Utc},
http_req::uri::Uri,
scraper::{Html, Selector},
};
/// Errors that can be produced while scraping a story from Archive of Our Own.
///
/// Most variants simply wrap the error of the layer that failed so it can be
/// reported through `Display` and `std::error::Error::source`.
#[derive(Debug)]
pub enum ArchiveOfOurOwnError {
/// An I/O error.
Io { err: std::io::Error },
/// A response body could not be interpreted as UTF-8 text.
Utf8 { err: std::str::Utf8Error },
/// The server answered with an encoding the scraper can't handle.
InvalidEncoding,
/// The server answered with a status code other than 200.
Non200Response,
/// An error coming from the `http` crate.
Http { err: http::Error },
/// An error coming from the `http_req` crate (this is also the variant that
/// URL parsing failures in `get_chapter` are converted into).
HttpReq { err: http_req::error::Error },
}
impl From<RequestError> for ArchiveOfOurOwnError {
fn from(err: RequestError) -> ArchiveOfOurOwnError {
match err {
RequestError::Io { err } => ArchiveOfOurOwnError::Io { err },
RequestError::InvalidEncoding => ArchiveOfOurOwnError::InvalidEncoding,
RequestError::Non200Response => ArchiveOfOurOwnError::Non200Response,
RequestError::Http { err } => ArchiveOfOurOwnError::Http { err },
RequestError::HttpReq { err } => ArchiveOfOurOwnError::HttpReq { err },
}
}
}
/// Human-readable rendering of the error: the variant name in parentheses
/// followed by either the wrapped error's message or a fixed description.
impl std::fmt::Display for ArchiveOfOurOwnError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            ArchiveOfOurOwnError::Io { err } => write!(f, "(Io) {}", err),
            ArchiveOfOurOwnError::Utf8 { err } => write!(f, "(Utf8) {}", err),
            ArchiveOfOurOwnError::InvalidEncoding => write!(
                f,
                "(InvalidEncoding) Server returned with an encoding that the scraper can't handle"
            ),
            ArchiveOfOurOwnError::Non200Response => write!(
                f,
                "(Non200Response) Server returned with a non 200 status code"
            ),
            ArchiveOfOurOwnError::Http { err } => write!(f, "(Http) {}", err),
            ArchiveOfOurOwnError::HttpReq { err } => write!(f, "(HttpReq) {}", err),
        }
    }
}
impl From<std::io::Error> for ArchiveOfOurOwnError {
fn from(err: std::io::Error) -> ArchiveOfOurOwnError {
ArchiveOfOurOwnError::Io { err }
}
}
impl From<std::str::Utf8Error> for ArchiveOfOurOwnError {
fn from(err: std::str::Utf8Error) -> ArchiveOfOurOwnError {
ArchiveOfOurOwnError::Utf8 { err }
}
}
impl From<http::Error> for ArchiveOfOurOwnError {
fn from(err: http::Error) -> ArchiveOfOurOwnError {
ArchiveOfOurOwnError::Http { err }
}
}
impl From<http_req::error::Error> for ArchiveOfOurOwnError {
fn from(err: http_req::error::Error) -> ArchiveOfOurOwnError {
ArchiveOfOurOwnError::HttpReq { err }
}
}
impl std::error::Error for ArchiveOfOurOwnError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
ArchiveOfOurOwnError::Io { ref err } => Some(err),
ArchiveOfOurOwnError::Utf8 { ref err } => Some(err),
ArchiveOfOurOwnError::InvalidEncoding => None,
ArchiveOfOurOwnError::Non200Response => None,
ArchiveOfOurOwnError::Http { ref err } => Some(err),
ArchiveOfOurOwnError::HttpReq { ref err } => Some(err),
}
}
}
/// Scraper for Archive of Our Own stories.
///
/// Holds the pre-compiled CSS selectors used to pull the individual pieces of
/// a story out of the downloaded HTML.
#[derive(Debug)]
pub struct ArchiveOfOurOwn {
// Heading that holds the chapter's name.
chapter_name: Selector,
// Node(s) holding the chapter body; only the first match is converted.
chapter_text: Selector,
// Author link(s) in the story byline.
story_author: Selector,
// Story summary text.
story_summary: Selector,
// Story title.
story_name: Selector,
// Rating tag (matched against "Explicit", "Mature", ...).
story_rating: Selector,
// Fandom tag(s), stored as the story's origins.
story_origins: Selector,
// Chapter counter text, read as `current/expected`.
story_stats_chapters: Selector,
// Language of the story.
story_stats_language: Selector,
// Publication date, parsed as `%Y-%m-%d`.
story_stats_created: Selector,
// Status date, parsed as `%Y-%m-%d` and stored as the update date.
story_stats_updated: Selector,
// Links in the feedback action bar; searched for the "Next Chapter →" link.
story_actions: Selector,
}
impl ArchiveOfOurOwn {
    /// Creates a scraper with the default set of CSS selectors.
    pub fn new() -> Self {
        Self::default()
    }

    /// Scrapes a whole story: its details first, then its chapters in order.
    ///
    /// The path of `url` must have the story ID as its second non-empty segment
    /// and the ID of the first chapter to scrape as its fourth one
    /// (e.g. `/works/<story>/chapters/<chapter>`). Following chapters are
    /// discovered through the "Next Chapter →" link of each scraped page.
    ///
    /// # Errors
    /// Returns an [`ArchiveOfOurOwnError`] when a request fails, a response is
    /// not valid UTF-8, or the chapter conversion reports an error.
    ///
    /// # Panics
    /// Panics when the story or chapter ID is missing from `url`, and in the
    /// cases listed on `get_details` and `get_chapter`.
    pub fn scrape(&self, url: &Uri) -> Result<Story, ArchiveOfOurOwnError> {
        // Validate the URL completely before doing any network request.
        let id = url
            .path()
            .and_then(|p| p.split('/').filter(|s| !s.is_empty()).nth(1))
            .expect("No story ID found in URL");
        let first = url
            .path()
            .and_then(|p| {
                p.split('/')
                    .filter(|s| !s.is_empty())
                    .nth(3)
                    .and_then(|id_hash| id_hash.split('#').next().map(String::from))
            })
            .expect("No chapter ID found in URL");

        log::info!("[{}] Scraping initial details", url);
        let details = self.get_details(url)?;
        let chapters = details.chapters;
        let mut story = Story::new(details);

        log::info!("[{}] Beginning chapter scraping", url);
        let mut next = Some(first);
        for num in 1..=chapters {
            let ch = match next.take() {
                Some(ch) => ch,
                None => {
                    // The page announced more chapters than could be reached by
                    // following the "next" links. Report it once and stop
                    // instead of repeating the message for every missing chapter.
                    log::error!("[error] The scraper is trying to access a chapter that doesn't exist, this isn't good");
                    break;
                }
            };
            log::info!("[{}] Scraping chapter {} [{}]", url, num, ch);
            let (following, chapter) = self.get_chapter(id, &ch)?;
            next = following;
            story.chapters.push(chapter);
        }

        story.words = story.chapters.iter().map(|c| word_count(&c.main)).sum();
        Ok(story)
    }

    /// Downloads `url` and extracts the story-level details from it.
    ///
    /// The creation and update dates fall back to the current time when they
    /// cannot be parsed. The update date is not read at all for a completed
    /// single-chapter story.
    ///
    /// # Errors
    /// Returns an [`ArchiveOfOurOwnError`] when the request fails or the
    /// response is not valid UTF-8.
    ///
    /// # Panics
    /// Panics when the chapter counter is not in `current/expected` form with a
    /// numeric `current`, or when the language or rating is not one of the
    /// values handled below.
    pub fn get_details(&self, url: &Uri) -> Result<Details, ArchiveOfOurOwnError> {
        let res = request(url).map_err(ArchiveOfOurOwnError::from)?;
        let html = Html::parse_document(std::str::from_utf8(&res)?);

        let authors: Vec<String> = select!(string[] <> html => &self.story_author);
        let origins: Vec<String> = select!(string[] <> html => &self.story_origins);
        let name: String = select!(string <> html => &self.story_name)
            .trim()
            .to_string();
        let summary: String = select!(string <> html => &self.story_summary);

        // The counter reads `current/expected`, e.g. `3/10`.
        let chapter_expected: String = select!(string <> html => &self.story_stats_chapters);
        let mut counts = chapter_expected.trim().split('/').map(str::trim);
        let current: &str = counts
            .next()
            .expect("[story_stats_chapters] Chapter counter is empty, did the html change?");
        let expected: &str = counts.next().expect(
            "[story_stats_chapters] Chapter counter is not in `current/expected` form, did the html change?",
        );
        let chapters: u32 = current.parse().expect(
            "[story_stats_chapters] Current chapter count is not a number, did the html change?",
        );
        let state = if current == expected {
            State::Completed
        } else {
            State::InProgress
        };

        let language: Language = match select!(string <> html => &self.story_stats_language).trim()
        {
            "English" => Language::English,
            other => panic!("[story_stats_language] Unsupported language: {:?}", other),
        };
        let rating: Rating = match select!(string <> html => &self.story_rating).trim() {
            "Explicit" => Rating::Explicit,
            "Mature" => Rating::Mature,
            "Teen And Up Audiences" => Rating::Teen,
            "General Audiences" => Rating::General,
            other => panic!("[story_rating] Unsupported rating: {:?}", other),
        };

        let created: Option<DateTime<Utc>> =
            Self::parse_date(&select!(string <> html => &self.story_stats_created));
        let updated: Option<DateTime<Utc>> = if state != State::Completed || chapters != 1 {
            Self::parse_date(&select!(string <> html => &self.story_stats_updated))
        } else {
            None
        };

        Ok(Details {
            name,
            summary,
            chapters,
            language,
            rating,
            state,
            authors,
            origins,
            tags: Vec::new(),
            created: created.unwrap_or_else(Utc::now),
            updated: updated.unwrap_or_else(Utc::now),
        })
    }

    /// Downloads a single chapter of story `id` and converts its text.
    ///
    /// Returns the ID of the following chapter (taken from the
    /// "Next Chapter →" link, `None` when there is no such link) together with
    /// the scraped chapter.
    ///
    /// # Errors
    /// Returns an [`ArchiveOfOurOwnError`] when the chapter URL cannot be
    /// parsed, the request fails, the response is not valid UTF-8, or
    /// `converter::parse` reports an error.
    ///
    /// # Panics
    /// Panics when no node matches the chapter text selector.
    pub fn get_chapter(
        &self,
        id: &str,
        chapter: &str,
    ) -> Result<(Option<String>, Chapter), ArchiveOfOurOwnError> {
        // Presumably a politeness delay between requests; confirm in `utils::sleep`.
        sleep();

        let url = format!(
            "https://archiveofourown.org/works/{}/chapters/{}",
            id, chapter
        )
        .as_str()
        .parse()?;
        let res = request(&url).map_err(ArchiveOfOurOwnError::from)?;
        let html = Html::parse_document(std::str::from_utf8(&res)?);

        // The link looks like `/works/<story>/chapters/<chapter>#anchor`: the
        // chapter ID is the fifth `/`-separated piece, minus any `#` suffix.
        let next: Option<String> = html
            .select(&self.story_actions)
            .find(Self::node_filter)
            .and_then(|node| {
                node.value().attr("href").and_then(|href| {
                    href.split('/')
                        .nth(4)
                        .and_then(|id_hash| id_hash.split('#').next().map(String::from))
                })
            });

        // NOTE(review): only the FIRST node matched by `chapter_text` is
        // converted. Confirm the selector targets a single container, otherwise
        // the remaining matches are dropped.
        let main = converter::parse(
            html.select(&self.chapter_text)
                .next()
                .expect(
                    "[chapter_text] HTML is missing the chapter text node, did the html change?",
                )
                .inner_html(),
        )?;
        let name: String = select!(string <> html => &self.chapter_name);

        Ok((
            next,
            Chapter {
                name,
                words: word_count(&main),
                pre: String::new(),
                post: String::new(),
                main,
            },
        ))
    }

    /// Parses a `%Y-%m-%d` date (surrounding whitespace ignored) as midnight UTC.
    fn parse_date(raw: &str) -> Option<DateTime<Utc>> {
        NaiveDate::parse_from_str(raw.trim(), "%Y-%m-%d")
            .map(|date| date.and_hms(0, 0, 0))
            .map(|dt| DateTime::from_utc(dt, Utc))
            .ok()
    }

    /// Tells whether `node` is the "Next Chapter →" link: its `href` must start
    /// with `/works/` and its first text node must be exactly that label.
    fn node_filter(node: &scraper::element_ref::ElementRef<'_>) -> bool {
        let is_work_link = node
            .value()
            .attr("href")
            .map_or(false, |href| href.starts_with("/works/"));
        is_work_link
            && node
                .text()
                .next()
                .map_or(false, |text| text == "Next Chapter →")
    }
}
impl Default for ArchiveOfOurOwn {
fn default() -> Self {
Self {
chapter_name: Selector::parse(
r#"#chapters > .chapter > div[role="complementary"] > h3"#,
)
.unwrap(),
chapter_text: Selector::parse(r#"#chapters > .chapter > div[role="article"] > p"#)
.unwrap(),
story_author: Selector::parse(
r#"#workskin > .preface > .byline.heading > a[rel="author"]"#,
)
.unwrap(),
story_summary: Selector::parse("#workskin > .preface > .summary > blockquote > p")
.unwrap(),
story_name: Selector::parse("#workskin > .preface > .title").unwrap(),
story_rating: Selector::parse(".work > .rating.tags > ul > li > .tag").unwrap(),
story_origins: Selector::parse(".work > .fandom.tags > ul > li > .tag").unwrap(),
story_stats_chapters: Selector::parse("dl.work > dd.stats > dl.stats > dd.chapters")
.unwrap(),
story_stats_language: Selector::parse("dl.work > dd.language").unwrap(),
story_stats_created: Selector::parse("dl.work > dd.stats > dl.stats > dd.published")
.unwrap(),
story_stats_updated: Selector::parse("dl.work > dd.stats > dl.stats > dd.status")
.unwrap(),
story_actions: Selector::parse("#feedback > .actions > li > a").unwrap(),
}
}
}