// story-dl 0.2.0
//
// Story web scraping
// Documentation
use {
    crate::{
        converter,
        models::{Chapter, Details, Language, Rating, State, Story},
        select,
        utils::{req, sleep, word_count},
        Error, ScrapeError,
    },
    awc::http::uri::Uri,
    bytes::Bytes,
    chrono::{DateTime, NaiveDate, Utc},
    html5ever::{
        driver::{self, ParseOpts},
        tendril::stream::TendrilSink,
    },
    scraper::{Html, Selector},
};

// CSS selectors used by this scraper. Each one is parsed on first use; the
// `unwrap()` calls panic at that point if a selector string is not valid CSS.
lazy_static::lazy_static! {
    // Chapter page: heading and body text, read by `get_chapter`.
    static ref CHAPTER_NAME: Selector = Selector::parse(r#"#chapters > .chapter > div[role="complementary"] > h3"#,).unwrap();
    static ref CHAPTER_TEXT: Selector = Selector::parse(r#"#chapters > .chapter > div[role="article"] > p"#).unwrap();

    // Preface of the work: author links, summary and title, read by `get_details`.
    static ref STORY_AUTHOR: Selector = Selector::parse(r#"#workskin > .preface > .byline.heading > a[rel="author"]"#).unwrap();
    static ref STORY_SUMMARY: Selector = Selector::parse("#workskin > .preface > .summary > blockquote > p").unwrap();
    static ref STORY_NAME: Selector = Selector::parse("#workskin > .preface > .title").unwrap();

    // Tag lists: rating and fandom ("origins"), read by `get_details`.
    static ref STORY_RATING: Selector = Selector::parse(".work > .rating.tags > ul > li > .tag").unwrap();
    static ref STORY_ORIGINS: Selector = Selector::parse(".work > .fandom.tags > ul > li > .tag").unwrap();

    // Stats block: chapter count ("current/expected"), language, and the
    // published / status entries that `get_details` parses as `%Y-%m-%d` dates.
    static ref STORY_STATS_CHAPTERS: Selector = Selector::parse("dl.work > dd.stats > dl.stats > dd.chapters").unwrap();
    static ref STORY_STATS_LANGUAGE: Selector = Selector::parse("dl.work > dd.language").unwrap();
    static ref STORY_STATS_CREATED: Selector = Selector::parse("dl.work > dd.stats > dl.stats > dd.published").unwrap();
    static ref STORY_STATS_UPDATED: Selector = Selector::parse("dl.work > dd.stats > dl.stats > dd.status").unwrap();

    // Action links under the feedback section; `get_chapter` searches these
    // for the "Next Chapter →" link.
    static ref STORY_ACTIONS: Selector = Selector::parse("#feedback > .actions > li > a").unwrap();
}

/// Scrapes a whole story, chapter by chapter, starting from `url`.
///
/// The path of `url` is expected to look like
/// `/works/<story id>/chapters/<chapter id>`: the second non-empty path
/// segment is used as the story ID and the fourth as the ID of the first
/// chapter to fetch.
///
/// The page at `url` is fetched once and handed to [`get_details`]; the
/// reported chapter count then bounds how many chapter pages are requested.
/// Each chapter page is parsed by [`get_chapter`], which also yields the ID of
/// the following chapter. `sleep()` is awaited before every chapter request
/// (presumably to rate-limit the scraper; confirm against `utils::sleep`).
/// HTML parsing runs through `actix_threadpool::run`.
///
/// After scraping, `story.words` is set to the sum of `word_count` over the
/// `main` text of every collected chapter.
///
/// # Errors
///
/// Propagates any error from the requests, from `sleep()`, from parsing the
/// chapter URI, and from the parsing jobs.
///
/// # Panics
///
/// Panics if the URL path does not contain a story ID or a chapter ID segment.
pub async fn scrape(url: &Uri) -> Result<Story, Error> {
    let id = url
        .path()
        .split('/')
        .filter(|s| !s.is_empty())
        .nth(1)
        .expect("No story ID found in URL");

    log::info!("[{}] Scraping initial details", url);

    let html = req(url).await?;

    let details = actix_threadpool::run(|| get_details(html)).await?;

    let chapters = details.chapters;

    let mut story = Story::new(details);

    log::info!("[{}] Beginning chapter scraping", url);

    let first = url
        .path()
        .split('/')
        .filter(|s| !s.is_empty())
        .nth(3)
        .and_then(|id_hash| id_hash.split('#').next().map(String::from))
        .expect("No chapter ID found in URL");

    // ID of the chapter to fetch next; refreshed from each page's
    // "next chapter" link. A single-chapter story is simply one iteration.
    let mut next = Some(first);

    for num in 1..=chapters {
        let ch = match next.take() {
            Some(ch) => ch,
            None => {
                log::error!("[error] The scraper is trying to access a chapter that doesn't exist, this isn't good");

                // Without a link there is nothing left to follow, so stop
                // instead of reporting the same problem for every remaining
                // chapter number.
                break;
            }
        };

        log::info!("[{}] Scraping chapter {} [{}]", url, num, ch);

        sleep().await?;

        let uri = format!("https://archiveofourown.org/works/{}/chapters/{}", id, &ch)
            .parse::<Uri>()?;

        let bytes = req(&uri).await?;

        let (following, chapter) = actix_threadpool::run(|| get_chapter(bytes)).await?;

        next = following;

        story.chapters.push(chapter);
    }

    story.words = story.chapters.iter().map(|c| word_count(&c.main)).sum();

    Ok(story)
}

/// Extracts the story-level metadata from the HTML of a work page.
///
/// `bytes` must be UTF-8 encoded HTML. The returned [`Details`] carries:
///
/// * `name` (trimmed), `summary`, `authors` and `origins`, read through the
///   corresponding selectors;
/// * `chapters`, the number before the `/` in the chapter stat;
/// * `state`, `Completed` when both sides of the `/` in the chapter stat are
///   equal and `InProgress` otherwise;
/// * `created` / `updated`, parsed as `%Y-%m-%d` dates at midnight UTC. The
///   updated date is not looked up for a completed one-chapter story. Either
///   value falls back to `Utc::now()` when it is absent or fails to parse;
/// * `tags`, which is always left empty here.
///
/// # Errors
///
/// Returns an error if `bytes` is not valid UTF-8.
///
/// # Panics
///
/// Panics when the chapter stat does not start with a number that fits in a
/// `u32`, when it contains no `/`, when the language is anything other than
/// `English`, or when the rating is not one of `Explicit`, `Mature`,
/// `Teen And Up Audiences` or `General Audiences`.
pub fn get_details(bytes: Bytes) -> Result<Details, ScrapeError> {
    /// Parses a `%Y-%m-%d` date and pins it to midnight UTC.
    fn midnight_utc(raw: &str) -> Option<DateTime<Utc>> {
        NaiveDate::parse_from_str(raw, "%Y-%m-%d")
            .map(|date| date.and_hms(0, 0, 0))
            .map(|dt| DateTime::from_utc(dt, Utc))
            .ok()
    }

    let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
    let markup = std::str::from_utf8(bytes.as_ref())?;
    let html = parser.one(markup);

    let authors: Vec<String> = select!(string[] <> html => &STORY_AUTHOR);
    let origins: Vec<String> = select!(string[] <> html => &STORY_ORIGINS);

    let name: String = select!(string <> html => &STORY_NAME).trim().to_string();
    let summary: String = select!(string <> html => &STORY_SUMMARY);

    // The chapter stat reads "<posted>/<planned>"; split it once and reuse the
    // pieces for both the count and the completion state.
    let chapter_expected: String = select!(string <> html => &STORY_STATS_CHAPTERS);
    let mut counts = chapter_expected.split('/');
    let posted: Option<&str> = counts.next();

    let chapters: u32 = posted.and_then(|s| s.parse::<u32>().ok()).unwrap();

    let language_text: String = select!(string <> html => &STORY_STATS_LANGUAGE);
    let language: Language = match language_text.trim() {
        "English" => Language::English,
        _ => unreachable!(),
    };

    let rating_text: String = select!(string <> html => &STORY_RATING);
    let rating: Rating = match rating_text.trim() {
        "Explicit" => Rating::Explicit,
        "Mature" => Rating::Mature,
        "Teen And Up Audiences" => Rating::Teen,
        "General Audiences" => Rating::General,
        _ => unreachable!(),
    };

    let planned: &str = counts.next().unwrap();
    let state = if posted != Some(planned) {
        State::InProgress
    } else {
        State::Completed
    };

    let created_text: String = select!(string <> html => &STORY_STATS_CREATED);
    let created: Option<DateTime<Utc>> = midnight_utc(&created_text);

    // The status entry is only consulted when the story is unfinished or has
    // more than one chapter.
    let updated: Option<DateTime<Utc>> = if state == State::Completed && chapters == 1 {
        None
    } else {
        let updated_text: String = select!(string <> html => &STORY_STATS_UPDATED);
        midnight_utc(&updated_text)
    };

    Ok(Details {
        name,
        summary,

        chapters,
        language,
        rating,
        state,

        authors,
        origins,
        tags: Vec::new(),

        created: created.unwrap_or_else(Utc::now),
        updated: updated.unwrap_or_else(Utc::now),
    })
}

/// Parses one chapter page.
///
/// Returns a pair of:
///
/// 1. the ID of the next chapter, if the page has a link accepted by
///    `node_filter`. The ID is the fifth `/`-separated piece of that link's
///    `href`, cut at the first `#`;
/// 2. the [`Chapter`] itself, with `name` read through `CHAPTER_NAME`, `main`
///    produced by `converter::parse`, `words` counted from `main`, and `pre` /
///    `post` left empty.
///
/// # Errors
///
/// Returns an error if `bytes` is not valid UTF-8 or if `converter::parse`
/// fails.
///
/// # Panics
///
/// Panics if nothing on the page matches `CHAPTER_TEXT`.
pub fn get_chapter(bytes: Bytes) -> Result<(Option<String>, Chapter), ScrapeError> {
    let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
    let markup = std::str::from_utf8(bytes.as_ref())?;
    let html = parser.one(markup);

    let next: Option<String> = html
        .select(&STORY_ACTIONS)
        .find(node_filter)
        .and_then(|link| {
            let href = link.value().attr("href")?;
            let id_hash = href.split('/').nth(4)?;

            // TODO: figure out a way to avoid the &str -> String allocation.
            id_hash.split('#').next().map(String::from)
        });

    // NOTE(review): only the first element matched by CHAPTER_TEXT is
    // converted, and that selector targets `p` elements — confirm that taking
    // a single match is intended.
    let raw_text = html
        .select(&CHAPTER_TEXT)
        .next()
        .expect("[chapter_text] HTML is missing the chapter text node, did the html change?")
        .inner_html();

    let main = converter::parse(raw_text)?;

    let name: String = select!(string <> html => &CHAPTER_NAME);

    let words = word_count(&main);

    let chapter = Chapter {
        name,
        words,
        pre: String::new(),
        post: String::new(),
        main,
    };

    Ok((next, chapter))
}

/// Returns `true` when `node` is the link to the next chapter.
///
/// A node qualifies when its `href` attribute starts with `/works/` and its
/// first text chunk is exactly `Next Chapter →`. The text is only inspected
/// once the `href` check has passed.
fn node_filter(node: &scraper::element_ref::ElementRef<'_>) -> bool {
    let links_to_work = node
        .value()
        .attr("href")
        .map_or(false, |href| href.starts_with("/works/"));

    links_to_work && node.text().next() == Some("Next Chapter →")
}