// story-dl 0.2.0
//
// Story web scraping
// Documentation
use {
    crate::{
        converter,
        models::{Chapter, Details, Language, Rating, State, Story},
        select,
        utils::{req, sleep, word_count},
        Error, ScrapeError,
    },
    awc::http::uri::Uri,
    bytes::Bytes,
    chrono::{DateTime, Datelike, NaiveTime, TimeZone, Utc},
    html5ever::{
        driver::{self, ParseOpts},
        tendril::stream::TendrilSink,
    },
    scraper::{Html, Selector},
    std::str,
};

// Selectors used by the scraping functions in this file. Each one is parsed
// from a string literal on first use; the `unwrap()` calls only fire if one of
// these literals is not a valid selector.
lazy_static::lazy_static! {
    // The `<option selected>` directly inside `select#chap_select`; `get_chapter`
    // reads the chapter name from its text.
    static ref CHAPTER_NAME: Selector = Selector::parse("select#chap_select > option[selected]").unwrap();
    // The `#storytext` element; `get_chapter` converts its inner HTML into the
    // chapter body.
    static ref CHAPTER_TEXT: Selector = Selector::parse("#storytext").unwrap();

    // The selectors below are all read by `get_details`.

    // `a.xcontrast_txt` links without a `title` attribute directly under `#profile_top`.
    static ref STORY_AUTHOR: Selector = Selector::parse("#profile_top > a.xcontrast_txt:not([title])").unwrap();
    // The `span.xgray.xcontrast_txt` whose text is split on `-` into
    // rating / language / chapters / dates / status fields.
    static ref STORY_DETAILS: Selector = Selector::parse("#profile_top > span.xgray.xcontrast_txt").unwrap();
    // The `div.xcontrast_txt` directly under `#profile_top`.
    static ref STORY_SUMMARY: Selector = Selector::parse("#profile_top > div.xcontrast_txt").unwrap();
    // The `b.xcontrast_txt` directly under `#profile_top`.
    static ref STORY_NAME: Selector = Selector::parse("#profile_top > b.xcontrast_txt").unwrap();
    // The last `a.xcontrast_txt` child inside `#pre_story_links > span.lc-left`.
    static ref STORY_ORIGINS: Selector = Selector::parse("#pre_story_links > span.lc-left > a.xcontrast_txt:last-child",).unwrap();
}

/// Scrapes a whole story starting from `url`.
///
/// The story ID is the second non-empty segment of the URL path. The page at
/// `url` is requested once and handed to [`get_details`]; afterwards every
/// chapter from `1` up to the reported chapter count is requested from
/// `https://www.fanfiction.net/s/<id>/<chapter>` and handed to
/// [`get_chapter`]. `sleep()` is awaited before each chapter request
/// (presumably to rate-limit the scraper; see `utils::sleep`).
///
/// Both parsing steps run through `actix_threadpool::run` rather than inline
/// in this future.
///
/// After all chapters are collected, `story.words` is set to the sum of
/// `word_count` over every chapter's `main` text.
///
/// # Errors
///
/// Propagates any error produced by the request helper, the sleep helper,
/// parsing the generated chapter URL, or the thread-pool parsing jobs.
///
/// # Panics
///
/// Panics with `"No story ID found in URL"` when the path of `url` has fewer
/// than two non-empty segments.
pub async fn scrape(url: &Uri) -> Result<Story, Error> {
    let id = url
        .path()
        .split('/')
        .filter(|s| !s.is_empty())
        .nth(1)
        .expect("No story ID found in URL");

    log::info!("[{}] Scraping initial details", url);

    let bytes = req(&url).await?;
    let details = actix_threadpool::run(|| get_details(bytes)).await?;

    // Remember the count before `details` is moved into the story.
    let chapters = details.chapters;
    let mut story = Story::new(details);

    log::info!("[{}] Beginning chapter scraping", url);

    // `1..=chapters` already covers the single-chapter case, so no separate
    // branch is needed for `chapters == 1`.
    for page in 1..=chapters {
        log::info!("[{}] Scraping chapter {}", url, page);

        sleep().await?;

        let chapter_url = format!("https://www.fanfiction.net/s/{}/{}", id, page)
            .as_str()
            .parse()?;

        let bytes = req(&chapter_url).await?;
        let chapter = actix_threadpool::run(|| get_chapter(bytes)).await?;

        story.chapters.push(chapter);
    }

    story.words = story.chapters.iter().map(|c| word_count(&c.main)).sum();

    Ok(story)
}

pub fn get_details(bytes: Bytes) -> Result<Details, ScrapeError> {
    let parser = driver::parse_document(Html::new_document(), ParseOpts::default());

    let html = parser.one(std::str::from_utf8(bytes.as_ref())?);

    let name: String = select!(string <> html => &STORY_NAME);
    let summary: String = select!(string <> html => &STORY_SUMMARY);
    let details: String = select!(string <> html => &STORY_DETAILS);
    let authors: Vec<String> = select!(string[] <> html => &STORY_AUTHOR);
    let origins: Vec<String> = select!(string[] <> html => &STORY_ORIGINS);

    let mut chapters = 1u32;
    let mut language = Language::English;
    let mut rating = Rating::Teen;
    let mut state = State::InProgress;

    let mut created: Option<DateTime<Utc>> = None;
    let mut updated: Option<DateTime<Utc>> = None;

    let words = details.split('-').count();

    for (i, s) in details.split('-').map(str::trim).rev().enumerate() {
        if s.starts_with("Chapters: ") {
            if let Some(ch) = s
                .split(':')
                .map(str::trim)
                .nth(1)
                .and_then(|s| s.parse::<u32>().ok())
            {
                chapters = ch;
            }
        }

        if i == words - 2 {
            language = match s {
                "English" => Language::English,
                _ => unreachable!(),
            };
        }

        if s.starts_with("Rated: ") {
            if let Some(ra) = s.split(':').map(str::trim).nth(1).and_then(|s| {
                s.split(' ')
                    .map(str::trim)
                    .filter(|s| !s.is_empty())
                    .nth(1)
                    .map(|s| match s {
                        "MA" => Rating::Explicit,
                        "M" => Rating::Mature,
                        "T" => Rating::Teen,
                        "K" | "K+" => Rating::General,
                        _ => unreachable!("Unknown rating found, please report this"),
                    })
            }) {
                rating = ra;
            }
        }

        if s.starts_with("Status: ") {
            if let Some(st) = s.split(':').map(str::trim).nth(1).map(|s| match s {
                "Complete" => State::Completed,
                _ => unreachable!(),
            }) {
                state = st;
            }
        }

        if s.starts_with("Published: ") {
            if let Some(cr) = s.split(':').map(str::trim).nth(1).and_then(time) {
                if updated.is_none() {
                    updated = Some(cr);
                }

                created = Some(cr);
            }
        }

        if s.starts_with("Updated: ") {
            if let Some(cr) = s.split(':').map(str::trim).nth(1).and_then(time) {
                updated = Some(cr);
            }
        }
    }

    Ok(Details {
        name,
        summary,

        chapters,
        language,
        rating,
        state,

        authors,
        origins,
        tags: Vec::new(),

        created: created.unwrap_or_else(Utc::now),
        updated: updated.unwrap_or_else(Utc::now),
    })
}

/// Parses a single chapter out of a chapter page.
///
/// `bytes` must be the UTF-8 encoded HTML of the page. The chapter body is the
/// inner HTML of the `#storytext` node run through `converter::parse`. The
/// chapter name is the text of the selected chapter option with everything up
/// to and including its first space removed. `pre` and `post` are returned
/// empty, and `words` is `word_count` of the converted body.
///
/// # Errors
///
/// Returns an error when `bytes` is not valid UTF-8 or when
/// `converter::parse` fails.
///
/// # Panics
///
/// Panics when the page has no chapter text node, or when the selected
/// chapter option is missing or has no text.
pub fn get_chapter(bytes: Bytes) -> Result<Chapter, ScrapeError> {
    let source = std::str::from_utf8(bytes.as_ref())?;
    let html = driver::parse_document(Html::new_document(), ParseOpts::default()).one(source);

    let text_node = html
        .select(&CHAPTER_TEXT)
        .next()
        .expect("[chapter_text] HTML is missing the chapter text node, did the html change?");
    let main = converter::parse(text_node.inner_html())?;

    let option_text = html
        .select(&CHAPTER_NAME)
        .next()
        .and_then(|option| option.text().next())
        .expect("[chapter_name] No text in selected element");

    // Keep only what follows the first space; a label without any space
    // yields an empty name.
    let name = option_text
        .splitn(2, ' ')
        .nth(1)
        .unwrap_or("")
        .to_owned();

    let words = word_count(&main);

    Ok(Chapter {
        name,
        words,
        pre: String::new(),
        post: String::new(),
        main,
    })
}

/// Parses a `month/day[/year]` date into midnight UTC of that day.
///
/// The month and day must both be present and parse as unsigned integers,
/// otherwise `None` is returned. When the year part is missing or does not
/// parse as an integer, the current UTC year is used instead.
///
/// Returns `None` (instead of panicking) when the numbers do not form a valid
/// calendar date, e.g. month `13` or day `0`.
fn time(text: &str) -> Option<DateTime<Utc>> {
    let mut parts = text.split('/');

    let month = parts.next()?.parse::<u32>().ok()?;
    let day = parts.next()?.parse::<u32>().ok()?;
    let year = parts
        .next()
        .and_then(|t| t.parse::<i32>().ok())
        .unwrap_or_else(|| Utc::today().year());

    // `ymd_opt` reports out-of-range dates through its return value, whereas
    // `ymd` would panic on them.
    Utc.ymd_opt(year, month, day)
        .single()
        .and_then(|date| date.and_time(NaiveTime::from_hms(0, 0, 0)))
}