story-dl 0.1.0

Story web scraping
Documentation
use {
    clap::{App, Arg},
    chrono::Local,
    log::{Level, Log, Metadata, Record},
    std::{fmt, io::{self, Write}, fs},
    story_dl::{models::Story, ArchiveOfOurOwn, FanFiction, Uri},
};

/// Program entry point.
///
/// Installs the stdout logger at `Info` level, parses the command line and
/// then downloads either every story listed in the JSON file given with `-f`
/// or the single story given with `-u`. Each story is written in the format
/// selected with `-o` (default `epub`).
///
/// When neither `-f` nor `-u` is supplied nothing is downloaded and `Ok(())`
/// is returned.
///
/// # Errors
///
/// Propagates the first logger, parsing, scraping or export error.
fn main() -> Result<(), Error> {
    let logger = Logger::new(log::Level::Info)?;
    log::set_boxed_logger(Box::new(logger))?;
    log::set_max_level(log::Level::Info.to_level_filter());

    // Kept in a local so the argument definition can borrow it.
    let formats = Output::variants();

    let file_arg = Arg::with_name("file")
        .short("f")
        .value_name("FILE")
        .takes_value(true)
        .conflicts_with("url")
        .help("File to use as a scraping list");

    let url_arg = Arg::with_name("url")
        .short("u")
        .value_name("URL")
        .takes_value(true)
        .help("URL of the story to download");

    let output_arg = Arg::with_name("output")
        .short("o")
        .value_name("FORMAT")
        .possible_values(&formats)
        .default_value("epub")
        .required(true)
        .help("Output format of the downloaded story");

    let matches = App::new(clap::crate_name!())
        .author(clap::crate_authors!())
        .about(clap::crate_description!())
        .version(clap::crate_version!())
        .arg(file_arg)
        .arg(url_arg)
        .arg(output_arg)
        .get_matches();

    let downloader = Downloader::new();

    // `output` always has a value because the argument declares a default.
    let output: Output = matches.value_of("output").unwrap().parse()?;

    if let Some(file) = matches.value_of("file") {
        let raw = fs::read(file)?;
        let imports: Import = serde_json::from_slice(&raw[..])?;

        for import in imports {
            // Both element shapes carry nothing but the URL text.
            let address = match import {
                Element::Text(url) | Element::Url { url } => url,
            };
            let url: Uri = address.parse()?;

            let story = downloader.scrape(&url)?;

            Downloader::export(story, output)?;
        }
    } else if let Some(address) = matches.value_of("url") {
        let url: Uri = address.parse()?;

        let story = downloader.scrape(&url)?;

        Downloader::export(story, output)?;
    }

    Ok(())
}

/// Serialization format used when a downloaded story is written to disk.
#[derive(Copy, Clone)]
enum Output {
    /// EPUB e-book.
    EPub,
    /// JSON document.
    Json,
    /// MessagePack document.
    MessagePack,
}

impl Output {
    /// Command-line names of all formats, in declaration order.
    fn variants() -> [&'static str; 3] {
        let names = ["epub", "json", "message-pack"];
        names
    }

    /// File extension (without the leading dot) used for this format.
    fn file_extension(self) -> &'static str {
        match self {
            Output::MessagePack => "msgpk",
            Output::Json => "json",
            Output::EPub => "epub",
        }
    }
}

impl std::str::FromStr for Output {
    type Err = Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "epub" => Ok(Output::EPub),
            "json" => Ok(Output::Json),
            "message-pack" => Ok(Output::MessagePack),
            _ => Err(Error::ParseOutput),
        }
    }
}

/// Prints the same spelling that `FromStr` accepts.
impl fmt::Display for Output {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let name = match *self {
            Output::EPub => "epub",
            Output::Json => "json",
            Output::MessagePack => "message-pack",
        };

        f.write_str(name)
    }
}

/// Bundles one scraper per supported site so a story can be fetched from its
/// URL and exported to disk.
pub struct Downloader {
    /// Scraper used for URLs recognised as Archive of Our Own.
    archive_of_our_own: ArchiveOfOurOwn,
    /// Scraper used for URLs recognised as FanFiction.
    fanfiction: FanFiction,
}

impl Downloader {
    pub fn new() -> Self {
        Self::default()
    }

    fn scrape(&self, url: &Uri) -> Result<Story, Error> {
        let site = Site::from_url(url).expect("Given url doesn't match any known site");

        let story = match site {
            Site::ArchiveOfOurOwn => self.archive_of_our_own.scrape(url)?,
            Site::FanFiction => self.fanfiction.scrape(url)?,
        };

        Ok(story)
    }

    fn export(story: Story, output: Output) -> Result<(), Error> {
        let mut buffer = Vec::with_capacity(story.words as usize);

        match output {
            Output::EPub => {
                use {
                    comrak::{markdown_to_html, ComrakOptions},
                    epub_builder::{EpubBuilder, EpubContent, ZipLibrary},
                };

                let options = ComrakOptions::default();

                let mut epub = EpubBuilder::new(ZipLibrary::new()?)?;

                epub.metadata("title", &story.name)?;
                epub.metadata("author", story.authors.join(", "))?;
                epub.metadata("description", &story.summary)?;

                for (i, chapter) in story.chapters.iter().enumerate() {
                    let body = itertools::join(
                        &[
                            format!(
                                r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta http-equiv="Content-Style-Type" content="text/css" />
  <meta name="generator" content="Rust EPUB library" />
  <title>{}</title>
  <link rel="stylesheet" type="text/css" href="stylesheet.css" />
</head>
<body>"#,
                                chapter.name
                            ),
                            markdown_to_html(&chapter.pre, &options),
                            markdown_to_html(&chapter.main, &options),
                            markdown_to_html(&chapter.post, &options),
                            String::from("</body></html>"),
                        ],
                        "<br/><hr/><br/>",
                    );

                    epub.add_content(
                        EpubContent::new(format!("chapter-{}.xhtml", i + 1), body.as_bytes())
                            .title(&chapter.name),
                    )?;
                }

                epub.inline_toc();

                epub.generate(&mut buffer)?;
            }
            Output::Json => {
                serde_json::to_writer(&mut buffer, &story)?;
            }
            Output::MessagePack => {
                rmps::encode::write(&mut buffer, &story)?;
            }
        }

        fs::write(
            format!(
                "{} - {}.{}",
                story.name,
                story.authors.join(", "),
                output.file_extension()
            ),
            &buffer,
        )?;

        Ok(())
    }
}

impl Default for Downloader {
    fn default() -> Self {
        Self {
            archive_of_our_own: ArchiveOfOurOwn::new(),
            fanfiction: FanFiction::new(),
        }
    }
}

/// Sites a story can be scraped from.
enum Site {
    /// archiveofourown.org
    ArchiveOfOurOwn,
    /// fanfiction.net
    FanFiction,
}

impl Site {
    /// Determines the site from the host of `url`.
    ///
    /// Host names are compared case-insensitively, so `WWW.FanFiction.net`
    /// is recognised just like `www.fanfiction.net`. Returns `None` when the
    /// URL has no host or the host is not one of the known names.
    fn from_url(url: &Uri) -> Option<Self> {
        // Normalise the case first because DNS names are case-insensitive.
        let host = url.host()?.to_ascii_lowercase();

        match host.as_str() {
            "archiveofourown.org" | "www.archiveofourown.org" => Some(Site::ArchiveOfOurOwn),
            "fanfiction.net" | "www.fanfiction.net" | "m.fanfiction.net" => Some(Site::FanFiction),
            _ => None,
        }
    }
}

/// Contents of a scraping list file: a sequence of entries, each naming one
/// story URL. `main` deserializes it from JSON.
pub type Import = Vec<Element>;

/// One entry of a scraping list.
///
/// The enum is `#[serde(untagged)]`, so no variant name appears in the
/// serialized form; the variant is chosen from the shape of the value.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
#[serde(untagged)]
pub enum Element {
    /// A map with a single `url` field holding the story URL.
    Url { url: String },
    /// A bare string holding the story URL.
    Text(String),
}

/// Destination that formatted log messages are written to.
pub trait Drain {
    /// Level configured for this drain. `Logger::enabled` accepts a record
    /// when the record's level is `<=` this value.
    fn level(&self) -> Level;
    /// Reports whether `record` should be skipped by this drain.
    // NOTE(review): nothing in this file calls `ignore`; confirm the intended
    // semantics before relying on it.
    fn ignore(&self, record: &Record) -> bool;
    /// Writes one already formatted message to the drain.
    fn write(&self, string: &str) -> std::io::Result<()>;
    /// Flushes any output buffered by the drain.
    fn flush(&self) -> std::io::Result<()>;
}

/// Drain that writes log messages to standard output.
pub struct StdoutDrain {
    /// Level reported through `Drain::level`.
    level: Level,
}

impl StdoutDrain {
    /// Creates a drain configured with `level`.
    ///
    /// The current implementation always returns `Ok`.
    pub fn new(level: Level) -> io::Result<Self> {
        let drain = StdoutDrain { level };

        Ok(drain)
    }
}

impl Drain for StdoutDrain {
    /// Returns the level this drain was created with.
    fn level(&self) -> Level {
        self.level
    }

    /// Never skips a record.
    fn ignore(&self, _record: &Record) -> bool {
        false
    }

    /// Writes `string` followed by a newline to standard output.
    fn write(&self, string: &str) -> io::Result<()> {
        let stdout = io::stdout();
        let mut handle = stdout.lock();

        writeln!(handle, "{}", string)
    }

    /// Flushes standard output.
    fn flush(&self) -> io::Result<()> {
        let mut stdout = io::stdout();

        stdout.flush()
    }
}

/// Every failure the binary can report; `main` returns it directly.
#[derive(Debug)]
enum Error {
    /// File system or other I/O failure.
    Io { err: std::io::Error },

    /// Failure reported by the Archive of Our Own scraper.
    ArchiveOfOurOwn { err: story_dl::ArchiveOfOurOwnError },
    /// Failure reported by the FanFiction scraper.
    FanFiction { err: story_dl::FanFictionError },

    /// Failure while building the EPUB file.
    EPub { err: epub_builder::Error },
    /// Failure while reading or writing JSON.
    Json { err: serde_json::Error },
    /// Failure while encoding MessagePack.
    MessagePack { err: rmps::encode::Error },

    /// Error coming from the `http_req` crate.
    HttpReq { err: http_req::error::Error },
    /// The global logger could not be installed.
    Logger { err: log::SetLoggerError },
    /// The output format name was not one of the accepted values.
    ParseOutput,
}

impl From<std::io::Error> for Error {
    fn from(err: std::io::Error) -> Error {
        Error::Io { err }
    }
}

impl From<story_dl::ArchiveOfOurOwnError> for Error {
    fn from(err: story_dl::ArchiveOfOurOwnError) -> Error {
        Error::ArchiveOfOurOwn { err }
    }
}

impl From<story_dl::FanFictionError> for Error {
    fn from(err: story_dl::FanFictionError) -> Error {
        Error::FanFiction { err }
    }
}

impl From<epub_builder::Error> for Error {
    fn from(err: epub_builder::Error) -> Error {
        Error::EPub { err }
    }
}

impl From<serde_json::Error> for Error {
    fn from(err: serde_json::Error) -> Error {
        Error::Json { err }
    }
}

impl From<rmps::encode::Error> for Error {
    fn from(err: rmps::encode::Error) -> Error {
        Error::MessagePack { err }
    }
}

impl From<http_req::error::Error> for Error {
    fn from(err: http_req::error::Error) -> Error {
        Error::HttpReq { err }
    }
}

impl From<log::SetLoggerError> for Error {
    fn from(err: log::SetLoggerError) -> Error {
        Error::Logger { err }
    }
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::Io { err } => write!(f, "io error {}", err),

            Error::ArchiveOfOurOwn { err } => write!(f, "archive of our own error {}", err),
            Error::FanFiction { err } => write!(f, "fanfiction error {}", err),

            Error::EPub { err } => write!(f, "epub error {}", err),
            Error::Json { err } => write!(f, "json {}", err),
            Error::MessagePack { err } => write!(f, "message-pack error {}", err),

            Error::HttpReq { err } => write!(f, "http request error {}", err),
            Error::Logger { err } => write!(f, "log {}", err),
            Error::ParseOutput => write!(f, "valid values: [epub, json, message-pack]",),
        }
    }
}

impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Error::Io { ref err } => Some(err),

            Error::ArchiveOfOurOwn { ref err } => Some(err),
            Error::FanFiction { ref err } => Some(err),

            Error::EPub { ref err } => Some(err),
            Error::Json { ref err } => Some(err),
            Error::MessagePack { ref err } => Some(err),

            Error::HttpReq { ref err } => Some(err),
            Error::Logger { ref err } => Some(err),
            Error::ParseOutput => None,
        }
    }
}

/// `log` backend that forwards formatted records to a [`StdoutDrain`].
pub struct Logger {
    /// Drain receiving every formatted record.
    stdout: StdoutDrain,
}

impl Logger {
    /// Creates a logger whose stdout drain is configured with `level`.
    ///
    /// # Errors
    ///
    /// Propagates any error from `StdoutDrain::new`.
    pub fn new(level: Level) -> io::Result<Self> {
        let stdout = StdoutDrain::new(level)?;

        Ok(Logger { stdout })
    }

    /// Renders `record` as `<local time> <level> [<target>] <message>`.
    ///
    /// The level is left-aligned in a five character column. When the
    /// record's target is empty its module path is used, or an empty string
    /// if that is missing too.
    fn log_format(record: &Record) -> String {
        let target = match record.target() {
            "" => record.module_path().unwrap_or_default(),
            named => named,
        };

        let timestamp = Local::now().format("%Y-%m-%d %H:%M:%S,%3f");
        let level = record.level().to_string();

        format!(
            "{} {:<5} [{}] {}",
            timestamp,
            level,
            target,
            record.args(),
        )
    }
}

impl Log for Logger {
    /// A record is enabled when its level is `<=` the stdout drain's level.
    fn enabled(&self, metadata: &Metadata) -> bool {
        metadata.level() <= self.stdout.level()
    }

    /// Formats `record` and writes it to the stdout drain.
    ///
    /// Records that are not enabled are dropped. A write failure is reported
    /// on standard error instead of being propagated, because `Log::log`
    /// cannot return an error.
    fn log(&self, record: &Record) {
        // The `log` facade does not guarantee that `enabled` was consulted
        // before `log` is called, so the filtering has to happen here.
        if !self.enabled(record.metadata()) {
            return;
        }

        let msg = Self::log_format(record);

        if let Err(err) = self.stdout.write(&msg) {
            eprintln!("Could not write to target: {}", err);
        }
    }

    /// Flushes the stdout drain, reporting a failure on standard error.
    fn flush(&self) {
        if let Err(err) = self.stdout.flush() {
            eprintln!("Could not flush drain: {}", err);
        }
    }
}