novel-cli 0.17.0

A set of tools for downloading novels from the web, manipulating text, and generating EPUB.
Documentation
use std::fs;
use std::ops::Range;
use std::path::{Path, PathBuf};

use clap::Args;
use color_eyre::eyre::{self, Result};
use fluent_templates::Loader;
use hashbrown::HashSet;
use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd, TextMergeWithOffset};

use crate::utils::{self, CurrentDir, Lang};
use crate::{LANG_ID, LOCALES};

// Command-line arguments accepted by the check command (see `execute`).
//
// NOTE(review): plain `//` comments are used here on purpose. With clap's
// derive macros, `///` doc comments feed into the generated help output,
// whereas this struct takes all of its help strings from the localized
// `LOCALES` lookups below.
#[must_use]
#[derive(Args)]
#[command(arg_required_else_help = true,
    about = LOCALES.lookup(&LANG_ID, "check_command"))]
pub struct Check {
    // Positional path. `execute` accepts either a Markdown/txt file or a
    // directory in which such a file can be found.
    #[arg(help = LOCALES.lookup(&LANG_ID, "file_path"))]
    pub file_path: PathBuf,

    // `--basic-check`: when set, `execute` skips the reports about unexpected
    // Markdown tags/events and about irregular characters.
    #[arg(long, default_value_t = false,
        help = LOCALES.lookup(&LANG_ID, "basic_check"))]
    pub basic_check: bool,

    // `--word-count`: when set, `execute` counts the CJK characters found in
    // text events and prints the total at the end.
    #[arg(long, default_value_t = false,
        help = LOCALES.lookup(&LANG_ID, "word_count"))]
    pub word_count: bool,
}

/// Checks the format of a novel file and prints every finding to stdout.
///
/// `config.file_path` may be the Markdown/txt file itself, or a directory in
/// which `utils::try_get_markdown_or_txt_file_name_in_dir` can locate one.
///
/// The following checks are performed on the parsed Markdown events:
///
/// * the metadata block, via `check_metadata`;
/// * `H1` headings against `check_volume_title`, `H2` headings against
///   `check_chapter_title`; any other heading level is reported;
/// * every image destination must be an existing file;
/// * unless `config.basic_check` is set: Markdown tags/events that are not
///   expected in a novel, and characters that are neither CJK, punctuation,
///   ASCII alphanumeric nor a space (each such character is reported once);
/// * if `config.word_count` is set: the number of CJK characters in text
///   events is printed at the end.
///
/// # Errors
///
/// Returns an error when the input path is not usable, the file cannot be
/// read or is not valid UTF-8, the metadata check fails, or the working
/// directory guard reports a failure.
pub fn execute(config: Check) -> Result<()> {
    let (input_file_path, input_file_parent_path) =
        if utils::is_markdown_or_txt_file(&config.file_path)? {
            let file_path = dunce::canonicalize(&config.file_path)?;
            let parent_path = file_path
                .parent()
                .expect("a canonicalized file path always has a parent directory")
                .to_path_buf();

            (file_path, parent_path)
        } else if let Ok(Some(path)) =
            utils::try_get_markdown_or_txt_file_name_in_dir(&config.file_path)
        {
            (path, dunce::canonicalize(&config.file_path)?)
        } else {
            eyre::bail!("Invalid input path: `{}`", config.file_path.display());
        };
    tracing::info!("Input file path: `{}`", input_file_path.display());

    // Image destinations below are tested with `Path::is_file` as written in
    // the document, so relative ones resolve against the process's current
    // directory.
    // NOTE(review): presumably `CurrentDir` switches to the file's directory
    // and `restore` switches back; the early `?` returns between here and the
    // `restore` call skip it unless `CurrentDir` also restores on drop - confirm.
    let current_dir = CurrentDir::new(input_file_parent_path)?;

    let bytes = fs::read(&input_file_path)?;
    let markdown = simdutf8::basic::from_utf8(&bytes)?;
    let mut parser = TextMergeWithOffset::new(
        Parser::new_ext(markdown, Options::ENABLE_YAML_STYLE_METADATA_BLOCKS).into_offset_iter(),
    );

    // Must run first: it pulls the metadata events out of `parser`, so the
    // loop below only sees what follows them.
    let lang = check_metadata(&mut parser)?;

    // Quoted source snippets are truncated to half the terminal width.
    let max_width = (utils::terminal_size().0 / 2) as usize;
    // Irregular characters already reported, so each one is printed only once.
    let mut char_set = HashSet::new();
    // NOTE(review): this flag is maintained on paragraph start/end but is
    // never read anywhere in this function.
    let mut in_paragraph = false;
    let mut word_count = 0_usize;
    parser.for_each(|(event, range)| match event {
        Event::Start(tag) => match tag {
            Tag::Heading { level, .. } => {
                // The range covers the raw heading source, so strip the
                // leading `#` markers and surrounding whitespace.
                let title = markdown[range].trim_start_matches('#').trim();

                if level == HeadingLevel::H1 {
                    if !check_volume_title(title, lang) {
                        println_msg(format!("Irregular volume title format: `{title}`"));
                    }
                } else if level == HeadingLevel::H2 {
                    if !check_chapter_title(title, lang) {
                        println_msg(format!("Irregular chapter title format: `{title}`"));
                    }
                } else {
                    println_msg(format!(
                        "Irregular heading level: `{level:?}`, content: `{title}`"
                    ));
                }
            }
            Tag::Image { dest_url, .. } => {
                let image_path = Path::new(dest_url.as_ref());

                if !image_path.is_file() {
                    println_msg(format!("Image `{}` does not exist", image_path.display()));
                }
            }
            Tag::Paragraph => {
                in_paragraph = true;
            }
            // Everything else is markup that is reported as unexpected.
            Tag::BlockQuote(_)
            | Tag::CodeBlock(_)
            | Tag::List(_)
            | Tag::Item
            | Tag::FootnoteDefinition(_)
            | Tag::Table(_)
            | Tag::TableHead
            | Tag::TableRow
            | Tag::TableCell
            | Tag::Emphasis
            | Tag::Strong
            | Tag::Strikethrough
            | Tag::Link { .. }
            | Tag::HtmlBlock
            | Tag::MetadataBlock(_)
            | Tag::DefinitionList
            | Tag::DefinitionListTitle
            | Tag::DefinitionListDefinition
            | Tag::Superscript
            | Tag::Subscript => {
                if !config.basic_check {
                    let content = console::truncate_str(markdown[range].trim(), max_width, "...");

                    println_msg(format!(
                        "Markdown tag that should not appear: `{tag:?}`, content: `{content}`"
                    ));
                }
            }
        },
        Event::Text(text) => {
            if !config.basic_check {
                for c in text.chars() {
                    let is_regular = unicode_blocks::is_cjk(c)
                        || utils::is_punctuation(c)
                        || c.is_ascii_alphanumeric()
                        || c == ' ';

                    // `insert` returns `true` only the first time a value is
                    // added, which deduplicates the report with one lookup.
                    if !is_regular && char_set.insert(c) {
                        println_msg(format!(
                            "Irregular char: `{}`, at `{}`",
                            c,
                            console::truncate_str(
                                markdown[range.clone()].trim(),
                                max_width,
                                "..."
                            )
                        ));
                    }
                }
            }

            if config.word_count {
                // Only CJK characters count as words.
                word_count += text
                    .chars()
                    .filter(|&c| unicode_blocks::is_cjk(c))
                    .count();
            }
        }
        Event::End(tag) => {
            if let TagEnd::Paragraph = tag {
                in_paragraph = false;
            }
        }
        // Inline/leaf events that are reported as unexpected.
        Event::HardBreak
        | Event::Code(_)
        | Event::Html(_)
        | Event::FootnoteReference(_)
        | Event::SoftBreak
        | Event::Rule
        | Event::TaskListMarker(_)
        | Event::InlineHtml(_)
        | Event::InlineMath(_)
        | Event::DisplayMath(_) => {
            if !config.basic_check {
                let content = console::truncate_str(markdown[range].trim(), max_width, "...");

                println_msg(format!(
                    "Markdown event that should not appear: `{event:?}`, content: `{content}`"
                ));
            }
        }
    });

    if config.word_count {
        println!("Total number of words: {word_count}");
    }

    current_dir.restore()?;

    Ok(())
}

/// Reads the metadata from `parser` through `utils::get_metadata` and
/// returns its language.
///
/// # Errors
///
/// Propagates any error from `utils::get_metadata`, and fails with
/// "Cover image does not exist" when `cover_image_is_ok()` reports `false`.
fn check_metadata<'a, T>(parser: &mut TextMergeWithOffset<'a, T>) -> Result<Lang>
where
    T: Iterator<Item = (Event<'a>, Range<usize>)>,
{
    let metadata = utils::get_metadata(parser)?;

    if !metadata.cover_image_is_ok() {
        // NOTE(review): the `unwrap` assumes `cover_image_is_ok()` can only
        // be `false` when a cover image is set - confirm against `utils`.
        eyre::bail!(
            "Cover image does not exist: `{}`",
            metadata.cover_image.unwrap().display()
        );
    }

    Ok(metadata.lang)
}

/// Prints one finding to stdout, prefixed with the warning marker produced by
/// `utils::emoji`.
fn println_msg(msg: String) {
    let marker = utils::emoji("⚠️");

    println!("{marker} {msg}");
}

/// Expands to a reference to a `regex::Regex` built from the given literal.
///
/// Each call site gets its own `OnceLock` static, so the pattern is compiled
/// the first time that call site runs and reused afterwards.
///
/// # Panics
///
/// Panics on first use if the literal is not a valid regular expression.
macro_rules! regex {
    ($re:literal $(,)?) => {{
        static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
        let compile = || regex::Regex::new($re).unwrap();
        RE.get_or_init(compile)
    }};
}

/// Returns `true` when `title` contains a regular chapter heading for `lang`.
///
/// The expected shape is `第`, then one to seven numerals (Chinese numerals
/// and ASCII digits are both accepted), then `章` or the language-specific
/// `話`/`话`, a single space, and at least one more character.
///
/// The pattern is not anchored, so a match anywhere inside `title` is
/// accepted.
#[must_use]
fn check_chapter_title<T>(title: T, lang: Lang) -> bool
where
    T: AsRef<str>,
{
    // Already a `&str` from here on; no further `as_ref()` calls are needed.
    let title = title.as_ref();

    let pattern = match lang {
        Lang::ZhHant => regex!(r"第([零一二三四五六七八九十百千]|[0-9]){1,7}[章話] .+"),
        Lang::ZhHans => regex!(r"第([零一二三四五六七八九十百千]|[0-9]){1,7}[章话] .+"),
    };

    pattern.is_match(title)
}

/// Returns `true` when `title` is an acceptable volume heading for `lang`.
///
/// A heading is accepted when it contains `第`, one to three numerals
/// (`一`-`十` or ASCII digits), `卷`, a single space and at least one more
/// character, or when it is exactly the language-specific introduction
/// heading (`簡介` / `简介`).
#[must_use]
fn check_volume_title<T>(title: T, lang: Lang) -> bool
where
    T: AsRef<str>,
{
    let heading = title.as_ref();

    // Pick the pattern together with the one literal title that is always allowed.
    let (pattern, intro_heading) = match lang {
        Lang::ZhHant => (regex!(r"第([一二三四五六七八九十]|[0-9]){1,3}卷 .+"), "簡介"),
        Lang::ZhHans => (regex!(r"第([一二三四五六七八九十]|[0-9]){1,3}卷 .+"), "简介"),
    };

    pattern.is_match(heading) || heading == intro_heading
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Simplified-Chinese chapter headings: well-formed ones pass; ones with a
    /// missing title, missing space or missing `第` prefix are rejected.
    #[test]
    fn check_chapter_title_test() {
        let accepted = [
            "第一章 被俘虏的开始",
            "第一百三十二章 标标标标标标标标标",
            "第123章 标题标标标标",
        ];
        for title in accepted {
            assert!(check_chapter_title(title, Lang::ZhHans));
        }

        let rejected = ["第一章 ", "第1二3话", "第123话标题", "123话 标题"];
        for title in rejected {
            assert!(!check_chapter_title(title, Lang::ZhHans));
        }
    }

    /// Simplified-Chinese volume headings: a chapter-style heading and a
    /// heading without a title are rejected.
    #[test]
    fn check_volume_title_test() {
        let accepted = ["第三十二卷 标标标标标标标标标"];
        for title in accepted {
            assert!(check_volume_title(title, Lang::ZhHans));
        }

        let rejected = ["第123话 标题标标标标", "第1卷 "];
        for title in rejected {
            assert!(!check_volume_title(title, Lang::ZhHans));
        }
    }
}