suimu/command/
check.rs

1use std::fs::File;
2use std::path::PathBuf;
3use std::str::FromStr;
4
5use anyhow::{anyhow, ensure, Result};
6use lazy_static::lazy_static;
7use levenshtein::levenshtein;
8use log::{error, info, warn};
9use regex::Regex;
10use structopt::{clap, StructOpt};
11use unicode_normalization::{is_nfc, UnicodeNormalization};
12
13use crate::utils::{check_csv, check_logic};
14use crate::{MaybeMusic, Music, Platform};
15
16#[derive(StructOpt)]
17#[structopt(
18version = clap::crate_version ! (),
19author = clap::crate_authors ! (),
20about = "Validate csv files"
21)]
22pub struct CheckOpt {
23    #[structopt(
24        about = "The CSV file to check",
25        default_value = "suisei-music.csv",
26        index = 1,
27        required = true
28    )]
29    csv_file: PathBuf,
30
31    #[structopt(short, long, about = "Only check formats")]
32    format_only: bool,
33
34    #[structopt(long)]
35    json_output: bool,
36}
37
38/// Return the Levenshtein ratio of two strings. SHall be a value between 0 and
39/// 1.
40fn similarity_ratio(a: &str, b: &str) -> f32 {
41    let len = a.chars().count().max(b.chars().count());
42    1f32 - (levenshtein(a, b) as f32) / (len as f32)
43}
44
45fn similarity_check(
46    field_name: &str,
47    musics: &[MaybeMusic],
48    picker: impl for<'a> Fn(&'a MaybeMusic) -> &'a str,
49) {
50    for (i, one) in musics.iter().map(&picker).enumerate() {
51        for two in musics.iter().map(&picker).skip(i + 1) {
52            if one == two {
53                continue;
54            }
55            let sim = similarity_ratio(one, two);
56            if sim > 0.75 {
57                warn!(
58                    "[{}] {} & {}: Similar titles ({})",
59                    field_name, one, two, sim
60                );
61            }
62        }
63    }
64}
65
66lazy_static! {
67    static ref RE: Regex = Regex::new(r" ?[(\(].+[)\)]$").unwrap();
68}
69
70pub fn check(opts: CheckOpt) -> Result<()> {
71    let mut has_err = false;
72
73    let csv_file: PathBuf = opts.csv_file;
74    info!("CSV file: {:?}", csv_file);
75
76    ensure!(csv_file.exists(), format!("{:?} does not exists", csv_file));
77
78    let read_file = File::open(csv_file).unwrap();
79    let check_result =
80        check_csv(&read_file).map_err(|e| anyhow!(format!("CSV validation failed: {}", e)))?;
81
82    info!(
83        "CSV successfully validated. {} entries found.",
84        check_result.len()
85    );
86
87    if opts.format_only {
88        return Ok(());
89    }
90
91    // Support analysis
92    info!("Checking entry support...");
93    for x in &check_result {
94        if x.video_type.is_empty() {
95            // Often used to skip a conversion
96            warn!("{}: Empty video_type", x);
97            continue;
98        }
99        if let Err(v) = Platform::from_str(&x.video_type) {
100            error!("{}: {}", x, v);
101            has_err = true;
102        }
103    }
104
105    // Potential typo analysis
106    info!("Checking potential typos...");
107    for x in &check_result {
108        if x.title.trim() != x.title {
109            error!("{}: Spaces around title", x);
110            has_err = true;
111        }
112        if x.artist.trim() != x.artist {
113            error!("{}: Spaces around artist", x);
114            has_err = true;
115        }
116    }
117
118    // Unicode NFC check
119    info!("Checking Unicode NFC conformity...");
120    for x in &check_result {
121        if !is_nfc(&x.title) {
122            error!(
123                "{}: Title is not in NFC, please change to '{}'",
124                x,
125                x.title.chars().nfc()
126            );
127            has_err = true;
128        }
129        if !is_nfc(&x.artist) {
130            error!(
131                "{}: Artist is not in NFC, please change to '{}'",
132                x,
133                x.artist.chars().nfc()
134            );
135            has_err = true;
136        }
137    }
138
139    info!("Check similar metadatas...");
140
141    // Title: ignore bracketed suffix
142    let mut check_result_altered = check_result.clone();
143    for i in check_result_altered.iter_mut() {
144        i.title = RE.replace_all(&i.title, "").to_string();
145    }
146
147    similarity_check("Title", &check_result_altered, |x| &x.title);
148    similarity_check("Artist", &check_result, |x| &x.artist);
149
150    info!("Validating fields...");
151
152    let converted_result = check_result
153        .into_iter()
154        .filter_map(|x| {
155            let x_desc = x.to_string();
156            let v: Result<Music> = x.try_into();
157            match v {
158                Ok(m) => Some(m),
159                Err(e) => {
160                    if &e.to_string() == "No status present" {
161                        // Often used to skip a conversion
162                        warn!("{}: Failed to convert to music: {}", x_desc, e);
163                    } else {
164                        error!("{}: Failed to convert to music: {}", x_desc, e);
165                        has_err = true;
166                    }
167                    None
168                }
169            }
170        })
171        .collect::<Vec<Music>>();
172
173    // Logic analysis
174    info!("Checking entry logic...");
175    for x in &converted_result {
176        if let Err(v) = check_logic(x) {
177            error!("{}: {}", x, v);
178            has_err = true;
179        }
180    }
181
182    info!("Check finished.");
183
184    if opts.json_output {
185        let base = serde_json::to_string(&converted_result).unwrap();
186        println!("{}", base);
187    }
188
189    if has_err {
190        Err(anyhow!("Some hard checks didn't pass."))
191    } else {
192        Ok(())
193    }
194}
195
#[test]
fn test_similarity_ratio() {
    // Each row is (left, right, expected ratio).
    let cases: [(&str, &str, f32); 4] = [
        // Identical and completely different ASCII strings
        ("test", "test", 1.0),
        ("abcd", "efgh", 0.0),
        // CJK text: the ratio is based on characters, not bytes
        ("双海亚美", "双海真美", 0.75),
        ("中文Aka", "英文Aka", 0.8),
    ];

    for &(left, right, expected) in &cases {
        assert_eq!(similarity_ratio(left, right), expected);
    }
}