jisho 0.2.10

Jisho is a CLI tool & Rust library that provides a Japanese-English dictionary.
Documentation
use bitcode::Decode;
use flate2::read::ZlibDecoder;
use lazy_static::lazy_static;
use rustc_hash::FxHashMap;
use serde::Serialize;
use std::io::Read;

/// Lookup table from a search key to every [`Entry`] filed under that key.
type Dictionary = FxHashMap<String, Vec<Entry>>;

/// A single dictionary entry, as returned by [`lookup`].
///
/// Entries are decoded from the embedded dictionary data (`Decode`) and can be
/// serialized with `serde` (`Serialize`).
#[derive(Clone, Debug, PartialEq, Decode, Serialize)]
pub struct Entry {
    /// Kanji spelling of the word.
    // NOTE(review): how kana-only words fill this field is not visible here — confirm.
    pub kanji: String,
    /// Reading of the word (e.g. `みどり`).
    pub reading: String,
    /// The senses of the word; each one carries its own list of glosses.
    pub meanings: Vec<Sense>,
    /// Ranking value: lookup results are sorted by this field in ascending order.
    // NOTE(review): presumably a lower value means a more common word — confirm
    // against the build script that generates the data.
    pub frequency: i32,
}

/// One sense (meaning) of an [`Entry`].
#[derive(Clone, Debug, PartialEq, Decode, Serialize)]
pub struct Sense {
    /// Glosses that express this sense (e.g. `["greenery", "verdure"]`).
    pub glosses: Vec<String>,
}

lazy_static! {
    static ref J2E: Dictionary = {
        let b = include_bytes!(concat!(env!("OUT_DIR"), "/j2e"));
        if cfg!(feature = "compression") {
            let mut d = ZlibDecoder::new(b.as_slice());
            let mut v = Vec::new();
            d.read_to_end(&mut v).unwrap();
            bitcode::decode(&v).unwrap()
        } else {
            bitcode::decode(b).unwrap()
        }
    };
    static ref E2J: Dictionary = {
        let b = include_bytes!(concat!(env!("OUT_DIR"), "/e2j"));
        if cfg!(feature = "compression") {
            let mut d = ZlibDecoder::new(b.as_slice());
            let mut v = Vec::new();
            d.read_to_end(&mut v).unwrap();
            bitcode::decode(&v).unwrap()
        } else {
            bitcode::decode(b).unwrap()
        }
    };
    static ref READING: Dictionary = {
        let b = include_bytes!(concat!(env!("OUT_DIR"), "/reading"));
        if cfg!(feature = "compression") {
            let mut d = ZlibDecoder::new(b.as_slice());
            let mut v = Vec::new();
            d.read_to_end(&mut v).unwrap();
            bitcode::decode(&v).unwrap()
        } else {
            bitcode::decode(b).unwrap()
        }
    };
}

/// Returns `input` without its first character.
///
/// Works on whole `char`s, so a multi-byte leading character is removed in one
/// piece. An empty string is returned unchanged.
fn strip_first(input: &str) -> &str {
    input
        .chars()
        .next()
        .map_or(input, |first| &input[first.len_utf8()..])
}

/// Returns `input` without its last character.
///
/// Works on whole `char`s, so a multi-byte trailing character is removed in one
/// piece. An empty string is returned unchanged.
fn strip_last(input: &str) -> &str {
    match input.char_indices().next_back() {
        Some((last_start, _)) => &input[..last_start],
        None => input,
    }
}

/// Reports whether `c` lies in the CJK Unified Ideographs block
/// (U+4E00..=U+9FFF) or the CJK Compatibility Ideographs block
/// (U+F900..=U+FAFF).
fn is_kanji(c: &char) -> bool {
    matches!(*c, '\u{4e00}'..='\u{9fff}' | '\u{f900}'..='\u{faff}')
}

/// Reports whether `c` lies in the Hiragana block (U+3040..=U+309F).
fn is_hiragana(c: &char) -> bool {
    ('\u{3040}'..='\u{309f}').contains(c)
}

/// Reports whether `c` lies in the Katakana block (U+30A0..=U+30FF).
fn is_katakana(c: &char) -> bool {
    matches!(*c, '\u{30a0}'..='\u{30ff}')
}

fn collect_exact_results(dictionary: &'static Dictionary, input: &str) -> Vec<&'static Entry> {
    let mut results = Vec::new();
    if dictionary.contains_key(input) {
        let entries = dictionary.get(input).unwrap();
        results.extend(entries);
    }
    results.sort_by_key(|e| e.frequency);
    results
}

fn collect_prefix_results(dictionary: &'static Dictionary, input: &str) -> Vec<&'static Entry> {
    let mut results = Vec::new();
    for key in dictionary.keys() {
        if key.starts_with(input) {
            let entries = dictionary.get(key).unwrap();
            for entry in entries {
                if !results.contains(&entry) {
                    results.push(entry);
                }
            }
        }
    }
    results.sort_by_key(|e| e.frequency);
    results
}

fn collect_postfix_results(dictionary: &'static Dictionary, input: &str) -> Vec<&'static Entry> {
    let mut results = Vec::new();
    for key in dictionary.keys() {
        if key.ends_with(input) {
            let entries = dictionary.get(key).unwrap();
            for entry in entries {
                if !results.contains(&entry) {
                    results.push(entry);
                }
            }
        }
    }
    results.sort_by_key(|e| e.frequency);
    results
}

fn collect_wildcard_results(dictionary: &'static Dictionary, input: &str) -> Vec<&'static Entry> {
    let mut results = Vec::new();
    for key in dictionary.keys() {
        if key.len() == input.len()
            && key
                .chars()
                .zip(input.chars())
                .all(|(k, i)| k == i || i == '?' || i == '')
        {
            let entries = dictionary.get(key).unwrap();
            for entry in entries {
                if !results.contains(&entry) {
                    results.push(entry);
                }
            }
        }
    }
    results.sort_by_key(|e| e.frequency);
    results
}

/// Default lookup strategy: the first non-empty result among an exact match, a
/// prefix match and a postfix match, tried in that order.
///
/// Returns an empty vector when none of the three finds anything.
fn collect_results(dictionary: &'static Dictionary, input: &str) -> Vec<&'static Entry> {
    let exact = collect_exact_results(dictionary, input);
    if !exact.is_empty() {
        return exact;
    }

    let by_prefix = collect_prefix_results(dictionary, input);
    if !by_prefix.is_empty() {
        return by_prefix;
    }

    collect_postfix_results(dictionary, input)
}

/// Matching strategy that `lookup` selects from the markers found in a query.
enum Mode {
    /// No marker in the query: exact match first, then prefix, then postfix.
    Default,
    /// Query starts with `=`: only keys equal to the rest of the query match.
    Exact,
    /// Query ends with an asterisk marker: keys that start with the rest match.
    Prefix,
    /// Query starts with an asterisk marker: keys that end with the rest match.
    Postfix,
    /// Query contains a question-mark marker: matched position by position
    /// against each key.
    Wildcard,
}

pub fn lookup(input_raw: &str) -> Vec<&Entry> {
    let mut mode: Mode = Mode::Default;
    let mut input = input_raw;
    if input_raw.starts_with('=') {
        mode = Mode::Exact;
        input = strip_first(input_raw);
    } else if input_raw.ends_with(['*', '']) {
        mode = Mode::Prefix;
        input = strip_last(input_raw);
    } else if input_raw.starts_with(['*', '']) {
        mode = Mode::Postfix;
        input = strip_first(input_raw);
    } else if input_raw.contains(['?', '']) {
        mode = Mode::Wildcard;
    }

    if input.chars().any(|c| is_kanji(&c)) {
        match mode {
            Mode::Default => collect_results(&J2E, input),
            Mode::Exact => collect_exact_results(&J2E, input),
            Mode::Prefix => collect_prefix_results(&J2E, input),
            Mode::Postfix => collect_postfix_results(&J2E, input),
            Mode::Wildcard => collect_wildcard_results(&J2E, input),
        }
    } else if input.chars().all(|c| is_hiragana(&c) || is_katakana(&c)) {
        match mode {
            Mode::Default => collect_results(&READING, input),
            Mode::Exact => collect_exact_results(&READING, input),
            Mode::Prefix => collect_prefix_results(&READING, input),
            Mode::Postfix => collect_postfix_results(&READING, input),
            Mode::Wildcard => collect_wildcard_results(&READING, input),
        }
    } else {
        match mode {
            Mode::Default => collect_results(&E2J, input),
            Mode::Exact => collect_exact_results(&E2J, input),
            Mode::Prefix => collect_prefix_results(&E2J, input),
            Mode::Postfix => collect_postfix_results(&E2J, input),
            Mode::Wildcard => collect_wildcard_results(&E2J, input),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The entry for 緑 (みどり, "green") that each lookup below is expected to find.
    fn entry() -> Entry {
        Entry {
            kanji: "緑".to_string(),
            reading: "みどり".to_string(),
            meanings: vec![
                Sense {
                    glosses: vec!["green".to_string()],
                },
                Sense {
                    glosses: vec!["greenery".to_string(), "verdure".to_string()],
                },
            ],
            frequency: 3,
        }
    }

    /// A kanji query is answered with the expected entry first.
    #[test]
    fn kanji_lookup() {
        let results = lookup("緑");
        assert_eq!(results.first().unwrap(), &&entry())
    }

    /// A kana query is answered with the expected entry first.
    #[test]
    fn reading_lookup() {
        let results = lookup("みどり");
        assert_eq!(results.first().unwrap(), &&entry())
    }

    /// An English query contains the expected entry and no adjacent duplicates.
    #[test]
    fn meaning_lookup() {
        let mut results = lookup("green");
        assert!(results.contains(&&entry()));

        // `dedup` only removes consecutive equal elements.
        let results_len = results.len();
        results.dedup();
        assert_eq!(results_len, results.len())
    }
}