#![warn(clippy::all)]
#![warn(missing_docs)]
#![warn(missing_doc_code_examples)]
use std::collections::HashSet;
/// English names of the languages known to this crate.
///
/// The order is significant: entry `i` of this table corresponds to entry `i` of
/// [`LANGUAGES_ISO_693_1`] and [`LANGUAGES_ISO_693_2T`]. The code-conversion helpers look up
/// the position of a code in those tables and use it to index into this one.
///
/// NOTE(review): `"greek"` is listed here, but neither `get` nor `get_nltk` has a match arm
/// for it, so requesting it panics. Confirm whether a word list is missing.
pub const LANGUAGES: [&str; 32] = ["arabic", "azerbaijani", "catalan", "danish", "english", "french",
"hindi", "indonesian", "norwegian", "portuguese", "russian", "spanish", "turkish", "vietnamese",
"bulgarian", "czech", "dutch", "finnish", "german", "hungarian", "italian", "polish",
"romanian", "slovak", "swedish", "ukrainian", "hebrew", "greek", "kazakh", "nepali", "slovenian",
"tajik"];
/// Two-letter language codes, index-aligned with [`LANGUAGES`].
///
/// A 2-character input to `get` / `get_nltk` is searched for in this table, and the matching
/// position selects the language name from [`LANGUAGES`].
///
/// NOTE(review): the identifier says "693", but the language-code standard is presumably
/// ISO 639-1. The name is left untouched here because it is public.
/// NOTE(review): `"in"` (Indonesian) and `"nn"` (Norwegian) look non-canonical. ISO 639-1
/// appears to assign `id` to Indonesian, and `nn` denotes Norwegian Nynorsk specifically.
/// Verify against the standard before relying on these two entries.
pub const LANGUAGES_ISO_693_1: [&str; 32] = ["ar", "az", "ca", "da", "en", "fr",
"hi", "in", "nn", "pt", "ru", "es", "tr", "vi",
"bg", "cs", "nl", "fi", "de", "hu", "it", "pl",
"ro", "sk", "sv", "uk", "he", "el", "kk", "ne", "sl",
"tg"];
/// Three-letter language codes, index-aligned with [`LANGUAGES`].
///
/// A 3-character input to `get` / `get_nltk` is searched for in this table, and the matching
/// position selects the language name from [`LANGUAGES`].
///
/// NOTE(review): the identifier says "693", but the language-code standard is presumably
/// ISO 639-2 (terminological codes). The name is left untouched here because it is public.
/// NOTE(review): `"nno"` (Norwegian) denotes Norwegian Nynorsk specifically. Confirm this is
/// the intended code.
pub const LANGUAGES_ISO_693_2T: [&str; 32] = ["ara", "aze", "cat", "dan", "eng", "fra",
"hin", "ind", "nno", "por", "rus", "spa", "tur", "vie",
"bul", "ces", "nld", "fin", "deu", "hun", "ita", "pol",
"ron", "slk", "swe", "ukr", "heb", "ell", "kaz", "nep", "slv",
"tgk"];
pub fn get(language: &str) -> Vec<String> {
match convert_language_code(language) {
"english" => read_from_bytes(include_bytes!("savand/english.txt")),
"hebrew" => read_from_bytes(include_bytes!("savand/hebrew.txt")),
"arabic" => read_from_bytes(include_bytes!("savand/arabic.txt")),
"catalan" => read_from_bytes(include_bytes!("savand/catalan.txt")),
"danish" => read_from_bytes(include_bytes!("savand/danish.txt")),
"french" => read_from_bytes(include_bytes!("savand/french.txt")),
"hindi" => read_from_bytes(include_bytes!("savand/hindi.txt")),
"indonesian" => read_from_bytes(include_bytes!("savand/indonesian.txt")),
"norwegian" => read_from_bytes(include_bytes!("savand/norwegian.txt")),
"portuguese" => read_from_bytes(include_bytes!("savand/portuguese.txt")),
"russian" => read_from_bytes(include_bytes!("savand/russian.txt")),
"spanish" => read_from_bytes(include_bytes!("savand/spanish.txt")),
"turkish" => read_from_bytes(include_bytes!("savand/turkish.txt")),
"vietnamese" => read_from_bytes(include_bytes!("savand/vietnamese.txt")),
"bulgarian" => read_from_bytes(include_bytes!("savand/bulgarian.txt")),
"czech" => read_from_bytes(include_bytes!("savand/czech.txt")),
"dutch" => read_from_bytes(include_bytes!("savand/dutch.txt")),
"finnish" => read_from_bytes(include_bytes!("savand/finnish.txt")),
"german" => read_from_bytes(include_bytes!("savand/german.txt")),
"hungarian" => read_from_bytes(include_bytes!("savand/hungarian.txt")),
"italian" => read_from_bytes(include_bytes!("savand/italian.txt")),
"polish" => read_from_bytes(include_bytes!("savand/polish.txt")),
"romanian" => read_from_bytes(include_bytes!("savand/romanian.txt")),
"slovak" => read_from_bytes(include_bytes!("savand/slovak.txt")),
"swedish" => read_from_bytes(include_bytes!("savand/swedish.txt")),
"ukrainian" => read_from_bytes(include_bytes!("savand/ukrainian.txt")),
"azerbaijani" =>read_from_bytes(include_bytes!("nltk/azerbaijani")),
"kazakh" => read_from_bytes(include_bytes!("nltk/kazakh")),
"nepali" => read_from_bytes(include_bytes!("nltk/nepali")),
"slovenian" => read_from_bytes(include_bytes!("nltk/slovene")),
"tajik" => read_from_bytes(include_bytes!("nltk/tajik")),
_ => panic!("Unfortunately, the {} language is not currently supported. Please make sure that the name of the language is spelled in English.", language)
}
}
pub fn get_nltk(language: &str) -> Vec<String> {
match convert_language_code(language) {
"english" => read_from_bytes(include_bytes!("nltk/english")),
"arabic" => read_from_bytes(include_bytes!("nltk/arabic")),
"danish" => read_from_bytes(include_bytes!("nltk/danish")),
"french" => read_from_bytes(include_bytes!("nltk/french")),
"indonesian" => read_from_bytes(include_bytes!("nltk/indonesian")),
"norwegian" => read_from_bytes(include_bytes!("nltk/norwegian")),
"portuguese" => read_from_bytes(include_bytes!("nltk/portuguese")),
"russian" => read_from_bytes(include_bytes!("nltk/russian")),
"spanish" => read_from_bytes(include_bytes!("nltk/spanish")),
"turkish" => read_from_bytes(include_bytes!("nltk/turkish")),
"dutch" => read_from_bytes(include_bytes!("nltk/dutch")),
"finnish" => read_from_bytes(include_bytes!("nltk/finnish")),
"german" => read_from_bytes(include_bytes!("nltk/german")),
"hungarian" => read_from_bytes(include_bytes!("nltk/hungarian")),
"italian" => read_from_bytes(include_bytes!("nltk/italian")),
"romanian" => read_from_bytes(include_bytes!("nltk/romanian")),
"swedish" => read_from_bytes(include_bytes!("nltk/swedish")),
"azerbaijani" =>read_from_bytes(include_bytes!("nltk/azerbaijani")),
"kazakh" => read_from_bytes(include_bytes!("nltk/kazakh")),
"nepali" => read_from_bytes(include_bytes!("nltk/nepali")),
"slovenian" => read_from_bytes(include_bytes!("nltk/slovene")),
"tajik" => read_from_bytes(include_bytes!("nltk/tajik")),
_ => panic!("Unfortunately, the {} language is not currently supported in NLTK. Please make sure that the name of the language is spelled in English.", language)
}
}
/// Normalizes a language identifier to the English language name used for lookup.
///
/// Dispatch is by byte length of `language`: a 2-byte input is treated as a 2-letter code, a
/// 3-byte input as a 3-letter code, and anything else is returned unchanged as a language name.
///
/// # Panics
///
/// Panics (inside the conversion helpers) when a 2- or 3-byte input is not a known code.
fn convert_language_code(language: &str) -> &str {
    match language.len() {
        2 => convert_language_from_iso_693_1(language),
        3 => convert_language_from_iso_693_2t(language),
        _ => language,
    }
}
/// Maps a 2-letter language code to its English language name.
///
/// The code is searched for in `LANGUAGES_ISO_693_1`; the position of the first match is used
/// to index `LANGUAGES`, which is kept in the same order.
///
/// # Panics
///
/// Panics if `code` is not present in `LANGUAGES_ISO_693_1`.
fn convert_language_from_iso_693_1(code: &str) -> &str {
    match LANGUAGES_ISO_693_1.iter().position(|&known| known == code) {
        Some(idx) => LANGUAGES[idx],
        // The language-code standard is ISO 639 (the identifiers in this file spell it "693").
        None => panic!("It looks like you're trying to use an ISO 639-1 (2-letter) language code. Unfortunately, the {} language code is not currently supported.", code),
    }
}
/// Maps a 3-letter language code to its English language name.
///
/// The code is searched for in `LANGUAGES_ISO_693_2T`; the position of the first match is used
/// to index `LANGUAGES`, which is kept in the same order.
///
/// # Panics
///
/// Panics if `code` is not present in `LANGUAGES_ISO_693_2T`.
fn convert_language_from_iso_693_2t(code: &str) -> &str {
    match LANGUAGES_ISO_693_2T.iter().position(|&known| known == code) {
        Some(idx) => LANGUAGES[idx],
        // The language-code standard is ISO 639 (the identifiers in this file spell it "693").
        None => panic!("It looks like you're trying to use an ISO 639-2T (3-letter) language code. Unfortunately, the {} language code is not currently supported.", code),
    }
}
/// Decodes an embedded word list into one `String` per non-empty line.
///
/// The bytes are decoded as UTF-8, with invalid sequences replaced by U+FFFD. Lines are split
/// on `\n` or `\r\n`, so a trailing newline does not produce a spurious empty entry and
/// CRLF-encoded files do not leave a stray `\r` on each word. Empty lines are skipped because
/// an empty string is never a meaningful stop word.
fn read_from_bytes(bytes: &[u8]) -> Vec<String> {
    let contents = String::from_utf8_lossy(bytes);
    contents
        .lines()
        .filter(|line| !line.is_empty())
        .map(String::from)
        .collect()
}
/// Converts a vector of words into a `HashSet`, consuming the vector.
///
/// Duplicate words collapse into a single entry; the order of the input is not retained.
pub fn vec_to_set(words: Vec<String>) -> HashSet<String> {
    words.into_iter().collect()
}
#[cfg(test)]
mod good_tests {
    use crate::{get, get_nltk};

    // Lookup by full English language name.
    #[test]
    fn good_language_name() {
        for word in get("arabic") {
            println!("{}", word);
        }
    }

    // Lookup by 2-letter code.
    #[test]
    fn good_language_code_1() {
        for word in get("en") {
            println!("{}", word);
        }
    }

    // Lookup by 3-letter code, through the NLTK entry point.
    #[test]
    fn good_language_code_2t() {
        for word in get_nltk("eng") {
            println!("{}", word);
        }
    }
}
#[cfg(test)]
mod weird_character_tests {
    use crate::get;

    // Each test loads a list written in a non-Latin script and prints every entry.

    #[test]
    fn hebrew() {
        for word in get("hebrew") {
            println!("{}", word);
        }
    }

    #[test]
    fn arabic() {
        for word in get("arabic") {
            println!("{}", word);
        }
    }

    #[test]
    fn russian() {
        for word in get("russian") {
            println!("{}", word);
        }
    }
}
#[cfg(test)]
mod panic_tests {
    use crate::get;

    // A misspelled language name must panic.
    #[test]
    #[should_panic]
    fn bad_language_name() {
        for word in get("engilsh") {
            println!("{}", word);
        }
    }

    // An unknown 2-letter code must panic.
    #[test]
    #[should_panic]
    fn bad_language_code_1() {
        for word in get("zz") {
            println!("{}", word);
        }
    }

    // An unknown 3-letter code must panic.
    #[test]
    #[should_panic]
    fn bad_language_code_2t() {
        for word in get("zzz") {
            println!("{}", word);
        }
    }
}
#[cfg(test)]
mod conversion_tests {
    use crate::{get, vec_to_set};

    // Loads a list by 2-letter code, converts it to a set, and prints every entry.
    #[test]
    fn convert_to_set() {
        let unique_words = vec_to_set(get("es"));
        for word in unique_words {
            println!("{}", word);
        }
    }
}