use lazy_static::lazy_static;
use std::collections::HashMap;
/// Static description of one supported corpus/language.
///
/// All text fields are `'static` string slices, so values can be built from
/// literals without allocating.
#[derive(Debug, Clone)]
pub struct LanguageMetadata {
    /// Short code identifying the language (for example `"en"`).
    pub tag: &'static str,
    /// Identifier inserted into generated file names (for example `"eng"`).
    pub corpus_id: &'static str,
    /// Human-readable name of the language.
    pub name: &'static str,
    /// `true` when the language is written in Latin script.
    pub latin_script: bool,
    /// Optional free-form remark about the corpus or its script.
    pub notes: Option<&'static str>,
}
/// Base URL that every file URL built in this file starts with.
pub const BASE_URL: &str = "https://storage.googleapis.com/books/ngrams/books";
/// Version string embedded in every generated file name.
/// NOTE(review): presumably the corpus release date as YYYYMMDD — confirm.
pub const VERSION: &str = "20120701";
lazy_static! {
    /// Registry of every supported corpus, keyed by its `tag`.
    pub static ref SUPPORTED_LANGUAGES: HashMap<&'static str, LanguageMetadata> = {
        let entries = vec![
            LanguageMetadata {
                tag: "en",
                corpus_id: "eng",
                name: "English",
                latin_script: true,
                notes: None,
            },
            LanguageMetadata {
                tag: "en-fiction",
                corpus_id: "eng-fiction",
                name: "English Fiction",
                latin_script: true,
                notes: Some("Subset of English corpus from fiction works"),
            },
            LanguageMetadata {
                tag: "de",
                corpus_id: "ger",
                name: "German",
                latin_script: true,
                notes: None,
            },
            LanguageMetadata {
                tag: "fr",
                corpus_id: "fre",
                name: "French",
                latin_script: true,
                notes: None,
            },
            LanguageMetadata {
                tag: "es",
                corpus_id: "spa",
                name: "Spanish",
                latin_script: true,
                notes: None,
            },
            LanguageMetadata {
                tag: "it",
                corpus_id: "ita",
                name: "Italian",
                latin_script: true,
                notes: None,
            },
            LanguageMetadata {
                tag: "ru",
                corpus_id: "rus",
                name: "Russian",
                latin_script: false,
                notes: Some("Cyrillic script"),
            },
            LanguageMetadata {
                tag: "he",
                corpus_id: "heb",
                name: "Hebrew",
                latin_script: false,
                notes: Some("Right-to-left script"),
            },
            LanguageMetadata {
                tag: "zh",
                corpus_id: "chi-sim",
                name: "Chinese (Simplified)",
                latin_script: false,
                notes: Some("Character-based, no word boundaries"),
            },
        ];
        // Every entry is registered under its own tag.
        entries.into_iter().map(|meta| (meta.tag, meta)).collect()
    };

    /// Prefixes used for every order other than 1: the 676 two-letter
    /// combinations `"aa"` through `"zz"` in lexicographic order, followed by
    /// `"other"` and `"punctuation"` (678 entries in total).
    ///
    /// All entries are slices of one buffer that is leaked once on first
    /// access, which is what gives them the `'static` lifetime.
    pub static ref MULTIGRAM_PREFIXES: Vec<&'static str> = {
        // 676 pairs * 2 bytes + "other" (5) + "punctuation" (11) = 1368 bytes.
        let mut arena = String::with_capacity(1368);
        let mut spans = Vec::with_capacity(678);

        for first in 'a'..='z' {
            for second in 'a'..='z' {
                // Both letters are ASCII, so each pair occupies exactly two bytes.
                spans.push(arena.len()..arena.len() + 2);
                arena.push(first);
                arena.push(second);
            }
        }
        for extra in ["other", "punctuation"].iter() {
            spans.push(arena.len()..arena.len() + extra.len());
            arena.push_str(extra);
        }

        // Leak the finished buffer so the slices below can be `'static`.
        let arena: &'static str = Box::leak(arena.into_boxed_str());
        spans.into_iter().map(|span| &arena[span]).collect()
    };
}
/// Prefixes used for order 1: the 26 lowercase letters followed by `"other"`
/// (27 entries in total).
pub static UNIGRAM_PREFIXES: &[&str] = &[
    "a", "b", "c", "d", "e", "f", "g", "h", "i",
    "j", "k", "l", "m", "n", "o", "p", "q", "r",
    "s", "t", "u", "v", "w", "x", "y", "z",
    // NOTE(review): presumably the bucket for tokens not starting with a-z — confirm.
    "other",
];
/// Builds the URL of a single n-gram file.
///
/// Returns `None` when `language` is not a key of `SUPPORTED_LANGUAGES`.
/// Neither `order` nor `prefix` is validated here; both are inserted into the
/// file name as given.
pub fn get_file_url(language: &str, order: u8, prefix: &str) -> Option<String> {
    SUPPORTED_LANGUAGES.get(language).map(|metadata| {
        format!(
            "{}/googlebooks-{}-all-{}gram-{}-{}.gz",
            BASE_URL, metadata.corpus_id, order, VERSION, prefix
        )
    })
}
/// Builds the URLs of all files for one language and n-gram order.
///
/// Order 1 uses `UNIGRAM_PREFIXES`; every other order uses
/// `MULTIGRAM_PREFIXES`. The URLs come back in prefix-table order.
/// Returns `None` when `language` is not a key of `SUPPORTED_LANGUAGES`.
pub fn get_order_urls(language: &str, order: u8) -> Option<Vec<String>> {
    let corpus_id = SUPPORTED_LANGUAGES.get(language)?.corpus_id;

    let prefixes: &[&str] = match order {
        1 => UNIGRAM_PREFIXES,
        _ => MULTIGRAM_PREFIXES.as_slice(),
    };

    let mut urls = Vec::with_capacity(prefixes.len());
    for prefix in prefixes {
        urls.push(format!(
            "{}/googlebooks-{}-all-{}gram-{}-{}.gz",
            BASE_URL, corpus_id, order, VERSION, prefix
        ));
    }
    Some(urls)
}
/// Returns owned copies of the prefixes for the given n-gram order.
///
/// Order 1 yields the entries of `UNIGRAM_PREFIXES`; every other order yields
/// the entries of `MULTIGRAM_PREFIXES`.
pub fn get_prefixes(order: u8) -> Vec<String> {
    let table: &[&str] = match order {
        1 => UNIGRAM_PREFIXES,
        _ => MULTIGRAM_PREFIXES.as_slice(),
    };
    table.iter().map(|&prefix| String::from(prefix)).collect()
}
/// Reports whether `prefix` appears in the prefix table for `order`.
///
/// Order 1 is checked against `UNIGRAM_PREFIXES`; every other order is
/// checked against `MULTIGRAM_PREFIXES`. The comparison is an exact string
/// match.
pub fn is_valid_prefix(order: u8, prefix: &str) -> bool {
    let known: &[&str] = match order {
        1 => UNIGRAM_PREFIXES,
        _ => MULTIGRAM_PREFIXES.as_slice(),
    };
    known.iter().any(|&candidate| candidate == prefix)
}
pub fn is_supported(language: &str) -> bool {
SUPPORTED_LANGUAGES.contains_key(language)
}
/// Owned summary of a language: its tag, display name and corpus identifier.
#[derive(Debug, Clone)]
pub struct LanguageInfo {
    /// Short code identifying the language.
    pub tag: String,
    /// Human-readable name of the language.
    pub name: String,
    /// Identifier of the corpus the language maps to.
    pub corpus_id: String,
}
impl LanguageInfo {
    /// Looks `code` up in `SUPPORTED_LANGUAGES` and copies the entry's tag,
    /// name and corpus id into owned strings.
    ///
    /// Returns `None` when `code` is not a registered key.
    pub fn from_code(code: &str) -> Option<Self> {
        SUPPORTED_LANGUAGES.get(code).map(|metadata| Self {
            tag: metadata.tag.to_owned(),
            name: metadata.name.to_owned(),
            corpus_id: metadata.corpus_id.to_owned(),
        })
    }
}
pub fn get_metadata(language: &str) -> Option<&'static LanguageMetadata> {
SUPPORTED_LANGUAGES.get(language)
}
/// Returns the keys of `SUPPORTED_LANGUAGES`, sorted lexicographically.
///
/// `HashMap` iteration order is arbitrary, so without sorting the result
/// would differ from run to run; sorting makes listings reproducible.
pub fn list_languages() -> Vec<&'static str> {
    let mut tags: Vec<&'static str> = SUPPORTED_LANGUAGES.keys().copied().collect();
    // Keys are unique, so an unstable sort gives the same result as a stable one.
    tags.sort_unstable();
    tags
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Known language codes are accepted and an unknown one is rejected.
    #[test]
    fn test_supported_languages() {
        for &code in ["en", "de", "fr"].iter() {
            assert!(is_supported(code), "expected {:?} to be supported", code);
        }
        assert!(!is_supported("invalid"));
    }

    /// URLs follow the `googlebooks-<corpus>-all-<n>gram-<version>-<prefix>.gz` layout.
    #[test]
    fn test_get_file_url() {
        assert_eq!(
            get_file_url("en", 1, "a").unwrap(),
            "https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-1gram-20120701-a.gz"
        );
        assert_eq!(
            get_file_url("en", 5, "aa").unwrap(),
            "https://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-aa.gz"
        );
    }

    /// The unigram table is the 26 letters followed by "other".
    #[test]
    fn test_unigram_prefixes() {
        assert_eq!(UNIGRAM_PREFIXES.len(), 27);
        assert_eq!(UNIGRAM_PREFIXES.first(), Some(&"a"));
        assert_eq!(UNIGRAM_PREFIXES[25], "z");
        assert_eq!(UNIGRAM_PREFIXES.last(), Some(&"other"));
    }

    /// The multigram table is "aa".."zz" followed by "other" and "punctuation".
    #[test]
    fn test_multigram_prefixes() {
        assert_eq!(MULTIGRAM_PREFIXES.len(), 678);
        assert_eq!(MULTIGRAM_PREFIXES[0], "aa");
        assert_eq!(
            &MULTIGRAM_PREFIXES[675..],
            &["zz", "other", "punctuation"][..]
        );
    }

    /// `get_prefixes` returns the full table for the requested order.
    #[test]
    fn test_get_prefixes() {
        assert_eq!(get_prefixes(1).len(), 27);
        assert_eq!(get_prefixes(2).len(), 678);
    }

    /// German maps to the "ger" corpus id; the prefix is passed through as given.
    #[test]
    fn test_german_url() {
        let german = get_file_url("de", 3, "abc").unwrap();
        assert!(german.contains("googlebooks-ger-all-3gram"));
    }

    /// Order 1 accepts single letters and "other" only.
    #[test]
    fn test_is_valid_prefix_unigrams() {
        for &accepted in ["a", "j", "z", "other"].iter() {
            assert!(
                is_valid_prefix(1, accepted),
                "expected {:?} to be valid for order 1",
                accepted
            );
        }
        for &rejected in ["th", "aa", "punctuation", "invalid", ""].iter() {
            assert!(
                !is_valid_prefix(1, rejected),
                "expected {:?} to be invalid for order 1",
                rejected
            );
        }
    }

    /// Orders above 1 accept two-letter prefixes, "other" and "punctuation".
    #[test]
    fn test_is_valid_prefix_multigrams() {
        let accepted: [(u8, &str); 8] = [
            (2, "th"),
            (2, "aa"),
            (2, "zz"),
            (2, "other"),
            (2, "punctuation"),
            (3, "th"),
            (4, "aa"),
            (5, "punctuation"),
        ];
        for &(order, prefix) in accepted.iter() {
            assert!(
                is_valid_prefix(order, prefix),
                "expected {:?} to be valid for order {}",
                prefix,
                order
            );
        }

        let rejected: [(u8, &str); 4] = [(2, "t"), (3, "a"), (2, "invalid"), (5, "")];
        for &(order, prefix) in rejected.iter() {
            assert!(
                !is_valid_prefix(order, prefix),
                "expected {:?} to be invalid for order {}",
                prefix,
                order
            );
        }
    }
}