use std::collections::{BTreeSet, HashMap};
use std::env;
use std::fs::File;
use std::io::BufWriter;
use std::io::Write;
use std::path::Path;
use std::sync::LazyLock;
static DEFAULT_PINYIN_DATA: &str = include_str!("data/pinyin.txt");
static DEFAULT_STOPWORD: &str = include_str!("data/stopword.txt");
static TONE_TO_PLAIN: LazyLock<HashMap<char, char>> = LazyLock::new(|| {
HashMap::from([
('ā', 'a'),
('á', 'a'),
('ǎ', 'a'),
('à', 'a'),
('ē', 'e'),
('é', 'e'),
('ě', 'e'),
('è', 'e'),
('ế', 'e'),
('ề', 'e'),
('ê', 'e'),
('ō', 'o'),
('ó', 'o'),
('ǒ', 'o'),
('ò', 'o'),
('ī', 'i'),
('í', 'i'),
('ǐ', 'i'),
('ì', 'i'),
('ū', 'u'),
('ú', 'u'),
('ǔ', 'u'),
('ù', 'u'),
('ǘ', 'u'),
('ǚ', 'u'),
('ǜ', 'u'),
('ü', 'u'),
('ń', 'n'),
('ň', 'n'),
('ǹ', 'n'),
('ḿ', 'm'),
])
});
fn to_plain(input: &str) -> String {
let value = input
.chars()
.map(|ch| {
if let Some(char) = TONE_TO_PLAIN.get(&ch) {
char.to_owned()
} else {
ch
}
})
.collect::<String>();
let values = value.split(",").map(str::trim).collect::<BTreeSet<&str>>();
let mut pinyin = "\"".to_owned();
let len = values.len() - 1;
for (index, value) in values.iter().enumerate() {
pinyin.push_str(value);
if index != len {
pinyin.push(',');
}
}
pinyin.push('"');
pinyin
}
fn main() {
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed=data/pinyin.txt");
println!("cargo:rerun-if-changed=data/stopword.txt");
let path = Path::new(&env::var("OUT_DIR").unwrap()).join("pinyin_data.rs");
let mut file = BufWriter::new(File::create(&path).unwrap());
let mut dirt = phf_codegen::Map::new();
for line in DEFAULT_PINYIN_DATA.split("\n") {
if line.is_empty() || line.starts_with("#") {
continue;
}
let mut codepoint_and_pinyin = line.split(": ");
let codepoint = if let Some(codepoint) = codepoint_and_pinyin.next() {
char::from_u32(u32::from_str_radix(&codepoint[2..], 16).unwrap()).unwrap()
} else {
char::default()
};
let pinyin = if let Some(pinyin) = codepoint_and_pinyin.next() {
to_plain(pinyin)
} else {
String::default()
};
dirt.entry(codepoint, pinyin);
}
write!(
&mut file,
"static PINYIN_DIRT: phf::Map<char, &'static str> = {}",
dirt.build()
)
.unwrap();
writeln!(&mut file, ";").unwrap();
let path = Path::new(&env::var("OUT_DIR").unwrap()).join("stopword_data.rs");
let mut file = BufWriter::new(File::create(&path).unwrap());
let mut stopword = phf_codegen::OrderedSet::new();
for line in DEFAULT_STOPWORD.split("\n") {
if line.is_empty() || line.starts_with("#") {
continue;
}
stopword.entry(line.trim());
}
write!(
&mut file,
"static STOPWORD: phf::OrderedSet<&'static str> = {}",
stopword.build()
)
.unwrap();
writeln!(&mut file, ";").unwrap();
}