use std::io::Result;
fn main() -> Result<()> {
println!("cargo:rerun-if-changed=build.rs");
println!("cargo:rerun-if-changed=process_map");
#[cfg(not(feature = "runtime_build"))]
{
use std::collections::HashMap;
use std::env;
use std::fs::File;
use std::io::Write;
use daachorse::{
CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder,
MatchKind as DoubleArrayAhoCorasickMatchKind,
};
const FANJIAN: &str = include_str!("./process_map/FANJIAN.txt");
const NUM_NORM: &str = include_str!("./process_map/NUM-NORM.txt");
const NORM: &str = include_str!("./process_map/NORM.txt");
const PINYIN: &str = include_str!("./process_map/PINYIN.txt");
let out_dir = env::var("OUT_DIR").unwrap();
let process_str_map = HashMap::from([
("fanjian", vec![FANJIAN]),
("normalize", vec![NORM, NUM_NORM]),
("pinyin", vec![PINYIN]),
]);
for process_type_bit_str in ["fanjian", "normalize", "pinyin"] {
let mut process_dict = HashMap::new();
for process_map in process_str_map.get(process_type_bit_str).unwrap() {
process_dict.extend(process_map.trim().lines().map(|pair_str| {
let mut pair_str_split = pair_str.split('\t');
(
pair_str_split.next().unwrap(),
pair_str_split.next().unwrap(),
)
}))
}
process_dict.retain(|&key, &mut value| key != value);
let process_list = process_dict
.iter()
.map(|(&key, _)| key)
.collect::<Vec<&str>>();
let mut process_list_bin =
File::create(format!("{out_dir}/{process_type_bit_str}_process_list.bin"))?;
process_list_bin.write_all(process_list.join("\n").as_bytes())?;
let process_replace_list = process_dict
.iter()
.map(|(_, &val)| val)
.collect::<Vec<&str>>();
let mut process_replace_list_bin = File::create(format!(
"{out_dir}/{process_type_bit_str}_process_replace_list.bin"
))?;
process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?;
if process_type_bit_str == "pinyin" {
let process_replace_list = process_dict
.iter()
.map(|(_, &val)| val.trim_matches(' '))
.collect::<Vec<&str>>();
let mut process_replace_list_bin =
File::create(format!("{out_dir}/pinyinchar_process_replace_list.bin"))?;
process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?;
}
if ["fanjian", "pinyin"].contains(&process_type_bit_str) {
let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
CharwiseDoubleArrayAhoCorasickBuilder::new()
.match_kind(DoubleArrayAhoCorasickMatchKind::Standard)
.build(&process_list)
.unwrap();
let matcher_bytes = matcher.serialize();
let mut matcher_bin = File::create(format!(
"{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin"
))?;
matcher_bin.write_all(&matcher_bytes)?;
}
#[cfg(not(feature = "dfa"))]
if process_type_bit_str == "normalize" {
let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
CharwiseDoubleArrayAhoCorasickBuilder::new()
.match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
.build(&process_list)
.unwrap();
let matcher_bytes = matcher.serialize();
let mut matcher_bin = File::create(format!(
"{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin"
))?;
matcher_bin.write_all(&matcher_bytes)?;
}
}
#[cfg(not(feature = "dfa"))]
{
use std::collections::HashSet;
const TEXT_DELETE: &str = include_str!("./process_map/TEXT-DELETE.txt");
const WHITE_SPACE: &[&str] = &[
"\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}",
"\u{00A0}", "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}",
"\u{2005}", "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}",
"\u{200F}", "\u{2028}", "\u{2029}", "\u{202F}", "\u{205F}", "\u{3000}",
];
let mut process_set = HashSet::new();
process_set.extend(TEXT_DELETE.trim().lines().map(|line| line));
process_set.extend(WHITE_SPACE);
let process_list = process_set.iter().map(|&s| s).collect::<Vec<&str>>();
let mut process_list_bin = File::create(format!("{out_dir}/delete_process_list.bin"))?;
process_list_bin.write_all(process_list.join("\n").as_bytes())?;
let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
CharwiseDoubleArrayAhoCorasickBuilder::new()
.match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
.build(&process_list)
.unwrap();
let matcher_bytes = matcher.serialize();
let mut matcher_bin = File::create(format!(
"{out_dir}/delete_daachorse_charwise_u32_matcher.bin"
))?;
matcher_bin.write_all(&matcher_bytes)?;
}
}
Ok(())
}