use std::collections::HashMap;
mod unicode_blocks;
pub mod models;
pub const DEFAULT_THRESHOLD: i32 = 1000;
pub type Model = HashMap<String, i32>;
const INVALID_FEATURE: &str = "▔";
pub fn parse(model: &Model, input: &str) -> Vec<String> {
parse_with_threshold(model, input, DEFAULT_THRESHOLD)
}
pub fn parse_with_threshold(model: &Model, input: &str, threshold: i32) -> Vec<String> {
if input.is_empty() {
return vec![String::default()];
}
let mut out: Vec<String> = Vec::new();
let mut p1 = "U"; let mut p2 = "U"; let mut p3 = "U";
let mut chars = input.char_indices();
let (mut w1, mut b1) = ("", INVALID_FEATURE); let (mut w2, mut b2) = ("", INVALID_FEATURE); let (mut w3, mut s3, mut b3) = get_unicode_block_and_feature(input, &mut chars); let (mut w4, mut s4, mut b4) = get_unicode_block_and_feature(input, &mut chars); let (mut w5, mut s5, mut b5) = get_unicode_block_and_feature(input, &mut chars);
let mut start: usize = 0;
let mut end = s3;
let mut wb = String::with_capacity(20);
while s3 != 0 {
let (w6, s6, b6) = get_unicode_block_and_feature(input, &mut chars);
let score = get_feature(
model, &mut wb, w1, w2, w3, w4, w5, w6, b1, b2, b3, b4, b5, b6, p1, p2, p3,
);
if score > threshold {
out.push(input[start..end].to_string());
start = end;
}
p1 = p2;
p2 = p3;
if score > 0 {
p3 = "B"; } else {
p3 = "O"; }
w1 = w2;
w2 = w3;
w3 = w4;
w4 = w5;
w5 = w6;
b1 = b2;
b2 = b3;
b3 = b4;
b4 = b5;
b5 = b6;
s3 = s4;
s4 = s5;
s5 = s6;
end += s3;
}
if start < input.len() {
out.push(input[start..].to_string());
}
out
}
fn get_unicode_block_and_feature<'a>(
input: &'a str,
chars: &mut std::str::CharIndices,
) -> (&'a str, usize, &'a str) {
let v = chars.next();
if v.is_none() {
return ("", 0, INVALID_FEATURE);
}
let (index, c) = v.unwrap();
let size = c.len_utf8();
let pos = match unicode_blocks::UNICODE_BLOCKS.binary_search(&(c as u32)) {
Ok(v) => v + 1,
Err(e) => e,
};
(
&input[index..index + size],
size,
unicode_blocks::BLOCK_FEATURES[pos],
)
}
#[allow(clippy::too_many_arguments)]
fn get_feature(
model: &Model,
buf: &mut String, w1: &str,
w2: &str,
w3: &str,
w4: &str,
w5: &str,
w6: &str,
b1: &str,
b2: &str,
b3: &str,
b4: &str,
b5: &str,
b6: &str,
p1: &str,
p2: &str,
p3: &str,
) -> i32 {
let mut score: i32 = 0;
score += model.get(key(buf, &["UP1:", p1])).unwrap_or(&0);
score += model.get(key(buf, &["UP2:", p2])).unwrap_or(&0);
score += model.get(key(buf, &["UP3:", p3])).unwrap_or(&0);
score += model.get(key(buf, &["BP1:", p1, p2])).unwrap_or(&0);
score += model.get(key(buf, &["BP2:", p2, p3])).unwrap_or(&0);
score += model.get(key(buf, &["UW1:", w1])).unwrap_or(&0);
score += model.get(key(buf, &["UW2:", w2])).unwrap_or(&0);
score += model.get(key(buf, &["UW3:", w3])).unwrap_or(&0);
score += model.get(key(buf, &["UW4:", w4])).unwrap_or(&0);
score += model.get(key(buf, &["UW5:", w5])).unwrap_or(&0);
score += model.get(key(buf, &["UW6:", w6])).unwrap_or(&0);
score += model.get(key(buf, &["BW1:", w2, w3])).unwrap_or(&0);
score += model.get(key(buf, &["BW2:", w3, w4])).unwrap_or(&0);
score += model.get(key(buf, &["BW3:", w4, w5])).unwrap_or(&0);
score += model.get(key(buf, &["TW1:", w1, w2, w3])).unwrap_or(&0);
score += model.get(key(buf, &["TW2:", w2, w3, w4])).unwrap_or(&0);
score += model.get(key(buf, &["TW3:", w3, w4, w5])).unwrap_or(&0);
score += model.get(key(buf, &["TW4:", w4, w5, w6])).unwrap_or(&0);
score += model.get(key(buf, &["UB1:", b1])).unwrap_or(&0);
score += model.get(key(buf, &["UB2:", b2])).unwrap_or(&0);
score += model.get(key(buf, &["UB3:", b3])).unwrap_or(&0);
score += model.get(key(buf, &["UB4:", b4])).unwrap_or(&0);
score += model.get(key(buf, &["UB5:", b5])).unwrap_or(&0);
score += model.get(key(buf, &["UB6:", b6])).unwrap_or(&0);
score += model.get(key(buf, &["BB1:", b2, b3])).unwrap_or(&0);
score += model.get(key(buf, &["BB2:", b3, b4])).unwrap_or(&0);
score += model.get(key(buf, &["BB3:", b4, b5])).unwrap_or(&0);
score += model.get(key(buf, &["TB1:", b1, b2, b3])).unwrap_or(&0);
score += model.get(key(buf, &["TB2:", b2, b3, b4])).unwrap_or(&0);
score += model.get(key(buf, &["TB3:", b3, b4, b5])).unwrap_or(&0);
score += model.get(key(buf, &["TB4:", b4, b5, b6])).unwrap_or(&0);
score += model.get(key(buf, &["UQ1:", p1, b1])).unwrap_or(&0);
score += model.get(key(buf, &["UQ2:", p2, b2])).unwrap_or(&0);
score += model.get(key(buf, &["UQ3:", p3, b3])).unwrap_or(&0);
score += model.get(key(buf, &["BQ1:", p2, b2, b3])).unwrap_or(&0);
score += model.get(key(buf, &["BQ2:", p2, b3, b4])).unwrap_or(&0);
score += model.get(key(buf, &["BQ3:", p3, b2, b3])).unwrap_or(&0);
score += model.get(key(buf, &["BQ4:", p3, b3, b4])).unwrap_or(&0);
score += model.get(key(buf, &["TQ1:", p2, b1, b2, b3])).unwrap_or(&0);
score += model.get(key(buf, &["TQ2:", p2, b2, b3, b4])).unwrap_or(&0);
score += model.get(key(buf, &["TQ3:", p3, b1, b2, b3])).unwrap_or(&0);
score += model.get(key(buf, &["TQ4:", p3, b2, b3, b4])).unwrap_or(&0);
score
}
fn key<'a>(buf: &'a mut String, params: &[&str]) -> &'a str {
buf.clear();
for param in params {
buf.push_str(param);
}
buf
}
#[cfg(test)]
mod tests {
#[test]
fn test_parse() {
let m = super::models::default_japanese_model();
assert_eq!(super::parse(m, ""), vec![""]);
assert_eq!(super::parse(m, "日本語"), vec!["日本語"]);
assert_eq!(super::parse(m, "水と油"), vec!["水と", "油"]);
assert_eq!(
super::parse(m, "水道水とミネラルウォーター"),
vec!["水道水と", "ミネラルウォーター"]
);
assert_eq!(
super::parse(m, "PythonとJavaScriptとGolang"),
vec!["Pythonと", "JavaScriptと", "Golang"]
);
assert_eq!(
super::parse(
m,
"日本語の文章において語の区切りに空白を挟んで記述すること"
),
vec![
"日本語の",
"文章に",
"おいて",
"語の",
"区切りに",
"空白を",
"挟んで",
"記述する",
"こと"
]
);
assert_eq!(
super::parse(m, "これはテストです。"),
vec!["これは", "テストです。"]
);
assert_eq!(
super::parse(m, "これは美しいペンです。"),
vec!["これは", "美しい", "ペンです。"]
);
assert_eq!(
super::parse(m, "今日は天気です。"),
vec!["今日は", "天気です。"]
);
assert_eq!(
super::parse(m, "今日はとても天気です。"),
vec!["今日は", "とても", "天気です。"]
);
assert_eq!(
super::parse(m, "あなたに寄り添う最先端のテクノロジー。"),
vec!["あなたに", "寄り添う", "最先端の", "テクノロジー。"]
);
assert_eq!(
super::parse(m, "これはテストです。今日は晴天です。"),
vec!["これは", "テストです。", "今日は", "晴天です。"]
);
assert_eq!(
super::parse(m, "これはテストです。\n今日は晴天です。"),
vec!["これは", "テストです。", "\n今日は", "晴天です。"]
);
}
#[test]
fn test_parse_zh_hans() {
let m = super::models::default_simplified_chinese_model();
assert_eq!(super::parse(m, ""), vec![""]);
assert_eq!(
super::parse(m, "今天是晴天。"),
vec!["今天", "是", "晴天。"]
);
}
#[test]
fn test_get_unicode_block_and_feature() {
let seek = |chars: &mut std::str::CharIndices, offset: usize| {
for _ in 0..offset {
chars.next();
}
};
let input = "abc";
let mut chars = input.char_indices();
seek(&mut chars, 0);
assert_eq!(
super::get_unicode_block_and_feature(input, &mut chars),
("a", 1, "001",)
);
let input = "xyz";
let mut chars = input.char_indices();
seek(&mut chars, 2);
assert_eq!(
super::get_unicode_block_and_feature(input, &mut chars),
("z", 1, "001",)
);
let input = "out of index";
let mut chars = input.char_indices();
seek(&mut chars, 12);
assert_eq!(
super::get_unicode_block_and_feature(input, &mut chars),
("", 0, super::INVALID_FEATURE,)
);
let input = "あいうえお";
let mut chars = input.char_indices();
seek(&mut chars, 0);
assert_eq!(
super::get_unicode_block_and_feature(input, &mut chars),
("あ", 3, "108",)
);
let input = "わをん";
let mut chars = input.char_indices();
seek(&mut chars, 2);
assert_eq!(
super::get_unicode_block_and_feature(input, &mut chars),
("ん", 3, "108",)
);
let input = "安";
let mut chars = input.char_indices();
seek(&mut chars, 0);
assert_eq!(
super::get_unicode_block_and_feature(input, &mut chars),
("安", 3, "120",)
);
let input = "範囲外アクセス";
let mut chars = input.char_indices();
seek(&mut chars, 7);
assert_eq!(
super::get_unicode_block_and_feature(input, &mut chars),
("", 0, super::INVALID_FEATURE,)
);
}
#[test]
fn test_key() {
let mut wb = String::with_capacity(20);
assert_eq!(super::key(&mut wb, &[""]), "");
assert_eq!(super::key(&mut wb, &["AAA", "BBB"]), "AAABBB");
assert_eq!(super::key(&mut wb, &["AAA", "BBB", "CCC"]), "AAABBBCCC");
assert_eq!(
super::key(&mut wb, &["TW4:", "日", "本", "語"]),
"TW4:日本語"
);
assert_eq!(
super::key(&mut wb, &["TQ4:", "O", "120", "120", "120"]),
"TQ4:O120120120"
);
assert_eq!(wb.capacity(), 20);
}
}