pub mod replacer;
#[macro_use]
extern crate lazy_static;
extern crate prefixtree;
#[macro_use]
extern crate serde_derive;
use self::prefixtree::{prefix_tree_from_str, PrefixTree};
use regex_automata::dense::DenseDFA;
use regex_automata::Regex;
use regex_automata::DFA;
use std::fs::File;
use std::io;
use std::io::BufRead;
use std::iter::Peekable;
use std::path::Path;
use thiserror::Error;
// Expands to a `&Path` for a bundled data file given its file name literal.
// With the `onedir` feature enabled the path is `./<filename>` (relative to the
// working directory); otherwise it is `<CARGO_MANIFEST_DIR>/data/<filename>`,
// where `CARGO_MANIFEST_DIR` is substituted at compile time by `env!`.
macro_rules! insert_prefix {
($filename:expr) => {
if cfg!(feature = "onedir") {
Path::new(concat!("./", $filename))
} else {
Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename))
}
};
}
// Expands to a `&'static str` path for a bundled data file given its file name
// literal: `./<filename>` with the `onedir` feature, otherwise
// `<CARGO_MANIFEST_DIR>/data/<filename>` (resolved at compile time by `env!`).
macro_rules! insert_prefix_str {
($filename:expr) => {
if cfg!(feature = "onedir") {
concat!("./", $filename)
} else {
concat!(env!("CARGO_MANIFEST_DIR"), "/data/", $filename)
}
};
}
/// Returns the path of the default word list, `mixed-wordlist.txt`.
///
/// The directory prefix depends on the `onedir` feature (see `insert_prefix!`).
pub fn default_dict_path() -> &'static Path {
insert_prefix!("mixed-wordlist.txt")
}
/// Returns the path of the Thai cluster rules file, `thai_cluster_rules.txt`.
///
/// This implementation always returns `Some`.
pub fn thai_cluster_path() -> Option<String> {
Some(insert_prefix_str!("thai_cluster_rules.txt").to_owned())
}
/// Returns the path of the Thai replace rules file, `thai-replace-rules.json`.
///
/// This implementation always returns `Some`.
pub fn thai_replace_rules_path() -> Option<String> {
Some(insert_prefix_str!("thai-replace-rules.json").to_owned())
}
/// Dictionary type: a prefix tree keyed by `char` carrying a `bool` payload.
pub type Dict = PrefixTree<char, bool>;
// Matcher for cluster rules: a dense DFA (built by `load_cluster_rules`).
type ClusterRulesMatcher = DenseDFA<Vec<usize>, usize>;
// Matcher for split rules: a `regex_automata` regex.
type SplitRulesMatcher = Regex;
// Default split pattern used by `Wordcut::new` and `Wordcut::new_with_cluster_re`.
// It matches runs of whitespace (CR, tab, LF, space), runs of ASCII letters,
// runs of ASCII digits, runs of Thai digits, or a single left double quote.
// The pattern is a constant, so the `unwrap` only fails if the literal is invalid.
lazy_static! {
static ref DEFAULT_THAI_SPLIT_RE: Regex =
Regex::new("[\r\t\n ]+|[A-Za-z]+|[0-9]+|[๐-๙]+|“").unwrap();
}
/// Errors returned while loading cluster rules or split rules.
#[derive(Error, Debug)]
pub enum WordcutError {
/// The cluster rules file could not be opened; carries the path.
#[error("Cannot open cluster rules at `{0}`")]
CannotOpenClusterRulesAt(String),
/// A line of a rules file could not be read.
#[error("Cannot read a cluster rule")]
CannotReadClusterRule,
/// The combined cluster rules failed to compile; carries the combined pattern.
#[error("Cannot compile cluster rules `{0}`")]
CannotCompileClusterRules(String),
/// The split rules file could not be opened; carries the path.
#[error("Cannot open split rules at `{0}`")]
CannotOpenSplitRulesAt(String),
/// The combined split rules failed to compile; carries the combined pattern.
#[error("Cannot compile split rules `{0}`")]
CannotCompileSplitRules(String),
}
/// Builds a dictionary prefix tree from `words`.
///
/// Every word is inserted with the payload `true`.
pub fn create_prefix_tree(words: &[&str]) -> PrefixTree<char, bool> {
    let mut entries: Vec<(&str, bool)> = Vec::with_capacity(words.len());
    for word in words {
        entries.push((*word, true));
    }
    prefix_tree_from_str(&entries[..])
}
/// Kind of an edge in a segmentation path or DAG.
#[derive(Clone, PartialEq, Eq, Copy, Debug, Serialize, Deserialize)]
pub enum EdgeType {
/// The initial node placed at position 0.
Init,
/// An edge produced by a dictionary match.
Dict,
/// An edge covering text that no other builder matched.
Unk,
/// An edge produced by a split-rule (pattern) match.
Pat,
}
/// One entry of a segmentation path: the best edge ending at a position.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Edge {
/// Accumulated edge count: each builder sets it to the source's `w` plus one.
pub w: usize,
/// Accumulated unknown count: the source's `unk`, plus one for `Unk` edges.
pub unk: usize,
/// Index (in chars) of the path entry this edge starts from.
pub p: usize,
/// Which kind of builder produced the edge.
pub etype: EdgeType,
}
impl Edge {
    /// Returns `true` when this edge is of type `Unk`.
    pub fn is_unk(&self) -> bool {
        self.etype == EdgeType::Unk
    }

    /// Decides whether `self` should be preferred over `o`.
    ///
    /// Rules, in order: a `Pat` edge beats an `Unk` edge (and never the other
    /// way round); then fewer unknowns wins; then the smaller `w` wins; on a
    /// full tie a non-unknown edge beats an unknown one. Otherwise `false`.
    pub fn better_than(&self, o: &Edge) -> bool {
        match (self.etype, o.etype) {
            (EdgeType::Pat, EdgeType::Unk) => return true,
            (EdgeType::Unk, EdgeType::Pat) => return false,
            _ => {}
        }
        // Lexicographic comparison: unknown count first, then edge count.
        match (self.unk, self.w).cmp(&(o.unk, o.w)) {
            std::cmp::Ordering::Less => true,
            std::cmp::Ordering::Greater => false,
            std::cmp::Ordering::Equal => o.is_unk() && !self.is_unk(),
        }
    }

    /// Compares two optional edges: a missing `a` is never better, a present
    /// `a` always beats a missing `b`, otherwise `better_than` decides.
    pub fn better(a: &Option<Edge>, b: &Option<Edge>) -> bool {
        match (a, b) {
            (None, _) => false,
            (Some(_), None) => true,
            (Some(x), Some(y)) => x.better_than(y),
        }
    }
}
/// A source of candidate edges for the path builder.
pub trait EdgeBuilder {
/// Called once per character position with the current `context` and the
/// `path` built so far; returns a candidate edge ending at that position,
/// or `None` when the builder has nothing to offer.
fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge>;
}
/// State shared with edge builders while scanning the text.
#[derive(Debug)]
pub struct EdgeBuildingContext<'a> {
/// The text being segmented, as chars.
pub text: &'a [char],
/// Index of the character currently being processed.
pub i: usize,
/// The character at index `i`.
pub ch: char,
/// Position just after the most recent non-unknown edge chosen for the path.
pub left_boundary: usize,
/// Best candidate found so far at the current position (reset per position).
pub best_edge: Option<Edge>,
}
/// Stateless builder that produces fallback `Unk` edges.
///
/// `Default` is derived so the type can be created with either
/// `UnkEdgeBuilder::new()` or `UnkEdgeBuilder::default()`.
#[derive(Default)]
pub struct UnkEdgeBuilder {}

impl UnkEdgeBuilder {
    /// Creates a new builder.
    pub fn new() -> UnkEdgeBuilder {
        UnkEdgeBuilder {}
    }
}
impl EdgeBuilder for UnkEdgeBuilder {
    /// Produces an `Unk` edge starting at `context.left_boundary`, but only
    /// when no other builder has already supplied a best edge for this position.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        match context.best_edge {
            Some(_) => None,
            None => {
                let start = context.left_boundary;
                let source = path[start];
                Some(Edge {
                    w: source.w + 1,
                    unk: source.unk + 1,
                    p: start,
                    etype: EdgeType::Unk,
                })
            }
        }
    }
}
// A cursor walking the dictionary prefix tree for one candidate word.
#[derive(Clone)]
struct Pointer {
// Current node id in the prefix tree.
node_id: usize,
// Character index in the text where this candidate word starts.
s: usize,
// Number of characters consumed so far by this pointer.
offset: usize,
// Final flag reported by the last successful `seek`.
is_final: bool,
}
impl Pointer {
    // Advances the pointer by `ch`. Returns `false` (leaving the pointer
    // untouched) when the dictionary has no transition for the character.
    fn update(&mut self, dict: &Dict, ch: char) -> bool {
        let key = (self.node_id as u32, self.offset as u32, ch);
        if let Some(&(child_id, is_final, _)) = dict.seek(&key) {
            self.node_id = child_id as usize;
            self.is_final = is_final;
            self.offset += 1;
            true
        } else {
            false
        }
    }

    // Builds the `Dict` edge that starts at this pointer's start position.
    fn gen_edge(&self, path: &[Edge]) -> Edge {
        let start = self.s;
        let source = path[start];
        Edge {
            w: source.w + 1,
            unk: source.unk,
            p: start,
            etype: EdgeType::Dict,
        }
    }
}
/// Edge builder that proposes edges for dictionary words.
pub struct DictEdgeBuilder<'a> {
// Dictionary to match against.
dict: &'a Dict,
// Live cursors, one per candidate word that is still a valid prefix.
pointers: Vec<Pointer>,
}
impl<'a> DictEdgeBuilder<'a> {
    /// Creates a builder over `dict` with an empty, pre-allocated pointer list.
    pub fn new(dict: &Dict) -> DictEdgeBuilder {
        const INITIAL_CAPACITY: usize = 0xFF;
        let pointers = Vec::with_capacity(INITIAL_CAPACITY);
        DictEdgeBuilder { dict, pointers }
    }

    // Starts a fresh candidate word at the current position.
    fn add_pointer(&mut self, context: &EdgeBuildingContext) {
        let fresh = Pointer {
            node_id: 0,
            s: context.i,
            offset: 0,
            is_final: false,
        };
        self.pointers.push(fresh);
    }

    // Advances every pointer by the current character and compacts the list
    // in place, keeping (in order) only the pointers that are still valid.
    fn update_pointers(&mut self, context: &EdgeBuildingContext) {
        let dict = self.dict;
        let mut kept = 0;
        for idx in 0..self.pointers.len() {
            if !self.pointers[idx].update(dict, context.ch) {
                continue;
            }
            if kept < idx {
                self.pointers[kept] = self.pointers[idx].clone();
            }
            kept += 1;
        }
        self.pointers.truncate(kept);
    }

    // Picks the best edge among the pointers that sit on a complete word.
    // An earlier candidate is kept unless a later one is strictly better.
    fn gen_edge(&self, pointers: &[Pointer], path: &[Edge]) -> Option<Edge> {
        pointers
            .iter()
            .filter(|pointer| pointer.is_final)
            .map(|pointer| pointer.gen_edge(path))
            .fold(None, |best, candidate| match best {
                Some(current) if !candidate.better_than(&current) => Some(current),
                _ => Some(candidate),
            })
    }
}
impl<'a> EdgeBuilder for DictEdgeBuilder<'a> {
    /// Registers a new candidate at the current position, advances all
    /// candidates by the current character, and returns the best edge among
    /// those that now end on a complete dictionary word.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        self.add_pointer(context);
        self.update_pointers(context);
        let live_pointers = &self.pointers;
        self.gen_edge(live_pointers, path)
    }
}
/// Edge builder that proposes `Pat` edges for split-rule regex matches.
pub struct RuleBasedEdgeBuilder {
// Remaining regex matches as char-index ranges, consumed front to back.
range_peekable: Peekable<std::vec::IntoIter<TextRange>>,
}
impl RuleBasedEdgeBuilder {
pub fn new(byte_to_char_idx_map: &[usize], text: &str, re: &Regex) -> Self {
let mut ranges = vec![];
for m in re.find_iter(text.as_bytes()) {
let (ms, me) = m;
let s = byte_to_char_idx_map[ms];
let e = byte_to_char_idx_map[me];
ranges.push(TextRange { s, e });
}
RuleBasedEdgeBuilder {
range_peekable: ranges.into_iter().peekable(),
}
}
}
impl EdgeBuilder for RuleBasedEdgeBuilder {
    /// Returns a `Pat` edge when a regex match ends exactly at the current
    /// character (`context.i + 1`); otherwise `None`.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        // Discard matches that ended at or before the current position.
        while let Some(range) = self.range_peekable.peek() {
            if context.i < range.e {
                break;
            }
            self.range_peekable.next();
        }
        let range = self.range_peekable.peek()?;
        if range.e != context.i + 1 {
            return None;
        }
        let source = path[range.s];
        Some(Edge {
            w: source.w + 1,
            unk: source.unk,
            p: range.s,
            etype: EdgeType::Pat,
        })
    }
}
// Returns `true` when the span `s..e` does not cut through a cluster.
// `clusters[k]` is the cluster id of character `k`, with 0 meaning "no
// cluster". A boundary is acceptable at the text edge, next to an
// unclustered character, or between two different cluster ids.
#[inline]
fn does_not_break_cluster(s: usize, e: usize, text_len: usize, clusters: &[usize]) -> bool {
    let start_is_clean = s == 0 || clusters[s] == 0 || clusters[s] != clusters[s - 1];
    if !start_is_clean {
        return false;
    }
    e == text_len || clusters[e - 1] == 0 || clusters[e] != clusters[e - 1]
}
// Decides whether a candidate edge ending after character `i` must be
// rejected. Missing edges and `Unk` edges are never skipped; any other edge
// is skipped when its span `edge.p..i + 1` would break a cluster.
#[inline]
fn should_skip_edge(edge: &Option<Edge>, i: usize, text_len: usize, clusters: &[usize]) -> bool {
    match edge {
        None => false,
        Some(candidate) => {
            let start = candidate.p;
            let end = i + 1;
            !candidate.is_unk() && !does_not_break_cluster(start, end, text_len, clusters)
        }
    }
}
// Builds the segmentation path for `text`.
//
// The returned vector has `text.len() + 1` entries: entry 0 is the `Init`
// node and entry `k` is the best edge ending after `k` characters. At every
// position each builder is asked for a candidate; candidates that would break
// a cluster are ignored and the best of the rest is kept.
//
// Panics if no builder yields an acceptable edge for some position.
fn build_path_with_clusters(
    mut builders: Vec<&mut dyn EdgeBuilder>,
    clusters: &[usize],
    text: &[char],
) -> Vec<Edge> {
    let text_len = text.len();
    let mut path = vec![Edge {
        w: 0,
        unk: 0,
        p: 0,
        etype: EdgeType::Init,
    }];
    let mut context = EdgeBuildingContext {
        text,
        i: 0,
        ch: '\0',
        left_boundary: 0,
        best_edge: None,
    };
    for (i, &ch) in text.iter().enumerate() {
        context.ch = ch;
        context.i = i;
        context.best_edge = None;
        for builder in builders.iter_mut() {
            let candidate = builder.build(&context, &path);
            if should_skip_edge(&candidate, i, text_len, clusters) {
                continue;
            }
            if Edge::better(&candidate, &context.best_edge) {
                context.best_edge = candidate;
            }
        }
        let chosen = context.best_edge.unwrap();
        path.push(chosen);
        // Only a known (non-Unk) edge moves the left boundary forward.
        if !chosen.is_unk() {
            context.left_boundary = i + 1;
        }
    }
    path
}
/// An edge of the segmentation DAG.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct DagEdge {
/// Start position (char index) of the edge.
pub s: usize,
/// End position (char index, exclusive) of the edge.
pub e: usize,
/// Kind of the edge.
pub etype: EdgeType,
}
/// Segmentation DAG: entry `e` holds the edges that end at position `e`.
pub type Dag = Vec<Vec<DagEdge>>;
/// A source of DAG edges.
pub trait DagEdgeBuilder {
/// Called once per character position; returns the edges that end just
/// after the current character (`context.i + 1`).
fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge>;
}
impl<'a> DagEdgeBuilder for DictEdgeBuilder<'a> {
    /// Advances the dictionary pointers by the current character and returns
    /// one `Dict` edge for every pointer that now ends on a complete word.
    fn build_dag_edges(&mut self, context: &EdgeBuildingContext) -> Vec<DagEdge> {
        self.add_pointer(context);
        self.update_pointers(context);
        let end = context.i + 1;
        let mut edges = Vec::new();
        for pointer in &self.pointers {
            if pointer.is_final {
                edges.push(DagEdge {
                    s: pointer.s,
                    e: end,
                    etype: EdgeType::Dict,
                });
            }
        }
        edges
    }
}
/// Builds the segmentation DAG of `text` against `dict`.
///
/// The result has `text.len() + 1` entries. Entry 0 holds a single `Init`
/// edge; entry `e` holds every dictionary edge ending at `e`. Positions that
/// end up without any edge receive one `Unk` edge starting at the closest
/// preceding position that did have an edge (or 0).
pub fn build_dag(dict: &Dict, text: &Vec<char>) -> Dag {
    let text_len = text.len();
    let mut builders: Vec<Box<dyn DagEdgeBuilder>> = vec![Box::new(DictEdgeBuilder::new(dict))];
    let mut dag: Dag = (0..=text_len).map(|_| Vec::new()).collect();
    dag[0].push(DagEdge {
        s: 0,
        e: 0,
        etype: EdgeType::Init,
    });
    let mut context = EdgeBuildingContext {
        text: &text[..],
        i: 0,
        ch: '\0',
        left_boundary: 0,
        best_edge: None,
    };
    for (i, &ch) in text.iter().enumerate() {
        context.ch = ch;
        context.i = i;
        context.best_edge = None;
        for builder in builders.iter_mut() {
            for edge in builder.build_dag_edges(&context) {
                let end = edge.e;
                dag[end].push(edge);
            }
        }
    }
    // Fill the gaps with unknown edges.
    let mut left_boundary = 0;
    for (i, edges) in dag.iter_mut().enumerate().skip(1) {
        if edges.is_empty() {
            edges.push(DagEdge {
                s: left_boundary,
                e: i,
                etype: EdgeType::Unk,
            });
        } else {
            left_boundary = i;
        }
    }
    dag
}
/// A half-open range `s..e` over a text (char or byte indices, depending on
/// the function that produced it).
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct TextRange {
/// Inclusive start index.
pub s: usize,
/// Exclusive end index.
pub e: usize,
}
/// Converts a segmentation path into char-index ranges in text order.
///
/// The path is walked backwards from its last entry, following each edge's
/// `p` link until position 0 is reached. An empty path yields no ranges.
pub fn path_to_ranges(path: &[Edge]) -> Vec<TextRange> {
    if path.is_empty() {
        return vec![];
    }
    let mut ranges: Vec<TextRange> = Vec::with_capacity(path.len());
    let mut end = path.len() - 1;
    while end > 0 {
        let start = path[end].p;
        ranges.push(TextRange { s: start, e: end });
        end = start;
    }
    ranges.reverse();
    ranges
}
/// Converts a segmentation path into UTF-8 byte ranges in text order.
///
/// Each word's byte width is the sum of `len_utf8()` of its chars; ranges are
/// laid out back to back starting at byte 0.
pub fn path_to_byte_ranges(path: &[Edge], text: &[char]) -> Vec<TextRange> {
    let mut byte_pos = 0;
    path_to_ranges(path)
        .into_iter()
        .map(|range| {
            let width: usize = (range.s..range.e).map(|i| text[i].len_utf8()).sum();
            let byte_range = TextRange {
                s: byte_pos,
                e: byte_pos + width,
            };
            byte_pos += width;
            byte_range
        })
        .collect()
}
/// Converts a segmentation path into the list of segmented words.
pub fn path_to_str_vec(path: &[Edge], text: &[char]) -> Vec<String> {
    path_to_ranges(path)
        .into_iter()
        .map(|range| {
            // Reserve three bytes per char (plus one spare char) up front.
            let mut word = String::with_capacity(3 * (range.e - range.s + 1));
            word.extend((range.s..range.e).map(|i| text[i]));
            word
        })
        .collect()
}
/// Word segmenter combining a dictionary, optional cluster rules and split rules.
pub struct Wordcut {
// Dictionary used for word matching.
dict: Dict,
// Optional cluster rules; when absent no cluster constraint is applied.
cluster_re: Option<ClusterRulesMatcher>,
// Split rules used to emit pattern (`Pat`) edges.
split_re: SplitRulesMatcher,
}
impl Wordcut {
    /// Creates a segmenter with the default split rules and no cluster rules.
    pub fn new(dict: Dict) -> Wordcut {
        Wordcut {
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
            cluster_re: None,
            dict,
        }
    }

    /// Creates a segmenter with the default split rules and the given cluster rules.
    pub fn new_with_cluster_re(dict: Dict, cluster_re: ClusterRulesMatcher) -> Wordcut {
        Wordcut {
            split_re: DEFAULT_THAI_SPLIT_RE.clone(),
            cluster_re: Some(cluster_re),
            dict,
        }
    }

    /// Creates a segmenter with caller-supplied cluster rules and split rules.
    pub fn new_with_cluster_re_and_split_re(
        dict: Dict,
        cluster_re: ClusterRulesMatcher,
        split_re: SplitRulesMatcher,
    ) -> Wordcut {
        let cluster_re = Some(cluster_re);
        Wordcut {
            dict,
            cluster_re,
            split_re,
        }
    }

    /// Builds the best segmentation path for `text`.
    ///
    /// `text_chars` must be the chars of `text`. Candidate edges come from the
    /// dictionary, the unknown-word fallback and the split rules, in that order.
    #[inline]
    pub fn build_path(&self, text: &str, text_chars: &[char]) -> Vec<Edge> {
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = match &self.cluster_re {
            Some(cluster_re) => {
                find_clusters(text, &byte_to_char_idx_map, cluster_re, text_chars.len())
            }
            // Without cluster rules every position is "unclustered" (id 0).
            None => vec![0; text_chars.len() + 1],
        };
        let mut dict_edge_builder = DictEdgeBuilder::new(&self.dict);
        let mut unk_edge_builder = UnkEdgeBuilder::new();
        let mut rule_based_edge_builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &self.split_re);
        let builders: Vec<&mut dyn EdgeBuilder> = vec![
            &mut dict_edge_builder,
            &mut unk_edge_builder,
            &mut rule_based_edge_builder,
        ];
        build_path_with_clusters(builders, &clusters, text_chars)
    }

    /// Segments `text` into char-index ranges.
    #[allow(dead_code)]
    pub fn segment(&self, text: &str) -> Vec<TextRange> {
        let chars: Vec<char> = text.chars().collect();
        path_to_ranges(&self.build_path(text, &chars))
    }

    /// Segments `text` into UTF-8 byte ranges.
    pub fn segment_into_byte_ranges(&self, text: &str) -> Vec<TextRange> {
        let chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &chars);
        path_to_byte_ranges(&path, &chars)
    }

    /// Segments `text` into owned word strings.
    pub fn segment_into_strings(&self, text: &str) -> Vec<String> {
        let chars: Vec<char> = text.chars().collect();
        let path = self.build_path(text, &chars);
        path_to_str_vec(&path, &chars)
    }

    /// Segments `text` and joins the words with `delim`.
    pub fn put_delimiters(&self, text: &str, delim: &str) -> String {
        let words = self.segment_into_strings(text);
        words.join(delim)
    }

    /// Builds the segmentation DAG of `text` using this segmenter's dictionary.
    #[allow(dead_code)]
    pub fn build_dag(&self, text: &str) -> Dag {
        let chars: Vec<char> = text.chars().collect();
        build_dag(&self.dict, &chars)
    }
}
/// Builds a table translating byte offsets of `text` into char indices.
///
/// The table has `text.len() + 1` entries. The first byte of every char maps
/// to that char's index, UTF-8 continuation bytes map to 0, and the final
/// entry (one past the last byte) holds the total number of chars.
pub fn create_byte_to_char_idx_map(text: &str) -> Vec<usize> {
    let bytes = text.as_bytes();
    let mut map = Vec::with_capacity(bytes.len() + 1);
    let mut char_idx = 0;
    for &byte in bytes {
        // Continuation bytes have the bit pattern 10xxxxxx.
        let is_continuation = byte & 0xC0 == 0x80;
        if is_continuation {
            map.push(0);
        } else {
            map.push(char_idx);
            char_idx += 1;
        }
    }
    map.push(char_idx);
    map
}
/// A DFA cursor tracking one in-progress cluster match.
#[derive(Debug)]
pub struct ClusterPointer {
// Current DFA state.
state_id: usize,
// Byte index at which this match attempt started.
p: usize,
}
/// One entry of the cluster path computed by `find_cluster_path`.
#[derive(Debug)]
pub struct ClusterEdge {
// Accumulated length (in bytes) covered by matched patterns along the path.
acc_pat_len: usize,
// Accumulated count (in bytes) of text not covered by any pattern.
unk_cnt: usize,
// Byte index of the path entry this edge starts from.
p: usize,
// True when the edge covers unmatched text.
is_unk: bool,
}
pub fn find_cluster_path(dfa: &ClusterRulesMatcher, text: &str) -> Vec<ClusterEdge> {
let mut pointers = vec![];
let mut ch_i = 0;
let mut path = vec![];
let mut left_boundary = 0;
path.push(ClusterEdge {
p: 0,
acc_pat_len: 0,
unk_cnt: 0,
is_unk: false,
});
for ch in text.as_bytes() {
let mut best_edge: Option<ClusterEdge> = None;
pointers.push(ClusterPointer {
state_id: dfa.start_state(),
p: ch_i,
});
let mut j = 0;
for i in 0..pointers.len() {
let next_id = dfa.next_state(pointers[i].state_id, *ch);
if !dfa.is_dead_state(next_id) {
pointers[j] = ClusterPointer {
state_id: next_id,
p: pointers[i].p,
};
j += 1;
if dfa.is_match_state(next_id) {
let source = &path[pointers[i].p];
let edge = ClusterEdge {
p: pointers[i].p,
acc_pat_len: source.acc_pat_len + (ch_i - pointers[i].p + 1),
unk_cnt: source.unk_cnt,
is_unk: false,
};
if match &best_edge {
Some(b_edge) => {
b_edge.unk_cnt > edge.unk_cnt
|| (b_edge.unk_cnt == edge.unk_cnt
&& b_edge.acc_pat_len < edge.acc_pat_len)
}
None => true,
} {
best_edge = Some(edge);
}
}
}
}
pointers.truncate(j);
if best_edge.is_none() {
let source = &path[left_boundary];
best_edge = Some(ClusterEdge {
p: left_boundary,
acc_pat_len: source.acc_pat_len,
unk_cnt: source.unk_cnt + (ch_i - left_boundary + 1),
is_unk: true,
});
}
let best_edge = best_edge.unwrap();
if !best_edge.is_unk {
left_boundary = ch_i + 1;
}
path.push(best_edge);
ch_i += 1;
}
path
}
/// Assigns a cluster id to every char of `text`.
///
/// Returns a vector of `len` entries where 0 means "not in any cluster" and
/// chars of the same matched cluster share a positive id. Ids are handed out
/// while walking the cluster path backwards, so the last cluster in the text
/// gets id 1.
pub fn find_clusters(
    text: &str,
    byte_to_char_idx_map: &[usize],
    dfa: &ClusterRulesMatcher,
    len: usize,
) -> Vec<usize> {
    let mut clusters = vec![0; len];
    let path = find_cluster_path(dfa, text);
    let mut next_id = 1;
    let mut byte_end = path.len() - 1;
    while byte_end > 0 {
        let edge = &path[byte_end];
        let byte_start = edge.p;
        let char_start = byte_to_char_idx_map[byte_start];
        let char_end = byte_to_char_idx_map[byte_end];
        if !edge.is_unk {
            for char_idx in char_start..char_end {
                clusters[char_idx] = next_id;
            }
            next_id += 1;
        }
        byte_end = byte_start;
    }
    clusters
}
/// Reads a word list file, one word per line.
///
/// # Errors
///
/// Returns the underlying `io::Error` when the file cannot be opened or when
/// any line cannot be read (for example because it is not valid UTF-8),
/// instead of panicking part-way through the file.
pub fn load_wordlist(path: impl AsRef<Path>) -> io::Result<Vec<String>> {
    let reader = io::BufReader::new(File::open(path.as_ref())?);
    // Collecting into `io::Result<Vec<_>>` stops at the first failing line.
    reader.lines().collect()
}
/// Loads a word list file and builds a dictionary prefix tree from it.
///
/// # Errors
///
/// Propagates any error returned by `load_wordlist`.
pub fn load_dict(path: impl AsRef<Path>) -> io::Result<Dict> {
    let words = load_wordlist(path)?;
    let word_refs: Vec<&str> = words.iter().map(String::as_str).collect();
    Ok(create_prefix_tree(&word_refs))
}
/// Loads cluster rules from `path` and compiles them into a dense DFA.
///
/// Each line is trimmed, wrapped in parentheses, and all lines are joined
/// with `|` into a single pattern. The DFA is built anchored and with
/// Unicode support enabled.
///
/// # Errors
///
/// * `CannotOpenClusterRulesAt` when the file cannot be opened.
/// * `CannotReadClusterRule` when a line cannot be read.
/// * `CannotCompileClusterRules` (carrying the combined pattern) when the
///   pattern does not compile.
pub fn load_cluster_rules(path: &Path) -> Result<ClusterRulesMatcher, WordcutError> {
    let file = File::open(path)
        .map_err(|_| WordcutError::CannotOpenClusterRulesAt(path.to_string_lossy().to_string()))?;
    let mut alternatives = vec![];
    for line in io::BufReader::new(file).lines() {
        let line = line.map_err(|_| WordcutError::CannotReadClusterRule)?;
        alternatives.push(format!("({})", line.trim()));
    }
    let pattern = alternatives.join("|");
    let mut builder = regex_automata::dense::Builder::new();
    builder.anchored(true);
    builder.unicode(true);
    builder
        .build(&pattern)
        .map_err(|_| WordcutError::CannotCompileClusterRules(pattern))
}
/// Loads split rules from `path` and compiles them into a regex.
///
/// Each line is trimmed, wrapped in parentheses, and all lines are joined
/// with `|` into a single pattern.
///
/// # Errors
///
/// * `CannotOpenSplitRulesAt` when the file cannot be opened.
/// * `CannotReadClusterRule` when a line cannot be read.
/// * `CannotCompileSplitRules` (carrying the combined pattern) when the
///   pattern does not compile.
pub fn load_split_rules(path: &Path) -> Result<SplitRulesMatcher, WordcutError> {
    let file = File::open(path)
        .map_err(|_| WordcutError::CannotOpenSplitRulesAt(path.to_string_lossy().to_string()))?;
    // NOTE(review): a failed read is reported with the cluster-rule variant;
    // there is no split-rule counterpart in `WordcutError` - confirm intent.
    let alternatives = io::BufReader::new(file)
        .lines()
        .map(|line| {
            line.map(|line| format!("({})", line.trim()))
                .map_err(|_| WordcutError::CannotReadClusterRule)
        })
        .collect::<Result<Vec<String>, WordcutError>>()?;
    let pattern = alternatives.join("|");
    Regex::new(&pattern).map_err(|_| WordcutError::CannotCompileSplitRules(pattern))
}
#[cfg(test)]
mod tests {
    extern crate serde_json;
    use super::*;
    use DagEdge;
    use EdgeType;
    use TextRange;
    use Wordcut;

    // A single-word tree has a final transition for that word and none for others.
    #[test]
    fn test_prefix_tree() {
        let prefix_tree = super::create_prefix_tree(&["A"]);
        assert_eq!(
            prefix_tree.seek(&(0, 0, 'A')),
            Some(&(0 as u32, true, Some(true)))
        );
        assert_eq!(prefix_tree.seek(&(0, 0, 'B')), None);
    }

    // Segmentation reported as char-index ranges.
    #[test]
    fn test_segment() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment("กากกา");
        let expected = vec![TextRange { s: 0, e: 3 }, TextRange { s: 3, e: 5 }];
        assert_eq!(ranges, expected)
    }

    // Segmentation reported as UTF-8 byte ranges (three bytes per Thai char).
    #[test]
    fn test_segment_into_byte_ranges() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let ranges = wordcut.segment_into_byte_ranges("กากกา");
        let expected = vec![TextRange { s: 0, e: 9 }, TextRange { s: 9, e: 15 }];
        assert_eq!(ranges, expected)
    }

    // Segmentation reported as owned strings.
    #[test]
    fn test_segment_to_strings() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        let toks = wordcut.segment_into_strings("กากกา");
        let expected = vec![String::from("กาก"), String::from("กา")];
        assert_eq!(toks, expected)
    }

    // Segmented words joined with a delimiter.
    #[test]
    fn test_put_delimiters() {
        let dict = super::create_prefix_tree(&["กา", "กาก"]);
        let wordcut = Wordcut::new(dict);
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // The two-word fixture file loads as two lines.
    #[test]
    fn test_load_wordlist() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let v = super::load_wordlist(path);
        assert_eq!(v.unwrap(), vec![String::from("กา"), String::from("กาก")])
    }

    // End to end: dictionary loaded from the fixture file.
    #[test]
    fn test_wordcut() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(wordcut.put_delimiters("กากกา", "|"), String::from("กาก|กา"))
    }

    // Text is normalised by the replacer before segmentation.
    #[test]
    fn test_wordcut_with_replacer() {
        let dict = super::create_prefix_tree(&["ข้อ", "รับ", "สำหรับ", "เสนอ"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "สําหรับข้อเสนอ");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("สำหรับ|ข้อ|เสนอ")
        )
    }

    // Same as above with the replaced pattern occurring twice.
    #[test]
    fn test_wordcut_with_replacer_two_occurs() {
        let dict = super::create_prefix_tree(&["กำลัง", "ทำ", "พยายาม", "ลัง", "ให้"]);
        let wordcut = Wordcut::new(dict);
        let rule = r###"{"pattern": "ํา", "replacement": "ำ"}"###;
        let rule: replacer::Rule = serde_json::from_str(rule).unwrap();
        let imm_rules = replacer::ImmRule::from_rules(&vec![rule]).unwrap();
        let mod_text = replacer::replace(&imm_rules, "กําลังพยายามทําให้");
        assert_eq!(
            wordcut.put_delimiters(&mod_text, "|"),
            String::from("กำลัง|พยายาม|ทำ|ให้")
        )
    }

    // A run of Latin letters is split out by the default split rules.
    #[test]
    fn test_wordcut_with_latin() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("ฑฑACญญ", "|"),
            String::from("ฑฑ|AC|ญญ")
        )
    }

    // Whitespace between words becomes its own token.
    // NOTE(review): the test name mentions two spaces but the literals contain
    // a single space - confirm the intended input.
    #[test]
    fn test_wordcut_with_two_spaces() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("กา มา", "|"),
            String::from("กา| |มา")
        )
    }

    // Whitespace between unknown words becomes its own token.
    // NOTE(review): the test name mentions two spaces but the literals contain
    // a single space - confirm the intended input.
    #[test]
    fn test_wordcut_with_two_spaces_unk() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("แแ ยย", "|"),
            String::from("แแ| |ยย")
        )
    }

    // Unicode double quotes are separated from the surrounding words.
    #[test]
    fn test_wordcut_with_unicode_quote() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new(dict.unwrap());
        assert_eq!(
            wordcut.put_delimiters("“ฆกากา”", "|"),
            String::from("“|ฆ|กา|กา|”")
        )
    }

    // DAG built through the `Wordcut` method.
    #[test]
    fn test_dag() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path).unwrap();
        let wordcut = Wordcut::new(dict);
        let dag = wordcut.build_dag("กากกา");
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // DAG built through the free function.
    #[test]
    fn test_dag_in_object() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"กากกา".chars().collect());
        let expected = vec![
            vec![DagEdge {
                s: 0,
                e: 0,
                etype: EdgeType::Init,
            }],
            vec![DagEdge {
                s: 0,
                e: 1,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 0,
                e: 2,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 0,
                e: 3,
                etype: EdgeType::Dict,
            }],
            vec![DagEdge {
                s: 3,
                e: 4,
                etype: EdgeType::Unk,
            }],
            vec![DagEdge {
                s: 3,
                e: 5,
                etype: EdgeType::Dict,
            }],
        ];
        assert_eq!(dag, expected);
    }

    // An empty text yields only the initial node.
    #[test]
    fn test_dag_empty() {
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/thai2words.txt"));
        let dict = super::load_dict(&path);
        let dag = super::build_dag(&dict.unwrap(), &"".chars().collect());
        let expected = vec![vec![DagEdge {
            s: 0,
            e: 0,
            etype: EdgeType::Init,
        }]];
        assert_eq!(dag, expected);
    }

    // The DAG serialises to the expected JSON shape.
    #[test]
    fn test_dag_to_json() {
        let dag = vec![vec![DagEdge {
            s: 0,
            e: 0,
            etype: EdgeType::Init,
        }]];
        let s = serde_json::to_string(&dag).unwrap();
        assert_eq!(s, "[[{\"s\":0,\"e\":0,\"etype\":\"Init\"}]]");
    }

    // Cluster ids are assigned from the end of the text backwards.
    #[test]
    fn test_find_clusters() {
        let text = "กาแกกก์A";
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let clusters = find_clusters(
            text,
            &byte_to_char_idx_map,
            &cluster_re,
            text.chars().count(),
        );
        assert_eq!(clusters, vec![2, 2, 1, 1, 1, 1, 1, 0]);
    }

    // Cluster rules keep an unknown cluster in one piece.
    #[test]
    fn test_wordcut_with_clusters() {
        let text = "แมวแฐแกกก์มา";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("แมว|||แฐแกกก์|||มา")
        );
    }

    #[test]
    fn test_wordcut_with_clusters_portsmouth() {
        let text = "จากพอร์ตสมัธไป";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("จาก|||พอร์ต|||สมัธ|||ไป")
        );
    }

    #[test]
    fn test_wordcut_with_clusters2() {
        let text = "มีรีเควสต์อะไร";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("มี|||รี|||เค|||วสต์|||อะไร")
        );
    }

    // Khmer dictionary and cluster rules.
    #[test]
    fn test_wordcut_khmer_cluster_basic() {
        let text = "ឡារី";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/khmer_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/khmerdict.txt"));
        let dict = super::load_dict(&path);
        let wordcut = Wordcut::new_with_cluster_re(dict.unwrap(), cluster_re);
        assert_eq!(wordcut.put_delimiters(text, "|||"), String::from("ឡា|||រី"));
    }

    // Drives `RuleBasedEdgeBuilder` position by position.
    #[test]
    fn test_rule_based_edge_builder() {
        // The input starts with exactly TWO spaces. The whitespace match then
        // covers chars 0..2 and the "AB" match covers chars 2..4, which is what
        // the assertions below rely on (no edge at i = 0, an edge at i = 1,
        // no edge at i = 2, and an edge with `p: 2` at i = 3).
        let text = "  ABก";
        let text_chars: Vec<char> = text.chars().collect();
        let byte_to_char_idx_map = create_byte_to_char_idx_map(text);
        let mut builder =
            RuleBasedEdgeBuilder::new(&byte_to_char_idx_map, text, &DEFAULT_THAI_SPLIT_RE);
        let mut path = vec![];
        path.push(Edge {
            w: 10,
            unk: 20,
            p: 0,
            etype: EdgeType::Init,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 0,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 20,
            unk: 30,
            p: 0,
            etype: EdgeType::Unk,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 1,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        path.push(Edge {
            w: 30,
            unk: 40,
            p: 0,
            etype: EdgeType::Pat,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 2,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_none());
        path.push(Edge {
            w: 50,
            unk: 60,
            p: 0,
            etype: EdgeType::Unk,
        });
        let edge = builder.build(
            &EdgeBuildingContext {
                text: &text_chars,
                i: 3,
                ch: '\0',
                left_boundary: 0,
                best_edge: None,
            },
            &path,
        );
        assert!(edge.is_some());
        let edge = edge.unwrap();
        assert_eq!(
            edge,
            Edge {
                w: 31,
                unk: 40,
                p: 2,
                etype: EdgeType::Pat
            }
        );
    }

    // Custom split rules loaded from a file.
    #[test]
    fn test_wordcut_with_split_rules() {
        let text = "AB X(A)/12";
        let cluster_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let split_path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_split_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&cluster_path).unwrap();
        let path = super::Path::new(concat!(env!("CARGO_MANIFEST_DIR"), "/data/words_th.txt"));
        let dict = super::load_dict(&path);
        let split_re = load_split_rules(&split_path).unwrap();
        let wordcut =
            Wordcut::new_with_cluster_re_and_split_re(dict.unwrap(), cluster_re, split_re);
        assert_eq!(
            wordcut.put_delimiters(text, "|||"),
            String::from("AB||| |||X|||(|||A|||)|||/|||12")
        );
    }

    // The cluster path has one entry per byte plus the initial node.
    #[test]
    fn test_find_clusters_path() {
        let path = super::Path::new(concat!(
            env!("CARGO_MANIFEST_DIR"),
            "/data/thai_cluster_rules.txt"
        ));
        let cluster_re = super::load_cluster_rules(&path).unwrap();
        let cluster_path = find_cluster_path(&cluster_re, "เกียำ");
        assert_eq!(cluster_path.len(), 16);
        assert_eq!(cluster_path[15].p, 9);
    }
}