//! Dictionary-based Thai word tokenizer
//!
//! It uses a dictionary to look up known words. If there are multiple possible
//! ways to tokenize the text, it chooses the result with the fewest words.
//! This is known as the "maximum matching" algorithm. It is one of the most
//! commonly used algorithms that produce acceptable quality.
//!
//! It can handle some unknown words. It does so by minimizing the number of
//! characters that need to be taken off from the text until a known word is found.

use crate::dict::{SizedNode, terminals_prefix};
use super::MultiOwn;
use super::{TreeOp, TreeNode};

/// Extra metadata required to get a proper tokenization of Thai text.
#[allow(dead_code)]
struct LeafNode<T> {
    /// An actual leaf node on the tree.
    node: T,
    /// A byte count of unknown characters on this branch of the tokenization tree.
    unknown_bytes_count: usize
}

/// Make a result tree that contains all possible combinations of `unit`.
/// A caller typically needs to do `make_result_tree(&*dict.root, "SOME_TEXT_TO_PARSE", root_node, &mut leaves)`.
///
/// Typically, a `unit` is a word.
///
/// # Parameters
/// - `nodes` - Set of root dictionary nodes, typically found in [SizedDict](struct.SizedDict.html#field.root).
/// - `value` - A string to be parsed with the given dictionary.
/// - `parent` - A [TreeNode](struct.TreeNode.html) that will be the root node of all the parsed units.
/// - `leaves` - A Vec that collects all the possible leaf nodes.
#[allow(dead_code)]
fn make_result_tree<'a>(nodes: &[SizedNode], value: &'a str, parent: MultiOwn<TreeNode<&'a str>>, leaves: &mut Vec<LeafNode<MultiOwn<TreeNode<&'a str>>>>) {
    #[cfg(not(feature="single-thread"))]
    fn add_child<'a>(parent: &MultiOwn<TreeNode<&'a str>>, value: &'a str, upto: usize) -> MultiOwn<TreeNode<&'a str>> {
        std::sync::Arc::clone(parent).add_child(&value[..upto])
    }

    #[cfg(feature="single-thread")]
    fn add_child<'a>(parent: &MultiOwn<TreeNode<&'a str>>, value: &'a str, upto: usize) -> MultiOwn<TreeNode<&'a str>> {
        std::rc::Rc::clone(parent).add_child(&value[..upto])
    }

    /// Consume a portion of `value` until either the entire value is consumed or token(s) are found.
    ///
    /// If it consumed the entire value, it adds a leaf node to the `leaves` vec.
    ///
    /// If it consumed a chunk of the value, it adds a node to the parent node, points the parent
    /// pointer to the new node, points `value` to the remaining slice, and adds all potential
    /// known tokens to the results vec.
    ///
    /// In any case, it updates `consumed_bytes` but not `accumulated_unknown_bytes`.
    #[inline(always)]
    fn consume_unknown<'a>(nodes: &[SizedNode], value: &mut &'a str, accumulated_unknown_bytes: usize, consumed_bytes: &mut usize, parent: &mut MultiOwn<TreeNode<&'a str>>, results: &mut Vec<usize>, leaves: &mut Vec<LeafNode<MultiOwn<TreeNode<&'a str>>>>) {
        // Take off characters one at a time and repeatedly re-evaluate whether
        // the remainder starts with a known word.
        let mut chars = value.chars();
        while let Some(c) = chars.next() {
            *consumed_bytes += c.len_utf8();
            terminals_prefix(nodes, value, *consumed_bytes, results);
            if !results.is_empty() {
                // Found an offset where the subsequent chars form a known word.
                // Make a new node and make it the parent of the subsequent valid words.
                *parent = add_child(parent, value, *consumed_bytes);
                // Shift value by the consumed bytes, as the unknown word boundary is at consumed bytes.
                *value = &value[*consumed_bytes..];
                // Since the value to evaluate is a slice whose start is shifted by the consumed
                // bytes, each offset in the results vec needs to be reduced by the consumed bytes.
                results.iter_mut().for_each(|val| *val -= *consumed_bytes);
                // Stop the lookup, as a known word is found.
                break;
            }
        }

        if results.is_empty() {
            // The entire value is consumed and no known word was found.
            // That means either there is a trailing unknown word or the entire value is an unknown
            // word. No need to look up more; simply make the entire value a single leaf node.
            let node = add_child(parent, value, *consumed_bytes);
            leaves.push(LeafNode {
                node,
                unknown_bytes_count: accumulated_unknown_bytes + *consumed_bytes,
            });
        }
    }

    let mut eval_queue = std::collections::VecDeque::new();
    eval_queue.push_back((value, parent, 0)); // 0 is the accumulated unknown bytes of the given tree branch
    let mut results = Vec::new();

    while let Some((mut value, mut parent, accumulated_unknown_bytes)) = eval_queue.pop_front() {
        // Each branch has its own remaining value to evaluate, parent node, and accumulated unknown bytes.
        results.clear();
        terminals_prefix(nodes, value, 0, &mut results);
        // A count of prefix bytes belonging to an unknown word, which need to be consumed in
        // order to reach a known word.
        let mut consumed_bytes = 0;

        // No match for any entry in the dictionary.
        if results.is_empty() && !value.is_empty() {
            consume_unknown(nodes, &mut value, accumulated_unknown_bytes, &mut consumed_bytes, &mut parent, &mut results, leaves);
        }

        let new_accumulated_unknown_bytes = accumulated_unknown_bytes + consumed_bytes;

        // Each result is the end offset of a known word.
        results.iter().for_each(|offset| {
            let node = add_child(&parent, value, *offset);
            let remain = &value[(*offset)..];
            if !remain.is_empty() {
                // More value remains to try.
                eval_queue.push_back((remain, node, new_accumulated_unknown_bytes));
            } else {
                // No more value; add the matched terminal to the `leaves` vec.
                leaves.push(LeafNode {node, unknown_bytes_count: new_accumulated_unknown_bytes});
            }
        });
    }
}
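// A minimal test sketch of the "fewest words" preference described in the module docs
// above. The dictionary words and the module name are illustrative, not part of the
// crate; it assumes the `From<&[&str]>` impl and the `tokenize` trait impl defined
// later in this file.
#[cfg(test)]
mod fewest_words_sketch {
    use crate::tokenizer::Tokenizer as _; // bring the `tokenize` trait method into scope

    #[test]
    fn prefers_single_longest_word() {
        // "กรุงเทพมหานคร" can be split as ["กรุงเทพ", "มหานคร"] (two words) or kept
        // whole (one word). Maximum matching should choose the single word.
        let tokenizer = super::Tokenizer::from(&["กรุงเทพ", "มหานคร", "กรุงเทพมหานคร"][..]);
        assert_eq!(tokenizer.tokenize("กรุงเทพมหานคร"), vec!["กรุงเทพมหานคร"]);
    }
}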
/// Maximal matching algorithm with unknown word support.
///
/// This is an implementation based on the concept of maximum matching.
/// See this [Wikipedia page](https://en.wikipedia.org/wiki/Matching_(graph_theory)#Maximal_matchings)
/// for a brief explanation of the algorithm.
///
/// It takes a dictionary in the form of `&[SizedNode]` and a text to be tokenized.
/// # Parameters
/// - `dict` - A slice of [dict::SizedNode](/tokenizer/dict/struct.SizedNode.html) which
/// can be obtained from the `root` field of [dict::SizedDict](/tokenizer/dict/struct.SizedDict.html).
/// - `text` - A string slice to be tokenized.
/// # Return
/// A vec containing the tokenized word slices.
fn maximal_matching<'a>(dict: &[SizedNode], text: &'a str) -> Vec<&'a str> {
    /// There are three possible states for a vertex.
    #[derive(Clone)]
    enum VertexState {
        /// A vertex that nobody has visited yet.
        None,
        /// A vertex that was recognized in advance while attempting to isolate an unknown word.
        Cache(Vec<usize>),
        /// A vertex that was visited by the breadth-first-search strategy.
        Some(Vec<usize>)
    }

    impl Default for VertexState {
        /// By default, `VertexState` is `None`.
        fn default() -> VertexState {
            VertexState::None
        }
    }

    impl VertexState {
        /// Take the current value out of this `VertexState` and leave `None` in its place.
        /// If the current state is `None`, it returns `Option::None`.
        pub fn take(&mut self) -> Option<Vec<usize>> {
            let value = std::mem::take(self);
            match value {
                VertexState::None => None,
                VertexState::Cache(v) | VertexState::Some(v) => {
                    Some(v)
                },
            }
        }
    }

    /// Take as few bytes as necessary until the first known word is found.
    /// It needs a dictionary to identify a known word. It increments an offset one character
    /// at a time until there's a prefix match with a dictionary entry.
    /// For example, if the text is "abcdef", "ab" is an unknown word, and "cd" and "cde" are
    /// known words in the dictionary, then this function will put 4 and 5 into the `results`
    /// vec (the end offsets of "cd" and "cde") and return 2, which is the unknown word boundary.
    ///
    /// # Parameters
    /// - `nodes` - A slice of [dict::SizedNode](/tokenizer/dict/struct.SizedNode.html) which
    /// can be obtained from the `root` field of [dict::SizedDict](/tokenizer/dict/struct.SizedDict.html).
    /// - `value` - A string slice in which to find the offset of the unknown word.
    /// - `offset` - A position at which to start isolating the unknown word.
    /// - `results` - A vec which will store the end offsets of known words that come right after the isolated unknown word.
    /// # Return
    /// It returns the offset of the unknown word boundary.
    /// For example, if the text is "abcdef", "cd" is the only unknown word, and `offset` is 2,
    /// it will return 4. The caller can directly take the slice `&value[2..4]` to obtain that "cd".
    fn consume_unknown(nodes: &[SizedNode], value: &str, offset: usize, results: &mut Vec<usize>) -> usize {
        // Take off characters one at a time and repeatedly re-evaluate whether the remainder
        // starts with a known word.
        let mut consumed_bytes = offset;
        let mut chars = value[consumed_bytes..].chars();
        while let Some(c) = chars.next() {
            consumed_bytes += c.len_utf8();
            terminals_prefix(nodes, value, consumed_bytes, results);
            if !results.is_empty() {
                // Stop the lookup, as a known word is found.
                break;
            }
        }
        consumed_bytes
    }

    // A dynamic vec of per-byte vertex states that simulates the word graph.
    let mut vertices = vec![VertexState::None; text.len()];
    // `branches` is a reusable vec to temporarily hold the possible branches from any particular vertex.
    let mut branches = Vec::with_capacity(text.len());
    // `queue` is a processing queue of vertices to be processed.
    // Pushing vertices to this queue makes the graph traversal "breadth-first".
    let mut queue = Vec::with_capacity(text.len() / 2);
    queue.push([0, 0, 0]); // previous pointer, current vertex index, unknown bytes count
    // Assuming the text has at least 2 chars per word.
    let mut result = std::collections::VecDeque::with_capacity(text.len() / 2);
    let mut i = 0;
    let mut not_ended = true;

    while not_ended {
        // Retrieve the next offset.
        let [_, offset, unknown_len] = queue[i];
        branches.clear();

        match vertices[offset] {
            VertexState::None => {
                // Find the next prefixes from offset.
                terminals_prefix(dict, text, offset, &mut branches);
                // Create the state of the vertex, which is a Vec that contains offsets to other vertices.
                let mut vertex = Vec::with_capacity(branches.len());
                for v in branches.iter() {
                    vertex.push(*v);
                    // Add the next offset to the vertex state.
                    queue.push([i, *v, unknown_len]);
                    if *v >= text.len() {
                        not_ended = false;
                        break;
                    }
                }

                if !branches.is_empty() {
                    // Known word case.
                    vertices[offset] = VertexState::Some(vertex);
                } else {
                    // No known word matched, so not even a single branch was returned.
                    let cur_unknown_length = consume_unknown(dict, text, offset, &mut branches);
                    queue.push([i, cur_unknown_length, unknown_len + cur_unknown_length - offset]);
                    // Identified the unknown word boundary.
                    vertices[offset] = VertexState::Some(vec![cur_unknown_length]);

                    if cur_unknown_length >= text.len() {
                        // The unknown word is trailing in the text.
                        break; // No need to do any further processing.
                    }

                    // All the returned branches are one step ahead of the other branches, so don't
                    // push them to the queue yet, or it would break the breadth-first strategy.
                    let mut peeked = branches.clone();
                    peeked.shrink_to_fit(); // The branch size shall never change.
                    vertices[cur_unknown_length] = VertexState::Cache(peeked);
                }
            },
            VertexState::Cache(_) => {
                // Reached the peeked branches. Push all these branches into the processing queue.
                let nexts = vertices[offset].take();
                if let Some(nexts) = nexts {
                    // Attempt to add each vertex to the processing queue.
                    for v in nexts.iter() {
                        queue.push([i, *v, unknown_len]);
                        if *v >= text.len() {
                            // There is a vertex that already reached the last char of the text.
                            not_ended = false;
                            break
                        }
                    }
                    vertices[offset] = VertexState::Some(nexts); // Change the state of the vertex to Some.
                }
            },
            VertexState::Some(_) => {
                // We would need to update the back link if the vertex count is equal and the
                // unknown word count is lower, so that it picks the path with the fewest unknown
                // words. Since the result is constructed based on the queue, and the best result
                // is the last vertex in the queue, it might point to a path that has a longer
                // unknown token.
            }
        }

        i += 1;
    }

    let last_queue = queue.len() - 1;
    // Prune the remaining queue to see if there's any last-vertex candidate with fewer unknown bytes.
    // Check only up to the vertex before the last element, as the last element is currently the comparator vertex.
    while i < queue.len() - 1 {
        let [_, offset, unknown_bytes] = queue[i];
        match vertices[offset] {
            VertexState::None | VertexState::Cache(_) => {},
            VertexState::Some(ref vs) => {
                if vs.iter().any(|v| *v >= text.len()) && unknown_bytes < queue[last_queue][2] {
                    // There's at least one edge pointing to the last node with fewer unknown bytes.
                    // Redirect the last vertex's reverse link to the current vertex instead, as it
                    // is the new best vertex.
                    queue[last_queue][0] = i;
                }
            }
        }
        i += 1;
    }

    let [mut i, mut last_offset, _] = queue[last_queue]; // last element of the queue
    while i > 0 {
        let [index, offset, _] = queue[i];
        // Since offset is the offset of a vertex, and each vertex position designates the first
        // char of a word, the slice between this vertex and the previous boundary is one word.
        result.push_front(&text[offset..last_offset]);
        last_offset = offset; // move the boundary back to the beginning of this word
        i = index; // move the index to the previous node in the queue
    }

    // The first word.
    result.push_front(&text[0..last_offset]);

    result.into()
}
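// A minimal test sketch of the unknown word handling described above, mirroring the
// "abcdef"-style example in the docs of `consume_unknown`. The dictionary words and
// the module name are illustrative, not part of the crate.
#[cfg(test)]
mod unknown_word_sketch {
    use crate::tokenizer::Tokenizer as _;

    #[test]
    fn isolates_unknown_word_in_the_middle() {
        // "x" is not in the dictionary, so it should be isolated as its own token
        // between the two known words.
        let tokenizer = super::Tokenizer::from(&["ab", "cd"][..]);
        assert_eq!(tokenizer.tokenize("abxcd"), vec!["ab", "x", "cd"]);
    }
}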
/// Dictionary-based Thai text tokenizer.
pub struct Tokenizer {
    dict: crate::dict::SizedDict,
}

impl Tokenizer {
    /// Construct a Thai tokenizer using the given path as a dictionary.
    ///
    /// Thai text tokenization relies on a dictionary or corpus, depending on the algorithm
    /// being used to identify a token.
    ///
    /// In this implementation, we chose the dictionary approach and use the "maximum matching"
    /// algorithm. The quality of tokenization depends on the quality of the dictionary.
    ///
    /// The dictionary should be a text file where each line is an individual word.
    /// The content of the text file should have the following format:
    /// ```txt
    /// กรุงเทพ
    /// กรุงเทพมหานคร
    /// จังหวัด
    /// ```
    pub fn new<P: AsRef<std::path::Path>>(dict_path: P) -> std::io::Result<Tokenizer> {
        Ok(Tokenizer {
            dict: crate::dict::Dict::load_txt(dict_path)?.into()
        })
    }
}
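// A hypothetical usage sketch of `Tokenizer::new`. The path below is a placeholder,
// not a file shipped with this crate, so the test is marked `#[ignore]`.
#[cfg(test)]
mod file_dict_sketch {
    #[test]
    #[ignore] // requires a one-word-per-line dictionary file on disk at the placeholder path
    fn construct_from_txt_dictionary() {
        let tokenizer = super::Tokenizer::new("dict.txt");
        assert!(tokenizer.is_ok());
    }
}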
/// Create a tokenizer from a slice of `&str`, using the slice as the dictionary.
impl From<&[&str]> for Tokenizer {
    fn from(slice: &[&str]) -> Tokenizer {
        let mut dict = crate::dict::Dict::new();
        slice.iter().for_each(|word| dict.add(word));
        Tokenizer {
            dict: dict.into()
        }
    }
}

/// Create a tokenizer from a slice of `&String`, using the slice as the dictionary.
impl From<&[&String]> for Tokenizer {
    fn from(slice: &[&String]) -> Tokenizer {
        let mut dict = crate::dict::Dict::new();
        slice.iter().for_each(|word| dict.add(word));
        Tokenizer {
            dict: dict.into()
        }
    }
}

/// Create a tokenizer from a slice of `String`, using the slice as the dictionary.
impl From<&[String]> for Tokenizer {
    fn from(slice: &[String]) -> Tokenizer {
        let mut dict = crate::dict::Dict::new();
        slice.iter().for_each(|word| dict.add(word));
        Tokenizer {
            dict: dict.into()
        }
    }
}
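// A minimal test sketch of the `From<&[String]>` impl above. The words and the module
// name are illustrative; it also exercises the whitespace splitting performed by the
// `tokenize` impl below.
#[cfg(test)]
mod string_dict_sketch {
    use crate::tokenizer::Tokenizer as _;

    #[test]
    fn construct_from_owned_strings() {
        let words: Vec<String> = vec!["ab".to_string(), "cd".to_string()];
        let tokenizer = super::Tokenizer::from(words.as_slice());
        // Whitespace separates chunks before dictionary lookup, so both chunks
        // tokenize independently.
        assert_eq!(tokenizer.tokenize("cd ab"), vec!["cd", "ab"]);
    }
}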
impl crate::tokenizer::Tokenizer for Tokenizer {
    fn tokenize<'b>(&self, value: &'b str) -> Vec<&'b str> {
        #[cfg(not(feature="single-thread"))]
        use rayon::iter::ParallelIterator;

        #[cfg(not(feature="single-thread"))]
        fn make_iter(raw: &str) -> rayon::str::SplitWhitespace {
            use rayon::prelude::*;
            raw.par_split_whitespace()
        }

        #[cfg(feature="single-thread")]
        fn make_iter(raw: &str) -> std::str::SplitWhitespace {
            raw.split_whitespace()
        }

        make_iter(value).map(|boundary| {
            // The tree-based approach below is kept, commented out, for reference. It builds
            // every possible tokenization tree with `make_result_tree`, then picks the leaf
            // whose path has the fewest unknown bytes and the fewest nodes.
            // let mut leaf_nodes = Vec::new();
            // let root = TreeNode::root();
            // make_result_tree(&self.dict.root, boundary, root, &mut leaf_nodes);
            // let mut min_count = boundary.len(); // worst case length
            // let mut min_unknown = boundary.len(); // impossible case where each char is treated as an unknown token
            // let mut idx = 0;
            // // Maximum matching approach: find a path with the fewest nodes.
            // leaf_nodes.iter().enumerate().for_each(|(i, n)| {
            //     let level = n.node.level();
            //     let unknown = n.unknown_bytes_count;
            //     if unknown < min_unknown {
            //         min_unknown = unknown;
            //         if level <= min_count {
            //             // New best case: fewer unknowns and maximally matched.
            //             min_count = level;
            //             idx = i;
            //         }
            //     } else {
            //         if level < min_count {
            //             // Subjective case: more unknown tokens but better matched.
            //             // A more sophisticated approach is needed to solve this.
            //         }
            //     }
            // });
            // let expected_node = leaf_nodes.remove(idx);
            // let result = expected_node.node.into_vec();
            // result
            maximal_matching(&self.dict.root, boundary)
        }).flatten().collect()
    }
}

#[cfg(test)]
mod tests;
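// A minimal test sketch of the trailing-unknown-word case handled by `maximal_matching`
// (the "unknown word is trailing in the text" branch). The words and the module name
// are illustrative, not part of the crate.
#[cfg(test)]
mod trailing_unknown_sketch {
    use crate::tokenizer::Tokenizer as _;

    #[test]
    fn keeps_trailing_unknown_word_as_one_token() {
        let tokenizer = super::Tokenizer::from(&["ab"][..]);
        // "xy" never matches a dictionary entry, so it is emitted as a single trailing token.
        assert_eq!(tokenizer.tokenize("abxy"), vec!["ab", "xy"]);
    }
}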