tokenizer/tokenizer/mod.rs
//! The `Tokenizer` trait and some implementations of it.
//!
//! This module is where the [Tokenizer](trait.Tokenizer.html) trait is defined.
//! It contains sub-modules that implement the trait for particular languages.
//!
//! Current list of sample tokenizer implementations:
//! - English
//! - Thai
9
10#[cfg(not(feature="single-thread"))]
11use std::sync::{Arc, RwLock, Weak};
12
13#[cfg(feature="single-thread")]
14use std::{cell::RefCell, rc::{Rc, Weak}};
15
/// Shared-ownership handle used for tree nodes when the `single-thread`
/// feature is disabled: an `Arc` around an `RwLock`.
#[cfg(not(feature="single-thread"))]
type MultiOwn<T> = Arc<RwLock<T>>;

/// Shared-ownership handle used for tree nodes when the `single-thread`
/// feature is enabled: an `Rc` around a `RefCell`.
#[cfg(feature="single-thread")]
type MultiOwn<T> = Rc<RefCell<T>>;
21
/// Operations currently supported on a tree node.
///
/// A Rust tree is not allowed to have a cyclic ownership relation, so a node
/// link is wrapped in `Weak` on one end and in either `Rc` or `Arc` on the
/// other end. Both of those types require interior mutability before the data
/// they hold can be mutated, which leaves a dilemma over whether the trait
/// methods should take `&self` or `&mut self`.
///
/// This trait avoids making that decision by consuming `self` instead. That
/// makes it very obvious that the handle to the parent node has to be cloned
/// if it is still needed; otherwise the call consumes the parent handle itself.
pub trait TreeOp<T> {
    /// Add a child node holding `value` to the tree.
    ///
    /// The child's level is one more than the level of the node it is added to.
    ///
    /// NOTE(review): the previous wording also said this "will not increment
    /// unknown_count"; no `unknown_count` is visible in this file — confirm
    /// whether that remark is still relevant.
    fn add_child(self, value: T) -> Self;

    /// Get the level of this node. The root node has level 0, a child of the
    /// root has level 1, and so on.
    fn level(&self) -> usize;

    /// Consume the node and return a `Vec` holding the values from the root
    /// down to this node.
    ///
    /// Only the values on this node's own branch are included: sibling nodes
    /// are excluded, and so are the children of this node.
    fn into_vec(self) -> Vec<T>;
}
44
/// A node in a tree of possible tokenization results.
///
/// The relation between parent and child nodes in Rust requires either `Rc`
/// or `Arc`. Which one is used depends on the `single-thread` feature:
/// - feature disabled: the node is wrapped in `Arc<RwLock<TreeNode<T>>>`
/// - feature enabled: the node is wrapped in `Rc<RefCell<TreeNode<T>>>`
///
/// Rust prohibits cyclic ownership, so either the parent or the child needs
/// to hold a `Weak` pointer. In the current design the parent holds `Weak`
/// references to its children. This makes it possible to drop some unused
/// children while the parent stays alive.
///
/// The root node has no value and no parent, so both the value and the parent
/// are wrapped inside `Option`.
///
/// Both `Arc` and `Rc` only share immutable access to the value they own, but
/// child nodes need to be added later. The node is therefore wrapped in an
/// interior-mutability type: `RwLock` when `Arc` is used, `RefCell` when `Rc`
/// is used.
///
/// Each node also knows its own level, so finding the minimum and maximum
/// depth of the tree only requires checking the leaf nodes instead of
/// traversing the entire tree.
#[derive(Debug)]
// NOTE(review): blanket lint suppression; in this file `childs` is written
// but never read — presumably that is why it is here. Confirm.
#[allow(unused)]
pub(crate) struct TreeNode<T> {
    /// Level of the node in the current tree. The root node is at level 0 and
    /// children of the root are at level 1.
    level: usize,
    /// Value of this node. Each node shall represent exactly one token.
    /// The root node does not have a value.
    value: Option<T>,

    /// Strong reference to the parent node. As long as a child node has not
    /// been dropped, its parent stays alive. `None` for the root node.
    parent: Option<MultiOwn<TreeNode<T>>>,
    /// Weak references to the children of this node. A child may already have
    /// been dropped, in which case its entry can no longer be upgraded.
    #[cfg(not(feature="single-thread"))]
    childs: Vec<Weak<RwLock<TreeNode<T>>>>,
    /// Weak references to the children of this node. A child may already have
    /// been dropped, in which case its entry can no longer be upgraded.
    #[cfg(feature="single-thread")]
    childs: Vec<Weak<RefCell<TreeNode<T>>>>
}
82
83#[allow(unused)]
84impl<T> TreeNode<T> {
85 /// Since every tree operation require wrapping itself in `Arc<RwLock<>>`, it would
86 /// make user have easier usage by simply return `Arc<RwLock<TreeNode<T>>>`.
87 #[cfg(not(feature="single-thread"))]
88 #[allow(dead_code)]
89 fn root() -> Arc<RwLock<TreeNode<T>>> {
90 Arc::new(RwLock::new(TreeNode {
91 level: 0,
92 value: None,
93
94 parent: None,
95 childs: Vec::new()
96 }))
97 }
98 /// Since every tree operation require wrapping itself in `Rc<RefCell<>>`, it would
99 /// make user have easier usage by simply return `Rc<RefCell<TreeNode<T>>>`.
100 #[cfg(feature="single-thread")]
101 fn root() -> Rc<RefCell<TreeNode<T>>> {
102 Rc::new(RefCell::new(TreeNode {
103 level: 0,
104 value: None,
105
106 parent: None,
107 childs: Vec::new()
108 }))
109 }
110}
111
112/// Directly implement `TreeOp<T>` for both `Arc<RwLock<TreeNode<T>>>` and
113/// `Rc<RefCell<TreeNode<T>>>` so caller can have easy access to some of
114/// node properties.
115impl<T> TreeOp<T> for MultiOwn<TreeNode<T>> where T: Copy {
116 #[cfg(not(feature="single-thread"))]
117 fn add_child(self, value: T) -> MultiOwn<TreeNode<T>> {
118 let level = self.read().unwrap().level;
119 let child = Arc::new(RwLock::new(TreeNode {
120 level: level + 1,
121 value: Some(value),
122 parent: Some(Arc::clone(&self)),
123 childs: Vec::new()
124 }));
125
126 self.write().unwrap().childs.push(Arc::downgrade(&child));
127
128 child
129 }
130 #[cfg(feature="single-thread")]
131 fn add_child(self, value: T) -> MultiOwn<TreeNode<T>> {
132 let level = self.borrow().level;
133 let child = Rc::new(RefCell::new(TreeNode {
134 level: level + 1,
135 value: Some(value),
136 parent: Some(Rc::clone(&self)),
137 childs: Vec::new()
138 }));
139
140 self.borrow_mut().childs.push(Rc::downgrade(&child));
141
142 child
143 }
144
145 fn level(&self) -> usize {
146 #[cfg(not(feature="single-thread"))]
147 return self.read().unwrap().level;
148 #[cfg(feature="single-thread")]
149 return self.borrow().level;
150 }
151
152 fn into_vec(self) -> Vec<T> {
153 #[cfg(not(feature="single-thread"))]
154 return (&*self.read().unwrap()).into();
155 #[cfg(feature="single-thread")]
156 return (&*self.borrow()).into();
157 }
158}
159
160/// Convert branch of tree from given node up to root node into a Vec<T>.
161///
162/// If the given node is a root node or the node has no value, it'll panic.
163///
164/// This is shallow type conversion thus `T` must implement `Copy`.
165/// It is automatically implement for most of built-in Rust type, including borrowed value.
166impl<T> std::convert::From<&TreeNode<T>> for Vec<T> where T: Copy {
167
168 fn from(node: &TreeNode<T>) -> Vec<T> {
169 let mut v = Vec::with_capacity(node.level);
170
171 #[cfg(not(feature="single-thread"))]
172 fn traverse_tree<T>(node: &MultiOwn<TreeNode<T>>, vec: &mut Vec<T>) where T: Copy {
173 let actual_node = node.read().unwrap();
174
175 if let Some(ref parent) = actual_node.parent {
176 traverse_tree(parent, vec);
177 // Add value here as it is not a root node.
178 vec.push(*actual_node.value.as_ref().unwrap());
179 }
180 }
181 #[cfg(feature="single-thread")]
182 fn traverse_tree<T>(node: &MultiOwn<TreeNode<T>>, vec: &mut Vec<T>) where T: Copy {
183 let actual_node = node.borrow();
184
185 if let Some(ref parent) = actual_node.parent {
186 traverse_tree(parent, vec);
187 // Add value here as it is not a root node.
188 vec.push(*actual_node.value.as_ref().unwrap());
189 }
190 }
191
192 if let Some(ref parent) = node.parent {
193 traverse_tree(parent, &mut v);
194 }
195
196 if node.value.is_none() {
197 panic!("The given node has no value. Either it is a root node or it is improper constructed node.");
198 }
199
200 v.push(*node.value.as_ref().unwrap());
201
202 v.into()
203 }
204}
205
/// A trait that every tokenizer should implement.
pub trait Tokenizer {
    /// Tokenize the given `text` and return the tokens as a `Vec<&str>`.
    ///
    /// Each `&str` in the returned `Vec` is meant to be a slice of `text`; the
    /// returned slices share the lifetime `'a` of the input.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}
212
213pub mod en;
214pub mod th;
215
216#[cfg(test)]
217mod tests;