Skip to main content

tokenizer/tokenizer/
mod.rs

//! The `Tokenizer` trait and some implementations of it.
//! 
//! This module is where the [Tokenizer](trait.Tokenizer.html) trait is defined.
//! It contains sub-modules that implement the trait for particular languages.
//! 
//! Current list of sample tokenizer implementations:
//! - English
//! - Thai
9
10#[cfg(not(feature="single-thread"))]
11use std::sync::{Arc, RwLock, Weak};
12
13#[cfg(feature="single-thread")]
14use std::{cell::RefCell, rc::{Rc, Weak}};
15
/// Shared-ownership handle with interior mutability used for tree nodes.
///
/// When the `single-thread` feature is NOT enabled, this is `Arc<RwLock<T>>`.
#[cfg(not(feature="single-thread"))]
type MultiOwn<T> = Arc<RwLock<T>>;

/// Shared-ownership handle with interior mutability used for tree nodes.
///
/// When the `single-thread` feature is enabled, this is `Rc<RefCell<T>>`.
#[cfg(feature="single-thread")]
type MultiOwn<T> = Rc<RefCell<T>>;
21
/// Currently supported tree operations.
/// 
/// A Rust tree cannot own its nodes in both directions, because that would be
/// a reference cycle. One end of the parent/child relation is therefore wrapped
/// in `Weak` and the other end in either `Rc` or `Arc`.
/// Both of those types require interior mutability to mutate the shared data,
/// which raises a dilemma on whether the trait methods should take `&self` or
/// `&mut self`.
/// 
/// In this case, we avoid making that decision by consuming `self` instead.
/// This makes it very obvious that the caller shall clone the parent handle
/// if it still needs it afterward. Otherwise, the call consumes the parent
/// handle itself.
pub trait TreeOp<T> {
    /// Add a child node holding `value` below this node and return the handle
    /// of the new child. The child is one level deeper than this node.
    ///
    /// NOTE(review): the previous documentation stated that this "will not
    /// increment unknown_count"; no `unknown_count` is visible in this module,
    /// so confirm whether that remark is still relevant.
    fn add_child(self, value: T) -> Self;

    /// Get the level of this node. The root node has level 0, a child of the
    /// root has level 1, and so on.
    fn level(&self) -> usize;

    /// Consume this handle and return a `Vec` holding the values from the root
    /// down to this node.
    /// It only contains the values of the nodes on this branch. All sibling
    /// nodes are excluded. The children of this node are also excluded.
    fn into_vec(self) -> Vec<T>;
}
44
/// A tree node that holds one step of a possible tokenization result.
/// 
/// The relation between parent and child nodes in Rust requires either `Rc` or `Arc`.
/// A single feature gate, `single-thread`, selects which one is used:
/// - `single-thread` disabled - nodes are wrapped in `Arc<RwLock<TreeNode<T>>>`
/// - `single-thread` enabled - nodes are wrapped in `Rc<RefCell<TreeNode<T>>>`
/// 
/// Strong references in both directions would form a cycle that is never freed,
/// so either the parent or the child needs to hold a `Weak` reference.
/// In the current design, the parent holds `Weak` references to its children
/// while each child holds a strong reference to its parent. This makes it
/// possible to drop some unused children without touching the parent.
/// 
/// The root node has neither a value nor a parent, thus both `value` and
/// `parent` are wrapped inside `Option`.
/// 
/// Both `Arc` and `Rc` only share immutable access to the owned value, but
/// adding a child requires mutating the parent. The node is therefore wrapped
/// in an interior mutability type: `RwLock` when `single-thread` is disabled
/// and `RefCell` when it is enabled.
/// 
/// Each node also knows its own level, so a user doesn't have to traverse the
/// entire tree to find the min and max depth of the tree. They only need to
/// check every leaf node.
#[derive(Debug)]
#[allow(unused)]
pub(crate) struct TreeNode<T> {
    /// Level of this node in the tree. The root node is at level 0. Children of root are at level 1.
    level: usize,
    /// Value of this node. Each node shall represent exactly one token.
    /// The root node has no value.
    value: Option<T>,

    /// Strong reference to the parent node. As long as a child node is not dropped,
    /// its parent stays alive. `None` for the root node.
    parent: Option<MultiOwn<TreeNode<T>>>,
    /// Weak references to the children of this node. It is possible that a child is already dropped.
    #[cfg(not(feature="single-thread"))]
    childs: Vec<Weak<RwLock<TreeNode<T>>>>,
    /// Weak references to the children of this node. It is possible that a child is already dropped.
    #[cfg(feature="single-thread")]
    childs: Vec<Weak<RefCell<TreeNode<T>>>>
}
82
83#[allow(unused)]
84impl<T> TreeNode<T> {
85    /// Since every tree operation require wrapping itself in `Arc<RwLock<>>`, it would
86    /// make user have easier usage by simply return `Arc<RwLock<TreeNode<T>>>`.
87    #[cfg(not(feature="single-thread"))]
88    #[allow(dead_code)]
89    fn root() -> Arc<RwLock<TreeNode<T>>> {
90        Arc::new(RwLock::new(TreeNode {
91            level: 0,
92            value: None,
93
94            parent: None,
95            childs: Vec::new()
96        }))
97    }
98    /// Since every tree operation require wrapping itself in `Rc<RefCell<>>`, it would
99    /// make user have easier usage by simply return `Rc<RefCell<TreeNode<T>>>`.
100    #[cfg(feature="single-thread")]
101    fn root() -> Rc<RefCell<TreeNode<T>>> {
102        Rc::new(RefCell::new(TreeNode {
103            level: 0,
104            value: None,
105
106            parent: None,
107            childs: Vec::new()
108        }))
109    }
110}
111
112/// Directly implement `TreeOp<T>` for both `Arc<RwLock<TreeNode<T>>>` and 
113/// `Rc<RefCell<TreeNode<T>>>` so caller can have easy access to some of
114/// node properties.
115impl<T> TreeOp<T> for MultiOwn<TreeNode<T>> where T: Copy {
116    #[cfg(not(feature="single-thread"))]
117    fn add_child(self, value: T) -> MultiOwn<TreeNode<T>> {
118        let level = self.read().unwrap().level;
119        let child = Arc::new(RwLock::new(TreeNode {
120            level: level + 1,
121            value: Some(value),
122            parent: Some(Arc::clone(&self)),
123            childs: Vec::new()
124        }));
125
126        self.write().unwrap().childs.push(Arc::downgrade(&child));
127
128        child
129    }
130    #[cfg(feature="single-thread")]
131    fn add_child(self, value: T) -> MultiOwn<TreeNode<T>> {
132        let level = self.borrow().level;
133        let child = Rc::new(RefCell::new(TreeNode {
134            level: level + 1,
135            value: Some(value),
136            parent: Some(Rc::clone(&self)),
137            childs: Vec::new()
138        }));
139
140        self.borrow_mut().childs.push(Rc::downgrade(&child));
141
142        child
143    }
144
145    fn level(&self) -> usize {
146        #[cfg(not(feature="single-thread"))]
147        return self.read().unwrap().level;
148        #[cfg(feature="single-thread")]
149        return self.borrow().level;
150    }
151
152    fn into_vec(self) -> Vec<T> {
153        #[cfg(not(feature="single-thread"))]
154        return (&*self.read().unwrap()).into();
155        #[cfg(feature="single-thread")]
156        return (&*self.borrow()).into();
157    }
158}
159
160/// Convert branch of tree from given node up to root node into a Vec<T>.
161/// 
162/// If the given node is a root node or the node has no value, it'll panic.
163/// 
164/// This is shallow type conversion thus `T` must implement `Copy`.
165/// It is automatically implement for most of built-in Rust type, including borrowed value.
166impl<T> std::convert::From<&TreeNode<T>> for Vec<T> where T: Copy {
167
168    fn from(node: &TreeNode<T>) -> Vec<T> {
169        let mut v = Vec::with_capacity(node.level);
170        
171        #[cfg(not(feature="single-thread"))]
172        fn traverse_tree<T>(node: &MultiOwn<TreeNode<T>>, vec: &mut Vec<T>) where T: Copy {
173            let actual_node = node.read().unwrap();
174            
175            if let Some(ref parent) = actual_node.parent {
176                traverse_tree(parent, vec);
177                // Add value here as it is not a root node. 
178                vec.push(*actual_node.value.as_ref().unwrap());
179            }
180        }
181        #[cfg(feature="single-thread")]
182        fn traverse_tree<T>(node: &MultiOwn<TreeNode<T>>, vec: &mut Vec<T>) where T: Copy {
183            let actual_node = node.borrow();
184            
185            if let Some(ref parent) = actual_node.parent {
186                traverse_tree(parent, vec);
187                // Add value here as it is not a root node. 
188                vec.push(*actual_node.value.as_ref().unwrap());
189            }
190        }
191
192        if let Some(ref parent) = node.parent {
193            traverse_tree(parent, &mut v);
194        }
195        
196        if node.value.is_none() {
197            panic!("The given node has no value. Either it is a root node or it is improper constructed node.");
198        }
199
200        v.push(*node.value.as_ref().unwrap());
201
202        v.into()
203    }
204}
205
/// A trait that all tokenizers should implement.
pub trait Tokenizer {
    /// Tokenize the given `text` and return a `Vec<&str>` where each `&str`
    /// inside the `Vec` is a slice of the given text.
    ///
    /// The returned slices carry the lifetime `'a` of `text`, not of the
    /// tokenizer, so they stay usable after the tokenizer is dropped.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
}
212
213pub mod en;
214pub mod th;
215
216#[cfg(test)]
217mod tests;