//! roketok/lib.rs — crate root.

1use crate::{
2    config::{Configuration, TokenConfiguration}, iter::StreamIterator, tokens::{Branch, Token, TreeNode}
3};
4
/// Provides the configurations for tokenizers,
/// the most basic being:
/// ```rust,ignore
/// Configuration<_, _>
/// ```
pub mod config;

/// Provides `Record`, which records the tokenizer's
/// data, allowing for more complex tokenizations.
pub mod record;

/// Provides the essentials for interacting with and manipulating
/// tokens after tokenization.
pub mod tokens;

// Internal character-stream iterator used by `Tokenizer`; it has to be
// `pub` because `TokenConfiguration::Rule` closures receive it, but it is
// hidden from the rendered docs as it is not a supported API surface.
#[doc(hidden)]
pub mod iter;
22
23/// Gives you all the basic utilities
24/// without scavenging for them.
25pub mod prelude {
26    pub use crate::config::*;
27    pub use crate::Tokenizer;
28}
29
30/// # Tokenizer
31/// Uses `Configuration` to tokenize contents.
32/// 
33/// # Example (Taken from README.md)
34/// ```rust
35/// use roketok::prelude::*;
36/// 
37/// #[derive(Default)]
38/// enum TokenKind {
39///     Identifier,
40///     Number,
41///     
42///     Asterisk,
43///     Ampersand,
44///     Semicolon,
45///     
46///     Equal,
47///     AddEqual,
48///     
49///     Parenthesis,
50///     
51///     #[default]
52///     Invalid,
53/// }
54/// 
55/// fn main() {
56///     let contents = r#"
57///         void foo(int *value) {
58///             *value += 35;
59///         }
60///         
61///         int main(void) {
62///             int value = 34;
63///             foo(&value);
64///             return value;
65///         }
66///     "#;
67///     
68///     let config = Configuration::new()
69///         .add_tokens([
70///             (TokenConfiguration::Rule(&|iter, _| {
71///                 if let Some(char) = iter.last() {
72///                     if !char.is_alphabetic() { return false; }
73///                     while let Some(char) = iter.peek() {
74///                         if !char.is_alphanumeric() { break; }
75///                         let _ = iter.next();
76///                     }
77///                     return true;
78///                 }
79///                 false
80///             }), TokenKind::Identifier),
81///             (TokenConfiguration::Rule(&|iter, _| {
82///                 if let Some(char) = iter.last() {
83///                     if !char.is_alphanumeric() { return false; }
84///                     while let Some(char) = iter.peek() {
85///                         if !char.is_alphanumeric() { break; }
86///                         let _ = iter.next();
87///                     }
88///                     return true;
89///                 }
90///                 false
91///             }), TokenKind::Number),
92///             
93///             (TokenConfiguration::Boring(&['*']), TokenKind::Asterisk),
94///             (TokenConfiguration::Boring(&['&']), TokenKind::Ampersand),
95///             
96///             (TokenConfiguration::Boring(&['=']), TokenKind::Equal),
97///             (TokenConfiguration::Boring(&['+', '=']), TokenKind::AddEqual),
98///             
99///             (TokenConfiguration::Boring(&[';']), TokenKind::Semicolon),
100///             
101///             (TokenConfiguration::Branch(&['('], &[')']), TokenKind::Parenthesis),
102///         ]);
103///     let mut tokenizer = Tokenizer::new(&config, contents);
104///     let tree = tokenizer.build();
105/// }
106/// ```
107pub struct Tokenizer<'items, K: Default + Clone> {
108    config: &'items Configuration<'items, K>,
109    iter: StreamIterator<'items>,
110}
111
112impl<'items, K: Default + Clone> Tokenizer<'items, K> {
113    /// Creates a new `Tokenizer` from a configuration and the
114    /// contents (the `String` you want to tokenize).
115    /// 
116    /// # Example
117    /// ```rust
118    /// use roketok::prelude::*;
119    /// 
120    /// #[derive(Default, Clone)]
121    /// enum TokenKind {
122    ///     #[default]
123    ///     Invalid,
124    /// }
125    /// 
126    /// let contents = "This gets tokenized. But configuration is empty, so in this case it doesn't.";
127    /// 
128    /// let config = Configuration::<'_, TokenKind>::new();
129    /// let tokenizer = Tokenizer::new(&config, &contents);
130    /// ```
131    /// 
132    /// See [`Tokenizer`] for more details.
133    pub fn new(config: &'items Configuration<'items, K>, contents: &'items str) -> Self {
134        Self {
135            config,
136            iter: StreamIterator::new(contents),
137        }
138    }
139    
140    #[must_use]
141    #[inline(always)]
142    fn matches(&mut self, chars: &[char]) -> bool {
143        let mut iter = self.iter;
144        let mut matches = false;
145        for (i, char) in (0..chars.len()).zip(iter.last()) {
146            if char == chars[i] {
147                matches = true;
148            } else {
149                matches = false;
150                break;
151            }
152            
153            if i + 1 != chars.len() {
154                let _ = iter.next();
155            }
156        }
157        
158        if matches {
159            self.iter = iter;
160        }
161        
162        matches
163    }
164    
165    #[doc(hidden)]
166    fn tokenize(&mut self) -> TreeNode<K> {
167        let start_iter_pos = self.iter.position() - 1;
168        for (config, kind) in self.config.0.iter() {
169            match config {
170                TokenConfiguration::Rule(rule) => {
171                    let record = self.iter.record().clone();
172                    if rule(&mut self.iter, &record) == true {
173                        return TreeNode::Leaf(Token {
174                            value: self.iter.grab(start_iter_pos..self.iter.position()),
175                            kind: kind.clone(),
176                            record,
177                        });
178                    }
179                },
180                TokenConfiguration::Boring(chars) => {
181                    let record = self.iter.record().clone();
182                    if self.matches(chars) {
183                        return TreeNode::Leaf(Token {
184                            value: self.iter.grab(start_iter_pos..self.iter.position()),
185                            kind: kind.clone(),
186                            record,
187                        });
188                    }
189                },
190                TokenConfiguration::Branch(start_chars, end_chars) => {
191                    let record = self.iter.record().clone();
192                    if self.matches(start_chars) {
193                        let start_token =  Token {
194                            value: self.iter.grab(start_iter_pos..self.iter.position()),
195                            kind: kind.clone(),
196                            record,
197                        };
198                        
199                        let mut stream = Vec::new();
200                        let mut end_token = None;
201                        while let Some(char) = self.iter.next() {
202                            if char.is_whitespace() { continue; }
203                            let token = self.tokenize();
204                            if let TreeNode::Leaf(token) = &token {
205                                if token.value == end_chars.iter().collect::<String>() {
206                                    end_token = Some(token.clone());
207                                    break;
208                                }
209                            }
210                            stream.push(token);
211                        }
212                        
213                        return TreeNode::Branch(Branch {
214                            value: (
215                                start_token.value,
216                                if let Some(end_token) = &end_token {
217                                    end_token.value.clone()
218                                } else {
219                                    "?".to_string()
220                                },
221                            ),
222                            kind: kind.clone(),
223                            stream,
224                            record,
225                            has_end: end_token.is_some(),
226                        });
227                    }
228                },
229            }
230        }
231        
232        TreeNode::Leaf(Token {
233            value: self.iter.last().unwrap().to_string(),
234            kind: K::default(),
235            record: self.iter.record().clone(),
236        })
237    }
238    
239    /// # Builds the Token Tree
240    /// Creates the token tree using the configuration
241    /// and contents you provided in new. See
242    /// [`Tokenizer::new`] for more details.
243    pub fn build(&mut self) -> Vec<TreeNode<K>> {
244        let mut stream = Vec::new();
245        
246        while let Some(char) = self.iter.next() {
247            if char.is_whitespace() { continue; }
248            stream.push(self.tokenize());
249        }
250        
251        stream
252    }
253}