//! roketok/lib.rs — crate root.
1use crate::{
2 config::{Configuration, TokenConfiguration}, iter::StreamIterator, tokens::{Branch, Token, TreeNode}
3};
4
5/// Provides the configurations for tokenizers
6/// the most basic being:
7/// ```rust,ignore
8/// Configuration<_, _>
9/// ```
10pub mod config;
11
12/// Provides `Record` which records the tokenizers
13/// data, allowing for more complex tokenizations.
14pub mod record;
15
16/// Provides the essentials for interacting and manipulating
17/// tokens after tokenization.
18pub mod tokens;
19
20#[doc(hidden)]
21pub mod iter;
22
23/// Gives you all the basic utilities
24/// without scavenging for them.
25pub mod prelude {
26 pub use crate::config::*;
27 pub use crate::Tokenizer;
28}
29
30/// # Tokenizer
31/// Uses `Configuration` to tokenize contents.
32///
33/// # Example (Taken from README.md)
34/// ```rust
35/// use roketok::prelude::*;
36///
37/// #[derive(Default)]
38/// enum TokenKind {
39/// Identifier,
40/// Number,
41///
42/// Asterisk,
43/// Ampersand,
44/// Semicolon,
45///
46/// Equal,
47/// AddEqual,
48///
49/// Parenthesis,
50///
51/// #[default]
52/// Invalid,
53/// }
54///
55/// fn main() {
56/// let contents = r#"
57/// void foo(int *value) {
58/// *value += 35;
59/// }
60///
61/// int main(void) {
62/// int value = 34;
63/// foo(&value);
64/// return value;
65/// }
66/// "#;
67///
68/// let config = Configuration::new()
69/// .add_tokens([
70/// (TokenConfiguration::Rule(&|iter, _| {
71/// if let Some(char) = iter.last() {
72/// if !char.is_alphabetic() { return false; }
73/// while let Some(char) = iter.peek() {
74/// if !char.is_alphanumeric() { break; }
75/// let _ = iter.next();
76/// }
77/// return true;
78/// }
79/// false
80/// }), TokenKind::Identifier),
81/// (TokenConfiguration::Rule(&|iter, _| {
82/// if let Some(char) = iter.last() {
83/// if !char.is_alphanumeric() { return false; }
84/// while let Some(char) = iter.peek() {
85/// if !char.is_alphanumeric() { break; }
86/// let _ = iter.next();
87/// }
88/// return true;
89/// }
90/// false
91/// }), TokenKind::Number),
92///
93/// (TokenConfiguration::Boring(&['*']), TokenKind::Asterisk),
94/// (TokenConfiguration::Boring(&['&']), TokenKind::Ampersand),
95///
96/// (TokenConfiguration::Boring(&['=']), TokenKind::Equal),
97/// (TokenConfiguration::Boring(&['+', '=']), TokenKind::AddEqual),
98///
99/// (TokenConfiguration::Boring(&[';']), TokenKind::Semicolon),
100///
101/// (TokenConfiguration::Branch(&['('], &[')']), TokenKind::Parenthesis),
102/// ]);
103/// let mut tokenizer = Tokenizer::new(&config, contents);
104/// let tree = tokenizer.build();
105/// }
106/// ```
107pub struct Tokenizer<'items, K: Default + Clone> {
108 config: &'items Configuration<'items, K>,
109 iter: StreamIterator<'items>,
110}
111
112impl<'items, K: Default + Clone> Tokenizer<'items, K> {
113 /// Creates a new `Tokenizer` from a configuration and the
114 /// contents (the `String` you want to tokenize).
115 ///
116 /// # Example
117 /// ```rust
118 /// use roketok::prelude::*;
119 ///
120 /// #[derive(Default, Clone)]
121 /// enum TokenKind {
122 /// #[default]
123 /// Invalid,
124 /// }
125 ///
126 /// let contents = "This gets tokenized. But configuration is empty, so in this case it doesn't.";
127 ///
128 /// let config = Configuration::<'_, TokenKind>::new();
129 /// let tokenizer = Tokenizer::new(&config, &contents);
130 /// ```
131 ///
132 /// See [`Tokenizer`] for more details.
133 pub fn new(config: &'items Configuration<'items, K>, contents: &'items str) -> Self {
134 Self {
135 config,
136 iter: StreamIterator::new(contents),
137 }
138 }
139
140 #[must_use]
141 #[inline(always)]
142 fn matches(&mut self, chars: &[char]) -> bool {
143 let mut iter = self.iter;
144 let mut matches = false;
145 for (i, char) in (0..chars.len()).zip(iter.last()) {
146 if char == chars[i] {
147 matches = true;
148 } else {
149 matches = false;
150 break;
151 }
152
153 if i + 1 != chars.len() {
154 let _ = iter.next();
155 }
156 }
157
158 if matches {
159 self.iter = iter;
160 }
161
162 matches
163 }
164
165 #[doc(hidden)]
166 fn tokenize(&mut self) -> TreeNode<K> {
167 let start_iter_pos = self.iter.position() - 1;
168 for (config, kind) in self.config.0.iter() {
169 match config {
170 TokenConfiguration::Rule(rule) => {
171 let record = self.iter.record().clone();
172 if rule(&mut self.iter, &record) == true {
173 return TreeNode::Leaf(Token {
174 value: self.iter.grab(start_iter_pos..self.iter.position()),
175 kind: kind.clone(),
176 record,
177 });
178 }
179 },
180 TokenConfiguration::Boring(chars) => {
181 let record = self.iter.record().clone();
182 if self.matches(chars) {
183 return TreeNode::Leaf(Token {
184 value: self.iter.grab(start_iter_pos..self.iter.position()),
185 kind: kind.clone(),
186 record,
187 });
188 }
189 },
190 TokenConfiguration::Branch(start_chars, end_chars) => {
191 let record = self.iter.record().clone();
192 if self.matches(start_chars) {
193 let start_token = Token {
194 value: self.iter.grab(start_iter_pos..self.iter.position()),
195 kind: kind.clone(),
196 record,
197 };
198
199 let mut stream = Vec::new();
200 let mut end_token = None;
201 while let Some(char) = self.iter.next() {
202 if char.is_whitespace() { continue; }
203 let token = self.tokenize();
204 if let TreeNode::Leaf(token) = &token {
205 if token.value == end_chars.iter().collect::<String>() {
206 end_token = Some(token.clone());
207 break;
208 }
209 }
210 stream.push(token);
211 }
212
213 return TreeNode::Branch(Branch {
214 value: (
215 start_token.value,
216 if let Some(end_token) = &end_token {
217 end_token.value.clone()
218 } else {
219 "?".to_string()
220 },
221 ),
222 kind: kind.clone(),
223 stream,
224 record,
225 has_end: end_token.is_some(),
226 });
227 }
228 },
229 }
230 }
231
232 TreeNode::Leaf(Token {
233 value: self.iter.last().unwrap().to_string(),
234 kind: K::default(),
235 record: self.iter.record().clone(),
236 })
237 }
238
239 /// # Builds the Token Tree
240 /// Creates the token tree using the configuration
241 /// and contents you provided in new. See
242 /// [`Tokenizer::new`] for more details.
243 pub fn build(&mut self) -> Vec<TreeNode<K>> {
244 let mut stream = Vec::new();
245
246 while let Some(char) = self.iter.next() {
247 if char.is_whitespace() { continue; }
248 stream.push(self.tokenize());
249 }
250
251 stream
252 }
253}