// roketok/lib.rs
1use std::rc::Rc;
2
3use crate::{config::Configuration, iter::StreamIterator};
4
5/// Provides the configurations for tokenizers
6/// the most basic being:
7/// ```rust,ignore
8/// Configuration<_, _>
9/// ```
10pub mod config;
11
12#[doc(hidden)]
13mod iter;
14
15/// Gives you all the basic utilities
16/// without scavenging for them.
17pub mod prelude {
18 pub use crate::config::*;
19 pub use crate::*;
20}
21
/// # Token
/// Represents a set of characters and their value
/// and position data. Also comes with a `kind`.
///
/// `row` and `col` are 1-based, as produced by `StreamTokenizer`
/// (which starts counting at `(1, 1)`).
//
// Note: no `K: Default` bound here — trait bounds belong on the impls
// that need them (the tokenizer impls still require `Default + Clone`),
// so `Token` itself stays usable with any kind type.
#[derive(Debug, Clone, PartialEq)]
pub struct Token<K> {
    /* Value Data */
    /// The exact source text this token covers.
    pub value: String,
    /// Caller-defined token category (e.g. an enum of token kinds).
    pub kind: K,

    /* Position Data */
    /// 1-based row (line) of the token's first character.
    pub row: usize,
    /// 1-based column of the token's first character.
    pub col: usize,
}
35
/// # Stream Tokenizer
/// A very basic tokenizer, no token trees, nothing.
/// Just creates a stream of tokens based on a set of rules:
/// per-character classification rules (`add_rule`) and fixed
/// symbol sequences (`add_tokens`), both supplied via a
/// [`config::Configuration`].
///
/// # Example
///
/// ```rust
/// use roketok::prelude::*;
///
/// #[derive(Debug, Clone, Default)]
/// pub enum TokenKind {
///     Number,
///
///     Add,
///     Sub,
///     Mul,
///     Div,
///
///     #[default]
///     Invalid,
/// }
///
/// fn main() {
///     let config = Configuration::<TokenKind>::new()
///         .add_rule(|c, _| c.is_numeric(), TokenKind::Number)
///         .add_tokens([
///             (&['+'], TokenKind::Add),
///             (&['-'], TokenKind::Sub),
///             (&['*'], TokenKind::Mul),
///             (&['/'], TokenKind::Div),
///         ]);
///     let contents = "32 * 64 / 324 * 6 - 232 + 6644 + 324 * 3256 - 2".to_string();
///     let mut tokenizer = StreamTokenizer::new(&config, &contents);
///     let stream = tokenizer.create_stream();
/// }
/// ```
pub struct StreamTokenizer<'ci, K: Default + Clone> {
    /* Configuration */
    // NOTE(review): `Rc<&Configuration>` heap-allocates a refcount around a
    // reference that is already `Copy`; a plain `&'ci Configuration<'ci, K>`
    // would do the same job — confirm nothing relies on shared ownership
    // before simplifying.
    config: Rc<&'ci Configuration<'ci, K>>,

    /* Content Iteration */
    // Character cursor over the input text.
    iter: StreamIterator<'ci>,
    // Current (row, col) of the cursor, both 1-based (initialized to (1, 1)
    // in `new` and advanced by `next`).
    pos: (usize, usize),
}
80
81impl<'ci, K: Default + Clone> StreamTokenizer<'ci, K> {
82 /// Creates the `StreamTokenizer`, takes in basic config and
83 /// file contents, or whatever you want to tokenize.
84 pub fn new(config: &'ci Configuration<'ci, K>, contents: &'ci String) -> Self {
85 Self {
86 config: Rc::new(config),
87 iter: StreamIterator::new(contents),
88 pos: (1, 1),
89 }
90 }
91
92 #[doc(hidden)]
93 fn next(&mut self) -> Option<char> {
94 if let Some(next) = self.iter.next() {
95 if next == '\n' {
96 self.pos.0 += 1;
97 self.pos.1 = 1;
98 } else {
99 self.pos.1 += 1;
100 }
101
102 return Some(next);
103 }
104
105 None
106 }
107
108 #[doc(hidden)]
109 #[must_use]
110 #[inline(always)]
111 fn tokenize_symbols(&mut self,
112 mut start_pos: (usize, usize),
113 symbols: String
114 ) -> Vec<Token<K>> {
115 let mut stack = Vec::new();
116
117 let mut slice = &symbols[..];
118 loop {
119 let matching = self.config.tokens.iter()
120 .filter(|e| {
121 for (c1, c2) in e.0.iter().zip(slice.chars()) {
122 if *c1 != c2 { return false; }
123 }
124
125 true
126 })
127 .collect::<Vec<_>>();
128 if matching.len() == 0 {
129 stack.push(Token {
130 value: slice.to_string(),
131 kind: K::default(),
132 row: start_pos.0,
133 col: start_pos.1,
134 });
135 break;
136 }
137
138 let mut best_match = matching[0];
139 for entry in matching {
140 if best_match.0.len() < entry.0.len() {
141 best_match = entry;
142 }
143 }
144
145 stack.push(Token {
146 value: best_match.0.iter().collect::<String>(),
147 kind: best_match.1.clone(),
148 row: start_pos.0,
149 col: start_pos.1,
150 });
151
152 let best_match_len = best_match.0.len();
153 if best_match_len >= slice.len() { break; }
154
155 slice = &slice[best_match_len..];
156 start_pos.1 += best_match_len;
157 }
158
159 stack
160 }
161
162 /// # Create Stream
163 /// This function, believe it or not creates the token stream.
164 /// There are examples already showing how this works, so please refer
165 /// to them.
166 pub fn create_stream(&mut self) -> Box<[Token<K>]> {
167 let mut stream = Vec::new();
168 let config = self.config.clone();
169
170 let mut start_iter_pos;
171 let mut start_pos;
172 'update: loop {
173 start_iter_pos = self.iter.position();
174 start_pos = self.pos;
175 while let Some(current) = self.next() {
176 if current.is_whitespace() {
177 continue 'update;
178 }
179
180 if let Some((rule, kind)) = config.rules.iter()
181 .find(|e| e.0(¤t, 0))
182 {
183 let mut current_index = 1;
184 while let Some(current) = self.iter.peek() {
185 if !rule(¤t, current_index) { break; }
186 self.next();
187 current_index += 1;
188 }
189
190 let end_iter_pos = self.iter.position();
191 let value = self.iter.grab(start_iter_pos..end_iter_pos);
192 stream.push(Token {
193 /* ValueData */
194 value,
195 kind: kind.clone(),
196
197 /* Position Data */
198 row: start_pos.0,
199 col: start_pos.1
200 });
201
202 continue 'update;
203 }
204
205 while let Some(current) = self.iter.peek() {
206 if current.is_whitespace()
207 || config.rules.iter().find(|e| e.0(¤t, 0)).is_some()
208 {
209 break;
210 }
211
212 self.next();
213 }
214
215 let symbols = self.iter.grab(start_iter_pos..self.iter.position());
216 stream.extend(self.tokenize_symbols(start_pos, symbols));
217 continue 'update;
218 }
219
220 break;
221 }
222
223 stream.into()
224 }
225}