// roketok/lib.rs

1use std::rc::Rc;
2
3use crate::{config::Configuration, iter::StreamIterator};
4
5/// Provides the configurations for tokenizers
6/// the most basic being:
7/// ```rust,ignore
8/// Configuration<_, _>
9/// ```
10pub mod config;
11
12#[doc(hidden)]
13mod iter;
14
/// Gives you all the basic utilities
/// without scavenging for them.
///
/// `use roketok::prelude::*;` pulls in the tokenizer, token types and
/// everything from [`config`] in one glob import.
pub mod prelude {
    pub use crate::config::*;
    pub use crate::*;
}
21
/// # Token
/// Represents a set of characters and their value
/// and position data. Also comes with a `kind`.
///
/// `value` is the matched text, `kind` its classification, and
/// `row`/`col` the 1-based position of the token's first character.
//
// Note: the previous `K: Default` bound on the struct itself was
// unnecessary — only the tokenizer impl needs `K::default()`, and
// per Rust API guidelines bounds belong on the impls that use them.
// Removing it is backward compatible and lets `Token` carry kinds
// that don't implement `Default`.
#[derive(Debug, Clone)]
pub struct Token<K> {
    /* Value Data */
    pub value: String,
    pub kind: K,
    
    /* Position Data */
    pub row: usize,
    pub col: usize,
}
35
/// # Stream Tokenizer
/// A very basic tokenizer, no token trees, nothing.
/// Just creates a stream of tokens based on a set of rules.
/// 
/// # Example
/// 
/// ```rust
/// use roketok::prelude::*;
/// 
/// #[derive(Debug, Clone, Default)]
/// pub enum TokenKind {
///     Number,
/// 
///     Add,
///     Sub,
///     Mul,
///     Div,
/// 
///     #[default]
///     Invalid,
/// }
/// 
/// fn main() {
///     let config = Configuration::<TokenKind>::new()
///         .add_rule(|c, _| c.is_numeric(), TokenKind::Number)
///         .add_tokens([
///             (&['+'], TokenKind::Add),
///             (&['-'], TokenKind::Sub),
///             (&['*'], TokenKind::Mul),
///             (&['/'], TokenKind::Div),
///         ]);
///     let contents = "32 * 64 / 324 * 6 - 232 + 6644 + 324 * 3256 - 2".to_string();
///     let mut tokenizer = StreamTokenizer::new(&config, &contents);
///     let stream = tokenizer.create_stream();
/// }
/// ```
pub struct StreamTokenizer<'ci, K: Default + Clone> {
    /* Configuration */
    // Rules and symbol table driving tokenization.
    // NOTE(review): `Rc<&T>` heap-allocates around an already
    // cheap-to-copy reference; a plain `&'ci Configuration<'ci, K>`
    // would likely suffice. Changing it also touches `new` and
    // `create_stream`, so it is flagged here rather than changed.
    config: Rc<&'ci Configuration<'ci, K>>,
    
    /* Content Iteration */
    // Character iterator over the input text.
    iter: StreamIterator<'ci>,
    // Current (row, col), both 1-based; row advances on '\n'.
    pos: (usize, usize),
}
80
81impl<'ci, K: Default + Clone> StreamTokenizer<'ci, K> {
82    /// Creates the `StreamTokenizer`, takes in basic config and
83    /// file contents, or whatever you want to tokenize.
84    pub fn new(config: &'ci Configuration<'ci, K>, contents: &'ci String) -> Self {
85        Self {
86            config: Rc::new(config),
87            iter: StreamIterator::new(contents),
88            pos: (1, 1),
89        }
90    }
91    
92    #[doc(hidden)]
93    fn next(&mut self) -> Option<char> {
94        if let Some(next) = self.iter.next() {
95            if next == '\n' {
96                self.pos.0 += 1;
97                self.pos.1 = 1;
98            } else {
99                self.pos.1 += 1;
100            }
101            
102            return Some(next);
103        }
104        
105        None
106    }
107    
108    #[doc(hidden)]
109    #[must_use]
110    #[inline(always)]
111    fn tokenize_symbols(&mut self,
112        mut start_pos: (usize, usize),
113        symbols: String
114    ) -> Vec<Token<K>> {
115        let mut stack = Vec::new();
116        
117        let mut slice = &symbols[..];
118        loop {
119            let matching = self.config.tokens.iter()
120                .filter(|e| {
121                    for (c1, c2) in e.0.iter().zip(slice.chars()) {
122                        if *c1 != c2 { return false; }
123                    }
124                    
125                    true
126                })
127                .collect::<Vec<_>>();
128            if matching.len() == 0 {
129                stack.push(Token {
130                    value: slice.to_string(),
131                    kind: K::default(),
132                    row: start_pos.0,
133                    col: start_pos.1,
134                });
135                break;
136            }
137            
138            let mut best_match = matching[0];
139            for entry in matching {
140                if best_match.0.len() < entry.0.len() {
141                    best_match = entry;
142                }
143            }
144            
145            stack.push(Token {
146                value: best_match.0.iter().collect::<String>(),
147                kind: best_match.1.clone(),
148                row: start_pos.0,
149                col: start_pos.1,
150            });
151            
152            let best_match_len = best_match.0.len();
153            if best_match_len >= slice.len() { break; }
154            
155            slice = &slice[best_match_len..];
156            start_pos.1 += best_match_len;
157        }
158        
159        stack
160    }
161    
162    /// # Create Stream
163    /// This function, believe it or not creates the token stream.
164    /// There are examples already showing how this works, so please refer
165    /// to them.
166    pub fn create_stream(&mut self) -> Box<[Token<K>]> {
167        let mut stream = Vec::new();
168        let config = self.config.clone();
169        
170        let mut start_iter_pos;
171        let mut start_pos;
172        'update: loop {
173            start_iter_pos = self.iter.position();
174            start_pos = self.pos;
175            while let Some(current) = self.next() {
176                if current.is_whitespace() {
177                    continue 'update;
178                }
179                
180                if let Some((rule, kind)) = config.rules.iter()
181                    .find(|e| e.0(&current, 0))
182                {
183                    let mut current_index = 1;
184                    while let Some(current) = self.iter.peek() {
185                        if !rule(&current, current_index) { break; }
186                        self.next();
187                        current_index += 1;
188                    }
189                    
190                    let end_iter_pos = self.iter.position();
191                    let value = self.iter.grab(start_iter_pos..end_iter_pos);
192                    stream.push(Token {
193                        /* ValueData */
194                        value,
195                        kind: kind.clone(),
196                        
197                        /* Position Data */
198                        row: start_pos.0,
199                        col: start_pos.1
200                    });
201                    
202                    continue 'update;
203                }
204                
205                while let Some(current) = self.iter.peek() {
206                    if current.is_whitespace()
207                        || config.rules.iter().find(|e| e.0(&current, 0)).is_some()
208                    {
209                        break;
210                    }
211                    
212                    self.next();
213                }
214                
215                let symbols = self.iter.grab(start_iter_pos..self.iter.position());
216                stream.extend(self.tokenize_symbols(start_pos, symbols));
217                continue 'update;
218            }
219            
220            break;
221        }
222        
223        stream.into()
224    }
225}