// rust_forth_tokenizer/lib.rs

//! This is how to use the Forth tokenizer library
//! ```
//! use rust_forth_tokenizer::ForthToken;
//! use rust_forth_tokenizer::ForthTokenizer;
//!
//! let tokenizer = ForthTokenizer::new("word : wordname 1 2 3 ; definition");
//! // The code also supports the regular for loop iterator syntax
//! let collected: Vec<_> = tokenizer.into_iter().collect();
//! assert_eq!(
//!     &collected,
//!     &vec![
//!         ForthToken::Command("word"),
//!         ForthToken::Colon,
//!         ForthToken::Command("wordname"),
//!         ForthToken::Number(1),
//!         ForthToken::Number(2),
//!         ForthToken::Number(3),
//!         ForthToken::SemiColon,
//!         ForthToken::Command("definition"),
//!     ]
//! );
//! ```
23
/// This Enum lists the token types that are used by the Forth interpreter
#[derive(Debug, PartialEq)]
pub enum ForthToken<'a> {
    /// An integer literal that parsed as `i64`, e.g. `42`
    Number(i64),
    /// A plain Forth word (anything that is not a number or punctuation)
    Command(&'a str),
    /// (command, string): a word ending in `"` plus the text up to the next
    /// `"`, e.g. `.s" hello"` tokenizes as (`.s"`, `hello`)
    StringCommand(&'a str, &'a str),
    /// The `:` that starts a word definition
    Colon,
    /// The `;` that ends a word definition
    SemiColon,
    /// A `\` comment running to the end of the line; the text includes the
    /// leading `\` but not the line break
    DropLineComment(&'a str),
    /// A `( ... )` remark; the text includes the leading `(` but not the
    /// closing `)` (the `)` acts as the delimiter and is dropped)
    ParenthesizedRemark(&'a str),
}
36
/// This is the ForthTokenizer, it is the actual tokenizer
///
/// It borrows the source text and hands out `ForthToken`s whose string
/// payloads are slices of that same text (zero-copy).
pub struct ForthTokenizer<'a> {
    // The remaining, not-yet-tokenized portion of the input
    to_tokenize: &'a str,
}
41
42impl<'a> ForthTokenizer<'a> {
43    pub fn new(to_tokenize: &'a str) -> ForthTokenizer<'a> {
44        ForthTokenizer { to_tokenize }
45    }
46}
47
48impl<'a> IntoIterator for ForthTokenizer<'a> {
49    type Item = ForthToken<'a>;
50    type IntoIter = ForthTokenizerIntoIterator<'a>;
51
52    fn into_iter(self) -> Self::IntoIter {
53        ForthTokenizerIntoIterator {
54            to_tokenize: self.to_tokenize,
55        }
56    }
57}
58
/// Iterator state for tokenizing; produced by `into_iter()` on a
/// `ForthTokenizer` (either by value or by reference).
pub struct ForthTokenizerIntoIterator<'a> {
    // The remaining, not-yet-tokenized portion of the input
    to_tokenize: &'a str,
}
62
63// The `Iterator` trait only requires a method to be defined for the `next` element.
64impl<'a> Iterator for ForthTokenizerIntoIterator<'a> {
65    type Item = ForthToken<'a>;
66
67    // The return type is `Option<T>`:
68    //     * When the `Iterator` is finished, `None` is returned.
69    //     * Otherwise, the next value is wrapped in `Some` and returned.
70    fn next(&mut self) -> Option<ForthToken<'a>> {
71        // We ignore whitespace
72        self.to_tokenize = self.to_tokenize.trim_start();
73
74        if let Some(c) = self.to_tokenize.chars().next() {
75            match c {
76                '\\' => {
77                    let (first, rest) = split_at_newline(self.to_tokenize);
78                    self.to_tokenize = rest;
79                    Some(ForthToken::DropLineComment(first))
80                }
81                ':' => {
82                    self.to_tokenize = &self.to_tokenize[1..];
83                    Some(ForthToken::Colon)
84                }
85                ';' => {
86                    self.to_tokenize = &self.to_tokenize[1..];
87                    Some(ForthToken::SemiColon)
88                }
89                '(' => {
90                    let (first, rest) = split_at_token(self.to_tokenize, ')');
91                    self.to_tokenize = rest;
92                    Some(ForthToken::ParenthesizedRemark(first))
93                }
94                _ => {
95                    let (start, rest) = split_at_ascii_whitespace(self.to_tokenize);
96                    self.to_tokenize = rest;
97
98                    if start.ends_with('"') {
99                        let (newstart, newrest) = split_at_token(rest, '"');
100                        self.to_tokenize = newrest;
101
102                        return Some(ForthToken::StringCommand(&start, newstart));
103                    }
104                    // Determine if its a number or a command
105                    match start.parse::<i64>() {
106                        // We found a number, then return it as a number token
107                        Ok(n) => Some(ForthToken::Number(n)),
108                        // Wasn't a number, treat it as a *word*
109                        Err(_) => Some(ForthToken::Command(start)),
110                    }
111                }
112            }
113        } else {
114            None
115        }
116    }
117}
118
119impl<'a> IntoIterator for &'a ForthTokenizer<'a> {
120    type Item = ForthToken<'a>;
121    type IntoIter = ForthTokenizerIntoIterator<'a>;
122
123    fn into_iter(self) -> Self::IntoIter {
124        ForthTokenizerIntoIterator {
125            to_tokenize: self.to_tokenize,
126        }
127    }
128}
129
/// Splits `to_split` around the first line break (`\n` or `\r`), returning
/// `(line, rest)` with the delimiter removed; `(to_split, "")` when there is
/// no line break. A `\n` immediately after the delimiter is also skipped, so
/// `\r\n` counts as a single line break.
fn split_at_newline(to_split: &str) -> (&str, &str) {
    let mut line_iterator = to_split.splitn(2, &['\n', '\r'][..]);
    let first = line_iterator.next().unwrap_or("");
    match line_iterator.next() {
        // `strip_prefix` copes with an empty `rest` (input ending in a
        // newline), which previously panicked on `chars().next().unwrap()`.
        // NOTE(review): this also swallows one blank line after a bare `\n`
        // break — kept for compatibility with the original behavior.
        Some(rest) => (first, rest.strip_prefix('\n').unwrap_or(rest)),
        None => (first, ""),
    }
}
145
/// Splits `to_split` around the first ASCII whitespace character, returning
/// `(word, rest)` with the delimiter removed; `(to_split, "")` when there is
/// none. A `\n` immediately after the delimiter is also skipped, so a word
/// terminated by `\r\n` consumes the whole line break.
fn split_at_ascii_whitespace(to_split: &str) -> (&str, &str) {
    let mut line_iterator = to_split.splitn(2, |c: char| c.is_ascii_whitespace());
    let first = line_iterator.next().unwrap_or("");
    match line_iterator.next() {
        // `strip_prefix` copes with an empty `rest` (input ending in
        // whitespace), which previously panicked on `chars().next().unwrap()`.
        Some(rest) => (first, rest.strip_prefix('\n').unwrap_or(rest)),
        None => (first, ""),
    }
}
161
/// Splits `to_split` around the first occurrence of `token`, returning
/// `(before, after)` with the token removed; `(to_split, "")` when the token
/// does not occur. A `\n` immediately after the token is also skipped
/// (mirrors the newline handling of the other split helpers).
fn split_at_token(to_split: &str, token: char) -> (&str, &str) {
    let mut line_iterator = to_split.splitn(2, token);
    let first = line_iterator.next().unwrap_or("");
    match line_iterator.next() {
        // `strip_prefix` copes with an empty `rest` (input ending at the
        // token), which previously panicked on `chars().next().unwrap()`.
        Some(rest) => (first, rest.strip_prefix('\n').unwrap_or(rest)),
        None => (first, ""),
    }
}
177
// Unit tests: the split helpers first, then end-to-end tokenizer runs that
// compare a collected token stream against the expected token sequence.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_split_at_newline_1() {
        assert_eq!(split_at_newline(""), ("", ""));
    }

    #[test]
    fn test_split_at_newline_2() {
        // No newline at all: everything is the first part.
        assert_eq!(split_at_newline("abc"), ("abc", ""));
    }

    #[test]
    fn test_split_at_newline_3() {
        // A CRLF pair is consumed as a single line break.
        assert_eq!(split_at_newline("abc\r\ndef"), ("abc", "def"));
    }

    #[test]
    fn test_split_at_newline_4() {
        assert_eq!(split_at_newline("abc\ndef"), ("abc", "def"));
        assert_eq!(split_at_newline(""), ("", ""));
    }
    #[test]
    fn test_split_at_newline_5() {
        // Only the first CRLF is split off; later ones stay in the rest.
        assert_eq!(
            split_at_newline("abc\r\ndef\r\nghi\r\njkl"),
            ("abc", "def\r\nghi\r\njkl")
        );
    }

    #[test]
    fn test_split_at_newline_6() {
        assert_eq!(
            split_at_newline("abc\ndef\nghi\njkl"),
            ("abc", "def\nghi\njkl")
        );
        assert_eq!(split_at_newline(""), ("", ""));
    }

    #[test]
    fn test_number_1() {
        let tokenizer = ForthTokenizer::new("1 these 2 are 3 words 4");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Number(1),
                ForthToken::Command("these"),
                ForthToken::Number(2),
                ForthToken::Command("are"),
                ForthToken::Number(3),
                ForthToken::Command("words"),
                ForthToken::Number(4),
            ]
        );
    }

    #[test]
    fn test_command_1() {
        // Words that don't parse as i64 (e.g. "#words") become Commands.
        let tokenizer = ForthTokenizer::new("these are #words 1 with 2 numbers");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Command("these"),
                ForthToken::Command("are"),
                ForthToken::Command("#words"),
                ForthToken::Number(1),
                ForthToken::Command("with"),
                ForthToken::Number(2),
                ForthToken::Command("numbers"),
            ]
        );
    }

    #[test]
    fn test_colon_1() {
        let tokenizer = ForthTokenizer::new("word : wordname 1 2 3 ; definition");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Command("word"),
                ForthToken::Colon,
                ForthToken::Command("wordname"),
                ForthToken::Number(1),
                ForthToken::Number(2),
                ForthToken::Number(3),
                ForthToken::SemiColon,
                ForthToken::Command("definition"),
            ]
        );
    }

    #[test]
    fn test_semicolon_1() {
        let tokenizer = ForthTokenizer::new("word : wordname 1 $whatever 3 ; definition");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Command("word"),
                ForthToken::Colon,
                ForthToken::Command("wordname"),
                ForthToken::Number(1),
                ForthToken::Command("$whatever"),
                ForthToken::Number(3),
                ForthToken::SemiColon,
                ForthToken::Command("definition"),
            ]
        );
    }

    #[test]
    fn test_stringcommand_1() {
        // A bare `"` word: the command part is just the quote character.
        let tokenizer = ForthTokenizer::new("1 2 \" This is a string\" 3 4");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Number(1),
                ForthToken::Number(2),
                ForthToken::StringCommand("\"", "This is a string"),
                ForthToken::Number(3),
                ForthToken::Number(4),
            ]
        );
    }

    #[test]
    fn test_stringcommand_2() {
        // A word ending in `"` (here `.s"`) carries the string that follows.
        let tokenizer = ForthTokenizer::new("1 2 .s\" This is a string\" 3 4");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Number(1),
                ForthToken::Number(2),
                ForthToken::StringCommand(".s\"", "This is a string"),
                ForthToken::Number(3),
                ForthToken::Number(4),
            ]
        );
    }

    #[test]
    fn test_droplinecomment_1() {
        // Forgot the space after the 2, this will come out totally differently than a comment
        let tokenizer = ForthTokenizer::new("1 2\\ This is a dropline comment\n\r1 3\r\n4");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Number(1),
                ForthToken::Command("2\\"),
                ForthToken::Command("This"),
                ForthToken::Command("is"),
                ForthToken::Command("a"),
                ForthToken::Command("dropline"),
                ForthToken::Command("comment"),
                ForthToken::Number(1),
                ForthToken::Number(3),
                ForthToken::Number(4),
            ]
        );
    }

    #[test]
    fn test_droplinecomment_2() {
        // With the space present, `\` really starts a comment; the token
        // keeps the `\` and the comment text but not the line break.
        let tokenizer = ForthTokenizer::new("1 2 \\ This is a dropline comment\n\r1 3\r\n4");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Number(1),
                ForthToken::Number(2),
                ForthToken::DropLineComment("\\ This is a dropline comment"),
                ForthToken::Number(1),
                ForthToken::Number(3),
                ForthToken::Number(4),
            ]
        );
    }

    #[test]
    fn test_parenthesized_remark_1() {
        // This isn't maybe intuitive, but we lose the trailing ) because its a delimiter... No easy way to change that that I know of
        let tokenizer = ForthTokenizer::new(
            "1 2 \\ This is a dropline comment ( This is not a parenthesized remark )\n\r1 ( This is in fact a parenthesized remark )3\r\n4",
        );
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Number(1),
                ForthToken::Number(2),
                ForthToken::DropLineComment(
                    "\\ This is a dropline comment ( This is not a parenthesized remark )"
                ),
                ForthToken::Number(1),
                ForthToken::ParenthesizedRemark("( This is in fact a parenthesized remark "),
                ForthToken::Number(3),
                ForthToken::Number(4),
            ]
        );
    }

    #[test]
    fn test_bug_1() {
        // Regression: plain newlines between tokens must be skipped cleanly.
        let tokenizer = ForthTokenizer::new("1 1 1\n2 2 2\n3 3 3");
        let collected: Vec<_> = tokenizer.into_iter().collect();
        assert_eq!(
            &collected,
            &vec![
                ForthToken::Number(1),
                ForthToken::Number(1),
                ForthToken::Number(1),
                ForthToken::Number(2),
                ForthToken::Number(2),
                ForthToken::Number(2),
                ForthToken::Number(3),
                ForthToken::Number(3),
                ForthToken::Number(3)
            ]
        );
    }
}