// prolog2/parser/tokeniser.rs

//! Lexer: converts Prolog source text into a token stream.

3const DELIMINATORS: &[char] = &[
4    '(', ')', ',', '.', ' ', '\r', '\n', '\t', '\\', ':', '-', '+', '/', '*', '=', '[', ']', '|',
5    '>', '<', '{', '}',
6];
7const KNOWN_SYMBOLS: &[&str] = &[":-", "==", "=/=", "/=", "=:=", "**", "<=", ">=", "/*", "*/"];


// --------------------------------------------------------------------------------------
// Tokenise File
// --------------------------------------------------------------------------------------

14fn form_empty_list_token(tokens: &mut Vec<String>) {
15    let mut i = 0;
16    while tokens.len() > i + 1 {
17        if tokens[i] == "[" {
18            let mut j = i + 1;
19            let mut new_lines = 0;
20            loop {
21                match tokens[j].as_str() {
22                    " " | "\t" => j += 1,
23                    "\n" => {
24                        new_lines += 1;
25                        j += 1;
26                    }
27                    "]" => {
28                        tokens.drain(i..j + 1);
29                        tokens.insert(i, "[]".into());
30                        for _ in 0..new_lines {
31                            tokens.insert(i + 1, "\n".into());
32                        }
33                        break;
34                    }
35                    _ => break,
36                }
37            }
38        }
39        i += 1;
40    }
41}
42
43fn form_empty_set_token(tokens: &mut Vec<String>) {
44    let mut i = 0;
45    while tokens.len() > i + 1 {
46        if tokens[i] == "{" {
47            let mut j = i + 1;
48            let mut new_lines = 0;
49            loop {
50                match tokens[j].as_str() {
51                    " " | "\t" => j += 1,
52                    "\n" => {
53                        new_lines += 1;
54                        j += 1;
55                    }
56                    "}" => {
57                        tokens.drain(i..j + 1);
58                        tokens.insert(i, "{}".into());
59                        for _ in 0..new_lines {
60                            tokens.insert(i + 1, "\n".into());
61                        }
62                        break;
63                    }
64                    _ => break,
65                }
66            }
67        }
68        i += 1;
69    }
70}
71
72fn form_empty_tuple_token(tokens: &mut Vec<String>) {
73    let mut i = 0;
74    while tokens.len() > i + 1 {
75        if tokens[i] == "(" {
76            let mut j = i + 1;
77            let mut new_lines = 0;
78            loop {
79                match tokens[j].as_str() {
80                    " " | "\t" => j += 1,
81                    "\n" => {
82                        new_lines += 1;
83                        j += 1;
84                    }
85                    ")" => {
86                        tokens.drain(i..j + 1);
87                        tokens.insert(i, "()".into());
88                        for _ in 0..new_lines {
89                            tokens.insert(i + 1, "\n".into());
90                        }
91                        break;
92                    }
93                    _ => break,
94                }
95            }
96        }
97        i += 1;
98    }
99}
101fn walk_string(
102    characters: &Vec<char>,
103    mut i: usize,
104    mark: char,
105) -> Result<(String, usize), String> {
106    // TODO: more complex escape character processing
107    let mut str = vec![mark];
108    while let Some(&c) = characters.get(i) {
109        match c {
110            c if c == mark => {
111                str.push(c);
112                return Ok((str.iter().collect(), i + 1));
113            }
114            '\\' => {
115                match characters.get(i + 1) {
116                    Some('n') => str.push('\n'),
117                    Some('t') => str.push('\t'),
118                    Some('\\') => str.push('\\'),
119                    Some('"') => str.push('"'),
120                    Some('\'') => str.push('\''),
121                    Some(_) => return Err("'\\' used without proper escape character".into()),
122                    None => break,
123                }
124                i += 2
125            }
126            _ => {
127                str.push(c);
128                i += 1;
129            }
130        }
131    }
132    Err(format!("Unexpected end of file, missing closing {mark}"))
133}
135fn walk_multi_line_comment(characters: &Vec<char>, mut i: usize) -> Result<(usize, usize), String> {
136    let mut newlines = 0;
137    while i <= characters.len() - 2 {
138        if characters[i] == '\n' {
139            newlines += 1
140        }
141        if &characters[i..i + 2] == ['*', '/'] {
142            return Ok((i + 2, newlines));
143        }
144        i += 1;
145    }
146    Err("Unclosed multi line comment".into())
147}
149fn walk_single_line_comment(characters: &Vec<char>, mut i: usize) -> Result<usize, ()> {
150    while let Some(&c) = characters.get(i) {
151        if c == '\n' {
152            return Ok(i + 1);
153        }
154        i += 1;
155    }
156    Err(())
157}
159/* If two numerical tokens are split by a '.' char unify into one token
160*/
161fn join_decimal_nums(tokens: &mut Vec<String>) {
162    let mut i = 0;
163    while tokens.len() > i + 2 {
164        i += 1;
165        if tokens[i] == "."
166            && tokens[i - 1].chars().all(|c| c.is_numeric())
167            && tokens[i + 1].chars().all(|c| c.is_numeric())
168        {
169            let combined_value = [
170                tokens[i - 1].as_str(),
171                tokens[i].as_str(),
172                tokens[i + 1].as_str(),
173            ]
174            .concat();
175            tokens.remove(i + 1);
176            tokens.remove(i);
177            tokens.remove(i - 1);
178            tokens.insert(i - 1, combined_value);
179        }
180    }
181}
183/* If a substract sign is at the front of number unifiy into one token
184*/
185fn form_negative_nums(tokens: &mut Vec<String>) {
186    let mut i = 0;
187    while tokens.len() > i + 1 {
188        if tokens[i] == "-" && tokens[i + 1].chars().all(|c| c.is_numeric() || c == '.') {
189            let combined_value = [tokens[i].as_str(), tokens[i + 1].as_str()].concat();
190            tokens.remove(i + 1);
191            tokens.remove(i);
192            tokens.insert(i, combined_value);
193        }
194        i += 1;
195    }
196}
198fn form_known_symbols(tokens: &mut Vec<String>) {
199    let mut i = 0;
200    while i < tokens.len() - 1 {
201        if DELIMINATORS.contains(&tokens[i].chars().next().unwrap()) {
202            for &symbol in KNOWN_SYMBOLS {
203                if symbol.len() <= tokens.len() - i {
204                    if symbol == tokens[i..i + symbol.len()].concat() {
205                        tokens.drain(i..i + symbol.len());
206                        tokens.insert(i, symbol.into());
207                    }
208                }
209            }
210        }
211        i += 1;
212    }
213}
215pub fn tokenise(text: String) -> Result<Vec<String>, String> {
216    let mut tokens = Vec::<String>::new();
217    let mut last_i = 0;
218    let characters: Vec<char> = text.chars().collect();
219    let mut i = 0;
220
221    while i < characters.len() {
222        // println!("{tokens:?}");
223        let c = characters[i];
224        match c {
225            '\'' | '"' => {
226                let (token, i2) = walk_string(&characters, i + 1, c)?;
227                tokens.push(token);
228                i = i2;
229                last_i = i;
230            }
231            '%' => match walk_single_line_comment(&characters, i + 1) {
232                Ok(i2) => {
233                    i = i2;
234                    last_i = i;
235                    tokens.push("\n".into());
236                }
237                Err(_) => {
238                    i = characters.len();
239                    last_i = i;
240                    break;
241                }
242            },
243            '/' => {
244                if characters[i + 1] == '*' {
245                    let (i2, newlines) = walk_multi_line_comment(&characters, i + 1)?;
246                    i = i2;
247                    last_i = i;
248                    for _ in 0..newlines {
249                        tokens.push("\n".into());
250                    }
251                } else {
252                    tokens.push(text[last_i..i].into());
253                    tokens.push(c.into());
254                    i += 1;
255                    last_i = i;
256                }
257            }
258            c if DELIMINATORS.contains(&c) => {
259                tokens.push(text[last_i..i].into());
260                tokens.push(c.into());
261                i += 1;
262                last_i = i;
263            }
264            _ => i += 1,
265        }
266    }
267    tokens.push(text[last_i..].into());
268    tokens.retain(|token| "" != *token);
269    form_known_symbols(&mut tokens);
270    join_decimal_nums(&mut tokens);
271    form_negative_nums(&mut tokens);
272    form_empty_list_token(&mut tokens);
273    form_empty_set_token(&mut tokens);
274    form_empty_tuple_token(&mut tokens);
275    tokens.retain(|t1| !["\t", " ", "\r"].iter().any(|t2| t1 == t2));
276    Ok(tokens)
277}
279#[cfg(test)]
280mod tests {
281    use super::tokenise;
282
283    #[test]
284    fn single_line_comments() {
285        let file =
286            "\n%simple predicate\np(x,y):-%The head\nq(x),%body 1\nr(y).%body 2".to_string();
287
288        let tokens = tokenise(file).unwrap();
289
290        assert_eq!(
291            tokens,
292            [
293                "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "\n", "q", "(", "x", ")", ",",
294                "\n", "r", "(", "y", ")", "."
295            ]
296        );
297    }
298
299    #[test]
300    fn multi_line_comments() {
301        let file = "/* This is a\nmutli\nline\ncomment */\np(x,y):-q(x,y).".to_string();
302
303        let tokens = tokenise(file).unwrap();
304
305        assert_eq!(
306            tokens,
307            [
308                "\n", "\n", "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "q", "(", "x", ",",
309                "y", ")", "."
310            ]
311        )
312    }
313
314    #[test]
315    fn unclosed_multi_line_comments() {
316        let file = "p(x,y):-q(x,y)
317        /* a comment
318        on two lines
319        fact(a,b)."
320            .to_string();
321
322        match tokenise(file) {
323            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
324            Err(message) => assert_eq!(message, "Unclosed multi line comment"),
325        }
326    }
327
328    #[test]
329    fn unclosed_strings() {
330        let file = "\"a string".to_string();
331
332        match tokenise(file) {
333            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
334            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing \""),
335        }
336
337        let file = "'a string".to_string();
338
339        match tokenise(file) {
340            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
341            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing '"),
342        }
343    }
344
345    #[test]
346    fn string_tokenisation() {
347        assert_eq!(tokenise("\"a.b\'/c\"".into()).unwrap(), ["\"a.b'/c\""]);
348        assert_eq!(tokenise("'a.b\"/c'".into()).unwrap(), ["'a.b\"/c'"]);
349
350        assert_eq!(
351            tokenise("p(\"a.b\'/c\").".into()).unwrap(),
352            ["p", "(", "\"a.b'/c\"", ")", "."]
353        );
354    }
355
356    #[test]
357    fn string_escape_characters() {
358        assert_eq!(tokenise("\"a\\\"b\"".into()).unwrap(), ["\"a\"b\""]);
359
360        assert_eq!(tokenise("'a\\'b'".into()).unwrap(), ["'a'b'"]);
361
362        assert_eq!(
363            tokenise("\" \\n \\t \\\\ \"".into()).unwrap(),
364            ["\" \n \t \\ \""]
365        );
366    }
367
368    #[test]
369    fn float_tokenisation() {
370        assert_eq!(tokenise("123.123".into()).unwrap(), ["123.123"]);
371        assert_eq!(
372            tokenise("123.123.123".into()).unwrap(),
373            ["123.123", ".", "123"]
374        );
375        assert_eq!(tokenise("123 . 123".into()).unwrap(), ["123", ".", "123"]);
376    }
377
378    #[test]
379    fn negative_number_tokenisation() {
380        assert_eq!(tokenise("-123 - 123".into()).unwrap(), ["-123", "-", "123"]);
381        assert_eq!(tokenise("-123.123".into()).unwrap(), ["-123.123"]);
382        assert_eq!(tokenise("- 123.123".into()).unwrap(), ["-", "123.123"]);
383        assert_eq!(tokenise("-123 . 123".into()).unwrap(), ["-123", ".", "123"]);
384    }
385
386    #[test]
387    fn list_tokenisation() {
388        assert_eq!(
389            tokenise("[123,abc, VAR]".into()).unwrap(),
390            ["[", "123", ",", "abc", ",", "VAR", "]"]
391        );
392        assert_eq!(
393            tokenise("[123,abc, VAR|T]".into()).unwrap(),
394            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
395        );
396        assert_eq!(
397            tokenise("[123,abc, VAR | T]".into()).unwrap(),
398            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
399        );
400    }
401
402    #[test]
403    fn empty_list_token() {
404        assert_eq!(tokenise("[]".into()).unwrap(), ["[]"]);
405        assert_eq!(tokenise("[ ]".into()).unwrap(), ["[]"]);
406        assert_eq!(tokenise("[\n]".into()).unwrap(), ["[]", "\n"]);
407        assert_eq!(tokenise("[\n\n]".into()).unwrap(), ["[]", "\n", "\n"]);
408        assert_eq!(tokenise("[\t]".into()).unwrap(), ["[]"]);
409        assert_eq!(tokenise("[  \n\t\n ]".into()).unwrap(), ["[]", "\n", "\n"]);
410    }
411
412    #[test]
413    fn empty_set_token() {
414        assert_eq!(tokenise("{}".into()).unwrap(), ["{}"]);
415        assert_eq!(tokenise("{ }".into()).unwrap(), ["{}"]);
416        assert_eq!(tokenise("{\n}".into()).unwrap(), ["{}", "\n"]);
417        assert_eq!(tokenise("{\n\n}".into()).unwrap(), ["{}", "\n", "\n"]);
418        assert_eq!(tokenise("{\t}".into()).unwrap(), ["{}"]);
419        assert_eq!(tokenise("{  \n\t\n }".into()).unwrap(), ["{}", "\n", "\n"]);
420    }
421
422    #[test]
423    fn known_symbol_formation() {
424        assert_eq!(tokenise(":-".into()).unwrap(), [":-"]);
425        assert_eq!(tokenise(": -".into()).unwrap(), [":", "-"]);
426
427        assert_eq!(tokenise("==".into()).unwrap(), ["=="]);
428        assert_eq!(tokenise("= =".into()).unwrap(), ["=", "="]);
429
430        assert_eq!(tokenise("=/=".into()).unwrap(), ["=/="]);
431        assert_eq!(tokenise("= / =".into()).unwrap(), ["=", "/", "="]);
432
433        assert_eq!(tokenise("/=".into()).unwrap(), ["/="]);
434        assert_eq!(tokenise("/ =".into()).unwrap(), ["/", "="]);
435
436        assert_eq!(tokenise("=:=".into()).unwrap(), ["=:="]);
437        assert_eq!(tokenise("=  : =".into()).unwrap(), ["=", ":", "="]);
438
439        assert_eq!(tokenise("**".into()).unwrap(), ["**"]);
440        assert_eq!(tokenise("* *".into()).unwrap(), ["*", "*",]);
441
442        assert_eq!(tokenise("<=".into()).unwrap(), ["<="]);
443        assert_eq!(tokenise("< =".into()).unwrap(), ["<", "=",]);
444
445        assert_eq!(tokenise(">=".into()).unwrap(), [">="]);
446        assert_eq!(tokenise("> =".into()).unwrap(), [">", "=",]);
447
448        assert_eq!(tokenise("<=".into()).unwrap(), ["<="]);
449        assert_eq!(tokenise("< =".into()).unwrap(), ["<", "=",]);
450    }
451
452    #[test]
453    fn tokenise_multiple_clauses() {
454        let text = " p(a,[b,c|[\t]]).
455        p(X,Y):- Q(X,Y), {Q}.
456        :- [\"file/name\"], test.
457        "
458        .to_string();
459
460        assert_eq!(
461            tokenise(text).unwrap(),
462            [
463                "p",
464                "(",
465                "a",
466                ",",
467                "[",
468                "b",
469                ",",
470                "c",
471                "|",
472                "[]",
473                "]",
474                ")",
475                ".",
476                "\n",
477                "p",
478                "(",
479                "X",
480                ",",
481                "Y",
482                ")",
483                ":-",
484                "Q",
485                "(",
486                "X",
487                ",",
488                "Y",
489                ")",
490                ",",
491                "{",
492                "Q",
493                "}",
494                ".",
495                "\n",
496                ":-",
497                "[",
498                "\"file/name\"",
499                "]",
500                ",",
501                "test",
502                ".",
503                "\n"
504            ]
505        );
506    }
507}