// prolog2/parser/tokeniser.rs
//! Lexer: converts Prolog source text into a token stream.

// Characters that end the token currently being scanned; each delimiter is
// also emitted as a single-character token of its own.
const DELIMINATORS: &[char] = &[
    '(', ')', ',', '.', ' ', '\r', '\n', '\t', '\\', ':', '-', '+', '/', '*', '=', '[', ']', '|',
    '>', '<', '{', '}',
];
// Multi-character operators that are re-assembled from runs of adjacent
// single-character delimiter tokens after the first tokenising pass.
const KNOWN_SYMBOLS: &[&str] = &[":-", "==", "=/=", "/=", "=:=", "**", "<=", ">=", "/*", "*/"];
8
9// --------------------------------------------------------------------------------------
10// Tokenise File
11// --------------------------------------------------------------------------------------
12
/// Collapses `[`, optional whitespace/newlines, `]` into a single `[]` token.
///
/// Newline tokens swallowed by the collapse are re-emitted after `[]` so the
/// stream's line count is preserved. Fix over the original: the inner scan is
/// bounded with `tokens.get(j)`, so a trailing unmatched `[` (e.g. tokens
/// ending `["[", " "]`) no longer indexes out of bounds and panics.
fn form_empty_list_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "[" {
            let mut j = i + 1;
            let mut new_lines = 0;
            // Scan forward over whitespace looking for the matching "]".
            while let Some(tok) = tokens.get(j) {
                match tok.as_str() {
                    " " | "\t" => j += 1,
                    "\n" => {
                        new_lines += 1;
                        j += 1;
                    }
                    "]" => {
                        tokens.drain(i..=j);
                        tokens.insert(i, "[]".into());
                        // Re-insert swallowed newlines (all identical, so
                        // their relative order does not matter).
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token means the list is not empty.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
41
/// Collapses `{`, optional whitespace/newlines, `}` into a single `{}` token.
///
/// Newline tokens swallowed by the collapse are re-emitted after `{}` so the
/// stream's line count is preserved. Fix over the original: the inner scan is
/// bounded with `tokens.get(j)`, so a trailing unmatched `{` no longer
/// indexes out of bounds and panics.
fn form_empty_set_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "{" {
            let mut j = i + 1;
            let mut new_lines = 0;
            // Scan forward over whitespace looking for the matching "}".
            while let Some(tok) = tokens.get(j) {
                match tok.as_str() {
                    " " | "\t" => j += 1,
                    "\n" => {
                        new_lines += 1;
                        j += 1;
                    }
                    "}" => {
                        tokens.drain(i..=j);
                        tokens.insert(i, "{}".into());
                        // Re-insert swallowed newlines.
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token means the set is not empty.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
70
/// Collapses `(`, optional whitespace/newlines, `)` into a single `()` token.
///
/// Newline tokens swallowed by the collapse are re-emitted after `()` so the
/// stream's line count is preserved. Fix over the original: the inner scan is
/// bounded with `tokens.get(j)`, so a trailing unmatched `(` no longer
/// indexes out of bounds and panics.
fn form_empty_tuple_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "(" {
            let mut j = i + 1;
            let mut new_lines = 0;
            // Scan forward over whitespace looking for the matching ")".
            while let Some(tok) = tokens.get(j) {
                match tok.as_str() {
                    " " | "\t" => j += 1,
                    "\n" => {
                        new_lines += 1;
                        j += 1;
                    }
                    ")" => {
                        tokens.drain(i..=j);
                        tokens.insert(i, "()".into());
                        // Re-insert swallowed newlines.
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token means the tuple is not empty.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
99
/// Consumes a quoted string/atom starting just AFTER the opening quote.
///
/// `i` indexes `characters` (a char index, not a byte offset) at the first
/// character after the opening `mark` (`'` or `"`). Returns the full quoted
/// token (including both quote marks) and the char index just past the
/// closing quote.
///
/// # Errors
/// - `'\'` followed by an unknown escape character.
/// - End of input reached before the closing `mark`.
fn walk_string(
    characters: &[char],
    mut i: usize,
    mark: char,
) -> Result<(String, usize), String> {
    // TODO: more complex escape character processing
    let mut buf = String::new();
    buf.push(mark);
    while let Some(&c) = characters.get(i) {
        match c {
            c if c == mark => {
                buf.push(c);
                return Ok((buf, i + 1));
            }
            '\\' => {
                // Two-character escape sequence; translate to the real char.
                match characters.get(i + 1) {
                    Some('n') => buf.push('\n'),
                    Some('t') => buf.push('\t'),
                    Some('\\') => buf.push('\\'),
                    Some('"') => buf.push('"'),
                    Some('\'') => buf.push('\''),
                    Some(_) => return Err("'\\' used without proper escape character".into()),
                    None => break, // dangling '\' at EOF -> unclosed-string error
                }
                i += 2
            }
            _ => {
                buf.push(c);
                i += 1;
            }
        }
    }
    Err(format!("Unexpected end of file, missing closing {mark}"))
}
133
/// Skips a `/* ... */` comment; `i` is the char index of the `*` in the
/// opening `/*` (or anywhere inside the comment body).
///
/// Returns the char index just past the closing `*/` and the number of
/// newlines the comment spanned (so the caller can re-emit them).
///
/// Fix over the original: the loop bound was `i <= characters.len() - 2`,
/// which underflows (and panics in debug builds) when fewer than two
/// characters remain; `i + 1 < len` is the same bound without the subtraction.
///
/// # Errors
/// Input ends before a `*/` is found.
fn walk_multi_line_comment(characters: &[char], mut i: usize) -> Result<(usize, usize), String> {
    let mut newlines = 0;
    while i + 1 < characters.len() {
        if characters[i] == '\n' {
            newlines += 1
        }
        if characters[i] == '*' && characters[i + 1] == '/' {
            return Ok((i + 2, newlines));
        }
        i += 1;
    }
    Err("Unclosed multi line comment".into())
}
147
/// Skips a `%` comment: returns `Ok` with the char index just past the
/// terminating newline, or `Err(())` if the comment runs to end of input.
fn walk_single_line_comment(characters: &Vec<char>, i: usize) -> Result<usize, ()> {
    // Find the first newline at or after `i`; `skip` keeps this safe even
    // when `i` is past the end of the input.
    characters
        .iter()
        .skip(i)
        .position(|&c| c == '\n')
        .map(|offset| i + offset + 1)
        .ok_or(())
}
157
/* If two numerical tokens are split by a '.' token, unify them into one
 * floating-point token (e.g. "123", ".", "45" -> "123.45").
 */
fn join_decimal_nums(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 2 {
        i += 1;
        let is_float = tokens[i] == "."
            && tokens[i - 1].chars().all(char::is_numeric)
            && tokens[i + 1].chars().all(char::is_numeric);
        if is_float {
            // Replace the three tokens with their concatenation in place.
            let merged = format!("{}{}{}", tokens[i - 1], tokens[i], tokens[i + 1]);
            tokens.splice(i - 1..=i + 1, std::iter::once(merged));
        }
    }
}
181
182/* If a substract sign is at the front of number unifiy into one token
183*/
184fn form_negative_nums(tokens: &mut Vec<String>) {
185    let mut i = 0;
186    while tokens.len() > i + 1 {
187        if tokens[i] == "-" && tokens[i + 1].chars().all(|c| c.is_numeric() || c == '.') {
188            let combined_value = [tokens[i].as_str(), tokens[i + 1].as_str()].concat();
189            tokens.remove(i + 1);
190            tokens.remove(i);
191            tokens.insert(i, combined_value);
192        }
193        i += 1;
194    }
195}
196
197fn form_known_symbols(tokens: &mut Vec<String>) {
198    let mut i = 0;
199    while i < tokens.len() - 1 {
200        if DELIMINATORS.contains(&tokens[i].chars().next().unwrap()) {
201            for &symbol in KNOWN_SYMBOLS {
202                if symbol.len() <= tokens.len() - i {
203                    if symbol == tokens[i..i + symbol.len()].concat() {
204                        tokens.drain(i..i + symbol.len());
205                        tokens.insert(i, symbol.into());
206                    }
207                }
208            }
209        }
210        i += 1;
211    }
212}
213
214pub fn tokenise(text: String) -> Result<Vec<String>, String> {
215    let mut tokens = Vec::<String>::new();
216    let mut last_i = 0;
217    let characters: Vec<char> = text.chars().collect();
218    let mut i = 0;
219
220    while i < characters.len() {
221        // println!("{tokens:?}");
222        let c = characters[i];
223        match c {
224            '\'' | '"' => {
225                let (token, i2) = walk_string(&characters, i + 1, c)?;
226                tokens.push(token);
227                i = i2;
228                last_i = i;
229            }
230            '%' => match walk_single_line_comment(&characters, i + 1) {
231                Ok(i2) => {
232                    i = i2;
233                    last_i = i;
234                    tokens.push("\n".into());
235                }
236                Err(_) => {
237                    i = characters.len();
238                    last_i = i;
239                    break;
240                }
241            },
242            '/' => {
243                if characters[i + 1] == '*' {
244                    let (i2, newlines) = walk_multi_line_comment(&characters, i + 1)?;
245                    i = i2;
246                    last_i = i;
247                    for _ in 0..newlines {
248                        tokens.push("\n".into());
249                    }
250                } else {
251                    tokens.push(text[last_i..i].into());
252                    tokens.push(c.into());
253                    i += 1;
254                    last_i = i;
255                }
256            }
257            c if DELIMINATORS.contains(&c) => {
258                tokens.push(text[last_i..i].into());
259                tokens.push(c.into());
260                i += 1;
261                last_i = i;
262            }
263            _ => i += 1,
264        }
265    }
266    tokens.push(text[last_i..].into());
267    tokens.retain(|token| "" != *token);
268    form_known_symbols(&mut tokens);
269    join_decimal_nums(&mut tokens);
270    form_negative_nums(&mut tokens);
271    form_empty_list_token(&mut tokens);
272    form_empty_set_token(&mut tokens);
273    form_empty_tuple_token(&mut tokens);
274    tokens.retain(|t1| !["\t", " ", "\r"].iter().any(|t2| t1 == t2));
275    Ok(tokens)
276}
277
#[cfg(test)]
mod tests {
    use super::tokenise;

    // `%` comments are stripped but each leaves a "\n" token behind so line
    // numbering downstream stays intact.
    #[test]
    fn single_line_comments() {
        let file = "\n%simple predicate\np(x,y):-%The head\nq(x),%body 1\nr(y).%body 2".to_string();

        let tokens = tokenise(file).unwrap();

        assert_eq!(
            tokens,
            [
                "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "\n", "q", "(", "x", ")", ",",
                "\n", "r", "(", "y", ")", "."
            ]
        );
    }

    // A `/* */` comment is replaced by one "\n" token per line it spanned.
    #[test]
    fn multi_line_comments() {
        let file = "/* This is a\nmutli\nline\ncomment */\np(x,y):-q(x,y).".to_string();

        let tokens = tokenise(file).unwrap();

        assert_eq!(
            tokens,
            [
                "\n", "\n", "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "q", "(", "x", ",",
                "y", ")", "."
            ]
        )
    }

    // A `/*` with no matching `*/` is a hard tokenisation error.
    #[test]
    fn unclosed_multi_line_comments() {
        let file = "p(x,y):-q(x,y)
        /* a comment
        on two lines
        fact(a,b)."
            .to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unclosed multi line comment"),
        }
    }

    // Both quote styles must be closed before end of input.
    #[test]
    fn unclosed_strings() {
        let file = "\"a string".to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing \""),
        }

        let file = "'a string".to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing '"),
        }
    }

    // Delimiters inside quotes do not split the token.
    #[test]
    fn string_tokenisation() {
        assert_eq!(tokenise("\"a.b\'/c\"".into()).unwrap(), ["\"a.b'/c\""]);
        assert_eq!(tokenise("'a.b\"/c'".into()).unwrap(), ["'a.b\"/c'"]);

        assert_eq!(
            tokenise("p(\"a.b\'/c\").".into()).unwrap(),
            ["p", "(", "\"a.b'/c\"", ")", "."]
        );
    }

    // Escape sequences are translated into the characters they denote.
    #[test]
    fn string_escape_characters() {
        assert_eq!(tokenise("\"a\\\"b\"".into()).unwrap(), ["\"a\"b\""]);

        assert_eq!(tokenise("'a\\'b'".into()).unwrap(), ["'a'b'"]);

        assert_eq!(
            tokenise("\" \\n \\t \\\\ \"".into()).unwrap(),
            ["\" \n \t \\ \""]
        );
    }

    // "123.45" merges into one float token only when the dot is adjacent.
    #[test]
    fn float_tokenisation() {
        assert_eq!(tokenise("123.123".into()).unwrap(), ["123.123"]);
        assert_eq!(
            tokenise("123.123.123".into()).unwrap(),
            ["123.123", ".", "123"]
        );
        assert_eq!(tokenise("123 . 123".into()).unwrap(), ["123", ".", "123"]);
    }

    // A "-" merges with a directly following number; a spaced "-" stays alone.
    #[test]
    fn negative_number_tokenisation() {
        assert_eq!(tokenise("-123 - 123".into()).unwrap(), ["-123", "-", "123"]);
        assert_eq!(tokenise("-123.123".into()).unwrap(), ["-123.123"]);
        assert_eq!(tokenise("- 123.123".into()).unwrap(), ["-", "123.123"]);
        assert_eq!(tokenise("-123 . 123".into()).unwrap(), ["-123", ".", "123"]);
    }

    #[test]
    fn list_tokenisation() {
        assert_eq!(
            tokenise("[123,abc, VAR]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "]"]
        );
        assert_eq!(
            tokenise("[123,abc, VAR|T]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
        );
        assert_eq!(
            tokenise("[123,abc, VAR | T]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
        );
    }

    // "[ ]" collapses to "[]"; swallowed newlines are re-emitted after it.
    #[test]
    fn empty_list_token() {
        assert_eq!(tokenise("[]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[ ]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[\n]".into()).unwrap(), ["[]", "\n"]);
        assert_eq!(tokenise("[\n\n]".into()).unwrap(), ["[]", "\n", "\n"]);
        assert_eq!(tokenise("[\t]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[  \n\t\n ]".into()).unwrap(), ["[]", "\n", "\n"]);
    }

    // Same collapsing behaviour for "{ }".
    #[test]
    fn empty_set_token() {
        assert_eq!(tokenise("{}".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{ }".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{\n}".into()).unwrap(), ["{}", "\n"]);
        assert_eq!(tokenise("{\n\n}".into()).unwrap(), ["{}", "\n", "\n"]);
        assert_eq!(tokenise("{\t}".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{  \n\t\n }".into()).unwrap(), ["{}", "\n", "\n"]);
    }

    // Adjacent delimiter tokens fuse into known operators; spaced ones don't.
    // (The duplicated "<="/"< =" assertions the original had at the end of
    // this test were removed.)
    #[test]
    fn known_symbol_formation() {
        assert_eq!(tokenise(":-".into()).unwrap(), [":-"]);
        assert_eq!(tokenise(": -".into()).unwrap(), [":", "-"]);

        assert_eq!(tokenise("==".into()).unwrap(), ["=="]);
        assert_eq!(tokenise("= =".into()).unwrap(), ["=", "="]);

        assert_eq!(tokenise("=/=".into()).unwrap(), ["=/="]);
        assert_eq!(tokenise("= / =".into()).unwrap(), ["=", "/", "="]);

        assert_eq!(tokenise("/=".into()).unwrap(), ["/="]);
        assert_eq!(tokenise("/ =".into()).unwrap(), ["/", "="]);

        assert_eq!(tokenise("=:=".into()).unwrap(), ["=:="]);
        assert_eq!(tokenise("=  : =".into()).unwrap(), ["=", ":", "="]);

        assert_eq!(tokenise("**".into()).unwrap(), ["**"]);
        assert_eq!(tokenise("* *".into()).unwrap(), ["*", "*",]);

        assert_eq!(tokenise("<=".into()).unwrap(), ["<="]);
        assert_eq!(tokenise("< =".into()).unwrap(), ["<", "=",]);

        assert_eq!(tokenise(">=".into()).unwrap(), [">="]);
        assert_eq!(tokenise("> =".into()).unwrap(), [">", "=",]);
    }

    // End-to-end: several clauses, strings, lists, and sets together.
    #[test]
    fn tokenise_multiple_clauses() {
        let text = " p(a,[b,c|[\t]]).
        p(X,Y):- Q(X,Y), {Q}.
        :- [\"file/name\"], test.
        "
        .to_string();

        assert_eq!(
            tokenise(text).unwrap(),
            [
                "p",
                "(",
                "a",
                ",",
                "[",
                "b",
                ",",
                "c",
                "|",
                "[]",
                "]",
                ")",
                ".",
                "\n",
                "p",
                "(",
                "X",
                ",",
                "Y",
                ")",
                ":-",
                "Q",
                "(",
                "X",
                ",",
                "Y",
                ")",
                ",",
                "{",
                "Q",
                "}",
                ".",
                "\n",
                ":-",
                "[",
                "\"file/name\"",
                "]",
                ",",
                "test",
                ".",
                "\n"
            ]
        );
    }
}