// Characters that end the token being accumulated; each delimiter also becomes
// a one-character token of its own. Pure-whitespace tokens ("\t", " ", "\r")
// are stripped at the end of `tokenise`, while "\n" tokens are kept so line
// structure survives tokenisation.
const DELIMINATORS: &[char] = &[
    '(', ')', ',', '.', ' ', '\r', '\n', '\t', '\\', ':', '-', '+', '/', '*', '=', '[', ']', '|',
    '>', '<', '{', '}',
];
// Multi-character operators that `form_known_symbols` reassembles from runs of
// adjacent single-character delimiter tokens (e.g. ":" "-" -> ":-").
const KNOWN_SYMBOLS: &[&str] = &[":-", "==", "=/=", "/=", "=:=", "**", "<=", ">=", "/*", "*/"];
8
9
/// Collapses a "[" token, optional whitespace/newline tokens, and a matching
/// "]" token into the single token "[]". Newlines swallowed by the collapse
/// are re-inserted after "[]" so the token stream keeps its line count.
fn form_empty_list_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "[" {
            let mut j = i + 1;
            let mut new_lines = 0;
            // `j < tokens.len()` guards the scan: an unclosed "[" followed
            // only by whitespace at end of input previously indexed past the
            // end of `tokens` and panicked (e.g. tokenising "[ ").
            while j < tokens.len() {
                match tokens[j].as_str() {
                    " " | "\t" => j += 1,
                    "\n" => {
                        new_lines += 1;
                        j += 1;
                    }
                    "]" => {
                        // Replace the whole "[ ... ]" span with "[]".
                        tokens.drain(i..=j);
                        tokens.insert(i, "[]".into());
                        // Preserve the newlines that were inside the brackets.
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token means the list is not empty.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
42
/// Collapses a "{" token, optional whitespace/newline tokens, and a matching
/// "}" token into the single token "{}". Newlines swallowed by the collapse
/// are re-inserted after "{}" so the token stream keeps its line count.
fn form_empty_set_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "{" {
            let mut j = i + 1;
            let mut new_lines = 0;
            // `j < tokens.len()` guards the scan: an unclosed "{" followed
            // only by whitespace at end of input previously indexed past the
            // end of `tokens` and panicked (e.g. tokenising "{ ").
            while j < tokens.len() {
                match tokens[j].as_str() {
                    " " | "\t" => j += 1,
                    "\n" => {
                        new_lines += 1;
                        j += 1;
                    }
                    "}" => {
                        // Replace the whole "{ ... }" span with "{}".
                        tokens.drain(i..=j);
                        tokens.insert(i, "{}".into());
                        // Preserve the newlines that were inside the braces.
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token means the set is not empty.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
71
/// Collapses a "(" token, optional whitespace/newline tokens, and a matching
/// ")" token into the single token "()". Newlines swallowed by the collapse
/// are re-inserted after "()" so the token stream keeps its line count.
fn form_empty_tuple_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "(" {
            let mut j = i + 1;
            let mut new_lines = 0;
            // `j < tokens.len()` guards the scan: an unclosed "(" followed
            // only by whitespace at end of input previously indexed past the
            // end of `tokens` and panicked (e.g. tokenising "( ").
            while j < tokens.len() {
                match tokens[j].as_str() {
                    " " | "\t" => j += 1,
                    "\n" => {
                        new_lines += 1;
                        j += 1;
                    }
                    ")" => {
                        // Replace the whole "( ... )" span with "()".
                        tokens.drain(i..=j);
                        tokens.insert(i, "()".into());
                        // Preserve the newlines that were inside the parens.
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token means the tuple is not empty.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
100
/// Reads a quoted string starting just after the opening quote `mark`
/// (at char index `i`) and returns the complete quoted token, including
/// both quotes, together with the char index just past the closing quote.
///
/// Recognised escapes: `\n`, `\t`, `\\`, `\"`, `\'`. Any other character
/// after a backslash is an error, as is reaching end of input before the
/// closing quote.
///
/// Takes `&[char]` rather than `&Vec<char>` (clippy `ptr_arg`); callers
/// passing `&Vec<char>` still compile via deref coercion.
fn walk_string(
    characters: &[char],
    mut i: usize,
    mark: char,
) -> Result<(String, usize), String> {
    // Build the token directly in a String; it starts with the opening quote.
    let mut buf = String::new();
    buf.push(mark);
    while let Some(&c) = characters.get(i) {
        match c {
            // Closing quote: token complete.
            c if c == mark => {
                buf.push(c);
                return Ok((buf, i + 1));
            }
            '\\' => {
                match characters.get(i + 1) {
                    Some('n') => buf.push('\n'),
                    Some('t') => buf.push('\t'),
                    Some('\\') => buf.push('\\'),
                    Some('"') => buf.push('"'),
                    Some('\'') => buf.push('\''),
                    Some(_) => return Err("'\\' used without proper escape character".into()),
                    // Trailing backslash at EOF: report the unclosed string below.
                    None => break,
                }
                // Skip both the backslash and the escaped character.
                i += 2
            }
            _ => {
                buf.push(c);
                i += 1;
            }
        }
    }
    Err(format!("Unexpected end of file, missing closing {mark}"))
}
134
/// Scans from char index `i` for the closing `*/` of a multi-line comment.
/// Returns the index just past the `*/` plus the number of newlines seen,
/// or an error if the comment is never closed.
///
/// The loop condition is written as `i + 2 <= len` because the original
/// `i <= len - 2` underflows (and panics) when `characters` has fewer than
/// two elements. Takes `&[char]` rather than `&Vec<char>` (clippy `ptr_arg`).
fn walk_multi_line_comment(characters: &[char], mut i: usize) -> Result<(usize, usize), String> {
    let mut newlines = 0;
    while i + 2 <= characters.len() {
        // Count newlines so the caller can keep the stream's line count.
        if characters[i] == '\n' {
            newlines += 1
        }
        if characters[i..i + 2] == ['*', '/'] {
            return Ok((i + 2, newlines));
        }
        i += 1;
    }
    Err("Unclosed multi line comment".into())
}
148
/// Scans from char index `i` to the end of a `%` comment. Returns the index
/// just past the terminating newline, or `Err(())` when the comment runs to
/// the end of input without one.
fn walk_single_line_comment(characters: &Vec<char>, i: usize) -> Result<usize, ()> {
    // Out-of-range start behaves like an immediate EOF, same as the
    // original `get(i)`-driven loop.
    let rest = characters.get(i..).ok_or(())?;
    match rest.iter().position(|&c| c == '\n') {
        Some(offset) => Ok(i + offset + 1),
        None => Err(()),
    }
}
158
/// Merges the token triple `<digits> "." <digits>` into a single decimal
/// number token, e.g. ["123", ".", "5"] -> ["123.5"]. Anything else around
/// a "." is left untouched.
fn join_decimal_nums(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 2 {
        i += 1;
        let is_decimal_point = tokens[i] == "."
            && tokens[i - 1].chars().all(char::is_numeric)
            && tokens[i + 1].chars().all(char::is_numeric);
        if is_decimal_point {
            let merged = format!("{}{}{}", tokens[i - 1], tokens[i], tokens[i + 1]);
            // Swap the three tokens for the single merged number in place.
            tokens.splice(i - 1..=i + 1, std::iter::once(merged));
        }
    }
}
182
/// Merges a "-" token directly followed by a number token (digits and dots)
/// into one negative-number token, e.g. ["-", "123.5"] -> ["-123.5"].
/// A "-" followed by whitespace stays a standalone operator token.
fn form_negative_nums(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        let next_is_number = tokens[i + 1].chars().all(|c| c.is_numeric() || c == '.');
        if tokens[i] == "-" && next_is_number {
            let merged = format!("-{}", tokens[i + 1]);
            // Replace the "-" and the number with the merged token in place.
            tokens.splice(i..=i + 1, std::iter::once(merged));
        }
        i += 1;
    }
}
197
198fn form_known_symbols(tokens: &mut Vec<String>) {
199 let mut i = 0;
200 while i < tokens.len() - 1 {
201 if DELIMINATORS.contains(&tokens[i].chars().next().unwrap()) {
202 for &symbol in KNOWN_SYMBOLS {
203 if symbol.len() <= tokens.len() - i {
204 if symbol == tokens[i..i + symbol.len()].concat() {
205 tokens.drain(i..i + symbol.len());
206 tokens.insert(i, symbol.into());
207 }
208 }
209 }
210 }
211 i += 1;
212 }
213}
214
215pub fn tokenise(text: String) -> Result<Vec<String>, String> {
216 let mut tokens = Vec::<String>::new();
217 let mut last_i = 0;
218 let characters: Vec<char> = text.chars().collect();
219 let mut i = 0;
220
221 while i < characters.len() {
222 let c = characters[i];
224 match c {
225 '\'' | '"' => {
226 let (token, i2) = walk_string(&characters, i + 1, c)?;
227 tokens.push(token);
228 i = i2;
229 last_i = i;
230 }
231 '%' => match walk_single_line_comment(&characters, i + 1) {
232 Ok(i2) => {
233 i = i2;
234 last_i = i;
235 tokens.push("\n".into());
236 }
237 Err(_) => {
238 i = characters.len();
239 last_i = i;
240 break;
241 }
242 },
243 '/' => {
244 if characters[i + 1] == '*' {
245 let (i2, newlines) = walk_multi_line_comment(&characters, i + 1)?;
246 i = i2;
247 last_i = i;
248 for _ in 0..newlines {
249 tokens.push("\n".into());
250 }
251 } else {
252 tokens.push(text[last_i..i].into());
253 tokens.push(c.into());
254 i += 1;
255 last_i = i;
256 }
257 }
258 c if DELIMINATORS.contains(&c) => {
259 tokens.push(text[last_i..i].into());
260 tokens.push(c.into());
261 i += 1;
262 last_i = i;
263 }
264 _ => i += 1,
265 }
266 }
267 tokens.push(text[last_i..].into());
268 tokens.retain(|token| "" != *token);
269 form_known_symbols(&mut tokens);
270 join_decimal_nums(&mut tokens);
271 form_negative_nums(&mut tokens);
272 form_empty_list_token(&mut tokens);
273 form_empty_set_token(&mut tokens);
274 form_empty_tuple_token(&mut tokens);
275 tokens.retain(|t1| !["\t", " ", "\r"].iter().any(|t2| t1 == t2));
276 Ok(tokens)
277}
278
#[cfg(test)]
mod tests {
    use super::tokenise;

    // A '%' comment is replaced by a single "\n" token (its terminating
    // newline), so line structure is preserved.
    #[test]
    fn single_line_comments() {
        let file =
            "\n%simple predicate\np(x,y):-%The head\nq(x),%body 1\nr(y).%body 2".to_string();

        let tokens = tokenise(file).unwrap();

        assert_eq!(
            tokens,
            [
                "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "\n", "q", "(", "x", ")", ",",
                "\n", "r", "(", "y", ")", "."
            ]
        );
    }

    // A "/* ... */" comment yields one "\n" token per newline it spanned.
    #[test]
    fn multi_line_comments() {
        let file = "/* This is a\nmutli\nline\ncomment */\np(x,y):-q(x,y).".to_string();

        let tokens = tokenise(file).unwrap();

        assert_eq!(
            tokens,
            [
                "\n", "\n", "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "q", "(", "x", ",",
                "y", ")", "."
            ]
        )
    }

    // A comment opened but never closed must produce an error, not tokens.
    #[test]
    fn unclosed_multi_line_comments() {
        let file = "p(x,y):-q(x,y)
        /* a comment
        on two lines
        fact(a,b)."
            .to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unclosed multi line comment"),
        }
    }

    // Strings opened with either quote kind must error at end of input.
    #[test]
    fn unclosed_strings() {
        let file = "\"a string".to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing \""),
        }

        let file = "'a string".to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing '"),
        }
    }

    // A quoted string is one token, quotes included; delimiters inside the
    // quotes (".", "/", the other quote kind) are not split.
    #[test]
    fn string_tokenisation() {
        assert_eq!(tokenise("\"a.b\'/c\"".into()).unwrap(), ["\"a.b'/c\""]);
        assert_eq!(tokenise("'a.b\"/c'".into()).unwrap(), ["'a.b\"/c'"]);

        assert_eq!(
            tokenise("p(\"a.b\'/c\").".into()).unwrap(),
            ["p", "(", "\"a.b'/c\"", ")", "."]
        );
    }

    // Backslash escapes inside strings resolve to the escaped character.
    #[test]
    fn string_escape_characters() {
        assert_eq!(tokenise("\"a\\\"b\"".into()).unwrap(), ["\"a\"b\""]);

        assert_eq!(tokenise("'a\\'b'".into()).unwrap(), ["'a'b'"]);

        assert_eq!(
            tokenise("\" \\n \\t \\\\ \"".into()).unwrap(),
            ["\" \n \t \\ \""]
        );
    }

    // digits "." digits merge into one float token; spaced dots do not.
    #[test]
    fn float_tokenisation() {
        assert_eq!(tokenise("123.123".into()).unwrap(), ["123.123"]);
        assert_eq!(
            tokenise("123.123.123".into()).unwrap(),
            ["123.123", ".", "123"]
        );
        assert_eq!(tokenise("123 . 123".into()).unwrap(), ["123", ".", "123"]);
    }

    // "-" merges with an immediately following number; a spaced "-" stays
    // a standalone operator token.
    #[test]
    fn negative_number_tokenisation() {
        assert_eq!(tokenise("-123 - 123".into()).unwrap(), ["-123", "-", "123"]);
        assert_eq!(tokenise("-123.123".into()).unwrap(), ["-123.123"]);
        assert_eq!(tokenise("- 123.123".into()).unwrap(), ["-", "123.123"]);
        assert_eq!(tokenise("-123 . 123".into()).unwrap(), ["-123", ".", "123"]);
    }

    // Non-empty lists keep their brackets, commas and "|" as tokens.
    #[test]
    fn list_tokenisation() {
        assert_eq!(
            tokenise("[123,abc, VAR]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "]"]
        );
        assert_eq!(
            tokenise("[123,abc, VAR|T]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
        );
        assert_eq!(
            tokenise("[123,abc, VAR | T]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
        );
    }

    // "[" + whitespace + "]" collapses to "[]"; swallowed newlines are
    // re-emitted after the "[]" token.
    #[test]
    fn empty_list_token() {
        assert_eq!(tokenise("[]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[ ]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[\n]".into()).unwrap(), ["[]", "\n"]);
        assert_eq!(tokenise("[\n\n]".into()).unwrap(), ["[]", "\n", "\n"]);
        assert_eq!(tokenise("[\t]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[ \n\t\n ]".into()).unwrap(), ["[]", "\n", "\n"]);
    }

    // Same collapse behaviour for "{" ... "}".
    #[test]
    fn empty_set_token() {
        assert_eq!(tokenise("{}".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{ }".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{\n}".into()).unwrap(), ["{}", "\n"]);
        assert_eq!(tokenise("{\n\n}".into()).unwrap(), ["{}", "\n", "\n"]);
        assert_eq!(tokenise("{\t}".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{ \n\t\n }".into()).unwrap(), ["{}", "\n", "\n"]);
    }

    // Adjacent delimiter tokens reassemble into known operators; any
    // whitespace between them prevents the merge.
    #[test]
    fn known_symbol_formation() {
        assert_eq!(tokenise(":-".into()).unwrap(), [":-"]);
        assert_eq!(tokenise(": -".into()).unwrap(), [":", "-"]);

        assert_eq!(tokenise("==".into()).unwrap(), ["=="]);
        assert_eq!(tokenise("= =".into()).unwrap(), ["=", "="]);

        assert_eq!(tokenise("=/=".into()).unwrap(), ["=/="]);
        assert_eq!(tokenise("= / =".into()).unwrap(), ["=", "/", "="]);

        assert_eq!(tokenise("/=".into()).unwrap(), ["/="]);
        assert_eq!(tokenise("/ =".into()).unwrap(), ["/", "="]);

        assert_eq!(tokenise("=:=".into()).unwrap(), ["=:="]);
        assert_eq!(tokenise("= : =".into()).unwrap(), ["=", ":", "="]);

        assert_eq!(tokenise("**".into()).unwrap(), ["**"]);
        assert_eq!(tokenise("* *".into()).unwrap(), ["*", "*",]);

        assert_eq!(tokenise("<=".into()).unwrap(), ["<="]);
        assert_eq!(tokenise("< =".into()).unwrap(), ["<", "=",]);

        assert_eq!(tokenise(">=".into()).unwrap(), [">="]);
        assert_eq!(tokenise("> =".into()).unwrap(), [">", "=",]);

        assert_eq!(tokenise("<=".into()).unwrap(), ["<="]);
        assert_eq!(tokenise("< =".into()).unwrap(), ["<", "=",]);
    }

    // End-to-end check over several clauses mixing lists, sets, strings,
    // operators and newlines.
    #[test]
    fn tokenise_multiple_clauses() {
        let text = " p(a,[b,c|[\t]]).
        p(X,Y):- Q(X,Y), {Q}.
        :- [\"file/name\"], test.
        "
        .to_string();

        assert_eq!(
            tokenise(text).unwrap(),
            [
                "p",
                "(",
                "a",
                ",",
                "[",
                "b",
                ",",
                "c",
                "|",
                "[]",
                "]",
                ")",
                ".",
                "\n",
                "p",
                "(",
                "X",
                ",",
                "Y",
                ")",
                ":-",
                "Q",
                "(",
                "X",
                ",",
                "Y",
                ")",
                ",",
                "{",
                "Q",
                "}",
                ".",
                "\n",
                ":-",
                "[",
                "\"file/name\"",
                "]",
                ",",
                "test",
                ".",
                "\n"
            ]
        );
    }
}