1const DELIMINATORS: &[char] = &[
4 '(', ')', ',', '.', ' ', '\r', '\n', '\t', '\\', ':', '-', '+', '/', '*', '=', '[', ']', '|',
5 '>', '<', '{', '}',
6];
// Multi-character operators that the delimiter split breaks apart and that
// `form_known_symbols` later stitches back into single tokens.
const KNOWN_SYMBOLS: &[&str] = &[":-", "==", "=/=", "/=", "=:=", "**", "<=", ">=", "/*", "*/"];
8
/// Collapses `"["` + optional whitespace/newlines + `"]"` into a single
/// `"[]"` token, re-inserting any swallowed `"\n"` tokens after it so that
/// downstream line counting is unaffected.
fn form_empty_list_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "[" {
            let mut j = i + 1;
            let mut new_lines = 0;
            loop {
                // `get` (not indexing) prevents an out-of-bounds panic when
                // the token stream ends with "[" followed only by whitespace.
                match tokens.get(j).map(|t| t.as_str()) {
                    Some(" ") | Some("\t") => j += 1,
                    Some("\n") => {
                        new_lines += 1;
                        j += 1;
                    }
                    Some("]") => {
                        tokens.drain(i..j + 1);
                        tokens.insert(i, "[]".into());
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token (or end of input): not an empty list.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
41
/// Collapses `"{"` + optional whitespace/newlines + `"}"` into a single
/// `"{}"` token, re-inserting any swallowed `"\n"` tokens after it so that
/// downstream line counting is unaffected.
fn form_empty_set_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "{" {
            let mut j = i + 1;
            let mut new_lines = 0;
            loop {
                // `get` (not indexing) prevents an out-of-bounds panic when
                // the token stream ends with "{" followed only by whitespace.
                match tokens.get(j).map(|t| t.as_str()) {
                    Some(" ") | Some("\t") => j += 1,
                    Some("\n") => {
                        new_lines += 1;
                        j += 1;
                    }
                    Some("}") => {
                        tokens.drain(i..j + 1);
                        tokens.insert(i, "{}".into());
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token (or end of input): not an empty set.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
70
/// Collapses `"("` + optional whitespace/newlines + `")"` into a single
/// `"()"` token, re-inserting any swallowed `"\n"` tokens after it so that
/// downstream line counting is unaffected.
fn form_empty_tuple_token(tokens: &mut Vec<String>) {
    let mut i = 0;
    while tokens.len() > i + 1 {
        if tokens[i] == "(" {
            let mut j = i + 1;
            let mut new_lines = 0;
            loop {
                // `get` (not indexing) prevents an out-of-bounds panic when
                // the token stream ends with "(" followed only by whitespace.
                match tokens.get(j).map(|t| t.as_str()) {
                    Some(" ") | Some("\t") => j += 1,
                    Some("\n") => {
                        new_lines += 1;
                        j += 1;
                    }
                    Some(")") => {
                        tokens.drain(i..j + 1);
                        tokens.insert(i, "()".into());
                        for _ in 0..new_lines {
                            tokens.insert(i + 1, "\n".into());
                        }
                        break;
                    }
                    // Any other token (or end of input): not an empty tuple.
                    _ => break,
                }
            }
        }
        i += 1;
    }
}
99
/// Consumes a quoted string, starting just *after* the opening `mark` quote.
///
/// Returns the complete token (both quotes included) and the index of the
/// first character after the closing quote. The escapes `\n`, `\t`, `\\`,
/// `\"` and `\'` are decoded; a `\` followed by any other character is an
/// error.
///
/// Errors if the input ends before the closing quote is found.
fn walk_string(
    // `&[char]` rather than `&Vec<char>` (clippy `ptr_arg`); callers coerce.
    characters: &[char],
    mut i: usize,
    mark: char,
) -> Result<(String, usize), String> {
    // Keep the surrounding quotes so later stages can tell strings apart
    // from plain word tokens.
    let mut str = vec![mark];
    while let Some(&c) = characters.get(i) {
        match c {
            c if c == mark => {
                str.push(c);
                return Ok((str.iter().collect(), i + 1));
            }
            '\\' => {
                match characters.get(i + 1) {
                    Some('n') => str.push('\n'),
                    Some('t') => str.push('\t'),
                    Some('\\') => str.push('\\'),
                    Some('"') => str.push('"'),
                    Some('\'') => str.push('\''),
                    Some(_) => return Err("'\\' used without proper escape character".into()),
                    // Trailing backslash at end of input: fall through to
                    // the unterminated-string error below.
                    None => break,
                }
                i += 2
            }
            _ => {
                str.push(c);
                i += 1;
            }
        }
    }
    Err(format!("Unexpected end of file, missing closing {mark}"))
}
133
/// Scans from `i` to the `*/` terminator of a multi-line comment.
///
/// Returns the index just after `*/` together with the number of `'\n'`
/// characters seen inside the comment (so the caller can re-emit them).
///
/// Errors when the comment is never closed.
fn walk_multi_line_comment(characters: &[char], mut i: usize) -> Result<(usize, usize), String> {
    let mut newlines = 0;
    // `i + 2 <= len` instead of `i <= len - 2`: the subtraction underflows
    // on inputs shorter than two characters.
    while i + 2 <= characters.len() {
        if characters[i] == '\n' {
            newlines += 1
        }
        if characters[i..i + 2] == ['*', '/'] {
            return Ok((i + 2, newlines));
        }
        i += 1;
    }
    Err("Unclosed multi line comment".into())
}
147
/// Advances past the body of a single-line (`%`) comment.
///
/// Returns the index just *after* the terminating newline, or `Err(())`
/// when the comment runs to the end of the input (no newline found).
fn walk_single_line_comment(characters: &[char], i: usize) -> Result<usize, ()> {
    // An out-of-range start (never produced by the caller) is treated the
    // same as end-of-input.
    let rest = characters.get(i..).ok_or(())?;
    rest.iter()
        .position(|&c| c == '\n')
        .map(|offset| i + offset + 1)
        .ok_or(())
}
157
/// Joins the token triple `digits "." digits` into one floating-point token
/// (e.g. `"123" "." "45"` becomes `"123.45"`).
fn join_decimal_nums(tokens: &mut Vec<String>) {
    // `mid` points at the candidate "." token; it needs a neighbour on
    // each side, hence the `mid + 1 < len` bound.
    let mut mid = 1;
    while mid + 1 < tokens.len() {
        let is_decimal_point = tokens[mid] == "."
            && tokens[mid - 1].chars().all(char::is_numeric)
            && tokens[mid + 1].chars().all(char::is_numeric);
        if is_decimal_point {
            let merged = format!("{}{}{}", tokens[mid - 1], tokens[mid], tokens[mid + 1]);
            // Replace the three tokens with the single merged number.
            tokens.splice(mid - 1..=mid + 1, [merged]);
        }
        mid += 1;
    }
}
181
/// Merges a `"-"` token with a directly following numeric token into one
/// negative-number token (e.g. `"-" "123.4"` becomes `"-123.4"`).
/// A `"-"` separated from the number by whitespace tokens is left alone.
fn form_negative_nums(tokens: &mut Vec<String>) {
    let mut idx = 0;
    while idx + 1 < tokens.len() {
        let next_is_number = tokens[idx + 1]
            .chars()
            .all(|ch| ch.is_numeric() || ch == '.');
        if tokens[idx] == "-" && next_is_number {
            let number = tokens.remove(idx + 1);
            tokens[idx] = format!("-{number}");
        }
        idx += 1;
    }
}
196
197fn form_known_symbols(tokens: &mut Vec<String>) {
198 let mut i = 0;
199 while i < tokens.len() - 1 {
200 if DELIMINATORS.contains(&tokens[i].chars().next().unwrap()) {
201 for &symbol in KNOWN_SYMBOLS {
202 if symbol.len() <= tokens.len() - i {
203 if symbol == tokens[i..i + symbol.len()].concat() {
204 tokens.drain(i..i + symbol.len());
205 tokens.insert(i, symbol.into());
206 }
207 }
208 }
209 }
210 i += 1;
211 }
212}
213
214pub fn tokenise(text: String) -> Result<Vec<String>, String> {
215 let mut tokens = Vec::<String>::new();
216 let mut last_i = 0;
217 let characters: Vec<char> = text.chars().collect();
218 let mut i = 0;
219
220 while i < characters.len() {
221 let c = characters[i];
223 match c {
224 '\'' | '"' => {
225 let (token, i2) = walk_string(&characters, i + 1, c)?;
226 tokens.push(token);
227 i = i2;
228 last_i = i;
229 }
230 '%' => match walk_single_line_comment(&characters, i + 1) {
231 Ok(i2) => {
232 i = i2;
233 last_i = i;
234 tokens.push("\n".into());
235 }
236 Err(_) => {
237 i = characters.len();
238 last_i = i;
239 break;
240 }
241 },
242 '/' => {
243 if characters[i + 1] == '*' {
244 let (i2, newlines) = walk_multi_line_comment(&characters, i + 1)?;
245 i = i2;
246 last_i = i;
247 for _ in 0..newlines {
248 tokens.push("\n".into());
249 }
250 } else {
251 tokens.push(text[last_i..i].into());
252 tokens.push(c.into());
253 i += 1;
254 last_i = i;
255 }
256 }
257 c if DELIMINATORS.contains(&c) => {
258 tokens.push(text[last_i..i].into());
259 tokens.push(c.into());
260 i += 1;
261 last_i = i;
262 }
263 _ => i += 1,
264 }
265 }
266 tokens.push(text[last_i..].into());
267 tokens.retain(|token| "" != *token);
268 form_known_symbols(&mut tokens);
269 join_decimal_nums(&mut tokens);
270 form_negative_nums(&mut tokens);
271 form_empty_list_token(&mut tokens);
272 form_empty_set_token(&mut tokens);
273 form_empty_tuple_token(&mut tokens);
274 tokens.retain(|t1| !["\t", " ", "\r"].iter().any(|t2| t1 == t2));
275 Ok(tokens)
276}
277
// Unit tests: each one feeds source text straight into `tokenise` and checks
// either the exact token stream or the exact error message.
#[cfg(test)]
mod tests {
    use super::tokenise;

    // A `%` comment is consumed and replaced by a single "\n" token.
    #[test]
    fn single_line_comments() {
        let file = "\n%simple predicate\np(x,y):-%The head\nq(x),%body 1\nr(y).%body 2".to_string();

        let tokens = tokenise(file).unwrap();

        assert_eq!(
            tokens,
            [
                "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "\n", "q", "(", "x", ")", ",",
                "\n", "r", "(", "y", ")", "."
            ]
        );
    }

    // A `/* */` comment is replaced by one "\n" token per newline it spanned.
    #[test]
    fn multi_line_comments() {
        let file = "/* This is a\nmutli\nline\ncomment */\np(x,y):-q(x,y).".to_string();

        let tokens = tokenise(file).unwrap();

        assert_eq!(
            tokens,
            [
                "\n", "\n", "\n", "\n", "p", "(", "x", ",", "y", ")", ":-", "q", "(", "x", ",",
                "y", ")", "."
            ]
        )
    }

    // A comment opened but never closed must surface as a tokenise error.
    #[test]
    fn unclosed_multi_line_comments() {
        let file = "p(x,y):-q(x,y)
        /* a comment
        on two lines
        fact(a,b)."
            .to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unclosed multi line comment"),
        }
    }

    // Both quote styles report a missing closing quote at end of input.
    #[test]
    fn unclosed_strings() {
        let file = "\"a string".to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing \""),
        }

        let file = "'a string".to_string();

        match tokenise(file) {
            Ok(tokens) => panic!("Should have thrown error\nTokens: {tokens:?}"),
            Err(message) => assert_eq!(message, "Unexpected end of file, missing closing '"),
        }
    }

    // Quoted text is one token, quotes included; delimiters inside are inert.
    #[test]
    fn string_tokenisation() {
        assert_eq!(tokenise("\"a.b\'/c\"".into()).unwrap(), ["\"a.b'/c\""]);
        assert_eq!(tokenise("'a.b\"/c'".into()).unwrap(), ["'a.b\"/c'"]);

        assert_eq!(
            tokenise("p(\"a.b\'/c\").".into()).unwrap(),
            ["p", "(", "\"a.b'/c\"", ")", "."]
        );
    }

    // Backslash escapes are decoded inside string tokens.
    #[test]
    fn string_escape_characters() {
        assert_eq!(tokenise("\"a\\\"b\"".into()).unwrap(), ["\"a\"b\""]);

        assert_eq!(tokenise("'a\\'b'".into()).unwrap(), ["'a'b'"]);

        assert_eq!(
            tokenise("\" \\n \\t \\\\ \"".into()).unwrap(),
            ["\" \n \t \\ \""]
        );
    }

    // digits "." digits join into one float token only when directly adjacent.
    #[test]
    fn float_tokenisation() {
        assert_eq!(tokenise("123.123".into()).unwrap(), ["123.123"]);
        assert_eq!(
            tokenise("123.123.123".into()).unwrap(),
            ["123.123", ".", "123"]
        );
        assert_eq!(tokenise("123 . 123".into()).unwrap(), ["123", ".", "123"]);
    }

    // "-" joins a directly following number; a spaced "-" stays an operator.
    #[test]
    fn negative_number_tokenisation() {
        assert_eq!(tokenise("-123 - 123".into()).unwrap(), ["-123", "-", "123"]);
        assert_eq!(tokenise("-123.123".into()).unwrap(), ["-123.123"]);
        assert_eq!(tokenise("- 123.123".into()).unwrap(), ["-", "123.123"]);
        assert_eq!(tokenise("-123 . 123".into()).unwrap(), ["-123", ".", "123"]);
    }

    // List brackets, commas and the tail operator "|" are separate tokens.
    #[test]
    fn list_tokenisation() {
        assert_eq!(
            tokenise("[123,abc, VAR]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "]"]
        );
        assert_eq!(
            tokenise("[123,abc, VAR|T]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
        );
        assert_eq!(
            tokenise("[123,abc, VAR | T]".into()).unwrap(),
            ["[", "123", ",", "abc", ",", "VAR", "|", "T", "]"]
        );
    }

    // "[" + whitespace + "]" collapses to "[]"; contained newlines re-emitted.
    #[test]
    fn empty_list_token() {
        assert_eq!(tokenise("[]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[ ]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[\n]".into()).unwrap(), ["[]", "\n"]);
        assert_eq!(tokenise("[\n\n]".into()).unwrap(), ["[]", "\n", "\n"]);
        assert_eq!(tokenise("[\t]".into()).unwrap(), ["[]"]);
        assert_eq!(tokenise("[ \n\t\n ]".into()).unwrap(), ["[]", "\n", "\n"]);
    }

    // "{" + whitespace + "}" collapses to "{}"; contained newlines re-emitted.
    #[test]
    fn empty_set_token() {
        assert_eq!(tokenise("{}".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{ }".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{\n}".into()).unwrap(), ["{}", "\n"]);
        assert_eq!(tokenise("{\n\n}".into()).unwrap(), ["{}", "\n", "\n"]);
        assert_eq!(tokenise("{\t}".into()).unwrap(), ["{}"]);
        assert_eq!(tokenise("{ \n\t\n }".into()).unwrap(), ["{}", "\n", "\n"]);
    }

    // Adjacent delimiter tokens merge into known operators; spaced ones don't.
    #[test]
    fn known_symbol_formation() {
        assert_eq!(tokenise(":-".into()).unwrap(), [":-"]);
        assert_eq!(tokenise(": -".into()).unwrap(), [":", "-"]);

        assert_eq!(tokenise("==".into()).unwrap(), ["=="]);
        assert_eq!(tokenise("= =".into()).unwrap(), ["=", "="]);

        assert_eq!(tokenise("=/=".into()).unwrap(), ["=/="]);
        assert_eq!(tokenise("= / =".into()).unwrap(), ["=", "/", "="]);

        assert_eq!(tokenise("/=".into()).unwrap(), ["/="]);
        assert_eq!(tokenise("/ =".into()).unwrap(), ["/", "="]);

        assert_eq!(tokenise("=:=".into()).unwrap(), ["=:="]);
        assert_eq!(tokenise("= : =".into()).unwrap(), ["=", ":", "="]);

        assert_eq!(tokenise("**".into()).unwrap(), ["**"]);
        assert_eq!(tokenise("* *".into()).unwrap(), ["*", "*",]);

        assert_eq!(tokenise("<=".into()).unwrap(), ["<="]);
        assert_eq!(tokenise("< =".into()).unwrap(), ["<", "=",]);

        assert_eq!(tokenise(">=".into()).unwrap(), [">="]);
        assert_eq!(tokenise("> =".into()).unwrap(), [">", "=",]);

        assert_eq!(tokenise("<=".into()).unwrap(), ["<="]);
        assert_eq!(tokenise("< =".into()).unwrap(), ["<", "=",]);
    }

    // End-to-end: several clauses, strings, lists and sets in one source.
    // (Indentation inside the literal is dropped as whitespace tokens.)
    #[test]
    fn tokenise_multiple_clauses() {
        let text = " p(a,[b,c|[\t]]).
        p(X,Y):- Q(X,Y), {Q}.
        :- [\"file/name\"], test.
        "
        .to_string();

        assert_eq!(
            tokenise(text).unwrap(),
            [
                "p",
                "(",
                "a",
                ",",
                "[",
                "b",
                ",",
                "c",
                "|",
                "[]",
                "]",
                ")",
                ".",
                "\n",
                "p",
                "(",
                "X",
                ",",
                "Y",
                ")",
                ":-",
                "Q",
                "(",
                "X",
                ",",
                "Y",
                ")",
                ",",
                "{",
                "Q",
                "}",
                ".",
                "\n",
                ":-",
                "[",
                "\"file/name\"",
                "]",
                ",",
                "test",
                ".",
                "\n"
            ]
        );
    }
}