// suiron/tokenizer.rs

//! Functions which tokenize text strings to generate goals.
2//!
3// Cleve Lendon 2023
4
5use crate::str_to_chars;
6use crate::chars_to_string;
7
8use super::goal::Goal;
9use super::parse_goals::*;
10use super::operator::Operator;
11use super::token::{*, Token, TokenType};
12use super::parse_stack::*;
13
/// Determines whether the given character is a letter, number or hyphen.
///
/// Accepts ASCII letters and digits, underscore, hyphen, the soft
/// hyphen (U+00AD), and two blocks of accented / Greek / Cyrillic
/// letters (U+00C0..U+02BF and U+0380..U+050F).
///
/// # Arguments
/// * a character
/// # Return
/// * boolean, true or false
pub fn letter_number_hyphen(ch: char) -> bool {
    matches!(ch,
        'a'..='z' | 'A'..='Z' | '0'..='9' |
        '_' | '-' | '\u{ad}' |
        '\u{c0}'..='\u{2bf}' |
        '\u{380}'..='\u{50f}')
} // letter_number_hyphen()
29
/// Quote, hash and 'at' are invalid between terms.
/// # Arguments
/// * character
/// # Return
/// * bool - true means invalid.
fn invalid_between_terms(ch: char) -> bool {
    matches!(ch, '"' | '#' | '@')
} // invalid_between_terms()
41
42/// Generates a goal from a text string.
43///
44/// # Arguments
45/// * string to parse
46/// # Return
47/// * [Goal](../goal/index.html)) or error message
48/// # Usage
49/// ```
50/// use suiron::*;
51///
52/// let s = "can_swim($X), can_fly($X)";
53/// match generate_goal(s) {
54///     Ok(goal) => { println!("{}", goal); },
55///     Err(err) => { println!("{}", err); },
56/// }
57/// ```
58///
59/// The above should print: `can_swim($X), can_fly($X")`
60pub fn generate_goal(to_parse: &str) -> Result<Goal, String> {
61    match tokenize(to_parse) {
62        Ok(tokens) => {
63            // For debugging.
64            let mut base_token = group_tokens(&tokens, 0);
65            base_token = group_and_tokens(base_token);
66            base_token = group_or_tokens(base_token);
67            return token_tree_to_goal(base_token);
68        },
69        Err(err) => { Err(err) },
70    }
71} // generate_goal()
72
73
74/// tokenize()
75/// Divides the given string into a series of tokens.
76///
77/// Note:
78/// Parentheses can be used to define a complex term,
79/// such as likes(Charles, Gina), or to group terms:
80/// (father($X, $Y); mother($X, $Y))
81///
82/// # Arguments
83/// * string to parse
84/// # Return
85/// * vector of tokens or error message
86fn tokenize(to_parse: &str) -> Result<Vec<Token>, String> {
87
88    let mut tokens: Vec<Token> = vec![];
89    let mut parse_stk = ParseStack::new();
90    let s = to_parse.trim();
91
92    if s.len() == 0 {
93        let msg = "tokenize() - String is empty.".to_string();
94        return Err(msg);
95    }
96
97    let mut start_index = 0;
98    let chrs = str_to_chars!(s);
99    let length = chrs.len();
100
101    // Find a separator (comma, semicolon), if there is one.
102    let mut previous = '#';  // random
103
104    let mut i = start_index;
105    while i < length {
106
107        // Get the top of the stack.
108        let mut top = peek(&mut parse_stk);
109
110        let mut ch = chrs[i];
111        if no_esc(ch, '"', previous) { // Ignore chars between quotes.
112            let mut j = i + 1;
113            let mut prev = '#';
114            while j < length {
115                ch = chrs[j];
116                if no_esc(ch, '"', prev) {
117                    i = j;
118                    break;
119                }
120                j += 1;
121                prev = ch;
122            }
123        }
124        else if no_esc(ch, '(', previous) {
125            // Is the previous character valid in a functor?
126            if letter_number_hyphen(previous) {
127                parse_stk.push(TokenType::Complex);
128            } else {
129                parse_stk.push(TokenType::Group);
130                tokens.push(make_leaf_token("("));
131                start_index = i + 1;
132            }
133        }
134        else if no_esc(ch, ')', previous) {
135            if top == TokenType::Empty {
136                let msg = format!("tokenize() - Unmatched parenthesis: {}", s);
137                return Err(msg);
138            }
139            top = pop(&mut parse_stk);
140            if top == TokenType::Group {
141                let subgoal = chars_to_string!(chrs[start_index..i]);
142                tokens.push(make_leaf_token(&subgoal));
143                tokens.push(make_leaf_token(")"));
144            } else if top != TokenType::Complex {
145                let msg = format!("tokenize() - Unmatched parenthesis: {}", s);
146                return Err(msg);
147            }
148        } else if no_esc(ch, '[', previous) {
149            parse_stk.push(TokenType::LinkedList);
150        } else if no_esc(ch, ']', previous) {
151            if top == TokenType::Empty {
152                let msg = format!("tokenize() - Unmatched bracket: {}", s);
153                return Err(msg);
154            }
155            top = pop(&mut parse_stk);
156            if top != TokenType::LinkedList {
157                let msg = format!("Tokenize() - Unmatched bracket: {}", s);
158                return Err(msg);
159            }
160        }
161        else {
162            // If not inside complex term or linked list...
163            if top != TokenType::Complex && top != TokenType::LinkedList {
164                if invalid_between_terms(ch) {
165                    let msg = format!("tokenize() - Invalid character: {}", s);
166                    return Err(msg);
167                }
168                if no_esc(ch, ',', previous) {   // And
169                    let subgoal = chars_to_string!(chrs[start_index..i]);
170                    tokens.push(make_leaf_token(&subgoal));
171                    tokens.push(make_leaf_token(","));
172                    start_index = i + 1;
173                } else if no_esc(ch, ';', previous) {   // Or
174                    let subgoal = chars_to_string!(chrs[start_index..i]);
175                    tokens.push(make_leaf_token(&subgoal));
176                    tokens.push(make_leaf_token(";"));
177                    start_index = i + 1;
178                }
179            }
180        } // else
181
182        previous = ch;
183        i += 1;
184
185    } // while
186
187    if parse_stk.len() > 0 {
188        let msg = format!("tokenize() - Invalid term: {}", s);
189        return Err(msg);
190    }
191
192    if length - start_index > 0 {
193        let subgoal = chars_to_string!(chrs[start_index..length]);
194        tokens.push(make_leaf_token(&subgoal));
195    }
196
197    return Ok(tokens);
198
199} // tokenize()
200
201
// Ensures that the character being checked matches the match
// character, and is not escaped by a backslash. (Eg. \, \[ )
//
// # Arguments
// * character to check
// * match character
// * previous character
// # Return
// * true if not escaped by backslash
fn no_esc(check: char, match_char: char, previous: char) -> bool {
    check == match_char && previous != '\\'
}
216
/// group_tokens()
///
/// Collects tokens within parentheses into groups.
/// Converts a flat array of tokens into a tree of tokens.
///
/// For example, this:   SUBGOAL SUBGOAL ( SUBGOAL  SUBGOAL )
/// becomes:
///          GROUP
///            |
/// SUBGOAL SUBGOAL GROUP
///                   |
///            SUBGOAL SUBGOAL
///
/// There is a precedence order in subgoals. From highest to lowest.
///
///    Group
///    And
///    Or
///
/// # Argument
/// * `tokens`
/// * `index`
/// # Return
/// * `Token`
///
fn group_tokens(tokens: &Vec<Token>, mut index: usize) -> Token {

    let mut new_tokens: Vec<Token> = vec![];
    let size = tokens.len();

    while index < size {

        let token = tokens[index].clone();
        let the_type = token.get_type();

        if the_type == TokenType::LParen {
            index += 1;
            // Recurse to make a GROUP token from the tokens
            // inside the parentheses.
            let t = group_tokens(tokens, index);
            // Skip past tokens already processed.
            // +1 for right parenthesis
            index += t.number_of_children() + 1;
            new_tokens.push(t);
        } else if the_type == TokenType::RParen {
            // End of a parenthesized group: wrap the tokens
            // collected so far and return to the caller.
            return make_branch_token(TokenType::Group, new_tokens);
        } else {
            // Subgoals and separators are kept as-is.
            new_tokens.push(token);
        }
        index += 1;

    } // for

    // End of input: the whole token list becomes one Group.
    return make_branch_token(TokenType::Group, new_tokens)

} // group_tokens
273
274
/// group_and_tokens()
///
/// Groups child tokens which are separated by commas.
///
/// # Arguments
/// * `token`
/// # Return
/// * `token`
/// # Panics
/// * If given token is not a branch token.
fn group_and_tokens(token: Token) -> Token {

    match token {
        Token::Leaf{ token_type: tt, token_str: _ } => {
            let err = format!(
                "group_and_tokens() - Requires branch token: {tt}"
            );
            panic!("{}", err);
        },
        Token::Branch{ token_type, children } => {

            // new_children - children of the rebuilt branch token
            // and_list     - subgoals collected for the current And
            let mut new_children: Vec<Token> = vec![];
            let mut and_list: Vec<Token> = vec![];

            for child in children {
                let child_type = child.get_type();
                if child_type == TokenType::Subgoal {
                    and_list.push(child);
                }
                else if child_type == TokenType::Comma {
                    // Commas merely separate members of one And.
                    // Nothing to do.
                }
                else if child_type == TokenType::Semicolon {
                    // Must be end of comma separated list.
                    // A single collected subgoal is kept as-is; two
                    // or more are wrapped in an And token.
                    let size = and_list.len();
                    if size == 1 {
                        new_children.push(and_list[0].clone());
                    } else {
                        new_children.push(
                            make_branch_token(TokenType::And, and_list)
                        );
                    }
                    new_children.push(child);
                    and_list = vec![];
                }
                else if child_type == TokenType::Group {
                    // Nested group: group its commas and semicolons
                    // recursively, then treat it as one operand.
                    let mut t = group_and_tokens(child);
                    t = group_or_tokens(t);
                    and_list.push(t);
                }
            } // for

            // Flush the last comma separated list, if any.
            let size = and_list.len();
            if size == 1 {
                new_children.push(and_list[0].clone());
            } else if size > 1 {
                new_children.push(
                    make_branch_token(TokenType::And, and_list)
                );
            }

            make_branch_token(token_type, new_children)

        } // Token::Branch
    } // match
} // group_and_tokens
341
342
343/// group_or_tokens()
344///
345/// Groups child tokens which are separated by semicolons into
346/// an Or token. The given token must be a branch token.
347///
348/// # Arguments
349/// * `token`
350/// # Return
351/// * `token`
352/// # Panics
353/// * If given token is not a branch token.
354fn group_or_tokens(token: Token) -> Token {
355
356    match token {
357        Token::Leaf{ token_type: tt, token_str: _ } => {
358            let err = format!(
359                "group_or_tokens() - Requires branch token: {tt}"
360            );
361            panic!("{}", err);
362        },
363        Token::Branch{ token_type, children } => {
364
365            let mut new_children: Vec<Token> = vec![];
366            let mut or_list: Vec<Token> = vec![];
367
368            for child in children {
369                let child_type = child.get_type();
370                if child_type == TokenType::Subgoal ||
371                   child_type == TokenType::And ||
372                   child_type == TokenType::Group {
373                    or_list.push(child);
374                }
375                else if child_type == TokenType::Semicolon {
376                    // Nothing to do.
377                }
378            } // for
379
380            let size = or_list.len();
381            if size == 1 {
382                new_children.push(or_list[0].clone());
383            }
384            else if size > 1 {
385                new_children.push(
386                    make_branch_token(TokenType::Or, or_list)
387                );
388            }
389
390            make_branch_token(token_type, new_children)
391
392        } // Branch
393    } // match
394} // group_or_tokens
395
396
397/// token_tree_to_goal()
398///
399/// Returns a goal for the given token tree, or an error message
400///
401/// # Argument
402/// * `token` - base of token tree
403/// # Return
404/// * `Result` - Ok(Goal) or Err(message)
405/// # Panics
406/// * If leaf token is not a Subgoal.
407/// * If branch token Group does not have 1 child.
408fn token_tree_to_goal(token: Token) -> Result<Goal, String> {
409
410    match token {
411
412        Token::Leaf{ token_type, token_str } => {
413            if token_type == TokenType::Subgoal {
414                return parse_subgoal(&token_str);
415            };
416            let msg = tttg_error("Invalid. Leaf token must be Subgoal.", "");
417            panic!("{}", msg);
418        }, // Leaf
419
420        Token::Branch{ token_type, children: _ } => {
421
422            if token_type == TokenType::And {
423
424                let mut operands: Vec<Goal> = vec![];
425                let children = token.get_children();
426
427                for child in children {
428                    let child_type = child.get_type();
429                    if child_type == TokenType::Subgoal {
430                        let s = child.get_token_str();
431                        match parse_subgoal(&s) {
432                            Ok(g) => { operands.push(g); },
433                            Err(err) => { return Err(err); },
434                        }
435                    }
436                    else if child_type == TokenType::Group {
437                        match token_tree_to_goal(child) {
438                            Ok(g) => { operands.push(g); },
439                            Err(err) => { return Err(err); },
440                        }
441                    }
442                } // for child...
443
444                let op = Operator::And(operands);
445                return Ok(Goal::OperatorGoal(op));
446            }; // token_type == And
447
448            if token_type == TokenType::Or {
449
450                let mut operands: Vec<Goal> = vec![];
451                let children = token.get_children();
452
453                for child in children {
454                    let child_type = child.get_type();
455                    if child_type == TokenType::Subgoal {
456                        let s = child.get_token_str();
457                        match parse_subgoal(&s) {
458                            Ok(g) => { operands.push(g); },
459                            Err(err) => { return Err(err); },
460                        }
461                    }
462                    else if child_type == TokenType::Group {
463                        match token_tree_to_goal(child) {
464                            Ok(g) => { operands.push(g); },
465                            Err(err) => { return Err(err); },
466                        }
467                    }
468                } // for child...
469                let op = Operator::Or(operands);
470                return Ok(Goal::OperatorGoal(op));
471            };
472
473            if token_type == TokenType::Group {
474
475                if token.number_of_children() != 1 {
476                    let msg = tttg_error("Group should have 1 child.", "");
477                    panic!("{}", msg);
478                }
479
480                let children = token.get_children();
481                let child = children[0].clone();
482                return token_tree_to_goal(child);
483            };
484
485            let tt = token_type.to_string();
486            let msg = tttg_error("Invalid token type:", &tt);
487            return Err(msg);
488
489        }, // Branch
490
491    } // match
492
493}  // token_tree_to_goal()
494
495
// Formats an error message for token_tree_to_goal().
// Arguments:
//   err - error description
//   bad - string which caused the error
// Return:
//   error message (String)
fn tttg_error(err: &str, bad: &str) -> String {
    let mut msg = String::from("token_tree_to_goal() - ");
    msg.push_str(err);
    msg.push(' ');
    msg.push_str(bad);
    msg
}
505
506
#[cfg(test)]
mod test {

    use crate::*;
    use super::*;

    /// tokens_to_string()
    ///
    /// Makes a string representation of a vector of tokens,
    /// for debugging purposes.
    ///
    /// # Argument
    /// * `tokens` - vector of tokens
    /// # Return
    /// * `String`
    fn tokens_to_string(tokens: &Vec<Token>) -> String {
        let mut first = true;
        let mut out = "".to_string();
        for token in tokens {
            // Separate entries with a single space.
            if !first { out += " "; }
            else { first = false; }
            let token_type = token.get_type();
            if token_type == TokenType::Subgoal {
                // Subgoals print their own text, eg. a(1).
                out += &token.get_token_str();
            }
            else {
                // Other tokens print their type, eg. COMMA.
                let s = format!("{}", token_type);
                out += &s.to_uppercase();
            }
        }
        return out;
    } // tokens_to_string()

    /// make_test_tokens()
    /// Returns a vector of tokens for testing.
    /// The tokens are: c1, c2, ( c3; c4 ), c5
    fn make_test_tokens() -> Vec<Token> {

        let c1 = make_leaf_token("a(1)");
        let c2 = make_leaf_token("b(2)");
        let c3 = make_leaf_token("c(3)");
        let c4 = make_leaf_token("d(4)");
        let c5 = make_leaf_token("d(5)");

        let com1 = make_leaf_token(",");
        let com2 = make_leaf_token(",");
        let com3 = make_leaf_token(",");

        let sem = make_leaf_token(";");
        let lp  = make_leaf_token("(");
        let rp  = make_leaf_token(")");
        return vec![c1, com1, c2, com2, lp, c3, sem, c4, rp, com3, c5];
    } // make_test_tokens()

    // Checks the debug formatting of a flat token vector.
    #[test]
    fn test_tokens_to_string() {
        let tokens = make_test_tokens();
        let s1 = "a(1) COMMA b(2) COMMA LPAREN c(3) \
                  SEMICOLON d(4) RPAREN COMMA d(5)";
        let s2 = tokens_to_string(&tokens);
        assert_eq!(s1, s2);
    } // test_tokens_to_string()


    // Checks each stage of tree building (group, and, or) on
    // the token list: c1, c2, ( c3; c4 ), c5
    #[test]
    fn test_group_tokens() {

        let test_tokens = make_test_tokens();

        let token = group_tokens(&test_tokens, 0);
        assert_eq!(token.to_string(),
        "GROUP > SUBGOAL(a(1)) COMMA SUBGOAL(b(2)) COMMA GROUP SUBGOAL(d(5))");

        let token = group_and_tokens(token);
        assert_eq!(token.to_string(), "GROUP > AND");

        let child = &token.get_children()[0];
        assert_eq!(child.to_string(),
        "AND > SUBGOAL(a(1)) SUBGOAL(b(2)) GROUP SUBGOAL(d(5))");

        // The third operand of the And is the nested group.
        let children = child.get_children();
        let child = &children[2];
        assert_eq!(child.to_string(), "GROUP > OR");

        let child = &child.get_children()[0];
        assert_eq!(child.to_string(),
        "OR > SUBGOAL(c(3)) SUBGOAL(d(4))");

    } // test_group_tokens()

    /// make_test_tokens2()
    /// Returns a vector of tokens for testing.
    /// The tokens are: c1, c2; c3, c4
    fn make_test_tokens2() -> Vec<Token> {

        let c1 = make_leaf_token("a(1)");
        let c2 = make_leaf_token("b(2)");
        let c3 = make_leaf_token("c(3)");
        let c4 = make_leaf_token("d(4)");

        let com1 = make_leaf_token(",");
        let com2 = make_leaf_token(",");
        let sem = make_leaf_token(";");

        return vec![c1, com1, c2, sem, c3, com2, c4];
    } // make_test_tokens2()

    // Checks that comma-lists on both sides of a semicolon become
    // two And tokens, which are then joined by an Or token.
    #[test]
    fn test_group_tokens2() {

        let test_tokens = make_test_tokens2();

        let token = group_tokens(&test_tokens, 0);
        assert_eq!(token.to_string(),
             "GROUP > SUBGOAL(a(1)) COMMA SUBGOAL(b(2)) \
             SEMICOLON SUBGOAL(c(3)) COMMA SUBGOAL(d(4))");

        let token = group_and_tokens(token);
        assert_eq!(token.to_string(), "GROUP > AND SEMICOLON AND");

        let token = group_or_tokens(token);
        assert_eq!(token.to_string(), "GROUP > OR");

    } // test_group_tokens2()

    /// make_test_tokens3()
    /// Returns a vector of tokens for testing.
    /// The tokens are: c1; ( c2, c3; c4 )
    fn make_test_tokens3() -> Vec<Token> {

        let c1 = make_leaf_token("a(1)");
        let c2 = make_leaf_token("b(2)");
        let c3 = make_leaf_token("c(3)");
        let c4 = make_leaf_token("d(4)");

        let com  = make_leaf_token(",");
        let sem1 = make_leaf_token(";");
        let sem2 = make_leaf_token(";");

        let lp  = make_leaf_token("(");
        let rp  = make_leaf_token(")");
        return vec![c1, sem1, lp, c2, com, c3, sem2, c4, rp];

    } // make_test_tokens3()

    // Checks grouping with a nested parenthesized group which
    // itself contains both And and Or structure.
    #[test]
    fn test_group_tokens3() {

        let test_tokens = make_test_tokens3();
        let token = group_tokens(&test_tokens, 0);
        let token = group_and_tokens(token);
        let token = group_or_tokens(token);
        assert_eq!(token.to_string(), "GROUP > OR");

        let child = &token.get_children()[0];
        assert_eq!(child.to_string(), "OR > SUBGOAL(a(1)) GROUP");

        // Descend into the nested group's Or token.
        let child = &child.get_children()[1];
        let child = &child.get_children()[0];
        assert_eq!(child.to_string(), "OR > AND SUBGOAL(d(4))");

        let child = &child.get_children()[0];
        assert_eq!(child.to_string(), "AND > SUBGOAL(b(2)) SUBGOAL(c(3))");

    } // test_group_tokens3()

    // group_and_tokens() must panic when given a leaf token.
    #[test]
    #[should_panic]
    fn test_group_tokens_panic() {
        let com1 = make_leaf_token(",");
        group_and_tokens(com1);
    } // test_group_tokens_panic()

    // Checks tokenize() on valid input, on an unclosed
    // parenthesis, and on an unmatched bracket.
    #[test]
    fn test_tokenize() {
        let s = "a(1, 2), b(3, 4); c(5, 6), c(7, 8)";
        match tokenize(s) {
            Ok(tokens) => {
                let s = tokens_to_string(&tokens);
                assert_eq!(s,
                  "a(1, 2) COMMA b(3, 4) SEMICOLON c(5, 6) COMMA c(7, 8)");
            },
            Err(err) => {
                panic!("Should create tokens: {}", err);
            },
        }

        let s = "a(1, 2), b(3, 4";
        match tokenize(s) {
            Ok(_tokens) => {
                panic!("Missing parenthesis. Should not create tokens.");
            },
            Err(err) => {
                assert_eq!(err, "tokenize() - Invalid term: a(1, 2), b(3, 4");
            },
        }

        let s = "$X = 1, 2, 3]";
        match tokenize(s) {
            Ok(_tokens) => {
                panic!("Missing parenthesis. Should not create tokens.");
            },
            Err(err) => {
                assert_eq!(err, "tokenize() - Unmatched bracket: $X = 1, 2, 3]");
            },
        }
    } // test_tokenize()

} // test