pub mod token;
pub use token::*;
/// Repeatedly applies `rule` to growing windows of `body`, splicing the
/// rule's replacement over each matched window.
///
/// For each start position, the window `[start_index, end_index)` grows one
/// token at a time: `rule` returning `None` means "no match yet — feed me
/// more tokens"; `Some(replacement)` splices `replacement` over the window.
/// If no window starting at a position ever matches, the start advances by
/// one. After a splice, the start advances only when the replacement is at
/// least as long as the window it replaced, so shrinking rewrites are
/// re-examined from the same position.
///
/// When `verbose` is true, each replacement is printed before it is applied.
pub fn process_rule_maybe_verbose(rule: impl Fn(Vec<Token>) -> Option<Vec<Token>>, body: &mut Vec<Token>, verbose: bool) {
    let mut start_index: usize = 0;
    'current_start: while start_index < body.len() {
        let mut end_index: usize = start_index + 1;
        // Grow the window until the rule matches; give up on this start
        // position once the window would run past the end of `body`.
        let replacement: Vec<Token> = loop {
            if end_index > body.len() {
                start_index += 1;
                continue 'current_start;
            }
            if let Some(rep) = rule(body[start_index..end_index].to_vec()) {
                break rep;
            }
            end_index += 1;
        };
        if verbose {
            println!("\nReplacing");
            print_tokens(body[start_index..end_index].to_vec());
            println!("with");
            print_tokens(replacement.clone());
        }
        let r_len = replacement.len();
        body.splice(start_index..end_index, replacement);
        // Only advance when the replacement did not shrink the window; a
        // shorter replacement may enable a new match at the same position.
        if r_len >= end_index - start_index {
            start_index += 1;
        }
    }
}
/// Convenience wrapper around [`process_rule_maybe_verbose`] with verbose
/// output disabled.
pub fn process_rule(rule: impl Fn(Vec<Token>) -> Option<Vec<Token>>, body: &mut Vec<Token>) {
    const VERBOSE: bool = false;
    process_rule_maybe_verbose(rule, body, VERBOSE);
}
/// Runs each rule in `rules`, in order, to completion over `body`.
pub fn process_rules(rules: Vec<impl Fn(Vec<Token>) -> Option<Vec<Token>>>, body: &mut Vec<Token>, verbose: bool) {
    rules
        .into_iter()
        .for_each(|rule| process_rule_maybe_verbose(rule, body, verbose));
}
/// Formats a list of tags as a parenthesized, `; `-separated list,
/// e.g. `("int"; "posInt")`.
///
/// Each tag is rendered via `Debug`, so it appears quoted and with any
/// special characters escaped. An empty list yields `()`.
pub fn format_tags(tags: Vec<&str>) -> String {
    let quoted: Vec<String> = tags.iter().map(|t| format!("{:?}", t)).collect();
    format!("({})", quoted.join("; "))
}
#[cfg(test)]
mod tests {
use std::hint::black_box;
use super::*;
/// Fixture: a greeting token carrying two tags.
fn token1() -> Token<'static> {
    let tags = vec!["greeting", "exclam"];
    token_from_string("Hi!", tags)
}
/// Fixture: a negative-integer token carrying two tags.
fn token2() -> Token<'static> {
    let tags = vec!["int", "neg_int"];
    token_from_string("-890", tags)
}
/// Fixture: both sample tokens, in order.
fn test_tokens() -> Vec<Token<'static>> {
    Vec::from([token1(), token2()])
}
/// Smoke test: `Display` for a single token does not panic.
#[test]
fn print_one_token() {
    let tok = token1();
    println!("{}", tok);
}
/// Smoke test: printing a list of tokens does not panic.
#[test]
fn print_several_tokens() {
    let tokens = test_tokens();
    print_tokens(tokens);
}
/// Smoke test: tokenizing a plain string and printing the result.
#[test]
fn print_base_token_stream() {
    let tokens = str_to_tokens("This is a string.");
    print_tokens(tokens);
}
/// Tags the first token with "ws" when it is a single whitespace character.
/// Always matches (never returns `None`).
/// NOTE(review): `unwrap_or_default()` yields '\u{0}' when the token is not
/// a single character, and '\u{0}' is also tagged here — presumably
/// intentional; confirm how non-single-char tokens reach this rule.
fn whitespace_rule(mut tokens: Vec<Token>) -> Option<Vec<Token>> {
    let c = tokens[0].single_char().unwrap_or_default();
    if c == '\u{0}' || c.is_whitespace() {
        tokens[0].tags.push("ws");
    }
    Some(tokens)
}
/// Tags the first token with "letter" when it is a single alphabetic
/// character. Always matches.
fn letter_rule(mut tokens: Vec<Token>) -> Option<Vec<Token>> {
    let is_letter = tokens[0].single_char().unwrap_or_default().is_alphabetic();
    if is_letter {
        tokens[0].tags.push("letter");
    }
    Some(tokens)
}
/// Groups consecutive "letter" tokens into a single "word" token.
/// Returns `None` while the window still ends in a letter (asking the
/// driver for more tokens); once a non-letter terminator arrives, wraps
/// everything before it as a "word" and keeps the terminator.
fn word_rule(tokens: Vec<Token>) -> Option<Vec<Token>> {
    let ends_with_letter = tokens.last().unwrap_or(&empty_token()).has_tag("letter");
    if ends_with_letter {
        return None;
    }
    if tokens.len() == 1 {
        return Some(tokens);
    }
    let terminator = tokens.last().unwrap().clone();
    let word = wrap(tokens[..tokens.len() - 1].to_vec(), vec!["word"]);
    Some(vec![word, terminator])
}
// Groups consecutive "digit" tokens into an "int"/"posInt" token.
// Returning None asks the driver to extend the window; the wrap happens
// once a non-digit terminator arrives (or immediately for a lone "0").
fn int_rule(tokens: Vec<Token>) -> Option<Vec<Token>> {
match tokens_structure(&tokens) {
TokenStructure::Single(tok) => {
// A lone "0" is wrapped immediately as a complete integer.
if tok.content() == "0" {
Some(vec![wrap(tokens, vec!["int", "posInt"])])
} else if tok.has_tag("digit") {
// Any other single digit may start a longer integer: extend.
None
} else {
// Not a digit at all: leave untouched.
Some(tokens)
}
}
TokenStructure::Multiple => {
// Keep extending while the window still ends in a digit.
if tokens.last().unwrap_or(&empty_token()).has_tag("digit") {
None
} else {
// Wrap everything before the non-digit terminator as the
// integer; keep the terminator itself.
Some(vec![
wrap(tokens[0..tokens.len() - 1].to_vec(), vec!["int", "posInt"]),
tokens.last().unwrap().clone(),
])
}
}
TokenStructure::None => {
Some(tokens)
}
}
}
/// Deletes tokens tagged "ws" (replaces them with nothing). Always matches.
fn remove_whitespace_rule(tokens: Vec<Token>) -> Option<Vec<Token>> {
    let keep = !tokens[0].has_tag("ws");
    Some(if keep { tokens } else { Vec::new() })
}
/// Tags a single decimal-digit token with "digit", and additionally with
/// "nonzero" for 1-9. Always matches.
fn digit_rule(mut tokens: Vec<Token>) -> Option<Vec<Token>> {
    if let TokenStructure::Single(tok) = tokens_structure(&tokens) {
        let c = tok.single_char().unwrap_or_default();
        if c.is_ascii_digit() {
            tokens[0].tags.push("digit");
            if c != '0' {
                tokens[0].tags.push("nonzero");
            }
        }
    }
    Some(tokens)
}
/// Smoke test: a single rule runs to completion over a small body.
#[test]
fn apply_one_rule() {
    let mut tokens = str_to_tokens("A space");
    print_tokens(tokens.clone());
    process_rule(whitespace_rule, &mut tokens);
    print_tokens(tokens);
}
/// The rule pipeline that groups letters into "word" tokens and strips
/// whitespace.
fn word_rules() -> Vec<impl Fn(Vec<Token>) -> Option<Vec<Token>>> {
    Vec::from([
        whitespace_rule,
        letter_rule,
        word_rule,
        remove_whitespace_rule,
    ])
}
/// The rule pipeline that groups digits into "int" tokens and strips
/// whitespace.
fn int_rules() -> Vec<impl Fn(Vec<Token>) -> Option<Vec<Token>>> {
    Vec::from([
        whitespace_rule,
        digit_rule,
        int_rule,
        remove_whitespace_rule,
    ])
}
/// Toy rule: a token tagged "a" followed by one tagged "b" collapses into a
/// single token tagged "c". A lone "a" returns `None` to request a longer
/// window; everything else is left untouched.
fn ab_rule(tokens: Vec<Token>) -> Option<Vec<Token>> {
    match tokens_structure(&tokens) {
        // A lone "a" may be the start of an "ab" pair: ask for more input.
        TokenStructure::Single(tok) if tok.has_tag("a") => None,
        // "a" followed by "b": collapse the pair into a "c" token.
        TokenStructure::Multiple if tokens[1].has_tag("b") => {
            Some(vec![wrap(tokens, vec!["c"])])
        }
        // No match possible here: leave the window unchanged.
        _ => Some(tokens),
    }
}
/// Smoke test: the toy "ab" rule over a mixed string.
#[test]
fn apply_ab() {
    let input = "a b blex ab abab";
    let mut tokens = str_to_tokens(input);
    process_rule(ab_rule, &mut tokens);
    print_tokens(tokens);
}
/// The word pipeline turns "A space" into exactly two "word" tokens.
#[test]
fn apply_words() {
    let text = "A space";
    let mut body = str_to_tokens(text);
    process_rules(word_rules(), &mut body, false);
    print_tokens(body.clone());
    let expected = vec![
        Token {
            body: text,
            indices: 0..1,
            tags: vec!["word"],
        },
        Token {
            body: text,
            indices: 5..10,
            tags: vec!["word"],
        },
    ];
    assert_eq!(body, expected);
}
/// Smoke benchmark: run the word pipeline 1000 times over a paragraph.
/// `black_box` keeps the optimizer from eliding the work.
#[test]
fn big_paragraph_performance() {
    let text = "The donkey than rams him.
Oh my goodness, but the kangaroo jumps over.
And it looks like the seagulls are going for it again!
They're just hitting the tank!
(To the penguins, attacking a T-Rex)
Hit him with your penguin beaks!
What are you doing out there?
Looks like I gotta do everything myself...
Come on, now I'm playing.
Get over here, T-Rex. I'll beat you up.
Now watch out for my spin attack...";
    let mut tokens = str_to_tokens(text);
    for _ in 0..1000 {
        black_box(process_rules(word_rules(), &mut tokens, false));
    }
}
/// The int pipeline splits "123 040 k" into 123, 0, 40, and a leftover "k".
#[test]
fn apply_ints() {
    let text = "123 040 k";
    let mut body = str_to_tokens(text);
    process_rules(int_rules(), &mut body, false);
    let expected = vec![
        Token {
            body: text,
            indices: 0..3,
            tags: vec!["int", "posInt"],
        },
        Token {
            body: text,
            indices: 4..5,
            tags: vec!["int", "posInt"],
        },
        Token {
            body: text,
            indices: 5..7,
            tags: vec!["int", "posInt"],
        },
        Token {
            body: text,
            indices: 8..9,
            tags: vec!["k"],
        },
    ];
    assert_eq!(body, expected);
}
/// Smoke test: `has_tag` lookup and base tokenization both run cleanly.
#[test]
fn has_tag_test() {
    let tok = token_from_string("Hi", vec!["test"]);
    println!("{}", tok.has_tag("test"));
    print_tokens(str_to_tokens("a b blex ab abab"));
}
}