1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/// Splits `s` into runs of same-typed characters.
///
/// Returns two parallel vectors: the index at which each token starts,
/// and the [`TokenType`] of that token. A new token begins whenever the
/// character class changes; the first character (if any) always starts
/// a token. An empty string yields two empty vectors.
///
/// NOTE(review): indices count `char`s (from `chars().enumerate()`),
/// not bytes — for multi-byte UTF-8 input they are not valid `&str`
/// slice offsets. Confirm against callers whether byte offsets
/// (`char_indices`) were intended.
pub fn tokenize(s: &str) -> (Vec<usize>, Vec<TokenType>) {
    let mut starts = Vec::new();
    let mut kinds: Vec<TokenType> = Vec::new();
    for (pos, ch) in s.chars().enumerate() {
        let kind = token_type(ch);
        // Push a new token exactly when the class differs from the last
        // one pushed; `last()` is `None` on the first character, so the
        // first character always opens a token.
        if kinds.last() != Some(&kind) {
            starts.push(pos);
            kinds.push(kind);
        }
    }
    (starts, kinds)
}
/// Classifies a single character into its token class.
///
/// Checks are ordered digit → whitespace → punctuation; anything that
/// matches none of the ASCII predicates (including every non-ASCII
/// character) falls through to `Text`.
fn token_type(c: char) -> TokenType {
    match c {
        d if d.is_ascii_digit() => TokenType::Number,
        w if w.is_ascii_whitespace() => TokenType::Whitespace,
        p if p.is_ascii_punctuation() => TokenType::Punctuation,
        _ => TokenType::Text,
    }
}
/// Character class of a token produced by [`tokenize`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenType {
/// Initial "no previous token" state; never appears in the output of
/// [`tokenize`], which only emits it as the starting comparison value.
Init,
/// ASCII digits `0`–`9` (`char::is_ascii_digit`).
Number,
/// ASCII whitespace such as space, tab, newline (`char::is_ascii_whitespace`).
Whitespace,
/// ASCII punctuation (`char::is_ascii_punctuation`).
Punctuation,
/// Everything else, including all non-ASCII characters.
Text,
}
#[cfg(test)]
mod tests {
use super::*;
// Table-driven test generator: each `name: (input, expected)` entry
// expands into its own `#[test]` function asserting
// `tokenize(input) == expected`, so failures report per-case.
macro_rules! tokenize_tests {
($($name:ident: $value:expr,)*) => {
$(
#[test]
fn $name() {
let (s, expected) = $value;
assert_eq!(tokenize(s), expected);
}
)*
}
}
// Expected values are `(start_indices, token_types)` pairs; a new
// token starts at every character-class transition, and consecutive
// same-class characters (e.g. "12", " \t") collapse into one token.
tokenize_tests!(
tokenize_text: ("a", (vec![0], vec![TokenType::Text])),
tokenize_number: ("a1", (vec![0, 1], vec![
TokenType::Text,
TokenType::Number,
])),
tokenize_numbers: ("a12", (vec![0, 1], vec![
TokenType::Text,
TokenType::Number,
])),
tokenize_alternating: ("a12bc", (vec![0, 1, 3], vec![
TokenType::Text,
TokenType::Number,
TokenType::Text,
])),
tokenize_number_first: ("12bc", (vec![0, 2], vec![
TokenType::Number,
TokenType::Text,
])),
tokenize_whitespace: ("12b c", (vec![0, 2, 3, 4], vec![
TokenType::Number,
TokenType::Text,
TokenType::Whitespace,
TokenType::Text,
])),
tokenize_whitespaces: ("12b \tc", (vec![0, 2, 3, 5], vec![
TokenType::Number,
TokenType::Text,
TokenType::Whitespace,
TokenType::Text,
])),
tokenize_punctuation: ("12b.c", (vec![0, 2, 3, 4], vec![
TokenType::Number,
TokenType::Text,
TokenType::Punctuation,
TokenType::Text,
])),
);
}