1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
use core::str::CharIndices;
/// A lexical token produced by the [`Parser`] iterator: a newline, a run of
/// whitespace, or a word borrowed from the input text.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Token<'a> {
/// A `'\n'` character that began a token.
/// NOTE(review): a `'\n'` inside a longer whitespace run is absorbed into
/// `Whitespace` by the parser and does not produce this variant — confirm
/// that is the intended contract.
NewLine,
/// A maximal run of consecutive whitespace characters; the payload is the
/// number of `char`s in the run (not bytes).
Whitespace(u32),
/// A maximal run of non-whitespace characters, borrowed from the input.
Word(&'a str),
}
/// An iterator that tokenizes a string slice into [`Token`]s.
///
/// Construct with [`Parser::parse`]; yielded `Token::Word`s borrow from the
/// original input for the lifetime `'a`.
#[derive(Clone)]
pub struct Parser<'a> {
// Cursor over the remaining input; `as_str()` yields the unconsumed tail.
inner: CharIndices<'a>,
}
impl<'a> Parser<'a> {
#[inline]
#[must_use]
pub fn parse(text: &'a str) -> Self {
Self {
inner: text.char_indices(),
}
}
}
impl<'a> Iterator for Parser<'a> {
    type Item = Token<'a>;

    /// Yields the next token, or `None` when the input is exhausted.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Unconsumed tail of the input: byte 0 of `string` corresponds to
        // byte `start` of the original text.
        let string = self.inner.as_str();
        self.inner.next().map(|(start, c)| match c {
            // A '\n' that begins a token is reported on its own.
            // NOTE(review): a '\n' occurring *inside* a whitespace run is
            // absorbed into `Whitespace` below and never yields `NewLine` —
            // confirm this is the intended contract.
            '\n' => Token::NewLine,
            c if c.is_whitespace() => {
                // Extend the run over every following whitespace char,
                // advancing the real cursor only while the lookahead matches.
                let mut len = 1;
                for (_, next_c) in self.inner.clone() {
                    if !next_c.is_whitespace() {
                        break;
                    }
                    self.inner.next();
                    len += 1;
                }
                Token::Whitespace(len)
            }
            _ => {
                // Look ahead for the first whitespace char; the word is the
                // span before it. `possible_end` and `start` are both char
                // boundaries of the same text, so this slice can never split
                // a multi-byte char — safe indexing replaces the previous
                // undocumented `get_unchecked`.
                for (possible_end, next_c) in self.inner.clone() {
                    if next_c.is_whitespace() {
                        return Token::Word(&string[..possible_end - start]);
                    }
                    self.inner.next();
                }
                // Input ended inside the word: it spans the whole tail.
                // (Was `&string`, a needless `&&str` borrow relying on deref
                // coercion.)
                Token::Word(string)
            }
        })
    }
}
#[cfg(test)]
mod test {
    use super::{Parser, Token};

    /// Words, multi-char whitespace runs, a multi-byte character inside a
    /// word, and a newline all tokenize as expected.
    #[test]
    fn parse() {
        let text = "Lorem ipsum \r dolor sit amet, conse😅ctetur adipiscing\nelit";
        let tokens: Vec<Token> = Parser::parse(text).collect();
        assert_eq!(
            tokens,
            [
                Token::Word("Lorem"),
                Token::Whitespace(1),
                Token::Word("ipsum"),
                Token::Whitespace(3),
                Token::Word("dolor"),
                Token::Whitespace(1),
                Token::Word("sit"),
                Token::Whitespace(1),
                Token::Word("amet,"),
                Token::Whitespace(1),
                Token::Word("conse😅ctetur"),
                Token::Whitespace(1),
                Token::Word("adipiscing"),
                Token::NewLine,
                Token::Word("elit"),
            ]
        );
    }

    /// A word that runs to end-of-input and ends in a multi-byte character is
    /// sliced without panicking.
    #[test]
    fn parse_multibyte_last() {
        let tokens: Vec<Token> = Parser::parse("test😅").collect();
        assert_eq!(tokens, [Token::Word("test😅")]);
    }
}