//! Parse text into words, newlines and whitespace sequences
//!
//! ```rust
//! use embedded_text::parser::{Parser, Token};
//!
//! let parser = Parser::parse("Hello, world!\n");
//! let tokens = parser.collect::<Vec<Token<'_>>>();
//!
//! assert_eq!(vec![
//!     Token::Word("Hello,"),
//!     Token::Whitespace(1),
//!     Token::Word("world!"),
//!     Token::NewLine
//! ], tokens);
//! ```
use core::str::Chars;

/// A text token
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Token<'a> {
    /// A newline character
    NewLine,

    /// n whitespace characters (a run of consecutive non-newline whitespace)
    Whitespace(u32),

    /// A word (a sequence of non-whitespace characters)
    Word(&'a str),
}

/// Text parser. Turns a string into a stream of [`Token`] objects.
#[derive(Clone, Debug)]
pub struct Parser<'a> {
    /// The remaining, not-yet-tokenized input.
    inner: Chars<'a>,
}

impl<'a> Parser<'a> {
    /// Create a new parser object to process the given piece of text
    #[inline]
    #[must_use]
    pub fn parse(text: &'a str) -> Self {
        Self {
            inner: text.chars(),
        }
    }

    /// Returns the next token without advancing
    #[inline]
    #[must_use]
    pub fn peek(&self) -> Option<Token> {
        // Cloning is cheap here: `Chars` is just an iterator over a borrowed
        // string slice, so this copies no text.
        self.clone().next()
    }

    /// Returns true if there are no tokens to process
    #[inline]
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.inner.as_str().is_empty()
    }
}

impl<'a> Iterator for Parser<'a> {
    type Item = Token<'a>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Snapshot the unconsumed input *before* advancing, so `string`
        // still begins with the character returned by `next()` below.
        let string = self.inner.as_str();
        self.inner.next().map(|c| match c {
            '\n' => Token::NewLine,

            c if c.is_whitespace() => {
                // Count the run of consecutive non-newline whitespace
                // characters, including the one just consumed.
                let mut len = 0;
                for (idx, c) in string.char_indices() {
                    if c.is_whitespace() && c != '\n' {
                        len += 1;
                    } else {
                        // `idx` comes from `char_indices`, so it is always a
                        // valid char boundary — safe slicing cannot panic.
                        self.inner = string[idx..].chars();
                        return Token::Whitespace(len);
                    }
                }

                // The whitespace run extends to the end of the input.
                self.inner = "".chars();
                Token::Whitespace(len)
            }

            _ => {
                // A word ends at the first whitespace character.
                for (possible_end, c) in string.char_indices() {
                    if c.is_whitespace() {
                        // `split_at` only panics on a non-boundary index,
                        // which `char_indices` never yields.
                        let (word, rest) = string.split_at(possible_end);
                        self.inner = rest.chars();
                        return Token::Word(word);
                    }
                }

                // The word extends to the end of the input.
                self.inner = "".chars();
                Token::Word(string)
            }
        })
    }
}

#[cfg(test)]
mod test {
    use super::{Parser, Token};

    /// Tokenizes a mix of words, whitespace runs (including `\r`, which is
    /// currently treated as plain whitespace) and a newline.
    #[test]
    fn parse() {
        let text = "Lorem ipsum \r dolor sit amet, conse😅ctetur adipiscing\nelit";

        let expected = [
            Token::Word("Lorem"),
            Token::Whitespace(1),
            Token::Word("ipsum"),
            Token::Whitespace(3),
            Token::Word("dolor"),
            Token::Whitespace(1),
            Token::Word("sit"),
            Token::Whitespace(1),
            Token::Word("amet,"),
            Token::Whitespace(1),
            Token::Word("conse😅ctetur"),
            Token::Whitespace(1),
            Token::Word("adipiscing"),
            Token::NewLine,
            Token::Word("elit"),
        ];

        let actual: Vec<Token> = Parser::parse(text).collect();
        assert_eq!(actual, expected);
    }

    /// A multi-byte character at the very end of the input must be kept intact.
    #[test]
    fn parse_multibyte_last() {
        let text = "test😅";

        let actual: Vec<Token> = Parser::parse(text).collect();
        assert_eq!(actual, [Token::Word("test😅")]);
    }
}