embedded_text/parser/mod.rs

//! Parse text into words, newlines and whitespace sequences.
//!
//! ```rust,ignore
//! use embedded_graphics::pixelcolor::BinaryColor;
//! use embedded_text::parser::{Parser, Token};
//!
//! let parser = Parser::parse("Hello, world!\n");
//! let tokens = parser.collect::<Vec<Token<'_, BinaryColor>>>();
//!
//! assert_eq!(
//!     vec![
//!         Token::Word("Hello,"),
//!         Token::Whitespace(1, " "),
//!         Token::Word("world!"),
//!         Token::NewLine
//!     ],
//!     tokens
//! );
//! ```
use core::{marker::PhantomData, str::Chars};
use embedded_graphics::{prelude::PixelColor, text::DecorationColor};

/// Change text style.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
pub enum ChangeTextStyle<C> {
    /// Reset text style. Disables decoration, removes background color and sets a default text color.
    Reset,

    /// Change text color. `None` means transparent.
    TextColor(Option<C>),

    /// Change background color. `None` means transparent.
    BackgroundColor(Option<C>),

    /// Change color of underlining.
    Underline(DecorationColor<C>),

    /// Change color of strikethrough decoration.
    Strikethrough(DecorationColor<C>),
}

/// A text token
#[derive(Debug, PartialEq, Clone)]
pub enum Token<'a, C> {
    /// A newline character.
    NewLine,

    /// A \r character.
    CarriageReturn,

    /// A \t character.
    Tab,

    /// A number of whitespace characters.
    Whitespace(u32, &'a str),

    /// A word (a sequence of non-whitespace characters).
    Word(&'a str),

    /// A possible wrapping point. Contains the separator character(s).
    Break(&'a str),

    /// Change of text style.
    ChangeTextStyle(ChangeTextStyle<C>),

    /// Move the cursor by a number of characters.
    MoveCursor {
        /// Number of characters to move.
        chars: i32,
        /// True to draw over the area of movement with the background color.
        draw_background: bool,
    },
}

/// Text parser. Turns a string into a stream of [`Token`] objects.
#[derive(Clone, Debug)]
pub(crate) struct Parser<'a, C>
where
    C: PixelColor,
{
    inner: Chars<'a>,
    _marker: PhantomData<C>,
}

pub(crate) const SPEC_CHAR_NBSP: char = '\u{a0}';
pub(crate) const SPEC_CHAR_ZWSP: char = '\u{200b}';
pub(crate) const SPEC_CHAR_SHY: char = '\u{ad}';

fn is_word_char(c: char) -> bool {
    // Word tokens are terminated when a whitespace, zwsp or shy character is found. An exception
    // to this rule is the nbsp, which is whitespace but is included in the word.
    (!c.is_whitespace() || c == SPEC_CHAR_NBSP) && ![SPEC_CHAR_ZWSP, SPEC_CHAR_SHY].contains(&c)
}

fn is_space_char(c: char) -> bool {
    // zero-width space breaks whitespace sequences - this works as long as
    // space handling is symmetrical (i.e. starting == ending behaviour)
    c.is_whitespace() && !['\n', '\r', '\t', SPEC_CHAR_NBSP].contains(&c) || c == SPEC_CHAR_ZWSP
}

impl<'a, C> Parser<'a, C>
where
    C: PixelColor,
{
    /// Create a new parser object to process the given piece of text.
    #[inline]
    #[must_use]
    pub fn parse(text: &'a str) -> Self {
        Self {
            inner: text.chars(),
            _marker: PhantomData,
        }
    }

    pub fn as_str(&self) -> &str {
        self.inner.as_str()
    }

    fn consume_string(&mut self, string: &'a str, c: char) -> &'a str {
        // pointer arithmetic to get the offset of `c` relative to `string`
        let offset = {
            let ptr_start = string.as_ptr() as usize;
            let ptr_cur = self.inner.as_str().as_ptr() as usize;
            ptr_cur - ptr_start - c.len_utf8()
        };

        debug_assert!(string.is_char_boundary(offset));

        unsafe {
            // SAFETY: we only work with character boundaries and
            // offset is <= length
            self.inner = string.get_unchecked(offset..).chars();

            string.get_unchecked(0..offset)
        }
    }
}

impl<'a, C> Iterator for Parser<'a, C>
where
    C: PixelColor,
{
    type Item = Token<'a, C>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let string = self.inner.as_str();

        if let Some(c) = self.inner.next() {
            if is_word_char(c) {
                // find the longest consecutive slice of text for a Word token
                for c in &mut self.inner {
                    if !is_word_char(c) {
                        let consumed = self.consume_string(string, c);
                        return Some(Token::Word(consumed));
                    }
                }

                // consumed all the text
                Some(Token::Word(string))
            } else {
                match c {
                    // special characters
                    '\n' => Some(Token::NewLine),
                    '\r' => Some(Token::CarriageReturn),
                    '\t' => Some(Token::Tab),
                    SPEC_CHAR_ZWSP => Some(Token::Whitespace(0, unsafe {
                        // SAFETY: we only work with character boundaries and
                        // offset is <= length
                        string.get_unchecked(0..c.len_utf8())
                    })),
                    SPEC_CHAR_SHY => Some(Token::Break(
                        "-", // translate SHY to a printable character
                    )),

                    // count consecutive whitespace
                    _ => {
                        let mut len = 1;
                        for c in &mut self.inner {
                            if is_space_char(c) {
                                if c != SPEC_CHAR_ZWSP {
                                    len += 1;
                                }
                            } else {
                                let consumed = self.consume_string(string, c);
                                return Some(Token::Whitespace(len, consumed));
                            }
                        }

                        // consumed all the text
                        Some(Token::Whitespace(len, string))
                    }
                }
            }
        } else {
            None
        }
    }
}

#[cfg(test)]
mod test {
    use embedded_graphics::pixelcolor::BinaryColor;

    use super::{Parser, Token};

    #[track_caller]
    pub fn assert_tokens(text: &str, tokens: std::vec::Vec<Token<BinaryColor>>) {
        assert_eq!(
            Parser::parse(text).collect::<std::vec::Vec<Token<BinaryColor>>>(),
            tokens
        )
    }

    #[test]
    fn test_parse() {
        assert_tokens(
            "Lorem ipsum \r dolor sit am\u{00AD}et,\tconse😅ctetur adipiscing\nelit",
            vec![
                Token::Word("Lorem"),
                Token::Whitespace(1, " "),
                Token::Word("ipsum"),
                Token::Whitespace(1, " "),
                Token::CarriageReturn,
                Token::Whitespace(1, " "),
                Token::Word("dolor"),
                Token::Whitespace(1, " "),
                Token::Word("sit"),
                Token::Whitespace(1, " "),
                Token::Word("am"),
                Token::Break("-"),
                Token::Word("et,"),
                Token::Tab,
                Token::Word("conse😅ctetur"),
                Token::Whitespace(1, " "),
                Token::Word("adipiscing"),
                Token::NewLine,
                Token::Word("elit"),
            ],
        );
    }

    #[test]
    fn parse_zwsp() {
        assert_eq!(9, "two\u{200B}words".chars().count());

        assert_tokens(
            "two\u{200B}words",
            vec![
                Token::Word("two"),
                Token::Whitespace(0, "\u{200B}"),
                Token::Word("words"),
            ],
        );

        // ZWSP is not counted
        assert_tokens("  \u{200B} ", vec![Token::Whitespace(3, "  \u{200B} ")]);
    }

    #[test]
    fn parse_multibyte_last() {
        assert_tokens("test😅", vec![Token::Word("test😅")]);
    }

    #[test]
    fn parse_nbsp_as_word_char() {
        assert_eq!(9, "test\u{A0}word".chars().count());
        assert_tokens("test\u{A0}word", vec![Token::Word("test\u{A0}word")]);
        assert_tokens(
            " \u{A0}word",
            vec![Token::Whitespace(1, " "), Token::Word("\u{A0}word")],
        );
    }
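
    // Illustrative sketch (an additional case, derived from the tokenizer rules
    // above, not part of the original suite): consecutive spaces collapse into a
    // single `Whitespace` token carrying their count, and a "\r\n" pair is
    // reported as separate `CarriageReturn` and `NewLine` tokens.
    #[test]
    fn parse_whitespace_run_and_crlf() {
        assert_tokens(
            "a  b\r\n",
            vec![
                Token::Word("a"),
                Token::Whitespace(2, "  "),
                Token::Word("b"),
                Token::CarriageReturn,
                Token::NewLine,
            ],
        );
    }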

    #[test]
    fn parse_shy_issue_42() {
        assert_tokens(
            "foo\u{AD}bar",
            vec![Token::Word("foo"), Token::Break("-"), Token::Word("bar")],
        );
    }
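
    // Illustrative sketch exercising the private `is_word_char` / `is_space_char`
    // helpers directly (not part of the original suite), following the
    // classification rules documented on those functions: NBSP is treated as part
    // of a word, ZWSP counts as a (zero-width) space, and newline, carriage
    // return, tab and NBSP are excluded from whitespace runs.
    #[test]
    fn character_classification() {
        use super::{is_space_char, is_word_char, SPEC_CHAR_NBSP, SPEC_CHAR_SHY, SPEC_CHAR_ZWSP};

        assert!(is_word_char('a'));
        assert!(is_word_char(SPEC_CHAR_NBSP));
        assert!(!is_word_char(SPEC_CHAR_ZWSP));
        assert!(!is_word_char(SPEC_CHAR_SHY));

        assert!(is_space_char(' '));
        assert!(is_space_char(SPEC_CHAR_ZWSP));
        assert!(!is_space_char('\n'));
        assert!(!is_space_char('\r'));
        assert!(!is_space_char('\t'));
        assert!(!is_space_char(SPEC_CHAR_NBSP));
    }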
}