//! Text tokenizer: splits an input string into [`Token`]s such as words,
//! whitespace runs and break opportunities.

#[cfg(feature = "ansi")]
use ansi_parser::AnsiSequence;
use core::str::Chars;
/// A text token.
#[derive(Debug, PartialEq, Clone)]
pub enum Token<'a> {
    /// A `\n` character.
    NewLine,
    /// A `\r` character.
    CarriageReturn,
    /// A `\t` character.
    Tab,
    /// A run of whitespace, with the number of characters it contains.
    Whitespace(u32),
    /// A word: a run of characters that are neither whitespace nor specially
    /// handled characters.
    Word(&'a str),
    /// An optional line break, carrying the character to render if the break
    /// is taken (e.g. `-` for a soft hyphen).
    Break(Option<char>),
    /// A character that is not part of any other token.
    ExtraCharacter(char),
    /// An ANSI escape sequence.
    #[cfg(feature = "ansi")]
    EscapeSequence(AnsiSequence),
}
/// Text parser that turns a string into a stream of [`Token`]s.
#[derive(Clone, Debug)]
pub struct Parser<'a> {
    /// The characters that have not been parsed yet.
    inner: Chars<'a>,
}

/// Non-breaking space.
pub(crate) const SPEC_CHAR_NBSP: char = '\u{a0}';
/// Zero-width space.
pub(crate) const SPEC_CHAR_ZWSP: char = '\u{200b}';
/// Soft hyphen.
pub(crate) const SPEC_CHAR_SHY: char = '\u{ad}';
/// ASCII escape character.
pub(crate) const SPEC_CHAR_ESCAPE: char = '\x1b';

/// Returns whether `c` is a word character: any character that is not
/// whitespace (the non-breaking space counts as part of a word) and is not one
/// of the specially handled characters.
fn is_word_char(c: char) -> bool {
    (!c.is_whitespace() || c == SPEC_CHAR_NBSP)
        && ![SPEC_CHAR_ZWSP, SPEC_CHAR_SHY, SPEC_CHAR_ESCAPE].contains(&c)
}

/// Returns whether `c` belongs to a whitespace run: whitespace other than
/// `\n`, `\r`, `\t` and the non-breaking space, or a zero-width space.
fn is_space_char(c: char) -> bool {
    c.is_whitespace() && !['\n', '\r', '\t', SPEC_CHAR_NBSP].contains(&c) || c == SPEC_CHAR_ZWSP
}
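// Illustrative examples of the classification above (not from the original
// source): 'a' and '\u{a0}' are word characters; ' ' and '\u{200b}' are space
// characters; '\n', '\r', '\t', '\u{ad}' and '\x1b' are neither, and are
// turned into their own tokens by the iterator below.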
impl<'a> Parser<'a> {
    /// Creates a parser over the given text.
    #[inline]
    #[must_use]
    pub fn parse(text: &'a str) -> Self {
        Self {
            inner: text.chars(),
        }
    }

    /// Returns `true` if there is no text left to parse.
    #[inline]
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.inner.as_str().is_empty()
    }
}
impl<'a> Iterator for Parser<'a> {
    type Item = Token<'a>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        // Remember the unparsed remainder so token slices can be taken from it.
        let string = self.inner.as_str();

        if let Some(c) = self.inner.next() {
            if is_word_char(c) {
                // Consume characters until the first non-word character.
                while let Some(c) = self.inner.next() {
                    if !is_word_char(c) {
                        // Byte offset of the terminating character within `string`.
                        let offset = {
                            let ptr_start = string.as_ptr() as usize;
                            let ptr_cur = self.inner.as_str().as_ptr() as usize;
                            ptr_cur - ptr_start - c.len_utf8()
                        };

                        // Put the terminating character back so the next call
                        // starts at it, and return the word slice before it.
                        self.inner = unsafe {
                            // SAFETY: `offset` lies on a character boundary of `string`.
                            string.get_unchecked(offset..).chars()
                        };
                        return Some(Token::Word(unsafe {
                            // SAFETY: `offset` lies on a character boundary of `string`.
                            string.get_unchecked(0..offset)
                        }));
                    }
                }

                // The input ended inside the word.
                Some(Token::Word(string))
            } else {
                match c {
                    '\n' => Some(Token::NewLine),
                    '\r' => Some(Token::CarriageReturn),
                    '\t' => Some(Token::Tab),
                    // Zero-width space: a break opportunity with nothing to draw.
                    SPEC_CHAR_ZWSP => Some(Token::Break(None)),
                    // Soft hyphen: a break opportunity drawn as '-' when taken.
                    SPEC_CHAR_SHY => Some(Token::Break(Some('-'))),
                    #[cfg(feature = "ansi")]
                    SPEC_CHAR_ESCAPE => ansi_parser::parse_escape(string).map_or(
                        // Not a recognized sequence: report a bare escape and
                        // continue after the escape character.
                        Some(Token::EscapeSequence(AnsiSequence::Escape)),
                        // Recognized sequence: skip past it and return it.
                        |(string, output)| {
                            self.inner = string.chars();
                            Some(Token::EscapeSequence(output))
                        },
                    ),
                    _ => {
                        // A run of whitespace. Count its characters, except
                        // zero-width spaces, which are swallowed uncounted.
                        let mut len = 1;
                        while let Some(c) = self.inner.next() {
                            if is_space_char(c) {
                                if c != SPEC_CHAR_ZWSP {
                                    len += 1;
                                }
                            } else {
                                // Byte offset of the terminating character within `string`.
                                let offset = {
                                    let ptr_start = string.as_ptr() as usize;
                                    let ptr_cur = self.inner.as_str().as_ptr() as usize;
                                    ptr_cur - ptr_start - c.len_utf8()
                                };

                                // Put the terminating character back and return
                                // the whitespace parsed so far.
                                self.inner = unsafe {
                                    // SAFETY: `offset` lies on a character boundary of `string`.
                                    string.get_unchecked(offset..).chars()
                                };
                                return Some(Token::Whitespace(len));
                            }
                        }

                        // The input ended inside the whitespace run.
                        Some(Token::Whitespace(len))
                    }
                }
            }
        } else {
            None
        }
    }
}
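// Usage sketch (illustrative, not part of the original module): the parser is
// a plain `Iterator`, so callers can drive it token by token or collect it.
// For the input "Hello, world!\n" the token stream is:
//
//     let mut tokens = Parser::parse("Hello, world!\n");
//     assert_eq!(tokens.next(), Some(Token::Word("Hello,")));
//     assert_eq!(tokens.next(), Some(Token::Whitespace(1)));
//     assert_eq!(tokens.next(), Some(Token::Word("world!")));
//     assert_eq!(tokens.next(), Some(Token::NewLine));
//     assert_eq!(tokens.next(), None);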
#[cfg(test)]
mod test {
use super::{Parser, Token};
pub fn assert_tokens(text: &str, tokens: std::vec::Vec<Token>) {
assert_eq!(
Parser::parse(text).collect::<std::vec::Vec<Token>>(),
tokens
)
}
#[test]
fn test_parse() {
assert_tokens(
"Lorem ipsum \r dolor sit am\u{00AD}et,\tconse😅ctetur adipiscing\nelit",
vec![
Token::Word("Lorem"),
Token::Whitespace(1),
Token::Word("ipsum"),
Token::Whitespace(1),
Token::CarriageReturn,
Token::Whitespace(1),
Token::Word("dolor"),
Token::Whitespace(1),
Token::Word("sit"),
Token::Whitespace(1),
Token::Word("am"),
Token::Break(Some('-')),
Token::Word("et,"),
Token::Tab,
Token::Word("conse😅ctetur"),
Token::Whitespace(1),
Token::Word("adipiscing"),
Token::NewLine,
Token::Word("elit"),
],
);
}
    #[test]
    fn parse_zwsp() {
        assert_eq!(9, "two\u{200B}words".chars().count());
        assert_tokens(
            "two\u{200B}words",
            vec![Token::Word("two"), Token::Break(None), Token::Word("words")],
        );
        // Inside a whitespace run the zero-width space is consumed but not
        // counted, so " \u{200B} " is reported as two whitespace characters.
        assert_tokens(" \u{200B} ", vec![Token::Whitespace(2)]);
    }
#[test]
fn parse_multibyte_last() {
assert_tokens("test😅", vec![Token::Word("test😅")]);
}
#[test]
fn parse_nbsp_as_word_char() {
assert_eq!(9, "test\u{A0}word".chars().count());
assert_tokens("test\u{A0}word", vec![Token::Word("test\u{A0}word")]);
assert_tokens(
" \u{A0}word",
vec![Token::Whitespace(1), Token::Word("\u{A0}word")],
);
}
#[test]
fn parse_shy_issue_42() {
assert_tokens(
"foo\u{AD}bar",
vec![
Token::Word("foo"),
Token::Break(Some('-')),
Token::Word("bar"),
],
);
}
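    // Illustrative sketch (not from the original test suite): runs of spaces
    // collapse into a single Whitespace token carrying their length, and each
    // newline produces its own NewLine token.
    #[test]
    fn parse_whitespace_runs_sketch() {
        assert_tokens(
            "  hello\n\nworld  ",
            vec![
                Token::Whitespace(2),
                Token::Word("hello"),
                Token::NewLine,
                Token::NewLine,
                Token::Word("world"),
                Token::Whitespace(2),
            ],
        );
    }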
}
#[cfg(all(feature = "ansi", test))]
mod ansi_parser_tests {
use super::{test::assert_tokens, Token};
use ansi_parser::AnsiSequence;
use heapless::Vec;
#[test]
fn escape_char_ignored_if_not_ansi_sequence() {
assert_tokens(
"foo\x1bbar",
vec![
Token::Word("foo"),
Token::EscapeSequence(AnsiSequence::Escape),
Token::Word("bar"),
],
);
assert_tokens(
"foo\x1b[bar",
vec![
Token::Word("foo"),
Token::EscapeSequence(AnsiSequence::Escape),
Token::Word("[bar"),
],
);
        // Two consecutive escape characters produce two escape tokens, since
        // each one is parsed (and fails to start a sequence) on its own.
        assert_tokens(
            "foo\x1b\x1bbar",
            vec![
                Token::Word("foo"),
                Token::EscapeSequence(AnsiSequence::Escape),
                Token::EscapeSequence(AnsiSequence::Escape),
                Token::Word("bar"),
            ],
        );
}
#[test]
fn escape_char_colors() {
assert_tokens(
"foo\x1b[34mbar",
vec![
Token::Word("foo"),
Token::EscapeSequence(AnsiSequence::SetGraphicsMode(
Vec::from_slice(&[34]).unwrap(),
)),
Token::Word("bar"),
],
);
assert_tokens(
"foo\x1b[95mbar",
vec![
Token::Word("foo"),
Token::EscapeSequence(AnsiSequence::SetGraphicsMode(
Vec::from_slice(&[95]).unwrap(),
)),
Token::Word("bar"),
],
);
assert_tokens(
"foo\x1b[48;5;16mbar",
vec![
Token::Word("foo"),
Token::EscapeSequence(AnsiSequence::SetGraphicsMode(
Vec::from_slice(&[48, 5, 16]).unwrap(),
)),
Token::Word("bar"),
],
);
}
}