harper_core/parsers/mod.rs

mod collapse_identifiers;
mod isolate_english;
mod markdown;
mod mask;
mod plain_english;

use blanket::blanket;
pub use collapse_identifiers::CollapseIdentifiers;
pub use isolate_english::IsolateEnglish;
pub use markdown::{Markdown, MarkdownOptions};
pub use mask::Mask;
pub use plain_english::PlainEnglish;

use crate::{LSend, Token, TokenStringExt};

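/// An abstraction over the parsers in this module: anything that can turn a
/// slice of source characters into a list of [`Token`]s.
///
/// The `blanket` attributes below also derive forwarding implementations for
/// smart-pointer wrappers (`Box`, plus `Arc` or `Rc` depending on the
/// `concurrent` feature), so a wrapped parser can be used anywhere a
/// `Parser` is expected.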
#[cfg_attr(feature = "concurrent", blanket(derive(Box, Arc)))]
#[cfg_attr(not(feature = "concurrent"), blanket(derive(Box, Rc)))]
pub trait Parser: LSend {
    fn parse(&self, source: &[char]) -> Vec<Token>;
}

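/// A convenience extension of [`Parser`] that accepts `&str`-like input
/// directly instead of a pre-collected `&[char]` slice.
///
/// A minimal usage sketch, assuming the crate is consumed as `harper_core`
/// and this module is public:
///
/// ```
/// use harper_core::parsers::{PlainEnglish, StrParser};
///
/// let tokens = PlainEnglish.parse_str("hello world");
/// assert!(!tokens.is_empty());
/// ```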
pub trait StrParser {
    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token>;
}

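// Every `Parser` is automatically a `StrParser`: the input string is collected
// into a `Vec<char>` and forwarded to `Parser::parse`.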
impl<T> StrParser for T
where
    T: Parser,
{
    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token> {
        let source: Vec<_> = source.as_ref().chars().collect();
        self.parse(&source)
    }
}

#[cfg(test)]
mod tests {
    use super::{Markdown, Parser, PlainEnglish};
    use crate::Punctuation;
    use crate::TokenKind::{self, *};

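    /// Parse `test_str` with the given parser and assert that the resulting
    /// token kinds exactly match `expected`.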
    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind], parser: &impl Parser) {
        let chars: Vec<_> = test_str.as_ref().chars().collect();
        let tokens = parser.parse(&chars);
        let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();

        assert_eq!(&kinds, expected)
    }

    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        assert_tokens_eq(test_str, expected, &PlainEnglish);
    }

    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        assert_tokens_eq(test_str, expected, &Markdown::default())
    }

    #[test]
    fn single_letter() {
        assert_tokens_eq_plain("a", &[TokenKind::blank_word()])
    }

    #[test]
    fn sentence() {
        assert_tokens_eq_plain(
            "hello world, my friend",
            &[
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
                Punctuation(Punctuation::Comma),
                Space(1),
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
            ],
        )
    }

    #[test]
    fn sentence_md() {
        assert_tokens_eq_md(
            "__hello__ world, [my]() friend",
            &[
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
                Punctuation(Punctuation::Comma),
                Space(1),
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
            ],
        );
    }

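    /// A blank line in the Markdown source should surface as a single
    /// `ParagraphBreak` token rather than ordinary whitespace.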
    #[test]
    fn inserts_newlines() {
        assert_tokens_eq_md(
            "__hello__ world,\n\n[my]() friend",
            &[
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
                Punctuation(Punctuation::Comma),
                ParagraphBreak,
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
            ],
        );
    }

    /// Make sure that the English parser correctly identifies non-English
    /// characters as part of the same word.
    #[test]
    fn parses_non_english() {
        assert_tokens_eq_plain("Løvetann", &[TokenKind::blank_word()]);
        assert_tokens_eq_plain("Naïve", &[TokenKind::blank_word()]);
    }
}