harper_core/parsers/
mod.rs

1mod collapse_identifiers;
2mod isolate_english;
3mod markdown;
4mod mask;
5mod plain_english;
6
7use blanket::blanket;
8pub use collapse_identifiers::CollapseIdentifiers;
9pub use isolate_english::IsolateEnglish;
10pub use markdown::{Markdown, MarkdownOptions};
11pub use mask::Mask;
12pub use plain_english::PlainEnglish;
13
14use crate::{Token, TokenStringExt};
15
16#[cfg(not(feature = "concurrent"))]
17#[blanket(derive(Box, Rc))]
18pub trait Parser {
19    fn parse(&self, source: &[char]) -> Vec<Token>;
20}
21
/// Something that can turn raw source text into a list of [`Token`]s.
///
/// This is the definition used when the `concurrent` feature is enabled:
/// implementors must also be `Send + Sync`. The `blanket` attribute
/// additionally requests implementations of the trait for `Box`- and
/// `Arc`-wrapped parsers.
#[cfg(feature = "concurrent")]
#[blanket(derive(Box, Arc))]
pub trait Parser: Send + Sync {
    /// Parses `source`, given as a slice of `char`s, and returns the tokens
    /// that were produced.
    fn parse(&self, source: &[char]) -> Vec<Token>;
}
27
28pub trait StrParser {
29    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token>;
30}
31
32impl<T> StrParser for T
33where
34    T: Parser,
35{
36    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token> {
37        let source: Vec<_> = source.as_ref().chars().collect();
38        self.parse(&source)
39    }
40}
41
#[cfg(test)]
mod tests {
    use super::{Markdown, Parser, PlainEnglish};
    use crate::{Punctuation, TokenKind};

    /// Runs `parser` over `test_str` and asserts that the kinds of the
    /// produced tokens match `expected`, in order.
    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind], parser: &impl Parser) {
        let source: Vec<char> = test_str.as_ref().chars().collect();
        let actual: Vec<TokenKind> = parser
            .parse(&source)
            .into_iter()
            .map(|token| token.kind)
            .collect();

        assert_eq!(&actual, expected)
    }

    /// Same as [`assert_tokens_eq`], using the [`PlainEnglish`] parser.
    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = PlainEnglish;
        assert_tokens_eq(test_str, expected, &parser);
    }

    /// Same as [`assert_tokens_eq`], using a default-configured [`Markdown`]
    /// parser.
    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = Markdown::default();
        assert_tokens_eq(test_str, expected, &parser)
    }

    #[test]
    fn single_letter() {
        let expected = [TokenKind::blank_word()];
        assert_tokens_eq_plain("a", &expected)
    }

    #[test]
    fn sentence() {
        let expected = [
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Punctuation(Punctuation::Comma),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
        ];

        assert_tokens_eq_plain("hello world, my friend", &expected)
    }

    /// The Markdown markup (emphasis and link syntax) is expected to yield
    /// the same token kinds as the equivalent plain sentence.
    #[test]
    fn sentence_md() {
        let expected = [
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Punctuation(Punctuation::Comma),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
        ];

        assert_tokens_eq_md("__hello__ world, [my]() friend", &expected);
    }

    /// A blank line between Markdown paragraphs is expected to show up as a
    /// `ParagraphBreak` token.
    #[test]
    fn inserts_newlines() {
        let expected = [
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Punctuation(Punctuation::Comma),
            TokenKind::ParagraphBreak,
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
        ];

        assert_tokens_eq_md("__hello__ world,\n\n[my]() friend", &expected);
    }

    /// Make sure that the English parser keeps non-English characters
    /// inside the word they belong to instead of splitting on them.
    #[test]
    fn parses_non_english() {
        for word in ["Løvetann", "Naïve"] {
            assert_tokens_eq_plain(word, &[TokenKind::blank_word()]);
        }
    }
}