harper_core/parsers/mod.rs

mod collapse_identifiers;
mod isolate_english;
mod markdown;
mod mask;
mod org_mode;
mod plain_english;

use blanket::blanket;
pub use collapse_identifiers::CollapseIdentifiers;
pub use isolate_english::IsolateEnglish;
pub use markdown::{Markdown, MarkdownOptions};
pub use mask::Mask;
pub use org_mode::OrgMode;
pub use plain_english::PlainEnglish;

use crate::{LSend, Token, TokenStringExt};
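/// Turns a source document, given as a slice of `char`s, into a flat
/// stream of [`Token`]s. The `blanket` attributes below additionally
/// derive the trait for `Box`ed parsers, plus `Arc` or `Rc` wrappers
/// depending on the `concurrent` feature.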
#[cfg_attr(feature = "concurrent", blanket(derive(Box, Arc)))]
#[cfg_attr(not(feature = "concurrent"), blanket(derive(Box, Rc)))]
pub trait Parser: LSend {
    fn parse(&self, source: &[char]) -> Vec<Token>;
}
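/// Convenience extension so a [`Parser`] can consume `&str` (or any
/// `AsRef<str>`) directly; the blanket impl below collects the string
/// into `char`s and delegates to [`Parser::parse`].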
pub trait StrParser {
    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token>;
}

impl<T> StrParser for T
where
    T: Parser,
{
    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token> {
        let source: Vec<_> = source.as_ref().chars().collect();
        self.parse(&source)
    }
}
#[cfg(test)]
mod tests {
    use super::{Markdown, OrgMode, Parser, PlainEnglish};
    use crate::Punctuation;
    use crate::TokenKind::{self, *};
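    /// Parses `test_str` with `parser` and asserts that the resulting
    /// sequence of token kinds matches `expected` exactly.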
    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind], parser: &impl Parser) {
        let chars: Vec<_> = test_str.as_ref().chars().collect();
        let tokens = parser.parse(&chars);
        let kinds: Vec<_> = tokens.into_iter().map(|v| v.kind).collect();

        assert_eq!(&kinds, expected)
    }

    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        assert_tokens_eq(test_str, expected, &PlainEnglish);
    }

    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        assert_tokens_eq(test_str, expected, &Markdown::default())
    }

    fn assert_tokens_eq_org(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        assert_tokens_eq(test_str, expected, &OrgMode)
    }

    #[test]
    fn single_letter() {
        assert_tokens_eq_plain("a", &[TokenKind::blank_word()])
    }

    #[test]
    fn sentence() {
        assert_tokens_eq_plain(
            "hello world, my friend",
            &[
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
                Punctuation(Punctuation::Comma),
                Space(1),
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
            ],
        )
    }

    #[test]
    fn sentence_md() {
        assert_tokens_eq_md(
            "__hello__ world, [my]() friend",
            &[
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
                Punctuation(Punctuation::Comma),
                Space(1),
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
            ],
        );
    }
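    /// A blank line (two consecutive newlines) in the Markdown source
    /// should produce a single `ParagraphBreak` token, not ordinary
    /// spaces.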
    #[test]
    fn inserts_newlines() {
        assert_tokens_eq_md(
            "__hello__ world,\n\n[my]() friend",
            &[
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
                Punctuation(Punctuation::Comma),
                ParagraphBreak,
                TokenKind::blank_word(),
                Space(1),
                TokenKind::blank_word(),
            ],
        );
    }

    /// Make sure that the English parser correctly identifies non-English
    /// characters as part of the same word.
    #[test]
    fn parses_non_english() {
        assert_tokens_eq_plain("Løvetann", &[TokenKind::blank_word()]);
        assert_tokens_eq_plain("Naïve", &[TokenKind::blank_word()]);
    }

    #[test]
    fn org_mode_basic() {
        assert_tokens_eq_org(
            "hello world",
            &[TokenKind::blank_word(), Space(1), TokenKind::blank_word()],
        );
    }
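
    /// `StrParser` is only a thin convenience layer, so parsing a
    /// `&str` should yield exactly the same token kinds as parsing its
    /// `char`s directly. (Illustrative sketch added here; it relies
    /// only on the public API defined above.)
    #[test]
    fn str_parser_matches_char_parser() {
        use super::StrParser;

        let source = "hello world, my friend";
        let chars: Vec<_> = source.chars().collect();

        let from_str: Vec<_> = PlainEnglish
            .parse_str(source)
            .into_iter()
            .map(|t| t.kind)
            .collect();
        let from_chars: Vec<_> = PlainEnglish
            .parse(&chars)
            .into_iter()
            .map(|t| t.kind)
            .collect();

        assert_eq!(from_str, from_chars);
    }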
}