harper_core/parsers/
mod.rs

1//! Adds support for parsing various programming and markup languages through a unified trait: [`Parser`].
2
3mod collapse_identifiers;
4mod isolate_english;
5mod markdown;
6mod mask;
7mod org_mode;
8mod plain_english;
9
10use blanket::blanket;
11pub use collapse_identifiers::CollapseIdentifiers;
12pub use isolate_english::IsolateEnglish;
13pub use markdown::{Markdown, MarkdownOptions};
14pub use mask::Mask;
15pub use org_mode::OrgMode;
16pub use plain_english::PlainEnglish;
17
18use crate::{LSend, Token, TokenStringExt};
19
/// A parser that converts source text into a sequence of [`Token`]s.
///
/// The `blanket` attributes below derive implementations of this trait for
/// `Box`-wrapped parsers, plus `Arc`-wrapped parsers when the `concurrent`
/// feature is enabled, or `Rc`-wrapped parsers when it is not.
#[cfg_attr(feature = "concurrent", blanket(derive(Box, Arc)))]
#[cfg_attr(not(feature = "concurrent"), blanket(derive(Box, Rc)))]
pub trait Parser: LSend {
    /// Parses `source`, supplied as a slice of characters, and returns the
    /// resulting tokens.
    fn parse(&self, source: &[char]) -> Vec<Token>;
}
25
/// Convenience counterpart to [`Parser`] that accepts string-like input
/// instead of a character slice.
pub trait StrParser {
    /// Parses `source`, which may be anything that can be viewed as a `str`,
    /// and returns the resulting tokens.
    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token>;
}
29
30impl<T> StrParser for T
31where
32    T: Parser,
33{
34    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token> {
35        let source: Vec<_> = source.as_ref().chars().collect();
36        self.parse(&source)
37    }
38}
39
#[cfg(test)]
mod tests {
    use super::{Markdown, OrgMode, Parser, PlainEnglish};
    use crate::{Punctuation, TokenKind};

    /// Runs `parser` over `test_str` and asserts that the kinds of the tokens
    /// it produces match `expected`, in order.
    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind], parser: &impl Parser) {
        let source = test_str.as_ref().chars().collect::<Vec<char>>();

        let mut actual = Vec::new();
        for token in parser.parse(&source) {
            actual.push(token.kind);
        }

        assert_eq!(&actual, expected);
    }

    /// Checks `test_str` against `expected` using the [`PlainEnglish`] parser.
    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = PlainEnglish;
        assert_tokens_eq(test_str, expected, &parser);
    }

    /// Checks `test_str` against `expected` using a default-configured
    /// [`Markdown`] parser.
    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = Markdown::default();
        assert_tokens_eq(test_str, expected, &parser);
    }

    /// Checks `test_str` against `expected` using the [`OrgMode`] parser.
    fn assert_tokens_eq_org(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = OrgMode;
        assert_tokens_eq(test_str, expected, &parser);
    }

    #[test]
    fn single_letter() {
        let expected = [TokenKind::blank_word()];
        assert_tokens_eq_plain("a", &expected);
    }

    #[test]
    fn sentence() {
        let expected = [
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Punctuation(Punctuation::Comma),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
        ];

        assert_tokens_eq_plain("hello world, my friend", &expected);
    }

    #[test]
    fn sentence_md() {
        let expected = [
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Punctuation(Punctuation::Comma),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
        ];

        assert_tokens_eq_md("__hello__ world, [my]() friend", &expected);
    }

    #[test]
    fn inserts_newlines() {
        let expected = [
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
            TokenKind::Punctuation(Punctuation::Comma),
            TokenKind::ParagraphBreak,
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
        ];

        assert_tokens_eq_md("__hello__ world,\n\n[my]() friend", &expected);
    }

    /// Words containing non-English letters must still come out of the
    /// English parser as a single word token.
    #[test]
    fn parses_non_english() {
        let single_word = [TokenKind::blank_word()];

        assert_tokens_eq_plain("Løvetann", &single_word);
        assert_tokens_eq_plain("Naïve", &single_word);
    }

    #[test]
    fn org_mode_basic() {
        let expected = [
            TokenKind::blank_word(),
            TokenKind::Space(1),
            TokenKind::blank_word(),
        ];

        assert_tokens_eq_org("hello world", &expected);
    }
}