harper_core/parsers/
mod.rs

1//! Adds support for parsing various programming and markup languages through a unified trait: [`Parser`].
2
3mod collapse_identifiers;
4mod isolate_english;
5mod markdown;
6mod mask;
7mod oops_all_headings;
8mod org_mode;
9mod plain_english;
10
11use blanket::blanket;
12pub use collapse_identifiers::CollapseIdentifiers;
13pub use isolate_english::IsolateEnglish;
14pub use markdown::{Markdown, MarkdownOptions};
15pub use mask::Mask;
16pub use oops_all_headings::OopsAllHeadings;
17pub use org_mode::OrgMode;
18pub use plain_english::PlainEnglish;
19
20use crate::{LSend, Token, TokenStringExt};
21
22#[cfg_attr(feature = "concurrent", blanket(derive(Box, Arc)))]
23#[cfg_attr(not(feature = "concurrent"), blanket(derive(Box, Rc)))]
24pub trait Parser: LSend {
25    fn parse(&self, source: &[char]) -> Vec<Token>;
26}
27
28pub trait StrParser {
29    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token>;
30}
31
32impl<T> StrParser for T
33where
34    T: Parser,
35{
36    fn parse_str(&self, source: impl AsRef<str>) -> Vec<Token> {
37        let source: Vec<_> = source.as_ref().chars().collect();
38        self.parse(&source)
39    }
40}
41
#[cfg(test)]
mod tests {
    use super::{Markdown, OrgMode, Parser, PlainEnglish};
    use crate::Punctuation;
    use crate::TokenKind::{self, *};

    /// Shorthand for the token kind expected wherever a word appears.
    fn word() -> TokenKind {
        TokenKind::blank_word()
    }

    /// Shorthand for the token kind expected wherever a comma appears.
    fn comma() -> TokenKind {
        Punctuation(Punctuation::Comma)
    }

    /// Parses `test_str` with `parser` and asserts that the kinds of the
    /// resulting tokens match `expected`, in order.
    fn assert_tokens_eq(test_str: impl AsRef<str>, expected: &[TokenKind], parser: &impl Parser) {
        let source = test_str.as_ref().chars().collect::<Vec<char>>();
        let kinds: Vec<_> = parser
            .parse(&source)
            .into_iter()
            .map(|token| token.kind)
            .collect();

        assert_eq!(&kinds, expected)
    }

    /// Runs [`assert_tokens_eq`] with the [`PlainEnglish`] parser.
    fn assert_tokens_eq_plain(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = PlainEnglish;
        assert_tokens_eq(test_str, expected, &parser);
    }

    /// Runs [`assert_tokens_eq`] with a default-configured [`Markdown`] parser.
    fn assert_tokens_eq_md(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = Markdown::default();
        assert_tokens_eq(test_str, expected, &parser)
    }

    /// Runs [`assert_tokens_eq`] with the [`OrgMode`] parser.
    fn assert_tokens_eq_org(test_str: impl AsRef<str>, expected: &[TokenKind]) {
        let parser = OrgMode;
        assert_tokens_eq(test_str, expected, &parser)
    }

    /// A lone letter is a single word token.
    #[test]
    fn single_letter() {
        assert_tokens_eq_plain("a", &[word()])
    }

    /// Words, single spaces and a comma come out as separate tokens.
    #[test]
    fn sentence() {
        let expected = [
            word(),
            Space(1),
            word(),
            comma(),
            Space(1),
            word(),
            Space(1),
            word(),
        ];

        assert_tokens_eq_plain("hello world, my friend", &expected)
    }

    /// Emphasis and link markup do not show up in the token stream.
    #[test]
    fn sentence_md() {
        let expected = [
            word(),
            Space(1),
            word(),
            comma(),
            Space(1),
            word(),
            Space(1),
            word(),
        ];

        assert_tokens_eq_md("__hello__ world, [my]() friend", &expected);
    }

    /// A blank line between Markdown paragraphs yields a `ParagraphBreak`.
    #[test]
    fn inserts_newlines() {
        let expected = [
            word(),
            Space(1),
            word(),
            comma(),
            ParagraphBreak,
            word(),
            Space(1),
            word(),
        ];

        assert_tokens_eq_md("__hello__ world,\n\n[my]() friend", &expected);
    }

    /// Make sure that the English parser keeps non-English characters inside
    /// the word they belong to instead of splitting the word apart.
    #[test]
    fn parses_non_english() {
        for sample in ["Løvetann", "Naïve"] {
            assert_tokens_eq_plain(sample, &[word()]);
        }
    }

    /// Plain text run through the Org-mode parser tokenizes into words and
    /// spaces.
    #[test]
    fn org_mode_basic() {
        assert_tokens_eq_org("hello world", &[word(), Space(1), word()]);
    }
}