// oak_org_mode/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::OrgModeLanguage, lexer::token_type::OrgModeTokenType};
5use oak_core::{
6    TextEdit,
7    errors::OakError,
8    lexer::{CommentConfig, LexOutput, Lexer, LexerCache, LexerState, StringConfig, WhitespaceConfig},
9    source::Source,
10};
11use std::sync::LazyLock;
12
13pub(crate) type State<'a, S> = LexerState<'a, S, OrgModeLanguage>;
14
15static ORG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: false });
16static ORG_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
17static ORG_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
18
19/// Org-mode lexer.
20#[derive(Clone, Debug)]
21pub struct OrgModeLexer<'config> {
22    config: &'config OrgModeLanguage,
23}
24
25impl<'config> OrgModeLexer<'config> {
26    /// Creates a new `OrgModeLexer`.
27    /// Creates a new OrgModeLexer with the given language configuration.
28    pub fn new(config: &'config OrgModeLanguage) -> Self {
29        Self { config }
30    }
31
32    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
33        ORG_WHITESPACE.scan(state, OrgModeTokenType::Whitespace)
34    }
35
36    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
37        ORG_COMMENT.scan(state, OrgModeTokenType::Comment, OrgModeTokenType::Comment)
38    }
39
40    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
41        ORG_STRING.scan(state, OrgModeTokenType::Text)
42    }
43
44    fn lex_text_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
45        if let Some(ch) = state.peek() {
46            if ch.is_alphabetic() {
47                let start_pos = state.get_position();
48                // Read letters and digits
49                while let Some(ch) = state.peek() {
50                    if ch.is_alphanumeric() {
51                        state.advance(ch.len_utf8());
52                    }
53                    else {
54                        break;
55                    }
56                }
57                let end_pos = state.get_position();
58                let text = state.source().get_text_in((start_pos..end_pos).into());
59                let kind = if self.config.todo_keywords.iter().any(|k| k == text.as_ref()) {
60                    OrgModeTokenType::Todo
61                }
62                else if self.config.done_keywords.iter().any(|k| k == text.as_ref()) {
63                    OrgModeTokenType::Done
64                }
65                else {
66                    OrgModeTokenType::Text
67                };
68                state.add_token(kind, start_pos, end_pos);
69                return true;
70            }
71        }
72        false
73    }
74
75    fn lex_priority<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
76        if state.starts_with("[#") {
77            let start_pos = state.get_position();
78            state.advance(2);
79
80            if let Some(ch) = state.peek() {
81                if ch.is_alphabetic() {
82                    state.advance(ch.len_utf8());
83                    if let Some(']') = state.peek() {
84                        state.advance(1);
85                        state.add_token(OrgModeTokenType::Priority, start_pos, state.get_position());
86                        return true;
87                    }
88                }
89            }
90
91            state.set_position(start_pos);
92            false
93        }
94        else {
95            false
96        }
97    }
98
99    fn lex_number_or_date<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
100        if let Some(ch) = state.peek() {
101            if ch.is_ascii_digit() {
102                let start_pos = state.get_position();
103                let mut has_dash = false;
104
105                while let Some(ch) = state.peek() {
106                    if ch.is_ascii_digit() {
107                        state.advance(1);
108                    }
109                    else if ch == '-' {
110                        state.advance(1);
111                        has_dash = true;
112                    }
113                    else {
114                        break;
115                    }
116                }
117
118                let kind = if has_dash { OrgModeTokenType::Date } else { OrgModeTokenType::Number };
119
120                state.add_token(kind, start_pos, state.get_position());
121                return true;
122            }
123        }
124        false
125    }
126
127    fn lex_symbols<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
128        if let Some(ch) = state.peek() {
129            let start_pos = state.get_position();
130            state.advance(ch.len_utf8());
131
132            let kind = match ch {
133                '+' => OrgModeTokenType::Plus,
134                '-' => OrgModeTokenType::Minus,
135                '*' => OrgModeTokenType::Star,
136                '#' => OrgModeTokenType::Hash,
137                '|' => OrgModeTokenType::Pipe,
138                ':' => OrgModeTokenType::Colon,
139                '[' => OrgModeTokenType::LeftBracket,
140                ']' => OrgModeTokenType::RightBracket,
141                '(' => OrgModeTokenType::LeftParen,
142                ')' => OrgModeTokenType::RightParen,
143                '{' => OrgModeTokenType::LeftBrace,
144                '}' => OrgModeTokenType::RightBrace,
145                '<' => OrgModeTokenType::LessThan,
146                '>' => OrgModeTokenType::GreaterThan,
147                '=' => OrgModeTokenType::Equal,
148                '_' => OrgModeTokenType::Underscore,
149                '~' => OrgModeTokenType::Tilde,
150                '/' => OrgModeTokenType::Slash,
151                '\\' => OrgModeTokenType::Backslash,
152                '\n' => OrgModeTokenType::Newline,
153                _ => {
154                    // Unknown character, handle as text
155                    state.add_token(OrgModeTokenType::Text, start_pos, state.get_position());
156                    return true;
157                }
158            };
159
160            state.add_token(kind, start_pos, state.get_position());
161            true
162        }
163        else {
164            false
165        }
166    }
167
168    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
169        while state.not_at_end() {
170            let safe_point = state.get_position();
171
172            // Prioritize newline characters
173            if let Some('\n') = state.peek() {
174                let start_pos = state.get_position();
175                state.advance(1);
176                state.add_token(OrgModeTokenType::Newline, start_pos, state.get_position());
177                continue;
178            }
179
180            // Skip whitespace characters
181            if self.skip_whitespace(state) {
182                continue;
183            }
184
185            // Handle comments
186            if self.skip_comment(state) {
187                continue;
188            }
189
190            // Handle strings
191            if self.lex_string(state) {
192                continue;
193            }
194
195            // Handle priority
196            if self.lex_priority(state) {
197                continue;
198            }
199
200            // Handle numbers or dates
201            if self.lex_number_or_date(state) {
202                continue;
203            }
204
205            // Handle text or keywords
206            if self.lex_text_or_keyword(state) {
207                continue;
208            }
209
210            // Handle symbols
211            if self.lex_symbols(state) {
212                continue;
213            }
214
215            // If no pattern matched, create error token
216            let start_pos = state.get_position();
217            if let Some(ch) = state.peek() {
218                state.advance(ch.len_utf8());
219                state.add_token(OrgModeTokenType::Error, start_pos, state.get_position());
220            }
221            else {
222                break;
223            }
224
225            state.advance_if_dead_lock(safe_point);
226        }
227        Ok(())
228    }
229}
230
231impl<'config> Lexer<OrgModeLanguage> for OrgModeLexer<'config> {
232    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<OrgModeLanguage>) -> LexOutput<OrgModeLanguage> {
233        let mut state = State::new(source);
234        let result = self.run(&mut state);
235        if result.is_ok() {
236            state.add_eof()
237        }
238        state.finish_with_cache(result, cache)
239    }
240}