1use crate::{kind::OrgModeSyntaxKind, language::OrgModeLanguage};
2use oak_core::{
3 TextEdit,
4 errors::OakError,
5 lexer::{CommentConfig, LexOutput, Lexer, LexerCache, LexerState, StringConfig, WhitespaceConfig},
6 source::Source,
7};
8use std::sync::LazyLock;
9
10type State<'a, S> = LexerState<'a, S, OrgModeLanguage>;
11
12static ORG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: false });
13static ORG_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
14static ORG_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
15
16#[derive(Clone, Debug)]
17pub struct OrgModeLexer<'config> {
18 _config: &'config OrgModeLanguage,
19}
20
21impl<'config> OrgModeLexer<'config> {
22 pub fn new(config: &'config OrgModeLanguage) -> Self {
23 Self { _config: config }
24 }
25
26 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
27 ORG_WHITESPACE.scan(state, OrgModeSyntaxKind::Whitespace)
28 }
29
30 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
31 ORG_COMMENT.scan(state, OrgModeSyntaxKind::Comment, OrgModeSyntaxKind::Comment)
32 }
33
34 fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
35 ORG_STRING.scan(state, OrgModeSyntaxKind::Text)
36 }
37
38 fn lex_text_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
39 if let Some(ch) = state.peek() {
40 if ch.is_alphabetic() {
41 let start_pos = state.get_position();
42 while let Some(ch) = state.peek() {
44 if ch.is_alphanumeric() {
45 state.advance(ch.len_utf8());
46 }
47 else {
48 break;
49 }
50 }
51 let end_pos = state.get_position();
52 let text = state.source().get_text_in((start_pos..end_pos).into());
53 let kind = if self._config.todo_keywords.iter().any(|k| k == text.as_ref()) {
54 OrgModeSyntaxKind::Todo
55 } else if self._config.done_keywords.iter().any(|k| k == text.as_ref()) {
56 OrgModeSyntaxKind::Done
57 } else {
58 OrgModeSyntaxKind::Text
59 };
60 state.add_token(kind, start_pos, end_pos);
61 return true;
62 }
63 }
64 false
65 }
66
67 fn lex_priority<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
68 if state.starts_with("[#") {
69 let start_pos = state.get_position();
70 state.advance(2);
71
72 if let Some(ch) = state.peek() {
73 if ch.is_alphabetic() {
74 state.advance(ch.len_utf8());
75 if let Some(']') = state.peek() {
76 state.advance(1);
77 state.add_token(OrgModeSyntaxKind::Priority, start_pos, state.get_position());
78 return true;
79 }
80 }
81 }
82
83 state.set_position(start_pos);
84 false
85 }
86 else {
87 false
88 }
89 }
90
91 fn lex_number_or_date<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
92 if let Some(ch) = state.peek() {
93 if ch.is_ascii_digit() {
94 let start_pos = state.get_position();
95 let mut has_dash = false;
96
97 while let Some(ch) = state.peek() {
98 if ch.is_ascii_digit() {
99 state.advance(1);
100 }
101 else if ch == '-' {
102 state.advance(1);
103 has_dash = true;
104 }
105 else {
106 break;
107 }
108 }
109
110 let kind = if has_dash { OrgModeSyntaxKind::Date } else { OrgModeSyntaxKind::Number };
111
112 state.add_token(kind, start_pos, state.get_position());
113 return true;
114 }
115 }
116 false
117 }
118
119 fn lex_symbols<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120 if let Some(ch) = state.peek() {
121 let start_pos = state.get_position();
122 state.advance(ch.len_utf8());
123
124 let kind = match ch {
125 '+' => OrgModeSyntaxKind::Plus,
126 '-' => OrgModeSyntaxKind::Minus,
127 '*' => OrgModeSyntaxKind::Star,
128 '#' => OrgModeSyntaxKind::Hash,
129 '|' => OrgModeSyntaxKind::Pipe,
130 ':' => OrgModeSyntaxKind::Colon,
131 '[' => OrgModeSyntaxKind::LeftBracket,
132 ']' => OrgModeSyntaxKind::RightBracket,
133 '(' => OrgModeSyntaxKind::LeftParen,
134 ')' => OrgModeSyntaxKind::RightParen,
135 '{' => OrgModeSyntaxKind::LeftBrace,
136 '}' => OrgModeSyntaxKind::RightBrace,
137 '<' => OrgModeSyntaxKind::LessThan,
138 '>' => OrgModeSyntaxKind::GreaterThan,
139 '=' => OrgModeSyntaxKind::Equal,
140 '_' => OrgModeSyntaxKind::Underscore,
141 '~' => OrgModeSyntaxKind::Tilde,
142 '/' => OrgModeSyntaxKind::Slash,
143 '\\' => OrgModeSyntaxKind::Backslash,
144 '\n' => OrgModeSyntaxKind::Newline,
145 _ => {
146 state.add_token(OrgModeSyntaxKind::Text, start_pos, state.get_position());
148 return true;
149 }
150 };
151
152 state.add_token(kind, start_pos, state.get_position());
153 true
154 }
155 else {
156 false
157 }
158 }
159
160 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
161 while state.not_at_end() {
162 let safe_point = state.get_position();
163
164 if let Some('\n') = state.peek() {
166 let start_pos = state.get_position();
167 state.advance(1);
168 state.add_token(OrgModeSyntaxKind::Newline, start_pos, state.get_position());
169 continue;
170 }
171
172 if self.skip_whitespace(state) {
174 continue;
175 }
176
177 if self.skip_comment(state) {
179 continue;
180 }
181
182 if self.lex_string(state) {
184 continue;
185 }
186
187 if self.lex_priority(state) {
189 continue;
190 }
191
192 if self.lex_number_or_date(state) {
194 continue;
195 }
196
197 if self.lex_text_or_keyword(state) {
199 continue;
200 }
201
202 if self.lex_symbols(state) {
204 continue;
205 }
206
207 let start_pos = state.get_position();
209 if let Some(ch) = state.peek() {
210 state.advance(ch.len_utf8());
211 state.add_token(OrgModeSyntaxKind::Error, start_pos, state.get_position());
212 }
213 else {
214 break;
215 }
216
217 state.advance_if_dead_lock(safe_point);
218 }
219 Ok(())
220 }
221}
222
223impl<'config> Lexer<OrgModeLanguage> for OrgModeLexer<'config> {
224 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<OrgModeLanguage>) -> LexOutput<OrgModeLanguage> {
225 let mut state = State::new(source);
226 let result = self.run(&mut state);
227 if result.is_ok() {
228 state.add_eof();
229 }
230 state.finish_with_cache(result, cache)
231 }
232}
233