Skip to main content

oak_org_mode/lexer/
mod.rs

1use crate::{kind::OrgModeSyntaxKind, language::OrgModeLanguage};
2use oak_core::{
3    TextEdit,
4    errors::OakError,
5    lexer::{CommentConfig, LexOutput, Lexer, LexerCache, LexerState, StringConfig, WhitespaceConfig},
6    source::Source,
7};
8use std::sync::LazyLock;
9
10type State<'a, S> = LexerState<'a, S, OrgModeLanguage>;
11
12static ORG_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: false });
13static ORG_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false });
14static ORG_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
15
16#[derive(Clone, Debug)]
17pub struct OrgModeLexer<'config> {
18    _config: &'config OrgModeLanguage,
19}
20
21impl<'config> OrgModeLexer<'config> {
22    pub fn new(config: &'config OrgModeLanguage) -> Self {
23        Self { _config: config }
24    }
25
26    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
27        ORG_WHITESPACE.scan(state, OrgModeSyntaxKind::Whitespace)
28    }
29
30    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
31        ORG_COMMENT.scan(state, OrgModeSyntaxKind::Comment, OrgModeSyntaxKind::Comment)
32    }
33
34    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
35        ORG_STRING.scan(state, OrgModeSyntaxKind::Text)
36    }
37
38    fn lex_text_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
39        if let Some(ch) = state.peek() {
40            if ch.is_alphabetic() {
41                let start_pos = state.get_position();
42                // 读取字母和数字
43                while let Some(ch) = state.peek() {
44                    if ch.is_alphanumeric() {
45                        state.advance(ch.len_utf8());
46                    }
47                    else {
48                        break;
49                    }
50                }
51                let end_pos = state.get_position();
52                let text = state.source().get_text_in((start_pos..end_pos).into());
53                let kind = if self._config.todo_keywords.iter().any(|k| k == text.as_ref()) {
54                    OrgModeSyntaxKind::Todo
55                } else if self._config.done_keywords.iter().any(|k| k == text.as_ref()) {
56                    OrgModeSyntaxKind::Done
57                } else {
58                    OrgModeSyntaxKind::Text
59                };
60                state.add_token(kind, start_pos, end_pos);
61                return true;
62            }
63        }
64        false
65    }
66
67    fn lex_priority<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
68        if state.starts_with("[#") {
69            let start_pos = state.get_position();
70            state.advance(2);
71
72            if let Some(ch) = state.peek() {
73                if ch.is_alphabetic() {
74                    state.advance(ch.len_utf8());
75                    if let Some(']') = state.peek() {
76                        state.advance(1);
77                        state.add_token(OrgModeSyntaxKind::Priority, start_pos, state.get_position());
78                        return true;
79                    }
80                }
81            }
82
83            state.set_position(start_pos);
84            false
85        }
86        else {
87            false
88        }
89    }
90
91    fn lex_number_or_date<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
92        if let Some(ch) = state.peek() {
93            if ch.is_ascii_digit() {
94                let start_pos = state.get_position();
95                let mut has_dash = false;
96
97                while let Some(ch) = state.peek() {
98                    if ch.is_ascii_digit() {
99                        state.advance(1);
100                    }
101                    else if ch == '-' {
102                        state.advance(1);
103                        has_dash = true;
104                    }
105                    else {
106                        break;
107                    }
108                }
109
110                let kind = if has_dash { OrgModeSyntaxKind::Date } else { OrgModeSyntaxKind::Number };
111
112                state.add_token(kind, start_pos, state.get_position());
113                return true;
114            }
115        }
116        false
117    }
118
119    fn lex_symbols<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120        if let Some(ch) = state.peek() {
121            let start_pos = state.get_position();
122            state.advance(ch.len_utf8());
123
124            let kind = match ch {
125                '+' => OrgModeSyntaxKind::Plus,
126                '-' => OrgModeSyntaxKind::Minus,
127                '*' => OrgModeSyntaxKind::Star,
128                '#' => OrgModeSyntaxKind::Hash,
129                '|' => OrgModeSyntaxKind::Pipe,
130                ':' => OrgModeSyntaxKind::Colon,
131                '[' => OrgModeSyntaxKind::LeftBracket,
132                ']' => OrgModeSyntaxKind::RightBracket,
133                '(' => OrgModeSyntaxKind::LeftParen,
134                ')' => OrgModeSyntaxKind::RightParen,
135                '{' => OrgModeSyntaxKind::LeftBrace,
136                '}' => OrgModeSyntaxKind::RightBrace,
137                '<' => OrgModeSyntaxKind::LessThan,
138                '>' => OrgModeSyntaxKind::GreaterThan,
139                '=' => OrgModeSyntaxKind::Equal,
140                '_' => OrgModeSyntaxKind::Underscore,
141                '~' => OrgModeSyntaxKind::Tilde,
142                '/' => OrgModeSyntaxKind::Slash,
143                '\\' => OrgModeSyntaxKind::Backslash,
144                '\n' => OrgModeSyntaxKind::Newline,
145                _ => {
146                    // 未知字符,作为文本处理
147                    state.add_token(OrgModeSyntaxKind::Text, start_pos, state.get_position());
148                    return true;
149                }
150            };
151
152            state.add_token(kind, start_pos, state.get_position());
153            true
154        }
155        else {
156            false
157        }
158    }
159
160    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
161        while state.not_at_end() {
162            let safe_point = state.get_position();
163
164            // 优先处理换行符
165            if let Some('\n') = state.peek() {
166                let start_pos = state.get_position();
167                state.advance(1);
168                state.add_token(OrgModeSyntaxKind::Newline, start_pos, state.get_position());
169                continue;
170            }
171
172            // 跳过空白字符
173            if self.skip_whitespace(state) {
174                continue;
175            }
176
177            // 处理注释
178            if self.skip_comment(state) {
179                continue;
180            }
181
182            // 处理字符串
183            if self.lex_string(state) {
184                continue;
185            }
186
187            // 处理优先级
188            if self.lex_priority(state) {
189                continue;
190            }
191
192            // 处理数字或日期
193            if self.lex_number_or_date(state) {
194                continue;
195            }
196
197            // 处理文本或关键字
198            if self.lex_text_or_keyword(state) {
199                continue;
200            }
201
202            // 处理符号
203            if self.lex_symbols(state) {
204                continue;
205            }
206
207            // 如果没有匹配任何模式,创建错误 token
208            let start_pos = state.get_position();
209            if let Some(ch) = state.peek() {
210                state.advance(ch.len_utf8());
211                state.add_token(OrgModeSyntaxKind::Error, start_pos, state.get_position());
212            }
213            else {
214                break;
215            }
216
217            state.advance_if_dead_lock(safe_point);
218        }
219        Ok(())
220    }
221}
222
223impl<'config> Lexer<OrgModeLanguage> for OrgModeLexer<'config> {
224    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<OrgModeLanguage>) -> LexOutput<OrgModeLanguage> {
225        let mut state = State::new(source);
226        let result = self.run(&mut state);
227        if result.is_ok() {
228            state.add_eof();
229        }
230        state.finish_with_cache(result, cache)
231    }
232}
233