harper_core/parsers/
org_mode.rs1use super::{Parser, PlainEnglish};
2use crate::{Span, Token, TokenKind};
3
/// Distinguishes the two marker lines that delimit an Org-mode source block.
#[derive(Debug, PartialEq, Copy, Clone)]
enum SourceBlockMarker {
    // A `#+BEGIN_SRC` line opening a source block.
    Begin,
    // A `#+END_SRC` line closing a source block.
    End,
}
9
/// Reports whether the line beginning at `start` is an Org header,
/// i.e. its first character is `*`.
fn is_header_line(chars: &[char], start: usize) -> bool {
    matches!(chars.get(start), Some('*'))
}
14
15fn is_source_block_marker(chars: &[char], start: usize) -> Option<SourceBlockMarker> {
17 let line = get_line_from_start(chars, start);
18 let line_str: String = line.iter().collect();
19 let line_str = line_str.trim();
20
21 if line_str.starts_with("#+BEGIN_SRC") || line_str.starts_with("#+begin_src") {
22 Some(SourceBlockMarker::Begin)
23 } else if line_str.starts_with("#+END_SRC") || line_str.starts_with("#+end_src") {
24 Some(SourceBlockMarker::End)
25 } else {
26 None
27 }
28}
29
/// Reports whether the line beginning at `start` is an Org directive,
/// i.e. it begins with the two characters `#+`.
fn is_directive(chars: &[char], start: usize) -> bool {
    chars.get(start) == Some(&'#') && chars.get(start + 1) == Some(&'+')
}
37
/// Reports whether the line beginning at `start` is an Org list item.
///
/// Accepts, after optional leading spaces/tabs:
/// * a `-` or `+` bullet followed by a space, or
/// * a run of ASCII digits followed by `.` or `)` and then a space.
fn is_list_item(chars: &[char], start: usize) -> bool {
    // Skip leading indentation (spaces and tabs only; a newline stops the scan).
    let rest = &chars[start..];
    let Some(first) = rest.iter().position(|c| !matches!(c, ' ' | '\t')) else {
        return false;
    };
    let rest = &rest[first..];

    match rest.first() {
        // Bullet marker: must be directly followed by a space.
        Some('-') | Some('+') => rest.get(1) == Some(&' '),
        // Numbered marker: digits, then `.` or `)`, then a space.
        Some(c) if c.is_ascii_digit() => {
            let digits = rest.iter().take_while(|c| c.is_ascii_digit()).count();
            matches!(rest.get(digits), Some('.') | Some(')'))
                && rest.get(digits + 1) == Some(&' ')
        }
        _ => false,
    }
}
74
/// Returns a copy of `chars` with the first tab (if any) replaced by a single
/// space; all later characters, including further tabs, are left untouched.
fn normalize_list_item_whitespace(chars: &[char]) -> Vec<char> {
    let mut result: Vec<char> = chars.to_vec();
    if let Some(slot) = result.iter_mut().find(|c| **c == '\t') {
        *slot = ' ';
    }
    result
}
89
/// Returns the slice of `chars` from `start` up to (not including) the next
/// newline, or to the end of input if no newline follows.
fn get_line_from_start(chars: &[char], start: usize) -> &[char] {
    let tail = &chars[start..];
    let len = tail.iter().position(|c| *c == '\n').unwrap_or(tail.len());
    &tail[..len]
}
98
/// Returns the index one past the end of the line containing `start`:
/// just after the terminating newline, or `chars.len()` for the final,
/// unterminated line.
fn find_line_end(chars: &[char], start: usize) -> usize {
    let mut pos = start;
    loop {
        match chars.get(pos) {
            // Include the newline itself in the line.
            Some('\n') => return pos + 1,
            Some(_) => pos += 1,
            None => return pos,
        }
    }
}
111
/// Returns the index of the first character of the line containing `pos`:
/// just after the previous newline, or 0 for the first line.
fn find_line_start(chars: &[char], pos: usize) -> usize {
    chars[..pos]
        .iter()
        .rposition(|&c| c == '\n')
        .map_or(0, |nl| nl + 1)
}
120
/// A [`Parser`] for Org-mode documents.
///
/// Lints prose, headers, and list items while skipping source blocks and
/// `#+` directives. Stateless: all parsing state lives in [`Parser::parse`].
#[derive(Default, Clone, Debug, Copy)]
pub struct OrgMode;
130
impl Parser for OrgMode {
    /// Tokenizes Org-mode text one line at a time.
    ///
    /// Each line is classified in priority order:
    /// 1. source-block markers and everything between them → `Unlintable`,
    /// 2. header lines (leading `*`) → English tokens plus a `ParagraphBreak`,
    /// 3. `#+` directives → `Unlintable`,
    /// 4. list items → English tokens (first tab normalized to a space),
    /// 5. anything else → English tokens via [`PlainEnglish`].
    ///
    /// All sub-parser spans are shifted back into `source` coordinates.
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;
        let mut tokens = Vec::new();
        // Next unconsumed index; always advanced to the start of the next line.
        let mut cursor = 0;
        // True while between a #+BEGIN_SRC and the matching #+END_SRC line.
        let mut in_source_block = false;

        while cursor < source.len() {
            let line_start = find_line_start(source, cursor);

            // Flip the source-block state when this line is a marker.
            let source_marker = is_source_block_marker(source, line_start);
            if let Some(marker) = source_marker {
                in_source_block = marker == SourceBlockMarker::Begin;
            }

            // Both marker lines themselves (note: after an END marker
            // `in_source_block` is already false, hence the `is_some()` check)
            // and every line inside the block are a single Unlintable token.
            if in_source_block || source_marker.is_some() {
                let line_end = find_line_end(source, line_start);
                tokens.push(Token {
                    span: Span::new(line_start, line_end),
                    kind: TokenKind::Unlintable,
                });
                cursor = line_end;
                continue;
            }

            if is_header_line(source, line_start) {
                let line_end = find_line_end(source, line_start);

                // Skip the leading `*` markers and padding spaces to find the
                // header's actual text.
                let mut header_text_start = line_start;
                while header_text_start < line_end
                    && (source[header_text_start] == '*' || source[header_text_start] == ' ')
                {
                    header_text_start += 1;
                }

                // Lint the header text itself (skipped entirely for a bare
                // `*` line with no text).
                if header_text_start < line_end {
                    let mut header_tokens =
                        english_parser.parse(&source[header_text_start..line_end]);
                    // Shift spans from header-relative to source-relative.
                    header_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(header_text_start));
                    tokens.append(&mut header_tokens);
                }

                // Zero-width paragraph break anchored at the header line's
                // last character (saturating guards the empty-input edge).
                tokens.push(Token {
                    span: Span::new_with_len(line_end.saturating_sub(1), 0),
                    kind: TokenKind::ParagraphBreak,
                });

                cursor = line_end;
                continue;
            }

            // `#+` directive lines (e.g. #+TITLE:) are never linted.
            if is_directive(source, line_start) {
                let line_end = find_line_end(source, line_start);
                tokens.push(Token {
                    span: Span::new(line_start, line_end),
                    kind: TokenKind::Unlintable,
                });
                cursor = line_end;
                continue;
            }

            if is_list_item(source, line_start) {
                let line_end = find_line_end(source, line_start);
                let line_chars = &source[line_start..line_end];
                // Replace the first tab with a space before handing the line
                // to the English parser. NOTE(review): normalization changes
                // the character at most one position, so source-relative
                // spans below stay aligned.
                let normalized_chars = normalize_list_item_whitespace(line_chars);

                let mut line_tokens = english_parser.parse(&normalized_chars);
                line_tokens
                    .iter_mut()
                    .for_each(|token| token.span.push_by(line_start));
                tokens.append(&mut line_tokens);

                cursor = line_end;
                continue;
            }

            // Fallback: an ordinary prose line, linted as plain English.
            let line_end = find_line_end(source, cursor);
            if cursor < line_end {
                let mut line_tokens = english_parser.parse(&source[cursor..line_end]);
                line_tokens
                    .iter_mut()
                    .for_each(|token| token.span.push_by(cursor));
                tokens.append(&mut line_tokens);
            }

            cursor = line_end;
        }

        // Drop a trailing break token when the source itself does not end in
        // a newline (e.g. the synthetic ParagraphBreak after a final header).
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }

        tokens
    }
}
245
#[cfg(test)]
mod tests {
    use super::super::StrParser;
    use super::OrgMode;
    use crate::TokenKind;

    // Plain prose should produce word tokens via the PlainEnglish fallback.
    #[test]
    fn simple_text() {
        let source = "This is simple text.";
        let tokens = OrgMode.parse_str(source);
        assert!(!tokens.is_empty());
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    // A header line yields its words plus a synthetic ParagraphBreak.
    #[test]
    fn header_parsing() {
        let source = "* This is a header\nThis is regular text.";
        let tokens = OrgMode.parse_str(source);
        let token_kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();

        assert!(token_kinds.iter().any(|k| matches!(k, TokenKind::Word(_))));
        assert!(
            token_kinds
                .iter()
                .any(|k| matches!(k, TokenKind::ParagraphBreak))
        );
    }

    // Deeper headers (**, ***) still expose their text as word tokens.
    #[test]
    fn multiple_level_headers() {
        let source = "** Second level header\n*** Third level header";
        let tokens = OrgMode.parse_str(source);
        let token_kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();

        let word_count = token_kinds
            .iter()
            .filter(|k| matches!(k, TokenKind::Word(_)))
            .count();
        // At least the words of both headers should survive.
        assert!(word_count >= 4);
    }

    // Everything between BEGIN_SRC/END_SRC (markers included) is Unlintable,
    // while the surrounding prose is still linted.
    #[test]
    fn source_block_unlintable() {
        let source = r#"Regular text.
#+BEGIN_SRC rust
fn main() {
    println!("Hello, world!");
}
#+END_SRC
More regular text."#;

        let tokens = OrgMode.parse_str(source);
        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();

        assert!(unlintable_count > 0);

        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    // Each #+ directive line becomes exactly one Unlintable token.
    #[test]
    fn directive_unlintable() {
        let source = r#"#+TITLE: My Document
#+AUTHOR: Test Author
This is regular text."#;

        let tokens = OrgMode.parse_str(source);
        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();

        assert_eq!(unlintable_count, 2);

        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    // Lower-case #+begin_src / #+end_src markers are recognized too:
    // 3 Unlintable tokens = begin line, body line, end line.
    #[test]
    fn case_insensitive_source_blocks() {
        let source = r#"#+begin_src python
print("hello")
#+end_src"#;

        let tokens = OrgMode.parse_str(source);
        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();

        assert_eq!(unlintable_count, 3);
    }

    // A bare `*` header with no text must not break parsing of later lines.
    #[test]
    fn empty_header() {
        let source = "*\nRegular text.";
        let tokens = OrgMode.parse_str(source);

        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    // Without a trailing '\n' in the source, no trailing newline token is kept.
    #[test]
    fn no_trailing_newline() {
        let source = "Simple text without newline";
        let tokens = OrgMode.parse_str(source);

        assert!(!tokens.last().unwrap().kind.is_newline());
    }

    // Tab-indented and plain list items lint as prose, never Unlintable.
    #[test]
    fn list_items_with_tabs() {
        let source = "- First item\n\t- Indented with tab\n+ Second item\n1. Numbered item";
        let tokens = OrgMode.parse_str(source);

        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));

        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();
        assert_eq!(unlintable_count, 0);
    }

    // All four bullet/number list formats contribute their words.
    #[test]
    fn mixed_list_formats() {
        let source = r#"- Bullet item
1. Numbered item
+ Plus item
2) Parenthesis numbered"#;

        let tokens = OrgMode.parse_str(source);

        let word_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Word(_)))
            .count();

        // Two words per item across four items.
        assert!(word_count == 8, "{:?}", tokens);
    }
}