harper_core/parsers/
org_mode.rs

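//! A [`Parser`] implementation for Org-mode (`.org`) documents.
//!
//! Headings, list items, and plain paragraphs are delegated to the
//! [`PlainEnglish`] parser, while source blocks and `#+` directives are
//! emitted as [`TokenKind::Unlintable`] so they are never linted as prose.
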
use super::{Parser, PlainEnglish};
use crate::{Span, Token, TokenKind};

#[derive(Debug, PartialEq, Copy, Clone)]
enum SourceBlockMarker {
    Begin,
    End,
}

// Check if a line is a header (starts with one or more '*')
fn is_header_line(chars: &[char], start: usize) -> bool {
    chars.get(start).is_some_and(|c| *c == '*')
}

// Check if a line is a source block begin/end marker
fn is_source_block_marker(chars: &[char], start: usize) -> Option<SourceBlockMarker> {
    let line = get_line_from_start(chars, start);
    let line_str: String = line.iter().collect();
    let line_str = line_str.trim();

    if line_str.starts_with("#+BEGIN_SRC") || line_str.starts_with("#+begin_src") {
        Some(SourceBlockMarker::Begin)
    } else if line_str.starts_with("#+END_SRC") || line_str.starts_with("#+end_src") {
        Some(SourceBlockMarker::End)
    } else {
        None
    }
}

// Check if a line is a directive (starts with #+)
fn is_directive(chars: &[char], start: usize) -> bool {
    if start + 1 >= chars.len() {
        return false;
    }
    chars[start] == '#' && chars[start + 1] == '+'
}

// Check if a line is a list item (starts with '-', '+', or a number followed by '.' or ')')
fn is_list_item(chars: &[char], start: usize) -> bool {
    let mut pos = start;

    // Skip leading spaces or tabs
    while pos < chars.len() && (chars[pos] == ' ' || chars[pos] == '\t') {
        pos += 1;
    }

    if pos >= chars.len() {
        return false;
    }

    // Check for - or + followed by space
    if (chars[pos] == '-' || chars[pos] == '+') && pos + 1 < chars.len() && chars[pos + 1] == ' ' {
        return true;
    }

    // Check for numbered list
    if chars[pos].is_ascii_digit() {
        let mut num_pos = pos;
        while num_pos < chars.len() && chars[num_pos].is_ascii_digit() {
            num_pos += 1;
        }

        if num_pos < chars.len()
            && (chars[num_pos] == '.' || chars[num_pos] == ')')
            && num_pos + 1 < chars.len()
            && chars[num_pos + 1] == ' '
        {
            return true;
        }
    }

    false
}

// Replace the first tab in a list item with a space to avoid a spurious French spaces error
fn normalize_list_item_whitespace(chars: &[char]) -> Vec<char> {
    let mut result = Vec::new();
    let mut init_list = false;
    // The replacement is one-for-one, so token spans still line up with the source.
    for &ch in chars {
        if !init_list && ch == '\t' {
            result.push(' ');
            init_list = true;
        } else {
            result.push(ch);
        }
    }
    result
}

// Get the rest of the line from a starting position
fn get_line_from_start(chars: &[char], start: usize) -> &[char] {
    let mut end = start;
    while end < chars.len() && chars[end] != '\n' {
        end += 1;
    }
    &chars[start..end]
}

// Find the end of the line starting at `start`, including the trailing newline if present
fn find_line_end(chars: &[char], start: usize) -> usize {
    let mut pos = start;
    while pos < chars.len() && chars[pos] != '\n' {
        pos += 1;
    }
    if pos < chars.len() && chars[pos] == '\n' {
        pos + 1 // Include the newline
    } else {
        pos
    }
}

// Find the start of the line containing the given position
fn find_line_start(chars: &[char], pos: usize) -> usize {
    let mut start = pos;
    while start > 0 && chars[start - 1] != '\n' {
        start -= 1;
    }
    start
}

/// A parser that wraps the [`PlainEnglish`] parser to parse Org-mode files.
///
/// Ignores source blocks, directives, and other Org-mode-specific elements
/// that should not be linted as prose.
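///
/// A minimal usage sketch (marked `ignore` because it assumes `OrgMode`,
/// `StrParser`, and `TokenKind` are all publicly re-exported):
///
/// ```ignore
/// use harper_core::TokenKind;
/// use harper_core::parsers::{OrgMode, StrParser};
///
/// let tokens = OrgMode.parse_str("* A heading\nSome prose to lint.");
/// assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
/// ```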
#[derive(Default, Clone, Debug, Copy)]
pub struct OrgMode;

impl OrgMode {}

impl Parser for OrgMode {
    fn parse(&self, source: &[char]) -> Vec<Token> {
        let english_parser = PlainEnglish;
        let mut tokens = Vec::new();
        let mut cursor = 0;
        let mut in_source_block = false;

        while cursor < source.len() {
            let line_start = find_line_start(source, cursor);

            // Check for source block markers
            let source_marker = is_source_block_marker(source, line_start);
            if let Some(marker) = source_marker {
                in_source_block = marker == SourceBlockMarker::Begin;
            }

            // If we're in a source block or found a source block marker, mark the line as unlintable
            if in_source_block || source_marker.is_some() {
                let line_end = find_line_end(source, line_start);
                tokens.push(Token {
                    span: Span::new(line_start, line_end),
                    kind: TokenKind::Unlintable,
                });
                cursor = line_end;
                continue;
            }

            // Check for headers
            if is_header_line(source, line_start) {
                let line_end = find_line_end(source, line_start);

                // Find where the header text starts (after the stars and spaces)
                let mut header_text_start = line_start;
                while header_text_start < line_end
                    && (source[header_text_start] == '*' || source[header_text_start] == ' ')
                {
                    header_text_start += 1;
                }

                // If there's actual text after the stars, parse it
                if header_text_start < line_end {
                    let mut header_tokens =
                        english_parser.parse(&source[header_text_start..line_end]);
                    header_tokens
                        .iter_mut()
                        .for_each(|token| token.span.push_by(header_text_start));
                    tokens.append(&mut header_tokens);
                }

                // Add a paragraph break after the header
                tokens.push(Token {
                    span: Span::new_with_len(line_end.saturating_sub(1), 0),
                    kind: TokenKind::ParagraphBreak,
                });

                cursor = line_end;
                continue;
            }

            // Check for directives (#+SOMETHING)
            if is_directive(source, line_start) {
                let line_end = find_line_end(source, line_start);
                tokens.push(Token {
                    span: Span::new(line_start, line_end),
                    kind: TokenKind::Unlintable,
                });
                cursor = line_end;
                continue;
            }

            // Check for list items and normalize leading tabs to avoid a spurious French spaces error
            if is_list_item(source, line_start) {
                let line_end = find_line_end(source, line_start);
                let line_chars = &source[line_start..line_end];
                let normalized_chars = normalize_list_item_whitespace(line_chars);

                let mut line_tokens = english_parser.parse(&normalized_chars);
                line_tokens
                    .iter_mut()
                    .for_each(|token| token.span.push_by(line_start));
                tokens.append(&mut line_tokens);

                cursor = line_end;
                continue;
            }

            // For normal text, parse with the English parser
            let line_end = find_line_end(source, cursor);
            if cursor < line_end {
                let mut line_tokens = english_parser.parse(&source[cursor..line_end]);
                line_tokens
                    .iter_mut()
                    .for_each(|token| token.span.push_by(cursor));
                tokens.append(&mut line_tokens);
            }

            cursor = line_end;
        }

        // Remove trailing newline/paragraph break tokens if the source doesn't actually end with a newline.
        if matches!(
            tokens.last(),
            Some(Token {
                kind: TokenKind::Newline(_) | TokenKind::ParagraphBreak,
                ..
            })
        ) && source.last() != Some(&'\n')
        {
            tokens.pop();
        }

        tokens
    }
}

#[cfg(test)]
mod tests {
    use super::super::StrParser;
    use super::OrgMode;
    use crate::TokenKind;

    #[test]
    fn simple_text() {
        let source = "This is simple text.";
        let tokens = OrgMode.parse_str(source);
        assert!(!tokens.is_empty());
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    #[test]
    fn header_parsing() {
        let source = "* This is a header\nThis is regular text.";
        let tokens = OrgMode.parse_str(source);
        let token_kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();

        // Should have words from header and paragraph break
        assert!(token_kinds.iter().any(|k| matches!(k, TokenKind::Word(_))));
        assert!(
            token_kinds
                .iter()
                .any(|k| matches!(k, TokenKind::ParagraphBreak))
        );
    }

    #[test]
    fn multiple_level_headers() {
        let source = "** Second level header\n*** Third level header";
        let tokens = OrgMode.parse_str(source);
        let token_kinds: Vec<_> = tokens.iter().map(|t| &t.kind).collect();

        // Should parse text from both headers
        let word_count = token_kinds
            .iter()
            .filter(|k| matches!(k, TokenKind::Word(_)))
            .count();
        assert!(word_count >= 4); // "Second", "level", "Third", "header"
    }

    #[test]
    fn source_block_unlintable() {
        let source = r#"Regular text.
#+BEGIN_SRC rust
fn main() {
    println!("Hello, world!");
}
#+END_SRC
More regular text."#;

        let tokens = OrgMode.parse_str(source);
        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();

        // Should have unlintable tokens for the source block lines
        assert!(unlintable_count > 0);

        // Should still have regular words from the non-source-block text
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    #[test]
    fn directive_unlintable() {
        let source = r#"#+TITLE: My Document
#+AUTHOR: Test Author
This is regular text."#;

        let tokens = OrgMode.parse_str(source);
        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();

        // Should have unlintable tokens for directives
        assert_eq!(unlintable_count, 2);

        // Should still have regular words
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    #[test]
    fn case_insensitive_source_blocks() {
        let source = r#"#+begin_src python
print("hello")
#+end_src"#;

        let tokens = OrgMode.parse_str(source);
        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();

        // All lines should be unlintable
        assert_eq!(unlintable_count, 3);
    }

    #[test]
    fn empty_header() {
        let source = "*\nRegular text.";
        let tokens = OrgMode.parse_str(source);

        // Should handle empty headers gracefully
        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }

    #[test]
    fn no_trailing_newline() {
        let source = "Simple text without newline";
        let tokens = OrgMode.parse_str(source);

        // Should not end with newline token if source doesn't
        assert!(!tokens.last().unwrap().kind.is_newline());
    }

    #[test]
    fn list_items_with_tabs() {
        let source = "- First item\n\t- Indented with tab\n+ Second item\n1. Numbered item";
        let tokens = OrgMode.parse_str(source);

        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));

        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();
        assert_eq!(unlintable_count, 0);
    }

    #[test]
    fn mixed_list_formats() {
        let source = r#"- Bullet item
1. Numbered item
+ Plus item
2) Parenthesis numbered"#;

        let tokens = OrgMode.parse_str(source);

        // Should recognize all list formats
        let word_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Word(_)))
            .count();

        assert_eq!(word_count, 8, "{:?}", tokens); // "Bullet", "item", "Numbered", "item", "Plus", "item", "Parenthesis", "numbered"
    }
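
    // A sketch of an extra check: a `#+` directive that is not a source block
    // (e.g. `#+RESULTS:`) should be emitted as a single unlintable token,
    // while the surrounding prose is still parsed into word tokens.
    #[test]
    fn non_source_directive_unlintable() {
        let source = "#+RESULTS:\nSome text here.";
        let tokens = OrgMode.parse_str(source);

        let unlintable_count = tokens
            .iter()
            .filter(|t| matches!(t.kind, TokenKind::Unlintable))
            .count();
        assert_eq!(unlintable_count, 1);

        assert!(tokens.iter().any(|t| matches!(t.kind, TokenKind::Word(_))));
    }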
}