starsector/parser/
structure.rs

1use crate::{Arena, Document, RopeSliceExt, Section, SectionData};
2
3use ropey::{Rope, RopeSlice};
4
5pub fn lex_level(line: &RopeSlice) -> u16 {
6    headline_level(line, 0)
7}
8
9pub fn lex_level_str(line: &str) -> u16 {
10    headline_level_str(line, 0)
11}
12
13// Returns the level of the headline at offset, or 0 if no headline. This will
14// always be exactly one line, including the terminal \n if one is present. Must
15// only be called at the start of a line.
16//
17// For compatibility with org-mode itself, a headline is defined by one or more
18// '*' followed by an ASCII space, ' '.
19pub fn headline_level(input: &RopeSlice, offset: usize) -> u16 {
20    for (i, c) in input.bytes_at(offset).enumerate() {
21        match c {
22            b'*' => {}
23            b' ' if i > 0 => return i as u16,
24            _ => return 0,
25        }
26    }
27    0
28}
29
// String counterpart of `headline_level`: computes the level of the headline
// that begins at byte `offset` of `input`, or 0 when no headline begins there.
//
// A headline is one or more '*' immediately followed by an ASCII space; the
// level is the number of stars. `input[offset..]` is sliced directly, so
// `offset` must lie on a character boundary within `input` or this panics.
//
// The star count is narrowed with `as u16`, so a run longer than u16::MAX
// wraps around rather than saturating.
pub fn headline_level_str(input: &str, offset: usize) -> u16 {
    let tail = input[offset..].as_bytes();
    // Length of the leading run of '*' bytes.
    let stars = tail.iter().take_while(|&&byte| byte == b'*').count();
    // Only a space right after a non-empty run of stars makes a headline.
    match tail.get(stars) {
        Some(b' ') if stars > 0 => stars as u16,
        _ => 0,
    }
}
40
41// Returns a pair of the current line and the rest of the string. If the current
42// line is terminated by a \n, that will begin the second returned value. In
43// other words, the first value will never contain a newline.
44pub fn line<'a>(input: &'a RopeSlice<'a>) -> (RopeSlice<'a>, RopeSlice<'a>) {
45    let split = next_line(input, 0);
46    (input.slice_bytes(..split), input.slice_bytes(split..))
47}
48
49// Returns a pair of the current line and the rest of the string, consuming the
50// newline that terminates the current line if present.
51pub fn consuming_line<'a>(input: &'a RopeSlice<'a>) -> (RopeSlice<'a>, RopeSlice<'a>) {
52    let split = next_line(input, 0);
53    let line = input.slice_bytes(..split);
54    let rest = input.slice_bytes(split..);
55    match rest.get_char(0) {
56        Some('\n') => (line, rest.slice(1..)),
57        _ => (line, rest),
58    }
59}
60
61// Returns either the start of the next line, or input.len() if none.
62pub fn next_line(input: &RopeSlice, offset: usize) -> usize {
63    input.memchr(b'\n', offset)
64}
65
66pub(crate) fn parse_document(arena: &mut Arena, input: &RopeSlice) -> Document {
67    let mut offset = 0;
68
69    // Special case empty document.
70    if input.is_empty() {
71        let root_id = arena.arena.new_node(SectionData {
72            level: 0,
73            text: Rope::default(),
74        });
75
76        return Document {
77            root: Section { id: root_id },
78            terminal_newline: false,
79            empty_root_section: true,
80        };
81    }
82
83    let (new_offset, end) = parse_section(input, offset);
84    let empty_root_section = new_offset == end && offset == end;
85    let root_id = arena.arena.new_node(SectionData {
86        level: 0,
87        text: Rope::from(input.slice_bytes(offset..end)),
88    });
89    offset = new_offset;
90
91    let mut stack = vec![root_id];
92
93    let mut level = headline_level(input, offset);
94    while level > 0 {
95        let (new_offset, end) = parse_section(input, next_line(input, offset));
96        let section = SectionData {
97            text: Rope::from(input.slice_bytes(offset..end)),
98            level,
99        };
100        offset = new_offset;
101
102        while level
103            <= arena.arena[*stack.last().expect("stack never empty")]
104                .get()
105                .level
106        {
107            stack.pop().expect("stack never empty");
108        }
109
110        let node_id = arena.arena.new_node(section);
111
112        stack
113            .last()
114            .expect("stack never empty")
115            .append(node_id, &mut arena.arena);
116
117        stack.push(node_id);
118
119        level = headline_level(input, offset);
120    }
121
122    assert_eq!(input.len_bytes(), offset);
123
124    // #[cfg(debug_assertions)]
125    let re = regex::Regex::new("(^|.*\n)\\*\\** .*").expect("failed to assemble headline regex");
126
127    // #[cfg(debug_assertions)]
128    fn checker(re: &regex::Regex, node: Section, arena: &Arena, input: &RopeSlice) {
129        let level = node.level(&arena);
130        let text = node.text(&arena);
131        let lexed_level = lex_level(&text.slice(..));
132        if lexed_level != level
133            || text.len_bytes() >= level as usize
134                && re.is_match(&text.to_contiguous()[(level as usize)..])
135        {
136            // use std::io::Write;
137            // let mut ff = std::fs::File::create("/tmp/lll.org").unwrap();
138            // ff.write_all(input).unwrap();
139            // panic!("Error written to /tmp/lll.org");
140            panic!("Check failed");
141        }
142        assert_eq!(lexed_level, level);
143        for node in node.children(&arena) {
144            checker(re, node, arena, input);
145        }
146    }
147
148    // #[cfg(debug_assertions)]
149    checker(&re, Section { id: root_id }, &arena, input);
150
151    Document {
152        root: Section { id: root_id },
153        terminal_newline: input.bytes().last() == Some(b'\n'),
154        empty_root_section,
155    }
156}
157
158// Returns either the start of the next headline or input.len(), whichever comes
159// first. Must start at the start of the line. Will return nothing if started at
160// a headline.
161fn parse_section(input: &RopeSlice, offset: usize) -> (usize, usize) {
162    // Collect lines until EOF or a headline.
163    let mut last = offset;
164    while last < input.len_bytes() {
165        let i = input.memchr(b'\n', last);
166        // Fastpath: skip lines that don't start with *.
167        if i >= input.len_bytes() || input.byte(last) == b'*' && headline_level(input, last) != 0 {
168            break;
169        }
170        last = i + 1;
171    }
172
173    let last = if last < input.len_bytes() && headline_level(input, last) == 0 {
174        input.len_bytes()
175    } else {
176        last
177    };
178    if last > offset && last <= input.len_bytes() && input.byte(last - 1) == b'\n' {
179        (last, last - 1)
180    } else {
181        (last, last)
182    }
183}
184
#[cfg(test)]
mod tests {
    use super::*;

    // Builds a rope from raw bytes; the bytes must be valid UTF-8.
    fn rope_from_bytes(bytes: &[u8]) -> Rope {
        Rope::from(std::str::from_utf8(bytes).unwrap())
    }

    // Byte-slice front end for the parent module's `next_line`.
    fn next_line(s: &[u8], offset: usize) -> usize {
        let rope = rope_from_bytes(s);
        let whole = rope.slice(..);
        super::next_line(&whole, offset)
    }

    // Byte-slice front end for the parent module's `parse_section`.
    fn parse_section(s: &[u8], offset: usize) -> (usize, usize) {
        let rope = rope_from_bytes(s);
        let whole = rope.slice(..);
        super::parse_section(&whole, offset)
    }

    // Byte-slice front end for the parent module's `headline_level`.
    fn headline_level(s: &[u8], offset: usize) -> u16 {
        let rope = rope_from_bytes(s);
        let whole = rope.slice(..);
        super::headline_level(&whole, offset)
    }

    #[test]
    fn test_line() {
        let empty = Rope::default();
        let newline = Rope::from("\n");
        let term = Rope::from("* Hello\n");
        let multi = Rope::from("* Hello\nWorld");

        let input = empty.slice(..);
        assert_eq!((empty.slice(..), empty.slice(..)), line(&input));

        let input = newline.slice(..);
        assert_eq!((empty.slice(..), newline.slice(..)), line(&input));

        let input = term.slice(..);
        let before_newline = term.len_chars() - 1;
        assert_eq!(
            (term.slice(..before_newline), newline.slice(..)),
            line(&input)
        );

        let input = multi.slice(..);
        let head = Rope::from("* Hello");
        let tail = Rope::from("\nWorld");
        assert_eq!((head.slice(..), tail.slice(..)), line(&input));
    }

    #[test]
    fn test_consuming_line() {
        let empty = Rope::default();
        let newline = Rope::from("\n");
        let term = Rope::from("* Hello\n");
        let multi = Rope::from("* Hello\nWorld");
        let many = Rope::from("* Hello\n\nWorld");

        let headline = Rope::from("* Hello");

        let input = empty.slice(..);
        assert_eq!((empty.slice(..), empty.slice(..)), consuming_line(&input));

        let input = newline.slice(..);
        assert_eq!((empty.slice(..), empty.slice(..)), consuming_line(&input));

        let input = term.slice(..);
        let before_newline = term.len_chars() - 1;
        assert_eq!(
            (term.slice(..before_newline), empty.slice(..)),
            consuming_line(&input)
        );

        let input = multi.slice(..);
        let tail = Rope::from("World");
        assert_eq!(
            (headline.slice(..), tail.slice(..)),
            consuming_line(&input)
        );

        // Only the first of two consecutive newlines is consumed.
        let input = many.slice(..);
        let tail = Rope::from("\nWorld");
        assert_eq!(
            (headline.slice(..), tail.slice(..)),
            consuming_line(&input)
        );
    }

    #[test]
    fn test_next_line() {
        // (expected, input, offset)
        let cases: &[(usize, &[u8], usize)] = &[
            (0, b"", 0),
            (1, b" ", 0),
            (1, b" ", 1),
            (0, b"\n", 0),
            (1, b"\n", 1),
            (1, b" \n", 0),
            (1, b" \n", 1),
            (0, b"\n ", 0),
            (2, b"\n ", 1),
            (0, b"\ntest\n", 0),
            (5, b"\ntest\n", 1),
            (0, b"\n\na\n", 0),
            (1, b"\n\na\n", 1),
            (3, b"\n\na\n", 2),
            (3, b"\n\na\n", 3),
        ];

        for &(expected, text, offset) in cases {
            assert_eq!(
                expected,
                next_line(text, offset),
                "input {:?} at offset {}",
                text,
                offset
            );
        }
    }

    #[test]
    fn test_parse_section() {
        // (expected, input, offset)
        let cases: &[((usize, usize), &[u8], usize)] = &[
            ((0, 0), b"", 0),
            ((1, 1), b"*", 0),
            ((1, 1), b"*", 1),
            ((0, 0), b"* ", 0),
            ((2, 2), b"* ", 1),
            ((2, 2), b"* ", 2),
            ((1, 0), b"\n", 0),
            ((1, 1), b"\n", 1),
            ((0, 0), b"* \n", 0),
            ((3, 2), b"* \n", 1),
            ((3, 2), b"* \n", 2),
            ((1, 0), b"\n*** \n", 0),
            ((1, 1), b"\n*** \n", 1),
            ((2, 2), b"\n*** \n", 2),
            ((3, 3), b"\n*** \n", 3),
            ((6, 5), b"\n*** \n", 4),
            ((3, 2), b"Hi\n*** \n", 0),
        ];

        for &(expected, text, offset) in cases {
            assert_eq!(
                expected,
                parse_section(text, offset),
                "input {:?} at offset {}",
                text,
                offset
            );
        }
    }

    #[test]
    fn test_headline_level() {
        // (expected level, input); every case is lexed from offset 0.
        let cases: &[(u16, &[u8])] = &[
            (0, b""),
            // One byte.
            (0, b" "),
            (0, b"*"),
            (0, b"a"),
            // Two bytes.
            (0, b"  "),
            (1, b"* "),
            (0, b"a "),
            (0, b" *"),
            (0, b"**"),
            (0, b"a*"),
            (0, b" a"),
            (0, b"*a"),
            (0, b"aa"),
            // Three bytes ending in a space.
            (0, b"   "),
            (1, b"*  "),
            (0, b"a  "),
            (0, b" * "),
            (2, b"** "),
            (0, b"a* "),
            (0, b" a "),
            (0, b"*a "),
            (0, b"aa "),
            // Three bytes ending in a star.
            (0, b"  *"),
            (1, b"* *"),
            (0, b"a *"),
            (0, b" **"),
            (0, b"***"),
            (0, b"a**"),
            (0, b" a*"),
            (0, b"*a*"),
            (0, b"aa*"),
            // Three bytes ending in a letter.
            (0, b"  a"),
            (1, b"* a"),
            (0, b"a a"),
            (0, b" *a"),
            (0, b"**a"),
            (0, b"a*a"),
            (0, b" aa"),
            (0, b"*aa"),
            (0, b"aaa"),
            // Level-three headlines and near misses.
            (0, b"***"),
            (3, b"*** "),
            (3, b"***  "),
            (0, b"***a"),
            (3, b"*** a"),
            (3, b"*** aaaaa"),
        ];

        for &(expected, text) in cases {
            assert_eq!(expected, headline_level(text, 0), "input {:?}", text);
        }
    }
}