Skip to main content

vimdoc_language_server/
parser.rs

1use lsp_types::{Position, Range};
2
/// Weight of a horizontal separator line in a help file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum SepKind {
    /// A separator made only of `=` characters.
    Major,
    /// A separator made only of `-` characters.
    Minor,
}
8
9#[derive(Debug, Clone)]
10pub struct Span {
11    pub name: String,
12    pub range: Range,
13}
14
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub enum LineKind {
17    Blank,
18    Separator(SepKind),
19    CodeBody,
20    ListItem,
21    Text,
22}
23
24#[derive(Debug, Clone)]
25pub struct ParsedLine {
26    pub kind: LineKind,
27    pub tag_defs: Vec<Span>,
28    pub tag_refs: Vec<Span>,
29}
30
31#[derive(Debug, Default, Clone)]
32pub struct Document {
33    pub lines: Vec<ParsedLine>,
34    pub has_modeline: bool,
35}
36
37impl Document {
38    #[must_use]
39    #[allow(clippy::cast_possible_truncation)]
40    pub fn parse(text: &str) -> Self {
41        let mut lines = Vec::new();
42        let mut in_code = false;
43        for (idx, raw) in text.lines().enumerate() {
44            lines.push(parse_line(idx as u32, raw, &mut in_code));
45        }
46        let has_modeline = text
47            .lines()
48            .rev()
49            .find(|l| !l.trim().is_empty())
50            .is_some_and(is_modeline);
51        Document {
52            lines,
53            has_modeline,
54        }
55    }
56
57    pub fn tag_defs(&self) -> impl Iterator<Item = &Span> {
58        self.lines.iter().flat_map(|l| l.tag_defs.iter())
59    }
60
61    pub fn tag_refs(&self) -> impl Iterator<Item = &Span> {
62        self.lines.iter().flat_map(|l| l.tag_refs.iter())
63    }
64}
65
66#[allow(clippy::similar_names)]
67fn parse_line(line_num: u32, raw: &str, in_code: &mut bool) -> ParsedLine {
68    let trimmed = raw.trim_end();
69
70    if trimmed.is_empty() {
71        return mk(LineKind::Blank, vec![], vec![]);
72    }
73
74    if *in_code {
75        let ends_code = trimmed == "<" || (!raw.starts_with(' ') && !raw.starts_with('\t'));
76        if ends_code {
77            *in_code = false;
78            if trimmed == "<" {
79                return mk(LineKind::CodeBody, vec![], vec![]);
80            }
81        } else {
82            return mk(LineKind::CodeBody, vec![], vec![]);
83        }
84    }
85
86    if trimmed.len() >= 10 && trimmed.bytes().all(|b| b == b'=') {
87        return mk(LineKind::Separator(SepKind::Major), vec![], vec![]);
88    }
89    if trimmed.len() >= 10 && trimmed.bytes().all(|b| b == b'-') {
90        return mk(LineKind::Separator(SepKind::Minor), vec![], vec![]);
91    }
92
93    if is_fence_start(trimmed) {
94        *in_code = true;
95        return mk(LineKind::CodeBody, vec![], vec![]);
96    }
97
98    let (tag_defs, tag_refs) = scan_inline(line_num, raw);
99
100    if trimmed.ends_with('>') && !trimmed.ends_with("->") {
101        *in_code = true;
102    }
103
104    if raw.starts_with("- ") || raw.starts_with("* ") || raw.starts_with("• ") {
105        return mk(LineKind::ListItem, tag_defs, tag_refs);
106    }
107
108    if tag_defs.is_empty() {
109        let after_digits = raw.trim_start_matches(|c: char| c.is_ascii_digit());
110        if after_digits.len() < raw.len() && after_digits.starts_with(". ") {
111            return mk(LineKind::ListItem, tag_defs, tag_refs);
112        }
113    }
114
115    mk(LineKind::Text, tag_defs, tag_refs)
116}
117
118#[allow(clippy::similar_names)]
119fn mk(kind: LineKind, tag_defs: Vec<Span>, tag_refs: Vec<Span>) -> ParsedLine {
120    ParsedLine {
121        kind,
122        tag_defs,
123        tag_refs,
124    }
125}
126
/// Reports whether `line` looks like a modeline that selects the help filetype.
///
/// The line must contain one of the modeline introducers (`vim:`, `vi:`,
/// `ex:`) and one of the filetype settings (`ft=help`, `filetype=help`).
/// This is a plain substring check; the order of the two parts is not verified.
fn is_modeline(line: &str) -> bool {
    const INTRODUCERS: [&str; 3] = ["vim:", "vi:", "ex:"];
    const HELP_FILETYPE: [&str; 2] = ["ft=help", "filetype=help"];

    fn contains_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|needle| haystack.contains(needle))
    }

    let line = line.trim();
    contains_any(line, &INTRODUCERS) && contains_any(line, &HELP_FILETYPE)
}
132
/// Reports whether `s` is a code fence with a language annotation.
///
/// That is a `>` immediately followed by a non-empty name made only of ASCII
/// letters, digits, `-`, `+` or `_` (for example `>lua`, `>c++`).
fn is_fence_start(s: &str) -> bool {
    match s.strip_prefix('>') {
        None | Some("") => false,
        Some(lang) => lang
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '+' | '_')),
    }
}
142
143#[allow(clippy::similar_names)]
144fn scan_inline(line_num: u32, raw: &str) -> (Vec<Span>, Vec<Span>) {
145    let mut tag_defs = Vec::new();
146    let mut tag_refs = Vec::new();
147    let bytes = raw.as_bytes();
148    let len = bytes.len();
149    let mut i = 0;
150
151    while i < len {
152        match bytes[i] {
153            b'*' => {
154                let at_boundary = i == 0 || matches!(bytes[i - 1], b' ' | b'\t');
155                if at_boundary {
156                    if let Some((name, end)) = scan_delimited(raw, i + 1, b'*') {
157                        tag_defs.push(make_span(raw, line_num, i, end, name));
158                        i = end;
159                    } else {
160                        i += 1;
161                    }
162                } else {
163                    i += 1;
164                }
165            }
166            b'|' => {
167                let at_boundary =
168                    i == 0 || matches!(bytes[i - 1], b' ' | b'\t' | b'(' | b'[' | b'|');
169                if at_boundary {
170                    if let Some((name, end)) = scan_delimited(raw, i + 1, b'|') {
171                        tag_refs.push(make_span(raw, line_num, i, end, name));
172                        i = end;
173                    } else {
174                        i += 1;
175                    }
176                } else {
177                    i += 1;
178                }
179            }
180            b'`' => {
181                let mut j = i + 1;
182                while j < len && bytes[j] != b'`' {
183                    j += 1;
184                }
185                i = j + 1;
186            }
187            _ => {
188                i += 1;
189            }
190        }
191    }
192
193    (tag_defs, tag_refs)
194}
195
/// Converts a byte offset into `s` to the equivalent offset in UTF-16 code
/// units, counting the code units of everything before `byte_pos`.
///
/// # Panics
///
/// Panics if `byte_pos` is past the end of `s` or not on a char boundary.
// The count is narrowed to the `u32` used for column positions.
#[allow(clippy::cast_possible_truncation)]
pub(crate) fn byte_offset_to_utf16(s: &str, byte_pos: usize) -> u32 {
    let prefix = &s[..byte_pos];
    prefix.encode_utf16().count() as u32
}
200
201#[allow(clippy::cast_possible_truncation)]
202fn make_span(raw: &str, line_num: u32, start: usize, end: usize, name: String) -> Span {
203    Span {
204        name,
205        range: Range {
206            start: Position {
207                line: line_num,
208                character: byte_offset_to_utf16(raw, start),
209            },
210            end: Position {
211                line: line_num,
212                character: byte_offset_to_utf16(raw, end),
213            },
214        },
215    }
216}
217
/// Reads a delimited name that begins at byte offset `start` of `raw`.
///
/// On success returns the name and the byte offset just past the closing
/// `delim`. Returns `None` when the name would be empty, when a space or tab
/// comes before the closing delimiter, or when no closing delimiter exists.
fn scan_delimited(raw: &str, start: usize, delim: u8) -> Option<(String, usize)> {
    let tail = raw.as_bytes().get(start..)?;
    // First byte that ends the candidate name, for whatever reason.
    let stop = tail
        .iter()
        .position(|&b| b == delim || b == b' ' || b == b'\t')?;
    if stop == 0 || tail[stop] != delim {
        return None;
    }
    let close = start + stop;
    Some((raw[start..close].to_string(), close + 1))
}
235
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `text` and checks the kind of the listed (zero-based) lines.
    fn assert_kinds(text: &str, expected: &[(usize, LineKind)]) {
        let doc = Document::parse(text);
        for (idx, kind) in expected {
            assert_eq!(&doc.lines[*idx].kind, kind, "line {idx} of {text:?}");
        }
    }

    /// Names of all tag definitions found in `text`, in order.
    fn def_names(text: &str) -> Vec<String> {
        let doc = Document::parse(text);
        let names = doc.tag_defs().map(|span| span.name.clone()).collect();
        names
    }

    /// Names of all tag references found in `text`, in order.
    fn ref_names(text: &str) -> Vec<String> {
        let doc = Document::parse(text);
        let names = doc.tag_refs().map(|span| span.name.clone()).collect();
        names
    }

    /// Start and end columns of `span`.
    fn columns(span: &Span) -> (u32, u32) {
        (span.range.start.character, span.range.end.character)
    }

    // --- inline tags -----------------------------------------------------

    #[test]
    fn detects_tag_defs() {
        assert_eq!(def_names("*my-tag* some text"), ["my-tag"]);
    }

    #[test]
    fn detects_tag_refs() {
        assert_eq!(ref_names("see |my-tag| for details"), ["my-tag"]);
    }

    // --- separators ------------------------------------------------------

    #[test]
    fn detects_major_separator() {
        assert_kinds(
            &"=".repeat(78),
            &[(0, LineKind::Separator(SepKind::Major))],
        );
    }

    #[test]
    fn detects_minor_separator() {
        assert_kinds(
            &"-".repeat(78),
            &[(0, LineKind::Separator(SepKind::Minor))],
        );
    }

    // --- code blocks -----------------------------------------------------

    #[test]
    fn code_block_body_is_verbatim() {
        assert_kinds(
            "example >\n    code line\n    another\n<\nnormal",
            &[
                (1, LineKind::CodeBody),
                (2, LineKind::CodeBody),
                (4, LineKind::Text),
            ],
        );
    }

    #[test]
    fn unindented_line_ends_code_block() {
        assert_kinds(
            "example >\n    code\n\nnormal",
            &[
                (1, LineKind::CodeBody),
                (2, LineKind::Blank),
                (3, LineKind::Text),
            ],
        );
    }

    #[test]
    fn blank_does_not_end_code_block() {
        assert_kinds(
            "example >\n    code\n\n    more code\n<\nnormal",
            &[
                (1, LineKind::CodeBody),
                (2, LineKind::Blank),
                (3, LineKind::CodeBody),
                (4, LineKind::CodeBody),
                (5, LineKind::Text),
            ],
        );
    }

    #[test]
    fn pipe_in_code_block_after_blank_not_scanned() {
        assert!(ref_names("example >\n\n    code with |pipe|\n<").is_empty());
    }

    // --- tag reference boundaries ------------------------------------------

    #[test]
    fn pipe_mid_word_not_scanned_as_taglink() {
        assert!(ref_names("string|fun()|nil").is_empty());
    }

    #[test]
    fn pipe_after_comma_not_scanned_as_taglink() {
        assert!(ref_names("value '+,-,+,|,+,-,+,|'").is_empty());
    }

    #[test]
    fn pipe_after_backslash_not_scanned_as_taglink() {
        assert!(ref_names(r"pattern \|alternative\|").is_empty());
    }

    #[test]
    fn pipe_after_open_paren_is_taglink() {
        assert_eq!(ref_names("(see |my-tag|)"), ["my-tag"]);
    }

    #[test]
    fn pipe_at_line_start_is_taglink() {
        assert_eq!(ref_names("|my-tag| description").len(), 1);
    }

    // --- tag definition boundaries ---------------------------------------

    #[test]
    fn no_tag_with_space() {
        assert!(def_names("* not a tag *").is_empty());
    }

    #[test]
    fn inline_glob_not_tag_def() {
        assert!(def_names("set wildignore=*.o,*.obj").is_empty());
    }

    #[test]
    fn quoted_glob_not_tag_def() {
        assert!(def_names(r#"the patterns "*printcap*", or "*termcap*""#).is_empty());
    }

    #[test]
    fn path_pattern_not_tag_def() {
        assert!(def_names(r#"located in "pack/*/start/*" dirs"#).is_empty());
    }

    #[test]
    fn printf_format_not_tag_def() {
        assert!(def_names(r#"echo printf("%1$*2$.*3$d", 1, 2, 3)"#).is_empty());
    }

    // --- UTF-16 columns --------------------------------------------------

    #[test]
    fn utf16_multibyte_before_tag_def() {
        let doc = Document::parse("日本語 *foo*");
        assert_eq!(columns(doc.tag_defs().next().unwrap()), (4, 9));
    }

    #[test]
    fn utf16_supplementary_plane_before_tag_ref() {
        let doc = Document::parse("𝄞 |bar|");
        assert_eq!(columns(doc.tag_refs().next().unwrap()), (3, 8));
    }

    // --- language fences -------------------------------------------------

    #[test]
    fn code_fence_language_is_code_body() {
        assert_kinds(
            "prose\n>lua\n    code()\n<\nafter",
            &[
                (1, LineKind::CodeBody),
                (2, LineKind::CodeBody),
                (3, LineKind::CodeBody),
                (4, LineKind::Text),
            ],
        );
    }

    #[test]
    fn code_fence_language_no_tags_scanned() {
        assert!(def_names(">vim\n    *not-a-tag*\n<").is_empty());
    }

    #[test]
    fn code_fence_with_digits_is_recognized() {
        assert_kinds(
            ">lua54\n    vim.fn.input()\n<\nafter",
            &[
                (0, LineKind::CodeBody),
                (1, LineKind::CodeBody),
                (3, LineKind::Text),
            ],
        );
    }

    #[test]
    fn code_fence_with_plus_is_recognized() {
        assert_kinds(
            ">c++\n    int x = 0;\n<\nafter",
            &[
                (0, LineKind::CodeBody),
                (1, LineKind::CodeBody),
                (3, LineKind::Text),
            ],
        );
    }

    #[test]
    fn code_fence_with_hyphen_is_recognized() {
        assert_kinds(
            ">objective-c\n    [obj message];\n<\nafter",
            &[
                (0, LineKind::CodeBody),
                (1, LineKind::CodeBody),
                (3, LineKind::Text),
            ],
        );
    }

    #[test]
    fn utf16_ascii_unaffected() {
        let doc = Document::parse("hello *baz*");
        assert_eq!(columns(doc.tag_defs().next().unwrap()), (6, 11));
    }

    // --- list items ------------------------------------------------------

    #[test]
    fn dash_list_item_is_list_item() {
        assert_kinds("- item text", &[(0, LineKind::ListItem)]);
    }

    #[test]
    fn asterisk_list_item_is_list_item() {
        let text = "* item text";
        assert_kinds(text, &[(0, LineKind::ListItem)]);
        assert!(def_names(text).is_empty());
    }

    #[test]
    fn tag_def_not_mistaken_for_list_item() {
        let text = "*my-tag* some text";
        assert_kinds(text, &[(0, LineKind::Text)]);
        assert_eq!(def_names(text).len(), 1);
    }

    #[test]
    fn separator_not_mistaken_for_list_item() {
        assert_kinds(
            &"-".repeat(78),
            &[(0, LineKind::Separator(SepKind::Minor))],
        );
    }

    #[test]
    fn ordered_list_item_is_list_item() {
        assert_kinds("1. First item", &[(0, LineKind::ListItem)]);
    }

    #[test]
    fn multi_digit_ordered_item_is_list_item() {
        assert_kinds("42. Forty-second item", &[(0, LineKind::ListItem)]);
    }

    #[test]
    fn version_number_not_list_item() {
        assert_kinds("3.14 is approximately pi", &[(0, LineKind::Text)]);
    }

    #[test]
    fn numbered_item_with_tag_not_list_item() {
        let text = "1. Introduction\t\t\t*intro*";
        assert_kinds(text, &[(0, LineKind::Text)]);
        assert_eq!(def_names(text).len(), 1);
    }
}
477        let doc = Document::parse("1. Introduction\t\t\t*intro*");
478        assert_eq!(doc.lines[0].kind, LineKind::Text);
479        assert_eq!(doc.tag_defs().count(), 1);
480    }
481}