1use lsp_types::{Position, Range};
2
/// Strength of a horizontal separator line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SepKind {
    /// Separator made only of `=` characters.
    Major,
    /// Separator made only of `-` characters.
    Minor,
}
8
/// A named tag occurrence together with its location in the document.
#[derive(Debug, Clone)]
pub struct Span {
    /// Text found between the delimiters (the delimiters themselves are not included).
    pub name: String,
    /// Location of the occurrence, including both delimiters; columns are
    /// expressed in UTF-16 code units.
    pub range: Range,
}
14
/// Classification assigned to a single line of the document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum LineKind {
    /// Empty line, or a line containing only whitespace.
    Blank,
    /// Horizontal rule; the payload says whether it is built from `=` or `-`.
    Separator(SepKind),
    /// Line belonging to a verbatim code block, including a `>lang` fence line
    /// and the lone `<` line that closes a block.
    CodeBody,
    /// Bulleted (`- `, `* `, `• `) or numbered (`N. `) list entry.
    ListItem,
    /// Any other line.
    Text,
}
23
/// Result of parsing one line: its classification plus the tags found on it.
#[derive(Debug, Clone)]
pub struct ParsedLine {
    /// What kind of line this is.
    pub kind: LineKind,
    /// Tag definitions (`*name*`) found on the line, in left-to-right order.
    pub tag_defs: Vec<Span>,
    /// Tag references (`|name|`) found on the line, in left-to-right order.
    pub tag_refs: Vec<Span>,
}
30
/// A parsed document: one [`ParsedLine`] per input line plus file-level facts.
#[derive(Debug, Default, Clone)]
pub struct Document {
    /// Parsed lines, in input order; the index is the zero-based line number.
    pub lines: Vec<ParsedLine>,
    /// `true` when the last non-blank line looks like a modeline that selects
    /// the `help` filetype.
    pub has_modeline: bool,
}
36
37impl Document {
38 #[must_use]
39 #[allow(clippy::cast_possible_truncation)]
40 pub fn parse(text: &str) -> Self {
41 let mut lines = Vec::new();
42 let mut in_code = false;
43 for (idx, raw) in text.lines().enumerate() {
44 lines.push(parse_line(idx as u32, raw, &mut in_code));
45 }
46 let has_modeline = text
47 .lines()
48 .rev()
49 .find(|l| !l.trim().is_empty())
50 .is_some_and(is_modeline);
51 Document {
52 lines,
53 has_modeline,
54 }
55 }
56
57 pub fn tag_defs(&self) -> impl Iterator<Item = &Span> {
58 self.lines.iter().flat_map(|l| l.tag_defs.iter())
59 }
60
61 pub fn tag_refs(&self) -> impl Iterator<Item = &Span> {
62 self.lines.iter().flat_map(|l| l.tag_refs.iter())
63 }
64}
65
66#[allow(clippy::similar_names)]
67fn parse_line(line_num: u32, raw: &str, in_code: &mut bool) -> ParsedLine {
68 let trimmed = raw.trim_end();
69
70 if trimmed.is_empty() {
71 return mk(LineKind::Blank, vec![], vec![]);
72 }
73
74 if *in_code {
75 let ends_code = trimmed == "<" || (!raw.starts_with(' ') && !raw.starts_with('\t'));
76 if ends_code {
77 *in_code = false;
78 if trimmed == "<" {
79 return mk(LineKind::CodeBody, vec![], vec![]);
80 }
81 } else {
82 return mk(LineKind::CodeBody, vec![], vec![]);
83 }
84 }
85
86 if trimmed.len() >= 10 && trimmed.bytes().all(|b| b == b'=') {
87 return mk(LineKind::Separator(SepKind::Major), vec![], vec![]);
88 }
89 if trimmed.len() >= 10 && trimmed.bytes().all(|b| b == b'-') {
90 return mk(LineKind::Separator(SepKind::Minor), vec![], vec![]);
91 }
92
93 if is_fence_start(trimmed) {
94 *in_code = true;
95 return mk(LineKind::CodeBody, vec![], vec![]);
96 }
97
98 let (tag_defs, tag_refs) = scan_inline(line_num, raw);
99
100 if trimmed.ends_with('>') && !trimmed.ends_with("->") {
101 *in_code = true;
102 }
103
104 if raw.starts_with("- ") || raw.starts_with("* ") || raw.starts_with("• ") {
105 return mk(LineKind::ListItem, tag_defs, tag_refs);
106 }
107
108 if tag_defs.is_empty() {
109 let after_digits = raw.trim_start_matches(|c: char| c.is_ascii_digit());
110 if after_digits.len() < raw.len() && after_digits.starts_with(". ") {
111 return mk(LineKind::ListItem, tag_defs, tag_refs);
112 }
113 }
114
115 mk(LineKind::Text, tag_defs, tag_refs)
116}
117
118#[allow(clippy::similar_names)]
119fn mk(kind: LineKind, tag_defs: Vec<Span>, tag_refs: Vec<Span>) -> ParsedLine {
120 ParsedLine {
121 kind,
122 tag_defs,
123 tag_refs,
124 }
125}
126
/// Returns `true` when `line` contains a modeline marker (`vim:`, `vi:` or
/// `ex:`) together with an option selecting the `help` filetype.
///
/// Both checks are plain substring searches on the trimmed line.
fn is_modeline(line: &str) -> bool {
    const MARKERS: [&str; 3] = ["vim:", "vi:", "ex:"];
    const HELP_FILETYPE: [&str; 2] = ["ft=help", "filetype=help"];

    let candidate = line.trim();
    let has_marker = MARKERS.iter().any(|&marker| candidate.contains(marker));
    let selects_help = HELP_FILETYPE.iter().any(|&option| candidate.contains(option));
    has_marker && selects_help
}
132
/// Returns `true` when `s` is `>` immediately followed by a non-empty language
/// name made only of ASCII letters, ASCII digits, `-`, `+` or `_`.
fn is_fence_start(s: &str) -> bool {
    match s.strip_prefix('>') {
        Some(lang) if !lang.is_empty() => lang
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || matches!(c, '-' | '+' | '_')),
        _ => false,
    }
}
142
143#[allow(clippy::similar_names)]
144fn scan_inline(line_num: u32, raw: &str) -> (Vec<Span>, Vec<Span>) {
145 let mut tag_defs = Vec::new();
146 let mut tag_refs = Vec::new();
147 let bytes = raw.as_bytes();
148 let len = bytes.len();
149 let mut i = 0;
150
151 while i < len {
152 match bytes[i] {
153 b'*' => {
154 let at_boundary = i == 0 || matches!(bytes[i - 1], b' ' | b'\t');
155 if at_boundary {
156 if let Some((name, end)) = scan_delimited(raw, i + 1, b'*') {
157 tag_defs.push(make_span(raw, line_num, i, end, name));
158 i = end;
159 } else {
160 i += 1;
161 }
162 } else {
163 i += 1;
164 }
165 }
166 b'|' => {
167 let at_boundary =
168 i == 0 || matches!(bytes[i - 1], b' ' | b'\t' | b'(' | b'[' | b'|');
169 if at_boundary {
170 if let Some((name, end)) = scan_delimited(raw, i + 1, b'|') {
171 tag_refs.push(make_span(raw, line_num, i, end, name));
172 i = end;
173 } else {
174 i += 1;
175 }
176 } else {
177 i += 1;
178 }
179 }
180 b'`' => {
181 let mut j = i + 1;
182 while j < len && bytes[j] != b'`' {
183 j += 1;
184 }
185 i = j + 1;
186 }
187 _ => {
188 i += 1;
189 }
190 }
191 }
192
193 (tag_defs, tag_refs)
194}
195
/// Converts a byte offset into `s` to the equivalent offset in UTF-16 code units.
///
/// Only characters that end at or before `byte_pos` are counted. As a result,
/// an offset past the end of `s` yields the UTF-16 length of the whole string,
/// and an offset inside a multi-byte character is rounded down to the start of
/// that character; neither case panics.
// The sum is bounded by the line length, which is assumed to fit in `u32`.
#[allow(clippy::cast_possible_truncation)]
pub(crate) fn byte_offset_to_utf16(s: &str, byte_pos: usize) -> u32 {
    let units: usize = s
        .char_indices()
        .take_while(|&(idx, ch)| idx + ch.len_utf8() <= byte_pos)
        .map(|(_, ch)| ch.len_utf16())
        .sum();
    units as u32
}
200
201#[allow(clippy::cast_possible_truncation)]
202fn make_span(raw: &str, line_num: u32, start: usize, end: usize, name: String) -> Span {
203 Span {
204 name,
205 range: Range {
206 start: Position {
207 line: line_num,
208 character: byte_offset_to_utf16(raw, start),
209 },
210 end: Position {
211 line: line_num,
212 character: byte_offset_to_utf16(raw, end),
213 },
214 },
215 }
216}
217
/// Reads a delimited name that begins at byte `start` of `raw`.
///
/// Returns the name together with the byte offset just past the closing
/// `delim`. Returns `None` when the name would be empty, when a space or tab
/// appears before the closing delimiter, or when the line ends first.
fn scan_delimited(raw: &str, start: usize, delim: u8) -> Option<(String, usize)> {
    let tail = raw.as_bytes().get(start..)?;
    // First byte that could terminate the name: the delimiter or whitespace.
    let stop = tail
        .iter()
        .position(|&b| b == delim || matches!(b, b' ' | b'\t'))?;
    if stop == 0 || tail[stop] != delim {
        return None;
    }
    let close = start + stop;
    Some((raw[start..close].to_string(), close + 1))
}
235
// Unit tests for the line parser: tag scanning, separators, code blocks,
// UTF-16 column conversion and list-item classification.
#[cfg(test)]
mod tests {
    use super::*;

    // --- Basic tag definitions and references ---

    #[test]
    fn detects_tag_defs() {
        let doc = Document::parse("*my-tag* some text");
        assert_eq!(doc.tag_defs().count(), 1);
        assert_eq!(doc.tag_defs().next().unwrap().name, "my-tag");
    }

    #[test]
    fn detects_tag_refs() {
        let doc = Document::parse("see |my-tag| for details");
        assert_eq!(doc.tag_refs().count(), 1);
        assert_eq!(doc.tag_refs().next().unwrap().name, "my-tag");
    }

    // --- Separator lines ---

    #[test]
    fn detects_major_separator() {
        let doc = Document::parse(&"=".repeat(78));
        assert_eq!(doc.lines[0].kind, LineKind::Separator(SepKind::Major));
    }

    #[test]
    fn detects_minor_separator() {
        let doc = Document::parse(&"-".repeat(78));
        assert_eq!(doc.lines[0].kind, LineKind::Separator(SepKind::Minor));
    }

    // --- Code blocks opened by a trailing `>` ---

    #[test]
    fn code_block_body_is_verbatim() {
        let text = "example >\n code line\n another\n<\nnormal";
        let doc = Document::parse(text);
        assert_eq!(doc.lines[1].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[2].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[4].kind, LineKind::Text);
    }

    // No `<` is present here: the unindented line alone closes the block.
    #[test]
    fn unindented_line_ends_code_block() {
        let text = "example >\n code\n\nnormal";
        let doc = Document::parse(text);
        assert_eq!(doc.lines[1].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[2].kind, LineKind::Blank);
        assert_eq!(doc.lines[3].kind, LineKind::Text);
    }

    // A blank line inside a block is reported as Blank but the block continues.
    #[test]
    fn blank_does_not_end_code_block() {
        let text = "example >\n code\n\n more code\n<\nnormal";
        let doc = Document::parse(text);
        assert_eq!(doc.lines[1].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[2].kind, LineKind::Blank);
        assert_eq!(doc.lines[3].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[4].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[5].kind, LineKind::Text);
    }

    #[test]
    fn pipe_in_code_block_after_blank_not_scanned() {
        let text = "example >\n\n code with |pipe|\n<";
        let doc = Document::parse(text);
        assert_eq!(doc.tag_refs().count(), 0);
    }

    // --- Tag-reference boundary rules: what may precede the opening `|` ---

    #[test]
    fn pipe_mid_word_not_scanned_as_taglink() {
        let doc = Document::parse("string|fun()|nil");
        assert_eq!(doc.tag_refs().count(), 0);
    }

    #[test]
    fn pipe_after_comma_not_scanned_as_taglink() {
        let doc = Document::parse("value '+,-,+,|,+,-,+,|'");
        assert_eq!(doc.tag_refs().count(), 0);
    }

    #[test]
    fn pipe_after_backslash_not_scanned_as_taglink() {
        let doc = Document::parse(r"pattern \|alternative\|");
        assert_eq!(doc.tag_refs().count(), 0);
    }

    #[test]
    fn pipe_after_open_paren_is_taglink() {
        let doc = Document::parse("(see |my-tag|)");
        assert_eq!(doc.tag_refs().count(), 1);
        assert_eq!(doc.tag_refs().next().unwrap().name, "my-tag");
    }

    #[test]
    fn pipe_at_line_start_is_taglink() {
        let doc = Document::parse("|my-tag| description");
        assert_eq!(doc.tag_refs().count(), 1);
    }

    // --- Tag-definition false positives: asterisks that are not tags ---

    #[test]
    fn no_tag_with_space() {
        let doc = Document::parse("* not a tag *");
        assert_eq!(doc.tag_defs().count(), 0);
    }

    #[test]
    fn inline_glob_not_tag_def() {
        let doc = Document::parse("set wildignore=*.o,*.obj");
        assert_eq!(doc.tag_defs().count(), 0);
    }

    #[test]
    fn quoted_glob_not_tag_def() {
        let doc = Document::parse(r#"the patterns "*printcap*", or "*termcap*""#);
        assert_eq!(doc.tag_defs().count(), 0);
    }

    #[test]
    fn path_pattern_not_tag_def() {
        let doc = Document::parse(r#"located in "pack/*/start/*" dirs"#);
        assert_eq!(doc.tag_defs().count(), 0);
    }

    #[test]
    fn printf_format_not_tag_def() {
        let doc = Document::parse(r#"echo printf("%1$*2$.*3$d", 1, 2, 3)"#);
        assert_eq!(doc.tag_defs().count(), 0);
    }

    // --- UTF-16 column conversion ---

    // Each of the three CJK characters is one UTF-16 unit, so the tag starts
    // at column 4 (three characters plus the space).
    #[test]
    fn utf16_multibyte_before_tag_def() {
        let doc = Document::parse("日本語 *foo*");
        let span = doc.tag_defs().next().unwrap();
        assert_eq!(span.range.start.character, 4);
        assert_eq!(span.range.end.character, 9);
    }

    // The musical symbol needs a surrogate pair (two UTF-16 units), so the
    // reference starts at column 3.
    #[test]
    fn utf16_supplementary_plane_before_tag_ref() {
        let doc = Document::parse("𝄞 |bar|");
        let span = doc.tag_refs().next().unwrap();
        assert_eq!(span.range.start.character, 3);
        assert_eq!(span.range.end.character, 8);
    }

    // --- Code blocks opened by a `>lang` fence line ---

    // The fence line and the closing `<` line are both classified as CodeBody.
    #[test]
    fn code_fence_language_is_code_body() {
        let text = "prose\n>lua\n code()\n<\nafter";
        let doc = Document::parse(text);
        assert_eq!(doc.lines[1].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[2].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[3].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[4].kind, LineKind::Text);
    }

    #[test]
    fn code_fence_language_no_tags_scanned() {
        let text = ">vim\n *not-a-tag*\n<";
        let doc = Document::parse(text);
        assert_eq!(doc.tag_defs().count(), 0);
    }

    #[test]
    fn code_fence_with_digits_is_recognized() {
        let text = ">lua54\n vim.fn.input()\n<\nafter";
        let doc = Document::parse(text);
        assert_eq!(doc.lines[0].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[1].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[3].kind, LineKind::Text);
    }

    #[test]
    fn code_fence_with_plus_is_recognized() {
        let text = ">c++\n int x = 0;\n<\nafter";
        let doc = Document::parse(text);
        assert_eq!(doc.lines[0].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[1].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[3].kind, LineKind::Text);
    }

    #[test]
    fn code_fence_with_hyphen_is_recognized() {
        let text = ">objective-c\n [obj message];\n<\nafter";
        let doc = Document::parse(text);
        assert_eq!(doc.lines[0].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[1].kind, LineKind::CodeBody);
        assert_eq!(doc.lines[3].kind, LineKind::Text);
    }

    // For pure ASCII input the UTF-16 columns equal the byte offsets.
    #[test]
    fn utf16_ascii_unaffected() {
        let doc = Document::parse("hello *baz*");
        let span = doc.tag_defs().next().unwrap();
        assert_eq!(span.range.start.character, 6);
        assert_eq!(span.range.end.character, 11);
    }

    // --- List-item classification ---

    #[test]
    fn dash_list_item_is_list_item() {
        let doc = Document::parse("- item text");
        assert_eq!(doc.lines[0].kind, LineKind::ListItem);
    }

    #[test]
    fn asterisk_list_item_is_list_item() {
        let doc = Document::parse("* item text");
        assert_eq!(doc.lines[0].kind, LineKind::ListItem);
        assert_eq!(doc.tag_defs().count(), 0);
    }

    #[test]
    fn tag_def_not_mistaken_for_list_item() {
        let doc = Document::parse("*my-tag* some text");
        assert_eq!(doc.lines[0].kind, LineKind::Text);
        assert_eq!(doc.tag_defs().count(), 1);
    }

    #[test]
    fn separator_not_mistaken_for_list_item() {
        let doc = Document::parse(&"-".repeat(78));
        assert_eq!(doc.lines[0].kind, LineKind::Separator(SepKind::Minor));
    }

    #[test]
    fn ordered_list_item_is_list_item() {
        let doc = Document::parse("1. First item");
        assert_eq!(doc.lines[0].kind, LineKind::ListItem);
    }

    #[test]
    fn multi_digit_ordered_item_is_list_item() {
        let doc = Document::parse("42. Forty-second item");
        assert_eq!(doc.lines[0].kind, LineKind::ListItem);
    }

    // Digits followed by `.` but no space do not form a numbered list entry.
    #[test]
    fn version_number_not_list_item() {
        let doc = Document::parse("3.14 is approximately pi");
        assert_eq!(doc.lines[0].kind, LineKind::Text);
    }

    // A numbered line that also defines a tag is classified as Text.
    #[test]
    fn numbered_item_with_tag_not_list_item() {
        let doc = Document::parse("1. Introduction\t\t\t*intro*");
        assert_eq!(doc.lines[0].kind, LineKind::Text);
        assert_eq!(doc.tag_defs().count(), 1);
    }
}