1use crate::ast::{Inline, ShortArgs};
2use crate::diag::{Code, Diagnostic};
3use crate::shortcode::ArgValue;
4use crate::span::Span;
5use std::collections::BTreeMap;
6
7pub fn parse_inline(line: &str, base: u32) -> (Vec<Inline>, Vec<Diagnostic>) {
8 let mut p = Parser {
9 src: line,
10 base,
11 pos: 0,
12 diags: Vec::new(),
13 };
14 let nodes = p.parse_until(None);
15 (nodes, p.diags)
16}
17
18struct Parser<'a> {
19 src: &'a str,
20 base: u32,
21 pos: usize,
22 diags: Vec<Diagnostic>,
23}
24
25impl<'a> Parser<'a> {
26 fn span(&self, start: usize, len: usize) -> Span {
27 Span::new(self.base as usize + start, len)
28 }
29
30 fn peek(&self) -> Option<u8> {
31 self.src.as_bytes().get(self.pos).copied()
32 }
33
34 fn parse_until(&mut self, terminator: Option<u8>) -> Vec<Inline> {
35 let mut out: Vec<Inline> = Vec::new();
36 let mut text_start = self.pos;
37
38 while let Some(c) = self.peek() {
39 if Some(c) == terminator {
40 break;
41 }
42 match c {
43 b'\\' => {
44 self.flush_text(&mut out, text_start);
45 if let Some(esc_char) = self.src[self.pos + 1..].chars().next() {
46 let w = esc_char.len_utf8();
47 let s = self.span(self.pos, 1 + w);
48 out.push(Inline::Text {
49 value: esc_char.to_string(),
50 span: s,
51 });
52 self.pos += 1 + w;
53 } else {
54 self.pos += 1;
55 }
56 text_start = self.pos;
57 }
58 b'`' => {
59 self.flush_text(&mut out, text_start);
60 self.parse_code(&mut out);
61 text_start = self.pos;
62 }
63 b'@' => {
64 self.flush_text(&mut out, text_start);
65 if !self.try_parse_shortcode(&mut out) {
66 out.push(Inline::Text {
67 value: "@".to_string(),
68 span: self.span(self.pos, 1),
69 });
70 self.pos += 1;
71 }
72 text_start = self.pos;
73 }
74 b'*' | b'_' | b'+' | b'~' if self.is_open_marker() => {
75 self.flush_text(&mut out, text_start);
76 self.parse_emph(&mut out, c);
77 text_start = self.pos;
78 }
79 _ => {
80 let w = self.src[self.pos..]
84 .chars()
85 .next()
86 .map_or(1, |c| c.len_utf8());
87 self.pos += w;
88 }
89 }
90 }
91 self.flush_text(&mut out, text_start);
92 out
93 }
94
95 fn flush_text(&self, out: &mut Vec<Inline>, start: usize) {
96 if start < self.pos {
97 let value = self.src[start..self.pos].to_string();
98 out.push(Inline::Text {
99 value,
100 span: self.span(start, self.pos - start),
101 });
102 }
103 }
104
105 fn is_open_marker(&self) -> bool {
106 is_open_marker_at(self.src.as_bytes(), self.pos)
107 }
108
109 fn is_close_marker(&self, marker: u8) -> bool {
110 let bytes = self.src.as_bytes();
111 let pos = self.pos;
112 if bytes.get(pos) != Some(&marker) {
113 return false;
114 }
115 if bytes.get(pos + 1) == Some(&marker) {
116 return false;
117 }
118 let prev = if pos == 0 { None } else { Some(bytes[pos - 1]) };
119 let next = bytes.get(pos + 1).copied();
120 let prev_ok = matches!(prev, Some(b) if b != b' ');
121 let next_ok = match next {
122 None => true,
123 Some(b' ') => true,
124 Some(b) => is_inline_sigil(b) || is_punct(b),
125 };
126 prev_ok && next_ok
127 }
128
129 fn parse_emph(&mut self, out: &mut Vec<Inline>, marker: u8) {
130 let start = self.pos;
131 self.pos += 1;
132 let inner_start = self.pos;
133 let mut content: Vec<Inline> = Vec::new();
134 let mut text_start = inner_start;
135 let mut closed = false;
136
137 while let Some(c) = self.peek() {
138 if c == marker && self.is_close_marker(marker) {
139 if text_start < self.pos {
140 content.push(Inline::Text {
141 value: self.src[text_start..self.pos].to_string(),
142 span: self.span(text_start, self.pos - text_start),
143 });
144 }
145 self.pos += 1;
146 closed = true;
147 break;
148 }
149 if c == marker {
150 self.diags.push(
151 Diagnostic::new(Code::EmphasisSameMarker, self.span(self.pos, 1))
152 .label("inner emphasis re-uses the same marker")
153 .help("use a different emphasis marker for the inner span"),
154 );
155 self.pos += 1;
156 continue;
157 }
158 match c {
159 b'\\' => {
160 if text_start < self.pos {
161 content.push(Inline::Text {
162 value: self.src[text_start..self.pos].to_string(),
163 span: self.span(text_start, self.pos - text_start),
164 });
165 }
166 if let Some(esc_char) = self.src[self.pos + 1..].chars().next() {
167 let w = esc_char.len_utf8();
168 content.push(Inline::Text {
169 value: esc_char.to_string(),
170 span: self.span(self.pos, 1 + w),
171 });
172 self.pos += 1 + w;
173 } else {
174 self.pos += 1;
175 }
176 text_start = self.pos;
177 }
178 b'`' => {
179 if text_start < self.pos {
180 content.push(Inline::Text {
181 value: self.src[text_start..self.pos].to_string(),
182 span: self.span(text_start, self.pos - text_start),
183 });
184 }
185 self.parse_code(&mut content);
186 text_start = self.pos;
187 }
188 b'@' => {
189 if text_start < self.pos {
190 content.push(Inline::Text {
191 value: self.src[text_start..self.pos].to_string(),
192 span: self.span(text_start, self.pos - text_start),
193 });
194 }
195 if !self.try_parse_shortcode(&mut content) {
196 content.push(Inline::Text {
197 value: "@".to_string(),
198 span: self.span(self.pos, 1),
199 });
200 self.pos += 1;
201 }
202 text_start = self.pos;
203 }
204 b'*' | b'_' | b'+' | b'~' if c != marker && self.is_open_marker() => {
205 if text_start < self.pos {
206 content.push(Inline::Text {
207 value: self.src[text_start..self.pos].to_string(),
208 span: self.span(text_start, self.pos - text_start),
209 });
210 }
211 self.parse_emph(&mut content, c);
212 text_start = self.pos;
213 }
214 _ => {
215 let w = self.src[self.pos..]
216 .chars()
217 .next()
218 .map_or(1, |c| c.len_utf8());
219 self.pos += w;
220 }
221 }
222 }
223 if !closed {
224 self.diags.push(
225 Diagnostic::new(Code::UnterminatedEmph, self.span(start, 1))
226 .label(format!("opened with `{}`", marker as char)),
227 );
228 }
229 let span = self.span(start, self.pos - start);
230 let node = match marker {
231 b'*' => Inline::Bold { content, span },
232 b'_' => Inline::Italic { content, span },
233 b'+' => Inline::Underline { content, span },
234 b'~' => Inline::Strike { content, span },
235 _ => unreachable!(),
236 };
237 out.push(node);
238 }
239
240 fn parse_code(&mut self, out: &mut Vec<Inline>) {
241 let start = self.pos;
242 let mut ticks = 0;
243 while self.peek() == Some(b'`') && ticks < 2 {
244 self.pos += 1;
245 ticks += 1;
246 }
247 if self.peek() == Some(b'`') {
248 out.push(Inline::Text {
249 value: self.src[start..self.pos].to_string(),
250 span: self.span(start, self.pos - start),
251 });
252 return;
253 }
254 let body_start = self.pos;
255 let needle = if ticks == 1 {
256 "`".to_string()
257 } else {
258 "``".to_string()
259 };
260 let rest = &self.src[body_start..];
261 if let Some(rel) = rest.find(&needle) {
262 let body = &self.src[body_start..body_start + rel];
263 self.pos = body_start + rel + needle.len();
264 out.push(Inline::InlineCode {
265 value: body.to_string(),
266 span: self.span(start, self.pos - start),
267 });
268 } else {
269 self.diags.push(Diagnostic::new(
270 Code::UnterminatedCode,
271 self.span(start, ticks),
272 ));
273 out.push(Inline::Text {
274 value: self.src[start..self.pos].to_string(),
275 span: self.span(start, self.pos - start),
276 });
277 }
278 }
279
280 fn try_parse_shortcode(&mut self, out: &mut Vec<Inline>) -> bool {
281 let saved = self.pos;
282 if self.peek() != Some(b'@') {
283 return false;
284 }
285 let mut cursor = self.pos + 1;
286 let bytes = self.src.as_bytes();
287 if bytes
288 .get(cursor)
289 .map(|b| !b.is_ascii_alphabetic())
290 .unwrap_or(true)
291 {
292 return false;
293 }
294 let name_start = cursor;
295 while let Some(&b) = bytes.get(cursor) {
296 if b.is_ascii_alphanumeric() || b == b'-' {
297 cursor += 1;
298 } else {
299 break;
300 }
301 }
302 let name = self.src[name_start..cursor].to_string();
303 let mut args = ShortArgs::default();
304 if bytes.get(cursor) == Some(&b'(') {
305 match parse_args(self.src, &mut cursor) {
306 Ok(a) => args = a,
307 Err(d) => {
308 self.diags.push(d.label("in inline shortcode"));
309 self.pos = cursor;
310 out.push(Inline::Text {
311 value: self.src[saved..self.pos].to_string(),
312 span: self.span(saved, self.pos - saved),
313 });
314 return true;
315 }
316 }
317 }
318 self.pos = cursor;
319 let mut content = None;
320 if self.peek() == Some(b'[') {
321 self.pos += 1;
322 let inner = self.parse_until(Some(b']'));
323 if self.peek() == Some(b']') {
324 self.pos += 1;
325 }
326 content = Some(inner);
327
328 if self.peek() == Some(b'(') {
332 self.pos += 1;
333 let url_start = self.pos;
334 while let Some(b) = self.peek() {
335 if b == b')' {
336 break;
337 }
338 self.pos += 1;
339 }
340 let url = self.src[url_start..self.pos].to_string();
341 if self.peek() == Some(b')') {
342 self.pos += 1;
343 }
344 args.positional.push(ArgValue::Str(url));
345 }
346 }
347 let span = self.span(saved, self.pos - saved);
348 out.push(Inline::Shortcode {
349 name,
350 args,
351 content,
352 span,
353 });
354 true
355 }
356}
357
/// Whether `b` is one of the bytes with special meaning in inline markup:
/// the four emphasis markers, the backtick, `@`, `[` or `]`.
pub(crate) fn is_inline_sigil(b: u8) -> bool {
    b"*_+~`@[]".contains(&b)
}
361
/// Whether `b` is one of the ASCII punctuation bytes recognised around
/// emphasis markers: `. , ; : ! ? ) ( " ' - /`.
pub(crate) fn is_punct(b: u8) -> bool {
    b".,;:!?)(\"'-/".contains(&b)
}
368
369pub(crate) fn is_open_marker_at(bytes: &[u8], pos: usize) -> bool {
374 let marker = match bytes.get(pos) {
375 Some(&b @ (b'*' | b'_' | b'+' | b'~')) => b,
376 _ => return false,
377 };
378 let prev = if pos == 0 { None } else { Some(bytes[pos - 1]) };
379 let next = bytes.get(pos + 1).copied();
380 if next == Some(marker) || prev == Some(marker) {
381 return false;
382 }
383 let prev_ok = match prev {
384 None => true,
385 Some(b' ') => true,
386 Some(b) if is_inline_sigil(b) => true,
387 Some(b) if is_punct(b) => true,
388 _ => false,
389 };
390 let next_ok = matches!(next, Some(b) if b != b' ' && b != marker);
391 prev_ok && next_ok
392}
393
394pub fn parse_args(src: &str, cursor: &mut usize) -> Result<ShortArgs, Diagnostic> {
395 let bytes = src.as_bytes();
396 if bytes.get(*cursor) != Some(&b'(') {
397 return Ok(ShortArgs::default());
398 }
399 *cursor += 1;
400 let mut args = ShortArgs::default();
401 let mut keys_seen: BTreeMap<String, ()> = BTreeMap::new();
402 skip_ws(src, cursor);
403 if bytes.get(*cursor) == Some(&b')') {
404 *cursor += 1;
405 return Ok(args);
406 }
407 loop {
408 skip_ws(src, cursor);
409 let arg_start = *cursor;
410 let saved = *cursor;
411 if let Some(name) = read_ident(src, cursor) {
412 skip_ws(src, cursor);
413 if bytes.get(*cursor) == Some(&b':') {
414 *cursor += 1;
415 skip_ws(src, cursor);
416 let v = read_value(src, cursor)
417 .ok_or_else(|| Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1)))?;
418 if keys_seen.insert(name.clone(), ()).is_some() {
419 return Err(Diagnostic::new(
420 Code::DuplicateKwarg,
421 Span::new(arg_start, name.len()),
422 ));
423 }
424 args.keyword.insert(name, v);
425 } else {
426 *cursor = saved;
427 let v = read_value(src, cursor)
428 .ok_or_else(|| Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1)))?;
429 args.positional.push(v);
430 }
431 } else {
432 let v = read_value(src, cursor)
433 .ok_or_else(|| Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1)))?;
434 args.positional.push(v);
435 }
436 skip_ws(src, cursor);
437 match bytes.get(*cursor) {
438 Some(&b',') => {
439 *cursor += 1;
440 continue;
441 }
442 Some(&b')') => {
443 *cursor += 1;
444 break;
445 }
446 _ => return Err(Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1))),
447 }
448 }
449 Ok(args)
450}
451
/// Advances `*cursor` past any run of ASCII space bytes (`' '` only).
/// A cursor beyond the end of `src` is left unchanged.
fn skip_ws(src: &str, cursor: &mut usize) {
    let spaces = src
        .as_bytes()
        .get(*cursor..)
        .map_or(0, |rest| rest.iter().take_while(|&&b| b == b' ').count());
    *cursor += spaces;
}
457
/// Reads an identifier at `*cursor`: an ASCII letter followed by any number
/// of ASCII letters, digits, `-` or `_`.
///
/// On success the cursor is moved past the identifier and its text is
/// returned. Otherwise `None` is returned and the cursor is left unchanged.
fn read_ident(src: &str, cursor: &mut usize) -> Option<String> {
    let start = *cursor;
    let tail = src.as_bytes().get(start..)?;
    let (first, rest) = tail.split_first()?;
    if !first.is_ascii_alphabetic() {
        return None;
    }
    let continuation = rest
        .iter()
        .take_while(|b| b.is_ascii_alphanumeric() || matches!(**b, b'-' | b'_'))
        .count();
    let end = start + 1 + continuation;
    *cursor = end;
    Some(src[start..end].to_string())
}
476
477fn read_value(src: &str, cursor: &mut usize) -> Option<ArgValue> {
478 skip_ws(src, cursor);
479 let bytes = src.as_bytes();
480 let start = *cursor;
481 match bytes.get(start)? {
482 b'"' => {
483 *cursor += 1;
484 let mut s = String::new();
485 while *cursor < bytes.len() {
486 let b = bytes[*cursor];
487 if b == b'"' {
488 *cursor += 1;
489 return Some(ArgValue::Str(s));
490 }
491 if b == b'\\' {
492 if let Some(c) = src[*cursor + 1..].chars().next() {
493 s.push(c);
494 *cursor += 1 + c.len_utf8();
495 continue;
496 }
497 s.push('\\');
499 *cursor += 1;
500 continue;
501 }
502 let c = src[*cursor..].chars().next().expect("cursor < len");
505 s.push(c);
506 *cursor += c.len_utf8();
507 }
508 None
509 }
510 b'[' => {
511 *cursor += 1;
512 let mut arr: Vec<ArgValue> = Vec::new();
513 skip_ws(src, cursor);
514 if bytes.get(*cursor) == Some(&b']') {
515 *cursor += 1;
516 return Some(ArgValue::Array(arr));
517 }
518 loop {
519 let v = read_value(src, cursor)?;
520 arr.push(v);
521 skip_ws(src, cursor);
522 match bytes.get(*cursor) {
523 Some(&b',') => {
524 *cursor += 1;
525 skip_ws(src, cursor);
526 }
527 Some(&b']') => {
528 *cursor += 1;
529 return Some(ArgValue::Array(arr));
530 }
531 _ => return None,
532 }
533 }
534 }
535 c if c.is_ascii_digit() || *c == b'-' => {
536 let mut end = start;
537 if bytes[end] == b'-' {
538 end += 1;
539 }
540 while let Some(&b) = bytes.get(end) {
541 if b.is_ascii_digit() {
542 end += 1;
543 } else {
544 break;
545 }
546 }
547 let n: i64 = src[start..end].parse().ok()?;
548 *cursor = end;
549 Some(ArgValue::Int(n))
550 }
551 c if c.is_ascii_alphabetic() => {
552 let id = read_ident(src, cursor)?;
553 Some(ArgValue::Ident(id))
554 }
555 _ => None,
556 }
557}
558
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `s` as a line that starts at offset 0.
    fn parse(s: &str) -> (Vec<Inline>, Vec<Diagnostic>) {
        parse_inline(s, 0)
    }

    /// Concatenates the values of all top-level `Text` nodes, ignoring every
    /// other node kind.
    fn top_level_text(nodes: &[Inline]) -> String {
        let mut joined = String::new();
        for node in nodes {
            if let Inline::Text { value, .. } = node {
                joined.push_str(value);
            }
        }
        joined
    }

    #[test]
    fn plain_text() {
        let (nodes, diags) = parse("hello world");
        assert!(diags.is_empty());
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            Inline::Text { value, .. } => assert_eq!(value, "hello world"),
            _ => panic!(),
        }
    }

    #[test]
    fn bold() {
        let (nodes, diags) = parse("a *bold* b");
        assert!(diags.is_empty(), "{:?}", diags);
        assert!(matches!(&nodes[1], Inline::Bold { .. }));
    }

    #[test]
    fn snake_case_is_literal() {
        let (nodes, diags) = parse("snake_case_name");
        assert!(diags.is_empty());
        assert_eq!(nodes.len(), 1);
        assert!(matches!(&nodes[0], Inline::Text { .. }));
    }

    #[test]
    fn nested_same_marker_errors() {
        let (_, diags) = parse("*outer *inner* outer*");
        let reported = diags
            .iter()
            .any(|diag| diag.code == Code::EmphasisSameMarker);
        assert!(reported);
    }

    #[test]
    fn inline_code() {
        let (nodes, diags) = parse("use `printf` here");
        assert!(diags.is_empty());
        assert!(matches!(&nodes[1], Inline::InlineCode { .. }));
    }

    #[test]
    fn double_backtick_code_with_backtick() {
        let (nodes, diags) = parse("``a ` b``");
        assert!(diags.is_empty());
        match &nodes[0] {
            Inline::InlineCode { value, .. } => assert_eq!(value, "a ` b"),
            _ => panic!(),
        }
    }

    #[test]
    fn shortcode_inline() {
        let (nodes, diags) = parse("see @link[here](https://x)");
        assert!(diags.is_empty(), "{:?}", diags);
        assert!(matches!(nodes.last().unwrap(), Inline::Shortcode { .. }));
    }

    #[test]
    fn escape_emphasis() {
        let (nodes, diags) = parse(r"\*literal\*");
        assert!(diags.is_empty());
        assert_eq!(top_level_text(&nodes), "*literal*");
    }

    #[test]
    fn double_marker_not_emphasis() {
        let (nodes, _diags) = parse("**no**");
        assert!(!matches!(&nodes[0], Inline::Bold { .. }));
    }

    #[test]
    fn escape_before_multibyte_char() {
        let (nodes, diags) = parse("a \\é b");
        assert!(diags.is_empty(), "{:?}", diags);
        assert_eq!(top_level_text(&nodes), "a é b");
    }

    #[test]
    fn multibyte_text_then_emph() {
        let (nodes, diags) = parse("日本 *bold*");
        assert!(diags.is_empty(), "{:?}", diags);
        assert!(matches!(nodes.last().unwrap(), Inline::Bold { .. }));
    }

    #[test]
    fn arg_string_preserves_multibyte() {
        let source = "(label: \"日本 🦀\")";
        let mut cursor = 0usize;
        let args = parse_args(source, &mut cursor).unwrap();
        match args.keyword.get("label").unwrap() {
            ArgValue::Str(v) => assert_eq!(v, "日本 🦀"),
            _ => panic!(),
        }
    }

    #[test]
    fn escape_at_end_of_input() {
        // Only checks that a trailing backslash does not panic.
        let (_nodes, _diags) = parse("trailing\\");
    }
}