Skip to main content

brief/
inline.rs

1use crate::ast::{Inline, ShortArgs};
2use crate::diag::{Code, Diagnostic};
3use crate::shortcode::ArgValue;
4use crate::span::Span;
5use std::collections::BTreeMap;
6
7pub fn parse_inline(line: &str, base: u32) -> (Vec<Inline>, Vec<Diagnostic>) {
8    let mut p = Parser {
9        src: line,
10        base,
11        pos: 0,
12        diags: Vec::new(),
13    };
14    let nodes = p.parse_until(None);
15    (nodes, p.diags)
16}
17
18struct Parser<'a> {
19    src: &'a str,
20    base: u32,
21    pos: usize,
22    diags: Vec<Diagnostic>,
23}
24
25impl<'a> Parser<'a> {
26    fn span(&self, start: usize, len: usize) -> Span {
27        Span::new(self.base as usize + start, len)
28    }
29
30    fn peek(&self) -> Option<u8> {
31        self.src.as_bytes().get(self.pos).copied()
32    }
33
34    fn parse_until(&mut self, terminator: Option<u8>) -> Vec<Inline> {
35        let mut out: Vec<Inline> = Vec::new();
36        let mut text_start = self.pos;
37
38        while let Some(c) = self.peek() {
39            if Some(c) == terminator {
40                break;
41            }
42            match c {
43                b'\\' => {
44                    self.flush_text(&mut out, text_start);
45                    if let Some(esc_char) = self.src[self.pos + 1..].chars().next() {
46                        let w = esc_char.len_utf8();
47                        let s = self.span(self.pos, 1 + w);
48                        out.push(Inline::Text {
49                            value: esc_char.to_string(),
50                            span: s,
51                        });
52                        self.pos += 1 + w;
53                    } else {
54                        self.pos += 1;
55                    }
56                    text_start = self.pos;
57                }
58                b'`' => {
59                    self.flush_text(&mut out, text_start);
60                    self.parse_code(&mut out);
61                    text_start = self.pos;
62                }
63                b'@' => {
64                    self.flush_text(&mut out, text_start);
65                    if !self.try_parse_shortcode(&mut out) {
66                        out.push(Inline::Text {
67                            value: "@".to_string(),
68                            span: self.span(self.pos, 1),
69                        });
70                        self.pos += 1;
71                    }
72                    text_start = self.pos;
73                }
74                b'*' | b'_' | b'+' | b'~' if self.is_open_marker() => {
75                    self.flush_text(&mut out, text_start);
76                    self.parse_emph(&mut out, c);
77                    text_start = self.pos;
78                }
79                _ => {
80                    // Advance by full UTF-8 char width so `pos` stays on
81                    // a char boundary; otherwise a later sigil would slice
82                    // through a multibyte sequence.
83                    let w = self.src[self.pos..]
84                        .chars()
85                        .next()
86                        .map_or(1, |c| c.len_utf8());
87                    self.pos += w;
88                }
89            }
90        }
91        self.flush_text(&mut out, text_start);
92        out
93    }
94
95    fn flush_text(&self, out: &mut Vec<Inline>, start: usize) {
96        if start < self.pos {
97            let value = self.src[start..self.pos].to_string();
98            out.push(Inline::Text {
99                value,
100                span: self.span(start, self.pos - start),
101            });
102        }
103    }
104
105    fn is_open_marker(&self) -> bool {
106        is_open_marker_at(self.src.as_bytes(), self.pos)
107    }
108
109    fn is_close_marker(&self, marker: u8) -> bool {
110        let bytes = self.src.as_bytes();
111        let pos = self.pos;
112        if bytes.get(pos) != Some(&marker) {
113            return false;
114        }
115        if bytes.get(pos + 1) == Some(&marker) {
116            return false;
117        }
118        let prev = if pos == 0 { None } else { Some(bytes[pos - 1]) };
119        let next = bytes.get(pos + 1).copied();
120        let prev_ok = matches!(prev, Some(b) if b != b' ');
121        let next_ok = match next {
122            None => true,
123            Some(b' ') => true,
124            Some(b) => is_inline_sigil(b) || is_punct(b),
125        };
126        prev_ok && next_ok
127    }
128
129    fn parse_emph(&mut self, out: &mut Vec<Inline>, marker: u8) {
130        let start = self.pos;
131        self.pos += 1;
132        let inner_start = self.pos;
133        let mut content: Vec<Inline> = Vec::new();
134        let mut text_start = inner_start;
135        let mut closed = false;
136
137        while let Some(c) = self.peek() {
138            if c == marker && self.is_close_marker(marker) {
139                if text_start < self.pos {
140                    content.push(Inline::Text {
141                        value: self.src[text_start..self.pos].to_string(),
142                        span: self.span(text_start, self.pos - text_start),
143                    });
144                }
145                self.pos += 1;
146                closed = true;
147                break;
148            }
149            if c == marker {
150                self.diags.push(
151                    Diagnostic::new(Code::EmphasisSameMarker, self.span(self.pos, 1))
152                        .label("inner emphasis re-uses the same marker")
153                        .help("use a different emphasis marker for the inner span"),
154                );
155                self.pos += 1;
156                continue;
157            }
158            match c {
159                b'\\' => {
160                    if text_start < self.pos {
161                        content.push(Inline::Text {
162                            value: self.src[text_start..self.pos].to_string(),
163                            span: self.span(text_start, self.pos - text_start),
164                        });
165                    }
166                    if let Some(esc_char) = self.src[self.pos + 1..].chars().next() {
167                        let w = esc_char.len_utf8();
168                        content.push(Inline::Text {
169                            value: esc_char.to_string(),
170                            span: self.span(self.pos, 1 + w),
171                        });
172                        self.pos += 1 + w;
173                    } else {
174                        self.pos += 1;
175                    }
176                    text_start = self.pos;
177                }
178                b'`' => {
179                    if text_start < self.pos {
180                        content.push(Inline::Text {
181                            value: self.src[text_start..self.pos].to_string(),
182                            span: self.span(text_start, self.pos - text_start),
183                        });
184                    }
185                    self.parse_code(&mut content);
186                    text_start = self.pos;
187                }
188                b'@' => {
189                    if text_start < self.pos {
190                        content.push(Inline::Text {
191                            value: self.src[text_start..self.pos].to_string(),
192                            span: self.span(text_start, self.pos - text_start),
193                        });
194                    }
195                    if !self.try_parse_shortcode(&mut content) {
196                        content.push(Inline::Text {
197                            value: "@".to_string(),
198                            span: self.span(self.pos, 1),
199                        });
200                        self.pos += 1;
201                    }
202                    text_start = self.pos;
203                }
204                b'*' | b'_' | b'+' | b'~' if c != marker && self.is_open_marker() => {
205                    if text_start < self.pos {
206                        content.push(Inline::Text {
207                            value: self.src[text_start..self.pos].to_string(),
208                            span: self.span(text_start, self.pos - text_start),
209                        });
210                    }
211                    self.parse_emph(&mut content, c);
212                    text_start = self.pos;
213                }
214                _ => {
215                    let w = self.src[self.pos..]
216                        .chars()
217                        .next()
218                        .map_or(1, |c| c.len_utf8());
219                    self.pos += w;
220                }
221            }
222        }
223        if !closed {
224            self.diags.push(
225                Diagnostic::new(Code::UnterminatedEmph, self.span(start, 1))
226                    .label(format!("opened with `{}`", marker as char)),
227            );
228        }
229        let span = self.span(start, self.pos - start);
230        let node = match marker {
231            b'*' => Inline::Bold { content, span },
232            b'_' => Inline::Italic { content, span },
233            b'+' => Inline::Underline { content, span },
234            b'~' => Inline::Strike { content, span },
235            _ => unreachable!(),
236        };
237        out.push(node);
238    }
239
240    fn parse_code(&mut self, out: &mut Vec<Inline>) {
241        let start = self.pos;
242        let mut ticks = 0;
243        while self.peek() == Some(b'`') && ticks < 2 {
244            self.pos += 1;
245            ticks += 1;
246        }
247        if self.peek() == Some(b'`') {
248            out.push(Inline::Text {
249                value: self.src[start..self.pos].to_string(),
250                span: self.span(start, self.pos - start),
251            });
252            return;
253        }
254        let body_start = self.pos;
255        let needle = if ticks == 1 {
256            "`".to_string()
257        } else {
258            "``".to_string()
259        };
260        let rest = &self.src[body_start..];
261        if let Some(rel) = rest.find(&needle) {
262            let body = &self.src[body_start..body_start + rel];
263            self.pos = body_start + rel + needle.len();
264            out.push(Inline::InlineCode {
265                value: body.to_string(),
266                span: self.span(start, self.pos - start),
267            });
268        } else {
269            self.diags.push(Diagnostic::new(
270                Code::UnterminatedCode,
271                self.span(start, ticks),
272            ));
273            out.push(Inline::Text {
274                value: self.src[start..self.pos].to_string(),
275                span: self.span(start, self.pos - start),
276            });
277        }
278    }
279
280    fn try_parse_shortcode(&mut self, out: &mut Vec<Inline>) -> bool {
281        let saved = self.pos;
282        if self.peek() != Some(b'@') {
283            return false;
284        }
285        let mut cursor = self.pos + 1;
286        let bytes = self.src.as_bytes();
287        if bytes
288            .get(cursor)
289            .map(|b| !b.is_ascii_alphabetic())
290            .unwrap_or(true)
291        {
292            return false;
293        }
294        let name_start = cursor;
295        while let Some(&b) = bytes.get(cursor) {
296            if b.is_ascii_alphanumeric() || b == b'-' {
297                cursor += 1;
298            } else {
299                break;
300            }
301        }
302        let name = self.src[name_start..cursor].to_string();
303        let mut args = ShortArgs::default();
304        if bytes.get(cursor) == Some(&b'(') {
305            match parse_args(self.src, &mut cursor) {
306                Ok(a) => args = a,
307                Err(d) => {
308                    self.diags.push(d.label("in inline shortcode"));
309                    self.pos = cursor;
310                    out.push(Inline::Text {
311                        value: self.src[saved..self.pos].to_string(),
312                        span: self.span(saved, self.pos - saved),
313                    });
314                    return true;
315                }
316            }
317        }
318        self.pos = cursor;
319        let mut content = None;
320        if self.peek() == Some(b'[') {
321            self.pos += 1;
322            let inner = self.parse_until(Some(b']'));
323            if self.peek() == Some(b']') {
324                self.pos += 1;
325            }
326            content = Some(inner);
327
328            // Markdown-link sugar: `@name[text](url)` -- the trailing parens
329            // hold a single raw-string positional argument. This is the only
330            // place where a URL-like value escapes the strict arg grammar.
331            if self.peek() == Some(b'(') {
332                self.pos += 1;
333                let url_start = self.pos;
334                while let Some(b) = self.peek() {
335                    if b == b')' {
336                        break;
337                    }
338                    self.pos += 1;
339                }
340                let url = self.src[url_start..self.pos].to_string();
341                if self.peek() == Some(b')') {
342                    self.pos += 1;
343                }
344                args.positional.push(ArgValue::Str(url));
345            }
346        }
347        let span = self.span(saved, self.pos - saved);
348        out.push(Inline::Shortcode {
349            name,
350            args,
351            content,
352            span,
353        });
354        true
355    }
356}
357
/// `true` if `b` is one of the inline sigil bytes:
/// `*`, `_`, `+`, `~`, a backtick, `@`, `[`, or `]`.
pub(crate) fn is_inline_sigil(b: u8) -> bool {
    const SIGILS: &[u8] = b"*_+~`@[]";
    SIGILS.contains(&b)
}
361
/// `true` if `b` is one of the ASCII punctuation bytes
/// `. , ; : ! ? ) ( " ' - /`.
pub(crate) fn is_punct(b: u8) -> bool {
    const PUNCT: &[u8] = b".,;:!?)(\"'-/";
    PUNCT.contains(&b)
}
368
369/// `true` if `bytes[pos]` (which must be one of `*`/`_`/`+`/`~`) would
370/// open an emphasis span at this position under Brief's flanking rules.
371/// Shared between the inline parser and the Markdown→Brief converter so
372/// the two cannot drift.
373pub(crate) fn is_open_marker_at(bytes: &[u8], pos: usize) -> bool {
374    let marker = match bytes.get(pos) {
375        Some(&b @ (b'*' | b'_' | b'+' | b'~')) => b,
376        _ => return false,
377    };
378    let prev = if pos == 0 { None } else { Some(bytes[pos - 1]) };
379    let next = bytes.get(pos + 1).copied();
380    if next == Some(marker) || prev == Some(marker) {
381        return false;
382    }
383    let prev_ok = match prev {
384        None => true,
385        Some(b' ') => true,
386        Some(b) if is_inline_sigil(b) => true,
387        Some(b) if is_punct(b) => true,
388        _ => false,
389    };
390    let next_ok = matches!(next, Some(b) if b != b' ' && b != marker);
391    prev_ok && next_ok
392}
393
394pub fn parse_args(src: &str, cursor: &mut usize) -> Result<ShortArgs, Diagnostic> {
395    let bytes = src.as_bytes();
396    if bytes.get(*cursor) != Some(&b'(') {
397        return Ok(ShortArgs::default());
398    }
399    *cursor += 1;
400    let mut args = ShortArgs::default();
401    let mut keys_seen: BTreeMap<String, ()> = BTreeMap::new();
402    skip_ws(src, cursor);
403    if bytes.get(*cursor) == Some(&b')') {
404        *cursor += 1;
405        return Ok(args);
406    }
407    loop {
408        skip_ws(src, cursor);
409        let arg_start = *cursor;
410        let saved = *cursor;
411        if let Some(name) = read_ident(src, cursor) {
412            skip_ws(src, cursor);
413            if bytes.get(*cursor) == Some(&b':') {
414                *cursor += 1;
415                skip_ws(src, cursor);
416                let v = read_value(src, cursor)
417                    .ok_or_else(|| Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1)))?;
418                if keys_seen.insert(name.clone(), ()).is_some() {
419                    return Err(Diagnostic::new(
420                        Code::DuplicateKwarg,
421                        Span::new(arg_start, name.len()),
422                    ));
423                }
424                args.keyword.insert(name, v);
425            } else {
426                *cursor = saved;
427                let v = read_value(src, cursor)
428                    .ok_or_else(|| Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1)))?;
429                args.positional.push(v);
430            }
431        } else {
432            let v = read_value(src, cursor)
433                .ok_or_else(|| Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1)))?;
434            args.positional.push(v);
435        }
436        skip_ws(src, cursor);
437        match bytes.get(*cursor) {
438            Some(&b',') => {
439                *cursor += 1;
440                continue;
441            }
442            Some(&b')') => {
443                *cursor += 1;
444                break;
445            }
446            _ => return Err(Diagnostic::new(Code::BadArgSyntax, Span::new(*cursor, 1))),
447        }
448    }
449    Ok(args)
450}
451
/// Advances `*cursor` past any run of ASCII space bytes (`b' '` only).
/// Leaves the cursor unchanged if it is already at or beyond the end.
fn skip_ws(src: &str, cursor: &mut usize) {
    let spaces = src
        .as_bytes()
        .get(*cursor..)
        .map_or(0, |tail| tail.iter().take_while(|&&b| b == b' ').count());
    *cursor += spaces;
}
457
/// Reads an identifier at `*cursor`: an ASCII letter followed by any number
/// of ASCII alphanumerics, `-`, or `_`.
///
/// On success the cursor is moved past the identifier. Returns `None` and
/// leaves the cursor untouched when no identifier starts here.
fn read_ident(src: &str, cursor: &mut usize) -> Option<String> {
    let start = *cursor;
    let tail = src.as_bytes().get(start..)?;
    let (first, rest) = tail.split_first()?;
    if !first.is_ascii_alphabetic() {
        return None;
    }
    let continuation = rest
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || b == b'-' || b == b'_')
        .count();
    let end = start + 1 + continuation;
    *cursor = end;
    Some(src[start..end].to_string())
}
476
477fn read_value(src: &str, cursor: &mut usize) -> Option<ArgValue> {
478    skip_ws(src, cursor);
479    let bytes = src.as_bytes();
480    let start = *cursor;
481    match bytes.get(start)? {
482        b'"' => {
483            *cursor += 1;
484            let mut s = String::new();
485            while *cursor < bytes.len() {
486                let b = bytes[*cursor];
487                if b == b'"' {
488                    *cursor += 1;
489                    return Some(ArgValue::Str(s));
490                }
491                if b == b'\\' {
492                    if let Some(c) = src[*cursor + 1..].chars().next() {
493                        s.push(c);
494                        *cursor += 1 + c.len_utf8();
495                        continue;
496                    }
497                    // dangling backslash at EOF: treat as literal
498                    s.push('\\');
499                    *cursor += 1;
500                    continue;
501                }
502                // Take one full char so cursor stays UTF-8 aligned and the
503                // produced string preserves multibyte content correctly.
504                let c = src[*cursor..].chars().next().expect("cursor < len");
505                s.push(c);
506                *cursor += c.len_utf8();
507            }
508            None
509        }
510        b'[' => {
511            *cursor += 1;
512            let mut arr: Vec<ArgValue> = Vec::new();
513            skip_ws(src, cursor);
514            if bytes.get(*cursor) == Some(&b']') {
515                *cursor += 1;
516                return Some(ArgValue::Array(arr));
517            }
518            loop {
519                let v = read_value(src, cursor)?;
520                arr.push(v);
521                skip_ws(src, cursor);
522                match bytes.get(*cursor) {
523                    Some(&b',') => {
524                        *cursor += 1;
525                        skip_ws(src, cursor);
526                    }
527                    Some(&b']') => {
528                        *cursor += 1;
529                        return Some(ArgValue::Array(arr));
530                    }
531                    _ => return None,
532                }
533            }
534        }
535        c if c.is_ascii_digit() || *c == b'-' => {
536            let mut end = start;
537            if bytes[end] == b'-' {
538                end += 1;
539            }
540            while let Some(&b) = bytes.get(end) {
541                if b.is_ascii_digit() {
542                    end += 1;
543                } else {
544                    break;
545                }
546            }
547            let n: i64 = src[start..end].parse().ok()?;
548            *cursor = end;
549            Some(ArgValue::Int(n))
550        }
551        c if c.is_ascii_alphabetic() => {
552            let id = read_ident(src, cursor)?;
553            Some(ArgValue::Ident(id))
554        }
555        _ => None,
556    }
557}
558
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `s` with a base offset of zero.
    fn parse(s: &str) -> (Vec<Inline>, Vec<Diagnostic>) {
        parse_inline(s, 0)
    }

    /// Concatenates the values of all top-level text nodes, ignoring the rest.
    fn joined_text(nodes: &[Inline]) -> String {
        nodes
            .iter()
            .filter_map(|node| match node {
                Inline::Text { value, .. } => Some(value.as_str()),
                _ => None,
            })
            .collect()
    }

    #[test]
    fn plain_text() {
        let (nodes, diags) = parse("hello world");
        assert!(diags.is_empty());
        assert_eq!(nodes.len(), 1);
        match &nodes[0] {
            Inline::Text { value, .. } => assert_eq!(value, "hello world"),
            _ => panic!(),
        }
    }

    #[test]
    fn bold() {
        let (nodes, diags) = parse("a *bold* b");
        assert!(diags.is_empty(), "{:?}", diags);
        assert!(matches!(nodes[1], Inline::Bold { .. }));
    }

    #[test]
    fn snake_case_is_literal() {
        let (nodes, diags) = parse("snake_case_name");
        assert!(diags.is_empty());
        assert_eq!(nodes.len(), 1);
        assert!(matches!(nodes[0], Inline::Text { .. }));
    }

    #[test]
    fn nested_same_marker_errors() {
        let (_, diags) = parse("*outer *inner* outer*");
        assert!(diags.iter().any(|x| x.code == Code::EmphasisSameMarker));
    }

    #[test]
    fn inline_code() {
        let (nodes, diags) = parse("use `printf` here");
        assert!(diags.is_empty());
        assert!(matches!(nodes[1], Inline::InlineCode { .. }));
    }

    #[test]
    fn double_backtick_code_with_backtick() {
        let (nodes, diags) = parse("``a ` b``");
        assert!(diags.is_empty());
        match &nodes[0] {
            Inline::InlineCode { value, .. } => assert_eq!(value, "a ` b"),
            _ => panic!(),
        }
    }

    #[test]
    fn shortcode_inline() {
        let (nodes, diags) = parse("see @link[here](https://x)");
        assert!(diags.is_empty(), "{:?}", diags);
        assert!(matches!(nodes.last().unwrap(), Inline::Shortcode { .. }));
    }

    #[test]
    fn escape_emphasis() {
        let (nodes, diags) = parse(r"\*literal\*");
        assert!(diags.is_empty());
        assert_eq!(joined_text(&nodes), "*literal*");
    }

    #[test]
    fn double_marker_not_emphasis() {
        let (nodes, _diags) = parse("**no**");
        assert!(!matches!(nodes[0], Inline::Bold { .. }));
    }

    #[test]
    fn escape_before_multibyte_char() {
        // Regression: escaping `\é` used to advance two bytes past `\` and
        // land mid-codepoint, panicking the next slice. The escape must
        // consume the full UTF-8 char.
        let (nodes, diags) = parse("a \\é b");
        assert!(diags.is_empty(), "{:?}", diags);
        assert_eq!(joined_text(&nodes), "a é b");
    }

    #[test]
    fn multibyte_text_then_emph() {
        // Regression: the byte-walking text scanner must land on a char
        // boundary before any sigil, even when text is multibyte.
        let (nodes, diags) = parse("日本 *bold*");
        assert!(diags.is_empty(), "{:?}", diags);
        assert!(matches!(nodes.last().unwrap(), Inline::Bold { .. }));
    }

    #[test]
    fn arg_string_preserves_multibyte() {
        // Regression: read_value used to push raw bytes as Latin-1 chars,
        // corrupting multibyte content inside a string argument.
        let source = "(label: \"日本 🦀\")";
        let mut cursor = 0usize;
        let args = parse_args(source, &mut cursor).unwrap();
        match args.keyword.get("label").unwrap() {
            ArgValue::Str(v) => assert_eq!(v, "日本 🦀"),
            _ => panic!(),
        }
    }

    #[test]
    fn escape_at_end_of_input() {
        // A trailing `\\` with nothing to escape must not panic.
        let (_, _diags) = parse("trailing\\");
    }
}
696}