minimad/parser/
line_parser.rs

1use crate::*;
2
/// Parser for one line of markdown, or for a fragment of a line.
///
/// A `LineParser` is built from a markdown string with `LineParser::from`
/// and then consumed through one of two main entry points:
/// * `line` parses a line which is part of a markdown text. It shouldn't
///   really be needed by external code: a whole text can be parsed at once
///   with `Text::from`.
/// * `inline` parses a snippet which isn't part of a markdown text. Line
///   kinds which only make sense inside a text (`ListItem`, `TableRow`,
///   `Code`) are never produced this way.
///
/// Normally not used directly but through `Line::from(str)`.
#[derive(Debug)]
pub struct LineParser<'s> {
    /// The markdown source being parsed.
    src: &'s str,
    /// Current position in `src`, counted in bytes (not in chars).
    idx: usize,
    /// Whether the parser is currently inside inline code.
    pub(crate) code: bool,
    /// Whether the italic style is currently active.
    pub(crate) italic: bool,
    /// Whether the bold style is currently active.
    pub(crate) bold: bool,
    /// Whether the strikeout style is currently active.
    pub(crate) strikeout: bool,
}
22
23impl<'s> LineParser<'s> {
24    pub fn from(src: &'s str) -> LineParser<'s> {
25        LineParser {
26            src,
27            idx: 0,
28            bold: false,
29            italic: false,
30            code: false,
31            strikeout: false,
32        }
33    }
34    fn close_compound(
35        &mut self,
36        end: usize,
37        tag_length: usize,
38        compounds: &mut Vec<Compound<'s>>,
39    ) {
40        if end > self.idx {
41            compounds.push(Compound::new(
42                self.src,
43                self.idx,
44                end,
45                self.bold,
46                self.italic,
47                self.code,
48                self.strikeout,
49            ));
50        }
51        self.idx = end + tag_length;
52    }
53    fn code_block_compound_from_idx(
54        &self,
55        idx: usize,
56    ) -> Compound<'s> {
57        Compound::new(self.src, idx, self.src.len(), false, false, false, false)
58    }
59    fn parse_compounds(
60        &mut self,
61        stop_on_pipe: bool,
62    ) -> Vec<Compound<'s>> {
63        let mut compounds = Vec::new();
64        let mut after_first_star = false;
65        let mut after_first_tilde = false;
66        let mut after_antislash = false;
67
68        // self.idx tracks byte indices, but str::char_indices returns an
69        // iterator over chars, which may be wider than one byte. So we need
70        // to skip not self.idx elements, but the number of chars that occur
71        // before self.idx
72        let chars_to_skip = self.src[..self.idx].chars().count();
73        for (idx, char) in self.src.char_indices().skip(chars_to_skip) {
74            if self.code {
75                // only one thing matters: whether we're closing the inline code
76                if char == '`' {
77                    self.close_compound(idx, 1, &mut compounds);
78                    self.code = false;
79                }
80                after_antislash = false;
81                after_first_star = false;
82                continue;
83            }
84
85            #[cfg(feature = "escaping")]
86            if after_antislash {
87                after_antislash = false;
88                match char {
89                    '*' | '~' | '|' | '`' => {
90                        self.close_compound(idx - 1, 1, &mut compounds);
91                        continue;
92                    }
93                    '\\' => {
94                        self.close_compound(idx, 1, &mut compounds);
95                        continue;
96                    }
97                    _ => {} // we don't escape at all normal chars
98                }
99            } else if char == '\\' {
100                after_antislash = true;
101                continue;
102            }
103
104            if after_first_star {
105                match char {
106                    '*' => {
107                        // this is the second star
108                        self.close_compound(idx - 1, 2, &mut compounds);
109                        self.bold ^= true;
110                    }
111                    '~' => {
112                        after_first_tilde = true;
113                        self.close_compound(idx - 1, 2, &mut compounds);
114                        // we don't know yet if it's one or two tildes
115                        self.italic ^= true;
116                    }
117                    '|' if stop_on_pipe => {
118                        self.close_compound(idx - 1, 1, &mut compounds);
119                        return compounds;
120                    }
121                    '`' => {
122                        self.close_compound(idx - 1, 2, &mut compounds);
123                        self.italic ^= true;
124                        self.code = true;
125                    }
126                    _ => {
127                        // there was only one star
128                        // Note that we don't handle a tag just after a star (except in code)
129                        self.close_compound(idx - 1, 1, &mut compounds);
130                        self.italic ^= true;
131                    }
132                }
133                after_first_star = false;
134            } else if after_first_tilde {
135                match char {
136                    '*' => {
137                        after_first_star = true;
138                        // we don't know yet if it's one or two stars
139                    }
140                    '~' => {
141                        // this is the second tilde
142                        self.close_compound(idx - 1, 2, &mut compounds);
143                        self.strikeout ^= true;
144                    }
145                    '|' if stop_on_pipe => {
146                        self.close_compound(idx - 1, 1, &mut compounds);
147                        return compounds;
148                    }
149                    _ => {
150                        // there was only one tilde, which means nothing
151                    }
152                }
153                after_first_tilde = false;
154            } else {
155                match char {
156                    '*' => {
157                        after_first_star = true;
158                        // we don't know yet if it's one or two stars
159                    }
160                    '~' => {
161                        after_first_tilde = true;
162                    }
163                    '|' if stop_on_pipe => {
164                        self.close_compound(idx, 0, &mut compounds);
165                        return compounds;
166                    }
167                    '`' => {
168                        self.close_compound(idx, 1, &mut compounds);
169                        self.code = true;
170                    }
171                    _ => {}
172                }
173            }
174        }
175        let mut idx = self.src.len();
176        if after_first_star && self.italic {
177            idx -= 1;
178        }
179        if after_first_tilde && self.strikeout {
180            idx -= 1;
181        }
182        self.close_compound(idx, 0, &mut compounds);
183        compounds
184    }
185    fn parse_cells(&mut self) -> Vec<Composite<'s>> {
186        let mut cells = Vec::new();
187        while self.idx < self.src.len() {
188            self.idx += 1;
189            let style = if self.src[self.idx..].starts_with("* ") {
190                self.idx += 2;
191                CompositeStyle::ListItem(0)
192            } else if self.src[self.idx..].starts_with(" * ") {
193                self.idx += 3;
194                CompositeStyle::ListItem(1)
195            } else if self.src[self.idx..].starts_with("  * ") {
196                self.idx += 4;
197                CompositeStyle::ListItem(2)
198            } else if self.src[self.idx..].starts_with("   * ") {
199                self.idx += 5;
200                CompositeStyle::ListItem(3)
201            } else if self.src[self.idx..].starts_with("> ") {
202                self.idx += 2;
203                CompositeStyle::Quote
204            } else {
205                CompositeStyle::Paragraph
206            };
207            self.bold = false;
208            self.italic = false;
209            self.code = false;
210            self.strikeout = false;
211            let compounds = self.parse_compounds(true);
212            let mut composite = Composite { style, compounds };
213            composite.trim_spaces();
214            cells.push(composite);
215        }
216        if !cells.is_empty() && cells[cells.len() - 1].compounds.is_empty() {
217            cells.pop();
218        }
219        cells
220    }
221    pub fn inline(mut self) -> Composite<'s> {
222        Composite {
223            style: CompositeStyle::Paragraph,
224            compounds: self.parse_compounds(false),
225        }
226    }
227    /// should be called when the line must be interpreted as a code part,
228    /// for example between code fences
229    pub fn as_code(mut self) -> Line<'s> {
230        if self.src.starts_with("```") {
231            self.idx = 3;
232            Line::new_code_fence(self.parse_compounds(false))
233        } else {
234            Line::new_code(self.code_block_compound_from_idx(0))
235        }
236    }
237    pub fn line(mut self) -> Line<'s> {
238        self.parse_line()
239    }
240    pub(crate) fn parse_line(&mut self) -> Line<'s> {
241        if self.src.starts_with('|') {
242            let tr = TableRow {
243                cells: self.parse_cells(),
244            };
245            return match tr.as_table_alignments() {
246                Some(aligns) => Line::TableRule(aligns),
247                None => Line::TableRow(tr),
248            };
249        }
250        if self.src.starts_with("    ") {
251            return Line::new_code(self.code_block_compound_from_idx(4));
252        }
253        if self.src.starts_with('\t') {
254            return Line::new_code(self.code_block_compound_from_idx(1));
255        }
256        if self.src.starts_with("* ") {
257            self.idx = 2;
258            return Line::new_list_item(0, self.parse_compounds(false));
259        }
260        if self.src.starts_with(" * ") {
261            self.idx = 3;
262            return Line::new_list_item(1, self.parse_compounds(false));
263        }
264        if self.src.starts_with("  * ") {
265            self.idx = 4;
266            return Line::new_list_item(2, self.parse_compounds(false));
267        }
268        if self.src.starts_with("   * ") {
269            self.idx = 5;
270            return Line::new_list_item(3, self.parse_compounds(false));
271        }
272        if self.src == ">" {
273            return Line::new_quote(Vec::new());
274        }
275        if self.src.starts_with("> ") {
276            self.idx = 2;
277            return Line::new_quote(self.parse_compounds(false));
278        }
279        if self.src.starts_with("```") {
280            self.idx = 3;
281            return Line::new_code_fence(self.parse_compounds(false));
282        }
283        let header_level = header_level(self.src);
284        if header_level > 0 {
285            self.idx = header_level + 1;
286            return Line::new_header(header_level as u8, self.parse_compounds(false));
287        }
288        let compounds = self.parse_compounds(false);
289        if compounds_are_rule(&compounds) {
290            Line::HorizontalRule
291        } else {
292            Line::new_paragraph(compounds)
293        }
294    }
295}
296
297const DASH: u8 = 45;
298
299fn compounds_are_rule(compounds: &[Compound<'_>]) -> bool {
300    if compounds.len() != 1 {
301        return false;
302    }
303    let s = compounds[0].as_str();
304    if s.len() < 3 {
305        return false;
306    }
307    for c in s.as_bytes() {
308        if *c != DASH {
309            return false;
310        }
311    }
312    true
313}
314
/// Tests of line parsing
#[cfg(test)]
mod tests {
    use crate::*;

    /// Builds a table cell made of one compound.
    fn single_cell<'s>(
        style: CompositeStyle,
        compound: Compound<'s>,
    ) -> Composite<'s> {
        Composite {
            style,
            compounds: vec![compound],
        }
    }

    /// Strikeout, bold, italic and inline code in the same paragraph.
    #[test]
    fn simple_line_parsing() {
        let parsed = Line::from("Hello ~~wolrd~~ **World**. *Code*: `sqrt(π/2)`");
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("Hello "),
            Compound::raw_str("wolrd").strikeout(),
            Compound::raw_str(" "),
            Compound::raw_str("World").bold(),
            Compound::raw_str(". "),
            Compound::raw_str("Code").italic(),
            Compound::raw_str(": "),
            Compound::raw_str("sqrt(π/2)").code(),
        ]);
        assert_eq!(parsed, expected);
    }

    /// Styles opened inside other styles add up, and stars are literal
    /// inside inline code.
    #[test]
    fn nested_styles_parsing() {
        let parsed =
            Line::from("*Italic then **bold and italic `and some *code*`** and italic*");
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("Italic then ").italic(),
            Compound::raw_str("bold and italic ").bold().italic(),
            Compound::raw_str("and some *code*").bold().italic().code(),
            Compound::raw_str(" and italic").italic(),
        ]);
        assert_eq!(parsed, expected);
    }

    /// A line starting with `> ` is a quote whose content is styled.
    #[test]
    fn quote() {
        let parsed = Line::from("> Veni, vidi, *vici*!");
        let expected = Line::new_quote(vec![
            Compound::raw_str("Veni, vidi, "),
            Compound::raw_str("vici").italic(),
            Compound::raw_str("!"),
        ]);
        assert_eq!(parsed, expected);
    }

    /// Inline code can directly follow the star closing an italic part.
    #[test]
    fn code_after_italic() {
        let parsed = Line::from("*name=*`code`");
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("name=").italic(),
            Compound::raw_str("code").code(),
        ]);
        assert_eq!(parsed, expected);
    }

    /// this test is borderline. It wouldn't be very problematic to not support this case.
    /// A regression would thus be acceptable here (but I want it to be noticed)
    #[test]
    fn single_star() {
        let parsed = Line::from("*");
        let expected = Line::new_paragraph(vec![Compound::raw_str("*")]);
        assert_eq!(parsed, expected);
    }

    /// this test is borderline. It wouldn't be very problematic to not support it.
    /// A regression would thus be acceptable here (but I want it to be noticed)
    #[test]
    fn single_tilde() {
        let parsed = Line::from("~");
        let expected = Line::new_paragraph(vec![Compound::raw_str("~")]);
        assert_eq!(parsed, expected);
    }

    /// A strikeout part can directly follow the star closing an italic part.
    #[test]
    fn striked_after_italic() {
        let parsed = Line::from("*italic*~~striked~~");
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("italic").italic(),
            Compound::raw_str("striked").strikeout(),
        ]);
        assert_eq!(parsed, expected);
    }

    /// Many styled parts without any char between them.
    #[test]
    fn tight_sequence() {
        let parsed = Line::from(
            "*italic*`code`**bold**`code`*italic**italic+bold***`code`*I*~~striked~~*I*",
        );
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("italic").italic(),
            Compound::raw_str("code").code(),
            Compound::raw_str("bold").bold(),
            Compound::raw_str("code").code(),
            Compound::raw_str("italic").italic(),
            Compound::raw_str("italic+bold").italic().bold(),
            Compound::raw_str("code").code(),
            Compound::raw_str("I").italic(),
            Compound::raw_str("striked").strikeout(),
            Compound::raw_str("I").italic(),
        ]);
        assert_eq!(parsed, expected);
    }

    /// Antislash escaping of the markdown modifiers.
    #[cfg(feature = "escaping")]
    #[test]
    fn escapes() {
        let parsed = Line::from("no \\*italic\\* here");
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("no "),
            Compound::raw_str("*italic"),
            Compound::raw_str("* here"),
        ]);
        assert_eq!(parsed, expected);

        // check we're not removing chars with the escaping, and that
        // we're not losing the '\' when it's not escaping something
        // (only markdown modifiers can be escaped)
        let parsed = Line::from("a\\bc\\");
        let expected = Line::new_paragraph(vec![Compound::raw_str("a\\bc\\")]);
        assert_eq!(parsed, expected);

        let parsed = Line::from("*italic\\*and\\*still\\*italic*");
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("italic").italic(),
            Compound::raw_str("*and").italic(),
            Compound::raw_str("*still").italic(),
            Compound::raw_str("*italic").italic(),
        ]);
        assert_eq!(parsed, expected);

        let parsed = Line::from(
            "\\**Italic then **bold\\\\ and \\`italic `and some *code*`** and italic*\\*",
        );
        let expected = Line::new_paragraph(vec![
            Compound::raw_str("*"),
            Compound::raw_str("Italic then ").italic(),
            Compound::raw_str("bold\\").bold().italic(),
            Compound::raw_str(" and ").bold().italic(),
            Compound::raw_str("`italic ").bold().italic(),
            Compound::raw_str("and some *code*").bold().italic().code(),
            Compound::raw_str(" and italic*").italic(),
        ]);
        assert_eq!(parsed, expected);
    }

    /// Code fences, without and with something after the backticks.
    #[test]
    fn code_fence() {
        let bare = Line::from("```");
        assert_eq!(bare, Line::new_code_fence(vec![]));

        let with_lang = Line::from("```rust");
        let expected = Line::new_code_fence(vec![Compound::raw_str("rust")]);
        assert_eq!(with_lang, expected);
    }

    /// A line indented with 4 spaces is code, and its content isn't styled.
    #[test]
    fn line_of_code() {
        let parsed = Line::from("    let r = Math.sin(π/2) * 7");
        let expected = Line::new_code(Compound::raw_str("let r = Math.sin(π/2) * 7"));
        assert_eq!(parsed, expected);
    }

    /// A header without any style in its content.
    #[test]
    fn standard_header() {
        let parsed = Line::from("### just a title");
        let expected = Line::new_header(3, vec![Compound::raw_str("just a title")]);
        assert_eq!(parsed, expected);
    }

    /// A list item at depth 0, with some style in its content.
    #[test]
    fn list_item() {
        let parsed = Line::from("* *list* item");
        let expected = Line::new_list_item(
            0,
            vec![
                Compound::raw_str("list").italic(),
                Compound::raw_str(" item"),
            ],
        );
        assert_eq!(parsed, expected);
    }

    /// List items at depths 1 to 3, then a 4 spaces indent which makes code.
    #[test]
    fn deep_list_items() {
        let parsed = Line::from(" * *list* item");
        let expected = Line::new_list_item(
            1,
            vec![
                Compound::raw_str("list").italic(),
                Compound::raw_str(" item"),
            ],
        );
        assert_eq!(parsed, expected);

        let parsed = Line::from("  * deeper");
        let expected = Line::new_list_item(2, vec![Compound::raw_str("deeper")]);
        assert_eq!(parsed, expected);

        let parsed = Line::from("   * even **deeper**");
        let expected = Line::new_list_item(
            3,
            vec![
                Compound::raw_str("even "),
                Compound::raw_str("deeper").bold(),
            ],
        );
        assert_eq!(parsed, expected);

        let parsed = Line::from("    * but not this one...");
        let expected = Line::new_code(Compound::raw_str("* but not this one..."));
        assert_eq!(parsed, expected);
    }

    /// A line made only of dashes is a horizontal rule.
    #[test]
    fn horizontal_rule() {
        let parsed = Line::from("----------");
        assert_eq!(parsed, Line::HorizontalRule);
    }

    /// A header with some style in its content.
    #[test]
    fn styled_header() {
        let parsed = Line::from("## a header with some **bold**!");
        let expected = Line::new_header(
            2,
            vec![
                Compound::raw_str("a header with some "),
                Compound::raw_str("bold").bold(),
                Compound::raw_str("!"),
            ],
        );
        assert_eq!(parsed, expected);
    }

    /// A table row whose cells are trimmed, styled, and for the last one
    /// recognized as a quote.
    #[test]
    fn table_row() {
        let parsed = Line::from("| bla |*italic*|hi!|> some quote");
        let expected = Line::new_table_row(vec![
            single_cell(CompositeStyle::Paragraph, Compound::raw_str("bla")),
            single_cell(
                CompositeStyle::Paragraph,
                Compound::raw_str("italic").italic(),
            ),
            single_cell(CompositeStyle::Paragraph, Compound::raw_str("hi!")),
            single_cell(CompositeStyle::Quote, Compound::raw_str("some quote")),
        ]);
        assert_eq!(parsed, expected);
    }

    /// A table row whose cells contain chars wider than one byte.
    #[test]
    fn table_row_issue_4() {
        let parsed = Line::from("| 安 | 安 | 安 |");
        let expected = Line::new_table_row(vec![
            single_cell(CompositeStyle::Paragraph, Compound::raw_str("安")),
            single_cell(CompositeStyle::Paragraph, Compound::raw_str("安")),
            single_cell(CompositeStyle::Paragraph, Compound::raw_str("安")),
        ]);
        assert_eq!(parsed, expected);
    }

    /// A table rule giving one alignment per column.
    #[test]
    fn table_alignments() {
        let parsed = Line::from("|-----|:--|:-:|----:");
        let expected = Line::new_table_alignments(vec![
            Alignment::Unspecified,
            Alignment::Left,
            Alignment::Center,
            Alignment::Right,
        ]);
        assert_eq!(parsed, expected);
    }
}