rat_markdown/
parser.rs

1//!
2//! Special parsers for things not covered by pulldown-cmark.
3//!
4use rat_text::upos_type;
5use std::ops::Range;
6use unicode_segmentation::UnicodeSegmentation;
7
/// Parsed ATX header line, e.g. `## Title`.
#[derive(Debug)]
pub struct MDHeader<'a> {
    /// Header level: the number of `#` in `tag`, saturated at `u8::MAX`.
    pub header: u8,
    /// Leading spaces/tabs in front of the first `#`.
    pub prefix: &'a str,
    /// The run of `#` characters.
    pub tag: &'a str,
    /// Everything after the tag and the blanks that follow it.
    pub text: &'a str,
    /// Byte range of `text`, shifted by the `relocate` offset given to the parser.
    pub text_byte: Range<usize>,
}

/// Parse the text as header.
///
/// The accepted shape is: optional spaces/tabs, a run of `#`, optional
/// spaces/tabs, then the header text up to the end of `txt`.
///
/// * relocate: Start offset of txt. It is added to the returned `text_byte` range.
/// * txt: The header line.
///
/// Returns `None` if the first non-blank byte is not `#`. Empty or blank-only
/// input yields a header with level 0 and an empty tag.
pub fn parse_md_header(relocate: usize, txt: &str) -> Option<MDHeader<'_>> {
    let mut mark_prefix_end = 0;
    let mut mark_tag_start = 0;
    let mut mark_tag_end = 0;
    let mut mark_text_start = 0;

    #[derive(Debug, Clone, Copy, PartialEq)]
    enum It {
        Leading,
        Tag,
        LeadingText,
    }

    let mut state = It::Leading;
    for (idx, c) in txt.bytes().enumerate() {
        let blank = c == b' ' || c == b'\t';
        match state {
            It::Leading => {
                if blank {
                    mark_prefix_end = idx + 1;
                    mark_tag_start = idx + 1;
                    mark_tag_end = idx + 1;
                    mark_text_start = idx + 1;
                } else if c == b'#' {
                    mark_prefix_end = idx;
                    mark_tag_start = idx;
                    mark_tag_end = idx + 1;
                    mark_text_start = idx + 1;
                    state = It::Tag;
                } else {
                    return None;
                }
            }
            It::Tag => {
                if c == b'#' {
                    // The tag end is exclusive, so it has to include this `#`.
                    mark_tag_end = idx + 1;
                    mark_text_start = idx + 1;
                } else if blank {
                    mark_text_start = idx + 1;
                    state = It::LeadingText;
                } else {
                    // Text follows the tag directly. Keep this byte as the first
                    // byte of the text; skipping it would drop a character and
                    // could split a multi-byte UTF-8 sequence.
                    mark_text_start = idx;
                    break;
                }
            }
            It::LeadingText => {
                if blank {
                    mark_text_start = idx + 1;
                } else {
                    mark_text_start = idx;
                    break;
                }
            }
        }
    }

    // Saturate instead of wrapping for absurdly long tags.
    let level = (mark_tag_end - mark_tag_start).min(u8::MAX as usize) as u8;

    Some(MDHeader {
        header: level,
        prefix: &txt[..mark_prefix_end],
        tag: &txt[mark_tag_start..mark_tag_end],
        text: &txt[mark_text_start..],
        text_byte: relocate + mark_text_start..relocate + txt.len(),
    })
}
90
/// Parsed link reference definition, e.g. `[tag]: /url "title"`.
#[derive(Debug)]
pub struct MDLinkRef<'a> {
    /// Whitespace in front of the opening `[`.
    pub prefix: &'a str,
    /// Text between `[` and `]`.
    pub tag: &'a str,
    /// Link destination. For `<...>` destinations the angle brackets are not
    /// included. A bare destination only ends at a line break or a quote, so it
    /// can carry trailing spaces/tabs.
    pub link: &'a str,
    /// Title text without the surrounding quotes; empty if there is none.
    pub title: &'a str,
    /// Remainder of the input starting at the end of `title`. After a quoted
    /// title this begins with the closing quote.
    pub suffix: &'a str,
}

/// Parse the text as link reference
///
/// * relocate - start offset of txt. Currently unused.
/// * txt - the link reference definition.
///
/// Returns `None` if the first non-whitespace byte is not `[`, or if the
/// closing `]` is not directly followed by `:`.
///
/// If the input ends in the middle of a tag, link or title, that part extends
/// to the end of `txt`.
pub fn parse_md_link_ref(_relocate: usize, txt: &str) -> Option<MDLinkRef<'_>> {
    let mut mark_prefix_end = 0;
    let mut mark_tag_start = 0;
    let mut mark_tag_end = 0;
    let mut mark_link_start = 0;
    let mut mark_link_end = 0;
    let mut mark_title_start = 0;
    let mut mark_title_end = 0;

    #[derive(Debug, Clone, Copy, PartialEq)]
    enum It {
        Leading,
        Tag,
        AfterTag,
        LeadingLink,
        BracketLink,
        Link,
        LinkEsc,
        LeadingTitle,
        TitleSingle,
        TitleSingleEsc,
        TitleDouble,
        TitleDoubleEsc,
        End,
    }

    let mut state = It::Leading;
    for (idx, c) in txt.bytes().enumerate() {
        let blank = c == b' ' || c == b'\t' || c == b'\n' || c == b'\r';
        match state {
            It::Leading => {
                if c == b'[' {
                    mark_prefix_end = idx;
                    mark_tag_start = idx + 1;
                    mark_tag_end = idx + 1;
                    mark_link_start = idx + 1;
                    mark_link_end = idx + 1;
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                    state = It::Tag;
                } else if blank {
                    mark_prefix_end = idx + 1;
                    mark_tag_start = idx + 1;
                    mark_tag_end = idx + 1;
                    mark_link_start = idx + 1;
                    mark_link_end = idx + 1;
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                } else {
                    return None;
                }
            }
            It::Tag => {
                // The tag end trails the scan by one byte until `]` fixes it.
                mark_tag_end = idx;
                mark_link_start = idx + 1;
                mark_link_end = idx + 1;
                mark_title_start = idx + 1;
                mark_title_end = idx + 1;
                if c == b']' {
                    state = It::AfterTag;
                }
            }
            It::AfterTag => {
                if c != b':' {
                    return None;
                }
                mark_link_start = idx + 1;
                mark_link_end = idx + 1;
                mark_title_start = idx + 1;
                mark_title_end = idx + 1;
                state = It::LeadingLink;
            }
            It::LeadingLink => {
                if blank {
                    mark_link_start = idx + 1;
                    mark_link_end = idx + 1;
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                } else if c == b'<' {
                    mark_link_start = idx + 1;
                    mark_link_end = idx + 1;
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                    state = It::BracketLink;
                } else {
                    mark_link_start = idx;
                    mark_link_end = idx;
                    mark_title_start = idx;
                    mark_title_end = idx;
                    state = It::Link;
                }
            }
            It::BracketLink => {
                mark_link_end = idx;
                if c == b'>' {
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                    state = It::LeadingTitle;
                } else {
                    mark_title_start = idx;
                    mark_title_end = idx;
                }
            }
            It::Link => {
                mark_link_end = idx;
                match c {
                    b'\\' => {
                        mark_title_start = idx;
                        mark_title_end = idx;
                        state = It::LinkEsc;
                    }
                    b'\n' | b'\r' => {
                        mark_title_start = idx + 1;
                        mark_title_end = idx + 1;
                        state = It::LeadingTitle;
                    }
                    b'\'' => {
                        mark_title_start = idx + 1;
                        mark_title_end = idx + 1;
                        state = It::TitleSingle;
                    }
                    b'"' => {
                        mark_title_start = idx + 1;
                        mark_title_end = idx + 1;
                        state = It::TitleDouble;
                    }
                    _ => {
                        mark_title_start = idx;
                        mark_title_end = idx;
                    }
                }
            }
            It::LinkEsc => {
                // Whatever follows the backslash belongs to the link.
                mark_link_end = idx;
                mark_title_start = idx;
                mark_title_end = idx;
                state = It::Link;
            }
            It::LeadingTitle => {
                if blank {
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                } else if c == b'\'' {
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                    state = It::TitleSingle;
                } else if c == b'"' {
                    mark_title_start = idx + 1;
                    mark_title_end = idx + 1;
                    state = It::TitleDouble;
                } else {
                    // no title, just suffix
                    mark_title_start = idx;
                    mark_title_end = idx;
                    state = It::End;
                    break;
                }
            }
            It::TitleSingle => {
                mark_title_end = idx;
                if c == b'\'' {
                    state = It::End;
                    break;
                } else if c == b'\\' {
                    state = It::TitleSingleEsc;
                }
            }
            It::TitleSingleEsc => {
                mark_title_end = idx;
                state = It::TitleSingle;
            }
            It::TitleDouble => {
                mark_title_end = idx;
                if c == b'"' {
                    state = It::End;
                    break;
                } else if c == b'\\' {
                    state = It::TitleDoubleEsc;
                }
            }
            It::TitleDoubleEsc => {
                mark_title_end = idx;
                state = It::TitleDouble;
            }
            It::End => break,
        }
    }

    // The input ran out while a token was still open. Its end mark trails the
    // scan by one byte, which would cut off the last byte and may even point
    // into the middle of a multi-byte character. Close the token at the end of
    // the input instead.
    let end = txt.len();
    match state {
        It::Tag => {
            mark_tag_end = end;
            mark_link_start = end;
            mark_link_end = end;
            mark_title_start = end;
            mark_title_end = end;
        }
        It::BracketLink | It::Link | It::LinkEsc => {
            mark_link_end = end;
            mark_title_start = end;
            mark_title_end = end;
        }
        It::TitleSingle | It::TitleSingleEsc | It::TitleDouble | It::TitleDoubleEsc => {
            mark_title_end = end;
        }
        It::Leading | It::AfterTag | It::LeadingLink | It::LeadingTitle | It::End => {}
    }

    Some(MDLinkRef {
        prefix: &txt[..mark_prefix_end],
        tag: &txt[mark_tag_start..mark_tag_end],
        link: &txt[mark_link_start..mark_link_end],
        title: &txt[mark_title_start..mark_title_end],
        suffix: &txt[mark_title_end..],
    })
}
305
/// One list item.
#[derive(Debug)]
pub struct MDItem<'a> {
    /// Text in front of the list marker.
    pub prefix: &'a str,
    /// Byte range covering `mark` and `mark_suffix`, shifted by `relocate`.
    pub mark_bytes: Range<usize>,
    /// The marker itself: `+`, `-`, `*` or the digits of an ordered item.
    pub mark: &'a str,
    /// The `.` or `)` after an ordered marker; empty for bullet markers.
    pub mark_suffix: &'a str,
    /// Numeric value of an ordered marker; `None` for bullet markers.
    pub mark_nr: Option<usize>,
    /// Spaces/tabs between the marker and the item text.
    pub text_prefix: &'a str,
    /// Byte range of `text`, shifted by `relocate`.
    pub text_bytes: Range<usize>,
    /// The item text up to the end of the input.
    pub text: &'a str,
}

/// Parse a single list item into marker and text.
///
/// * relocate - start offset of txt. It is added to the returned byte ranges.
/// * txt - the list item.
///
/// Returns `None` if the first non-blank byte is neither a bullet marker
/// (`+`, `-`, `*`) nor a digit, if the digits of an ordered marker are followed
/// by anything other than `.` or `)`, or if the number does not fit into a
/// `usize`.
pub fn parse_md_item(relocate: usize, txt: &str) -> Option<MDItem<'_>> {
    let mut mark_byte = 0;
    let mut mark_suffix_byte = 0;
    let mut text_prefix_byte = 0;
    let mut text_byte = 0;

    let mut mark_nr = None;

    #[derive(Debug, Clone, Copy, PartialEq)]
    enum It {
        Leading,
        OrderedMark,
        TextLeading,
    }

    let mut state = It::Leading;
    for (idx, c) in txt.bytes().enumerate() {
        match state {
            It::Leading => match c {
                b'+' | b'-' | b'*' => {
                    mark_byte = idx;
                    mark_suffix_byte = idx + 1;
                    text_prefix_byte = idx + 1;
                    text_byte = idx + 1;
                    state = It::TextLeading;
                }
                b'0'..=b'9' => {
                    mark_byte = idx;
                    state = It::OrderedMark;
                }
                b' ' | b'\t' => {}
                _ => return None,
            },
            It::OrderedMark => match c {
                b'0'..=b'9' => {}
                b'.' | b')' => {
                    mark_suffix_byte = idx;
                    text_prefix_byte = idx + 1;
                    text_byte = idx + 1;
                    // Only digits were accepted, so the parse can fail solely on
                    // overflow. Such a number is not a usable list marker;
                    // reject the item instead of panicking.
                    mark_nr = Some(txt[mark_byte..mark_suffix_byte].parse::<usize>().ok()?);
                    state = It::TextLeading;
                }
                _ => return None,
            },
            It::TextLeading => {
                if c == b' ' || c == b'\t' {
                    // Keep the text start behind the blanks, so trailing blanks
                    // of an empty item count as text_prefix and not as text.
                    text_byte = idx + 1;
                } else {
                    text_byte = idx;
                    break;
                }
            }
        }
    }

    Some(MDItem {
        prefix: &txt[0..mark_byte],
        mark_bytes: relocate + mark_byte..relocate + text_prefix_byte,
        mark: &txt[mark_byte..mark_suffix_byte],
        mark_suffix: &txt[mark_suffix_byte..text_prefix_byte],
        mark_nr,
        text_prefix: &txt[text_prefix_byte..text_byte],
        text_bytes: relocate + text_byte..relocate + txt.len(),
        text: &txt[text_byte..],
    })
}
398
/// One table cell.
#[derive(Debug)]
pub struct MDCell<'a> {
    /// Text of the cell, without the delimiting `|`.
    pub txt: &'a str,
    /// Range of the cell within the row, counted in graphemes.
    pub txt_graphemes: Range<upos_type>,
    /// Byte range of the cell, shifted by the `relocate` offset given to the parser.
    pub txt_bytes: Range<usize>,
}
406
/// One table row.
#[derive(Debug)]
pub struct MDRow<'a> {
    /// Cells of the row in the order they appear in the text.
    pub row: Vec<MDCell<'a>>,
    /// Index into `row` of the cell that contains the cursor.
    pub cursor_cell: usize,
    /// Cursor offset into that cell, counted in graphemes.
    pub cursor_offset: upos_type,
    /// Cursor offset into that cell, counted in bytes.
    pub cursor_byte_offset: usize,
}
418
419/// Split single row. Translate x-position to cell+cell_offset.
420/// __info__: returns the string before the first | and the string after the last | too!!
421pub fn parse_md_row(relocate: usize, txt: &str, x: upos_type) -> MDRow<'_> {
422    let mut tmp = MDRow {
423        row: Default::default(),
424        cursor_cell: 0,
425        cursor_offset: 0,
426        cursor_byte_offset: 0,
427    };
428
429    let mut grapheme_start = 0;
430    let mut grapheme_last = 0;
431    let mut esc = false;
432    let mut cell_offset = 0;
433    let mut cell_byte_start = 0;
434    for (idx, (byte_idx, c)) in txt.grapheme_indices(true).enumerate() {
435        if idx == x as usize {
436            tmp.cursor_cell = tmp.row.len();
437            tmp.cursor_offset = cell_offset;
438            tmp.cursor_byte_offset = byte_idx - cell_byte_start;
439        }
440
441        if c == "\\" {
442            cell_offset += 1;
443            esc = true;
444        } else if c == "|" && !esc {
445            cell_offset = 0;
446            tmp.row.push(MDCell {
447                txt: &txt[cell_byte_start..byte_idx],
448                txt_graphemes: grapheme_start..idx as upos_type,
449                txt_bytes: relocate + cell_byte_start..relocate + byte_idx,
450            });
451            cell_byte_start = byte_idx + 1;
452            grapheme_start = idx as upos_type + 1;
453        } else {
454            cell_offset += 1;
455            esc = false;
456        }
457
458        grapheme_last = idx as upos_type;
459    }
460
461    tmp.row.push(MDCell {
462        txt: &txt[cell_byte_start..txt.len()],
463        txt_graphemes: grapheme_start..grapheme_last,
464        txt_bytes: relocate + cell_byte_start..relocate + txt.len(),
465    });
466
467    tmp
468}
469
/// Quoted text
#[derive(Debug)]
pub struct MDBlockQuote<'a> {
    /// The `>` marker; empty if the input contains no marker.
    pub quote: &'a str,
    /// Spaces/tabs between the marker and the quoted text.
    pub text_prefix: &'a str,
    /// Byte range of `text`, shifted by the `relocate` offset given to the parser.
    pub text_bytes: Range<usize>,
    /// The quoted text up to the end of the input.
    pub text: &'a str,
}

/// Parse a block-quote.
///
/// The accepted shape is: optional spaces/tabs, `>`, optional spaces/tabs,
/// then the quoted text up to the end of `txt`.
///
/// * relocate - offset of txt. It is added to the returned `text_bytes` range.
/// * txt - the block-quote line.
///
/// Returns `None` if the first non-blank byte is not `>`. Empty or blank-only
/// input yields an empty `quote` with the whole input as `text`.
pub fn parse_md_block_quote(relocate: usize, txt: &str) -> Option<MDBlockQuote<'_>> {
    let mut quote_byte = 0;
    let mut quote_end_byte = 0;
    let mut text_prefix_byte = 0;
    let mut text_byte = 0;

    #[derive(Debug, Clone, Copy, PartialEq)]
    enum It {
        Leading,
        TextLeading,
    }

    let mut state = It::Leading;
    for (idx, c) in txt.bytes().enumerate() {
        let blank = c == b' ' || c == b'\t';
        match state {
            It::Leading => {
                if c == b'>' {
                    quote_byte = idx;
                    quote_end_byte = idx + 1;
                    text_prefix_byte = idx + 1;
                    // Start the text right behind the marker. Without this a
                    // quote that ends here would slice text_prefix with
                    // start > end.
                    text_byte = idx + 1;
                    state = It::TextLeading;
                } else if !blank {
                    return None;
                }
            }
            It::TextLeading => {
                if blank {
                    text_byte = idx + 1;
                } else {
                    text_byte = idx;
                    break;
                }
            }
        }
    }

    Some(MDBlockQuote {
        quote: &txt[quote_byte..quote_end_byte],
        text_prefix: &txt[text_prefix_byte..text_byte],
        text_bytes: relocate + text_byte..relocate + txt.len(),
        text: &txt[text_byte..],
    })
}