pulldown_cmark/
scanners.rs

1// Copyright 2015 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Scanners for fragments of CommonMark syntax
22
23use std::char;
24
25use crate::parse::HtmlScanGuard;
26pub(crate) use crate::puncttable::{is_ascii_punctuation, is_punctuation};
27use crate::strings::CowStr;
28use crate::{entities, BlockQuoteKind, HeadingLevel};
29use crate::{Alignment, LinkType};
30
31use memchr::memchr;
32
/// Table of lower-case HTML tag names.
///
/// The entries are in ascending byte order so that the table can be probed
/// with a binary search; keep it sorted when adding names.
#[rustfmt::skip]
const HTML_TAGS: [&str; 62] = [
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "search", "section", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
98
/// Analysis of the beginning of a line, including indentation and container
/// markers.
///
/// The scanning methods clone this value to take a snapshot and assign the
/// clone back to undo a failed scan, which is why it derives `Clone`.
#[derive(Clone)]
pub(crate) struct LineStart<'a> {
    // The bytes being scanned.
    bytes: &'a [u8],
    // Index into `bytes` of the next byte to be scanned.
    ix: usize,

    // The index in `bytes` after the last tab we scanned; initially
    // zero.
    //
    // Thus, there are no tab characters between `ix` and here, and for
    // the purpose of defining block structure, this position can be
    // considered to fall on a tab stop.
    //
    // This is only valid while scanning the initial portion of the
    // line; methods that work with interior structure don't bother to
    // update it.
    tab_start: usize,

    // In contexts where spaces help to define block structure, tabs
    // behave as if they were replaced by spaces with a tab stop of 4
    // characters.
    //
    // If we have scanned past a tab character but not consumed all
    // the horizontal width it contributed, this is the number of
    // spaces logically remaining, before the character at `ix`.
    spaces_remaining: usize,

    // No thematic break can occur before this offset. Remembering it
    // avoids rescanning the same bytes for a thematic break over and
    // over.
    min_hrule_offset: usize,
}
131
132impl<'a> LineStart<'a> {
133    pub(crate) fn new(bytes: &[u8]) -> LineStart<'_> {
134        LineStart {
135            bytes,
136            tab_start: 0,
137            ix: 0,
138            spaces_remaining: 0,
139            min_hrule_offset: 0,
140        }
141    }
142
143    /// Try to scan a number of spaces.
144    ///
145    /// Returns true if all spaces were consumed.
146    ///
147    /// Note: consumes some spaces even if not successful.
148    pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
149        self.scan_space_inner(n_space) == 0
150    }
151
152    /// Scan a number of spaces up to a maximum.
153    ///
154    /// Returns number of spaces scanned.
155    pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
156        n_space - self.scan_space_inner(n_space)
157    }
158
159    /// Returns unused remainder of spaces.
160    fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
161        // Consume any common prefix between the number of spaces we
162        // want and the number of unscanned tab-introduced spaces.
163        let n_from_remaining = self.spaces_remaining.min(n_space);
164        self.spaces_remaining -= n_from_remaining;
165        n_space -= n_from_remaining;
166
167        while n_space > 0 && self.ix < self.bytes.len() {
168            match self.bytes[self.ix] {
169                b' ' => {
170                    self.ix += 1;
171                    n_space -= 1;
172                }
173                b'\t' => {
174                    let spaces = 4 - (self.ix - self.tab_start) % 4;
175                    self.ix += 1;
176                    self.tab_start = self.ix;
177                    let n = spaces.min(n_space);
178                    n_space -= n;
179
180                    // Record the unscanned portion of the tab.
181                    self.spaces_remaining = spaces - n;
182                }
183                _ => break,
184            }
185        }
186        n_space
187    }
188
189    /// Scan all available ASCII whitespace (not including eol).
190    pub(crate) fn scan_all_space(&mut self) {
191        self.spaces_remaining = 0;
192        self.ix += self.bytes[self.ix..]
193            .iter()
194            .take_while(|&&b| b == b' ' || b == b'\t')
195            .count();
196    }
197
198    /// Determine whether we're at end of line (includes end of file).
199    pub(crate) fn is_at_eol(&self) -> bool {
200        self.bytes
201            .get(self.ix)
202            .map(|&c| c == b'\r' || c == b'\n')
203            .unwrap_or(true)
204    }
205
206    fn scan_ch(&mut self, c: u8) -> bool {
207        if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
208            self.ix += 1;
209            true
210        } else {
211            false
212        }
213    }
214
215    fn scan_case_insensitive(&mut self, tag: &[u8]) -> bool {
216        if self.bytes.len() - self.ix < tag.len() {
217            return false;
218        }
219        let prefix = &self.bytes[self.ix..self.ix + tag.len()];
220        let ok = prefix.eq_ignore_ascii_case(tag);
221        if ok {
222            self.ix += tag.len();
223        }
224        ok
225    }
226
227    pub(crate) fn scan_blockquote_tag(&mut self) -> Option<BlockQuoteKind> {
228        let saved_ix = self.ix;
229        let tag = if self.scan_ch(b'[') && self.scan_ch(b'!') {
230            let tag = if self.scan_case_insensitive(b"note") {
231                Some(BlockQuoteKind::Note)
232            } else if self.scan_case_insensitive(b"tip") {
233                Some(BlockQuoteKind::Tip)
234            } else if self.scan_case_insensitive(b"important") {
235                Some(BlockQuoteKind::Important)
236            } else if self.scan_case_insensitive(b"warning") {
237                Some(BlockQuoteKind::Warning)
238            } else if self.scan_case_insensitive(b"caution") {
239                Some(BlockQuoteKind::Caution)
240            } else {
241                None
242            };
243            if tag.is_some() && self.scan_ch(b']') {
244                if let Some(nl) = scan_blank_line(&self.bytes[self.ix..]) {
245                    self.ix += nl;
246                    tag
247                } else {
248                    None
249                }
250            } else {
251                None
252            }
253        } else {
254            None
255        };
256        if tag.is_none() {
257            self.ix = saved_ix;
258        }
259        tag
260    }
261
262    pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
263        if self.scan_ch(b'>') {
264            let _ = self.scan_space(1);
265            true
266        } else {
267            false
268        }
269    }
270
271    /// Scan a definition marker.
272    ///
273    /// Definition markers are single colons, preceded by at most three spaces
274    /// and followed by at most three spaces. The indentation of following
275    /// lines is equal to the whole size of the marker, including the colon.
276    ///
277    /// If one is found, it will make the preceding paragraph into a definition
278    /// list title.
279    ///
280    /// Return value is the amount of indentation, or `None` if it's not a
281    /// definition list marker.
282    pub(crate) fn scan_definition_list_definition_marker_with_indent(
283        &mut self,
284        indent: usize,
285    ) -> Option<usize> {
286        let save = self.clone();
287        if self.scan_ch(b':') {
288            let save = self.clone();
289            if self.scan_space(5) {
290                *self = save;
291                Some(indent + 1 + self.scan_space_upto(1))
292            } else {
293                *self = save;
294                Some(indent + 1 + self.scan_space_upto(5))
295            }
296        } else {
297            *self = save;
298            None
299        }
300    }
301
302    /// Scan a list marker.
303    ///
304    /// Return value is the character, the start index, and the indent in spaces.
305    /// For ordered list markers, the character will be one of b'.' or b')'. For
306    /// bullet list markers, it will be one of b'-', b'+', or b'*'.
307    pub(crate) fn scan_list_marker_with_indent(
308        &mut self,
309        indent: usize,
310    ) -> Option<(u8, u64, usize)> {
311        let save = self.clone();
312        if self.ix < self.bytes.len() {
313            let c = self.bytes[self.ix];
314            if c == b'-' || c == b'+' || c == b'*' {
315                if self.ix >= self.min_hrule_offset {
316                    // there could be an hrule here
317                    if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
318                        self.min_hrule_offset = min_offset;
319                    } else {
320                        *self = save;
321                        return None;
322                    }
323                }
324                self.ix += 1;
325                if self.scan_space(1) || self.is_at_eol() {
326                    return self.finish_list_marker(c, 0, indent + 2);
327                }
328            } else if c.is_ascii_digit() {
329                let start_ix = self.ix;
330                let mut ix = self.ix + 1;
331                let mut val = u64::from(c - b'0');
332                while ix < self.bytes.len() && ix - start_ix < 10 {
333                    let c = self.bytes[ix];
334                    ix += 1;
335                    if c.is_ascii_digit() {
336                        val = val * 10 + u64::from(c - b'0');
337                    } else if c == b')' || c == b'.' {
338                        self.ix = ix;
339                        if self.scan_space(1) || self.is_at_eol() {
340                            return self.finish_list_marker(c, val, indent + 1 + ix - start_ix);
341                        } else {
342                            break;
343                        }
344                    } else {
345                        break;
346                    }
347                }
348            }
349        }
350        *self = save;
351        None
352    }
353
354    fn finish_list_marker(
355        &mut self,
356        c: u8,
357        start: u64,
358        mut indent: usize,
359    ) -> Option<(u8, u64, usize)> {
360        let save = self.clone();
361
362        // skip the rest of the line if it's blank
363        if scan_blank_line(&self.bytes[self.ix..]).is_some() {
364            return Some((c, start, indent));
365        }
366
367        let post_indent = self.scan_space_upto(4);
368        if post_indent < 4 {
369            indent += post_indent;
370        } else {
371            *self = save;
372        }
373        Some((c, start, indent))
374    }
375
376    /// Returns Some(is_checked) when a task list marker was found. Resets itself
377    /// to original state otherwise.
378    pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
379        let save = self.clone();
380        self.scan_space_upto(3);
381
382        if !self.scan_ch(b'[') {
383            *self = save;
384            return None;
385        }
386        let is_checked = match self.bytes.get(self.ix) {
387            Some(&c) if is_ascii_whitespace_no_nl(c) => {
388                self.ix += 1;
389                false
390            }
391            Some(b'x') | Some(b'X') => {
392                self.ix += 1;
393                true
394            }
395            _ => {
396                *self = save;
397                return None;
398            }
399        };
400        if !self.scan_ch(b']') {
401            *self = save;
402            return None;
403        }
404        if !self
405            .bytes
406            .get(self.ix)
407            .map(|&b| is_ascii_whitespace(b))
408            .unwrap_or(false)
409        {
410            *self = save;
411            return None;
412        }
413        Some(is_checked)
414    }
415
    /// Returns the current scan position, i.e. the number of bytes of the
    /// input that lie before it.
    pub(crate) fn bytes_scanned(&self) -> usize {
        self.ix
    }
419
    /// Returns the number of columns of a scanned tab that have not been
    /// consumed yet.
    pub(crate) fn remaining_space(&self) -> usize {
        self.spaces_remaining
    }
423}
424
/// Returns true for tab, line feed, vertical tab, form feed, carriage return
/// and space.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    matches!(c, b'\t' | b'\n' | 0x0b | 0x0c | b'\r' | b' ')
}
428
/// Returns true for tab, vertical tab, form feed and space, i.e. ASCII
/// whitespace other than `\n` and `\r`.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    matches!(c, b'\t' | 0x0b | 0x0c | b' ')
}
432
/// Returns true for the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z')
}
436
/// Returns true for ASCII letters and ASCII decimal digits.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
440
441fn is_ascii_letterdigitdash(c: u8) -> bool {
442    c == b'-' || is_ascii_alphanumeric(c)
443}
444
/// Returns true for the ASCII decimal digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    matches!(c, b'0'..=b'9')
}
448
/// Returns true if `c` may appear in an unquoted HTML attribute value.
///
/// CommonMark defines an unquoted attribute value as a nonempty string of
/// characters not including spaces, tabs, line endings, `"`, `'`, `=`, `<`,
/// `>`, or `` ` ``. A tab therefore ends the value just like a space does,
/// so that `<a b=c<TAB>d=e>` is read as two attributes.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    !matches!(
        c,
        b'\'' | b'"' | b' ' | b'\t' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r'
    )
}
455
/// Scans a single byte: returns 1 if `data` starts with `c`, otherwise 0.
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    usize::from(data.first() == Some(&c))
}
464
/// Returns the length of the longest prefix of `data` whose bytes all
/// satisfy `f`.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter().position(|&c| !f(c)).unwrap_or(data.len())
}
471
/// Returns the length of the longest suffix of `data` whose bytes all
/// satisfy `f`, testing bytes from the end backwards.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .rev()
        .position(|&c| !f(c))
        .unwrap_or(data.len())
}
478
/// Returns how many consecutive copies of `c` start `data`.
pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
    data.iter().take_while(|&&b| b == c).count()
}
482
/// Returns the number of leading bytes of `data` that are ASCII whitespace
/// other than line endings (as decided by `is_ascii_whitespace_no_nl`).
///
/// Note: this scans ASCII whitespace only, for Unicode whitespace use
/// a different function.
pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
    scan_while(data, is_ascii_whitespace_no_nl)
}
488
/// Returns the number of leading bytes of `data` that are allowed in an
/// unquoted attribute value (as decided by
/// `is_valid_unquoted_attr_value_char`).
fn scan_attr_value_chars(data: &[u8]) -> usize {
    scan_while(data, is_valid_unquoted_attr_value_char)
}
492
/// Scans a line ending at the start of `bytes`.
///
/// Returns the length of the line ending: 0 at end of input, 1 for `\n` or a
/// lone `\r`, 2 for `\r\n`. Returns `None` if `bytes` starts with anything
/// else.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes.first() {
        None => Some(0),
        Some(b'\n') => Some(1),
        Some(b'\r') => Some(if bytes.get(1) == Some(&b'\n') { 2 } else { 1 }),
        Some(_) => None,
    }
}
502
503pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
504    let i = scan_whitespace_no_nl(bytes);
505    scan_eol(&bytes[i..]).map(|n| i + n)
506}
507
508pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
509    memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
510}
511
512// return: end byte for closing code fence, or None
513// if the line is not a closing code fence
514pub(crate) fn scan_closing_code_fence(
515    bytes: &[u8],
516    fence_char: u8,
517    n_fence_char: usize,
518) -> Option<usize> {
519    if bytes.is_empty() {
520        return Some(0);
521    }
522    let mut i = 0;
523    let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
524    if num_fence_chars_found < n_fence_char {
525        return None;
526    }
527    i += num_fence_chars_found;
528    let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
529    i += num_trailing_spaces;
530    scan_eol(&bytes[i..]).map(|_| i)
531}
532
533// return: end byte for closing metadata block, or None
534// if the line is not a closing metadata block
535pub(crate) fn scan_closing_metadata_block(bytes: &[u8], fence_char: u8) -> Option<usize> {
536    let mut i = 0;
537    let mut num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
538    if num_fence_chars_found != 3 {
539        // if YAML style metadata block the closing character can also be `.`
540        if fence_char == b'-' {
541            num_fence_chars_found = scan_ch_repeat(&bytes[i..], b'.');
542            if num_fence_chars_found != 3 {
543                return None;
544            }
545        } else {
546            return None;
547        }
548    }
549    i += num_fence_chars_found;
550    let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
551    i += num_trailing_spaces;
552    scan_eol(&bytes[i..]).map(|_| i)
553}
554
/// Measures leading indentation made of spaces and tabs, stopping once `max`
/// columns are reached.
///
/// Returns `(offset, spaces)`. `spaces` is the indentation width in columns,
/// with tabs advancing to the next multiple of four. `offset` is the index
/// of the last byte that was examined: the byte that ended the scan or, if
/// the scan ran to the end of `text`, the final byte. It is 0 for empty
/// input.
pub(crate) fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut offset = 0;
    let mut spaces = 0;

    for (pos, &byte) in text.iter().enumerate() {
        offset = pos;
        if byte == b' ' {
            spaces += 1;
            if spaces == max {
                break;
            }
        } else if byte == b'\t' {
            // Round up to the next tab stop; same as `spaces + 4 - (spaces & 3)`.
            let next_stop = (spaces | 3) + 1;
            if next_stop > max {
                break;
            }
            spaces = next_stop;
        } else {
            break;
        }
    }

    (offset, spaces)
}
582
583/// Scan hrule opening sequence.
584///
585/// Returns Ok(x) when it finds an hrule, where x is the
586/// size of line containing the hrule, including the trailing newline.
587///
588/// Returns Err(x) when it does not find an hrule and x is
589/// the offset in data before no hrule can appear.
590pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
591    if bytes.len() < 3 {
592        return Err(0);
593    }
594    let c = bytes[0];
595    if !(c == b'*' || c == b'-' || c == b'_') {
596        return Err(0);
597    }
598    let mut n = 0;
599    let mut i = 0;
600
601    while i < bytes.len() {
602        match bytes[i] {
603            b'\n' | b'\r' => {
604                i += scan_eol(&bytes[i..]).unwrap_or(0);
605                break;
606            }
607            c2 if c2 == c => {
608                n += 1;
609            }
610            b' ' | b'\t' => (),
611            _ => return Err(i),
612        }
613        i += 1;
614    }
615    if n >= 3 {
616        Ok(i)
617    } else {
618        Err(i)
619    }
620}
621
622/// Scan an ATX heading opening sequence.
623///
624/// Returns number of bytes in prefix and level.
625pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<HeadingLevel> {
626    let level = scan_ch_repeat(data, b'#');
627    if data.get(level).copied().map_or(true, is_ascii_whitespace) {
628        HeadingLevel::try_from(level).ok()
629    } else {
630        None
631    }
632}
633
634/// Scan a setext heading underline.
635///
636/// Returns number of bytes in line (including trailing newline) and level.
637pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, HeadingLevel)> {
638    let c = *data.first()?;
639    let level = if c == b'=' {
640        HeadingLevel::H1
641    } else if c == b'-' {
642        HeadingLevel::H2
643    } else {
644        return None;
645    };
646    let mut i = 1 + scan_ch_repeat(&data[1..], c);
647    i += scan_blank_line(&data[i..])?;
648    Some((i, level))
649}
650
651// returns number of bytes in line (including trailing
652// newline) and column alignments
653pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
654    let (mut i, spaces) = calc_indent(data, 4);
655    if spaces > 3 || i == data.len() {
656        return (0, vec![]);
657    }
658    let mut cols = vec![];
659    let mut active_col = Alignment::None;
660    let mut start_col = true;
661    let mut found_pipe = false;
662    let mut found_hyphen = false;
663    let mut found_hyphen_in_col = false;
664    if data[i] == b'|' {
665        i += 1;
666        found_pipe = true;
667    }
668    for c in &data[i..] {
669        if let Some(n) = scan_eol(&data[i..]) {
670            i += n;
671            break;
672        }
673        match *c {
674            b' ' => (),
675            b':' => {
676                active_col = match (start_col, active_col) {
677                    (true, Alignment::None) => Alignment::Left,
678                    (false, Alignment::Left) => Alignment::Center,
679                    (false, Alignment::None) => Alignment::Right,
680                    _ => active_col,
681                };
682                start_col = false;
683            }
684            b'-' => {
685                start_col = false;
686                found_hyphen = true;
687                found_hyphen_in_col = true;
688            }
689            b'|' => {
690                start_col = true;
691                found_pipe = true;
692                cols.push(active_col);
693                active_col = Alignment::None;
694                if !found_hyphen_in_col {
695                    // It isn't a table head if it has back-to-back pipes.
696                    return (0, vec![]);
697                }
698                found_hyphen_in_col = false;
699            }
700            _ => {
701                // It isn't a table head if it has characters outside the allowed set.
702                return (0, vec![]);
703            }
704        }
705        i += 1;
706    }
707
708    if !start_col {
709        cols.push(active_col);
710    }
711    if !found_pipe || !found_hyphen {
712        // It isn't a table head if it doesn't have a least one pipe or hyphen.
713        // It's a list, a header, or a thematic break.
714        return (0, vec![]);
715    }
716
717    (i, cols)
718}
719
720/// Scan code fence.
721///
722/// Returns number of bytes scanned and the char that is repeated to make the code fence.
723pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
724    let c = *data.first()?;
725    if !(c == b'`' || c == b'~') {
726        return None;
727    }
728    let i = 1 + scan_ch_repeat(&data[1..], c);
729    if i >= 3 {
730        if c == b'`' {
731            let suffix = &data[i..];
732            let next_line = i + scan_nextline(suffix);
733            // FIXME: make sure this is correct
734            if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
735                return None;
736            }
737        }
738        Some((i, c))
739    } else {
740        None
741    }
742}
743
744/// Scan metadata block, returning the number of delimiter bytes
745/// (always 3 for now) and the delimiter character.
746///
747/// Differently to code blocks, metadata blocks must be closed with the closing
748/// sequence not being a valid terminator the end of the file.
749///
750/// In addition, they cannot be empty (closing sequence in the next line) and
751/// the next line cannot be an empty line.
752pub(crate) fn scan_metadata_block(
753    data: &[u8],
754    yaml_style_enabled: bool,
755    pluses_style_enabled: bool,
756) -> Option<(usize, u8)> {
757    // Only if metadata blocks are enabled
758    if yaml_style_enabled || pluses_style_enabled {
759        let c = *data.first()?;
760        if !((c == b'-' && yaml_style_enabled) || (c == b'+' && pluses_style_enabled)) {
761            return None;
762        }
763        let i = 1 + scan_ch_repeat(&data[1..], c);
764        // Only trailing spaces after the delimiters in the line
765        let next_line = scan_nextline(&data[i..]);
766        for c in &data[i..i + next_line] {
767            if !c.is_ascii_whitespace() {
768                return None;
769            }
770        }
771        if i == 3 {
772            // Search the closing sequence
773            let mut j = i;
774            let mut first_line = true;
775            while j < data.len() {
776                j += scan_nextline(&data[j..]);
777                let closed = scan_closing_metadata_block(&data[j..], c).is_some();
778                // The first line of the metadata block cannot be an empty line
779                // nor the end of the block
780                if first_line {
781                    if closed || scan_blank_line(&data[j..]).is_some() {
782                        return None;
783                    }
784                    first_line = false;
785                }
786                if closed {
787                    return Some((i, c));
788                }
789            }
790            None
791        } else {
792            None
793        }
794    } else {
795        None
796    }
797}
798
/// Scans a `>` blockquote marker with an optional single space after it.
///
/// Returns the number of bytes in the marker (1 or 2), or `None` if `data`
/// does not start with `>`.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    match data {
        &[b'>', b' ', ..] => Some(2),
        &[b'>', ..] => Some(1),
        _ => None,
    }
}
811
812/// return number of bytes scanned, delimiter, start index, and indent
813pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
814    let mut c = *bytes.first()?;
815    let (w, start) = match c {
816        b'-' | b'+' | b'*' => (1, 0),
817        b'0'..=b'9' => {
818            let (length, start) = parse_decimal(bytes, 9);
819            c = *bytes.get(length)?;
820            if !(c == b'.' || c == b')') {
821                return None;
822            }
823            (length + 1, start)
824        }
825        _ => {
826            return None;
827        }
828    };
829    // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
830    let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
831    if postindent == 0 {
832        scan_eol(&bytes[w..])?;
833        postindent += 1;
834    } else if postindent > 4 {
835        postn = 1;
836        postindent = 1;
837    }
838    if scan_blank_line(&bytes[w..]).is_some() {
839        postn = 0;
840        postindent = 1;
841    }
842    Some((w + postn, c, start, w + postindent))
843}
844
845// returns (number of bytes, parsed decimal)
846fn parse_decimal(bytes: &[u8], limit: usize) -> (usize, usize) {
847    match bytes
848        .iter()
849        .take(limit)
850        .take_while(|&&b| is_digit(b))
851        .try_fold((0, 0usize), |(count, acc), c| {
852            let digit = usize::from(c - b'0');
853            match acc
854                .checked_mul(10)
855                .and_then(|ten_acc| ten_acc.checked_add(digit))
856            {
857                Some(number) => Ok((count + 1, number)),
858                // stop early on overflow
859                None => Err((count, acc)),
860            }
861        }) {
862        Ok(p) | Err(p) => p,
863    }
864}
865
/// Parses leading ASCII hexadecimal digits (either case), reading at most
/// `limit` bytes.
///
/// Returns (number of bytes, parsed hex). Parsing stops at the first byte
/// that is not a hex digit, or just before a digit that would overflow
/// `usize`.
fn parse_hex(bytes: &[u8], limit: usize) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;

    for &b in bytes.iter().take(limit) {
        let digit = match b {
            b'0'..=b'9' => b - b'0',
            b'a'..=b'f' => b - b'a' + 10,
            b'A'..=b'F' => b - b'A' + 10,
            _ => break,
        };
        let widened = value
            .checked_mul(16)
            .and_then(|v| v.checked_add(usize::from(digit)));
        match widened {
            Some(next) => {
                value = next;
                count += 1;
            }
            // stop early on overflow
            None => break,
        }
    }

    (count, value)
}
896
/// Converts a numeric code point to a `char`.
///
/// Returns `None` for zero, for values that do not fit in a `u32`, and for
/// values that are not valid `char`s (such as surrogates).
fn char_from_codepoint(input: usize) -> Option<char> {
    match u32::try_from(input) {
        Ok(0) | Err(_) => None,
        Ok(codepoint) => char::from_u32(codepoint),
    }
}
904
905// doesn't bother to check data[0] == '&'
906pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
907    let mut end = 1;
908    if bytes.get(end) == Some(&b'#') {
909        end += 1;
910        let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
911            end += 1;
912            parse_hex(&bytes[end..], 6)
913        } else {
914            parse_decimal(&bytes[end..], 7)
915        };
916        end += bytecount;
917        return if bytecount == 0 || bytes.get(end) != Some(&b';') {
918            (0, None)
919        } else {
920            (
921                end + 1,
922                Some(char_from_codepoint(codepoint).unwrap_or('\u{FFFD}').into()),
923            )
924        };
925    }
926    end += scan_while(&bytes[end..], is_ascii_alphanumeric);
927    if bytes.get(end) == Some(&b';') {
928        if let Some(value) = entities::get_entity(&bytes[1..end]) {
929            return (end + 1, Some(value.into()));
930        }
931    }
932    (0, None)
933}
934
/// Searches for the first `|` in `data`, starting at `start_ix` and looking
/// at no more than `len` bytes.
///
/// Returns the index just past the pipe and the text between `start_ix` and
/// the pipe, or `None` if there is no pipe in that window.
pub(crate) fn scan_wikilink_pipe(data: &str, start_ix: usize, len: usize) -> Option<(usize, &str)> {
    let bytes = data.as_bytes();
    let end_ix = bytes.len().min(start_ix + len);
    // `get` yields `None` when `start_ix` lies beyond the end of the window.
    let window = bytes.get(start_ix..end_ix)?;
    let pipe_ix = start_ix + window.iter().position(|&b| b == b'|')?;
    Some((pipe_ix + 1, &data[start_ix..pipe_ix]))
}
948
949// note: dest returned is raw, still needs to be unescaped
950// TODO: check that nested parens are really not allowed for refdefs
951// TODO(performance): this func should probably its own unescaping
952pub(crate) fn scan_link_dest(
953    data: &str,
954    start_ix: usize,
955    max_next: usize,
956) -> Option<(usize, &str)> {
957    let bytes = &data.as_bytes()[start_ix..];
958    let mut i = scan_ch(bytes, b'<');
959
960    if i != 0 {
961        // pointy links
962        while i < bytes.len() {
963            match bytes[i] {
964                b'\n' | b'\r' | b'<' => return None,
965                b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
966                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
967                    i += 1;
968                }
969                _ => {}
970            }
971            i += 1;
972        }
973        None
974    } else {
975        // non-pointy links
976        let mut nest = 0;
977        while i < bytes.len() {
978            match bytes[i] {
979                0x0..=0x20 => {
980                    break;
981                }
982                b'(' => {
983                    if nest > max_next {
984                        return None;
985                    }
986                    nest += 1;
987                }
988                b')' => {
989                    if nest == 0 {
990                        break;
991                    }
992                    nest -= 1;
993                }
994                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
995                    i += 1;
996                }
997                _ => {}
998            }
999            i += 1;
1000        }
1001        if nest != 0 {
1002            return None;
1003        }
1004        Some((i, &data[start_ix..(start_ix + i)]))
1005    }
1006}
1007
1008/// Returns bytes scanned
1009fn scan_attribute_name(data: &[u8]) -> Option<usize> {
1010    let (&c, tail) = data.split_first()?;
1011    if is_ascii_alpha(c) || c == b'_' || c == b':' {
1012        Some(
1013            1 + scan_while(tail, |c| {
1014                is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
1015            }),
1016        )
1017    } else {
1018        None
1019    }
1020}
1021
1022/// Returns the index immediately following the attribute on success.
1023/// The argument `buffer_ix` refers to the index into `data` from which we
1024/// should copy into `buffer` when we find bytes to skip.
1025fn scan_attribute(
1026    data: &[u8],
1027    mut ix: usize,
1028    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1029    buffer: &mut Vec<u8>,
1030    buffer_ix: &mut usize,
1031) -> Option<usize> {
1032    ix += scan_attribute_name(&data[ix..])?;
1033    let ix_after_attribute = ix;
1034    ix = scan_whitespace_with_newline_handler_without_buffer(data, ix, newline_handler)?;
1035    if data.get(ix) == Some(&b'=') {
1036        ix = scan_whitespace_with_newline_handler(
1037            data,
1038            ix_after_attribute,
1039            newline_handler,
1040            buffer,
1041            buffer_ix,
1042        )?;
1043        ix += 1;
1044        ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
1045        ix = scan_attribute_value(data, ix, newline_handler, buffer, buffer_ix)?;
1046        Some(ix)
1047    } else {
1048        // Leave whitespace for next attribute.
1049        Some(ix_after_attribute)
1050    }
1051}
1052
1053/// Scans whitespace and possibly newlines according to the
1054/// behavior defined by the newline handler. When bytes are skipped,
1055/// all preceding non-skipped bytes are pushed to the buffer.
1056fn scan_whitespace_with_newline_handler(
1057    data: &[u8],
1058    mut i: usize,
1059    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1060    buffer: &mut Vec<u8>,
1061    buffer_ix: &mut usize,
1062) -> Option<usize> {
1063    while i < data.len() {
1064        if !is_ascii_whitespace(data[i]) {
1065            return Some(i);
1066        }
1067        if let Some(eol_bytes) = scan_eol(&data[i..]) {
1068            let handler = newline_handler?;
1069            i += eol_bytes;
1070            let skipped_bytes = handler(&data[i..]);
1071
1072            if skipped_bytes > 0 {
1073                buffer.extend(&data[*buffer_ix..i]);
1074                *buffer_ix = i + skipped_bytes;
1075            }
1076
1077            i += skipped_bytes;
1078        } else {
1079            i += 1;
1080        }
1081    }
1082
1083    Some(i)
1084}
1085
1086/// Scans whitespace and possible newlines according to the behavior defined
1087/// by the newline handler.
1088///
1089/// Unlike [`scan_whitespace_with_newline_handler`], this function doesn't
1090/// copy skipped data into a buffer. Typically, if this function
1091/// returns `Some`, a call to `scan_whitespace_with_newline_handler` will
1092/// soon follow.
1093fn scan_whitespace_with_newline_handler_without_buffer(
1094    data: &[u8],
1095    mut i: usize,
1096    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1097) -> Option<usize> {
1098    while i < data.len() {
1099        if !is_ascii_whitespace(data[i]) {
1100            return Some(i);
1101        }
1102        if let Some(eol_bytes) = scan_eol(&data[i..]) {
1103            let handler = newline_handler?;
1104            i += eol_bytes;
1105            let skipped_bytes = handler(&data[i..]);
1106            i += skipped_bytes;
1107        } else {
1108            i += 1;
1109        }
1110    }
1111
1112    Some(i)
1113}
1114
1115/// Returns the index immediately following the attribute value on success.
1116fn scan_attribute_value(
1117    data: &[u8],
1118    mut i: usize,
1119    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1120    buffer: &mut Vec<u8>,
1121    buffer_ix: &mut usize,
1122) -> Option<usize> {
1123    match *data.get(i)? {
1124        b @ b'"' | b @ b'\'' => {
1125            i += 1;
1126            while i < data.len() {
1127                if data[i] == b {
1128                    return Some(i + 1);
1129                }
1130                if let Some(eol_bytes) = scan_eol(&data[i..]) {
1131                    let handler = newline_handler?;
1132                    i += eol_bytes;
1133                    let skipped_bytes = handler(&data[i..]);
1134
1135                    if skipped_bytes > 0 {
1136                        buffer.extend(&data[*buffer_ix..i]);
1137                        *buffer_ix = i + skipped_bytes;
1138                    }
1139                    i += skipped_bytes;
1140                } else {
1141                    i += 1;
1142                }
1143            }
1144            return None;
1145        }
1146        b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
1147            return None;
1148        }
1149        _ => {
1150            // unquoted attribute value
1151            i += scan_attr_value_chars(&data[i..]);
1152        }
1153    }
1154
1155    Some(i)
1156}
1157
1158// Remove backslash escapes and resolve entities
1159pub(crate) fn unescape<'a, I: Into<CowStr<'a>>>(input: I, is_in_table: bool) -> CowStr<'a> {
1160    let input = input.into();
1161    let mut result = String::new();
1162    let mut mark = 0;
1163    let mut i = 0;
1164    let bytes = input.as_bytes();
1165    while i < bytes.len() {
1166        match bytes[i..] {
1167            // Tables are special, because they're parsed as-if the tables
1168            // were parsed in a discrete pass, changing `\|` to `|`, and then
1169            // passing the changed string to the inline parser.
1170            [b'\\', b'\\', b'|', ..] if is_in_table => {
1171                // even number of `\`s before pipe
1172                // odd number is handled in the normal way below
1173                result.push_str(&input[mark..i]);
1174                mark = i + 2;
1175                i += 3;
1176            }
1177            [b'\\', cx, ..] if is_ascii_punctuation(cx) => {
1178                result.push_str(&input[mark..i]);
1179                mark = i + 1;
1180                i += 2;
1181            }
1182            [b'&', ..] => match scan_entity(&bytes[i..]) {
1183                (n, Some(value)) => {
1184                    result.push_str(&input[mark..i]);
1185                    result.push_str(&value);
1186                    i += n;
1187                    mark = i;
1188                }
1189                _ => i += 1,
1190            },
1191            [b'\r', ..] => {
1192                result.push_str(&input[mark..i]);
1193                i += 1;
1194                mark = i;
1195            }
1196            _ => i += 1,
1197        }
1198    }
1199    if mark == 0 {
1200        input
1201    } else {
1202        result.push_str(&input[mark..]);
1203        result.into()
1204    }
1205}
1206
1207/// Assumes `data` is preceded by `<`.
1208pub(crate) fn starts_html_block_type_6(data: &[u8]) -> bool {
1209    let i = scan_ch(data, b'/');
1210    let tail = &data[i..];
1211    let n = scan_while(tail, is_ascii_alphanumeric);
1212    if !is_html_tag(&tail[..n]) {
1213        return false;
1214    }
1215    // Starting condition says the next byte must be either a space, a tab,
1216    // the end of the line, the string >, or the string />
1217    let tail = &tail[n..];
1218    tail.is_empty()
1219        || tail[0] == b' '
1220        || tail[0] == b'\t'
1221        || tail[0] == b'\r'
1222        || tail[0] == b'\n'
1223        || tail[0] == b'>'
1224        || tail.len() >= 2 && &tail[..2] == b"/>"
1225}
1226
1227fn is_html_tag(tag: &[u8]) -> bool {
1228    HTML_TAGS
1229        .binary_search_by(|probe| {
1230            let probe_bytes_iter = probe.as_bytes().iter();
1231            let tag_bytes_iter = tag.iter();
1232
1233            probe_bytes_iter
1234                .zip(tag_bytes_iter)
1235                .find_map(|(&a, &b)| {
1236                    // We can compare case insensitively because the probes are
1237                    // all lower case alpha strings.
1238                    match a.cmp(&(b | 0x20)) {
1239                        std::cmp::Ordering::Equal => None,
1240                        inequality => Some(inequality),
1241                    }
1242                })
1243                .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1244        })
1245        .is_ok()
1246}
1247
1248/// Assumes that `data` starts with `<`.
1249/// Returns the index into data directly after the html tag on success.
1250pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1251    // Block type html does not allow for newlines, so we
1252    // do not pass a newline handler.
1253    let (_span, i) = scan_html_block_inner(data, None)?;
1254    scan_blank_line(&data[i..])?;
1255    Some(i)
1256}
1257
1258/// Assumes that `data` starts with `<`.
1259/// Returns the number of bytes scanned and the html in case of
1260/// success.
1261/// When some bytes were skipped, because the html was split over
1262/// multiple leafs (e.g. over multiple lines in a blockquote),
1263/// the html is returned as a vector of bytes.
1264/// If no bytes were skipped, the buffer will be empty.
1265pub(crate) fn scan_html_block_inner(
1266    data: &[u8],
1267    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1268) -> Option<(Vec<u8>, usize)> {
1269    let mut buffer = Vec::new();
1270    let mut last_buf_index = 0;
1271
1272    let close_tag_bytes = scan_ch(&data[1..], b'/');
1273    let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
1274    if l == 0 {
1275        return None;
1276    }
1277    let mut i = 1 + close_tag_bytes + l;
1278    i += scan_while(&data[i..], is_ascii_letterdigitdash);
1279
1280    if close_tag_bytes == 0 {
1281        loop {
1282            let old_i = i;
1283            loop {
1284                i += scan_whitespace_no_nl(&data[i..]);
1285                if let Some(eol_bytes) = scan_eol(&data[i..]) {
1286                    if eol_bytes == 0 {
1287                        return None;
1288                    }
1289                    let handler = newline_handler?;
1290                    i += eol_bytes;
1291                    let skipped_bytes = handler(&data[i..]);
1292
1293                    let data_len = data.len() - i;
1294
1295                    debug_assert!(
1296                        skipped_bytes <= data_len,
1297                        "Handler tried to skip too many bytes, fed {}, skipped {}",
1298                        data_len,
1299                        skipped_bytes
1300                    );
1301
1302                    if skipped_bytes > 0 {
1303                        buffer.extend(&data[last_buf_index..i]);
1304                        i += skipped_bytes;
1305                        last_buf_index = i;
1306                    }
1307                } else {
1308                    break;
1309                }
1310            }
1311            if let Some(b'/') | Some(b'>') = data.get(i) {
1312                break;
1313            }
1314            if old_i == i {
1315                // No whitespace, which is mandatory.
1316                return None;
1317            }
1318            i = scan_attribute(data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
1319        }
1320    }
1321
1322    i += scan_whitespace_no_nl(&data[i..]);
1323
1324    if close_tag_bytes == 0 {
1325        i += scan_ch(&data[i..], b'/');
1326    }
1327
1328    if data.get(i) != Some(&b'>') {
1329        None
1330    } else {
1331        i += 1;
1332        if !buffer.is_empty() {
1333            buffer.extend(&data[last_buf_index..i]);
1334        }
1335        Some((buffer, i))
1336    }
1337}
1338
1339/// Returns (next_byte_offset, uri, type)
1340pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1341    scan_uri(text, start_ix)
1342        .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1343        .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1344}
1345
1346/// Returns (next_byte_offset, uri)
1347fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1348    let bytes = &text.as_bytes()[start_ix..];
1349
1350    // scheme's first byte must be an ascii letter
1351    if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1352        return None;
1353    }
1354
1355    let mut i = 1;
1356
1357    while i < bytes.len() {
1358        let c = bytes[i];
1359        i += 1;
1360        match c {
1361            c if is_ascii_alphanumeric(c) => (),
1362            b'.' | b'-' | b'+' => (),
1363            b':' => break,
1364            _ => return None,
1365        }
1366    }
1367
1368    // scheme length must be between 2 and 32 characters long. scheme
1369    // must be followed by colon
1370    if !(3..=33).contains(&i) {
1371        return None;
1372    }
1373
1374    while i < bytes.len() {
1375        match bytes[i] {
1376            b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1377            b'\0'..=b' ' | b'<' => return None,
1378            _ => (),
1379        }
1380        i += 1;
1381    }
1382
1383    None
1384}
1385
1386/// Returns (next_byte_offset, email)
1387fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1388    // using a regex library would be convenient, but doing it by hand is not too bad
1389    let bytes = &text.as_bytes()[start_ix..];
1390    let mut i = 0;
1391
1392    while i < bytes.len() {
1393        let c = bytes[i];
1394        i += 1;
1395        match c {
1396            c if is_ascii_alphanumeric(c) => (),
1397            b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1398            | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1399            b'@' if i > 1 => break,
1400            _ => return None,
1401        }
1402    }
1403
1404    loop {
1405        let label_start_ix = i;
1406        let mut fresh_label = true;
1407
1408        while i < bytes.len() {
1409            match bytes[i] {
1410                c if is_ascii_alphanumeric(c) => (),
1411                b'-' if fresh_label => {
1412                    return None;
1413                }
1414                b'-' => (),
1415                _ => break,
1416            }
1417            fresh_label = false;
1418            i += 1;
1419        }
1420
1421        if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1422            return None;
1423        }
1424
1425        if bytes.get(i) != Some(&b'.') {
1426            break;
1427        }
1428        i += 1;
1429    }
1430
1431    if bytes.get(i) != Some(&b'>') {
1432        return None;
1433    }
1434
1435    Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1436}
1437
1438/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1439/// Returns byte offset on match.
1440pub(crate) fn scan_inline_html_comment(
1441    bytes: &[u8],
1442    mut ix: usize,
1443    scan_guard: &mut HtmlScanGuard,
1444) -> Option<usize> {
1445    let c = *bytes.get(ix)?;
1446    ix += 1;
1447    match c {
1448        // An HTML comment consists of `<!-->`, `<!--->`, or  `<!--`, a string of characters not
1449        // including the string `-->`, and `-->`.
1450        b'-' if ix > scan_guard.comment => {
1451            // HTML comment needs two hyphens after the !.
1452            if *bytes.get(ix)? != b'-' {
1453                return None;
1454            }
1455            // Yes, we're intentionally going backwards.
1456            // We want the cursor to point here:
1457            //
1458            //     <!--
1459            //       ^
1460            //
1461            // This way, the `<!-->` case is covered by the loop below.
1462            ix -= 1;
1463
1464            while let Some(x) = memchr(b'-', &bytes[ix..]) {
1465                ix += x + 1;
1466                scan_guard.comment = ix;
1467                if bytes.get(ix) == Some(&b'-') && bytes.get(ix + 1) == Some(&b'>') {
1468                    return Some(ix + 2);
1469                }
1470            }
1471            None
1472        }
1473        // A CDATA section consists of the string `<![CDATA[`, a string of characters not
1474        // including the string `]]>`, and the string `]]>`.
1475        b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1476            ix += b"CDATA[".len();
1477            ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1478            let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1479            ix += close_brackets;
1480
1481            if close_brackets == 0 || bytes.get(ix) != Some(&b'>') {
1482                scan_guard.cdata = ix;
1483                None
1484            } else {
1485                Some(ix + 1)
1486            }
1487        }
1488        // A declaration consists of the string `<!`, an ASCII letter, zero or more characters not
1489        // including the character >, and the character >.
1490        _ if c.is_ascii_alphabetic() && ix > scan_guard.declaration => {
1491            ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1492            if bytes.get(ix) != Some(&b'>') {
1493                scan_guard.declaration = ix;
1494                None
1495            } else {
1496                Some(ix + 1)
1497            }
1498        }
1499        _ => None,
1500    }
1501}
1502
1503/// Scan processing directive, with initial "<?" already consumed.
1504/// Returns the next byte offset on success.
1505pub(crate) fn scan_inline_html_processing(
1506    bytes: &[u8],
1507    mut ix: usize,
1508    scan_guard: &mut HtmlScanGuard,
1509) -> Option<usize> {
1510    if ix <= scan_guard.processing {
1511        return None;
1512    }
1513    while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1514        ix += offset + 1;
1515        if bytes.get(ix) == Some(&b'>') {
1516            return Some(ix + 1);
1517        }
1518    }
1519    scan_guard.processing = ix;
1520    None
1521}
1522
#[cfg(test)]
mod test {
    use super::*;

    /// A list number with far too many digits is rejected.
    #[test]
    fn overflow_list() {
        let marker: &[u8] = b"4444444444444444444444444444444444444444444444444444444444!";
        assert!(scan_listitem(marker).is_none());
    }

    /// Same check with a shorter (22-digit) list number.
    #[test]
    fn overflow_by_addition() {
        let marker: &[u8] = b"1844674407370955161615!";
        assert!(scan_listitem(marker).is_none());
    }

    /// Addresses that `scan_email` has to accept; scanning starts after `<`.
    #[test]
    fn good_emails() {
        const VALID: &[&str] = &[
            "<a@b.c>",
            "<a@b>",
            "<a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-@example.com>",
            "<a@sixty-three-letters-in-this-identifier-----------------------63>",
        ];
        for &candidate in VALID {
            assert!(scan_email(candidate, 1).is_some());
        }
    }

    /// Addresses that `scan_email` has to reject; scanning starts after `<`.
    #[test]
    fn bad_emails() {
        const INVALID: &[&str] = &[
            "<@b.c>",
            "<foo@-example.com>",
            "<foo@example-.com>",
            "<a@notrailingperiod.>",
            "<a(noparens)@example.com>",
            "<\"noquotes\"@example.com>",
            "<a@sixty-four-letters-in-this-identifier-------------------------64>",
        ];
        for &candidate in INVALID {
            assert!(scan_email(candidate, 1).is_none());
        }
    }
}
pulldown_cmark/scanners.rs

pulldown_cmark/
scanners.rs