mercurius/parse/
scanners.rs

1// Copyright 2015 Google Inc. All rights reserved.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Scanners for fragments of CommonMark syntax
22
23use std::char;
24use std::convert::TryInto;
25
26use super::entities;
27use super::parse::{Alignment, HtmlScanGuard, LinkType};
28pub use super::puncttable::{is_ascii_punctuation, is_punctuation};
29use super::strings::CowStr;
30
31use memchr::memchr;
32
// Tag names recognised by `is_html_tag`.
//
// The entries must stay lowercase and sorted in ascending byte order:
// `is_html_tag` binary-searches this table and lowercases only the candidate
// tag, never the entries themselves.
const HTML_TAGS: [&str; 62] = [
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
98
/// Analysis of the beginning of a line, including indentation and container
/// markers.
#[derive(Clone)]
pub struct LineStart<'a> {
    // input being scanned
    bytes: &'a [u8],
    // byte offset just past the most recently consumed tab; the width of the
    // next tab is measured relative to this offset
    tab_start: usize,
    // offset of the next unconsumed byte in `bytes`
    ix: usize,
    // columns of an already consumed tab that have not been handed out yet
    spaces_remaining: usize,
    // no thematic breaks can occur before this offset.
    // this prevents scanning over and over up to a certain point
    min_hrule_offset: usize,
}
111
impl<'a> LineStart<'a> {
    /// Creates a scanner positioned at the first byte of `bytes`, with no
    /// pending tab columns.
    pub(crate) fn new(bytes: &[u8]) -> LineStart {
        LineStart {
            bytes,
            tab_start: 0,
            ix: 0,
            spaces_remaining: 0,
            min_hrule_offset: 0,
        }
    }

    /// Try to scan a number of spaces.
    ///
    /// Returns true if all spaces were consumed.
    ///
    /// Note: consumes some spaces even if not successful.
    pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
        self.scan_space_inner(n_space) == 0
    }

    /// Scan a number of spaces up to a maximum.
    ///
    /// Returns number of spaces scanned.
    pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
        n_space - self.scan_space_inner(n_space)
    }

    /// Returns unused remainder of spaces.
    fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
        // First hand out columns left over from a previously consumed tab.
        let n_from_remaining = self.spaces_remaining.min(n_space);
        self.spaces_remaining -= n_from_remaining;
        n_space -= n_from_remaining;
        while n_space > 0 && self.ix < self.bytes.len() {
            match self.bytes[self.ix] {
                b' ' => {
                    self.ix += 1;
                    n_space -= 1;
                }
                b'\t' => {
                    // A tab is worth 1 to 4 columns: it reaches the next
                    // multiple of 4 counted from `tab_start`.
                    let spaces = 4 - (self.ix - self.tab_start) % 4;
                    self.ix += 1;
                    self.tab_start = self.ix;
                    let n = spaces.min(n_space);
                    n_space -= n;
                    // Columns of this tab beyond the request are kept for
                    // later calls.
                    self.spaces_remaining = spaces - n;
                }
                _ => break,
            }
        }
        n_space
    }

    /// Scan all available ASCII whitespace (not including eol).
    ///
    /// Only spaces and tabs are consumed; any pending tab columns are
    /// discarded.
    pub(crate) fn scan_all_space(&mut self) {
        self.spaces_remaining = 0;
        self.ix += self.bytes[self.ix..]
            .iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count();
    }

    /// Determine whether we're at end of line (includes end of file).
    pub(crate) fn is_at_eol(&self) -> bool {
        self.bytes
            .get(self.ix)
            .map(|&c| c == b'\r' || c == b'\n')
            .unwrap_or(true)
    }

    /// Consumes the next byte if it equals `c`; returns whether it did.
    fn scan_ch(&mut self, c: u8) -> bool {
        if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
            self.ix += 1;
            true
        } else {
            false
        }
    }

    /// Scans up to three columns of indentation, a `>` and one optional
    /// following space. Restores the previous state and returns false when
    /// no `>` is found.
    pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
        let save = self.clone();
        let _ = self.scan_space(3);
        if self.scan_ch(b'>') {
            let _ = self.scan_space(1);
            true
        } else {
            *self = save;
            false
        }
    }

    /// Scan a list marker.
    ///
    /// Return value is the character, the start index, and the indent in spaces.
    /// For ordered list markers, the character will be one of b'.' or b')'. For
    /// bullet list markers, it will be one of b'-', b'+', or b'*'.
    ///
    /// The scanner state is restored when `None` is returned.
    pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> {
        let save = self.clone();
        let indent = self.scan_space_upto(3);
        if self.ix < self.bytes.len() {
            let c = self.bytes[self.ix];
            if c == b'-' || c == b'+' || c == b'*' {
                if self.ix >= self.min_hrule_offset {
                    // there could be an hrule here
                    if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
                        // not an hrule: remember the offset reported by
                        // `scan_hrule` so later calls can skip the check
                        self.min_hrule_offset = min_offset;
                    } else {
                        // an hrule wins over a bullet list marker
                        *self = save;
                        return None;
                    }
                }
                self.ix += 1;
                if self.scan_space(1) || self.is_at_eol() {
                    // bullet markers have start number 0; the indent covers
                    // the marker byte plus one column after it
                    return self.finish_list_marker(c, 0, indent + 2);
                }
            } else if c >= b'0' && c <= b'9' {
                let start_ix = self.ix;
                let mut ix = self.ix + 1;
                let mut val = u64::from(c - b'0');
                // At most 9 further bytes are examined, so a marker has at
                // most 9 digits before its delimiter.
                while ix < self.bytes.len() && ix - start_ix < 10 {
                    let c = self.bytes[ix];
                    ix += 1;
                    if c >= b'0' && c <= b'9' {
                        val = val * 10 + u64::from(c - b'0');
                    } else if c == b')' || c == b'.' {
                        self.ix = ix;
                        if self.scan_space(1) || self.is_at_eol() {
                            return self.finish_list_marker(c, val, indent + self.ix - start_ix);
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }
        }
        *self = save;
        None
    }

    /// Completes a list marker scan by accounting for the whitespace that
    /// follows the marker, and builds the result tuple for
    /// `scan_list_marker`.
    fn finish_list_marker(
        &mut self,
        c: u8,
        start: u64,
        mut indent: usize,
    ) -> Option<(u8, u64, usize)> {
        let save = self.clone();

        // the rest of the line is blank: keep the indent as is; nothing
        // further is consumed here
        if scan_blank_line(&self.bytes[self.ix..]).is_some() {
            return Some((c, start, indent));
        }

        let post_indent = self.scan_space_upto(4);
        if post_indent < 4 {
            indent += post_indent;
        } else {
            // four or more columns follow the marker: un-consume them and
            // leave the indent unchanged
            *self = save;
        }
        Some((c, start, indent))
    }

    /// Returns Some(is_checked) when a task list marker was found. Resets itself
    /// to original state otherwise.
    pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
        let save = self.clone();
        self.scan_space_upto(3);

        if !self.scan_ch(b'[') {
            *self = save;
            return None;
        }
        // between the brackets: whitespace means unchecked, `x`/`X` checked
        let is_checked = match self.bytes.get(self.ix) {
            Some(&c) if is_ascii_whitespace_no_nl(c) => {
                self.ix += 1;
                false
            }
            Some(b'x') | Some(b'X') => {
                self.ix += 1;
                true
            }
            _ => {
                *self = save;
                return None;
            }
        };
        if !self.scan_ch(b']') {
            *self = save;
            return None;
        }
        // the closing bracket must be followed by non-newline whitespace,
        // which is checked but not consumed
        if !self
            .bytes
            .get(self.ix)
            .map(|&b| is_ascii_whitespace_no_nl(b))
            .unwrap_or(false)
        {
            *self = save;
            return None;
        }
        Some(is_checked)
    }

    /// Number of bytes consumed so far.
    pub(crate) fn bytes_scanned(&self) -> usize {
        self.ix
    }

    /// Columns of the last consumed tab that have not been handed out yet.
    pub(crate) fn remaining_space(&self) -> usize {
        self.spaces_remaining
    }
}
322
/// Returns true for the bytes 0x09 through 0x0d (tab, line feed, vertical
/// tab, form feed, carriage return) and for the space character.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    match c {
        0x09..=0x0d | b' ' => true,
        _ => false,
    }
}
326
/// Returns true for ASCII whitespace that does not end a line: tab, vertical
/// tab (0x0b), form feed (0x0c) and space.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    match c {
        b'\t' | 0x0b | 0x0c | b' ' => true,
        _ => false,
    }
}
330
/// Returns true for the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
337
/// Returns true for the ASCII digits `0`-`9` and the ASCII letters `a`-`z`
/// and `A`-`Z`.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
344
345fn is_ascii_letterdigitdash(c: u8) -> bool {
346    c == b'-' || is_ascii_alphanumeric(c)
347}
348
/// Returns true for the ASCII digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
352
/// Returns true when `c` may appear in an unquoted attribute value, i.e. it
/// is none of: single quote, double quote, space, `=`, `>`, `<`, backtick,
/// line feed, carriage return.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    const REJECTED: &[u8] = b"'\" =><`\n\r";
    !REJECTED.contains(&c)
}
359
// Scan a single character: returns 1 when `data` starts with `c`, and 0
// otherwise (including when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    match data.first() {
        Some(&first) if first == c => 1,
        _ => 0,
    }
}
368
/// Returns the length of the longest prefix of `data` whose bytes all
/// satisfy `f`.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
375
/// Returns the length of the longest suffix of `data` whose bytes all
/// satisfy `f`; bytes are tested from the end towards the front.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .rev()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
382
383pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
384    scan_while(data, |x| x == c)
385}
386
387// Note: this scans ASCII whitespace only, for Unicode whitespace use
388// a different function.
389pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
390    scan_while(data, is_ascii_whitespace_no_nl)
391}
392
393fn scan_attr_value_chars(data: &[u8]) -> usize {
394    scan_while(data, is_valid_unquoted_attr_value_char)
395}
396
/// Scans a line ending at the start of `bytes`.
///
/// Returns `Some(0)` for empty input, `Some(1)` for `\n` or a lone `\r`,
/// `Some(2)` for `\r\n`, and `None` when `bytes` starts with anything else.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes.first() {
        None => Some(0),
        Some(&b'\n') => Some(1),
        Some(&b'\r') => {
            let crlf = bytes.len() > 1 && bytes[1] == b'\n';
            Some(if crlf { 2 } else { 1 })
        }
        Some(_) => None,
    }
}
407
408pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
409    let i = scan_whitespace_no_nl(bytes);
410    scan_eol(&bytes[i..]).map(|n| i + n)
411}
412
413pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
414    memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
415}
416
417// return: end byte for closing code fence, or None
418// if the line is not a closing code fence
419pub(crate) fn scan_closing_code_fence(
420    bytes: &[u8],
421    fence_char: u8,
422    n_fence_char: usize,
423) -> Option<usize> {
424    if bytes.is_empty() {
425        return Some(0);
426    }
427    let mut i = 0;
428    let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
429    if num_fence_chars_found < n_fence_char {
430        return None;
431    }
432    i += num_fence_chars_found;
433    let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
434    i += num_trailing_spaces;
435    scan_eol(&bytes[i..]).map(|_| i)
436}
437
438pub(crate) fn scan_closing_display_math(bytes: &[u8]) -> Option<usize> {
439    if bytes.is_empty() {
440        return Some(0);
441    }
442    let mut i = 0;
443    let num_fence_chars_found = scan_ch_repeat(&bytes[i..], b'$');
444    if num_fence_chars_found != 2 {
445        return None;
446    }
447    i += num_fence_chars_found;
448    let num_trailing_spaces = scan_ch_repeat(&bytes, b' ');
449    i += num_trailing_spaces;
450    scan_eol(&bytes[i..]).map(|_| i)
451}
452
// Measures leading indentation made of spaces and tabs, up to `max` columns.
//
// Returned pair is (byte offset, number of spaces). A tab advances to the
// next multiple of four columns and is not accepted if that would exceed
// `max`. Scanning stops at the first other byte, or as soon as a space
// brings the column count to exactly `max`.
//
// NOTE: the byte offset is the index of the last byte accepted before
// scanning stopped (0 when none was accepted), not a count of bytes; the
// space that reaches `max` is counted in the columns but not reflected in
// the offset.
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut columns = 0;
    let mut last_ix = 0;

    for (ix, &byte) in text.iter().enumerate() {
        if byte == b' ' {
            columns += 1;
            if columns == max {
                break;
            }
        } else if byte == b'\t' {
            let next_tab_stop = (columns / 4 + 1) * 4;
            if next_tab_stop > max {
                break;
            }
            columns = next_tab_stop;
        } else {
            break;
        }
        last_ix = ix;
    }

    (last_ix, columns)
}
480
481/// Scan hrule opening sequence.
482///
483/// Returns Ok(x) when it finds an hrule, where x is the
484/// size of line containing the hrule, including the trailing newline.
485///
486/// Returns Err(x) when it does not find an hrule and x is
487/// the offset in data before no hrule can appear.
488pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
489    if bytes.len() < 3 {
490        return Err(0);
491    }
492    let c = bytes[0];
493    if !(c == b'*' || c == b'-' || c == b'_') {
494        return Err(0);
495    }
496    let mut n = 0;
497    let mut i = 0;
498
499    while i < bytes.len() {
500        match bytes[i] {
501            b'\n' | b'\r' => {
502                i += scan_eol(&bytes[i..]).unwrap_or(0);
503                break;
504            }
505            c2 if c2 == c => {
506                n += 1;
507            }
508            b' ' | b'\t' => (),
509            _ => return Err(i),
510        }
511        i += 1;
512    }
513    if n >= 3 {
514        Ok(i)
515    } else {
516        Err(i)
517    }
518}
519
520pub(crate) fn scan_frontmatter_delimiter(bytes: &[u8]) -> Option<usize> {
521    if bytes.len() < 3 {
522        return None;
523    }
524    let c = bytes[0];
525    if !(c == b'-' || c == b'+') {
526        return None;
527    }
528    let mut n = 0;
529    let mut i = 0;
530
531    while i < bytes.len() {
532        match bytes[i] {
533            b'\n' | b'\r' => {
534                i += scan_eol(&bytes[i..]).unwrap_or(0);
535                break;
536            }
537            c2 if c2 == c => {
538                n += 1;
539            }
540            b' ' | b'\t' => (),
541            _ => return None,
542        }
543        i += 1;
544    }
545    if n >= 3 {
546        Some(i)
547    } else {
548        None
549    }
550}
551
552/// Scan an ATX heading opening sequence.
553///
554/// Returns number of bytes in prefix and level.
555pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<usize> {
556    let level = scan_ch_repeat(data, b'#');
557    if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
558        Some(level)
559    } else {
560        None
561    }
562}
563
564/// Scan a setext heading underline.
565///
566/// Returns number of bytes in line (including trailing newline) and level.
567pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, u32)> {
568    let c = *data.get(0)?;
569    if !(c == b'-' || c == b'=') {
570        return None;
571    }
572    let mut i = 1 + scan_ch_repeat(&data[1..], c);
573    i += scan_blank_line(&data[i..])?;
574    let level = if c == b'=' { 1 } else { 2 };
575    Some((i, level))
576}
577
578// returns number of bytes in line (including trailing
579// newline) and column alignments
580pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
581    let (mut i, spaces) = calc_indent(data, 4);
582    if spaces > 3 || i == data.len() {
583        return (0, vec![]);
584    }
585    let mut cols = vec![];
586    let mut active_col = Alignment::None;
587    let mut start_col = true;
588    if data[i] == b'|' {
589        i += 1;
590    }
591    for c in &data[i..] {
592        if let Some(n) = scan_eol(&data[i..]) {
593            i += n;
594            break;
595        }
596        match *c {
597            b' ' => (),
598            b':' => {
599                active_col = match (start_col, active_col) {
600                    (true, Alignment::None) => Alignment::Left,
601                    (false, Alignment::Left) => Alignment::Center,
602                    (false, Alignment::None) => Alignment::Right,
603                    _ => active_col,
604                };
605                start_col = false;
606            }
607            b'-' => {
608                start_col = false;
609            }
610            b'|' => {
611                start_col = true;
612                cols.push(active_col);
613                active_col = Alignment::None;
614            }
615            _ => {
616                cols = vec![];
617                start_col = true;
618                break;
619            }
620        }
621        i += 1;
622    }
623
624    if !start_col {
625        cols.push(active_col);
626    }
627
628    (i, cols)
629}
630
631/// Scan code fence.
632///
633/// Returns number of bytes scanned and the char that is repeated to make the code fence.
634pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
635    let c = *data.get(0)?;
636    if !(c == b'`' || c == b'~') {
637        return None;
638    }
639    let i = 1 + scan_ch_repeat(&data[1..], c);
640    if i >= 3 {
641        if c == b'`' {
642            let suffix = &data[i..];
643            let next_line = i + scan_nextline(suffix);
644            // FIXME: make sure this is correct
645            if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
646                return None;
647            }
648        }
649        Some((i, c))
650    } else {
651        None
652    }
653}
654
655/// Scan display math.
656pub(crate) fn scan_display_math(data: &[u8]) -> bool {
657    if data.len() < 2 || *data.get(0).unwrap() != b'$' {
658        return false;
659    }
660    let i = 1 + scan_ch_repeat(&data[1..], b'$');
661    if i == 2 {
662        let suffix = &data[i..];
663        let next_line = i + scan_nextline(suffix);
664        if suffix[..(next_line - i)].iter().any(|&b| b == b'$') {
665            return false;
666        }
667        true
668    } else {
669        false
670    }
671}
672
/// Returns `Some(2)` when `data` begins with `>` followed by a space, and
/// `None` otherwise.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    let has_marker = data.len() >= 2 && data[0] == b'>' && data[1] == b' ';
    if has_marker {
        Some(2)
    } else {
        None
    }
}
680
681/// This already assumes the list item has been scanned.
682pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
683    let mut ix = 0;
684    for _ in 0..2 {
685        if let Some(bytes) = scan_blank_line(&data[ix..]) {
686            ix += bytes;
687        } else {
688            return false;
689        }
690    }
691    true
692}
693
694// return number of bytes scanned, delimiter, start index, and indent
695pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
696    let mut c = *bytes.get(0)?;
697    let (w, start) = match c {
698        b'-' | b'+' | b'*' => (1, 0),
699        b'0'..=b'9' => {
700            let (length, start) = parse_decimal(bytes);
701            c = *bytes.get(length)?;
702            if !(c == b'.' || c == b')') {
703                return None;
704            }
705            (length + 1, start)
706        }
707        _ => {
708            return None;
709        }
710    };
711    // TODO: replace calc_indent with scan_leading_whitespace, for tab correctness
712    let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
713    if postindent == 0 {
714        scan_eol(&bytes[w..])?;
715        postindent += 1;
716    } else if postindent > 4 {
717        postn = 1;
718        postindent = 1;
719    }
720    if scan_blank_line(&bytes[w..]).is_some() {
721        postn = 0;
722        postindent = 1;
723    }
724    Some((w + postn, c, start, w + postindent))
725}
726
727// returns (number of bytes, parsed decimal)
728fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
729    match bytes
730        .iter()
731        .take_while(|&&b| is_digit(b))
732        .try_fold((0, 0usize), |(count, acc), c| {
733            let digit = usize::from(c - b'0');
734            match acc
735                .checked_mul(10)
736                .and_then(|ten_acc| ten_acc.checked_add(digit))
737            {
738                Some(number) => Ok((count + 1, number)),
739                // stop early on overflow
740                None => Err((count, acc)),
741            }
742        }) {
743        Ok(p) | Err(p) => p,
744    }
745}
746
// Parses the run of hexadecimal digits (either case) at the start of `bytes`.
//
// Returns (number of bytes, parsed hex). Parsing stops early, before the
// offending digit, if the value would overflow `usize`.
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
    let mut consumed = 0;
    let mut value = 0usize;
    for &byte in bytes {
        let digit = match byte {
            b'0'..=b'9' => byte - b'0',
            b'a'..=b'f' => byte - b'a' + 10,
            b'A'..=b'F' => byte - b'A' + 10,
            _ => break,
        };
        match value
            .checked_mul(16)
            .and_then(|v| v.checked_add(usize::from(digit)))
        {
            Some(next) => {
                value = next;
                consumed += 1;
            }
            // stop early on overflow
            None => break,
        }
    }
    (consumed, value)
}
774
/// Converts a numeric code point into a `char`.
///
/// Zero is mapped to U+FFFD. Returns `None` when `input` does not fit in a
/// `u32` or is rejected by `char::from_u32`.
fn char_from_codepoint(input: usize) -> Option<char> {
    let raw: u32 = input.try_into().ok()?;
    let codepoint = if raw == 0 { 0xFFFD } else { raw };
    char::from_u32(codepoint)
}
782
783// doesn't bother to check data[0] == '&'
784pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
785    let mut end = 1;
786    if scan_ch(&bytes[end..], b'#') == 1 {
787        end += 1;
788        let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
789            end += 1;
790            parse_hex(&bytes[end..])
791        } else {
792            parse_decimal(&bytes[end..])
793        };
794        end += bytecount;
795        return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
796            (0, None)
797        } else if let Some(c) = char_from_codepoint(codepoint) {
798            (end + 1, Some(c.into()))
799        } else {
800            (0, None)
801        };
802    }
803    end += scan_while(&bytes[end..], is_ascii_alphanumeric);
804    if scan_ch(&bytes[end..], b';') == 1 {
805        if let Some(value) = entities::get_entity(&bytes[1..end]) {
806            return (end + 1, Some(value.into()));
807        }
808    }
809    (0, None)
810}
811
812// FIXME: we can most likely re-use other scanners
813// returns (bytelength, title_str)
814pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
815    let mut chars = text.chars().peekable();
816    let closing_delim = match chars.next()? {
817        '\'' => '\'',
818        '"' => '"',
819        '(' => ')',
820        _ => return None,
821    };
822    let mut bytecount = 1;
823
824    while let Some(c) = chars.next() {
825        match c {
826            '\n' => {
827                bytecount += 1;
828                let mut next = *chars.peek()?;
829                while is_ascii_whitespace_no_nl(next as u8) {
830                    bytecount += chars.next()?.len_utf8();
831                    next = *chars.peek()?;
832                }
833                if *chars.peek()? == '\n' {
834                    // blank line - not allowed
835                    return None;
836                }
837            }
838            '\\' => {
839                let next_char = chars.next()?;
840                bytecount += 1 + next_char.len_utf8();
841            }
842            c if c == closing_delim => {
843                return Some((bytecount + 1, &text[1..bytecount]));
844            }
845            c => {
846                bytecount += c.len_utf8();
847            }
848        }
849    }
850    None
851}
852
853// note: dest returned is raw, still needs to be unescaped
854// TODO: check that nested parens are really not allowed for refdefs
855// TODO(performance): this func should probably its own unescaping
856pub(crate) fn scan_link_dest(
857    data: &str,
858    start_ix: usize,
859    max_next: usize,
860) -> Option<(usize, &str)> {
861    let bytes = &data.as_bytes()[start_ix..];
862    let mut i = scan_ch(bytes, b'<');
863
864    if i != 0 {
865        // pointy links
866        while i < bytes.len() {
867            match bytes[i] {
868                b'\n' | b'\r' | b'<' => return None,
869                b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
870                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
871                    i += 1;
872                }
873                _ => {}
874            }
875            i += 1;
876        }
877        None
878    } else {
879        // non-pointy links
880        let mut nest = 0;
881        while i < bytes.len() {
882            match bytes[i] {
883                0x0..=0x20 => {
884                    break;
885                }
886                b'(' => {
887                    if nest > max_next {
888                        return None;
889                    }
890                    nest += 1;
891                }
892                b')' => {
893                    if nest == 0 {
894                        break;
895                    }
896                    nest -= 1;
897                }
898                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
899                    i += 1;
900                }
901                _ => {}
902            }
903            i += 1;
904        }
905        Some((i, &data[start_ix..(start_ix + i)]))
906    }
907}
908
909/// Returns bytes scanned
910fn scan_attribute_name(data: &[u8]) -> Option<usize> {
911    let (&c, tail) = data.split_first()?;
912    if is_ascii_alpha(c) || c == b'_' || c == b':' {
913        Some(
914            1 + scan_while(tail, |c| {
915                is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
916            }),
917        )
918    } else {
919        None
920    }
921}
922
923/// Returns byte scanned (TODO: should it return new offset?)
924// TODO: properly use the newline handler here
925fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>) -> Option<usize> {
926    let allow_newline = newline_handler.is_some();
927    let whitespace_scanner =
928        |c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
929    let mut ix = scan_attribute_name(data)?;
930    let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
931    ix += n_whitespace;
932    if scan_ch(&data[ix..], b'=') == 1 {
933        ix += 1;
934        ix += scan_while(&data[ix..], whitespace_scanner);
935        ix += scan_attribute_value(&data[ix..], newline_handler)?;
936    } else if n_whitespace > 0 {
937        // Leave whitespace for next attribute.
938        ix -= 1;
939    }
940    Some(ix)
941}
942
943fn scan_attribute_value(
944    data: &[u8],
945    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
946) -> Option<usize> {
947    let mut i = 0;
948    match *data.get(0)? {
949        b @ b'"' | b @ b'\'' => {
950            i += 1;
951            while i < data.len() {
952                if data[i] == b {
953                    return Some(i + 1);
954                }
955                if let Some(eol_bytes) = scan_eol(&data[i..]) {
956                    let handler = newline_handler?;
957                    i += eol_bytes;
958                    i += handler(&data[i..]);
959                } else {
960                    i += 1;
961                }
962            }
963            return None;
964        }
965        b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
966            return None;
967        }
968        _ => {
969            // unquoted attribute value
970            i += scan_attr_value_chars(&data[i..]);
971        }
972    }
973    Some(i)
974}
975
976// Remove backslash escapes and resolve entities
977pub(crate) fn unescape(input: &str) -> CowStr<'_> {
978    let mut result = String::new();
979    let mut mark = 0;
980    let mut i = 0;
981    let bytes = input.as_bytes();
982    while i < bytes.len() {
983        match bytes[i] {
984            b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
985                result.push_str(&input[mark..i]);
986                mark = i + 1;
987                i += 2;
988            }
989            b'&' => match scan_entity(&bytes[i..]) {
990                (n, Some(value)) => {
991                    result.push_str(&input[mark..i]);
992                    result.push_str(&value);
993                    i += n;
994                    mark = i;
995                }
996                _ => i += 1,
997            },
998            b'\r' => {
999                result.push_str(&input[mark..i]);
1000                i += 1;
1001                mark = i;
1002            }
1003            _ => i += 1,
1004        }
1005    }
1006    if mark == 0 {
1007        input.into()
1008    } else {
1009        result.push_str(&input[mark..]);
1010        result.into()
1011    }
1012}
1013
1014/// Assumes `data` is preceded by `<`.
1015pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) {
1016    let i = scan_ch(data, b'/');
1017    let n = scan_while(&data[i..], is_ascii_alphanumeric);
1018    // TODO: scan attributes and >
1019    (i + n, &data[i..i + n])
1020}
1021
1022pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
1023    HTML_TAGS
1024        .binary_search_by(|probe| {
1025            let probe_bytes_iter = probe.as_bytes().iter();
1026            let tag_bytes_iter = tag.iter();
1027
1028            probe_bytes_iter
1029                .zip(tag_bytes_iter)
1030                .find_map(|(&a, &b)| {
1031                    // We can compare case insensitively because the probes are
1032                    // all lower case alpha strings.
1033                    match a.cmp(&(b | 0x20)) {
1034                        std::cmp::Ordering::Equal => None,
1035                        inequality => Some(inequality),
1036                    }
1037                })
1038                .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1039        })
1040        .is_ok()
1041}
1042
1043/// Assumes that `data` is preceded by `<`.
1044pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1045    // Block type html does not allow for newlines, so we
1046    // do not pass a newline handler.
1047    let i = scan_html_block_inner(data, None)?;
1048    scan_blank_line(&data[i..])?;
1049    Some(i)
1050}
1051
1052// FIXME: instead of a newline handler, maybe this should receive
1053// a whitespace handler instead.
1054// With signature `&dyn Fn(&[u8]) -> Option<usize>`.
1055// We currently need to implement whitespace handling in all of
1056// this function's dependencies as well.
1057pub(crate) fn scan_html_block_inner(
1058    data: &[u8],
1059    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1060) -> Option<usize> {
1061    let close_tag_bytes = scan_ch(data, b'/');
1062    let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha);
1063    if l == 0 {
1064        return None;
1065    }
1066    let mut i = close_tag_bytes + l;
1067    i += scan_while(&data[i..], is_ascii_letterdigitdash);
1068
1069    if close_tag_bytes == 0 {
1070        loop {
1071            let old_i = i;
1072            loop {
1073                i += scan_whitespace_no_nl(&data[i..]);
1074                if let Some(eol_bytes) = scan_eol(&data[i..]) {
1075                    if eol_bytes == 0 {
1076                        return None;
1077                    }
1078                    if let Some(handler) = newline_handler {
1079                        i += eol_bytes;
1080                        i += handler(&data[i..]);
1081                    } else {
1082                        return None;
1083                    }
1084                } else {
1085                    break;
1086                }
1087            }
1088            if let Some(b'/') | Some(b'>') = data.get(i) {
1089                break;
1090            }
1091            if old_i == i {
1092                // No whitespace, which is mandatory.
1093                return None;
1094            }
1095            i += scan_attribute(&data[i..], newline_handler)?;
1096        }
1097    }
1098
1099    i += scan_whitespace_no_nl(&data[i..]);
1100
1101    if close_tag_bytes == 0 {
1102        i += scan_ch(&data[i..], b'/');
1103    }
1104
1105    if scan_ch(&data[i..], b'>') == 0 {
1106        None
1107    } else {
1108        Some(i + 1)
1109    }
1110}
1111
1112/// Returns (next_byte_offset, uri, type)
1113pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1114    scan_uri(text, start_ix)
1115        .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1116        .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1117}
1118
1119/// Returns (next_byte_offset, uri)
1120fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1121    let bytes = &text.as_bytes()[start_ix..];
1122
1123    // scheme's first byte must be an ascii letter
1124    if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1125        return None;
1126    }
1127
1128    let mut i = 1;
1129
1130    while i < bytes.len() {
1131        let c = bytes[i];
1132        i += 1;
1133        match c {
1134            c if is_ascii_alphanumeric(c) => (),
1135            b'.' | b'-' | b'+' => (),
1136            b':' => break,
1137            _ => return None,
1138        }
1139    }
1140
1141    // scheme length must be between 2 and 32 characters long. scheme
1142    // must be followed by colon
1143    if i < 3 || i > 33 {
1144        return None;
1145    }
1146
1147    while i < bytes.len() {
1148        match bytes[i] {
1149            b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1150            b'\0'..=b' ' | b'<' => return None,
1151            _ => (),
1152        }
1153        i += 1;
1154    }
1155
1156    None
1157}
1158
1159/// Returns (next_byte_offset, email)
1160fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1161    // using a regex library would be convenient, but doing it by hand is not too bad
1162    let bytes = &text.as_bytes()[start_ix..];
1163    let mut i = 0;
1164
1165    while i < bytes.len() {
1166        let c = bytes[i];
1167        i += 1;
1168        match c {
1169            c if is_ascii_alphanumeric(c) => (),
1170            b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1171            | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1172            b'@' => break,
1173            _ => return None,
1174        }
1175    }
1176
1177    loop {
1178        let label_start_ix = i;
1179        let mut fresh_label = true;
1180
1181        while i < bytes.len() {
1182            match bytes[i] {
1183                c if is_ascii_alphanumeric(c) => (),
1184                b'-' if fresh_label => {
1185                    return None;
1186                }
1187                b'-' => (),
1188                _ => break,
1189            }
1190            fresh_label = false;
1191            i += 1;
1192        }
1193
1194        if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1195            return None;
1196        }
1197
1198        if scan_ch(&bytes[i..], b'.') == 0 {
1199            break;
1200        }
1201        i += 1;
1202    }
1203
1204    if scan_ch(&bytes[i..], b'>') == 0 {
1205        return None;
1206    }
1207
1208    Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1209}
1210
1211/// Scan comment, declaration, or CDATA section, with initial "<!" already consumed.
1212/// Returns byte offset on match.
1213pub(crate) fn scan_inline_html_comment(
1214    bytes: &[u8],
1215    mut ix: usize,
1216    scan_guard: &mut HtmlScanGuard,
1217) -> Option<usize> {
1218    let c = *bytes.get(ix)?;
1219    ix += 1;
1220    match c {
1221        b'-' => {
1222            let dashes = scan_ch_repeat(&bytes[ix..], b'-');
1223            if dashes < 1 {
1224                return None;
1225            }
1226            // Saw "<!--", scan comment.
1227            ix += dashes;
1228            if scan_ch(&bytes[ix..], b'>') == 1 {
1229                return None;
1230            }
1231
1232            while let Some(x) = memchr(b'-', &bytes[ix..]) {
1233                ix += x + 1;
1234                if scan_ch(&bytes[ix..], b'-') == 1 {
1235                    ix += 1;
1236                    return if scan_ch(&bytes[ix..], b'>') == 1 {
1237                        Some(ix + 1)
1238                    } else {
1239                        None
1240                    };
1241                }
1242            }
1243            None
1244        }
1245        b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1246            ix += b"CDATA[".len();
1247            ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1248            let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1249            ix += close_brackets;
1250
1251            if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
1252                scan_guard.cdata = ix;
1253                None
1254            } else {
1255                Some(ix + 1)
1256            }
1257        }
1258        b'A'..=b'Z' if ix > scan_guard.declaration => {
1259            // Scan declaration.
1260            ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
1261            let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
1262            if whitespace == 0 {
1263                return None;
1264            }
1265            ix += whitespace;
1266            ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1267            if scan_ch(&bytes[ix..], b'>') == 0 {
1268                scan_guard.declaration = ix;
1269                None
1270            } else {
1271                Some(ix + 1)
1272            }
1273        }
1274        _ => None,
1275    }
1276}
1277
1278/// Scan processing directive, with initial "<?" already consumed.
1279/// Returns the next byte offset on success.
1280pub(crate) fn scan_inline_html_processing(
1281    bytes: &[u8],
1282    mut ix: usize,
1283    scan_guard: &mut HtmlScanGuard,
1284) -> Option<usize> {
1285    if ix <= scan_guard.processing {
1286        return None;
1287    }
1288    while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1289        ix += offset + 1;
1290        if scan_ch(&bytes[ix..], b'>') == 1 {
1291            return Some(ix + 1);
1292        }
1293    }
1294    scan_guard.processing = ix;
1295    None
1296}
1297
#[cfg(test)]
mod test {
    use super::*;

    /// A run of digits far too long to be a list number must be rejected
    /// rather than parsed.
    #[test]
    fn overflow_list() {
        let scanned =
            scan_listitem(b"4444444444444444444444444444444444444444444444444444444444!");
        assert!(scanned.is_none());
    }

    /// The first 19 digits equal `u64::MAX / 10`, so appending the next digit
    /// (`6`) yields 2^64: the multiplication by ten still fits in a `u64` and
    /// only the addition overflows.
    // NOTE(review): presumably `scan_listitem` accumulates the number in a
    // 64-bit unsigned integer; confirm against its implementation.
    #[test]
    fn overflow_by_addition() {
        let scanned = scan_listitem(b"1844674407370955161615!");
        assert!(scanned.is_none());
    }
}
mercurius/parse/scanners.rs

mercurius/parse/
scanners.rs