1use std::char;
24
25use crate::parse::HtmlScanGuard;
26pub(crate) use crate::puncttable::{is_ascii_punctuation, is_punctuation};
27use crate::strings::CowStr;
28use crate::{entities, BlockQuoteKind, HeadingLevel};
29use crate::{Alignment, LinkType};
30
31use memchr::memchr;
32
/// Lowercase HTML tag names that `is_html_tag` recognises.
///
/// `is_html_tag` looks names up with a binary search, so the entries must stay
/// in ascending byte order; insert new names at their sorted position and
/// update the array length.
const HTML_TAGS: [&str; 62] = [
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "search",
    "section",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
98
/// Cursor over the bytes at the start of a line.
///
/// Besides the byte position it remembers how many columns of a tab are still
/// unconsumed, so that whitespace can be scanned column by column. The type is
/// `Clone` so that scanning methods can snapshot and restore the cursor.
#[derive(Clone)]
pub(crate) struct LineStart<'a> {
    /// The bytes being scanned.
    bytes: &'a [u8],
    /// Index of the next unread byte in `bytes`.
    ix: usize,

    /// Index just past the most recently consumed tab (0 initially);
    /// `scan_space_inner` measures the width of the next tab relative to it.
    tab_start: usize,

    /// Columns left over from a tab that the last space scan consumed only
    /// partially; later space scans use these up first.
    spaces_remaining: usize,

    /// While `ix` is below this value, `scan_list_marker_with_indent` skips
    /// its `scan_hrule` check. It is set from the `Err` value of `scan_hrule`.
    min_hrule_offset: usize,
}
131
132impl<'a> LineStart<'a> {
133 pub(crate) fn new(bytes: &[u8]) -> LineStart<'_> {
134 LineStart {
135 bytes,
136 tab_start: 0,
137 ix: 0,
138 spaces_remaining: 0,
139 min_hrule_offset: 0,
140 }
141 }
142
143 pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
149 self.scan_space_inner(n_space) == 0
150 }
151
152 pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
156 n_space - self.scan_space_inner(n_space)
157 }
158
159 fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
161 let n_from_remaining = self.spaces_remaining.min(n_space);
164 self.spaces_remaining -= n_from_remaining;
165 n_space -= n_from_remaining;
166
167 while n_space > 0 && self.ix < self.bytes.len() {
168 match self.bytes[self.ix] {
169 b' ' => {
170 self.ix += 1;
171 n_space -= 1;
172 }
173 b'\t' => {
174 let spaces = 4 - (self.ix - self.tab_start) % 4;
175 self.ix += 1;
176 self.tab_start = self.ix;
177 let n = spaces.min(n_space);
178 n_space -= n;
179
180 self.spaces_remaining = spaces - n;
182 }
183 _ => break,
184 }
185 }
186 n_space
187 }
188
189 pub(crate) fn scan_all_space(&mut self) {
191 self.spaces_remaining = 0;
192 self.ix += self.bytes[self.ix..]
193 .iter()
194 .take_while(|&&b| b == b' ' || b == b'\t')
195 .count();
196 }
197
198 pub(crate) fn is_at_eol(&self) -> bool {
200 self.bytes
201 .get(self.ix)
202 .map(|&c| c == b'\r' || c == b'\n')
203 .unwrap_or(true)
204 }
205
206 fn scan_ch(&mut self, c: u8) -> bool {
207 if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
208 self.ix += 1;
209 true
210 } else {
211 false
212 }
213 }
214
215 fn scan_case_insensitive(&mut self, tag: &[u8]) -> bool {
216 if self.bytes.len() - self.ix < tag.len() {
217 return false;
218 }
219 let prefix = &self.bytes[self.ix..self.ix + tag.len()];
220 let ok = prefix.eq_ignore_ascii_case(tag);
221 if ok {
222 self.ix += tag.len();
223 }
224 ok
225 }
226
227 pub(crate) fn scan_blockquote_tag(&mut self) -> Option<BlockQuoteKind> {
228 let saved_ix = self.ix;
229 let tag = if self.scan_ch(b'[') && self.scan_ch(b'!') {
230 let tag = if self.scan_case_insensitive(b"note") {
231 Some(BlockQuoteKind::Note)
232 } else if self.scan_case_insensitive(b"tip") {
233 Some(BlockQuoteKind::Tip)
234 } else if self.scan_case_insensitive(b"important") {
235 Some(BlockQuoteKind::Important)
236 } else if self.scan_case_insensitive(b"warning") {
237 Some(BlockQuoteKind::Warning)
238 } else if self.scan_case_insensitive(b"caution") {
239 Some(BlockQuoteKind::Caution)
240 } else {
241 None
242 };
243 if tag.is_some() && self.scan_ch(b']') {
244 if let Some(nl) = scan_blank_line(&self.bytes[self.ix..]) {
245 self.ix += nl;
246 tag
247 } else {
248 None
249 }
250 } else {
251 None
252 }
253 } else {
254 None
255 };
256 if tag.is_none() {
257 self.ix = saved_ix;
258 }
259 tag
260 }
261
262 pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
263 if self.scan_ch(b'>') {
264 let _ = self.scan_space(1);
265 true
266 } else {
267 false
268 }
269 }
270
271 pub(crate) fn scan_definition_list_definition_marker_with_indent(
283 &mut self,
284 indent: usize,
285 ) -> Option<usize> {
286 let save = self.clone();
287 if self.scan_ch(b':') {
288 let save = self.clone();
289 if self.scan_space(5) {
290 *self = save;
291 Some(indent + 1 + self.scan_space_upto(1))
292 } else {
293 *self = save;
294 Some(indent + 1 + self.scan_space_upto(5))
295 }
296 } else {
297 *self = save;
298 None
299 }
300 }
301
302 pub(crate) fn scan_list_marker_with_indent(
308 &mut self,
309 indent: usize,
310 ) -> Option<(u8, u64, usize)> {
311 let save = self.clone();
312 if self.ix < self.bytes.len() {
313 let c = self.bytes[self.ix];
314 if c == b'-' || c == b'+' || c == b'*' {
315 if self.ix >= self.min_hrule_offset {
316 if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
318 self.min_hrule_offset = min_offset;
319 } else {
320 *self = save;
321 return None;
322 }
323 }
324 self.ix += 1;
325 if self.scan_space(1) || self.is_at_eol() {
326 return self.finish_list_marker(c, 0, indent + 2);
327 }
328 } else if c.is_ascii_digit() {
329 let start_ix = self.ix;
330 let mut ix = self.ix + 1;
331 let mut val = u64::from(c - b'0');
332 while ix < self.bytes.len() && ix - start_ix < 10 {
333 let c = self.bytes[ix];
334 ix += 1;
335 if c.is_ascii_digit() {
336 val = val * 10 + u64::from(c - b'0');
337 } else if c == b')' || c == b'.' {
338 self.ix = ix;
339 if self.scan_space(1) || self.is_at_eol() {
340 return self.finish_list_marker(c, val, indent + 1 + ix - start_ix);
341 } else {
342 break;
343 }
344 } else {
345 break;
346 }
347 }
348 }
349 }
350 *self = save;
351 None
352 }
353
354 fn finish_list_marker(
355 &mut self,
356 c: u8,
357 start: u64,
358 mut indent: usize,
359 ) -> Option<(u8, u64, usize)> {
360 let save = self.clone();
361
362 if scan_blank_line(&self.bytes[self.ix..]).is_some() {
364 return Some((c, start, indent));
365 }
366
367 let post_indent = self.scan_space_upto(4);
368 if post_indent < 4 {
369 indent += post_indent;
370 } else {
371 *self = save;
372 }
373 Some((c, start, indent))
374 }
375
376 pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
379 let save = self.clone();
380 self.scan_space_upto(3);
381
382 if !self.scan_ch(b'[') {
383 *self = save;
384 return None;
385 }
386 let is_checked = match self.bytes.get(self.ix) {
387 Some(&c) if is_ascii_whitespace_no_nl(c) => {
388 self.ix += 1;
389 false
390 }
391 Some(b'x') | Some(b'X') => {
392 self.ix += 1;
393 true
394 }
395 _ => {
396 *self = save;
397 return None;
398 }
399 };
400 if !self.scan_ch(b']') {
401 *self = save;
402 return None;
403 }
404 if !self
405 .bytes
406 .get(self.ix)
407 .map(|&b| is_ascii_whitespace(b))
408 .unwrap_or(false)
409 {
410 *self = save;
411 return None;
412 }
413 Some(is_checked)
414 }
415
416 pub(crate) fn bytes_scanned(&self) -> usize {
417 self.ix
418 }
419
420 pub(crate) fn remaining_space(&self) -> usize {
421 self.spaces_remaining
422 }
423}
424
/// Returns `true` for the bytes `0x09..=0x0d` (tab, line feed, vertical tab,
/// form feed, carriage return) and for the space character.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    matches!(c, 0x09..=0x0d | b' ')
}
428
/// Returns `true` for whitespace that does not terminate a line: tab,
/// vertical tab (`0x0b`), form feed (`0x0c`) and space.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    matches!(c, b'\t' | 0x0b | 0x0c | b' ')
}
432
/// Returns `true` for the ASCII letters `a..=z` and `A..=Z`.
fn is_ascii_alpha(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z')
}
436
/// Returns `true` for ASCII digits and ASCII letters of either case.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
440
441fn is_ascii_letterdigitdash(c: u8) -> bool {
442 c == b'-' || is_ascii_alphanumeric(c)
443}
444
/// Returns `true` for the ASCII digits `0..=9`.
fn is_digit(c: u8) -> bool {
    matches!(c, b'0'..=b'9')
}
448
/// Returns `true` when `c` may appear in an unquoted HTML attribute value,
/// i.e. when it is none of `'`, `"`, space, `=`, `>`, `<`, `` ` ``, `\n`, `\r`.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    // Bytes that end (or are not allowed in) an unquoted attribute value.
    const FORBIDDEN: &[u8] = b"'\" =><`\n\r";
    !FORBIDDEN.contains(&c)
}
455
/// Returns 1 when `data` starts with the byte `c`, otherwise 0.
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    usize::from(data.first() == Some(&c))
}
464
/// Returns the length of the longest prefix of `data` whose bytes all satisfy
/// `f`. `f` is called on bytes in order and stops at the first rejection.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    match data.iter().position(|&c| !f(c)) {
        Some(first_rejected) => first_rejected,
        None => data.len(),
    }
}
471
/// Returns the length of the longest suffix of `data` whose bytes all satisfy
/// `f`. `f` is called from the last byte backwards and stops at the first
/// rejection.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    match data.iter().rposition(|&c| !f(c)) {
        // Everything after the last rejected byte belongs to the suffix.
        Some(last_rejected) => data.len() - 1 - last_rejected,
        None => data.len(),
    }
}
478
479pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
480 scan_while(data, |x| x == c)
481}
482
483pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
486 scan_while(data, is_ascii_whitespace_no_nl)
487}
488
489fn scan_attr_value_chars(data: &[u8]) -> usize {
490 scan_while(data, is_valid_unquoted_attr_value_char)
491}
492
/// Recognises a line ending at the start of `bytes`.
///
/// Returns the length of the line ending (`\n` and lone `\r` are 1, `\r\n`
/// is 2), `Some(0)` when `bytes` is empty (end of input counts as a line
/// ending), and `None` when `bytes` starts with any other byte.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes.first() {
        None => Some(0),
        Some(b'\n') => Some(1),
        Some(b'\r') => {
            let followed_by_lf = bytes.get(1) == Some(&b'\n');
            Some(if followed_by_lf { 2 } else { 1 })
        }
        Some(_) => None,
    }
}
502
503pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
504 let i = scan_whitespace_no_nl(bytes);
505 scan_eol(&bytes[i..]).map(|n| i + n)
506}
507
508pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
509 memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
510}
511
512pub(crate) fn scan_closing_code_fence(
515 bytes: &[u8],
516 fence_char: u8,
517 n_fence_char: usize,
518) -> Option<usize> {
519 if bytes.is_empty() {
520 return Some(0);
521 }
522 let mut i = 0;
523 let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
524 if num_fence_chars_found < n_fence_char {
525 return None;
526 }
527 i += num_fence_chars_found;
528 let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
529 i += num_trailing_spaces;
530 scan_eol(&bytes[i..]).map(|_| i)
531}
532
533pub(crate) fn scan_closing_metadata_block(bytes: &[u8], fence_char: u8) -> Option<usize> {
536 let mut i = 0;
537 let mut num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
538 if num_fence_chars_found != 3 {
539 if fence_char == b'-' {
541 num_fence_chars_found = scan_ch_repeat(&bytes[i..], b'.');
542 if num_fence_chars_found != 3 {
543 return None;
544 }
545 } else {
546 return None;
547 }
548 }
549 i += num_fence_chars_found;
550 let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
551 i += num_trailing_spaces;
552 scan_eol(&bytes[i..]).map(|_| i)
553}
554
/// Measures the indentation at the start of `text`.
///
/// Spaces count one column; a tab advances to the next multiple of four
/// columns. Scanning stops at the first other byte, at the space that brings
/// the count to exactly `max`, or at a tab that would push the count past
/// `max` (that tab is not counted).
///
/// Returns `(offset, spaces)`: `offset` is the index of the last byte the scan
/// looked at (0 for empty input) and `spaces` is the column count.
pub(crate) fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut columns = 0;
    let mut last_seen = 0;

    for (pos, byte) in text.iter().copied().enumerate() {
        last_seen = pos;
        if byte == b' ' {
            columns += 1;
            if columns == max {
                break;
            }
        } else if byte == b'\t' {
            let tab_stop = columns + 4 - columns % 4;
            if tab_stop > max {
                break;
            }
            columns = tab_stop;
        } else {
            break;
        }
    }

    (last_seen, columns)
}
582
583pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
591 if bytes.len() < 3 {
592 return Err(0);
593 }
594 let c = bytes[0];
595 if !(c == b'*' || c == b'-' || c == b'_') {
596 return Err(0);
597 }
598 let mut n = 0;
599 let mut i = 0;
600
601 while i < bytes.len() {
602 match bytes[i] {
603 b'\n' | b'\r' => {
604 i += scan_eol(&bytes[i..]).unwrap_or(0);
605 break;
606 }
607 c2 if c2 == c => {
608 n += 1;
609 }
610 b' ' | b'\t' => (),
611 _ => return Err(i),
612 }
613 i += 1;
614 }
615 if n >= 3 {
616 Ok(i)
617 } else {
618 Err(i)
619 }
620}
621
622pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<HeadingLevel> {
626 let level = scan_ch_repeat(data, b'#');
627 if data.get(level).copied().map_or(true, is_ascii_whitespace) {
628 HeadingLevel::try_from(level).ok()
629 } else {
630 None
631 }
632}
633
634pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, HeadingLevel)> {
638 let c = *data.first()?;
639 let level = if c == b'=' {
640 HeadingLevel::H1
641 } else if c == b'-' {
642 HeadingLevel::H2
643 } else {
644 return None;
645 };
646 let mut i = 1 + scan_ch_repeat(&data[1..], c);
647 i += scan_blank_line(&data[i..])?;
648 Some((i, level))
649}
650
651pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
654 let (mut i, spaces) = calc_indent(data, 4);
655 if spaces > 3 || i == data.len() {
656 return (0, vec![]);
657 }
658 let mut cols = vec![];
659 let mut active_col = Alignment::None;
660 let mut start_col = true;
661 let mut found_pipe = false;
662 let mut found_hyphen = false;
663 let mut found_hyphen_in_col = false;
664 if data[i] == b'|' {
665 i += 1;
666 found_pipe = true;
667 }
668 for c in &data[i..] {
669 if let Some(n) = scan_eol(&data[i..]) {
670 i += n;
671 break;
672 }
673 match *c {
674 b' ' => (),
675 b':' => {
676 active_col = match (start_col, active_col) {
677 (true, Alignment::None) => Alignment::Left,
678 (false, Alignment::Left) => Alignment::Center,
679 (false, Alignment::None) => Alignment::Right,
680 _ => active_col,
681 };
682 start_col = false;
683 }
684 b'-' => {
685 start_col = false;
686 found_hyphen = true;
687 found_hyphen_in_col = true;
688 }
689 b'|' => {
690 start_col = true;
691 found_pipe = true;
692 cols.push(active_col);
693 active_col = Alignment::None;
694 if !found_hyphen_in_col {
695 return (0, vec![]);
697 }
698 found_hyphen_in_col = false;
699 }
700 _ => {
701 return (0, vec![]);
703 }
704 }
705 i += 1;
706 }
707
708 if !start_col {
709 cols.push(active_col);
710 }
711 if !found_pipe || !found_hyphen {
712 return (0, vec![]);
715 }
716
717 (i, cols)
718}
719
720pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
724 let c = *data.first()?;
725 if !(c == b'`' || c == b'~') {
726 return None;
727 }
728 let i = 1 + scan_ch_repeat(&data[1..], c);
729 if i >= 3 {
730 if c == b'`' {
731 let suffix = &data[i..];
732 let next_line = i + scan_nextline(suffix);
733 if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
735 return None;
736 }
737 }
738 Some((i, c))
739 } else {
740 None
741 }
742}
743
744pub(crate) fn scan_metadata_block(
753 data: &[u8],
754 yaml_style_enabled: bool,
755 pluses_style_enabled: bool,
756) -> Option<(usize, u8)> {
757 if yaml_style_enabled || pluses_style_enabled {
759 let c = *data.first()?;
760 if !((c == b'-' && yaml_style_enabled) || (c == b'+' && pluses_style_enabled)) {
761 return None;
762 }
763 let i = 1 + scan_ch_repeat(&data[1..], c);
764 let next_line = scan_nextline(&data[i..]);
766 for c in &data[i..i + next_line] {
767 if !c.is_ascii_whitespace() {
768 return None;
769 }
770 }
771 if i == 3 {
772 let mut j = i;
774 let mut first_line = true;
775 while j < data.len() {
776 j += scan_nextline(&data[j..]);
777 let closed = scan_closing_metadata_block(&data[j..], c).is_some();
778 if first_line {
781 if closed || scan_blank_line(&data[j..]).is_some() {
782 return None;
783 }
784 first_line = false;
785 }
786 if closed {
787 return Some((i, c));
788 }
789 }
790 None
791 } else {
792 None
793 }
794 } else {
795 None
796 }
797}
798
/// Scans a blockquote marker: `>` optionally followed by one space.
///
/// Returns the number of bytes in the marker (1 or 2), or `None` when `data`
/// does not start with `>`.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    match data {
        [b'>', b' ', ..] => Some(2),
        [b'>', ..] => Some(1),
        _ => None,
    }
}
811
812pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
814 let mut c = *bytes.first()?;
815 let (w, start) = match c {
816 b'-' | b'+' | b'*' => (1, 0),
817 b'0'..=b'9' => {
818 let (length, start) = parse_decimal(bytes, 9);
819 c = *bytes.get(length)?;
820 if !(c == b'.' || c == b')') {
821 return None;
822 }
823 (length + 1, start)
824 }
825 _ => {
826 return None;
827 }
828 };
829 let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
831 if postindent == 0 {
832 scan_eol(&bytes[w..])?;
833 postindent += 1;
834 } else if postindent > 4 {
835 postn = 1;
836 postindent = 1;
837 }
838 if scan_blank_line(&bytes[w..]).is_some() {
839 postn = 0;
840 postindent = 1;
841 }
842 Some((w + postn, c, start, w + postindent))
843}
844
845fn parse_decimal(bytes: &[u8], limit: usize) -> (usize, usize) {
847 match bytes
848 .iter()
849 .take(limit)
850 .take_while(|&&b| is_digit(b))
851 .try_fold((0, 0usize), |(count, acc), c| {
852 let digit = usize::from(c - b'0');
853 match acc
854 .checked_mul(10)
855 .and_then(|ten_acc| ten_acc.checked_add(digit))
856 {
857 Some(number) => Ok((count + 1, number)),
858 None => Err((count, acc)),
860 }
861 }) {
862 Ok(p) | Err(p) => p,
863 }
864}
865
/// Parses the leading hexadecimal digits (either case) of `bytes`, looking at
/// no more than `limit` bytes.
///
/// Returns `(digits_consumed, value)`. Parsing stops at the first byte that is
/// not a hex digit or before a digit that would overflow `usize`.
fn parse_hex(bytes: &[u8], limit: usize) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &byte in bytes.iter().take(limit) {
        let digit = match byte {
            b'0'..=b'9' => byte - b'0',
            b'a'..=b'f' => byte - b'a' + 10,
            b'A'..=b'F' => byte - b'A' + 10,
            _ => break,
        };
        match value
            .checked_mul(16)
            .and_then(|shifted| shifted.checked_add(usize::from(digit)))
        {
            Some(next) => {
                value = next;
                count += 1;
            }
            None => break,
        }
    }
    (count, value)
}
896
/// Converts a numeric code point into a `char`.
///
/// Returns `None` for 0, for values that do not fit in a `u32`, and for
/// values that are not valid Unicode scalar values.
fn char_from_codepoint(input: usize) -> Option<char> {
    u32::try_from(input)
        .ok()
        .filter(|&codepoint| codepoint != 0)
        .and_then(char::from_u32)
}
904
905pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
907 let mut end = 1;
908 if bytes.get(end) == Some(&b'#') {
909 end += 1;
910 let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
911 end += 1;
912 parse_hex(&bytes[end..], 6)
913 } else {
914 parse_decimal(&bytes[end..], 7)
915 };
916 end += bytecount;
917 return if bytecount == 0 || bytes.get(end) != Some(&b';') {
918 (0, None)
919 } else {
920 (
921 end + 1,
922 Some(char_from_codepoint(codepoint).unwrap_or('\u{FFFD}').into()),
923 )
924 };
925 }
926 end += scan_while(&bytes[end..], is_ascii_alphanumeric);
927 if bytes.get(end) == Some(&b';') {
928 if let Some(value) = entities::get_entity(&bytes[1..end]) {
929 return (end + 1, Some(value.into()));
930 }
931 }
932 (0, None)
933}
934
/// Looks for the first `|` in `data[start_ix..start_ix + len]` (clamped to the
/// end of `data`).
///
/// Returns the index just past the pipe and the text between `start_ix` and
/// the pipe, or `None` when the window contains no pipe.
pub(crate) fn scan_wikilink_pipe(data: &str, start_ix: usize, len: usize) -> Option<(usize, &str)> {
    let end_ix = (start_ix + len).min(data.len());
    // `get` yields `None` for an empty-or-inverted window past the end.
    let window = data.as_bytes().get(start_ix..end_ix)?;
    let pipe_ix = start_ix + window.iter().position(|&byte| byte == b'|')?;
    Some((pipe_ix + 1, &data[start_ix..pipe_ix]))
}
948
949pub(crate) fn scan_link_dest(
953 data: &str,
954 start_ix: usize,
955 max_next: usize,
956) -> Option<(usize, &str)> {
957 let bytes = &data.as_bytes()[start_ix..];
958 let mut i = scan_ch(bytes, b'<');
959
960 if i != 0 {
961 while i < bytes.len() {
963 match bytes[i] {
964 b'\n' | b'\r' | b'<' => return None,
965 b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
966 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
967 i += 1;
968 }
969 _ => {}
970 }
971 i += 1;
972 }
973 None
974 } else {
975 let mut nest = 0;
977 while i < bytes.len() {
978 match bytes[i] {
979 0x0..=0x20 => {
980 break;
981 }
982 b'(' => {
983 if nest > max_next {
984 return None;
985 }
986 nest += 1;
987 }
988 b')' => {
989 if nest == 0 {
990 break;
991 }
992 nest -= 1;
993 }
994 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
995 i += 1;
996 }
997 _ => {}
998 }
999 i += 1;
1000 }
1001 if nest != 0 {
1002 return None;
1003 }
1004 Some((i, &data[start_ix..(start_ix + i)]))
1005 }
1006}
1007
1008fn scan_attribute_name(data: &[u8]) -> Option<usize> {
1010 let (&c, tail) = data.split_first()?;
1011 if is_ascii_alpha(c) || c == b'_' || c == b':' {
1012 Some(
1013 1 + scan_while(tail, |c| {
1014 is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
1015 }),
1016 )
1017 } else {
1018 None
1019 }
1020}
1021
1022fn scan_attribute(
1026 data: &[u8],
1027 mut ix: usize,
1028 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1029 buffer: &mut Vec<u8>,
1030 buffer_ix: &mut usize,
1031) -> Option<usize> {
1032 ix += scan_attribute_name(&data[ix..])?;
1033 let ix_after_attribute = ix;
1034 ix = scan_whitespace_with_newline_handler_without_buffer(data, ix, newline_handler)?;
1035 if data.get(ix) == Some(&b'=') {
1036 ix = scan_whitespace_with_newline_handler(
1037 data,
1038 ix_after_attribute,
1039 newline_handler,
1040 buffer,
1041 buffer_ix,
1042 )?;
1043 ix += 1;
1044 ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
1045 ix = scan_attribute_value(data, ix, newline_handler, buffer, buffer_ix)?;
1046 Some(ix)
1047 } else {
1048 Some(ix_after_attribute)
1050 }
1051}
1052
1053fn scan_whitespace_with_newline_handler(
1057 data: &[u8],
1058 mut i: usize,
1059 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1060 buffer: &mut Vec<u8>,
1061 buffer_ix: &mut usize,
1062) -> Option<usize> {
1063 while i < data.len() {
1064 if !is_ascii_whitespace(data[i]) {
1065 return Some(i);
1066 }
1067 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1068 let handler = newline_handler?;
1069 i += eol_bytes;
1070 let skipped_bytes = handler(&data[i..]);
1071
1072 if skipped_bytes > 0 {
1073 buffer.extend(&data[*buffer_ix..i]);
1074 *buffer_ix = i + skipped_bytes;
1075 }
1076
1077 i += skipped_bytes;
1078 } else {
1079 i += 1;
1080 }
1081 }
1082
1083 Some(i)
1084}
1085
1086fn scan_whitespace_with_newline_handler_without_buffer(
1094 data: &[u8],
1095 mut i: usize,
1096 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1097) -> Option<usize> {
1098 while i < data.len() {
1099 if !is_ascii_whitespace(data[i]) {
1100 return Some(i);
1101 }
1102 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1103 let handler = newline_handler?;
1104 i += eol_bytes;
1105 let skipped_bytes = handler(&data[i..]);
1106 i += skipped_bytes;
1107 } else {
1108 i += 1;
1109 }
1110 }
1111
1112 Some(i)
1113}
1114
1115fn scan_attribute_value(
1117 data: &[u8],
1118 mut i: usize,
1119 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1120 buffer: &mut Vec<u8>,
1121 buffer_ix: &mut usize,
1122) -> Option<usize> {
1123 match *data.get(i)? {
1124 b @ b'"' | b @ b'\'' => {
1125 i += 1;
1126 while i < data.len() {
1127 if data[i] == b {
1128 return Some(i + 1);
1129 }
1130 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1131 let handler = newline_handler?;
1132 i += eol_bytes;
1133 let skipped_bytes = handler(&data[i..]);
1134
1135 if skipped_bytes > 0 {
1136 buffer.extend(&data[*buffer_ix..i]);
1137 *buffer_ix = i + skipped_bytes;
1138 }
1139 i += skipped_bytes;
1140 } else {
1141 i += 1;
1142 }
1143 }
1144 return None;
1145 }
1146 b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
1147 return None;
1148 }
1149 _ => {
1150 i += scan_attr_value_chars(&data[i..]);
1152 }
1153 }
1154
1155 Some(i)
1156}
1157
1158pub(crate) fn unescape<'a, I: Into<CowStr<'a>>>(input: I, is_in_table: bool) -> CowStr<'a> {
1160 let input = input.into();
1161 let mut result = String::new();
1162 let mut mark = 0;
1163 let mut i = 0;
1164 let bytes = input.as_bytes();
1165 while i < bytes.len() {
1166 match bytes[i..] {
1167 [b'\\', b'\\', b'|', ..] if is_in_table => {
1171 result.push_str(&input[mark..i]);
1174 mark = i + 2;
1175 i += 3;
1176 }
1177 [b'\\', cx, ..] if is_ascii_punctuation(cx) => {
1178 result.push_str(&input[mark..i]);
1179 mark = i + 1;
1180 i += 2;
1181 }
1182 [b'&', ..] => match scan_entity(&bytes[i..]) {
1183 (n, Some(value)) => {
1184 result.push_str(&input[mark..i]);
1185 result.push_str(&value);
1186 i += n;
1187 mark = i;
1188 }
1189 _ => i += 1,
1190 },
1191 [b'\r', ..] => {
1192 result.push_str(&input[mark..i]);
1193 i += 1;
1194 mark = i;
1195 }
1196 _ => i += 1,
1197 }
1198 }
1199 if mark == 0 {
1200 input
1201 } else {
1202 result.push_str(&input[mark..]);
1203 result.into()
1204 }
1205}
1206
1207pub(crate) fn starts_html_block_type_6(data: &[u8]) -> bool {
1209 let i = scan_ch(data, b'/');
1210 let tail = &data[i..];
1211 let n = scan_while(tail, is_ascii_alphanumeric);
1212 if !is_html_tag(&tail[..n]) {
1213 return false;
1214 }
1215 let tail = &tail[n..];
1218 tail.is_empty()
1219 || tail[0] == b' '
1220 || tail[0] == b'\t'
1221 || tail[0] == b'\r'
1222 || tail[0] == b'\n'
1223 || tail[0] == b'>'
1224 || tail.len() >= 2 && &tail[..2] == b"/>"
1225}
1226
1227fn is_html_tag(tag: &[u8]) -> bool {
1228 HTML_TAGS
1229 .binary_search_by(|probe| {
1230 let probe_bytes_iter = probe.as_bytes().iter();
1231 let tag_bytes_iter = tag.iter();
1232
1233 probe_bytes_iter
1234 .zip(tag_bytes_iter)
1235 .find_map(|(&a, &b)| {
1236 match a.cmp(&(b | 0x20)) {
1239 std::cmp::Ordering::Equal => None,
1240 inequality => Some(inequality),
1241 }
1242 })
1243 .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1244 })
1245 .is_ok()
1246}
1247
1248pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1251 let (_span, i) = scan_html_block_inner(data, None)?;
1254 scan_blank_line(&data[i..])?;
1255 Some(i)
1256}
1257
/// Scans an HTML open or closing tag at the start of `data`.
///
/// `data[0]` is skipped without being inspected (presumably the `<` that the
/// caller has already seen; confirm against callers).
///
/// `newline_handler` is called with the bytes that follow each line ending
/// inside the tag and returns how many of them to skip. Without a handler, a
/// line ending inside an open tag makes the scan fail.
///
/// Returns `(buffer, end)`, where `end` is the index just past the closing
/// `>`. `buffer` stays empty unless the handler skipped bytes; in that case it
/// holds the tag's bytes with the skipped bytes left out.
pub(crate) fn scan_html_block_inner(
    data: &[u8],
    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
) -> Option<(Vec<u8>, usize)> {
    let mut buffer = Vec::new();
    // Start of the span of `data` that has not been copied into `buffer` yet.
    let mut last_buf_index = 0;

    // 1 for a closing tag (`</name`), 0 for an open tag.
    let close_tag_bytes = scan_ch(&data[1..], b'/');
    // The tag name starts with at least one ASCII letter ...
    let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
    if l == 0 {
        return None;
    }
    let mut i = 1 + close_tag_bytes + l;
    // ... and continues with letters, digits or dashes.
    i += scan_while(&data[i..], is_ascii_letterdigitdash);

    // Only open tags carry attributes.
    if close_tag_bytes == 0 {
        loop {
            let old_i = i;
            // Skip whitespace, crossing line endings through the handler.
            loop {
                i += scan_whitespace_no_nl(&data[i..]);
                if let Some(eol_bytes) = scan_eol(&data[i..]) {
                    // `scan_eol` reports 0 at the end of input: the tag is
                    // unterminated.
                    if eol_bytes == 0 {
                        return None;
                    }
                    let handler = newline_handler?;
                    i += eol_bytes;
                    let skipped_bytes = handler(&data[i..]);

                    let data_len = data.len() - i;

                    debug_assert!(
                        skipped_bytes <= data_len,
                        "Handler tried to skip too many bytes, fed {}, skipped {}",
                        data_len,
                        skipped_bytes
                    );

                    if skipped_bytes > 0 {
                        // Copy everything up to the start of the new line and
                        // leave the skipped bytes out of the buffer.
                        buffer.extend(&data[last_buf_index..i]);
                        i += skipped_bytes;
                        last_buf_index = i;
                    }
                } else {
                    break;
                }
            }
            // `/` or `>` ends the attribute list.
            if let Some(b'/') | Some(b'>') = data.get(i) {
                break;
            }
            if old_i == i {
                // An attribute must be separated from what precedes it by
                // whitespace.
                return None;
            }
            i = scan_attribute(data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
        }
    }

    i += scan_whitespace_no_nl(&data[i..]);

    // Open tags may be self-closing (`/>`).
    if close_tag_bytes == 0 {
        i += scan_ch(&data[i..], b'/');
    }

    if data.get(i) != Some(&b'>') {
        None
    } else {
        i += 1;
        // Flush the tail only when earlier parts were buffered; otherwise the
        // buffer stays empty.
        if !buffer.is_empty() {
            buffer.extend(&data[last_buf_index..i]);
        }
        Some((buffer, i))
    }
}
1338
1339pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1341 scan_uri(text, start_ix)
1342 .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1343 .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1344}
1345
1346fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1348 let bytes = &text.as_bytes()[start_ix..];
1349
1350 if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1352 return None;
1353 }
1354
1355 let mut i = 1;
1356
1357 while i < bytes.len() {
1358 let c = bytes[i];
1359 i += 1;
1360 match c {
1361 c if is_ascii_alphanumeric(c) => (),
1362 b'.' | b'-' | b'+' => (),
1363 b':' => break,
1364 _ => return None,
1365 }
1366 }
1367
1368 if !(3..=33).contains(&i) {
1371 return None;
1372 }
1373
1374 while i < bytes.len() {
1375 match bytes[i] {
1376 b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1377 b'\0'..=b' ' | b'<' => return None,
1378 _ => (),
1379 }
1380 i += 1;
1381 }
1382
1383 None
1384}
1385
1386fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1388 let bytes = &text.as_bytes()[start_ix..];
1390 let mut i = 0;
1391
1392 while i < bytes.len() {
1393 let c = bytes[i];
1394 i += 1;
1395 match c {
1396 c if is_ascii_alphanumeric(c) => (),
1397 b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1398 | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1399 b'@' if i > 1 => break,
1400 _ => return None,
1401 }
1402 }
1403
1404 loop {
1405 let label_start_ix = i;
1406 let mut fresh_label = true;
1407
1408 while i < bytes.len() {
1409 match bytes[i] {
1410 c if is_ascii_alphanumeric(c) => (),
1411 b'-' if fresh_label => {
1412 return None;
1413 }
1414 b'-' => (),
1415 _ => break,
1416 }
1417 fresh_label = false;
1418 i += 1;
1419 }
1420
1421 if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1422 return None;
1423 }
1424
1425 if bytes.get(i) != Some(&b'.') {
1426 break;
1427 }
1428 i += 1;
1429 }
1430
1431 if bytes.get(i) != Some(&b'>') {
1432 return None;
1433 }
1434
1435 Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1436}
1437
/// Scans the remainder of an inline HTML comment, CDATA section or
/// declaration. `bytes[ix]` is the byte that selects the form (presumably the
/// one right after `<!`; confirm against callers).
///
/// Returns the index just past the construct's end, or `None` when it is not
/// terminated or not recognised.
///
/// `scan_guard` records how far earlier scans of each form got, so that a
/// later call starting at or before that position takes none of the
/// form-specific branches and returns `None` without rescanning.
pub(crate) fn scan_inline_html_comment(
    bytes: &[u8],
    mut ix: usize,
    scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
    let c = *bytes.get(ix)?;
    ix += 1;
    match c {
        // Comment: two hyphens, then any text up to `-->`.
        b'-' if ix > scan_guard.comment => {
            // A second hyphen is required.
            if *bytes.get(ix)? != b'-' {
                return None;
            }
            // Step back onto the first hyphen so that the loop below also
            // accepts the short forms `<!-->` and `<!--->`.
            ix -= 1;

            // Jump from hyphen to hyphen, checking each time whether the
            // bytes that follow are `->`.
            while let Some(x) = memchr(b'-', &bytes[ix..]) {
                ix += x + 1;
                scan_guard.comment = ix;
                if bytes.get(ix) == Some(&b'-') && bytes.get(ix + 1) == Some(&b'>') {
                    return Some(ix + 2);
                }
            }
            None
        }
        // CDATA section: `[CDATA[`, then text up to a run of `]` followed by `>`.
        // NOTE(review): only the first run of `]` after `[CDATA[` is examined,
        // and a single `]` before `>` is accepted.
        b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
            ix += b"CDATA[".len();
            ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
            let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
            ix += close_brackets;

            if close_brackets == 0 || bytes.get(ix) != Some(&b'>') {
                scan_guard.cdata = ix;
                None
            } else {
                Some(ix + 1)
            }
        }
        // Declaration: an ASCII letter, then anything up to the first `>`.
        _ if c.is_ascii_alphabetic() && ix > scan_guard.declaration => {
            ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
            if bytes.get(ix) != Some(&b'>') {
                scan_guard.declaration = ix;
                None
            } else {
                Some(ix + 1)
            }
        }
        _ => None,
    }
}
1502
1503pub(crate) fn scan_inline_html_processing(
1506 bytes: &[u8],
1507 mut ix: usize,
1508 scan_guard: &mut HtmlScanGuard,
1509) -> Option<usize> {
1510 if ix <= scan_guard.processing {
1511 return None;
1512 }
1513 while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1514 ix += offset + 1;
1515 if bytes.get(ix) == Some(&b'>') {
1516 return Some(ix + 1);
1517 }
1518 }
1519 scan_guard.processing = ix;
1520 None
1521}
1522
#[cfg(test)]
mod test {
    use super::*;

    /// A digit run far longer than a list number followed by `!` is not a
    /// list item.
    #[test]
    fn overflow_list() {
        assert!(
            scan_listitem(b"4444444444444444444444444444444444444444444444444444444444!").is_none()
        );
    }

    /// Same check with a 22-digit number, which is larger than `u64::MAX`.
    #[test]
    fn overflow_by_addition() {
        assert!(scan_listitem(b"1844674407370955161615!").is_none());
    }

    /// Addresses that `scan_email` must accept. Scanning starts at index 1,
    /// just past the opening `<`.
    #[test]
    fn good_emails() {
        const EMAILS: &[&str] = &[
            "<a@b.c>",
            "<a@b>",
            "<a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-@example.com>",
            "<a@sixty-three-letters-in-this-identifier-----------------------63>",
        ];
        for email in EMAILS {
            assert!(scan_email(email, 1).is_some());
        }
    }

    /// Addresses that `scan_email` must reject: empty local part, labels that
    /// start or end with a hyphen, a trailing dot, characters outside the
    /// allowed set, and a 64-byte label.
    #[test]
    fn bad_emails() {
        const EMAILS: &[&str] = &[
            "<@b.c>",
            "<foo@-example.com>",
            "<foo@example-.com>",
            "<a@notrailingperiod.>",
            "<a(noparens)@example.com>",
            "<\"noquotes\"@example.com>",
            "<a@sixty-four-letters-in-this-identifier-------------------------64>",
        ];
        for email in EMAILS {
            assert!(scan_email(email, 1).is_none());
        }
    }
}