1use std::char;
24use std::convert::TryInto;
25
26use super::entities;
27use super::parse::{Alignment, HtmlScanGuard, LinkType};
28pub use super::puncttable::{is_ascii_punctuation, is_punctuation};
29use super::strings::CowStr;
30
31use memchr::memchr;
32
/// Tag names looked up by `is_html_tag`.
///
/// The entries are lowercase and MUST stay sorted in ascending byte order:
/// `is_html_tag` performs a binary search over this table and lowercases the
/// candidate tag by OR-ing each byte with `0x20` before comparing.
const HTML_TAGS: [&str; 62] = [
    "address",
    "article",
    "aside",
    "base",
    "basefont",
    "blockquote",
    "body",
    "caption",
    "center",
    "col",
    "colgroup",
    "dd",
    "details",
    "dialog",
    "dir",
    "div",
    "dl",
    "dt",
    "fieldset",
    "figcaption",
    "figure",
    "footer",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "header",
    "hr",
    "html",
    "iframe",
    "legend",
    "li",
    "link",
    "main",
    "menu",
    "menuitem",
    "nav",
    "noframes",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "section",
    "source",
    "summary",
    "table",
    "tbody",
    "td",
    "tfoot",
    "th",
    "thead",
    "title",
    "tr",
    "track",
    "ul",
];
98
/// Cursor over the leading part of a line, used to consume indentation and
/// container markers (block quotes, list markers, task-list markers).
///
/// The type is `Clone` so that scanning methods can snapshot the cursor and
/// restore it when a marker turns out not to match.
#[derive(Clone)]
pub struct LineStart<'a> {
    /// The bytes being scanned.
    bytes: &'a [u8],
    /// Byte index from which tab stops are measured; reset to the byte after
    /// each tab that is consumed.
    tab_start: usize,
    /// Index of the next unread byte in `bytes`.
    ix: usize,
    /// Columns left over from a tab that was only partially consumed.
    spaces_remaining: usize,
    /// Value taken from the `Err` of the last failed `scan_hrule` call made by
    /// `scan_list_marker`; the hrule check is skipped while `ix` is below it.
    min_hrule_offset: usize,
}
111
impl<'a> LineStart<'a> {
    /// Creates a cursor positioned at the first byte of `bytes`, with no
    /// pending tab remainder.
    pub(crate) fn new(bytes: &[u8]) -> LineStart {
        LineStart {
            bytes,
            tab_start: 0,
            ix: 0,
            spaces_remaining: 0,
            min_hrule_offset: 0,
        }
    }

    /// Tries to consume `n_space` columns of whitespace.
    ///
    /// Returns `true` only when all requested columns were available. Note
    /// that on `false` the whitespace that *was* found has still been
    /// consumed; callers needing all-or-nothing behaviour restore a saved
    /// clone themselves.
    pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
        self.scan_space_inner(n_space) == 0
    }

    /// Consumes at most `n_space` columns of whitespace and returns how many
    /// columns were actually consumed.
    pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
        n_space - self.scan_space_inner(n_space)
    }

    /// Shared worker for the two methods above.
    ///
    /// Returns the number of requested columns that could NOT be consumed
    /// (0 means the request was fully satisfied).
    fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
        // First use up columns left over from a previously split tab.
        let n_from_remaining = self.spaces_remaining.min(n_space);
        self.spaces_remaining -= n_from_remaining;
        n_space -= n_from_remaining;
        while n_space > 0 && self.ix < self.bytes.len() {
            match self.bytes[self.ix] {
                b' ' => {
                    self.ix += 1;
                    n_space -= 1;
                }
                b'\t' => {
                    // A tab advances to the next multiple of four columns,
                    // measured from `tab_start`.
                    let spaces = 4 - (self.ix - self.tab_start) % 4;
                    self.ix += 1;
                    self.tab_start = self.ix;
                    let n = spaces.min(n_space);
                    n_space -= n;
                    // Remember the part of the tab that was not needed.
                    self.spaces_remaining = spaces - n;
                }
                _ => break,
            }
        }
        n_space
    }

    /// Skips every space and tab at the cursor and discards any pending tab
    /// remainder.
    pub(crate) fn scan_all_space(&mut self) {
        self.spaces_remaining = 0;
        self.ix += self.bytes[self.ix..]
            .iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count();
    }

    /// Returns `true` when the cursor is at the end of the input or the next
    /// byte is `\r` or `\n`.
    pub(crate) fn is_at_eol(&self) -> bool {
        self.bytes
            .get(self.ix)
            .map(|&c| c == b'\r' || c == b'\n')
            .unwrap_or(true)
    }

    /// Consumes the byte `c` if it is next; returns whether it was consumed.
    fn scan_ch(&mut self, c: u8) -> bool {
        if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
            self.ix += 1;
            true
        } else {
            false
        }
    }

    /// Scans a block quote marker: up to three columns of indentation, a `>`,
    /// and one optional following column of whitespace.
    ///
    /// On failure the cursor is restored to where it was on entry.
    pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
        let save = self.clone();
        let _ = self.scan_space(3);
        if self.scan_ch(b'>') {
            let _ = self.scan_space(1);
            true
        } else {
            *self = save;
            false
        }
    }

    /// Scans a list marker at the cursor.
    ///
    /// On success returns `(marker, start, indent)`:
    /// * `marker` is the bullet byte (`-`, `+`, `*`) or, for ordered lists,
    ///   the delimiter byte (`.` or `)`),
    /// * `start` is the parsed number for ordered lists and `0` for bullets,
    /// * `indent` is the value computed by `finish_list_marker`.
    ///
    /// Returns `None` and restores the cursor when no marker is present or
    /// when a bullet character begins a line that `scan_hrule` accepts.
    pub(crate) fn scan_list_marker(&mut self) -> Option<(u8, u64, usize)> {
        let save = self.clone();
        let indent = self.scan_space_upto(3);
        if self.ix < self.bytes.len() {
            let c = self.bytes[self.ix];
            if c == b'-' || c == b'+' || c == b'*' {
                if self.ix >= self.min_hrule_offset {
                    // The line might be a horizontal rule rather than a list.
                    if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
                        // Cache the failure so the check above can skip
                        // repeated hrule scans.
                        self.min_hrule_offset = min_offset;
                    } else {
                        *self = save;
                        return None;
                    }
                }
                self.ix += 1;
                // A marker must be followed by whitespace or the end of line.
                if self.scan_space(1) || self.is_at_eol() {
                    return self.finish_list_marker(c, 0, indent + 2);
                }
            } else if c >= b'0' && c <= b'9' {
                let start_ix = self.ix;
                let mut ix = self.ix + 1;
                let mut val = u64::from(c - b'0');
                // At most nine bytes after the first digit are examined, so a
                // marker has at most nine digits before its delimiter.
                while ix < self.bytes.len() && ix - start_ix < 10 {
                    let c = self.bytes[ix];
                    ix += 1;
                    if c >= b'0' && c <= b'9' {
                        val = val * 10 + u64::from(c - b'0');
                    } else if c == b')' || c == b'.' {
                        self.ix = ix;
                        if self.scan_space(1) || self.is_at_eol() {
                            return self.finish_list_marker(c, val, indent + self.ix - start_ix);
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }
        }
        *self = save;
        None
    }

    /// Completes a list marker by accounting for whitespace after it.
    ///
    /// If the rest of the line is blank the indent is returned unchanged.
    /// Otherwise up to four more columns are scanned: fewer than four are
    /// consumed and added to `indent`; four or more restore the cursor and
    /// leave `indent` as passed in.
    fn finish_list_marker(
        &mut self,
        c: u8,
        start: u64,
        mut indent: usize,
    ) -> Option<(u8, u64, usize)> {
        let save = self.clone();

        // Nothing but whitespace follows the marker.
        if scan_blank_line(&self.bytes[self.ix..]).is_some() {
            return Some((c, start, indent));
        }

        let post_indent = self.scan_space_upto(4);
        if post_indent < 4 {
            indent += post_indent;
        } else {
            *self = save;
        }
        Some((c, start, indent))
    }

    /// Scans a task list marker: up to three columns of indentation followed
    /// by `[ ]`, `[x]` or `[X]`, which must itself be followed by a
    /// non-newline whitespace byte.
    ///
    /// Returns `Some(true)` for a checked box and `Some(false)` for an
    /// unchecked one. On success the cursor is left just after the `]` (the
    /// trailing whitespace is only peeked at). On failure the cursor is
    /// restored and `None` is returned.
    pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
        let save = self.clone();
        self.scan_space_upto(3);

        if !self.scan_ch(b'[') {
            *self = save;
            return None;
        }
        let is_checked = match self.bytes.get(self.ix) {
            Some(&c) if is_ascii_whitespace_no_nl(c) => {
                self.ix += 1;
                false
            }
            Some(b'x') | Some(b'X') => {
                self.ix += 1;
                true
            }
            _ => {
                *self = save;
                return None;
            }
        };
        if !self.scan_ch(b']') {
            *self = save;
            return None;
        }
        // The box must be followed by whitespace on the same line.
        if !self
            .bytes
            .get(self.ix)
            .map(|&b| is_ascii_whitespace_no_nl(b))
            .unwrap_or(false)
        {
            *self = save;
            return None;
        }
        Some(is_checked)
    }

    /// Number of bytes consumed so far.
    pub(crate) fn bytes_scanned(&self) -> usize {
        self.ix
    }

    /// Columns still pending from a partially consumed tab.
    pub(crate) fn remaining_space(&self) -> usize {
        self.spaces_remaining
    }
}
322
/// Returns `true` for the bytes 0x09 through 0x0d (tab, line feed, vertical
/// tab, form feed, carriage return) and for the space character.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    match c {
        0x09..=0x0d | b' ' => true,
        _ => false,
    }
}
326
/// Returns `true` for tab, vertical tab (0x0b), form feed (0x0c) and space;
/// line feed and carriage return are deliberately excluded.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    match c {
        b'\t' | 0x0b | 0x0c | b' ' => true,
        _ => false,
    }
}
330
/// Returns `true` for the ASCII letters `a`-`z` and `A`-`Z`.
fn is_ascii_alpha(c: u8) -> bool {
    c.is_ascii_alphabetic()
}
337
/// Returns `true` for ASCII digits and ASCII letters of either case.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
344
345fn is_ascii_letterdigitdash(c: u8) -> bool {
346 c == b'-' || is_ascii_alphanumeric(c)
347}
348
/// Returns `true` for the ASCII digits `0`-`9`.
fn is_digit(c: u8) -> bool {
    c.is_ascii_digit()
}
352
/// Returns `true` unless `c` is one of the bytes that end an unquoted HTML
/// attribute value: `'`, `"`, space, `=`, `>`, `<`, backtick, `\n` or `\r`.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    const TERMINATORS: &[u8] = b"'\" =><`\n\r";
    !TERMINATORS.contains(&c)
}
359
/// Returns 1 if `data` begins with the byte `c`, and 0 otherwise (including
/// when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    match data.first() {
        Some(&first) if first == c => 1,
        _ => 0,
    }
}
368
/// Counts the bytes at the start of `data` for which `f` returns `true`,
/// stopping at the first byte that is rejected.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    let mut accepted = 0;
    for &byte in data {
        if !f(byte) {
            break;
        }
        accepted += 1;
    }
    accepted
}
375
/// Counts the bytes at the END of `data` for which `f` returns `true`,
/// walking backwards and stopping at the first byte that is rejected.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    let mut accepted = 0;
    for &byte in data.iter().rev() {
        if !f(byte) {
            break;
        }
        accepted += 1;
    }
    accepted
}
382
383pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
384 scan_while(data, |x| x == c)
385}
386
387pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
390 scan_while(data, is_ascii_whitespace_no_nl)
391}
392
393fn scan_attr_value_chars(data: &[u8]) -> usize {
394 scan_while(data, is_valid_unquoted_attr_value_char)
395}
396
/// Scans a line ending at the start of `bytes`.
///
/// Returns `Some(0)` for empty input, `Some(1)` for `\n` or a lone `\r`,
/// `Some(2)` for `\r\n`, and `None` when the first byte is anything else.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes.first() {
        None => Some(0),
        Some(b'\n') => Some(1),
        Some(b'\r') => match bytes.get(1) {
            Some(b'\n') => Some(2),
            _ => Some(1),
        },
        Some(_) => None,
    }
}
407
408pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
409 let i = scan_whitespace_no_nl(bytes);
410 scan_eol(&bytes[i..]).map(|n| i + n)
411}
412
413pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
414 memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
415}
416
417pub(crate) fn scan_closing_code_fence(
420 bytes: &[u8],
421 fence_char: u8,
422 n_fence_char: usize,
423) -> Option<usize> {
424 if bytes.is_empty() {
425 return Some(0);
426 }
427 let mut i = 0;
428 let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
429 if num_fence_chars_found < n_fence_char {
430 return None;
431 }
432 i += num_fence_chars_found;
433 let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
434 i += num_trailing_spaces;
435 scan_eol(&bytes[i..]).map(|_| i)
436}
437
/// Scans a closing display-math fence: exactly two `$` bytes, optional
/// trailing spaces, then a line ending or the end of input.
///
/// Returns `Some(0)` for empty input; otherwise the number of bytes covered
/// by the `$$` and its trailing spaces (the line ending itself is not
/// counted), or `None` when the line is not a closing fence.
pub(crate) fn scan_closing_display_math(bytes: &[u8]) -> Option<usize> {
    if bytes.is_empty() {
        return Some(0);
    }
    let n_dollars = bytes.iter().take_while(|&&b| b == b'$').count();
    if n_dollars != 2 {
        return None;
    }
    // Trailing spaces must be counted starting AFTER the fence. Counting from
    // the start of `bytes` always yields zero (the first byte is `$`), which
    // made fences followed by spaces fail to close.
    let n_spaces = bytes[n_dollars..]
        .iter()
        .take_while(|&&b| b == b' ')
        .count();
    let end = n_dollars + n_spaces;
    // Accept end of input or the start of a line ending.
    match bytes.get(end) {
        None | Some(b'\n') | Some(b'\r') => Some(end),
        _ => None,
    }
}
452
/// Measures leading indentation of `text`, up to `max` columns.
///
/// Returns `(offset, spaces)` where `spaces` is the indentation width in
/// columns (tabs advance to the next multiple of four) and `offset` is the
/// index of the last byte after which the scan continued. Note that `offset`
/// is an index, not a count: it stays 0 when no byte or only one byte was
/// stepped past, and it is not advanced for the space that reaches `max`.
fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut width = 0;
    let mut last_ix = 0;
    let mut ix = 0;

    while ix < text.len() {
        match text[ix] {
            b' ' => {
                width += 1;
                if width == max {
                    // The width is updated, the offset is not.
                    return (last_ix, width);
                }
            }
            b'\t' => {
                let after_tab = width + 4 - (width & 3);
                if after_tab > max {
                    // The tab would overshoot: leave it unconsumed.
                    return (last_ix, width);
                }
                width = after_tab;
            }
            _ => return (last_ix, width),
        }
        last_ix = ix;
        ix += 1;
    }

    (last_ix, width)
}
480
481pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
489 if bytes.len() < 3 {
490 return Err(0);
491 }
492 let c = bytes[0];
493 if !(c == b'*' || c == b'-' || c == b'_') {
494 return Err(0);
495 }
496 let mut n = 0;
497 let mut i = 0;
498
499 while i < bytes.len() {
500 match bytes[i] {
501 b'\n' | b'\r' => {
502 i += scan_eol(&bytes[i..]).unwrap_or(0);
503 break;
504 }
505 c2 if c2 == c => {
506 n += 1;
507 }
508 b' ' | b'\t' => (),
509 _ => return Err(i),
510 }
511 i += 1;
512 }
513 if n >= 3 {
514 Ok(i)
515 } else {
516 Err(i)
517 }
518}
519
520pub(crate) fn scan_frontmatter_delimiter(bytes: &[u8]) -> Option<usize> {
521 if bytes.len() < 3 {
522 return None;
523 }
524 let c = bytes[0];
525 if !(c == b'-' || c == b'+') {
526 return None;
527 }
528 let mut n = 0;
529 let mut i = 0;
530
531 while i < bytes.len() {
532 match bytes[i] {
533 b'\n' | b'\r' => {
534 i += scan_eol(&bytes[i..]).unwrap_or(0);
535 break;
536 }
537 c2 if c2 == c => {
538 n += 1;
539 }
540 b' ' | b'\t' => (),
541 _ => return None,
542 }
543 i += 1;
544 }
545 if n >= 3 {
546 Some(i)
547 } else {
548 None
549 }
550}
551
552pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<usize> {
556 let level = scan_ch_repeat(data, b'#');
557 if level >= 1 && level <= 6 && data.get(level).cloned().map_or(true, is_ascii_whitespace) {
558 Some(level)
559 } else {
560 None
561 }
562}
563
564pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, u32)> {
568 let c = *data.get(0)?;
569 if !(c == b'-' || c == b'=') {
570 return None;
571 }
572 let mut i = 1 + scan_ch_repeat(&data[1..], c);
573 i += scan_blank_line(&data[i..])?;
574 let level = if c == b'=' { 1 } else { 2 };
575 Some((i, level))
576}
577
/// Scans a table delimiter row (the `|---|:--:|` line under a table header).
///
/// Returns `(bytes, alignments)`: the number of bytes scanned and one
/// `Alignment` per column. Returns `(0, vec![])` when the line is indented by
/// more than three columns or `calc_indent`'s offset equals `data.len()`.
/// When an unexpected byte is met, the column list is emptied and the
/// returned byte count is the position of that byte.
pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
    let (mut i, spaces) = calc_indent(data, 4);
    if spaces > 3 || i == data.len() {
        return (0, vec![]);
    }
    let mut cols = vec![];
    let mut active_col = Alignment::None;
    // True until the first `:` or `-` of the current cell has been seen.
    let mut start_col = true;
    // A leading pipe is optional.
    if data[i] == b'|' {
        i += 1;
    }
    // `i` is advanced in lock-step with the iterator so that it always indexes
    // the byte `c` refers to.
    for c in &data[i..] {
        if let Some(n) = scan_eol(&data[i..]) {
            i += n;
            break;
        }
        match *c {
            b' ' => (),
            b':' => {
                // A colon before the dashes means left, after them right, and
                // on both sides center.
                active_col = match (start_col, active_col) {
                    (true, Alignment::None) => Alignment::Left,
                    (false, Alignment::Left) => Alignment::Center,
                    (false, Alignment::None) => Alignment::Right,
                    _ => active_col,
                };
                start_col = false;
            }
            b'-' => {
                start_col = false;
            }
            b'|' => {
                start_col = true;
                cols.push(active_col);
                active_col = Alignment::None;
            }
            _ => {
                // Not a delimiter row: discard everything collected so far.
                cols = vec![];
                start_col = true;
                break;
            }
        }
        i += 1;
    }

    // The last cell has no trailing pipe; record it as well.
    if !start_col {
        cols.push(active_col);
    }

    (i, cols)
}
630
631pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
635 let c = *data.get(0)?;
636 if !(c == b'`' || c == b'~') {
637 return None;
638 }
639 let i = 1 + scan_ch_repeat(&data[1..], c);
640 if i >= 3 {
641 if c == b'`' {
642 let suffix = &data[i..];
643 let next_line = i + scan_nextline(suffix);
644 if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
646 return None;
647 }
648 }
649 Some((i, c))
650 } else {
651 None
652 }
653}
654
655pub(crate) fn scan_display_math(data: &[u8]) -> bool {
657 if data.len() < 2 || *data.get(0).unwrap() != b'$' {
658 return false;
659 }
660 let i = 1 + scan_ch_repeat(&data[1..], b'$');
661 if i == 2 {
662 let suffix = &data[i..];
663 let next_line = i + scan_nextline(suffix);
664 if suffix[..(next_line - i)].iter().any(|&b| b == b'$') {
665 return false;
666 }
667 true
668 } else {
669 false
670 }
671}
672
/// Returns `Some(2)` when `data` begins with `>` followed by a space, and
/// `None` otherwise.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    let has_marker = data.len() >= 2 && data[0] == b'>' && data[1] == b' ';
    if has_marker {
        Some(2)
    } else {
        None
    }
}
680
681pub(crate) fn scan_empty_list(data: &[u8]) -> bool {
683 let mut ix = 0;
684 for _ in 0..2 {
685 if let Some(bytes) = scan_blank_line(&data[ix..]) {
686 ix += bytes;
687 } else {
688 return false;
689 }
690 }
691 true
692}
693
694pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
696 let mut c = *bytes.get(0)?;
697 let (w, start) = match c {
698 b'-' | b'+' | b'*' => (1, 0),
699 b'0'..=b'9' => {
700 let (length, start) = parse_decimal(bytes);
701 c = *bytes.get(length)?;
702 if !(c == b'.' || c == b')') {
703 return None;
704 }
705 (length + 1, start)
706 }
707 _ => {
708 return None;
709 }
710 };
711 let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
713 if postindent == 0 {
714 scan_eol(&bytes[w..])?;
715 postindent += 1;
716 } else if postindent > 4 {
717 postn = 1;
718 postindent = 1;
719 }
720 if scan_blank_line(&bytes[w..]).is_some() {
721 postn = 0;
722 postindent = 1;
723 }
724 Some((w + postn, c, start, w + postindent))
725}
726
727fn parse_decimal(bytes: &[u8]) -> (usize, usize) {
729 match bytes
730 .iter()
731 .take_while(|&&b| is_digit(b))
732 .try_fold((0, 0usize), |(count, acc), c| {
733 let digit = usize::from(c - b'0');
734 match acc
735 .checked_mul(10)
736 .and_then(|ten_acc| ten_acc.checked_add(digit))
737 {
738 Some(number) => Ok((count + 1, number)),
739 None => Err((count, acc)),
741 }
742 }) {
743 Ok(p) | Err(p) => p,
744 }
745}
746
/// Parses a run of hexadecimal digits (either case) at the start of `bytes`.
///
/// Returns `(digits_consumed, value)`. Parsing stops at the first byte that
/// is not a hex digit, or just before the digit that would overflow `usize`;
/// in the latter case the value accumulated so far is returned.
fn parse_hex(bytes: &[u8]) -> (usize, usize) {
    let mut consumed = 0;
    let mut value = 0usize;
    for &byte in bytes {
        let digit = match byte {
            b'0'..=b'9' => usize::from(byte - b'0'),
            b'a'..=b'f' => usize::from(byte - b'a' + 10),
            b'A'..=b'F' => usize::from(byte - b'A' + 10),
            _ => break,
        };
        match value.checked_mul(16).and_then(|v| v.checked_add(digit)) {
            Some(next) => {
                value = next;
                consumed += 1;
            }
            // Stop early rather than wrapping around.
            None => break,
        }
    }
    (consumed, value)
}
774
/// Converts a numeric code point to a `char`.
///
/// Code point 0 is mapped to U+FFFD (the replacement character). Returns
/// `None` when `input` does not fit in a `u32` or is not a valid Unicode
/// scalar value.
fn char_from_codepoint(input: usize) -> Option<char> {
    let raw: u32 = input.try_into().ok()?;
    let codepoint = if raw == 0 { 0xFFFD } else { raw };
    char::from_u32(codepoint)
}
782
783pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
785 let mut end = 1;
786 if scan_ch(&bytes[end..], b'#') == 1 {
787 end += 1;
788 let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
789 end += 1;
790 parse_hex(&bytes[end..])
791 } else {
792 parse_decimal(&bytes[end..])
793 };
794 end += bytecount;
795 return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
796 (0, None)
797 } else if let Some(c) = char_from_codepoint(codepoint) {
798 (end + 1, Some(c.into()))
799 } else {
800 (0, None)
801 };
802 }
803 end += scan_while(&bytes[end..], is_ascii_alphanumeric);
804 if scan_ch(&bytes[end..], b';') == 1 {
805 if let Some(value) = entities::get_entity(&bytes[1..end]) {
806 return (end + 1, Some(value.into()));
807 }
808 }
809 (0, None)
810}
811
/// Scans the title of a link reference definition.
///
/// `text` must start with the opening delimiter: `'`, `"` or `(` (closed by
/// `'`, `"` and `)` respectively). A backslash escapes the character that
/// follows it. The title may span several lines, but may not contain a blank
/// line.
///
/// Returns `(bytes_consumed, title)` where `bytes_consumed` includes both
/// delimiters and `title` is the text between them (escapes left in place).
/// Returns `None` when `text` does not start with a delimiter, the title is
/// unterminated, or a blank line is found inside it.
pub(crate) fn scan_refdef_title(text: &str) -> Option<(usize, &str)> {
    let mut chars = text.chars().peekable();
    let closing_delim = match chars.next()? {
        '\'' => '\'',
        '"' => '"',
        '(' => ')',
        _ => return None,
    };
    let mut bytecount = 1;

    while let Some(c) = chars.next() {
        match c {
            '\n' => {
                bytecount += 1;
                // Skip leading whitespace on the continuation line. The test
                // is done on `char` (same set as `is_ascii_whitespace_no_nl`):
                // casting to `u8` would truncate non-ASCII characters such as
                // U+0120 to 0x20 and wrongly treat them as spaces.
                loop {
                    match *chars.peek()? {
                        ' ' | '\t' | '\x0b' | '\x0c' => {
                            chars.next();
                            bytecount += 1;
                        }
                        // Blank line inside the title: not allowed.
                        '\n' => return None,
                        _ => break,
                    }
                }
            }
            '\\' => {
                let next_char = chars.next()?;
                bytecount += 1 + next_char.len_utf8();
            }
            c if c == closing_delim => {
                return Some((bytecount + 1, &text[1..bytecount]));
            }
            c => {
                bytecount += c.len_utf8();
            }
        }
    }
    None
}
852
/// Scans a link destination starting at byte offset `start_ix` of `data`.
///
/// Two forms are recognised:
/// * pointy: `<...>`, which may not contain a line ending or another `<`;
/// * bare: a run of bytes ending at the first byte in `0x00..=0x20` or at an
///   unmatched `)`.
///
/// In both forms a backslash followed by ASCII punctuation is skipped as a
/// pair. `max_next` bounds how deeply parentheses may nest in the bare form.
///
/// Returns `(bytes_consumed, destination)`; for the pointy form the count
/// includes both angle brackets while the destination slice excludes them.
/// Returns `None` for an unterminated or invalid pointy destination, or when
/// the nesting limit is exceeded.
pub(crate) fn scan_link_dest(
    data: &str,
    start_ix: usize,
    max_next: usize,
) -> Option<(usize, &str)> {
    let bytes = &data.as_bytes()[start_ix..];
    let mut i = scan_ch(bytes, b'<');

    if i != 0 {
        // Pointy destination.
        while i < bytes.len() {
            match bytes[i] {
                b'\n' | b'\r' | b'<' => return None,
                b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
                    // Skip the escaped byte together with the backslash.
                    i += 1;
                }
                _ => {}
            }
            i += 1;
        }
        None
    } else {
        // Bare destination.
        let mut nest = 0;
        while i < bytes.len() {
            match bytes[i] {
                0x0..=0x20 => {
                    break;
                }
                b'(' => {
                    if nest > max_next {
                        return None;
                    }
                    nest += 1;
                }
                b')' => {
                    // A closing paren with nothing open ends the destination.
                    if nest == 0 {
                        break;
                    }
                    nest -= 1;
                }
                b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
                    // Skip the escaped byte together with the backslash.
                    i += 1;
                }
                _ => {}
            }
            i += 1;
        }
        Some((i, &data[start_ix..(start_ix + i)]))
    }
}
908
909fn scan_attribute_name(data: &[u8]) -> Option<usize> {
911 let (&c, tail) = data.split_first()?;
912 if is_ascii_alpha(c) || c == b'_' || c == b':' {
913 Some(
914 1 + scan_while(tail, |c| {
915 is_ascii_alphanumeric(c) || c == b'_' || c == b'.' || c == b':' || c == b'-'
916 }),
917 )
918 } else {
919 None
920 }
921}
922
923fn scan_attribute(data: &[u8], newline_handler: Option<&dyn Fn(&[u8]) -> usize>) -> Option<usize> {
926 let allow_newline = newline_handler.is_some();
927 let whitespace_scanner =
928 |c| is_ascii_whitespace(c) && (allow_newline || c != b'\n' && c != b'\r');
929 let mut ix = scan_attribute_name(data)?;
930 let n_whitespace = scan_while(&data[ix..], whitespace_scanner);
931 ix += n_whitespace;
932 if scan_ch(&data[ix..], b'=') == 1 {
933 ix += 1;
934 ix += scan_while(&data[ix..], whitespace_scanner);
935 ix += scan_attribute_value(&data[ix..], newline_handler)?;
936 } else if n_whitespace > 0 {
937 ix -= 1;
939 }
940 Some(ix)
941}
942
/// Scans an HTML attribute value, quoted or unquoted.
///
/// * Quoted (`"..."` or `'...'`): returns the length including both quotes.
///   A line ending inside the quotes is only accepted when `newline_handler`
///   is provided; the handler is then called on the text after the line
///   ending and the count it returns is skipped as well.
/// * Unquoted: returns the length of the run accepted by
///   `scan_attr_value_chars`.
///
/// Returns `None` for empty input, an unterminated quoted value, a line
/// ending inside quotes without a handler, or a first byte that cannot start
/// a value.
fn scan_attribute_value(
    data: &[u8],
    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
) -> Option<usize> {
    let mut i = 0;
    match *data.get(0)? {
        b @ b'"' | b @ b'\'' => {
            i += 1;
            while i < data.len() {
                // The matching quote ends the value.
                if data[i] == b {
                    return Some(i + 1);
                }
                if let Some(eol_bytes) = scan_eol(&data[i..]) {
                    // Without a handler, line endings are not allowed here.
                    let handler = newline_handler?;
                    i += eol_bytes;
                    i += handler(&data[i..]);
                } else {
                    i += 1;
                }
            }
            return None;
        }
        b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
            return None;
        }
        _ => {
            // Unquoted attribute value.
            i += scan_attr_value_chars(&data[i..]);
        }
    }
    Some(i)
}
975
/// Resolves backslash escapes and entities in `input`.
///
/// * A backslash followed by ASCII punctuation is dropped, keeping the
///   punctuation character.
/// * Entities recognised by `scan_entity` are replaced by their value.
/// * Carriage returns are removed.
///
/// The input is returned borrowed when `mark` is still 0 at the end, which is
/// how the function detects that nothing was rewritten; otherwise a newly
/// built string is returned.
pub(crate) fn unescape(input: &str) -> CowStr<'_> {
    let mut result = String::new();
    // Start of the pending span of `input` not yet copied into `result`.
    let mut mark = 0;
    let mut i = 0;
    let bytes = input.as_bytes();
    while i < bytes.len() {
        match bytes[i] {
            b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
                // Copy up to the backslash, then restart the pending span at
                // the escaped character so that it is kept.
                result.push_str(&input[mark..i]);
                mark = i + 1;
                i += 2;
            }
            b'&' => match scan_entity(&bytes[i..]) {
                (n, Some(value)) => {
                    result.push_str(&input[mark..i]);
                    result.push_str(&value);
                    i += n;
                    mark = i;
                }
                _ => i += 1,
            },
            b'\r' => {
                // Drop the carriage return.
                result.push_str(&input[mark..i]);
                i += 1;
                mark = i;
            }
            _ => i += 1,
        }
    }
    if mark == 0 {
        input.into()
    } else {
        result.push_str(&input[mark..]);
        result.into()
    }
}
1013
1014pub(crate) fn scan_html_block_tag(data: &[u8]) -> (usize, &[u8]) {
1016 let i = scan_ch(data, b'/');
1017 let n = scan_while(&data[i..], is_ascii_alphanumeric);
1018 (i + n, &data[i..i + n])
1020}
1021
1022pub(crate) fn is_html_tag(tag: &[u8]) -> bool {
1023 HTML_TAGS
1024 .binary_search_by(|probe| {
1025 let probe_bytes_iter = probe.as_bytes().iter();
1026 let tag_bytes_iter = tag.iter();
1027
1028 probe_bytes_iter
1029 .zip(tag_bytes_iter)
1030 .find_map(|(&a, &b)| {
1031 match a.cmp(&(b | 0x20)) {
1034 std::cmp::Ordering::Equal => None,
1035 inequality => Some(inequality),
1036 }
1037 })
1038 .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1039 })
1040 .is_ok()
1041}
1042
1043pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1045 let i = scan_html_block_inner(data, None)?;
1048 scan_blank_line(&data[i..])?;
1049 Some(i)
1050}
1051
/// Scans the inside of an HTML open or closing tag, up to and including the
/// final `>`.
///
/// `data` is expected to start right after the opening `<` (NOTE(review):
/// inferred from the fact that the scan begins at an optional `/` followed by
/// the tag name; confirm against callers).
///
/// For open tags, any number of attributes may follow the name, each
/// preceded by whitespace, and a trailing `/` is allowed before `>`. Closing
/// tags take no attributes. Line endings inside the tag are only accepted
/// when `newline_handler` is provided; the handler is called on the text
/// after each line ending and the count it returns is skipped as well.
///
/// Returns the number of bytes consumed, or `None` when the input is not a
/// well-formed tag.
pub(crate) fn scan_html_block_inner(
    data: &[u8],
    newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
) -> Option<usize> {
    let close_tag_bytes = scan_ch(data, b'/');
    // The tag name must start with at least one ASCII letter.
    let l = scan_while(&data[close_tag_bytes..], is_ascii_alpha);
    if l == 0 {
        return None;
    }
    let mut i = close_tag_bytes + l;
    i += scan_while(&data[i..], is_ascii_letterdigitdash);

    if close_tag_bytes == 0 {
        // Open tag: scan attributes until `/` or `>` is reached.
        loop {
            let old_i = i;
            // Skip whitespace, crossing line endings when permitted.
            loop {
                i += scan_whitespace_no_nl(&data[i..]);
                if let Some(eol_bytes) = scan_eol(&data[i..]) {
                    // `scan_eol` yields 0 at end of input: the tag is
                    // unterminated.
                    if eol_bytes == 0 {
                        return None;
                    }
                    if let Some(handler) = newline_handler {
                        i += eol_bytes;
                        i += handler(&data[i..]);
                    } else {
                        return None;
                    }
                } else {
                    break;
                }
            }
            if let Some(b'/') | Some(b'>') = data.get(i) {
                break;
            }
            if old_i == i {
                // No whitespace before the attribute, which is mandatory.
                return None;
            }
            i += scan_attribute(&data[i..], newline_handler)?;
        }
    }

    i += scan_whitespace_no_nl(&data[i..]);

    // Only open tags may be self-closing.
    if close_tag_bytes == 0 {
        i += scan_ch(&data[i..], b'/');
    }

    if scan_ch(&data[i..], b'>') == 0 {
        None
    } else {
        Some(i + 1)
    }
}
1111
1112pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1114 scan_uri(text, start_ix)
1115 .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1116 .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1117}
1118
1119fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1121 let bytes = &text.as_bytes()[start_ix..];
1122
1123 if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1125 return None;
1126 }
1127
1128 let mut i = 1;
1129
1130 while i < bytes.len() {
1131 let c = bytes[i];
1132 i += 1;
1133 match c {
1134 c if is_ascii_alphanumeric(c) => (),
1135 b'.' | b'-' | b'+' => (),
1136 b':' => break,
1137 _ => return None,
1138 }
1139 }
1140
1141 if i < 3 || i > 33 {
1144 return None;
1145 }
1146
1147 while i < bytes.len() {
1148 match bytes[i] {
1149 b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1150 b'\0'..=b' ' | b'<' => return None,
1151 _ => (),
1152 }
1153 i += 1;
1154 }
1155
1156 None
1157}
1158
/// Scans an email address terminated by `>`, starting at byte offset
/// `start_ix` of `text`.
///
/// The local part may contain ASCII letters, digits and the punctuation
/// listed below, and ends at `@`. The domain is one or more labels separated
/// by `.`; each label is 1 to 63 bytes of letters, digits and `-`, and may
/// neither start nor end with `-`.
///
/// Returns `(index, address)` where `index` is the offset in `text` just past
/// the closing `>` and `address` is the text before it.
fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
    let bytes = &text.as_bytes()[start_ix..];
    let mut i = 0;

    // Local part, up to and including the `@`.
    while i < bytes.len() {
        let c = bytes[i];
        i += 1;
        match c {
            c if is_ascii_alphanumeric(c) => (),
            b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
            | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
            b'@' => break,
            _ => return None,
        }
    }

    // Domain: one label per iteration.
    loop {
        let label_start_ix = i;
        let mut fresh_label = true;

        while i < bytes.len() {
            match bytes[i] {
                c if is_ascii_alphanumeric(c) => (),
                // A label may not begin with a hyphen.
                b'-' if fresh_label => {
                    return None;
                }
                b'-' => (),
                _ => break,
            }
            fresh_label = false;
            i += 1;
        }

        // Reject empty labels, over-long labels and labels ending in `-`.
        if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
            return None;
        }

        if scan_ch(&bytes[i..], b'.') == 0 {
            break;
        }
        i += 1;
    }

    if scan_ch(&bytes[i..], b'>') == 0 {
        return None;
    }

    Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
}
1210
/// Scans an inline HTML comment, CDATA section or declaration.
///
/// `ix` is an index into `bytes`; the byte there selects what is scanned:
/// `-` for a comment, `[` for a CDATA section, an uppercase ASCII letter for
/// a declaration. (NOTE(review): this suggests `ix` points just past `<!`;
/// confirm against callers.)
///
/// Returns the index in `bytes` just past the closing `>`, or `None`.
///
/// `scan_guard` records where failed CDATA and declaration scans stopped; a
/// later call whose position is not beyond the recorded index is rejected
/// without rescanning (presumably to avoid repeated scans of the same text).
pub(crate) fn scan_inline_html_comment(
    bytes: &[u8],
    mut ix: usize,
    scan_guard: &mut HtmlScanGuard,
) -> Option<usize> {
    let c = *bytes.get(ix)?;
    ix += 1;
    match c {
        b'-' => {
            // At least one more dash is needed to open a comment.
            let dashes = scan_ch_repeat(&bytes[ix..], b'-');
            if dashes < 1 {
                return None;
            }
            ix += dashes;
            // A `>` directly after the opening dashes is rejected.
            if scan_ch(&bytes[ix..], b'>') == 1 {
                return None;
            }

            // Look for the first `--`; it must be followed by `>`.
            while let Some(x) = memchr(b'-', &bytes[ix..]) {
                ix += x + 1;
                if scan_ch(&bytes[ix..], b'-') == 1 {
                    ix += 1;
                    return if scan_ch(&bytes[ix..], b'>') == 1 {
                        Some(ix + 1)
                    } else {
                        None
                    };
                }
            }
            None
        }
        b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
            ix += b"CDATA[".len();
            // Jump to the first `]` (or to the end of input if there is none).
            ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
            let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
            ix += close_brackets;

            if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
                // Remember how far this failed scan got.
                scan_guard.cdata = ix;
                None
            } else {
                Some(ix + 1)
            }
        }
        b'A'..=b'Z' if ix > scan_guard.declaration => {
            // Declaration: uppercase name, mandatory whitespace, then
            // anything up to `>`.
            ix += scan_while(&bytes[ix..], |c| c >= b'A' && c <= b'Z');
            let whitespace = scan_while(&bytes[ix..], is_ascii_whitespace);
            if whitespace == 0 {
                return None;
            }
            ix += whitespace;
            ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
            if scan_ch(&bytes[ix..], b'>') == 0 {
                // Remember how far this failed scan got.
                scan_guard.declaration = ix;
                None
            } else {
                Some(ix + 1)
            }
        }
        _ => None,
    }
}
1277
1278pub(crate) fn scan_inline_html_processing(
1281 bytes: &[u8],
1282 mut ix: usize,
1283 scan_guard: &mut HtmlScanGuard,
1284) -> Option<usize> {
1285 if ix <= scan_guard.processing {
1286 return None;
1287 }
1288 while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1289 ix += offset + 1;
1290 if scan_ch(&bytes[ix..], b'>') == 1 {
1291 return Some(ix + 1);
1292 }
1293 }
1294 scan_guard.processing = ix;
1295 None
1296}
1297
#[cfg(test)]
mod test {
    use super::*;

    /// A digit run far too long for `usize`, followed by a byte that is not a
    /// list delimiter, must be rejected rather than panic on overflow.
    #[test]
    fn overflow_list() {
        assert!(
            scan_listitem(b"4444444444444444444444444444444444444444444444444444444444!").is_none()
        );
    }

    /// Same as above with a number just past `u64::MAX`: on 64-bit targets
    /// the multiplication by ten still fits and it is the addition of the
    /// next digit that overflows.
    #[test]
    fn overflow_by_addition() {
        assert!(scan_listitem(b"1844674407370955161615!").is_none());
    }
}