use std::cmp::max;
use std::ops::Range;
use crate::parse::{scan_containers, Allocations, HeadingAttributes, Item, ItemBody, LinkDef};
use crate::scanners::*;
use crate::strings::CowStr;
use crate::tree::{Tree, TreeIndex};
use crate::Options;
use crate::{
linklabel::{scan_link_label_rest, LinkLabel},
HeadingLevel,
};
use unicase::UniCase;
pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations) {
let start_capacity = max(128, text.len() / 32);
let lookup_table = &create_lut(&options);
let first_pass = FirstPass {
text,
tree: Tree::with_capacity(start_capacity),
begin_list_item: false,
last_line_blank: false,
allocs: Allocations::new(),
options,
lookup_table,
};
first_pass.run()
}
struct FirstPass<'a, 'b> {
text: &'a str,
tree: Tree<Item>,
begin_list_item: bool,
last_line_blank: bool,
allocs: Allocations<'a>,
options: Options,
lookup_table: &'b LookupTable,
}
impl<'a, 'b> FirstPass<'a, 'b> {
fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
let mut ix = 0;
while ix < self.text.len() {
ix = self.parse_block(ix);
}
for _ in 0..self.tree.spine_len() {
self.pop(ix);
}
(self.tree, self.allocs)
}
fn parse_block(&mut self, mut start_ix: usize) -> usize {
let bytes = self.text.as_bytes();
let mut line_start = LineStart::new(&bytes[start_ix..]);
let i = scan_containers(&self.tree, &mut line_start);
for _ in i..self.tree.spine_len() {
self.pop(start_ix);
}
if self.options.contains(Options::ENABLE_FOOTNOTES) {
if let Some(node_ix) = self.tree.peek_up() {
if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
if self.last_line_blank {
self.pop(start_ix);
}
}
}
let container_start = start_ix + line_start.bytes_scanned();
if let Some(bytecount) = self.parse_footnote(container_start) {
start_ix = container_start + bytecount;
start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0);
line_start = LineStart::new(&bytes[start_ix..]);
}
}
loop {
let container_start = start_ix + line_start.bytes_scanned();
if let Some((ch, index, indent)) = line_start.scan_list_marker() {
let after_marker_index = start_ix + line_start.bytes_scanned();
self.continue_list(container_start, ch, index);
self.tree.append(Item {
start: container_start,
end: after_marker_index, body: ItemBody::ListItem(indent),
});
self.tree.push();
if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
self.begin_list_item = true;
return after_marker_index + n;
}
if self.options.contains(Options::ENABLE_TASKLISTS) {
if let Some(is_checked) = line_start.scan_task_list_marker() {
self.tree.append(Item {
start: after_marker_index,
end: start_ix + line_start.bytes_scanned(),
body: ItemBody::TaskListMarker(is_checked),
});
}
}
} else if line_start.scan_blockquote_marker() {
self.finish_list(start_ix);
self.tree.append(Item {
start: container_start,
end: 0, body: ItemBody::BlockQuote,
});
self.tree.push();
} else {
break;
}
}
let ix = start_ix + line_start.bytes_scanned();
if let Some(n) = scan_blank_line(&bytes[ix..]) {
if let Some(node_ix) = self.tree.peek_up() {
match self.tree[node_ix].item.body {
ItemBody::BlockQuote => (),
_ => {
if self.begin_list_item {
self.pop(start_ix);
}
self.last_line_blank = true;
}
}
}
return ix + n;
}
self.begin_list_item = false;
self.finish_list(start_ix);
let remaining_space = line_start.remaining_space();
let indent = line_start.scan_space_upto(4);
if indent == 4 {
let ix = start_ix + line_start.bytes_scanned();
let remaining_space = line_start.remaining_space();
return self.parse_indented_code_block(ix, remaining_space);
}
let ix = start_ix + line_start.bytes_scanned();
if bytes[ix] == b'<' {
if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) {
return self.parse_html_block_type_1_to_5(ix, html_end_tag, remaining_space);
}
if starts_html_block_type_6(&bytes[(ix + 1)..]) {
return self.parse_html_block_type_6_or_7(ix, remaining_space);
}
if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
return self.parse_html_block_type_6_or_7(ix, remaining_space);
}
}
if let Ok(n) = scan_hrule(&bytes[ix..]) {
return self.parse_hrule(n, ix);
}
if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
return self.parse_atx_heading(ix, atx_size);
}
if let Some((bytecount, label, link_def)) = self.parse_refdef_total(ix) {
self.allocs.refdefs.0.entry(label).or_insert(link_def);
let ix = ix + bytecount;
return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0);
}
if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
return self.parse_fenced_code_block(ix, indent, fence_ch, n);
}
self.parse_paragraph(ix)
}
fn parse_table(&mut self, table_cols: usize, head_start: usize, body_start: usize) -> usize {
let (_sep_start, thead_ix) = self.parse_table_row_inner(head_start, table_cols);
self.tree[thead_ix].item.body = ItemBody::TableHead;
let mut ix = body_start;
while let Some((next_ix, _row_ix)) = self.parse_table_row(ix, table_cols) {
ix = next_ix;
}
self.pop(ix);
ix
}
fn parse_table_row_inner(&mut self, mut ix: usize, row_cells: usize) -> (usize, TreeIndex) {
let bytes = self.text.as_bytes();
let mut cells = 0;
let mut final_cell_ix = None;
let row_ix = self.tree.append(Item {
start: ix,
end: 0, body: ItemBody::TableRow,
});
self.tree.push();
loop {
ix += scan_ch(&bytes[ix..], b'|');
let start_ix = ix;
ix += scan_whitespace_no_nl(&bytes[ix..]);
if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
ix += eol_bytes;
break;
}
let cell_ix = self.tree.append(Item {
start: start_ix,
end: ix,
body: ItemBody::TableCell,
});
self.tree.push();
let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active);
if let Some(cur_ix) = self.tree.cur() {
let trailing_whitespace = scan_rev_while(&bytes[..next_ix], is_ascii_whitespace);
self.tree[cur_ix].item.end -= trailing_whitespace;
}
self.tree[cell_ix].item.end = next_ix;
self.tree.pop();
ix = next_ix;
cells += 1;
if cells == row_cells {
final_cell_ix = Some(cell_ix);
}
}
for _ in cells..row_cells {
self.tree.append(Item {
start: ix,
end: ix,
body: ItemBody::TableCell,
});
}
if let Some(cell_ix) = final_cell_ix {
self.tree[cell_ix].next = None;
}
self.pop(ix);
(ix, row_ix)
}
fn parse_table_row(&mut self, mut ix: usize, row_cells: usize) -> Option<(usize, TreeIndex)> {
let bytes = self.text.as_bytes();
let mut line_start = LineStart::new(&bytes[ix..]);
let current_container =
scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
if !current_container {
return None;
}
line_start.scan_all_space();
ix += line_start.bytes_scanned();
if scan_paragraph_interrupt(&bytes[ix..], current_container) {
return None;
}
let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells);
Some((ix, row_ix))
}
fn parse_paragraph(&mut self, start_ix: usize) -> usize {
let node_ix = self.tree.append(Item {
start: start_ix,
end: 0, body: ItemBody::Paragraph,
});
self.tree.push();
let bytes = self.text.as_bytes();
let mut ix = start_ix;
loop {
if self.options.contains(Options::ENABLE_FOOTNOTES) && self.get_footnote(ix).is_some() {
self.tree[node_ix].item.end = if ix > start_ix { ix - 1 } else { start_ix };
self.tree.pop();
if let Some(node_ix) = self.tree.peek_up() {
if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
self.pop(ix);
}
}
return ix;
}
let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
TableParseMode::Scan
} else {
TableParseMode::Disabled
};
let (next_ix, brk) = self.parse_line(ix, None, scan_mode);
if let Some(Item {
body: ItemBody::Table(alignment_ix),
..
}) = brk
{
let table_cols = self.allocs[alignment_ix].len();
self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
self.tree[node_ix].child = None;
self.tree.pop();
self.tree.push();
return self.parse_table(table_cols, ix, next_ix);
}
ix = next_ix;
let mut line_start = LineStart::new(&bytes[ix..]);
let current_container =
scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
if !line_start.scan_space(4) {
let ix_new = ix + line_start.bytes_scanned();
if current_container {
let trailing_backslash_pos = match brk {
Some(Item {
start,
body: ItemBody::HardBreak,
..
}) if bytes[start] == b'\\' => Some(start),
_ => None,
};
if let Some(ix_setext) =
self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some())
{
if let Some(pos) = trailing_backslash_pos {
self.tree.append_text(pos, pos + 1);
}
ix = ix_setext;
break;
}
}
let suffix = &bytes[ix_new..];
if scan_paragraph_interrupt(suffix, current_container) {
break;
}
}
line_start.scan_all_space();
if line_start.is_at_eol() {
break;
}
ix = next_ix + line_start.bytes_scanned();
if let Some(item) = brk {
self.tree.append(item);
}
}
self.pop(ix);
ix
}
fn parse_setext_heading(
&mut self,
ix: usize,
node_ix: TreeIndex,
has_trailing_content: bool,
) -> Option<usize> {
let bytes = self.text.as_bytes();
let (n, level) = scan_setext_heading(&bytes[ix..])?;
let mut attrs = None;
if let Some(cur_ix) = self.tree.cur() {
let parent_ix = self.tree.peek_up().unwrap();
let header_start = self.tree[parent_ix].item.start;
let header_end = self.tree[cur_ix].item.end;
let (content_end, attrs_) =
self.extract_and_parse_heading_attribute_block(header_start, header_end);
attrs = attrs_;
let new_end = if has_trailing_content {
content_end
} else {
let trailing_ws =
scan_rev_while(&bytes[header_start..content_end], is_ascii_whitespace_no_nl);
content_end - trailing_ws
};
if attrs.is_some() {
self.tree.truncate_siblings(self.text.as_bytes(), new_end);
}
if let Some(cur_ix) = self.tree.cur() {
self.tree[cur_ix].item.end = new_end;
}
}
self.tree[node_ix].item.body = ItemBody::Heading(
level,
attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
);
Some(ix + n)
}
fn parse_line(
&mut self,
start: usize,
end: Option<usize>,
mode: TableParseMode,
) -> (usize, Option<Item>) {
let bytes = self.text.as_bytes();
let bytes = match end {
Some(end) => &bytes[..end],
None => bytes,
};
let bytes_len = bytes.len();
let mut pipes = 0;
let mut last_pipe_ix = start;
let mut begin_text = start;
let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| {
match byte {
b'\n' | b'\r' => {
if let TableParseMode::Active = mode {
return LoopInstruction::BreakAtWith(ix, None);
}
let mut i = ix;
let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
if mode == TableParseMode::Scan && pipes > 0 {
let next_line_ix = ix + eol_bytes;
let mut line_start = LineStart::new(&bytes[next_line_ix..]);
if scan_containers(&self.tree, &mut line_start) == self.tree.spine_len() {
let table_head_ix = next_line_ix + line_start.bytes_scanned();
let (table_head_bytes, alignment) =
scan_table_head(&bytes[table_head_ix..]);
if table_head_bytes > 0 {
let header_count =
count_header_cols(bytes, pipes, start, last_pipe_ix);
if alignment.len() == header_count {
let alignment_ix = self.allocs.allocate_alignment(alignment);
let end_ix = table_head_ix + table_head_bytes;
return LoopInstruction::BreakAtWith(
end_ix,
Some(Item {
start: i,
end: end_ix, body: ItemBody::Table(alignment_ix),
}),
);
}
}
}
}
let end_ix = ix + eol_bytes;
let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b'\\');
if trailing_backslashes % 2 == 1 && end_ix < bytes_len {
i -= 1;
self.tree.append_text(begin_text, i);
return LoopInstruction::BreakAtWith(
end_ix,
Some(Item {
start: i,
end: end_ix,
body: ItemBody::HardBreak,
}),
);
}
let trailing_whitespace =
scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
if trailing_whitespace >= 2 {
i -= trailing_whitespace;
self.tree.append_text(begin_text, i);
return LoopInstruction::BreakAtWith(
end_ix,
Some(Item {
start: i,
end: end_ix,
body: ItemBody::HardBreak,
}),
);
}
self.tree.append_text(begin_text, ix);
LoopInstruction::BreakAtWith(
end_ix,
Some(Item {
start: i,
end: end_ix,
body: ItemBody::SoftBreak,
}),
)
}
b'\\' => {
if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) {
self.tree.append_text(begin_text, ix);
if bytes[ix + 1] == b'`' {
let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`');
self.tree.append(Item {
start: ix + 1,
end: ix + count + 1,
body: ItemBody::MaybeCode(count, true),
});
begin_text = ix + 1 + count;
LoopInstruction::ContinueAndSkip(count)
} else {
begin_text = ix + 1;
LoopInstruction::ContinueAndSkip(1)
}
} else {
LoopInstruction::ContinueAndSkip(0)
}
}
c @ b'*' | c @ b'_' | c @ b'~' => {
let string_suffix = &self.text[ix..];
let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c);
let can_open = delim_run_can_open(self.text, string_suffix, count, ix);
let can_close = delim_run_can_close(self.text, string_suffix, count, ix);
let is_valid_seq = c != b'~' || count <= 2;
if (can_open || can_close) && is_valid_seq {
self.tree.append_text(begin_text, ix);
for i in 0..count {
self.tree.append(Item {
start: ix + i,
end: ix + i + 1,
body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
});
}
begin_text = ix + count;
}
LoopInstruction::ContinueAndSkip(count - 1)
}
b'`' => {
self.tree.append_text(begin_text, ix);
let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`');
self.tree.append(Item {
start: ix,
end: ix + count,
body: ItemBody::MaybeCode(count, false),
});
begin_text = ix + count;
LoopInstruction::ContinueAndSkip(count - 1)
}
b'<' => {
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + 1,
body: ItemBody::MaybeHtml,
});
begin_text = ix + 1;
LoopInstruction::ContinueAndSkip(0)
}
b'!' => {
if ix + 1 < bytes_len && bytes[ix + 1] == b'[' {
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + 2,
body: ItemBody::MaybeImage,
});
begin_text = ix + 2;
LoopInstruction::ContinueAndSkip(1)
} else {
LoopInstruction::ContinueAndSkip(0)
}
}
b'[' => {
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + 1,
body: ItemBody::MaybeLinkOpen,
});
begin_text = ix + 1;
LoopInstruction::ContinueAndSkip(0)
}
b']' => {
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + 1,
body: ItemBody::MaybeLinkClose(true),
});
begin_text = ix + 1;
LoopInstruction::ContinueAndSkip(0)
}
b'&' => match scan_entity(&bytes[ix..]) {
(n, Some(value)) => {
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + n,
body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
});
begin_text = ix + n;
LoopInstruction::ContinueAndSkip(n - 1)
}
_ => LoopInstruction::ContinueAndSkip(0),
},
b'|' => {
if let TableParseMode::Active = mode {
LoopInstruction::BreakAtWith(ix, None)
} else {
last_pipe_ix = ix;
pipes += 1;
LoopInstruction::ContinueAndSkip(0)
}
}
b'.' => {
if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' {
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + 3,
body: ItemBody::SynthesizeChar('…'),
});
begin_text = ix + 3;
LoopInstruction::ContinueAndSkip(2)
} else {
LoopInstruction::ContinueAndSkip(0)
}
}
b'-' => {
let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-');
if count == 1 {
LoopInstruction::ContinueAndSkip(0)
} else {
let itembody = if count == 2 {
ItemBody::SynthesizeChar('–')
} else if count == 3 {
ItemBody::SynthesizeChar('—')
} else {
let (ems, ens) = match count % 6 {
0 | 3 => (count / 3, 0),
2 | 4 => (0, count / 2),
1 => (count / 3 - 1, 2),
_ => (count / 3, 1),
};
let mut buf = String::with_capacity(3 * (ems + ens));
for _ in 0..ems {
buf.push('—');
}
for _ in 0..ens {
buf.push('–');
}
ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into()))
};
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + count,
body: itembody,
});
begin_text = ix + count;
LoopInstruction::ContinueAndSkip(count - 1)
}
}
c @ b'\'' | c @ b'"' => {
let string_suffix = &self.text[ix..];
let can_open = delim_run_can_open(self.text, string_suffix, 1, ix);
let can_close = delim_run_can_close(self.text, string_suffix, 1, ix);
self.tree.append_text(begin_text, ix);
self.tree.append(Item {
start: ix,
end: ix + 1,
body: ItemBody::MaybeSmartQuote(c, can_open, can_close),
});
begin_text = ix + 1;
LoopInstruction::ContinueAndSkip(0)
}
_ => LoopInstruction::ContinueAndSkip(0),
}
});
if brk.is_none() {
self.tree.append_text(begin_text, final_ix);
}
(final_ix, brk)
}
fn parse_html_block_type_1_to_5(
&mut self,
start_ix: usize,
html_end_tag: &str,
mut remaining_space: usize,
) -> usize {
let bytes = self.text.as_bytes();
let mut ix = start_ix;
loop {
let line_start_ix = ix;
ix += scan_nextline(&bytes[ix..]);
self.append_html_line(remaining_space, line_start_ix, ix);
let mut line_start = LineStart::new(&bytes[ix..]);
let n_containers = scan_containers(&self.tree, &mut line_start);
if n_containers < self.tree.spine_len() {
break;
}
if (&self.text[line_start_ix..ix]).contains(html_end_tag) {
break;
}
let next_line_ix = ix + line_start.bytes_scanned();
if next_line_ix == self.text.len() {
break;
}
ix = next_line_ix;
remaining_space = line_start.remaining_space();
}
ix
}
fn parse_html_block_type_6_or_7(
&mut self,
start_ix: usize,
mut remaining_space: usize,
) -> usize {
let bytes = self.text.as_bytes();
let mut ix = start_ix;
loop {
let line_start_ix = ix;
ix += scan_nextline(&bytes[ix..]);
self.append_html_line(remaining_space, line_start_ix, ix);
let mut line_start = LineStart::new(&bytes[ix..]);
let n_containers = scan_containers(&self.tree, &mut line_start);
if n_containers < self.tree.spine_len() || line_start.is_at_eol() {
break;
}
let next_line_ix = ix + line_start.bytes_scanned();
if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some()
{
break;
}
ix = next_line_ix;
remaining_space = line_start.remaining_space();
}
ix
}
fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
self.tree.append(Item {
start: start_ix,
end: 0, body: ItemBody::IndentCodeBlock,
});
self.tree.push();
let bytes = self.text.as_bytes();
let mut last_nonblank_child = None;
let mut last_nonblank_ix = 0;
let mut end_ix = 0;
let mut last_line_blank = false;
let mut ix = start_ix;
loop {
let line_start_ix = ix;
ix += scan_nextline(&bytes[ix..]);
self.append_code_text(remaining_space, line_start_ix, ix);
if !last_line_blank {
last_nonblank_child = self.tree.cur();
last_nonblank_ix = ix;
end_ix = ix;
}
let mut line_start = LineStart::new(&bytes[ix..]);
let n_containers = scan_containers(&self.tree, &mut line_start);
if n_containers < self.tree.spine_len()
|| !(line_start.scan_space(4) || line_start.is_at_eol())
{
break;
}
let next_line_ix = ix + line_start.bytes_scanned();
if next_line_ix == self.text.len() {
break;
}
ix = next_line_ix;
remaining_space = line_start.remaining_space();
last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
}
if let Some(child) = last_nonblank_child {
self.tree[child].next = None;
self.tree[child].item.end = last_nonblank_ix;
}
self.pop(end_ix);
ix
}
fn parse_fenced_code_block(
&mut self,
start_ix: usize,
indent: usize,
fence_ch: u8,
n_fence_char: usize,
) -> usize {
let bytes = self.text.as_bytes();
let mut info_start = start_ix + n_fence_char;
info_start += scan_whitespace_no_nl(&bytes[info_start..]);
let mut ix = info_start + scan_nextline(&bytes[info_start..]);
let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
let info_string = unescape(&self.text[info_start..info_end]);
self.tree.append(Item {
start: start_ix,
end: 0, body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
});
self.tree.push();
loop {
let mut line_start = LineStart::new(&bytes[ix..]);
let n_containers = scan_containers(&self.tree, &mut line_start);
if n_containers < self.tree.spine_len() {
break;
}
line_start.scan_space(indent);
let mut close_line_start = line_start.clone();
if !close_line_start.scan_space(4) {
let close_ix = ix + close_line_start.bytes_scanned();
if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
{
ix = close_ix + n;
break;
}
}
let remaining_space = line_start.remaining_space();
ix += line_start.bytes_scanned();
let next_ix = ix + scan_nextline(&bytes[ix..]);
self.append_code_text(remaining_space, ix, next_ix);
ix = next_ix;
}
self.pop(ix);
ix + scan_blank_line(&bytes[ix..]).unwrap_or(0)
}
fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
if remaining_space > 0 {
let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
self.tree.append(Item {
start,
end: start,
body: ItemBody::SynthesizeText(cow_ix),
});
}
if self.text.as_bytes()[end - 2] == b'\r' {
self.tree.append_text(start, end - 2);
self.tree.append_text(end - 1, end);
} else {
self.tree.append_text(start, end);
}
}
fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
if remaining_space > 0 {
let cow_ix = self.allocs.allocate_cow(" "[..remaining_space].into());
self.tree.append(Item {
start,
end: start,
body: ItemBody::SynthesizeText(cow_ix),
});
}
if self.text.as_bytes()[end - 2] == b'\r' {
self.tree.append(Item {
start,
end: end - 2,
body: ItemBody::Html,
});
self.tree.append(Item {
start: end - 1,
end,
body: ItemBody::Html,
});
} else {
self.tree.append(Item {
start,
end,
body: ItemBody::Html,
});
}
}
fn pop(&mut self, ix: usize) {
let cur_ix = self.tree.pop().unwrap();
self.tree[cur_ix].item.end = ix;
if let ItemBody::List(true, _, _) = self.tree[cur_ix].item.body {
surgerize_tight_list(&mut self.tree, cur_ix);
}
}
fn finish_list(&mut self, ix: usize) {
if let Some(node_ix) = self.tree.peek_up() {
if let ItemBody::List(_, _, _) = self.tree[node_ix].item.body {
self.pop(ix);
}
}
if self.last_line_blank {
if let Some(node_ix) = self.tree.peek_grandparent() {
if let ItemBody::List(ref mut is_tight, _, _) = self.tree[node_ix].item.body {
*is_tight = false;
}
}
self.last_line_blank = false;
}
}
fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
if let Some(node_ix) = self.tree.peek_up() {
if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
if existing_ch == ch {
if self.last_line_blank {
*is_tight = false;
self.last_line_blank = false;
}
return;
}
}
self.finish_list(start);
}
self.tree.append(Item {
start,
end: 0, body: ItemBody::List(true, ch, index),
});
self.tree.push();
self.last_line_blank = false;
}
fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
self.tree.append(Item {
start: ix,
end: ix + hrule_size,
body: ItemBody::Rule,
});
ix + hrule_size
}
fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize {
let mut ix = start;
let heading_ix = self.tree.append(Item {
start,
end: 0, body: ItemBody::default(), });
ix += atx_level as usize;
let bytes = self.text.as_bytes();
if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
self.tree[heading_ix].item.end = ix + eol_bytes;
self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None);
return ix + eol_bytes;
}
let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
ix += skip_spaces;
let header_start = ix;
let header_node_idx = self.tree.push();
let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES)
{
let header_end = header_start + scan_nextline(&bytes[header_start..]);
let (content_end, attrs) =
self.extract_and_parse_heading_attribute_block(header_start, header_end);
self.parse_line(ix, Some(content_end), TableParseMode::Disabled);
(header_end, content_end, attrs)
} else {
ix = self.parse_line(ix, None, TableParseMode::Disabled).0;
(ix, ix, None)
};
self.tree[header_node_idx].item.end = end;
if let Some(cur_ix) = self.tree.cur() {
let header_text = &bytes[header_start..content_end];
let mut limit = header_text
.iter()
.rposition(|&b| !(b == b'\n' || b == b'\r' || b == b' '))
.map_or(0, |i| i + 1);
let closer = header_text[..limit]
.iter()
.rposition(|&b| b != b'#')
.map_or(0, |i| i + 1);
if closer == 0 {
limit = closer;
} else {
let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ');
if spaces > 0 {
limit = closer - spaces;
}
}
self.tree[cur_ix].item.end = limit + header_start;
}
self.tree.pop();
self.tree[heading_ix].item.body = ItemBody::Heading(
atx_level,
attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
);
end
}
fn get_footnote(&mut self, start: usize) -> Option<(usize, CowStr<'a>)> {
let bytes = &self.text.as_bytes()[start..];
if !bytes.starts_with(b"[^") {
return None;
}
let (mut i, label) = self.parse_refdef_label(start + 2)?;
i += 2;
if scan_ch(&bytes[i..], b':') == 0 {
return None;
}
i += 1;
Some((i, label))
}
fn parse_footnote(&mut self, start: usize) -> Option<usize> {
let (i, label) = self.get_footnote(start)?;
self.finish_list(start);
self.tree.append(Item {
start,
end: 0, body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
});
self.tree.push();
Some(i)
}
fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
scan_link_label_rest(&self.text[start..], &|bytes| {
let mut line_start = LineStart::new(bytes);
let current_container =
scan_containers(&self.tree, &mut line_start) == self.tree.spine_len();
let bytes_scanned = line_start.bytes_scanned();
let suffix = &bytes[bytes_scanned..];
if scan_paragraph_interrupt(suffix, current_container) {
None
} else {
Some(bytes_scanned)
}
})
}
fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
let bytes = &self.text.as_bytes()[start..];
if scan_ch(bytes, b'[') == 0 {
return None;
}
let (mut i, label) = self.parse_refdef_label(start + 1)?;
i += 1;
if scan_ch(&bytes[i..], b':') == 0 {
return None;
}
i += 1;
let (bytecount, link_def) = self.scan_refdef(start, start + i)?;
Some((bytecount + i, UniCase::new(label), link_def))
}
fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
let mut newlines = 0;
loop {
let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
i += whitespaces;
if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
i += eol_bytes;
newlines += 1;
if newlines > 1 {
return None;
}
} else {
break;
}
let mut line_start = LineStart::new(&bytes[i..]);
if self.tree.spine_len() != scan_containers(&self.tree, &mut line_start) {
return None;
}
i += line_start.bytes_scanned();
}
Some((i, newlines))
}
fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> {
let bytes = self.text.as_bytes();
let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;
let (dest_length, dest) = scan_link_dest(self.text, i, 1)?;
if dest_length == 0 {
return None;
}
let dest = unescape(dest);
i += dest_length;
let mut backup = (
i - start,
LinkDef {
dest,
title: None,
span: span_start..i,
},
);
let (mut i, newlines) =
if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
if i == self.text.len() {
newlines += 1;
}
if new_i == i && newlines == 0 {
return None;
}
if newlines > 1 {
return Some(backup);
};
(new_i, newlines)
} else {
return Some(backup);
};
if let Some((title_length, title)) = scan_refdef_title(&self.text[i..]) {
i += title_length;
backup.1.span = span_start..i;
backup.1.title = Some(unescape(title));
} else if newlines > 0 {
return Some(backup);
} else {
return None;
};
if let Some(bytes) = scan_blank_line(&bytes[i..]) {
backup.0 = i + bytes - start;
Some(backup)
} else if newlines > 0 {
Some(backup)
} else {
None
}
}
fn extract_and_parse_heading_attribute_block(
&mut self,
header_start: usize,
header_end: usize,
) -> (usize, Option<HeadingAttributes<'a>>) {
if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) {
return (header_end, None);
}
let header_bytes = &self.text.as_bytes()[header_start..header_end];
let (content_len, attr_block_range_rel) =
extract_attribute_block_content_from_header_text(header_bytes);
let content_end = header_start + content_len;
let attrs = attr_block_range_rel.and_then(|r| {
parse_inside_attribute_block(
&self.text[(header_start + r.start)..(header_start + r.end)],
)
});
(content_end, attrs)
}
}
#[derive(PartialEq, Eq, Copy, Clone)]
enum TableParseMode {
Scan,
Active,
Disabled,
}
fn count_header_cols(
bytes: &[u8],
mut pipes: usize,
mut start: usize,
last_pipe_ix: usize,
) -> usize {
start += scan_whitespace_no_nl(&bytes[start..]);
if bytes[start] == b'|' {
pipes -= 1;
}
if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() {
pipes
} else {
pipes + 1
}
}
fn scan_paragraph_interrupt(bytes: &[u8], current_container: bool) -> bool {
scan_eol(bytes).is_some()
|| scan_hrule(bytes).is_ok()
|| scan_atx_heading(bytes).is_some()
|| scan_code_fence(bytes).is_some()
|| scan_blockquote_start(bytes).is_some()
|| scan_listitem(bytes).map_or(false, |(ix, delim, index, _)| {
! current_container ||
(delim == b'*' || delim == b'-' || delim == b'+' || index == 1)
&& !scan_empty_list(&bytes[ix..])
})
|| bytes.starts_with(b"<")
&& (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..]))
}
fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre", b"style", b"script", b"textarea"];
static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--", b"?", b"![CDATA["];
for (beg_tag, end_tag) in BEGIN_TAGS
.iter()
.zip(["</pre>", "</style>", "</script>", "</textarea>"].iter())
{
let tag_len = beg_tag.len();
if text_bytes.len() < tag_len {
break;
}
if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
continue;
}
if text_bytes.len() == tag_len {
return Some(end_tag);
}
let s = text_bytes[tag_len];
if is_ascii_whitespace(s) || s == b'>' {
return Some(end_tag);
}
}
for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
if text_bytes.starts_with(beg_tag) {
return Some(end_tag);
}
}
if text_bytes.len() > 1
&& text_bytes[0] == b'!'
&& text_bytes[1] >= b'A'
&& text_bytes[1] <= b'Z'
{
Some(">")
} else {
None
}
}
fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
let mut list_item = tree[list_ix].child;
while let Some(listitem_ix) = list_item {
let list_item_firstborn = tree[listitem_ix].child;
if let Some(firstborn_ix) = list_item_firstborn {
if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
tree[listitem_ix].child = tree[firstborn_ix].child;
}
let mut list_item_child = Some(firstborn_ix);
let mut node_to_repoint = None;
while let Some(child_ix) = list_item_child {
let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
if let Some(child_firstborn) = tree[child_ix].child {
if let Some(repoint_ix) = node_to_repoint {
tree[repoint_ix].next = Some(child_firstborn);
}
let mut child_lastborn = child_firstborn;
while let Some(lastborn_next_ix) = tree[child_lastborn].next {
child_lastborn = lastborn_next_ix;
}
child_lastborn
} else {
child_ix
}
} else {
child_ix
};
node_to_repoint = Some(repoint_ix);
tree[repoint_ix].next = tree[child_ix].next;
list_item_child = tree[child_ix].next;
}
}
list_item = tree[listitem_ix].next;
}
}
fn delim_run_can_open(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
let next_char = if let Some(c) = suffix.chars().nth(run_len) {
c
} else {
return false;
};
if next_char.is_whitespace() {
return false;
}
if ix == 0 {
return true;
}
let delim = suffix.chars().next().unwrap();
if delim == '*' && !is_punctuation(next_char) {
return true;
}
let prev_char = s[..ix].chars().last().unwrap();
prev_char.is_whitespace()
|| is_punctuation(prev_char) && (delim != '\'' || ![']', ')'].contains(&prev_char))
}
fn delim_run_can_close(s: &str, suffix: &str, run_len: usize, ix: usize) -> bool {
if ix == 0 {
return false;
}
let prev_char = s[..ix].chars().last().unwrap();
if prev_char.is_whitespace() {
return false;
}
let next_char = if let Some(c) = suffix.chars().nth(run_len) {
c
} else {
return true;
};
let delim = suffix.chars().next().unwrap();
if delim == '*' && !is_punctuation(prev_char) {
return true;
}
next_char.is_whitespace() || is_punctuation(next_char)
}
fn create_lut(options: &Options) -> LookupTable {
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
{
LookupTable {
simd: simd::compute_lookup(options),
scalar: special_bytes(options),
}
}
#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
{
special_bytes(options)
}
}
fn special_bytes(options: &Options) -> [bool; 256] {
let mut bytes = [false; 256];
let standard_bytes = [
b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
];
for &byte in &standard_bytes {
bytes[byte as usize] = true;
}
if options.contains(Options::ENABLE_TABLES) {
bytes[b'|' as usize] = true;
}
if options.contains(Options::ENABLE_STRIKETHROUGH) {
bytes[b'~' as usize] = true;
}
if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
for &byte in &[b'.', b'-', b'"', b'\''] {
bytes[byte as usize] = true;
}
}
bytes
}
enum LoopInstruction<T> {
ContinueAndSkip(usize),
BreakAtWith(usize, T),
}
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
struct LookupTable {
simd: [u8; 16],
scalar: [bool; 256],
}
#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
type LookupTable = [bool; 256];
fn iterate_special_bytes<F, T>(
lut: &LookupTable,
bytes: &[u8],
ix: usize,
callback: F,
) -> (usize, Option<T>)
where
F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
{
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
{
simd::iterate_special_bytes(lut, bytes, ix, callback)
}
#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
{
scalar_iterate_special_bytes(lut, bytes, ix, callback)
}
}
fn scalar_iterate_special_bytes<F, T>(
lut: &[bool; 256],
bytes: &[u8],
mut ix: usize,
mut callback: F,
) -> (usize, Option<T>)
where
F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
{
while ix < bytes.len() {
let b = bytes[ix];
if lut[b as usize] {
match callback(ix, b) {
LoopInstruction::ContinueAndSkip(skip) => {
ix += skip;
}
LoopInstruction::BreakAtWith(ix, val) => {
return (ix, val);
}
}
}
ix += 1;
}
(ix, None)
}
fn extract_attribute_block_content_from_header_text(
heading: &[u8],
) -> (usize, Option<Range<usize>>) {
let heading_len = heading.len();
let mut ix = heading_len;
ix -= scan_rev_while(heading, |b| {
b == b'\n' || b == b'\r' || b == b' ' || b == b'\t'
});
if ix == 0 {
return (heading_len, None);
}
let attr_block_close = ix - 1;
if heading.get(attr_block_close) != Some(&b'}') {
return (heading_len, None);
}
ix -= 1;
ix -= scan_rev_while(&heading[..ix], |b| {
!matches!(b, b'{' | b'}' | b'<' | b'>' | b'\\' | b'\n' | b'\r')
});
if ix == 0 {
return (heading_len, None);
}
let attr_block_open = ix - 1;
if heading[attr_block_open] != b'{' {
return (heading_len, None);
}
(attr_block_open, Some(ix..attr_block_close))
}
fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes> {
let mut id = None;
let mut classes = Vec::new();
for attr in inside_attr_block.split_ascii_whitespace() {
if attr.len() > 1 {
let first_byte = attr.as_bytes()[0];
if first_byte == b'#' {
id = Some(&attr[1..]);
} else if first_byte == b'.' {
classes.push(&attr[1..]);
}
}
}
Some(HeadingAttributes { id, classes })
}
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
mod simd {
use super::{LookupTable, LoopInstruction};
use crate::Options;
use core::arch::x86_64::*;
const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>();
pub(super) fn compute_lookup(options: &Options) -> [u8; 16] {
let mut lookup = [0u8; 16];
let standard_bytes = [
b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
];
for &byte in &standard_bytes {
add_lookup_byte(&mut lookup, byte);
}
if options.contains(Options::ENABLE_TABLES) {
add_lookup_byte(&mut lookup, b'|');
}
if options.contains(Options::ENABLE_STRIKETHROUGH) {
add_lookup_byte(&mut lookup, b'~');
}
if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
for &byte in &[b'.', b'-', b'"', b'\''] {
add_lookup_byte(&mut lookup, byte);
}
}
lookup
}
fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) {
lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4);
}
#[target_feature(enable = "ssse3")]
#[inline]
unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 {
debug_assert!(bytes.len() >= ix + VECTOR_SIZE);
let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i);
let bitmask_lookup =
_mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1);
let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i;
let input = _mm_loadu_si128(raw_ptr);
let bitset = _mm_shuffle_epi8(bitmap, input);
let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f));
let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles);
let tmp = _mm_and_si128(bitset, bitmask);
let result = _mm_cmpeq_epi8(tmp, bitmask);
_mm_movemask_epi8(result)
}
pub(super) fn iterate_special_bytes<F, T>(
lut: &LookupTable,
bytes: &[u8],
ix: usize,
callback: F,
) -> (usize, Option<T>)
where
F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
{
if is_x86_feature_detected!("ssse3") && bytes.len() >= VECTOR_SIZE {
unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) }
} else {
super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback)
}
}
unsafe fn process_mask<F, T>(
mut mask: i32,
bytes: &[u8],
mut offset: usize,
callback: &mut F,
) -> Result<usize, (usize, Option<T>)>
where
F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
{
while mask != 0 {
let mask_ix = mask.trailing_zeros() as usize;
offset += mask_ix;
match callback(offset, *bytes.get_unchecked(offset)) {
LoopInstruction::ContinueAndSkip(skip) => {
offset += skip + 1;
mask >>= skip + 1 + mask_ix;
}
LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)),
}
}
Ok(offset)
}
#[target_feature(enable = "ssse3")]
unsafe fn simd_iterate_special_bytes<F, T>(
lut: &[u8; 16],
bytes: &[u8],
mut ix: usize,
mut callback: F,
) -> (usize, Option<T>)
where
F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
{
debug_assert!(bytes.len() >= VECTOR_SIZE);
let upperbound = bytes.len() - VECTOR_SIZE;
while ix < upperbound {
let mask = compute_mask(lut, bytes, ix);
let block_start = ix;
ix = match process_mask(mask, bytes, ix, &mut callback) {
Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start),
Err((end_ix, val)) => return (end_ix, val),
};
}
if bytes.len() > ix {
let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound;
if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) {
return (end_ix, val);
}
}
(bytes.len(), None)
}
#[cfg(test)]
mod simd_test {
use super::super::create_lut;
use super::{iterate_special_bytes, LoopInstruction};
use crate::Options;
fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) {
let mut opts = Options::empty();
opts.insert(Options::ENABLE_TABLES);
opts.insert(Options::ENABLE_FOOTNOTES);
opts.insert(Options::ENABLE_STRIKETHROUGH);
opts.insert(Options::ENABLE_TASKLISTS);
let lut = create_lut(&opts);
let mut indices = vec![];
iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| {
indices.push(ix);
LoopInstruction::ContinueAndSkip(skip)
});
assert_eq!(&indices[..], expected);
}
#[test]
fn simple_no_match() {
check_expected_indices("abcdef0123456789".as_bytes(), &[], 0);
}
#[test]
fn simple_match() {
check_expected_indices("*bcd&f0123456789".as_bytes(), &[0, 4], 0);
}
#[test]
fn single_open_fish() {
check_expected_indices("<".as_bytes(), &[0], 0);
}
#[test]
fn long_match() {
check_expected_indices("0123456789abcde~*bcd&f0".as_bytes(), &[15, 16, 20], 0);
}
#[test]
fn border_skip() {
check_expected_indices("0123456789abcde~~~~d&f0".as_bytes(), &[15, 20], 3);
}
#[test]
fn exhaustive_search() {
let chars = [
b'\n', b'\r', b'*', b'_', b'~', b'|', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
];
for &c in &chars {
for i in 0u8..=255 {
if !chars.contains(&i) {
let mut buf = [i; 18];
buf[3] = c;
buf[6] = c;
check_expected_indices(&buf[..], &[3, 6], 0);
}
}
}
}
}
}