1mod html_block;
2mod leaf_blocks;
3mod link_ref_def;
4mod parser;
5
6use html_block::*;
7use leaf_blocks::*;
8use link_ref_def::*;
9
10use crate::ParseOptions;
11use crate::ast::{Block, ListKind, TableAlignment};
12use crate::entities;
13use crate::html::trim_cr;
14use crate::inline::{InlineBuffers, LinkRefMap};
15use crate::render::render_block;
16use std::borrow::Cow;
17
18pub fn parse(markdown: &str, options: &ParseOptions) -> String {
29 let markdown = if options.max_input_size > 0 && markdown.len() > options.max_input_size {
30 let mut end = options.max_input_size;
32 while end > 0 && !markdown.is_char_boundary(end) {
33 end -= 1;
34 }
35 &markdown[..end]
36 } else {
37 markdown
38 };
39 let mut parser = BlockParser::new(markdown, options);
40 let doc = parser.parse();
41 let refs = parser.ref_defs;
42 let mut out = String::with_capacity(markdown.len() + markdown.len() / 2);
43 let mut bufs = InlineBuffers::new();
44 render_block(&doc, &refs, &mut out, options, &mut bufs);
45 out
46}
47
48pub fn parse_to_ast(markdown: &str, options: &ParseOptions) -> Block {
67 let markdown = if options.max_input_size > 0 && markdown.len() > options.max_input_size {
68 let mut end = options.max_input_size;
69 while end > 0 && !markdown.is_char_boundary(end) {
70 end -= 1;
71 }
72 &markdown[..end]
73 } else {
74 markdown
75 };
76 let mut parser = BlockParser::new(markdown, options);
77 parser.parse()
78}
79
/// A cursor over a single line of input that tracks a byte position and a
/// tab-expanded column position side by side.
///
/// A tab advances the column to the next multiple of four. When only part of a
/// tab's width is consumed, the tab byte itself is stepped over and the unused
/// columns are remembered in `partial_spaces` as "virtual" spaces.
#[derive(Clone, Debug)]
struct Line<'a> {
    /// Text of the line being scanned.
    raw: &'a str,
    /// Column of the cursor, with tabs expanded to 4-column stops.
    col_offset: usize,
    /// Byte index of the cursor within `raw`.
    byte_offset: usize,
    /// Columns still pending from a tab whose byte was already consumed.
    partial_spaces: usize,
    /// Column reported by the most recent `peek_nonspace_col` scan.
    cached_ns_col: usize,
    /// Byte offset reported by the most recent `peek_nonspace_col` scan.
    cached_ns_off: usize,
    /// Byte reported by the most recent `peek_nonspace_col` scan (`0` = end of line).
    cached_ns_byte: u8,
}

impl<'a> Line<'a> {
    /// Creates a cursor positioned at the start of `raw`.
    fn new(raw: &'a str) -> Self {
        Self {
            raw,
            col_offset: 0,
            byte_offset: 0,
            partial_spaces: 0,
            cached_ns_col: 0,
            cached_ns_off: 0,
            cached_ns_byte: 0,
        }
    }

    /// Returns the unconsumed tail of the line, or `""` once the cursor has
    /// reached the end. Pending `partial_spaces` are not included.
    fn remainder(&self) -> &'a str {
        if self.byte_offset >= self.raw.len() {
            ""
        } else {
            &self.raw[self.byte_offset..]
        }
    }

    /// Returns `true` when nothing but spaces and tabs remains and no virtual
    /// spaces from a partially consumed tab are pending.
    #[inline(always)]
    fn is_blank(&mut self) -> bool {
        if self.partial_spaces > 0 {
            return false;
        }
        let (_, ns_off, ns_byte) = self.peek_nonspace_col();
        // A literal NUL byte in the input also reports `0`, hence the offset check.
        ns_byte == 0 && ns_off >= self.raw.len()
    }

    /// Consumes up to `max` columns of leading spaces/tabs and returns how many
    /// columns were actually consumed.
    ///
    /// Pending virtual spaces are consumed first. If a tab would overshoot
    /// `max`, its byte is consumed and the unused width is stored in
    /// `partial_spaces`.
    #[inline]
    fn skip_indent(&mut self, max: usize) -> usize {
        let bytes = self.raw.as_bytes();
        let mut cols = 0;
        if self.partial_spaces > 0 {
            let consume = self.partial_spaces.min(max);
            cols += consume;
            self.col_offset += consume;
            self.partial_spaces -= consume;
            if cols >= max {
                return cols;
            }
        }
        let remaining = max - cols;
        // Fast path: a run of plain spaces. `saturating_add` keeps a very large
        // `max` from overflowing the index computation.
        let end = self.byte_offset.saturating_add(remaining).min(bytes.len());
        if end > self.byte_offset {
            let mut fast_end = self.byte_offset;
            while fast_end < end && bytes[fast_end] == b' ' {
                fast_end += 1;
            }
            let fast_count = fast_end - self.byte_offset;
            if fast_count >= remaining {
                self.byte_offset += remaining;
                self.col_offset += remaining;
                return max;
            }
            if fast_count > 0 {
                cols += fast_count;
                self.byte_offset += fast_count;
                self.col_offset += fast_count;
            }
        }
        // Slow path: handles tabs, including one that straddles `max`.
        while self.byte_offset < bytes.len() && cols < max {
            match bytes[self.byte_offset] {
                b' ' => {
                    cols += 1;
                    self.byte_offset += 1;
                    self.col_offset += 1;
                }
                b'\t' => {
                    let tab_width = 4 - (self.col_offset % 4);
                    if cols + tab_width > max {
                        let consume = max - cols;
                        self.partial_spaces = tab_width - consume;
                        self.col_offset += consume;
                        self.byte_offset += 1;
                        cols += consume;
                        break;
                    }
                    cols += tab_width;
                    self.byte_offset += 1;
                    self.col_offset += tab_width;
                }
                _ => break,
            }
        }
        cols
    }

    /// Advances the cursor until at least `n` columns have been consumed or the
    /// line ends.
    ///
    /// A space or any other byte counts as one column; a tab counts as the
    /// distance to the next 4-column stop. Pending `partial_spaces` are not
    /// touched. The cursor is never left inside a multi-byte UTF-8 sequence,
    /// so `remainder()` stays safe to call afterwards.
    fn advance_columns(&mut self, n: usize) {
        let bytes = self.raw.as_bytes();
        let mut cols = 0;
        while self.byte_offset < bytes.len() && cols < n {
            match bytes[self.byte_offset] {
                b'\t' => {
                    let tab_width = 4 - (self.col_offset % 4);
                    cols += tab_width;
                    self.byte_offset += 1;
                    self.col_offset += tab_width;
                }
                _ => {
                    cols += 1;
                    self.byte_offset += 1;
                    self.col_offset += 1;
                }
            }
        }
        // Stepping byte-wise can stop in the middle of a multi-byte character;
        // finish that character so later slicing of `raw` cannot panic.
        while self.byte_offset < bytes.len() && !self.raw.is_char_boundary(self.byte_offset) {
            self.byte_offset += 1;
        }
    }

    /// Looks ahead, without moving the cursor, to the next byte that is neither
    /// a space nor a tab.
    ///
    /// Returns `(column, byte_offset, byte)`; `byte` is `0` when the end of the
    /// line is reached. The result is cached and reused while the cursor has
    /// not moved past the cached offset.
    #[inline(always)]
    fn peek_nonspace_col(&mut self) -> (usize, usize, u8) {
        // The initial all-zero cache is only valid for an empty line, which the
        // second clause accounts for.
        if self.cached_ns_off >= self.byte_offset
            && (self.cached_ns_byte != 0 || self.cached_ns_off >= self.raw.len())
        {
            return (self.cached_ns_col, self.cached_ns_off, self.cached_ns_byte);
        }
        let bytes = self.raw.as_bytes();
        // Virtual spaces of a half-consumed tab sit between the cursor column
        // and the next real byte.
        let mut col = self.col_offset + self.partial_spaces;
        let mut off = self.byte_offset;
        while off < bytes.len() {
            match bytes[off] {
                b' ' => {
                    col += 1;
                    off += 1;
                }
                b'\t' => {
                    col += 4 - (col % 4);
                    off += 1;
                }
                b => {
                    self.cached_ns_col = col;
                    self.cached_ns_off = off;
                    self.cached_ns_byte = b;
                    return (col, off, b);
                }
            }
        }
        self.cached_ns_col = col;
        self.cached_ns_off = off;
        self.cached_ns_byte = 0;
        (col, off, 0)
    }

    /// Moves the cursor to the next non-space byte, discarding any pending
    /// virtual spaces.
    fn advance_to_nonspace(&mut self) {
        // Peek *before* clearing `partial_spaces`: an uncached scan needs the
        // pending columns to compute the correct target column.
        let (col, off, _) = self.peek_nonspace_col();
        self.partial_spaces = 0;
        self.col_offset = col;
        self.byte_offset = off;
    }

    /// Returns the rest of the line with pending virtual spaces materialised
    /// as real spaces in front of it. Borrows when there is nothing to prepend.
    fn remainder_with_partial(&self) -> Cow<'a, str> {
        if self.partial_spaces == 0 {
            return Cow::Borrowed(self.remainder());
        }
        let rem = self.remainder();
        let mut padded = String::with_capacity(self.partial_spaces + rem.len());
        // Generate the padding instead of slicing a fixed literal so any
        // pending width is handled.
        padded.extend(std::iter::repeat(' ').take(self.partial_spaces));
        padded.push_str(rem);
        Cow::Owned(padded)
    }
}
262
/// Details recorded for an open fenced code block.
#[derive(Debug, Clone)]
struct FencedCodeData {
    /// Byte used for the fence.
    fence_char: u8,
    /// Length of the opening fence.
    fence_len: usize,
    /// Indentation of the opening fence; `0` enables the bulk-scan fast path
    /// in `BlockParser::parse`.
    fence_indent: usize,
    /// Info string attached to the fence.
    info: String,
}
270
271#[derive(Clone, Debug)]
272struct TableData {
273 alignments: Vec<TableAlignment>,
274 header: Vec<String>,
275 rows: Vec<Vec<String>>,
276}
277
278#[derive(Clone, Debug)]
279enum OpenBlockType {
280 Document,
281 BlockQuote,
282 ListItem {
283 content_col: usize,
284 started_blank: bool,
285 },
286 FencedCode(Box<FencedCodeData>),
287 IndentedCode,
288 HtmlBlock {
289 end_condition: HtmlBlockEnd,
290 },
291 Paragraph,
292 Table(Box<TableData>),
293}
294
/// Condition that terminates an open HTML block.
///
/// NOTE(review): the variants look like the HTML-block end conditions of the
/// CommonMark spec — confirm against the code that matches on them.
#[derive(Debug, Clone, Copy, PartialEq)]
enum HtmlBlockEnd {
    /// Ends at a closing tag identified by the given static string.
    EndTag(&'static str),
    /// Ends at the end of a comment.
    Comment,
    /// Ends at the end of a processing instruction.
    ProcessingInstruction,
    /// Ends at the end of a declaration.
    Declaration,
    /// Ends at the end of a CDATA section.
    Cdata,
    /// Ends at a blank line.
    BlankLine,
}
304
305#[derive(Clone, Debug)]
306struct OpenBlock {
307 block_type: OpenBlockType,
308 content: String,
309 children: Vec<Block>,
310 had_blank_in_item: bool,
311 list_has_blank_between: bool,
312 content_has_newline: bool,
313 checked: Option<bool>,
314 list_start: u32,
315 list_kind: Option<ListKind>,
316}
317
318impl OpenBlock {
319 #[inline]
320 fn new(block_type: OpenBlockType) -> Self {
321 Self {
322 block_type,
323 content: String::new(),
324 children: Vec::new(),
325 had_blank_in_item: false,
326 list_has_blank_between: false,
327 content_has_newline: false,
328 checked: None,
329 list_start: 0,
330 list_kind: None,
331 }
332 }
333
334 #[inline]
335 fn with_content_capacity(block_type: OpenBlockType, cap: usize) -> Self {
336 Self {
337 content: String::with_capacity(cap),
338 ..Self::new(block_type)
339 }
340 }
341
342 #[inline]
343 fn new_list_item(content_col: usize, started_blank: bool) -> Self {
344 Self {
345 block_type: OpenBlockType::ListItem {
346 content_col,
347 started_blank,
348 },
349 content: String::new(),
350 children: Vec::with_capacity(2),
351 had_blank_in_item: false,
352 list_has_blank_between: false,
353 content_has_newline: false,
354 checked: None,
355 list_start: 0,
356 list_kind: None,
357 }
358 }
359}
360
361pub(crate) struct BlockParser<'a> {
362 input: &'a str,
363 pub(crate) ref_defs: LinkRefMap,
364 open: Vec<OpenBlock>,
365 enable_tables: bool,
366 enable_task_lists: bool,
367 open_blockquotes: usize,
368 list_indent_sum: usize,
369 max_nesting_depth: usize,
370}
371
372impl<'a> BlockParser<'a> {
373 pub fn new(input: &'a str, options: &ParseOptions) -> Self {
374 let mut doc = OpenBlock::new(OpenBlockType::Document);
375 let estimated_blocks = (input.len() / 50).clamp(8, 256);
376 doc.children = Vec::with_capacity(estimated_blocks);
377 let mut open = Vec::with_capacity(16);
378 open.push(doc);
379 Self {
380 input,
381 ref_defs: LinkRefMap::default(),
382 open,
383 enable_tables: options.enable_tables,
384 enable_task_lists: options.enable_task_lists,
385 open_blockquotes: 0,
386 list_indent_sum: 0,
387 max_nesting_depth: options.max_nesting_depth,
388 }
389 }
390
391 pub fn parse(&mut self) -> Block {
392 let input = self.input;
393 let bytes = input.as_bytes();
394 let len = bytes.len();
395 let mut start = 0;
396 while start < len {
397 let end = memchr_newline(bytes, start);
398 let raw_line = &input[start..end];
399 let raw_line = trim_cr(raw_line);
400 let line = Line::new(raw_line);
401 self.process_line(line);
402
403 if self.open.len() == 2
404 && let OpenBlockType::FencedCode(ref fc_data) = self.open[1].block_type
405 && fc_data.fence_indent == 0
406 {
407 let fc = fc_data.fence_char;
408 let fl = fc_data.fence_len;
409 start = end + 1;
410 start = self.bulk_scan_fenced_code(input, bytes, start, len, fc, fl);
411 continue;
412 }
413
414 start = end + 1;
415 }
416 while self.open.len() > 1 {
417 self.close_top_block();
418 }
419 let doc = self.open.pop().unwrap();
420 Block::Document {
421 children: doc.children,
422 }
423 }
424
425 #[inline(never)]
426 fn bulk_scan_fenced_code(
427 &mut self,
428 input: &str,
429 bytes: &[u8],
430 start: usize,
431 len: usize,
432 fence_char: u8,
433 fence_len: usize,
434 ) -> usize {
435 let content_start = start;
436 let mut pos = start;
437 let mut has_cr = false;
438
439 while pos < len {
440 let line_end = memchr_newline(bytes, pos);
441 let check_end = if line_end > pos && bytes[line_end - 1] == b'\r' {
442 has_cr = true;
443 line_end - 1
444 } else {
445 line_end
446 };
447
448 if is_closing_fence(&bytes[pos..check_end], fence_char, fence_len) {
449 if pos > content_start {
450 self.push_bulk_content(input, content_start, pos, has_cr);
451 }
452 self.close_top_block();
453 return line_end + 1;
454 }
455
456 pos = line_end + 1;
457 }
458
459 if len > content_start {
460 self.push_bulk_content(input, content_start, len, has_cr);
461 let content = &mut self.open[1].content;
462 if !content.ends_with('\n') {
463 content.push('\n');
464 }
465 }
466 pos
467 }
468
469 #[inline]
470 fn push_bulk_content(&mut self, input: &str, start: usize, end: usize, has_cr: bool) {
471 let content = &mut self.open[1].content;
472 if !has_cr {
473 content.push_str(unsafe { input.get_unchecked(start..end) });
475 } else {
476 let s = unsafe { input.get_unchecked(start..end) };
478 content.reserve(s.len());
479 for chunk in s.split('\r') {
480 content.push_str(chunk);
481 }
482 }
483 }
484
485 fn mark_blank_on_list_items(&mut self) {
486 let len = self.open.len();
487 for i in (1..len).rev() {
488 match &self.open[i].block_type {
489 OpenBlockType::ListItem { .. } => {
490 self.open[i].had_blank_in_item = true;
491 break;
492 }
493 OpenBlockType::BlockQuote => {
494 break;
495 }
496 _ => {}
497 }
498 }
499 }
500
501 #[inline]
502 fn close_top_block(&mut self) {
503 let block = self.open.pop().unwrap();
504 match &block.block_type {
505 OpenBlockType::BlockQuote => {
506 self.open_blockquotes -= 1;
507 }
508 OpenBlockType::ListItem { content_col, .. } => {
509 self.list_indent_sum -= content_col;
510 }
511 _ => {}
512 }
513 let finalized = self.finalize_block(block);
514 if let Some(block) = finalized {
515 let parent = self.open.last_mut().unwrap();
516 parent.children.push(block);
517 }
518 }
519}