Skip to main content

ironmark/block/
mod.rs

1mod html_block;
2mod leaf_blocks;
3mod link_ref_def;
4mod parser;
5
6use html_block::*;
7use leaf_blocks::*;
8use link_ref_def::*;
9
10use crate::ParseOptions;
11use crate::ast::{Block, ListKind, TableAlignment};
12use crate::entities;
13use crate::html::trim_cr;
14use crate::inline::{InlineBuffers, LinkRefMap};
15use crate::render::render_block;
16use std::borrow::Cow;
17
18/// Parse a Markdown string and return the rendered HTML.
19///
20/// # Examples
21///
22/// ```
23/// use ironmark::{parse, ParseOptions};
24///
25/// let html = parse("**bold** and *italic*", &ParseOptions::default());
26/// assert!(html.contains("<strong>bold</strong>"));
27/// ```
28pub fn parse(markdown: &str, options: &ParseOptions) -> String {
29    let markdown = if options.max_input_size > 0 && markdown.len() > options.max_input_size {
30        // Truncate at a valid UTF-8 boundary
31        let mut end = options.max_input_size;
32        while end > 0 && !markdown.is_char_boundary(end) {
33            end -= 1;
34        }
35        &markdown[..end]
36    } else {
37        markdown
38    };
39    let mut parser = BlockParser::new(markdown, options);
40    let doc = parser.parse();
41    let refs = parser.ref_defs;
42    let mut out = String::with_capacity(markdown.len() + markdown.len() / 2);
43    let mut bufs = InlineBuffers::new();
44    render_block(&doc, &refs, &mut out, options, &mut bufs);
45    out
46}
47
48/// Parse a Markdown string and return the block-level AST.
49///
50/// This returns the raw AST without rendering to HTML, useful for
51/// programmatic inspection or transformation of the document structure.
52///
53/// # Examples
54///
55/// ```
56/// use ironmark::{parse_to_ast, ParseOptions, Block};
57///
58/// let ast = parse_to_ast("# Hello", &ParseOptions::default());
59/// match &ast {
60///     Block::Document { children } => {
61///         assert_eq!(children.len(), 1);
62///     }
63///     _ => panic!("expected Document"),
64/// }
65/// ```
66pub fn parse_to_ast(markdown: &str, options: &ParseOptions) -> Block {
67    let markdown = if options.max_input_size > 0 && markdown.len() > options.max_input_size {
68        let mut end = options.max_input_size;
69        while end > 0 && !markdown.is_char_boundary(end) {
70            end -= 1;
71        }
72        &markdown[..end]
73    } else {
74        markdown
75    };
76    let mut parser = BlockParser::new(markdown, options);
77    parser.parse()
78}
79
/// Cursor over one line of input.
///
/// The position is tracked twice: as a byte index into the line and as a
/// column in which a tab advances to the next multiple of four.
#[derive(Clone, Debug)]
struct Line<'a> {
    /// Text of the whole line.
    raw: &'a str,
    /// Column of the cursor, with tabs expanded to 4-column tab stops.
    col_offset: usize,
    /// Byte index of the cursor within `raw`.
    byte_offset: usize,
    /// Columns still owed by a tab whose byte has already been stepped over
    /// but whose width was only partly consumed by `skip_indent`.
    partial_spaces: usize,
    /// Column recorded by the most recent `peek_nonspace_col` scan.
    cached_ns_col: usize,
    /// Byte index recorded by the most recent `peek_nonspace_col` scan.
    cached_ns_off: usize,
    /// Byte recorded by the most recent `peek_nonspace_col` scan; `0` when the
    /// scan ran off the end of the line.
    cached_ns_byte: u8,
}

impl<'a> Line<'a> {
    /// Creates a cursor positioned at the start of `raw`.
    fn new(raw: &'a str) -> Self {
        Self {
            raw,
            col_offset: 0,
            byte_offset: 0,
            partial_spaces: 0,
            cached_ns_col: 0,
            cached_ns_off: 0,
            cached_ns_byte: 0,
        }
    }

    /// Returns the text from the cursor to the end of the line, or `""` when
    /// the cursor is at or past the end. Pending `partial_spaces` are not
    /// included; see `remainder_with_partial`.
    fn remainder(&self) -> &'a str {
        if self.byte_offset >= self.raw.len() {
            ""
        } else {
            &self.raw[self.byte_offset..]
        }
    }

    /// Returns `true` when only spaces and tabs remain after the cursor.
    ///
    /// A pending partial tab counts as content, so the line is then not blank.
    #[inline(always)]
    fn is_blank(&mut self) -> bool {
        if self.partial_spaces > 0 {
            return false;
        }
        let (_, ns_off, ns_byte) = self.peek_nonspace_col();
        // `ns_byte == 0` alone is ambiguous (a literal NUL byte looks the
        // same), so the offset must also have reached the end of the line.
        ns_byte == 0 && ns_off >= self.raw.len()
    }

    /// Consumes up to `max` columns of leading spaces and tabs and returns the
    /// number of columns consumed.
    ///
    /// Pending `partial_spaces` are consumed first. A tab that would overshoot
    /// `max` is split: its byte is stepped over and the columns that were not
    /// consumed are stored in `partial_spaces`.
    #[inline]
    fn skip_indent(&mut self, max: usize) -> usize {
        let bytes = self.raw.as_bytes();
        let mut cols = 0;
        if self.partial_spaces > 0 {
            let consume = self.partial_spaces.min(max);
            cols += consume;
            self.col_offset += consume;
            self.partial_spaces -= consume;
            if cols >= max {
                return cols;
            }
        }
        // Fast path: a run of plain spaces, where bytes and columns coincide.
        let remaining = max - cols;
        let end = (self.byte_offset + remaining).min(bytes.len());
        if end > self.byte_offset {
            let mut fast_end = self.byte_offset;
            while fast_end < end && bytes[fast_end] == b' ' {
                fast_end += 1;
            }
            let fast_count = fast_end - self.byte_offset;
            if fast_count >= remaining {
                self.byte_offset += remaining;
                self.col_offset += remaining;
                return max;
            }
            if fast_count > 0 {
                cols += fast_count;
                self.byte_offset += fast_count;
                self.col_offset += fast_count;
            }
        }
        // Slow path: handles tabs, including one that straddles `max`.
        while self.byte_offset < bytes.len() && cols < max {
            match bytes[self.byte_offset] {
                b' ' => {
                    cols += 1;
                    self.byte_offset += 1;
                    self.col_offset += 1;
                }
                b'\t' => {
                    let tab_width = 4 - (self.col_offset % 4);
                    if cols + tab_width > max {
                        let consume = max - cols;
                        self.partial_spaces = tab_width - consume;
                        self.col_offset += consume;
                        self.byte_offset += 1;
                        cols += consume;
                        break;
                    }
                    cols += tab_width;
                    self.byte_offset += 1;
                    self.col_offset += tab_width;
                }
                _ => break,
            }
        }
        cols
    }

    /// Steps over bytes until at least `n` columns have been consumed or the
    /// line ends.
    ///
    /// Every non-tab byte counts as one column. A tab counts as the distance
    /// to the next tab stop and is never split, so the cursor may move more
    /// than `n` columns. `partial_spaces` is neither read nor modified.
    fn advance_columns(&mut self, n: usize) {
        let bytes = self.raw.as_bytes();
        let mut cols = 0;
        while self.byte_offset < bytes.len() && cols < n {
            match bytes[self.byte_offset] {
                b' ' => {
                    cols += 1;
                    self.byte_offset += 1;
                    self.col_offset += 1;
                }
                b'\t' => {
                    let tab_width = 4 - (self.col_offset % 4);
                    cols += tab_width;
                    self.byte_offset += 1;
                    self.col_offset += tab_width;
                }
                _ => {
                    cols += 1;
                    self.byte_offset += 1;
                    self.col_offset += 1;
                }
            }
        }
    }

    /// Finds the first byte at or after the cursor that is neither a space nor
    /// a tab, without moving the cursor.
    ///
    /// Returns `(column, byte_index, byte)`. When no such byte exists the
    /// result is `(column_at_end, raw.len(), 0)`. The result is cached and
    /// reused while the cached byte index is still at or ahead of the cursor.
    #[inline(always)]
    fn peek_nonspace_col(&mut self) -> (usize, usize, u8) {
        if self.cached_ns_off >= self.byte_offset
            && (self.cached_ns_byte != 0 || self.cached_ns_off >= self.raw.len())
        {
            return (self.cached_ns_col, self.cached_ns_off, self.cached_ns_byte);
        }
        let bytes = self.raw.as_bytes();
        let mut col = self.col_offset;
        let mut off = self.byte_offset;
        // Columns of a half-consumed tab lie between `col_offset` and the
        // byte at `byte_offset`, so they belong to the scanned distance.
        if self.partial_spaces > 0 {
            col += self.partial_spaces;
        }
        while off < bytes.len() {
            match bytes[off] {
                b' ' => {
                    col += 1;
                    off += 1;
                }
                b'\t' => {
                    col += 4 - (col % 4);
                    off += 1;
                }
                b => {
                    self.cached_ns_col = col;
                    self.cached_ns_off = off;
                    self.cached_ns_byte = b;
                    return (col, off, b);
                }
            }
        }
        self.cached_ns_col = col;
        self.cached_ns_off = off;
        self.cached_ns_byte = 0;
        (col, off, 0)
    }

    /// Moves the cursor to the position reported by `peek_nonspace_col` and
    /// discards any pending partial tab.
    fn advance_to_nonspace(&mut self) {
        // Peek BEFORE clearing `partial_spaces`: on a cache miss the scan adds
        // the pending columns to the starting column. Clearing first would
        // drop them and leave `col_offset` short, while a cache hit would
        // still report the correct column.
        let (col, off, _) = self.peek_nonspace_col();
        self.partial_spaces = 0;
        self.col_offset = col;
        self.byte_offset = off;
    }

    /// Returns the rest of the line, prefixed with one space per pending
    /// `partial_spaces` column. Borrows when there is nothing to prepend.
    fn remainder_with_partial(&self) -> Cow<'a, str> {
        if self.partial_spaces > 0 {
            static SPACES: &str = "    ";
            let rem = self.remainder();
            let mut s = String::with_capacity(self.partial_spaces + rem.len());
            s.push_str(&SPACES[..self.partial_spaces]);
            s.push_str(rem);
            Cow::Owned(s)
        } else {
            Cow::Borrowed(self.remainder())
        }
    }
}
262
/// Details of an open fenced code block, stored boxed inside
/// `OpenBlockType::FencedCode`.
#[derive(Debug, Clone)]
struct FencedCodeData {
    /// Fence character as a byte; handed to `is_closing_fence` when the
    /// parser scans for the closing fence.
    fence_char: u8,
    /// Length of the opening fence; handed to `is_closing_fence` together
    /// with `fence_char`.
    fence_len: usize,
    /// Indentation of the opening fence. The parser only takes its bulk
    /// scanning shortcut when this is `0`.
    fence_indent: usize,
    /// NOTE(review): presumably the info string that follows the opening
    /// fence; confirm against the code that fills it in.
    info: String,
}
270
271#[derive(Clone, Debug)]
272struct TableData {
273    alignments: Vec<TableAlignment>,
274    header: Vec<String>,
275    rows: Vec<Vec<String>>,
276}
277
278#[derive(Clone, Debug)]
279enum OpenBlockType {
280    Document,
281    BlockQuote,
282    ListItem {
283        content_col: usize,
284        started_blank: bool,
285    },
286    FencedCode(Box<FencedCodeData>),
287    IndentedCode,
288    HtmlBlock {
289        end_condition: HtmlBlockEnd,
290    },
291    Paragraph,
292    Table(Box<TableData>),
293}
294
/// Condition that ends an open HTML block.
///
/// NOTE(review): the variants appear to mirror the HTML block end conditions
/// of the CommonMark spec; the matching logic lives outside this view, so
/// confirm the exact rules there.
#[derive(Debug, Clone, Copy, PartialEq)]
enum HtmlBlockEnd {
    /// Ends at a specific closing tag, given as a static string.
    EndTag(&'static str),
    /// Ends where an HTML comment ends.
    Comment,
    /// Ends where a processing instruction ends.
    ProcessingInstruction,
    /// Ends where a declaration ends.
    Declaration,
    /// Ends where a CDATA section ends.
    Cdata,
    /// Ends at a blank line.
    BlankLine,
}
304
305#[derive(Clone, Debug)]
306struct OpenBlock {
307    block_type: OpenBlockType,
308    content: String,
309    children: Vec<Block>,
310    had_blank_in_item: bool,
311    list_has_blank_between: bool,
312    content_has_newline: bool,
313    checked: Option<bool>,
314    list_start: u32,
315    list_kind: Option<ListKind>,
316}
317
318impl OpenBlock {
319    #[inline]
320    fn new(block_type: OpenBlockType) -> Self {
321        Self {
322            block_type,
323            content: String::new(),
324            children: Vec::new(),
325            had_blank_in_item: false,
326            list_has_blank_between: false,
327            content_has_newline: false,
328            checked: None,
329            list_start: 0,
330            list_kind: None,
331        }
332    }
333
334    #[inline]
335    fn with_content_capacity(block_type: OpenBlockType, cap: usize) -> Self {
336        Self {
337            content: String::with_capacity(cap),
338            ..Self::new(block_type)
339        }
340    }
341
342    #[inline]
343    fn new_list_item(content_col: usize, started_blank: bool) -> Self {
344        Self {
345            block_type: OpenBlockType::ListItem {
346                content_col,
347                started_blank,
348            },
349            content: String::new(),
350            children: Vec::with_capacity(2),
351            had_blank_in_item: false,
352            list_has_blank_between: false,
353            content_has_newline: false,
354            checked: None,
355            list_start: 0,
356            list_kind: None,
357        }
358    }
359}
360
361pub(crate) struct BlockParser<'a> {
362    input: &'a str,
363    pub(crate) ref_defs: LinkRefMap,
364    open: Vec<OpenBlock>,
365    enable_tables: bool,
366    enable_task_lists: bool,
367    open_blockquotes: usize,
368    list_indent_sum: usize,
369    max_nesting_depth: usize,
370}
371
372impl<'a> BlockParser<'a> {
373    pub fn new(input: &'a str, options: &ParseOptions) -> Self {
374        let mut doc = OpenBlock::new(OpenBlockType::Document);
375        let estimated_blocks = (input.len() / 50).clamp(8, 256);
376        doc.children = Vec::with_capacity(estimated_blocks);
377        let mut open = Vec::with_capacity(16);
378        open.push(doc);
379        Self {
380            input,
381            ref_defs: LinkRefMap::default(),
382            open,
383            enable_tables: options.enable_tables,
384            enable_task_lists: options.enable_task_lists,
385            open_blockquotes: 0,
386            list_indent_sum: 0,
387            max_nesting_depth: options.max_nesting_depth,
388        }
389    }
390
391    pub fn parse(&mut self) -> Block {
392        let input = self.input;
393        let bytes = input.as_bytes();
394        let len = bytes.len();
395        let mut start = 0;
396        while start < len {
397            let end = memchr_newline(bytes, start);
398            let raw_line = &input[start..end];
399            let raw_line = trim_cr(raw_line);
400            let line = Line::new(raw_line);
401            self.process_line(line);
402
403            if self.open.len() == 2
404                && let OpenBlockType::FencedCode(ref fc_data) = self.open[1].block_type
405                && fc_data.fence_indent == 0
406            {
407                let fc = fc_data.fence_char;
408                let fl = fc_data.fence_len;
409                start = end + 1;
410                start = self.bulk_scan_fenced_code(input, bytes, start, len, fc, fl);
411                continue;
412            }
413
414            start = end + 1;
415        }
416        while self.open.len() > 1 {
417            self.close_top_block();
418        }
419        let doc = self.open.pop().unwrap();
420        Block::Document {
421            children: doc.children,
422        }
423    }
424
425    #[inline(never)]
426    fn bulk_scan_fenced_code(
427        &mut self,
428        input: &str,
429        bytes: &[u8],
430        start: usize,
431        len: usize,
432        fence_char: u8,
433        fence_len: usize,
434    ) -> usize {
435        let content_start = start;
436        let mut pos = start;
437        let mut has_cr = false;
438
439        while pos < len {
440            let line_end = memchr_newline(bytes, pos);
441            let check_end = if line_end > pos && bytes[line_end - 1] == b'\r' {
442                has_cr = true;
443                line_end - 1
444            } else {
445                line_end
446            };
447
448            if is_closing_fence(&bytes[pos..check_end], fence_char, fence_len) {
449                if pos > content_start {
450                    self.push_bulk_content(input, content_start, pos, has_cr);
451                }
452                self.close_top_block();
453                return line_end + 1;
454            }
455
456            pos = line_end + 1;
457        }
458
459        if len > content_start {
460            self.push_bulk_content(input, content_start, len, has_cr);
461            let content = &mut self.open[1].content;
462            if !content.ends_with('\n') {
463                content.push('\n');
464            }
465        }
466        pos
467    }
468
469    #[inline]
470    fn push_bulk_content(&mut self, input: &str, start: usize, end: usize, has_cr: bool) {
471        let content = &mut self.open[1].content;
472        if !has_cr {
473            // SAFETY: `start..end` comes from newline scanning over `input` and is in-bounds.
474            content.push_str(unsafe { input.get_unchecked(start..end) });
475        } else {
476            // SAFETY: same bounds guarantee as above.
477            let s = unsafe { input.get_unchecked(start..end) };
478            content.reserve(s.len());
479            for chunk in s.split('\r') {
480                content.push_str(chunk);
481            }
482        }
483    }
484
485    fn mark_blank_on_list_items(&mut self) {
486        let len = self.open.len();
487        for i in (1..len).rev() {
488            match &self.open[i].block_type {
489                OpenBlockType::ListItem { .. } => {
490                    self.open[i].had_blank_in_item = true;
491                    break;
492                }
493                OpenBlockType::BlockQuote => {
494                    break;
495                }
496                _ => {}
497            }
498        }
499    }
500
501    #[inline]
502    fn close_top_block(&mut self) {
503        let block = self.open.pop().unwrap();
504        match &block.block_type {
505            OpenBlockType::BlockQuote => {
506                self.open_blockquotes -= 1;
507            }
508            OpenBlockType::ListItem { content_col, .. } => {
509                self.list_indent_sum -= content_col;
510            }
511            _ => {}
512        }
513        let finalized = self.finalize_block(block);
514        if let Some(block) = finalized {
515            let parent = self.open.last_mut().unwrap();
516            parent.children.push(block);
517        }
518    }
519}