Skip to main content

sciforge_parser/markdown/
parser.rs

1use super::error::{MdError, MdErrorKind};
2use super::inline::validate_inline;
3use super::lexer::{LineCursor, MdLine};
4use super::value::MdValue;
5
/// Nesting depth limit used by [`DEFAULT_MD_LIMITS`] when the caller does not
/// supply their own `max_depth`.
pub const DEFAULT_MAX_MD_DEPTH: usize = 64;
7
/// Resource limits enforced by the Markdown parser while it walks a document.
///
/// Each field is an inclusive upper bound: the parser reports an error only
/// once the corresponding measurement becomes strictly greater than the limit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MdLimits {
    /// Deepest block nesting level the parser accepts.
    pub max_depth: usize,
    /// Longest accepted line, measured as the byte length of the line content.
    pub max_line_len: usize,
    /// Largest number of lines a single list may consume.
    pub max_list_len: usize,
    /// Largest number of nodes the parser may count for one document.
    pub max_node_count: usize,
}
15
16pub const DEFAULT_MD_LIMITS: MdLimits = MdLimits {
17    max_depth: DEFAULT_MAX_MD_DEPTH,
18    max_line_len: 64 * 1024,
19    max_list_len: 16 * 1024,
20    max_node_count: 128 * 1024,
21};
22
23pub struct MdParser<'a> {
24    cursor: LineCursor<'a>,
25    limits: MdLimits,
26    nodes_seen: usize,
27}
28
29impl<'a> MdParser<'a> {
30    pub const fn new(bytes: &'a [u8]) -> Self {
31        Self {
32            cursor: LineCursor::new(bytes),
33            limits: DEFAULT_MD_LIMITS,
34            nodes_seen: 0,
35        }
36    }
37
38    pub const fn with_limits(mut self, limits: MdLimits) -> Self {
39        self.limits = limits;
40        self
41    }
42
43    pub const fn with_max_depth(mut self, max_depth: usize) -> Self {
44        self.limits.max_depth = max_depth;
45        self
46    }
47
48    pub fn parse(mut self) -> Result<MdValue<'a>, MdError> {
49        self.parse_blocks(0)?;
50        Ok(MdValue::Document)
51    }
52
53    pub fn validate(mut self) -> Result<(), MdError> {
54        self.parse_blocks(0)?;
55        Ok(())
56    }
57
58    fn tick_node(&mut self) -> Result<(), MdError> {
59        self.nodes_seen = self.nodes_seen.saturating_add(1);
60        if self.nodes_seen > self.limits.max_node_count {
61            return Err(MdError::new(
62                MdErrorKind::MaxNodeCountExceeded,
63                self.cursor.position(),
64            ));
65        }
66        Ok(())
67    }
68
69    fn check_line_len(&self, line: &MdLine<'_>) -> Result<(), MdError> {
70        if line.content.len() > self.limits.max_line_len {
71            return Err(MdError::new(
72                MdErrorKind::MaxLineLengthExceeded,
73                line.offset,
74            ));
75        }
76        Ok(())
77    }
78
79    fn parse_blocks(&mut self, depth: usize) -> Result<(), MdError> {
80        if depth > self.limits.max_depth {
81            return Err(MdError::new(
82                MdErrorKind::MaxDepthExceeded,
83                self.cursor.position(),
84            ));
85        }
86
87        while let Some(line) = self.cursor.peek_line()? {
88            self.check_line_len(&line)?;
89
90            let trimmed = line.content.trim();
91
92            if trimmed.is_empty() {
93                self.cursor.advance_line();
94                continue;
95            }
96
97            if is_thematic_break(trimmed) {
98                self.tick_node()?;
99                self.cursor.advance_line();
100                continue;
101            }
102
103            if is_atx_heading(trimmed) {
104                self.parse_heading(line)?;
105                continue;
106            }
107
108            if is_fenced_code_start(trimmed) {
109                self.parse_fenced_code(line)?;
110                continue;
111            }
112
113            if trimmed.starts_with('>') {
114                self.parse_block_quote(depth)?;
115                continue;
116            }
117
118            if is_list_item(trimmed) {
119                self.parse_list(depth)?;
120                continue;
121            }
122
123            if is_table_row(trimmed) {
124                self.parse_table()?;
125                continue;
126            }
127
128            self.parse_paragraph()?;
129        }
130
131        Ok(())
132    }
133
134    fn parse_heading(&mut self, line: MdLine<'a>) -> Result<(), MdError> {
135        self.tick_node()?;
136        let trimmed = line.content.trim();
137        let bytes = trimmed.as_bytes();
138        let mut level = 0usize;
139        while level < bytes.len() && bytes[level] == b'#' {
140            level += 1;
141        }
142
143        self.cursor.advance_line();
144
145        if level >= bytes.len() {
146            return Ok(());
147        }
148
149        let content = trimmed[level..].trim();
150        let content = content.trim_end_matches(['#', ' ']);
151        if !content.is_empty() {
152            validate_inline(content, line.offset)?;
153        }
154
155        Ok(())
156    }
157
158    fn parse_fenced_code(&mut self, line: MdLine<'_>) -> Result<(), MdError> {
159        self.tick_node()?;
160        let trimmed = line.content.trim();
161        let fence_char = trimmed.as_bytes()[0];
162        let mut fence_len = 0usize;
163        while fence_len < trimmed.len() && trimmed.as_bytes()[fence_len] == fence_char {
164            fence_len += 1;
165        }
166
167        self.cursor.advance_line();
168
169        loop {
170            let Some(inner) = self.cursor.peek_line()? else {
171                return Err(MdError::new(
172                    MdErrorKind::UnterminatedCodeBlock,
173                    line.offset,
174                ));
175            };
176            self.check_line_len(&inner)?;
177            self.cursor.advance_line();
178
179            let inner_trimmed = inner.content.trim();
180            if is_closing_fence(inner_trimmed, fence_char, fence_len) {
181                return Ok(());
182            }
183        }
184    }
185
186    fn parse_block_quote(&mut self, depth: usize) -> Result<(), MdError> {
187        self.tick_node()?;
188        if depth + 1 > self.limits.max_depth {
189            return Err(MdError::new(
190                MdErrorKind::MaxDepthExceeded,
191                self.cursor.position(),
192            ));
193        }
194
195        while let Some(line) = self.cursor.peek_line()? {
196            let trimmed = line.content.trim();
197            if !trimmed.starts_with('>') {
198                break;
199            }
200            self.check_line_len(&line)?;
201            self.cursor.advance_line();
202            self.tick_node()?;
203
204            let inner = if trimmed.len() > 1 {
205                if trimmed.as_bytes()[1] == b' ' {
206                    &trimmed[2..]
207                } else {
208                    &trimmed[1..]
209                }
210            } else {
211                ""
212            };
213
214            let inner = inner.trim();
215            if !inner.is_empty() {
216                validate_inline(inner, line.offset)?;
217            }
218        }
219
220        Ok(())
221    }
222
223    fn parse_list(&mut self, depth: usize) -> Result<(), MdError> {
224        self.tick_node()?;
225        if depth + 1 > self.limits.max_depth {
226            return Err(MdError::new(
227                MdErrorKind::MaxDepthExceeded,
228                self.cursor.position(),
229            ));
230        }
231
232        let mut count = 0usize;
233
234        while let Some(line) = self.cursor.peek_line()? {
235            let trimmed = line.content.trim();
236            if trimmed.is_empty() {
237                break;
238            }
239            if !is_list_item(trimmed) && line.indent < 2 {
240                break;
241            }
242            self.check_line_len(&line)?;
243            self.tick_node()?;
244            self.cursor.advance_line();
245
246            let item_text = strip_list_marker(trimmed);
247            if !item_text.is_empty() {
248                validate_inline(item_text, line.offset)?;
249            }
250
251            count = count.saturating_add(1);
252            if count > self.limits.max_list_len {
253                return Err(MdError::new(
254                    MdErrorKind::MaxListLengthExceeded,
255                    line.offset,
256                ));
257            }
258        }
259
260        Ok(())
261    }
262
263    fn parse_table(&mut self) -> Result<(), MdError> {
264        self.tick_node()?;
265
266        while let Some(line) = self.cursor.peek_line()? {
267            let trimmed = line.content.trim();
268            if !is_table_row(trimmed) {
269                break;
270            }
271            self.check_line_len(&line)?;
272            self.tick_node()?;
273            self.cursor.advance_line();
274        }
275
276        Ok(())
277    }
278
279    fn parse_paragraph(&mut self) -> Result<(), MdError> {
280        self.tick_node()?;
281
282        while let Some(line) = self.cursor.peek_line()? {
283            let trimmed = line.content.trim();
284            if trimmed.is_empty()
285                || is_atx_heading(trimmed)
286                || is_fenced_code_start(trimmed)
287                || is_thematic_break(trimmed)
288                || trimmed.starts_with('>')
289                || is_list_item(trimmed)
290                || is_table_row(trimmed)
291            {
292                break;
293            }
294            self.check_line_len(&line)?;
295            validate_inline(trimmed, line.offset)?;
296            self.cursor.advance_line();
297        }
298
299        Ok(())
300    }
301}
302
/// Returns `true` when `trimmed` opens an ATX heading: one to six `#`
/// characters followed by either the end of the line or a space.
fn is_atx_heading(trimmed: &str) -> bool {
    let rest = trimmed.trim_start_matches('#');
    let hashes = trimmed.len() - rest.len();
    (1..=6).contains(&hashes) && (rest.is_empty() || rest.starts_with(' '))
}
314
/// Returns `true` when `trimmed` is a thematic break: it starts with `-`, `*`
/// or `_`, contains nothing but that character and spaces, and has at least
/// three occurrences of the character.
fn is_thematic_break(trimmed: &str) -> bool {
    let Some(&marker) = trimmed.as_bytes().first() else {
        return false;
    };
    if !matches!(marker, b'-' | b'*' | b'_') {
        return false;
    }

    let only_marker_or_space = trimmed.bytes().all(|b| b == marker || b == b' ');
    only_marker_or_space && trimmed.bytes().filter(|&b| b == marker).count() >= 3
}
334
/// Returns `true` when `trimmed` begins with a run of at least three
/// backticks or at least three tildes. Anything after the run is ignored.
fn is_fenced_code_start(trimmed: &str) -> bool {
    let bytes = trimmed.as_bytes();
    match bytes.first() {
        Some(&fence) if fence == b'`' || fence == b'~' => {
            bytes.iter().take_while(|&&b| b == fence).count() >= 3
        }
        _ => false,
    }
}
354
/// Returns `true` when `trimmed` is a closing fence: non-empty, made up
/// solely of `fence_char`, and at least `min_len` bytes long.
fn is_closing_fence(trimmed: &str, fence_char: u8, min_len: usize) -> bool {
    !trimmed.is_empty()
        && trimmed.len() >= min_len
        && trimmed.bytes().all(|b| b == fence_char)
}
367
/// Returns `true` when `trimmed` starts with a list marker.
///
/// A marker is either a single `-`, `*` or `+`, or one or more ASCII digits
/// followed by `.` or `)`. In both cases the marker must be followed by a
/// space or be the last thing on the line.
fn is_list_item(trimmed: &str) -> bool {
    let bytes = trimmed.as_bytes();
    // True when position `at` is past the end or holds a space.
    let ends_or_space = |at: usize| bytes.get(at).map_or(true, |&b| b == b' ');

    match bytes.first() {
        None => false,
        Some(b'-' | b'*' | b'+') => ends_or_space(1),
        Some(_) => {
            let digits = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
            digits > 0
                && matches!(bytes.get(digits), Some(b'.' | b')'))
                && ends_or_space(digits + 1)
        }
    }
}
387
/// Returns the text of a list line with its leading list marker removed.
///
/// A marker is recognised under the same rules as `is_list_item`: a single
/// `-`, `*` or `+`, or one or more ASCII digits followed by `.` or `)`, and in
/// either case followed by a space or the end of the line. Whitespace after
/// the marker is dropped as well.
///
/// Lines that do not begin with such a marker are returned unchanged. This
/// matters for list continuation lines (e.g. `**bold** text` or `3.14 is pi`),
/// which must not lose their leading characters, and it guarantees that the
/// slice below always starts on a character boundary.
fn strip_list_marker(trimmed: &str) -> &str {
    let bytes = trimmed.as_bytes();

    // Byte length of the candidate marker; every marker byte is ASCII.
    let marker_len = match bytes.first() {
        None => return "",
        Some(b'-' | b'*' | b'+') => 1,
        Some(_) => {
            let digits = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
            if digits == 0 || !matches!(bytes.get(digits), Some(b'.' | b')')) {
                return trimmed;
            }
            digits + 1
        }
    };

    match bytes.get(marker_len) {
        // The marker is the whole line: an empty item.
        None => "",
        // Proper marker: drop it together with the whitespace that follows.
        Some(b' ') => trimmed[marker_len..].trim_start(),
        // Looks like a marker but is glued to the text, so it is ordinary text.
        Some(_) => trimmed,
    }
}
414
/// Returns `true` when `trimmed` starts with a pipe character, which is the
/// only thing this parser requires of a table row.
fn is_table_row(trimmed: &str) -> bool {
    matches!(trimmed.as_bytes().first(), Some(b'|'))
}
418
419pub fn parse_md(bytes: &[u8]) -> Result<MdValue<'_>, MdError> {
420    MdParser::new(bytes).parse()
421}
422
423pub fn parse_md_with_max_depth(bytes: &[u8], max_depth: usize) -> Result<MdValue<'_>, MdError> {
424    MdParser::new(bytes).with_max_depth(max_depth).parse()
425}
426
427pub fn parse_md_with_limits(bytes: &[u8], limits: MdLimits) -> Result<MdValue<'_>, MdError> {
428    MdParser::new(bytes).with_limits(limits).parse()
429}
430
431pub fn validate_md(bytes: &[u8]) -> Result<(), MdError> {
432    MdParser::new(bytes).validate()
433}