sciforge_parser/markdown/
parser.rs1use super::error::{MdError, MdErrorKind};
2use super::inline::validate_inline;
3use super::lexer::{LineCursor, MdLine};
4use super::value::MdValue;
5
/// Default value for [`MdLimits::max_depth`].
pub const DEFAULT_MAX_MD_DEPTH: usize = 64;

/// Resource limits enforced while a Markdown document is walked.
///
/// Use [`DEFAULT_MD_LIMITS`] in `const` contexts, or `MdLimits::default()`
/// together with struct-update syntax to override individual fields:
/// `MdLimits { max_depth: 8, ..Default::default() }`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct MdLimits {
    /// Maximum block nesting depth.
    pub max_depth: usize,
    /// Maximum length of a single line's content, in bytes.
    pub max_line_len: usize,
    /// Maximum number of lines a single list may consume.
    pub max_list_len: usize,
    /// Maximum number of nodes counted over the whole document.
    pub max_node_count: usize,
}

/// Limits applied when the caller does not supply any.
pub const DEFAULT_MD_LIMITS: MdLimits = MdLimits {
    max_depth: DEFAULT_MAX_MD_DEPTH,
    max_line_len: 64 * 1024,
    max_list_len: 16 * 1024,
    max_node_count: 128 * 1024,
};

impl Default for MdLimits {
    /// Returns [`DEFAULT_MD_LIMITS`].
    fn default() -> Self {
        DEFAULT_MD_LIMITS
    }
}
22
23pub struct MdParser<'a> {
24 cursor: LineCursor<'a>,
25 limits: MdLimits,
26 nodes_seen: usize,
27}
28
29impl<'a> MdParser<'a> {
30 pub const fn new(bytes: &'a [u8]) -> Self {
31 Self {
32 cursor: LineCursor::new(bytes),
33 limits: DEFAULT_MD_LIMITS,
34 nodes_seen: 0,
35 }
36 }
37
38 pub const fn with_limits(mut self, limits: MdLimits) -> Self {
39 self.limits = limits;
40 self
41 }
42
43 pub const fn with_max_depth(mut self, max_depth: usize) -> Self {
44 self.limits.max_depth = max_depth;
45 self
46 }
47
48 pub fn parse(mut self) -> Result<MdValue<'a>, MdError> {
49 self.parse_blocks(0)?;
50 Ok(MdValue::Document)
51 }
52
53 pub fn validate(mut self) -> Result<(), MdError> {
54 self.parse_blocks(0)?;
55 Ok(())
56 }
57
58 fn tick_node(&mut self) -> Result<(), MdError> {
59 self.nodes_seen = self.nodes_seen.saturating_add(1);
60 if self.nodes_seen > self.limits.max_node_count {
61 return Err(MdError::new(
62 MdErrorKind::MaxNodeCountExceeded,
63 self.cursor.position(),
64 ));
65 }
66 Ok(())
67 }
68
69 fn check_line_len(&self, line: &MdLine<'_>) -> Result<(), MdError> {
70 if line.content.len() > self.limits.max_line_len {
71 return Err(MdError::new(
72 MdErrorKind::MaxLineLengthExceeded,
73 line.offset,
74 ));
75 }
76 Ok(())
77 }
78
79 fn parse_blocks(&mut self, depth: usize) -> Result<(), MdError> {
80 if depth > self.limits.max_depth {
81 return Err(MdError::new(
82 MdErrorKind::MaxDepthExceeded,
83 self.cursor.position(),
84 ));
85 }
86
87 while let Some(line) = self.cursor.peek_line()? {
88 self.check_line_len(&line)?;
89
90 let trimmed = line.content.trim();
91
92 if trimmed.is_empty() {
93 self.cursor.advance_line();
94 continue;
95 }
96
97 if is_thematic_break(trimmed) {
98 self.tick_node()?;
99 self.cursor.advance_line();
100 continue;
101 }
102
103 if is_atx_heading(trimmed) {
104 self.parse_heading(line)?;
105 continue;
106 }
107
108 if is_fenced_code_start(trimmed) {
109 self.parse_fenced_code(line)?;
110 continue;
111 }
112
113 if trimmed.starts_with('>') {
114 self.parse_block_quote(depth)?;
115 continue;
116 }
117
118 if is_list_item(trimmed) {
119 self.parse_list(depth)?;
120 continue;
121 }
122
123 if is_table_row(trimmed) {
124 self.parse_table()?;
125 continue;
126 }
127
128 self.parse_paragraph()?;
129 }
130
131 Ok(())
132 }
133
134 fn parse_heading(&mut self, line: MdLine<'a>) -> Result<(), MdError> {
135 self.tick_node()?;
136 let trimmed = line.content.trim();
137 let bytes = trimmed.as_bytes();
138 let mut level = 0usize;
139 while level < bytes.len() && bytes[level] == b'#' {
140 level += 1;
141 }
142
143 self.cursor.advance_line();
144
145 if level >= bytes.len() {
146 return Ok(());
147 }
148
149 let content = trimmed[level..].trim();
150 let content = content.trim_end_matches(['#', ' ']);
151 if !content.is_empty() {
152 validate_inline(content, line.offset)?;
153 }
154
155 Ok(())
156 }
157
158 fn parse_fenced_code(&mut self, line: MdLine<'_>) -> Result<(), MdError> {
159 self.tick_node()?;
160 let trimmed = line.content.trim();
161 let fence_char = trimmed.as_bytes()[0];
162 let mut fence_len = 0usize;
163 while fence_len < trimmed.len() && trimmed.as_bytes()[fence_len] == fence_char {
164 fence_len += 1;
165 }
166
167 self.cursor.advance_line();
168
169 loop {
170 let Some(inner) = self.cursor.peek_line()? else {
171 return Err(MdError::new(
172 MdErrorKind::UnterminatedCodeBlock,
173 line.offset,
174 ));
175 };
176 self.check_line_len(&inner)?;
177 self.cursor.advance_line();
178
179 let inner_trimmed = inner.content.trim();
180 if is_closing_fence(inner_trimmed, fence_char, fence_len) {
181 return Ok(());
182 }
183 }
184 }
185
186 fn parse_block_quote(&mut self, depth: usize) -> Result<(), MdError> {
187 self.tick_node()?;
188 if depth + 1 > self.limits.max_depth {
189 return Err(MdError::new(
190 MdErrorKind::MaxDepthExceeded,
191 self.cursor.position(),
192 ));
193 }
194
195 while let Some(line) = self.cursor.peek_line()? {
196 let trimmed = line.content.trim();
197 if !trimmed.starts_with('>') {
198 break;
199 }
200 self.check_line_len(&line)?;
201 self.cursor.advance_line();
202 self.tick_node()?;
203
204 let inner = if trimmed.len() > 1 {
205 if trimmed.as_bytes()[1] == b' ' {
206 &trimmed[2..]
207 } else {
208 &trimmed[1..]
209 }
210 } else {
211 ""
212 };
213
214 let inner = inner.trim();
215 if !inner.is_empty() {
216 validate_inline(inner, line.offset)?;
217 }
218 }
219
220 Ok(())
221 }
222
223 fn parse_list(&mut self, depth: usize) -> Result<(), MdError> {
224 self.tick_node()?;
225 if depth + 1 > self.limits.max_depth {
226 return Err(MdError::new(
227 MdErrorKind::MaxDepthExceeded,
228 self.cursor.position(),
229 ));
230 }
231
232 let mut count = 0usize;
233
234 while let Some(line) = self.cursor.peek_line()? {
235 let trimmed = line.content.trim();
236 if trimmed.is_empty() {
237 break;
238 }
239 if !is_list_item(trimmed) && line.indent < 2 {
240 break;
241 }
242 self.check_line_len(&line)?;
243 self.tick_node()?;
244 self.cursor.advance_line();
245
246 let item_text = strip_list_marker(trimmed);
247 if !item_text.is_empty() {
248 validate_inline(item_text, line.offset)?;
249 }
250
251 count = count.saturating_add(1);
252 if count > self.limits.max_list_len {
253 return Err(MdError::new(
254 MdErrorKind::MaxListLengthExceeded,
255 line.offset,
256 ));
257 }
258 }
259
260 Ok(())
261 }
262
263 fn parse_table(&mut self) -> Result<(), MdError> {
264 self.tick_node()?;
265
266 while let Some(line) = self.cursor.peek_line()? {
267 let trimmed = line.content.trim();
268 if !is_table_row(trimmed) {
269 break;
270 }
271 self.check_line_len(&line)?;
272 self.tick_node()?;
273 self.cursor.advance_line();
274 }
275
276 Ok(())
277 }
278
279 fn parse_paragraph(&mut self) -> Result<(), MdError> {
280 self.tick_node()?;
281
282 while let Some(line) = self.cursor.peek_line()? {
283 let trimmed = line.content.trim();
284 if trimmed.is_empty()
285 || is_atx_heading(trimmed)
286 || is_fenced_code_start(trimmed)
287 || is_thematic_break(trimmed)
288 || trimmed.starts_with('>')
289 || is_list_item(trimmed)
290 || is_table_row(trimmed)
291 {
292 break;
293 }
294 self.check_line_len(&line)?;
295 validate_inline(trimmed, line.offset)?;
296 self.cursor.advance_line();
297 }
298
299 Ok(())
300 }
301}
302
/// Returns `true` when `trimmed` opens with one to six `#` characters that are
/// followed by a space or by the end of the string.
fn is_atx_heading(trimmed: &str) -> bool {
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes == 0 || hashes > 6 {
        return false;
    }
    // `None` means the line consists of the `#` run only.
    matches!(trimmed.as_bytes().get(hashes), None | Some(b' '))
}
314
/// Returns `true` when `trimmed` is made only of spaces and at least three
/// copies of a single marker (`-`, `*` or `_`), starting with that marker.
fn is_thematic_break(trimmed: &str) -> bool {
    let bytes = trimmed.as_bytes();
    if bytes.len() < 3 {
        return false;
    }
    let marker = bytes[0];
    if !matches!(marker, b'-' | b'*' | b'_') {
        return false;
    }
    // Count markers; any byte other than the marker or a space aborts the fold.
    let markers = bytes.iter().try_fold(0usize, |seen, &b| {
        if b == marker {
            Some(seen + 1)
        } else if b == b' ' {
            Some(seen)
        } else {
            None
        }
    });
    matches!(markers, Some(n) if n >= 3)
}
334
/// Returns `true` when `trimmed` opens a fenced code block: a run of at least
/// three backticks or three tildes at the start of the line.
///
/// For backtick fences the text after the run (the info string) must not
/// contain a backtick. Without this rule a line such as
/// `` ```code``` inline `` would be taken for a fence opener and would swallow
/// the lines that follow it.
fn is_fenced_code_start(trimmed: &str) -> bool {
    let bytes = trimmed.as_bytes();
    let Some(&fence) = bytes.first() else {
        return false;
    };
    if fence != b'`' && fence != b'~' {
        return false;
    }
    let run = bytes.iter().take_while(|&&b| b == fence).count();
    if run < 3 {
        return false;
    }
    // Tilde fences place no restriction on the info string.
    fence == b'~' || !bytes[run..].contains(&b'`')
}
354
/// Returns `true` when `trimmed` is a non-empty run consisting solely of
/// `fence_char` and at least `min_len` bytes long.
fn is_closing_fence(trimmed: &str, fence_char: u8, min_len: usize) -> bool {
    let bytes = trimmed.as_bytes();
    !bytes.is_empty() && bytes.len() >= min_len && bytes.iter().all(|&b| b == fence_char)
}
367
/// Returns `true` when `trimmed` starts with a list marker: a bullet
/// (`-`, `*`, `+`) or one or more ASCII digits followed by `.` or `)`.
/// In both cases the marker must be followed by a space or end the string.
fn is_list_item(trimmed: &str) -> bool {
    let bytes = trimmed.as_bytes();
    // A marker is complete only at end of input or right before a space.
    let ends_marker = |rest: &[u8]| matches!(rest.first(), None | Some(b' '));

    match bytes {
        [] => false,
        [b'-' | b'*' | b'+', rest @ ..] if ends_marker(rest) => true,
        _ => {
            let digits = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
            digits > 0
                && matches!(bytes.get(digits), Some(b'.' | b')'))
                && ends_marker(&bytes[digits + 1..])
        }
    }
}
387
/// Removes a leading list marker from `trimmed` and returns the item text with
/// its leading whitespace trimmed.
///
/// A marker is a bullet (`-`, `*`, `+`) or one or more ASCII digits followed
/// by `.` or `)`, and it must be followed by a space or end the string.
/// Input that does not start with such a marker is returned unchanged, so text
/// like `*emphasis*`, `3.14 is pi` or `-é` is never truncated and never sliced
/// inside a multi-byte character. An empty input or a bare marker yields `""`.
fn strip_list_marker(trimmed: &str) -> &str {
    let bytes = trimmed.as_bytes();

    // Length in bytes of the candidate marker; every marker byte is ASCII.
    let marker_len = match bytes.first() {
        None => return "",
        Some(b'-' | b'*' | b'+') => 1,
        Some(_) => {
            let digits = bytes.iter().take_while(|b| b.is_ascii_digit()).count();
            if digits == 0 || !matches!(bytes.get(digits), Some(b'.' | b')')) {
                return trimmed;
            }
            digits + 1
        }
    };

    match bytes.get(marker_len) {
        // Bare marker with no text after it.
        None => "",
        // `marker_len` follows only ASCII bytes, so the slice is on a char boundary.
        Some(b' ') => trimmed[marker_len..].trim_start(),
        // Not followed by a space: this is ordinary text, not a marker.
        Some(_) => trimmed,
    }
}
414
/// Returns `true` when the first byte of `trimmed` is a pipe (`|`).
fn is_table_row(trimmed: &str) -> bool {
    trimmed.as_bytes().first() == Some(&b'|')
}
418
419pub fn parse_md(bytes: &[u8]) -> Result<MdValue<'_>, MdError> {
420 MdParser::new(bytes).parse()
421}
422
423pub fn parse_md_with_max_depth(bytes: &[u8], max_depth: usize) -> Result<MdValue<'_>, MdError> {
424 MdParser::new(bytes).with_max_depth(max_depth).parse()
425}
426
427pub fn parse_md_with_limits(bytes: &[u8], limits: MdLimits) -> Result<MdValue<'_>, MdError> {
428 MdParser::new(bytes).with_limits(limits).parse()
429}
430
431pub fn validate_md(bytes: &[u8]) -> Result<(), MdError> {
432 MdParser::new(bytes).validate()
433}