Skip to main content

sciforge_parser/yaml/
parser.rs

1use super::error::{YamlError, YamlErrorKind};
2use super::lexer::{LineCursor, YamlLine};
3use super::scalar::parse_scalar;
4use super::value::YamlValue;
5
/// Depth limit used by [`DEFAULT_YAML_LIMITS`] when the caller does not
/// configure one.
pub const DEFAULT_MAX_YAML_DEPTH: usize = 64;
7
/// Resource limits applied by [`YamlParser`] while it walks a document.
///
/// Each field is an inclusive upper bound: parsing fails only once the
/// measured quantity becomes strictly greater than the configured value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct YamlLimits {
    /// Largest nesting depth a node may be parsed at.
    pub max_depth: usize,
    /// Largest byte length of a scalar line, sequence item text, mapping key
    /// or inline mapping value.
    pub max_scalar_len: usize,
    /// Largest number of items collected into one sequence.
    pub max_sequence_len: usize,
    /// Largest number of entries collected into one mapping.
    pub max_mapping_len: usize,
    /// Largest number of nodes parsed over the whole document.
    pub max_node_count: usize,
}
16
17pub const DEFAULT_YAML_LIMITS: YamlLimits = YamlLimits {
18    max_depth: DEFAULT_MAX_YAML_DEPTH,
19    max_scalar_len: 64 * 1024,
20    max_sequence_len: 16 * 1024,
21    max_mapping_len: 16 * 1024,
22    max_node_count: 128 * 1024,
23};
24
25pub struct YamlParser<'a> {
26    cursor: LineCursor<'a>,
27    limits: YamlLimits,
28    nodes_seen: usize,
29}
30
31impl<'a> YamlParser<'a> {
32    pub const fn new(bytes: &'a [u8]) -> Self {
33        Self {
34            cursor: LineCursor::new(bytes),
35            limits: DEFAULT_YAML_LIMITS,
36            nodes_seen: 0,
37        }
38    }
39
40    pub const fn with_max_depth(mut self, max_depth: usize) -> Self {
41        self.limits.max_depth = max_depth;
42        self
43    }
44
45    pub const fn with_limits(mut self, limits: YamlLimits) -> Self {
46        self.limits = limits;
47        self
48    }
49
50    pub fn parse(mut self) -> Result<YamlValue<'a>, YamlError> {
51        let first = self
52            .cursor
53            .peek()?
54            .ok_or(YamlError::new(YamlErrorKind::Eof, self.cursor.position()))?;
55        self.parse_node(first.indent, 0)
56    }
57
58    pub fn validate(mut self) -> Result<(), YamlError> {
59        let first = self
60            .cursor
61            .peek()?
62            .ok_or(YamlError::new(YamlErrorKind::Eof, self.cursor.position()))?;
63        self.parse_node(first.indent, 0)?;
64        if self.cursor.peek()?.is_some() {
65            let line = self.cursor.peek()?.expect("peek checked is_some");
66            return Err(YamlError::new(YamlErrorKind::UnexpectedToken, line.offset));
67        }
68        Ok(())
69    }
70
71    fn parse_node(&mut self, base_indent: usize, depth: usize) -> Result<YamlValue<'a>, YamlError> {
72        if depth > self.limits.max_depth {
73            return Err(YamlError::new(
74                YamlErrorKind::MaxDepthExceeded,
75                self.cursor.position(),
76            ));
77        }
78
79        self.nodes_seen = self.nodes_seen.saturating_add(1);
80        if self.nodes_seen > self.limits.max_node_count {
81            return Err(YamlError::new(
82                YamlErrorKind::MaxNodeCountExceeded,
83                self.cursor.position(),
84            ));
85        }
86
87        let line = self
88            .cursor
89            .peek()?
90            .ok_or(YamlError::new(YamlErrorKind::Eof, self.cursor.position()))?;
91
92        if line.indent < base_indent {
93            return Err(YamlError::new(
94                YamlErrorKind::InvalidIndentation,
95                line.offset,
96            ));
97        }
98        if line.indent > base_indent {
99            return Err(YamlError::new(
100                YamlErrorKind::InvalidIndentation,
101                line.offset,
102            ));
103        }
104
105        if is_sequence_entry(line.content) {
106            self.parse_sequence(base_indent, depth + 1)
107        } else if has_mapping_separator(line.content) {
108            self.parse_mapping(base_indent, depth + 1)
109        } else {
110            self.parse_scalar_line(line)
111        }
112    }
113
114    fn parse_scalar_line(&mut self, line: YamlLine<'a>) -> Result<YamlValue<'a>, YamlError> {
115        if line.content.len() > self.limits.max_scalar_len {
116            return Err(YamlError::new(
117                YamlErrorKind::MaxScalarLengthExceeded,
118                line.offset,
119            ));
120        }
121        self.cursor.next()?;
122        parse_scalar(line.content, line.offset)
123    }
124
125    fn parse_sequence(
126        &mut self,
127        base_indent: usize,
128        depth: usize,
129    ) -> Result<YamlValue<'a>, YamlError> {
130        let mut items = Vec::new();
131
132        loop {
133            let Some(line) = self.cursor.peek()? else {
134                break;
135            };
136            if line.indent < base_indent {
137                break;
138            }
139            if line.indent > base_indent {
140                return Err(YamlError::new(
141                    YamlErrorKind::InvalidIndentation,
142                    line.offset,
143                ));
144            }
145            if !is_sequence_entry(line.content) {
146                break;
147            }
148
149            let item_text = line.content[1..].trim_start();
150            self.cursor.next()?;
151
152            if !item_text.is_empty() {
153                if item_text.len() > self.limits.max_scalar_len {
154                    return Err(YamlError::new(
155                        YamlErrorKind::MaxScalarLengthExceeded,
156                        line.offset,
157                    ));
158                }
159                if has_mapping_separator(item_text) {
160                    let (key, value_part) = split_mapping_entry(item_text).unwrap();
161                    let mut entries = Vec::new();
162                    let val = if value_part.is_empty() {
163                        let nested = self.cursor.peek()?;
164                        if let Some(nl) = nested
165                            && nl.indent > base_indent
166                        {
167                            self.parse_node(nl.indent, depth)?
168                        } else {
169                            YamlValue::Null
170                        }
171                    } else if value_part == "[]" {
172                        YamlValue::Sequence(Vec::new())
173                    } else {
174                        parse_scalar(value_part, line.offset)?
175                    };
176                    entries.push((key, val));
177                    loop {
178                        let Some(next) = self.cursor.peek()? else {
179                            break;
180                        };
181                        if next.indent <= base_indent || is_sequence_entry(next.content) {
182                            break;
183                        }
184                        if !has_mapping_separator(next.content) {
185                            break;
186                        }
187                        let (nk, nv) = split_mapping_entry(next.content).unwrap();
188                        self.cursor.next()?;
189                        let val = if nv.is_empty() {
190                            let nested = self.cursor.peek()?;
191                            if let Some(nl) = nested
192                                && nl.indent > next.indent
193                            {
194                                self.parse_node(nl.indent, depth)?
195                            } else {
196                                YamlValue::Null
197                            }
198                        } else if nv == "[]" {
199                            YamlValue::Sequence(Vec::new())
200                        } else {
201                            parse_scalar(nv, next.offset)?
202                        };
203                        entries.push((nk, val));
204                    }
205                    items.push(YamlValue::Mapping(entries));
206                } else {
207                    items.push(parse_scalar(item_text, line.offset)?);
208                }
209            } else {
210                let nested = self
211                    .cursor
212                    .peek()?
213                    .ok_or(YamlError::new(YamlErrorKind::Eof, line.offset))?;
214                if nested.indent <= base_indent {
215                    return Err(YamlError::new(
216                        YamlErrorKind::InvalidIndentation,
217                        nested.offset,
218                    ));
219                }
220                items.push(self.parse_node(nested.indent, depth)?);
221            }
222
223            if items.len() > self.limits.max_sequence_len {
224                return Err(YamlError::new(
225                    YamlErrorKind::MaxSequenceLengthExceeded,
226                    line.offset,
227                ));
228            }
229        }
230
231        if items.is_empty() {
232            return Err(YamlError::new(
233                YamlErrorKind::UnexpectedToken,
234                self.cursor.position(),
235            ));
236        }
237
238        Ok(YamlValue::Sequence(items))
239    }
240
241    fn parse_mapping(
242        &mut self,
243        base_indent: usize,
244        depth: usize,
245    ) -> Result<YamlValue<'a>, YamlError> {
246        let mut entries = Vec::new();
247
248        loop {
249            let Some(line) = self.cursor.peek()? else {
250                break;
251            };
252            if line.indent < base_indent {
253                break;
254            }
255            if line.indent > base_indent {
256                return Err(YamlError::new(
257                    YamlErrorKind::InvalidIndentation,
258                    line.offset,
259                ));
260            }
261
262            let Some((key, value_part)) = split_mapping_entry(line.content) else {
263                break;
264            };
265
266            if key.is_empty() {
267                return Err(YamlError::new(
268                    YamlErrorKind::InvalidMappingKey,
269                    line.offset,
270                ));
271            }
272            if key.len() > self.limits.max_scalar_len {
273                return Err(YamlError::new(
274                    YamlErrorKind::MaxScalarLengthExceeded,
275                    line.offset,
276                ));
277            }
278
279            self.cursor.next()?;
280
281            let val = if !value_part.is_empty() {
282                if value_part.len() > self.limits.max_scalar_len {
283                    return Err(YamlError::new(
284                        YamlErrorKind::MaxScalarLengthExceeded,
285                        line.offset,
286                    ));
287                }
288                if value_part == "[]" {
289                    YamlValue::Sequence(Vec::new())
290                } else {
291                    parse_scalar(value_part, line.offset)?
292                }
293            } else {
294                let next = self.cursor.peek()?;
295                if let Some(next_line) = next
296                    && next_line.indent > base_indent
297                {
298                    self.parse_node(next_line.indent, depth)?
299                } else {
300                    YamlValue::Null
301                }
302            };
303
304            entries.push((key, val));
305
306            if entries.len() > self.limits.max_mapping_len {
307                return Err(YamlError::new(
308                    YamlErrorKind::MaxMappingLengthExceeded,
309                    line.offset,
310                ));
311            }
312        }
313
314        if entries.is_empty() {
315            return Err(YamlError::new(
316                YamlErrorKind::UnexpectedToken,
317                self.cursor.position(),
318            ));
319        }
320
321        Ok(YamlValue::Mapping(entries))
322    }
323}
324
/// Returns `true` when `content` is a lone `-` or starts with `- `.
///
/// A dash glued to other text (`-x`, `---`) is not a sequence entry.
fn is_sequence_entry(content: &str) -> bool {
    matches!(content.as_bytes(), [b'-'] | [b'-', b' ', ..])
}
328
329fn has_mapping_separator(content: &str) -> bool {
330    split_mapping_entry(content).is_some()
331}
332
/// Splits `content` into a trimmed mapping key and its left-trimmed value.
///
/// A colon separates key and value only when it is the last byte of
/// `content` or is followed by a space or tab, so `http://host`, `12:30:00`
/// and `a:b` stay plain scalars. When `content` starts with a quoted string,
/// colons inside the quotes are ignored and the separator must directly
/// follow the closing quote (blanks allowed in between); the key keeps its
/// quotes.
///
/// Returns `None` when there is no separator or the key would be empty.
fn split_mapping_entry(content: &str) -> Option<(&str, &str)> {
    let bytes = content.as_bytes();
    let is_separator_at = |idx: usize| {
        bytes[idx] == b':' && matches!(bytes.get(idx + 1).copied(), None | Some(b' ' | b'\t'))
    };

    let lead = content.len() - content.trim_start().len();
    let colon = match quoted_prefix_end(&content[lead..]) {
        Some(quoted_len) => {
            // Only blanks may sit between the closing quote and the colon.
            let after_key = lead + quoted_len;
            let rest = &content[after_key..];
            let idx = after_key + (rest.len() - rest.trim_start().len());
            if idx < bytes.len() && is_separator_at(idx) {
                idx
            } else {
                return None;
            }
        }
        // Not quoted (or the quote is never closed): scan the whole text.
        None => (0..bytes.len()).find(|&idx| is_separator_at(idx))?,
    };

    let key = content[..colon].trim();
    if key.is_empty() {
        return None;
    }
    Some((key, content[colon + 1..].trim_start()))
}

/// Returns the byte length of the quoted string that `text` starts with,
/// closing quote included, or `None` when `text` does not start with a
/// terminated single- or double-quoted string.
fn quoted_prefix_end(text: &str) -> Option<usize> {
    let bytes = text.as_bytes();
    let quote = match bytes.first().copied() {
        Some(q @ (b'"' | b'\'')) => q,
        _ => return None,
    };

    let mut idx = 1;
    while idx < bytes.len() {
        match bytes[idx] {
            // Inside double quotes a backslash escapes the next byte.
            b'\\' if quote == b'"' => idx += 2,
            b if b == quote => {
                // Inside single quotes a doubled quote is a literal quote.
                if quote == b'\'' && bytes.get(idx + 1).copied() == Some(b'\'') {
                    idx += 2;
                } else {
                    return Some(idx + 1);
                }
            }
            _ => idx += 1,
        }
    }
    None
}
349
350pub fn parse_yaml(bytes: &[u8]) -> Result<YamlValue<'_>, YamlError> {
351    YamlParser::new(bytes).parse()
352}
353
354pub fn parse_yaml_with_max_depth(
355    bytes: &[u8],
356    max_depth: usize,
357) -> Result<YamlValue<'_>, YamlError> {
358    YamlParser::new(bytes).with_max_depth(max_depth).parse()
359}
360
361pub fn parse_yaml_with_limits(
362    bytes: &[u8],
363    limits: YamlLimits,
364) -> Result<YamlValue<'_>, YamlError> {
365    YamlParser::new(bytes).with_limits(limits).parse()
366}
367
368pub fn validate_yaml(bytes: &[u8]) -> Result<(), YamlError> {
369    YamlParser::new(bytes).validate()
370}