Skip to main content

hedl_core/parser/
mod.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Main parser for HEDL documents.
19//!
20//! # Security Limits
21//!
22//! The parser enforces several security limits to prevent denial-of-service attacks:
23//!
24//! - `max_file_size`: Maximum input file size (default: 1GB)
25//! - `max_line_length`: Maximum line length (default: 1MB)
26//! - `max_indent_depth`: Maximum nesting depth for objects (default: 50)
27//! - `max_nodes`: Maximum number of matrix list nodes (default: 10M)
28//! - `max_aliases`: Maximum number of aliases (default: 10k)
29//! - `max_columns`: Maximum columns per schema (default: 100)
30//! - `max_nest_depth`: Maximum NEST hierarchy depth (default: 100)
31//! - `max_block_string_size`: Maximum block string size (default: 10MB)
32//! - `max_object_keys`: Maximum keys per object (default: 10k)
33//! - **`max_total_keys`**: Maximum total keys across all objects (default: 10M)
34//!
35//! ## max_total_keys: Defense in Depth
36//!
37//! The `max_total_keys` limit is a critical security feature that prevents
38//! memory exhaustion attacks via cumulative key allocation. Without this limit,
39//! an attacker could create many small objects, each under `max_object_keys`,
40//! but collectively consuming excessive memory.
41//!
42//! ### Attack Scenario (Without max_total_keys)
43//!
44//! ```text
//! # Attacker creates 10,000,000 objects with 10 keys each
//! # Each object is "valid" (under max_object_keys = 10,000)
//! # But total memory usage is excessive: 100,000,000 keys!
//! object0:
//!   key0: val0
//!   key1: val1
//!   ...
//!   key9: val9
//! object1:
//!   key0: val0
//!   ...
//! # ... 9,999,998 more objects
57//! ```
58//!
59//! ### Defense (With max_total_keys = 10,000,000)
60//!
61//! The parser tracks cumulative keys across all objects and rejects documents
62//! that exceed the limit, preventing this attack vector while allowing legitimate
63//! large documents. The 10M default accommodates most real-world datasets while
64//! still providing protection. For extremely large datasets, this limit can be
65//! increased via `ParseOptions`.
66
67mod context;
68mod line_parsing;
69mod options;
70mod utils;
71
72// Re-export public types
73pub use options::{ParseOptions, ParseOptionsBuilder};
74
75use crate::block_string::{try_start_block_string, BlockStringResult, BlockStringState};
76use crate::document::{Document, Item};
77use crate::error::{HedlError, HedlResult};
78use crate::header::parse_header;
79use crate::lex::calculate_indent;
80use crate::limits::{Limits, TimeoutCheckExt, TimeoutContext};
81use crate::preprocess::{is_blank_line, is_comment_line, preprocess};
82use crate::reference::{resolve_references, TypeRegistry};
83use crate::value::Value;
84use context::{pop_frames, Frame};
85use line_parsing::{
86    is_expanded_child_list, is_inline_child_list, parse_expanded_child_list,
87    parse_inline_child_list, parse_matrix_row, parse_non_matrix_line, MatrixParseParams,
88};
89use std::collections::BTreeMap;
90use utils::{check_duplicate_key, finalize_stack, insert_into_current, validate_indent_for_child};
91
/// Style guideline for how many children to write inline in `@Type#N:|...` syntax.
///
/// SPEC v2.0 line 58 words it as: "Style rule (not a hard syntax limit): keep inline N <= 10".
/// The parser does NOT enforce this value; style warnings belong to hedl-lint.
const _STYLE_INLINE_CHILDREN_LIMIT: usize = 10;
96
97/// Parse a HEDL document from bytes.
98pub fn parse(input: &[u8]) -> HedlResult<Document> {
99    parse_with_limits(input, ParseOptions::default())
100}
101
102/// Parse a HEDL document with custom options.
103pub fn parse_with_limits(input: &[u8], options: ParseOptions) -> HedlResult<Document> {
104    // Create timeout context for parsing
105    let timeout_ctx = TimeoutContext::new(options.limits.timeout);
106
107    // Phase 1: Preprocess (zero-copy line splitting)
108    let preprocessed = preprocess(input, &options.limits)?;
109
110    // Collect lines as borrowed slices (no per-line allocation)
111    let lines: Vec<(usize, &str)> = preprocessed.lines().collect();
112
113    // Phase 2: Parse header
114    let (header, body_start_idx) = parse_header(&lines, &options.limits, &timeout_ctx)?;
115
116    // Phase 3: Parse body
117    let body_lines = &lines[body_start_idx..];
118    let mut type_registries = TypeRegistry::new();
119    let root = parse_body(
120        body_lines,
121        &header,
122        &options.limits,
123        &mut type_registries,
124        &timeout_ctx,
125    )?;
126
127    // Build document
128    let mut doc = Document::new(header.version);
129    doc.aliases = header.aliases;
130    doc.structs = header.structs;
131    doc.nests = header.nests;
132    doc.root = root;
133
134    // Phase 4: Reference resolution (with timeout check)
135    timeout_ctx.check_timeout(0)?;
136    resolve_references(&doc, options.reference_mode)?;
137
138    Ok(doc)
139}
140
141/// Context for body parsing, holding references to shared state.
142struct ParseContext<'a> {
143    header: &'a crate::header::Header,
144    limits: &'a Limits,
145    type_registries: &'a mut TypeRegistry,
146    node_count: &'a mut usize,
147}
148
149fn parse_body(
150    lines: &[(usize, &str)],
151    header: &crate::header::Header,
152    limits: &Limits,
153    type_registries: &mut TypeRegistry,
154    timeout_ctx: &TimeoutContext,
155) -> HedlResult<BTreeMap<String, Item>> {
156    let mut stack: Vec<Frame> = vec![Frame::Root {
157        object: BTreeMap::new(),
158    }];
159    let mut node_count = 0usize;
160    let mut total_keys = 0usize;
161    let mut block_string: Option<BlockStringState> = None;
162
163    // Create parsing context once for reuse throughout the loop
164    let ctx = ParseContext {
165        header,
166        limits,
167        type_registries,
168        node_count: &mut node_count,
169    };
170
171    // Automatic timeout checking every 10,000 iterations
172    for result in lines.iter().copied().with_timeout_check(timeout_ctx) {
173        let (line_num, line) = result?;
174        // Handle block string accumulation mode
175        if let Some(ref mut state) = block_string {
176            // Process the line and check if block string is complete
177            if let Some(full_content) = state.process_line(line, line_num, limits)? {
178                // Block string is complete
179                let value = Value::String(full_content.into());
180                pop_frames(&mut stack, state.indent);
181                insert_into_current(&mut stack, state.key.clone(), Item::Scalar(value));
182                block_string = None;
183            }
184            continue;
185        }
186
187        // Skip blank and comment lines
188        if is_blank_line(line) || is_comment_line(line) {
189            continue;
190        }
191
192        // Calculate indentation
193        let indent_info = calculate_indent(line, line_num as u32)
194            .map_err(|e| HedlError::syntax(e.to_string(), line_num))?;
195
196        let indent_info = match indent_info {
197            Some(info) => info,
198            None => continue, // Blank line
199        };
200
201        if indent_info.level > limits.max_indent_depth {
202            return Err(HedlError::security(
203                format!(
204                    "indent depth {} exceeds limit {}",
205                    indent_info.level, limits.max_indent_depth
206                ),
207                line_num,
208            ));
209        }
210
211        let indent = indent_info.level;
212        let content = &line[indent_info.spaces..];
213
214        // Pop frames as needed based on indentation
215        pop_frames(&mut stack, indent);
216
217        // Classify and parse line
218        if content.starts_with('|') {
219            let params = MatrixParseParams {
220                content,
221                indent,
222                line_num,
223                header: ctx.header,
224                limits: ctx.limits,
225            };
226            parse_matrix_row(&mut stack, &params, ctx.type_registries, ctx.node_count)?;
227        } else if content.starts_with('@') && is_inline_child_list(content) {
228            // Inline child list:@Type#N:|child1|child2|...
229            let params = MatrixParseParams {
230                content,
231                indent,
232                line_num,
233                header: ctx.header,
234                limits: ctx.limits,
235            };
236            parse_inline_child_list(&mut stack, &params, ctx.type_registries, ctx.node_count)?;
237        } else if content.starts_with('@') && is_expanded_child_list(content) {
238            // Expanded child list:@Type#N: (children on following lines)
239            parse_expanded_child_list(
240                &mut stack, content, indent, line_num, ctx.header, ctx.limits,
241            )?;
242        } else {
243            // Check if this starts a block string
244            match try_start_block_string(content, indent, line_num)? {
245                BlockStringResult::MultiLineStarted(state) => {
246                    // Validate indent and check for duplicate key
247                    validate_indent_for_child(&stack, indent, line_num)?;
248                    check_duplicate_key(&stack, &state.key, line_num, limits, &mut total_keys)?;
249                    block_string = Some(state);
250                }
251                BlockStringResult::NotBlockString => {
252                    parse_non_matrix_line(
253                        &mut stack,
254                        content,
255                        indent,
256                        line_num,
257                        header,
258                        limits,
259                        &mut total_keys,
260                    )?;
261                }
262            }
263        }
264    }
265
266    // Check for unclosed block string
267    if let Some(state) = block_string {
268        return Err(HedlError::syntax(
269            format!(
270                "unclosed block string starting at line {}",
271                state.start_line
272            ),
273            state.start_line,
274        ));
275    }
276
277    // Finalize: pop all frames and build result
278    finalize_stack(stack)
279}
280
#[cfg(test)]
mod tests {
    use super::*;
    use crate::reference::ReferenceMode;

    /// Smallest document used by the timeout tests: header, separator, one key.
    const MINIMAL_DOC: &[u8] = b"%V:2.0\n%NULL:~\n%QUOTE:\"\n---\nkey: value\n";

    // ==================== ParseOptionsBuilder::new() tests ====================

    /// A freshly created builder yields the documented defaults.
    #[test]
    fn test_builder_new_creates_default_options() {
        let opts = ParseOptionsBuilder::new().build();
        let limits = &opts.limits;

        assert_eq!(opts.reference_mode, ReferenceMode::Strict);
        assert_eq!(limits.max_indent_depth, 50);
        assert_eq!(limits.max_nodes, 10_000_000);
    }

    /// `new()` and `Default::default()` agree with each other.
    #[test]
    fn test_builder_default_trait() {
        let from_new = ParseOptionsBuilder::new().build();
        let from_default = ParseOptionsBuilder::default().build();

        assert_eq!(from_new.reference_mode, from_default.reference_mode);
        assert_eq!(
            from_new.limits.max_indent_depth,
            from_default.limits.max_indent_depth
        );
    }

    // ==================== ParseOptions::builder() tests ====================

    #[test]
    fn test_parse_options_builder_method() {
        assert_eq!(
            ParseOptions::builder().build().reference_mode,
            ReferenceMode::Strict
        );
    }

    // ==================== Chainable method tests ====================

    #[test]
    fn test_builder_max_depth() {
        let limits = ParseOptions::builder().max_depth(100).build().limits;

        assert_eq!(limits.max_indent_depth, 100);
    }

    #[test]
    fn test_builder_max_array_length() {
        let limits = ParseOptions::builder().max_array_length(5000).build().limits;

        assert_eq!(limits.max_nodes, 5000);
    }

    #[test]
    fn test_builder_strict_true() {
        assert_eq!(
            ParseOptions::builder().strict(true).build().reference_mode,
            ReferenceMode::Strict
        );
    }

    #[test]
    fn test_builder_strict_false() {
        assert_eq!(
            ParseOptions::builder().strict(false).build().reference_mode,
            ReferenceMode::Lenient
        );
    }

    #[test]
    fn test_builder_max_file_size() {
        let size = 500 * 1024 * 1024;
        let limits = ParseOptions::builder().max_file_size(size).build().limits;

        assert_eq!(limits.max_file_size, size);
    }

    #[test]
    fn test_builder_max_line_length() {
        let length = 512 * 1024;
        let limits = ParseOptions::builder().max_line_length(length).build().limits;

        assert_eq!(limits.max_line_length, length);
    }

    #[test]
    fn test_builder_max_aliases() {
        let limits = ParseOptions::builder().max_aliases(5000).build().limits;

        assert_eq!(limits.max_aliases, 5000);
    }

    #[test]
    fn test_builder_max_columns() {
        let limits = ParseOptions::builder().max_columns(50).build().limits;

        assert_eq!(limits.max_columns, 50);
    }

    #[test]
    fn test_builder_max_nest_depth() {
        let limits = ParseOptions::builder().max_nest_depth(50).build().limits;

        assert_eq!(limits.max_nest_depth, 50);
    }

    #[test]
    fn test_builder_max_block_string_size() {
        let size = 5 * 1024 * 1024;
        let limits = ParseOptions::builder()
            .max_block_string_size(size)
            .build()
            .limits;

        assert_eq!(limits.max_block_string_size, size);
    }

    #[test]
    fn test_builder_max_object_keys() {
        let limits = ParseOptions::builder().max_object_keys(5000).build().limits;

        assert_eq!(limits.max_object_keys, 5000);
    }

    #[test]
    fn test_builder_max_total_keys() {
        let limits = ParseOptions::builder()
            .max_total_keys(5_000_000)
            .build()
            .limits;

        assert_eq!(limits.max_total_keys, 5_000_000);
    }

    // ==================== Multiple chained methods tests ====================

    #[test]
    fn test_builder_multiple_chains() {
        let opts = ParseOptions::builder()
            .max_depth(100)
            .max_array_length(5000)
            .strict(false)
            .build();
        let limits = &opts.limits;

        assert_eq!(limits.max_indent_depth, 100);
        assert_eq!(limits.max_nodes, 5000);
        assert_eq!(opts.reference_mode, ReferenceMode::Lenient);
    }

    /// Every builder setter can be combined in one chain and each value sticks.
    #[test]
    fn test_builder_all_options_chained() {
        let opts = ParseOptions::builder()
            .max_depth(75)
            .max_array_length(2000)
            .strict(false)
            .max_file_size(100 * 1024 * 1024)
            .max_line_length(256 * 1024)
            .max_aliases(1000)
            .max_columns(25)
            .max_nest_depth(30)
            .max_block_string_size(1024 * 1024)
            .max_object_keys(1000)
            .max_total_keys(1_000_000)
            .build();
        let limits = &opts.limits;

        assert_eq!(limits.max_indent_depth, 75);
        assert_eq!(limits.max_nodes, 2000);
        assert_eq!(opts.reference_mode, ReferenceMode::Lenient);
        assert_eq!(limits.max_file_size, 100 * 1024 * 1024);
        assert_eq!(limits.max_line_length, 256 * 1024);
        assert_eq!(limits.max_aliases, 1000);
        assert_eq!(limits.max_columns, 25);
        assert_eq!(limits.max_nest_depth, 30);
        assert_eq!(limits.max_block_string_size, 1024 * 1024);
        assert_eq!(limits.max_object_keys, 1000);
        assert_eq!(limits.max_total_keys, 1_000_000);
    }

    // ==================== Override tests ====================

    /// The last call to a setter wins.
    #[test]
    fn test_builder_override_previous_value() {
        let limits = ParseOptions::builder()
            .max_depth(50)
            .max_depth(100)
            .build()
            .limits;

        assert_eq!(limits.max_indent_depth, 100);
    }

    #[test]
    fn test_builder_override_multiple_times() {
        let limits = ParseOptions::builder()
            .max_array_length(1000)
            .max_array_length(2000)
            .max_array_length(3000)
            .build()
            .limits;

        assert_eq!(limits.max_nodes, 3000);
    }

    // ==================== Default behavior tests ====================

    /// Changing one limit leaves the untouched limits at their defaults.
    #[test]
    fn test_builder_default_keeps_other_defaults() {
        let opts = ParseOptions::builder().max_depth(100).build();
        let limits = &opts.limits;

        assert_eq!(limits.max_indent_depth, 100);
        // Everything that was not set explicitly keeps its default.
        assert_eq!(limits.max_file_size, 1024 * 1024 * 1024);
        assert_eq!(limits.max_line_length, 1024 * 1024);
        assert_eq!(limits.max_nodes, 10_000_000);
        assert_eq!(opts.reference_mode, ReferenceMode::Strict);
    }

    // ==================== Edge case tests ====================

    #[test]
    fn test_builder_zero_values() {
        let limits = ParseOptions::builder()
            .max_depth(0)
            .max_array_length(0)
            .max_aliases(0)
            .build()
            .limits;

        assert_eq!(limits.max_indent_depth, 0);
        assert_eq!(limits.max_nodes, 0);
        assert_eq!(limits.max_aliases, 0);
    }

    #[test]
    fn test_builder_max_values() {
        let limits = ParseOptions::builder()
            .max_depth(usize::MAX)
            .max_array_length(usize::MAX)
            .max_file_size(usize::MAX)
            .build()
            .limits;

        assert_eq!(limits.max_indent_depth, usize::MAX);
        assert_eq!(limits.max_nodes, usize::MAX);
        assert_eq!(limits.max_file_size, usize::MAX);
    }

    // ==================== Equivalence tests ====================

    /// An untouched builder produces the same values as `ParseOptions::default()`.
    #[test]
    fn test_builder_build_equivalent_to_default() {
        let built = ParseOptions::builder().build();
        let defaults = ParseOptions::default();

        assert_eq!(built.reference_mode, defaults.reference_mode);
        assert_eq!(
            built.limits.max_indent_depth,
            defaults.limits.max_indent_depth
        );
        assert_eq!(built.limits.max_nodes, defaults.limits.max_nodes);
        assert_eq!(built.limits.max_file_size, defaults.limits.max_file_size);
    }

    /// Mutating a cloned builder does not affect the builder it was cloned from.
    #[test]
    fn test_builder_clone_independent() {
        let base = ParseOptions::builder().max_depth(100);
        let derived = base.clone().max_depth(200);

        let base_opts = base.build();
        let derived_opts = derived.build();

        assert_eq!(base_opts.limits.max_indent_depth, 100);
        assert_eq!(derived_opts.limits.max_indent_depth, 200);
    }

    // ==================== Usage pattern tests ====================

    #[test]
    fn test_builder_typical_usage_pattern() {
        // Typical use case: strict parsing with moderate limits.
        let opts = ParseOptions::builder()
            .max_depth(100)
            .strict(true)
            .build();

        assert_eq!(opts.reference_mode, ReferenceMode::Strict);
        assert_eq!(opts.limits.max_indent_depth, 100);
    }

    #[test]
    fn test_builder_lenient_parsing_pattern() {
        // Lenient parsing with higher limits.
        let opts = ParseOptions::builder()
            .max_array_length(50_000)
            .strict(false)
            .max_block_string_size(50 * 1024 * 1024)
            .build();
        let limits = &opts.limits;

        assert_eq!(opts.reference_mode, ReferenceMode::Lenient);
        assert_eq!(limits.max_nodes, 50_000);
        assert_eq!(limits.max_block_string_size, 50 * 1024 * 1024);
    }

    #[test]
    fn test_builder_restricted_parsing_pattern() {
        // Restricted parsing for security.
        let opts = ParseOptions::builder()
            .max_file_size(10 * 1024 * 1024)
            .max_line_length(64 * 1024)
            .max_depth(20)
            .max_array_length(1000)
            .strict(true)
            .build();
        let limits = &opts.limits;

        assert_eq!(limits.max_file_size, 10 * 1024 * 1024);
        assert_eq!(limits.max_line_length, 64 * 1024);
        assert_eq!(limits.max_indent_depth, 20);
        assert_eq!(limits.max_nodes, 1000);
        assert_eq!(opts.reference_mode, ReferenceMode::Strict);
    }

    // ==================== Timeout integration tests ====================

    #[test]
    fn test_parse_with_generous_timeout_succeeds() {
        let mut opts = ParseOptions::default();
        opts.limits.timeout = Some(std::time::Duration::from_secs(10));

        assert!(parse_with_limits(MINIMAL_DOC, opts).is_ok());
    }

    #[test]
    fn test_parse_with_no_timeout_succeeds() {
        let mut opts = ParseOptions::default();
        opts.limits.timeout = None;

        assert!(parse_with_limits(MINIMAL_DOC, opts).is_ok());
    }

    #[test]
    fn test_parse_with_very_short_timeout_fails() {
        // Build a document large enough that parsing takes measurable time.
        let mut doc = String::from("%V:2.0\n%NULL:~\n%QUOTE:\"\n---\ndata:\n");
        doc.extend((0..100_000).map(|i| format!("  key{}: value{}\n", i, i)));

        // An impossibly short timeout (1 microsecond).
        let mut opts = ParseOptions::default();
        opts.limits.timeout = Some(std::time::Duration::from_micros(1));

        let result = parse_with_limits(doc.as_bytes(), opts);
        assert!(result.is_err());

        // The failure must be reported as a timeout.
        if let Some(msg) = result.err().map(|e| e.to_string()) {
            assert!(msg.contains("timeout") || msg.contains("Timeout"));
        }
    }

    #[test]
    fn test_default_timeout_is_reasonable() {
        let default_timeout = ParseOptions::default().limits.timeout;

        assert_eq!(default_timeout, Some(std::time::Duration::from_secs(30)));
    }

    #[test]
    fn test_unlimited_has_no_timeout() {
        let unlimited = Limits::unlimited();

        assert_eq!(unlimited.timeout, None);
    }
}