Skip to main content

hedl_core/
reference.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License in the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! Reference resolution for HEDL.
19
20use crate::document::{Document, Item, MatrixList, Node};
21use crate::error::{HedlError, HedlResult};
22use crate::limits::Limits;
23use crate::value::Value;
24use std::collections::{BTreeMap, HashMap};
25
26/// Reference resolution mode for controlling validation behavior.
27///
28/// Determines how the reference resolver handles unresolved or problematic references.
29#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
30pub enum ReferenceMode {
31    /// Strict mode: Unresolved references cause errors.
32    ///
33    /// This is the default and recommended mode for production parsing.
34    /// Any reference that cannot be resolved will result in a parse error.
35    ///
36    /// # Behavior
37    /// - Unresolved references → Error
38    /// - Ambiguous references → Error (always, regardless of mode)
39    #[default]
40    Strict,
41
42    /// Lenient mode: Unresolved references are ignored.
43    ///
44    /// Useful for partial parsing, work-in-progress documents, or when
45    /// reference validation is deferred to a separate validation pass.
46    ///
47    /// # Behavior
48    /// - Unresolved references → Silently ignored
49    /// - Ambiguous references → Error (always, regardless of mode)
50    ///
51    /// # Use Cases
52    /// - Parsing incomplete documents during development
53    /// - Incremental parsing where not all nodes are loaded
54    /// - Custom validation workflows
55    Lenient,
56}
57
58impl ReferenceMode {
59    /// Returns true if this mode should fail on unresolved references.
60    #[inline]
61    pub fn is_strict(self) -> bool {
62        matches!(self, ReferenceMode::Strict)
63    }
64
65    /// Returns true if this mode allows unresolved references.
66    #[inline]
67    pub fn is_lenient(self) -> bool {
68        matches!(self, ReferenceMode::Lenient)
69    }
70}
71
72impl From<bool> for ReferenceMode {
73    /// Convert from legacy boolean parameter.
74    ///
75    /// `true` → `Strict`, `false` → `Lenient`
76    fn from(strict: bool) -> Self {
77        if strict {
78            ReferenceMode::Strict
79        } else {
80            ReferenceMode::Lenient
81        }
82    }
83}
84
85/// Type registries with both forward and inverted indices for efficient lookups.
86///
87/// P0 OPTIMIZATION: Inverted index for unqualified references (100-1000x speedup)
88/// - Forward index: type -> (id -> line_num) for qualified lookups (O(log n))
89/// - Inverted index: id -> \[types\] for unqualified lookups (O(1))
90pub struct TypeRegistry {
91    /// Forward index: type_name -> (id -> line_number)
92    by_type: BTreeMap<String, BTreeMap<String, usize>>,
93    /// Inverted index: id -> list of type names containing that ID
94    by_id: HashMap<String, Vec<String>>,
95    /// Total number of IDs registered across all types (for limit enforcement)
96    total_ids: usize,
97}
98
99impl TypeRegistry {
100    /// Create a new empty registry
101    pub fn new() -> Self {
102        Self {
103            by_type: BTreeMap::new(),
104            by_id: HashMap::new(),
105            total_ids: 0,
106        }
107    }
108
109    /// Register an ID in a type, maintaining both indices
110    pub fn register(
111        &mut self,
112        type_name: &str,
113        id: &str,
114        line_num: usize,
115        limits: &Limits,
116    ) -> HedlResult<()> {
117        let type_registry = self.by_type.entry(type_name.to_string()).or_default();
118
119        if let Some(&prev_line) = type_registry.get(id) {
120            return Err(HedlError::collision(
121                format!(
122                    "duplicate ID '{}' in type '{}', previously defined at line {}",
123                    id, type_name, prev_line
124                ),
125                line_num,
126            ));
127        }
128
129        // Check total IDs limit before registration
130        if self.total_ids >= limits.max_total_ids {
131            return Err(HedlError::security(
132                format!(
133                    "total ID registrations {} exceeds limit {}",
134                    self.total_ids, limits.max_total_ids
135                ),
136                line_num,
137            ));
138        }
139
140        type_registry.insert(id.to_string(), line_num);
141
142        // Update inverted index
143        self.by_id
144            .entry(id.to_string())
145            .or_default()
146            .push(type_name.to_string());
147
148        // Increment total count
149        self.total_ids += 1;
150
151        Ok(())
152    }
153
154    /// Look up ID in a specific type (qualified reference)
155    pub fn contains_in_type(&self, type_name: &str, id: &str) -> bool {
156        self.by_type
157            .get(type_name)
158            .map(|r| r.contains_key(id))
159            .unwrap_or(false)
160    }
161
162    /// Look up ID across all types (unqualified reference)
163    /// Returns list of types containing this ID
164    pub fn lookup_unqualified(&self, id: &str) -> Option<&[String]> {
165        self.by_id.get(id).map(|v| v.as_slice())
166    }
167
168    /// Iterate over all IDs and their associated types in the inverted index.
169    ///
170    /// Used for merging registries during parallel parsing.
171    pub fn by_id_iter(&self) -> impl Iterator<Item = (&String, &Vec<String>)> {
172        self.by_id.iter()
173    }
174}
175
176impl Default for TypeRegistry {
177    fn default() -> Self {
178        Self::new()
179    }
180}
181
182/// Check NEST hierarchy depth against security limit.
183///
184/// Returns an error if the depth exceeds the maximum allowed depth.
185fn check_nest_depth(depth: usize, max_depth: usize) -> HedlResult<()> {
186    if depth > max_depth {
187        return Err(HedlError::security(
188            format!(
189                "NEST hierarchy depth {} exceeds maximum allowed depth {}",
190                depth, max_depth
191            ),
192            0,
193        ));
194    }
195    Ok(())
196}
197
198/// Register a node ID, checking for collisions.
199pub fn register_node(
200    registries: &mut TypeRegistry,
201    type_name: &str,
202    id: &str,
203    line_num: usize,
204    limits: &Limits,
205) -> HedlResult<()> {
206    registries.register(type_name, id, line_num, limits)
207}
208
209/// Resolve all references in a document using default limits.
210///
211/// Validates that all references point to existing nodes according to the
212/// specified reference resolution mode.
213///
214/// # Arguments
215/// - `doc`: The document to validate
216/// - `mode`: Reference resolution mode (strict or lenient)
217///
218/// # Errors
219/// - In strict mode: Returns error if any reference cannot be resolved
220/// - In any mode: Returns error if any reference is ambiguous
221/// - Returns error if nesting depth exceeds limits
222///
223/// # Examples
224/// ```
225/// use hedl_core::{Document, ReferenceMode, resolve_references};
226///
227/// let doc = Document::new((2, 0));
228/// // Strict mode - fail on unresolved references
229/// resolve_references(&doc, ReferenceMode::Strict)?;
230///
231/// // Lenient mode - ignore unresolved references
232/// resolve_references(&doc, ReferenceMode::Lenient)?;
233/// # Ok::<(), hedl_core::HedlError>(())
234/// ```
235pub fn resolve_references(doc: &Document, mode: ReferenceMode) -> HedlResult<()> {
236    resolve_references_with_limits(doc, mode, &Limits::default())
237}
238
239/// Resolve all references in a document with configurable limits.
240///
241/// Validates that all references point to existing nodes according to the
242/// specified reference resolution mode, using custom security limits.
243///
244/// # Arguments
245/// - `doc`: The document to validate
246/// - `mode`: Reference resolution mode (strict or lenient)
247/// - `limits`: Security limits for parsing
248///
249/// # Errors
250/// - In strict mode: Returns error if any reference cannot be resolved
251/// - In any mode: Returns error if any reference is ambiguous
252/// - Returns error if nesting depth exceeds limits
253pub fn resolve_references_with_limits(
254    doc: &Document,
255    mode: ReferenceMode,
256    limits: &Limits,
257) -> HedlResult<()> {
258    // Build type registries from document
259    let mut registries = TypeRegistry::new();
260    collect_node_ids(&doc.root, &mut registries, 0, limits)?;
261
262    // Validate all references
263    validate_references(&doc.root, &registries, mode, None, 0, limits.max_nest_depth)
264}
265
266fn collect_node_ids(
267    items: &BTreeMap<String, Item>,
268    registries: &mut TypeRegistry,
269    depth: usize,
270    limits: &Limits,
271) -> HedlResult<()> {
272    check_nest_depth(depth, limits.max_nest_depth)?;
273
274    for item in items.values() {
275        match item {
276            Item::List(list) => {
277                collect_list_ids(list, registries, depth, limits)?;
278            }
279            Item::Object(obj) => {
280                collect_node_ids(obj, registries, depth + 1, limits)?;
281            }
282            Item::Scalar(_) => {}
283        }
284    }
285    Ok(())
286}
287
288fn collect_list_ids(
289    list: &MatrixList,
290    registries: &mut TypeRegistry,
291    depth: usize,
292    limits: &Limits,
293) -> HedlResult<()> {
294    // Collect IDs from this list
295    for node in &list.rows {
296        // Node IDs were already validated during parsing, just collect them
297        registries.register(&list.type_name, &node.id, 0, limits)?; // line 0 = already parsed
298    }
299
300    // Then recurse into children
301    for node in &list.rows {
302        if let Some(children) = node.children() {
303            for child_list in children.values() {
304                for child in child_list {
305                    collect_list_ids_from_node(child, registries, depth + 1, limits)?;
306                }
307            }
308        }
309    }
310
311    Ok(())
312}
313
314fn collect_list_ids_from_node(
315    node: &Node,
316    registries: &mut TypeRegistry,
317    depth: usize,
318    limits: &Limits,
319) -> HedlResult<()> {
320    check_nest_depth(depth, limits.max_nest_depth)?;
321
322    registries.register(&node.type_name, &node.id, 0, limits)?;
323
324    if let Some(children) = node.children() {
325        for child_list in children.values() {
326            for child in child_list {
327                collect_list_ids_from_node(child, registries, depth + 1, limits)?;
328            }
329        }
330    }
331
332    Ok(())
333}
334
335fn validate_references(
336    items: &BTreeMap<String, Item>,
337    registries: &TypeRegistry,
338    mode: ReferenceMode,
339    current_type: Option<&str>,
340    depth: usize,
341    max_depth: usize,
342) -> HedlResult<()> {
343    check_nest_depth(depth, max_depth)?;
344
345    for item in items.values() {
346        match item {
347            Item::Scalar(value) => {
348                validate_value_reference(value, registries, mode, current_type)?;
349            }
350            Item::List(list) => {
351                for node in &list.rows {
352                    validate_node_references(node, registries, mode, depth, max_depth)?;
353                }
354            }
355            Item::Object(obj) => {
356                validate_references(obj, registries, mode, current_type, depth + 1, max_depth)?;
357            }
358        }
359    }
360    Ok(())
361}
362
363fn validate_node_references(
364    node: &Node,
365    registries: &TypeRegistry,
366    mode: ReferenceMode,
367    depth: usize,
368    max_depth: usize,
369) -> HedlResult<()> {
370    check_nest_depth(depth, max_depth)?;
371
372    for value in &node.fields {
373        validate_value_reference(value, registries, mode, Some(&node.type_name))?;
374    }
375
376    if let Some(children) = node.children() {
377        for child_list in children.values() {
378            for child in child_list {
379                validate_node_references(child, registries, mode, depth + 1, max_depth)?;
380            }
381        }
382    }
383
384    Ok(())
385}
386
387fn validate_value_reference(
388    value: &Value,
389    registries: &TypeRegistry,
390    mode: ReferenceMode,
391    current_type: Option<&str>,
392) -> HedlResult<()> {
393    if let Value::Reference(ref_val) = value {
394        // If reference has explicit type (@User:u1), look only in that type's registry
395        let resolved = match &ref_val.type_name {
396            Some(t) => registries.contains_in_type(t, &ref_val.id),
397            None => {
398                // No type qualifier - behavior depends on context
399                match current_type {
400                    // SPEC 10.2, 10.3: In matrix context, search ONLY current type
401                    Some(type_name) => registries.contains_in_type(type_name, &ref_val.id),
402                    // SPEC 10.3.1: In Key-Value context, search all types but detect ambiguity
403                    // P0 OPTIMIZATION: Use inverted index for O(1) lookup instead of O(m) scan
404                    None => {
405                        let matching_types =
406                            registries.lookup_unqualified(&ref_val.id).unwrap_or(&[]);
407
408                        match matching_types.len() {
409                            0 => false, // Not found
410                            1 => true,  // Unambiguous match
411                            _ => {
412                                // Multiple matches - ambiguous reference
413                                return Err(HedlError::reference(
414                                    format!(
415                                        "Ambiguous unqualified reference '@{}' matches multiple types: [{}]",
416                                        ref_val.id,
417                                        matching_types.join(", ")
418                                    ),
419                                    0, // Line number lost at this point
420                                ));
421                            }
422                        }
423                    }
424                }
425            }
426        };
427
428        if !resolved && mode.is_strict() {
429            return Err(HedlError::reference(
430                format!("unresolved reference {}", ref_val.to_ref_string()),
431                0, // Line number lost at this point
432            ));
433        }
434    }
435
436    Ok(())
437}
438
439#[cfg(test)]
440mod tests {
441    use super::*;
442
443    // ==================== max_total_ids limit tests ====================
444
445    #[test]
446    fn test_max_total_ids_limit() {
447        let mut registry = TypeRegistry::new();
448        let limits = Limits {
449            max_total_ids: 3,
450            ..Default::default()
451        };
452
453        // Register 3 IDs across different types (should succeed)
454        assert!(registry.register("Type1", "id1", 1, &limits).is_ok());
455        assert!(registry.register("Type2", "id2", 2, &limits).is_ok());
456        assert!(registry.register("Type3", "id3", 3, &limits).is_ok());
457
458        // 4th registration should fail
459        let result = registry.register("Type4", "id4", 4, &limits);
460        assert!(result.is_err());
461        let err_msg = result.unwrap_err().to_string();
462        assert!(
463            err_msg.contains("exceeds limit"),
464            "Expected 'exceeds limit' in error message, got: {}",
465            err_msg
466        );
467    }
468
469    #[test]
470    fn test_max_total_ids_across_types() {
471        let mut registry = TypeRegistry::new();
472        let limits = Limits {
473            max_total_ids: 10,
474            ..Default::default()
475        };
476
477        // Register IDs in same type
478        for i in 0..5 {
479            assert!(registry
480                .register("Type1", &format!("id{}", i), i, &limits)
481                .is_ok());
482        }
483
484        // Register IDs in different type
485        for i in 0..5 {
486            assert!(registry
487                .register("Type2", &format!("id{}", i), i + 5, &limits)
488                .is_ok());
489        }
490
491        // 11th registration should fail
492        let result = registry.register("Type3", "id_extra", 10, &limits);
493        assert!(result.is_err());
494        let err_msg = result.unwrap_err().to_string();
495        assert!(
496            err_msg.contains("exceeds limit"),
497            "Expected 'exceeds limit' in error message, got: {}",
498            err_msg
499        );
500    }
501
502    #[test]
503    fn test_unlimited_ids() {
504        let mut registry = TypeRegistry::new();
505        let limits = Limits::unlimited();
506
507        // Should be able to register many IDs
508        for i in 0..10000 {
509            let result =
510                registry.register(&format!("Type{}", i % 100), &format!("id{}", i), i, &limits);
511            assert!(
512                result.is_ok(),
513                "Failed to register ID {} in unlimited mode",
514                i
515            );
516        }
517    }
518
519    #[test]
520    fn test_collision_detection_with_limits() {
521        let mut registry = TypeRegistry::new();
522        let limits = Limits::default();
523
524        assert!(registry.register("Type1", "id1", 1, &limits).is_ok());
525
526        // Duplicate in same type should fail with collision error, not limit error
527        let result = registry.register("Type1", "id1", 2, &limits);
528        assert!(result.is_err());
529        let err_msg = result.unwrap_err().to_string();
530        assert!(
531            err_msg.contains("duplicate"),
532            "Expected 'duplicate' in error message, got: {}",
533            err_msg
534        );
535    }
536
537    #[test]
538    fn test_max_total_ids_exact_limit() {
539        let mut registry = TypeRegistry::new();
540        let limits = Limits {
541            max_total_ids: 5,
542            ..Default::default()
543        };
544
545        // Register exactly at limit (should succeed)
546        for i in 0..5 {
547            let result = registry.register("Type", &format!("id{}", i), i, &limits);
548            assert!(result.is_ok(), "Failed to register ID {} at exact limit", i);
549        }
550
551        // One more should fail
552        let result = registry.register("Type", "id5", 5, &limits);
553        assert!(result.is_err());
554        let err_msg = result.unwrap_err().to_string();
555        assert!(
556            err_msg.contains("exceeds limit"),
557            "Expected 'exceeds limit' in error message, got: {}",
558            err_msg
559        );
560    }
561
562    #[test]
563    fn test_max_total_ids_just_under_limit() {
564        let mut registry = TypeRegistry::new();
565        let limits = Limits {
566            max_total_ids: 5,
567            ..Default::default()
568        };
569
570        // Register just under limit (should succeed)
571        for i in 0..4 {
572            assert!(registry
573                .register("Type", &format!("id{}", i), i, &limits)
574                .is_ok());
575        }
576
577        // Still have room for one more
578        assert!(registry.register("Type", "id4", 4, &limits).is_ok());
579    }
580
581    #[test]
582    fn test_max_total_ids_error_message_clarity() {
583        let mut registry = TypeRegistry::new();
584        let limits = Limits {
585            max_total_ids: 2,
586            ..Default::default()
587        };
588
589        registry.register("Type1", "id1", 1, &limits).unwrap();
590        registry.register("Type2", "id2", 2, &limits).unwrap();
591
592        let result = registry.register("Type3", "id3", 3, &limits);
593        assert!(result.is_err());
594        let err_msg = result.unwrap_err().to_string();
595        assert!(
596            err_msg.contains("2"),
597            "Error message should contain the count"
598        );
599        assert!(
600            err_msg.contains("limit"),
601            "Error message should mention 'limit'"
602        );
603    }
604
605    #[test]
606    fn test_total_ids_count_tracking() {
607        let mut registry = TypeRegistry::new();
608        let limits = Limits::unlimited();
609
610        assert_eq!(registry.total_ids, 0);
611
612        registry.register("Type1", "id1", 1, &limits).unwrap();
613        assert_eq!(registry.total_ids, 1);
614
615        registry.register("Type2", "id2", 2, &limits).unwrap();
616        assert_eq!(registry.total_ids, 2);
617
618        registry.register("Type1", "id3", 3, &limits).unwrap();
619        assert_eq!(registry.total_ids, 3);
620    }
621
622    #[test]
623    fn test_max_total_ids_with_multiple_types() {
624        let mut registry = TypeRegistry::new();
625        let limits = Limits {
626            max_total_ids: 100,
627            ..Default::default()
628        };
629
630        // Distribute IDs across 10 types
631        for type_idx in 0..10 {
632            for id_idx in 0..10 {
633                let result = registry.register(
634                    &format!("Type{}", type_idx),
635                    &format!("id{}_{}", type_idx, id_idx),
636                    type_idx * 10 + id_idx,
637                    &limits,
638                );
639                assert!(result.is_ok(), "Failed at type {} id {}", type_idx, id_idx);
640            }
641        }
642
643        // Now we're at limit (100), next should fail
644        let result = registry.register("TypeExtra", "extra", 100, &limits);
645        assert!(result.is_err());
646    }
647
648    #[test]
649    fn test_collision_preserves_total_count() {
650        let mut registry = TypeRegistry::new();
651        let limits = Limits::unlimited();
652
653        registry.register("Type1", "id1", 1, &limits).unwrap();
654        assert_eq!(registry.total_ids, 1);
655
656        // Attempt duplicate registration (should fail)
657        let result = registry.register("Type1", "id1", 2, &limits);
658        assert!(result.is_err());
659
660        // Total count should not have changed
661        assert_eq!(registry.total_ids, 1);
662    }
663
664    #[test]
665    fn test_default_limits_max_total_ids() {
666        let limits = Limits::default();
667        assert_eq!(limits.max_total_ids, 10_000_000);
668    }
669
670    #[test]
671    fn test_unlimited_limits_max_total_ids() {
672        let limits = Limits::unlimited();
673        assert_eq!(limits.max_total_ids, usize::MAX);
674    }
675
676    // ==================== TypeRegistry basic functionality tests ====================
677
678    #[test]
679    fn test_registry_new() {
680        let registry = TypeRegistry::new();
681        assert_eq!(registry.total_ids, 0);
682        assert!(registry.by_type.is_empty());
683        assert!(registry.by_id.is_empty());
684    }
685
686    #[test]
687    fn test_registry_default() {
688        let registry = TypeRegistry::default();
689        assert_eq!(registry.total_ids, 0);
690    }
691
692    #[test]
693    fn test_contains_in_type() {
694        let mut registry = TypeRegistry::new();
695        let limits = Limits::unlimited();
696
697        registry.register("User", "u1", 1, &limits).unwrap();
698        assert!(registry.contains_in_type("User", "u1"));
699        assert!(!registry.contains_in_type("User", "u2"));
700        assert!(!registry.contains_in_type("Post", "u1"));
701    }
702
703    #[test]
704    fn test_lookup_unqualified() {
705        let mut registry = TypeRegistry::new();
706        let limits = Limits::unlimited();
707
708        registry.register("User", "id1", 1, &limits).unwrap();
709        registry.register("Post", "id1", 2, &limits).unwrap();
710
711        let types = registry.lookup_unqualified("id1");
712        assert!(types.is_some());
713        let types = types.unwrap();
714        assert_eq!(types.len(), 2);
715        assert!(types.contains(&"User".to_string()));
716        assert!(types.contains(&"Post".to_string()));
717
718        let not_found = registry.lookup_unqualified("nonexistent");
719        assert!(not_found.is_none());
720    }
721
722    #[test]
723    fn test_inverted_index_maintenance() {
724        let mut registry = TypeRegistry::new();
725        let limits = Limits::unlimited();
726
727        // Same ID in multiple types should appear in inverted index
728        registry.register("Type1", "shared_id", 1, &limits).unwrap();
729        registry.register("Type2", "shared_id", 2, &limits).unwrap();
730        registry.register("Type3", "shared_id", 3, &limits).unwrap();
731
732        let types = registry.lookup_unqualified("shared_id").unwrap();
733        assert_eq!(types.len(), 3);
734    }
735
736    // ==================== ReferenceMode tests ====================
737
738    #[test]
739    fn test_reference_mode_default() {
740        assert_eq!(ReferenceMode::default(), ReferenceMode::Strict);
741    }
742
743    #[test]
744    fn test_reference_mode_from_bool() {
745        assert_eq!(ReferenceMode::from(true), ReferenceMode::Strict);
746        assert_eq!(ReferenceMode::from(false), ReferenceMode::Lenient);
747    }
748
749    #[test]
750    fn test_reference_mode_is_strict() {
751        assert!(ReferenceMode::Strict.is_strict());
752        assert!(!ReferenceMode::Lenient.is_strict());
753    }
754
755    #[test]
756    fn test_reference_mode_is_lenient() {
757        assert!(ReferenceMode::Lenient.is_lenient());
758        assert!(!ReferenceMode::Strict.is_lenient());
759    }
760
761    #[test]
762    fn test_reference_mode_equality() {
763        assert_eq!(ReferenceMode::Strict, ReferenceMode::Strict);
764        assert_eq!(ReferenceMode::Lenient, ReferenceMode::Lenient);
765        assert_ne!(ReferenceMode::Strict, ReferenceMode::Lenient);
766    }
767}