Skip to main content

khive_query/
validate.rs

//! AST validation per ADR-008 §Validation Rules.
//!
//! `validate` normalises an AST in place and rejects queries that violate the
//! closed edge ontology or attempt to subvert namespace scoping:
//!
//! 1. **Edge relations** must parse to one of the 13 canonical [`EdgeRelation`]
//!    variants (ADR-002). Aliases and case differences are normalised to the
//!    canonical snake_case form stored in the database. Applies to edge
//!    patterns *and* `WHERE e.relation = '…'` constraints.
//! 2. **Node kinds** pass through unchanged — the query layer is pack-agnostic
//!    (ADR-025). Kind validation is the responsibility of the service boundary,
//!    not the query compiler.
//! 3. **Namespace scoping is a trusted parameter only.** Queries must not name
//!    `namespace` in node property maps or `WHERE` conditions — the only valid
//!    source of namespace filtering is `CompileOptions::scopes`. This matches
//!    ADR-008 §Validation: "never trust query strings to set namespaces."
//! 4. **Traversal depth** is limited to [`MAX_DEPTH`] (10 hops). A hop range
//!    whose upper bound exceeds the cap is rejected with
//!    [`QueryError::InvalidInput`] at validation time (ADR-008 §"Depth limits");
//!    a range whose lower bound already exceeds the cap is rejected with
//!    [`QueryError::Unsupported`]. Ranges are never clamped.
20
21use std::collections::HashSet;
22use std::str::FromStr;
23
24use khive_types::EdgeRelation;
25
26use crate::ast::{Condition, ConditionValue, GqlQuery, PatternElement};
27use crate::error::QueryError;
28
/// Maximum traversal depth allowed by the query layer (ADR-008 §Validation).
///
/// [`validate_with_warnings`] rejects any edge pattern whose `min_hops` or
/// `max_hops` is greater than this value; ranges are never clamped to it.
pub const MAX_DEPTH: usize = 10;
31
32/// Validate and normalise an AST in place.
33///
34/// Canonicalizes edge relation strings to their snake_case form (closed set).
35/// Node kind strings pass through unchanged (pack-agnostic).
36pub fn validate(query: &mut GqlQuery) -> Result<(), QueryError> {
37    validate_with_warnings(query).map(|_| ())
38}
39
40/// Validate and normalise an AST in place, returning any warnings generated.
41///
42/// Returns an empty `Vec<String>` for forward compatibility; no warning paths
43/// are currently emitted.  The F048 depth-cap path now returns `InvalidInput`
44/// rather than clamping and warning.
45pub fn validate_with_warnings(query: &mut GqlQuery) -> Result<Vec<String>, QueryError> {
46    let warnings: Vec<String> = Vec::new();
47
48    // Pattern variables are bindings — the same variable name appearing twice
49    // would mean "same node/edge" and require alias-equality predicates in
50    // SQL. Until that is implemented, reject repeated bindings explicitly so
51    // cycles and self-reachability don't silently compile to wrong results.
52    let mut seen_node_vars: HashSet<&str> = HashSet::new();
53    let mut seen_edge_vars: HashSet<&str> = HashSet::new();
54    for element in &query.pattern.elements {
55        match element {
56            PatternElement::Node(node) => {
57                if let Some(var) = node.variable.as_deref() {
58                    if !seen_node_vars.insert(var) {
59                        return Err(QueryError::Unsupported(format!(
60                            "repeated node variable '{var}' (cycle / self-reachability \
61                             requires alias-equality predicates not yet implemented)"
62                        )));
63                    }
64                }
65            }
66            PatternElement::Edge(edge) => {
67                if let Some(var) = edge.variable.as_deref() {
68                    if !seen_edge_vars.insert(var) {
69                        return Err(QueryError::Unsupported(format!(
70                            "repeated edge variable '{var}' not supported"
71                        )));
72                    }
73                }
74            }
75        }
76    }
77
78    for element in &mut query.pattern.elements {
79        match element {
80            PatternElement::Node(node) => {
81                if node.properties.contains_key("namespace") {
82                    return Err(QueryError::Validation(
83                        "namespace is set by CompileOptions, not query text".into(),
84                    ));
85                }
86            }
87            PatternElement::Edge(edge) => {
88                for relation in edge.relations.iter_mut() {
89                    // Synthetic ADR-041 relations (observed_as_*) do not exist
90                    // in the closed EdgeRelation enum — skip taxonomy validation
91                    // for them and leave the string unchanged.  The SQL compiler
92                    // handles them via the event_observations join path.
93                    if relation.starts_with("observed_as_") {
94                        continue;
95                    }
96                    let parsed = EdgeRelation::from_str(relation)
97                        .map_err(|err| QueryError::Validation(err.to_string()))?;
98                    *relation = parsed.as_str().to_string();
99                }
100                if edge.min_hops == 0 {
101                    return Err(QueryError::Unsupported(
102                        "zero-hop ranges (min_hops = 0) not yet supported; \
103                         use a minimum of 1 hop"
104                            .into(),
105                    ));
106                }
107                // Reject inverted ranges before any clamping — silently
108                // rewriting *3..1 to *1..1 changes query semantics.
109                if edge.min_hops > edge.max_hops {
110                    return Err(QueryError::Validation(format!(
111                        "invalid hop range: min {} > max {}",
112                        edge.min_hops, edge.max_hops
113                    )));
114                }
115                // If the minimum already exceeds our depth cap, the query
116                // can never produce results — reject rather than silently
117                // returning an empty set from a clamped range.
118                if edge.min_hops > MAX_DEPTH {
119                    return Err(QueryError::Unsupported(format!(
120                        "minimum hop count {} exceeds depth cap {}",
121                        edge.min_hops, MAX_DEPTH
122                    )));
123                }
124                // Reject max_hops above the depth cap (ADR-008 §"Depth limits").
125                if edge.max_hops > MAX_DEPTH {
126                    return Err(QueryError::InvalidInput(format!(
127                        "max_hops {} exceeds the depth cap of {}; reduce the range or use a smaller bound",
128                        edge.max_hops, MAX_DEPTH
129                    )));
130                }
131            }
132        }
133    }
134
135    // Build variable → kind map so condition validation is context-aware.
136    // `kind` and `relation` only get taxonomy enforcement on the correct
137    // variable type (node vs edge). On the other type, they're treated as
138    // ordinary JSON property keys.
139    let mut var_kinds: std::collections::HashMap<&str, VarKind> = std::collections::HashMap::new();
140    for element in &query.pattern.elements {
141        match element {
142            PatternElement::Node(n) => {
143                if let Some(v) = n.variable.as_deref() {
144                    var_kinds.insert(v, VarKind::Node);
145                }
146            }
147            PatternElement::Edge(e) => {
148                if let Some(v) = e.variable.as_deref() {
149                    var_kinds.insert(v, VarKind::Edge);
150                }
151            }
152        }
153    }
154
155    // Walk all leaf conditions in the WHERE expression tree.
156    let mut validate_err: Option<QueryError> = None;
157    query.where_clause.for_each_condition_mut(&mut |cond| {
158        if validate_err.is_some() {
159            return;
160        }
161        let is_edge = var_kinds
162            .get(cond.variable.as_str())
163            .copied()
164            .unwrap_or(VarKind::Node)
165            == VarKind::Edge;
166        if let Err(e) = validate_condition(cond, is_edge) {
167            validate_err = Some(e);
168        }
169    });
170    if let Some(e) = validate_err {
171        return Err(e);
172    }
173
174    Ok(warnings)
175}
176
/// Which kind of pattern element a query variable is bound to.
///
/// Used while validating `WHERE` conditions so that `relation` is only
/// taxonomy-checked on edge variables.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum VarKind {
    /// The variable names a node pattern.
    Node,
    /// The variable names an edge pattern.
    Edge,
}
182
183fn validate_condition(cond: &mut Condition, is_edge: bool) -> Result<(), QueryError> {
184    match cond.property.as_str() {
185        "namespace" => Err(QueryError::Validation(
186            "namespace is set by CompileOptions, not query text".into(),
187        )),
188        "kind" if !is_edge => Ok(()),
189        "relation" if is_edge => {
190            if let ConditionValue::String(ref mut s) = cond.value {
191                let parsed = EdgeRelation::from_str(s)
192                    .map_err(|err| QueryError::Validation(err.to_string()))?;
193                *s = parsed.as_str().to_string();
194            }
195            Ok(())
196        }
197        _ => Ok(()),
198    }
199}
200
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parsers::{gql, sparql};

    // ---- helpers -------------------------------------------------------

    /// Parse `src` as GQL, panicking if the parser rejects it.
    fn parse_gql(src: &str) -> GqlQuery {
        gql::parse(src).unwrap()
    }

    /// Parse GQL that `validate` is expected to accept; returns the
    /// normalised AST.
    fn accepted(src: &str) -> GqlQuery {
        let mut query = parse_gql(src);
        validate(&mut query).unwrap();
        query
    }

    /// Parse GQL that `validate` is expected to reject; returns the error.
    fn rejected(src: &str) -> QueryError {
        let mut query = parse_gql(src);
        validate(&mut query).unwrap_err()
    }

    /// Parse SPARQL that `validate` is expected to reject; returns the error.
    fn rejected_sparql(src: &str) -> QueryError {
        let mut query = sparql::parse(src).unwrap();
        validate(&mut query).unwrap_err()
    }

    fn assert_unsupported(err: &QueryError) {
        assert!(
            matches!(err, QueryError::Unsupported(_)),
            "expected Unsupported, got {err:?}"
        );
    }

    fn assert_invalid_input(err: &QueryError) {
        assert!(
            matches!(err, QueryError::InvalidInput(_)),
            "expected InvalidInput, got {err:?}"
        );
    }

    /// String value of the first leaf condition in the `WHERE` clause.
    fn first_condition_string_value(q: &GqlQuery) -> String {
        let first = q.where_clause.conditions().next().unwrap();
        if let ConditionValue::String(s) = &first.value {
            s.clone()
        } else {
            panic!("expected string condition value")
        }
    }

    // ---- node kinds ----------------------------------------------------

    #[test]
    fn node_kind_passes_through_unchanged() {
        // Kinds are pack-agnostic strings: the query layer must not rewrite them.
        let q = accepted("MATCH (a:paper)-[:introduced_by]->(b:concept) RETURN a");
        let kinds: Vec<_> = q
            .pattern
            .nodes()
            .map(|n| n.kind.as_deref().unwrap_or(""))
            .collect();
        assert_eq!(kinds, vec!["paper", "concept"]);
    }

    #[test]
    fn unknown_kind_passes_through() {
        // Any kind string is accepted at the query layer.
        accepted("MATCH (a:gizmo)-[:extends]->(b) RETURN a");
    }

    #[test]
    fn unknown_kind_in_where_passes_through() {
        // Any kind string is accepted in a WHERE comparison as well.
        let q = accepted("MATCH (a)-[:extends]->(b) WHERE a.kind = 'gizmo' RETURN a");
        assert_eq!(first_condition_string_value(&q), "gizmo");
    }

    #[test]
    fn kind_in_where_passes_through_unchanged() {
        // 'paper' must not be normalised to 'document'; the string stays as-is.
        let q = accepted("MATCH (a)-[:extends]->(b) WHERE a.kind = 'paper' RETURN a");
        assert_eq!(first_condition_string_value(&q), "paper");
    }

    // ---- edge relations ------------------------------------------------

    #[test]
    fn normalises_relation_case_and_hyphens() {
        let q = accepted("MATCH (a)-[:Introduced_By]->(b) RETURN a");
        let mut rels = Vec::new();
        for edge in q.pattern.edges() {
            rels.extend(edge.relations.iter().cloned());
        }
        assert_eq!(rels, vec!["introduced_by".to_string()]);
    }

    #[test]
    fn multi_relation_all_normalised() {
        let q = accepted("MATCH (a)-[:Extends|VARIANT_OF]->(b) RETURN a");
        let edge = q.pattern.edges().next().unwrap();
        assert_eq!(
            edge.relations,
            vec!["extends".to_string(), "variant_of".to_string()]
        );
    }

    #[test]
    fn rejects_unknown_relation() {
        let err = rejected("MATCH (a)-[:not_a_relation]->(b) RETURN a");
        let msg = err.to_string();
        assert!(msg.contains("not_a_relation"), "msg: {msg}");
    }

    #[test]
    fn rejects_unknown_relation_in_where() {
        let err = rejected("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'related_to' RETURN a");
        assert!(err.to_string().contains("related_to"), "msg: {err}");
    }

    #[test]
    fn normalises_relation_alias_in_where() {
        let q = accepted("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'Introduced_By' RETURN a");
        assert_eq!(first_condition_string_value(&q), "introduced_by");
    }

    #[test]
    fn node_property_named_relation_allowed() {
        // On a node variable `relation` is a free-form JSON property, not the
        // edge relation column, so taxonomy enforcement must not apply.
        let q = accepted("MATCH (a)-[:extends]->(b) WHERE a.relation = 'external' RETURN a");
        assert_eq!(first_condition_string_value(&q), "external");
    }

    #[test]
    fn edge_relation_still_validated() {
        // On an edge variable `relation` must still pass EdgeRelation
        // taxonomy validation.
        let err = rejected("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'not_real' RETURN a");
        assert!(err.to_string().contains("not_real"), "msg: {err}");
    }

    // ---- namespace scoping ---------------------------------------------

    #[test]
    fn rejects_namespace_in_where() {
        let err =
            rejected("MATCH (a:concept)-[:extends]->(b) WHERE a.namespace = 'other' RETURN a");
        assert!(err.to_string().contains("namespace"), "msg: {err}");
    }

    #[test]
    fn rejects_namespace_in_node_properties() {
        let err = rejected("MATCH (a:concept {namespace: 'other'})-[:extends]->(b) RETURN a");
        assert!(err.to_string().contains("namespace"), "msg: {err}");
    }

    // ---- depth cap and hop ranges --------------------------------------

    #[test]
    fn rejects_depth_above_max() {
        // ADR-008 §"Depth limits": exceeding MAX_DEPTH is an InvalidInput
        // error, not a silent clamp.
        let err = rejected("MATCH (a)-[:extends*1..50]->(b) RETURN b");
        assert_invalid_input(&err);
        assert!(
            err.to_string().contains("50"),
            "error should mention requested depth: {err}"
        );
    }

    #[test]
    fn rejects_depth_above_max_warnings_path() {
        // validate_with_warnings must also reject (not clamp + warn).
        let mut q = parse_gql("MATCH (a)-[:extends*1..50]->(b) RETURN b");
        let err = validate_with_warnings(&mut q).unwrap_err();
        assert_invalid_input(&err);
    }

    #[test]
    fn rejects_zero_hop_range_gql_wide() {
        let err = rejected("MATCH (a)-[:extends*0..3]->(b) RETURN b");
        assert_unsupported(&err);
    }

    #[test]
    fn rejects_zero_hop_range_gql_narrow() {
        // *0..1 has max_hops=1 so has_variable_length() is false, but the
        // fixed-length compiler also can't produce zero-hop rows — reject at
        // validation regardless of compile path.
        let err = rejected("MATCH (a)-[:extends*0..1]->(b) RETURN b");
        assert_unsupported(&err);
    }

    #[test]
    fn rejects_zero_hop_sparql_explicit_range() {
        let err = rejected_sparql("SELECT ?a ?b WHERE { ?a :extends{0,3} ?b . }");
        assert_unsupported(&err);
    }

    #[test]
    fn rejects_inverted_range() {
        // *3..1 is an inverted range — must error, not silently rewrite to *1..1.
        let err = rejected("MATCH (a)-[:extends*3..1]->(b) RETURN b");
        assert!(
            matches!(err, QueryError::Validation(_)),
            "expected Validation error, got {err:?}"
        );
    }

    #[test]
    fn rejects_min_hops_above_depth_cap() {
        // min=50, max=100 — the lower bound exceeds MAX_DEPTH so the query
        // can never produce results within our cap.
        let err = rejected("MATCH (a)-[:extends*50..100]->(b) RETURN b");
        assert_unsupported(&err);
    }

    #[test]
    fn rejects_max_above_depth_cap_with_satisfiable_min() {
        // *2..50 — min 2 is satisfiable but max 50 exceeds MAX_DEPTH; must error.
        let err = rejected("MATCH (a)-[:extends*2..50]->(b) RETURN b");
        assert_invalid_input(&err);
    }

    // ---- repeated variable bindings ------------------------------------

    #[test]
    fn rejects_repeated_node_var_cycle_gql() {
        let err = rejected("MATCH (a)-[:extends]->(b)-[:variant_of]->(a) RETURN a");
        assert_unsupported(&err);
    }

    #[test]
    fn rejects_repeated_node_var_self_reach_variable_length() {
        let err = rejected("MATCH (a)-[:extends*1..3]->(a) RETURN a");
        assert_unsupported(&err);
    }

    #[test]
    fn rejects_repeated_node_var_cycle_sparql() {
        let err = rejected_sparql("SELECT ?a WHERE { ?a :extends ?b . ?b :variant_of ?a . }");
        assert_unsupported(&err);
    }

    #[test]
    fn rejects_repeated_edge_var() {
        let err = rejected("MATCH (a)-[e:extends]->(b)-[e:variant_of]->(c) RETURN c");
        assert_unsupported(&err);
    }
}