Skip to main content

khive_query/
validate.rs

//! AST validation per ADR-008 §Validation Rules.
//!
//! `validate` normalises an AST in place and rejects queries that violate the
//! closed edge ontology or attempt to subvert namespace scoping:
//!
//! 1. **Edge relations** must parse to one of the 13 canonical [`EdgeRelation`]
//!    variants (ADR-002). Aliases and case differences are normalised to the
//!    canonical snake_case form stored in the database. Applies to edge
//!    patterns *and* `WHERE e.relation = '…'` constraints.
//! 2. **Node kinds** pass through unchanged — the query layer is pack-agnostic
//!    (ADR-025). Kind validation is the responsibility of the service boundary,
//!    not the query compiler.
//! 3. **Namespace scoping is a trusted parameter only.** Queries must not name
//!    `namespace` in node property maps or `WHERE` conditions — the only valid
//!    source of namespace filtering is `CompileOptions::scopes`. This matches
//!    ADR-008 §Validation: "never trust query strings to set namespaces."
//! 4. **Traversal depth** is capped at [`MAX_DEPTH`] (10 hops). A `max_hops`
//!    above the cap is clamped, not rejected — this matches the cap the
//!    compiler applies when generating recursive CTEs. A `min_hops` above the
//!    cap is rejected, because the clamped range could never match. Zero-hop
//!    ranges (`min_hops = 0`) and inverted ranges (`min > max`) are rejected
//!    as well.
20
21use std::collections::HashSet;
22use std::str::FromStr;
23
24use khive_types::EdgeRelation;
25
26use crate::ast::{Condition, ConditionValue, GqlQuery, PatternElement};
27use crate::error::QueryError;
28
/// Upper bound on the number of hops a single edge pattern may traverse
/// (ADR-008 §Validation).
///
/// Validation clamps a larger `max_hops` down to this value and rejects any
/// edge whose `min_hops` already exceeds it.
pub const MAX_DEPTH: usize = 10;
31
32/// Validate and normalise an AST in place.
33///
34/// Canonicalizes edge relation strings to their snake_case form (closed set).
35/// Node kind strings pass through unchanged (pack-agnostic).
36pub fn validate(query: &mut GqlQuery) -> Result<(), QueryError> {
37    validate_with_warnings(query).map(|_| ())
38}
39
40/// Validate and normalise an AST in place, returning any warnings generated.
41///
42/// Currently warns when `max_hops` is clamped to [`MAX_DEPTH`].
43pub fn validate_with_warnings(query: &mut GqlQuery) -> Result<Vec<String>, QueryError> {
44    let mut warnings = Vec::new();
45
46    // Pattern variables are bindings — the same variable name appearing twice
47    // would mean "same node/edge" and require alias-equality predicates in
48    // SQL. Until that is implemented, reject repeated bindings explicitly so
49    // cycles and self-reachability don't silently compile to wrong results.
50    let mut seen_node_vars: HashSet<&str> = HashSet::new();
51    let mut seen_edge_vars: HashSet<&str> = HashSet::new();
52    for element in &query.pattern.elements {
53        match element {
54            PatternElement::Node(node) => {
55                if let Some(var) = node.variable.as_deref() {
56                    if !seen_node_vars.insert(var) {
57                        return Err(QueryError::Unsupported(format!(
58                            "repeated node variable '{var}' (cycle / self-reachability \
59                             requires alias-equality predicates not yet implemented)"
60                        )));
61                    }
62                }
63            }
64            PatternElement::Edge(edge) => {
65                if let Some(var) = edge.variable.as_deref() {
66                    if !seen_edge_vars.insert(var) {
67                        return Err(QueryError::Unsupported(format!(
68                            "repeated edge variable '{var}' not supported"
69                        )));
70                    }
71                }
72            }
73        }
74    }
75
76    for element in &mut query.pattern.elements {
77        match element {
78            PatternElement::Node(node) => {
79                if node.properties.contains_key("namespace") {
80                    return Err(QueryError::Validation(
81                        "namespace is set by CompileOptions, not query text".into(),
82                    ));
83                }
84            }
85            PatternElement::Edge(edge) => {
86                for relation in edge.relations.iter_mut() {
87                    let parsed = EdgeRelation::from_str(relation)
88                        .map_err(|err| QueryError::Validation(err.to_string()))?;
89                    *relation = parsed.as_str().to_string();
90                }
91                if edge.min_hops == 0 {
92                    return Err(QueryError::Unsupported(
93                        "zero-hop ranges (min_hops = 0) not yet supported; \
94                         use a minimum of 1 hop"
95                            .into(),
96                    ));
97                }
98                // Reject inverted ranges before any clamping — silently
99                // rewriting *3..1 to *1..1 changes query semantics.
100                if edge.min_hops > edge.max_hops {
101                    return Err(QueryError::Validation(format!(
102                        "invalid hop range: min {} > max {}",
103                        edge.min_hops, edge.max_hops
104                    )));
105                }
106                // If the minimum already exceeds our depth cap, the query
107                // can never produce results — reject rather than silently
108                // returning an empty set from a clamped range.
109                if edge.min_hops > MAX_DEPTH {
110                    return Err(QueryError::Unsupported(format!(
111                        "minimum hop count {} exceeds depth cap {}",
112                        edge.min_hops, MAX_DEPTH
113                    )));
114                }
115                // Clamp max_hops to the depth cap; report the narrowing to callers.
116                if edge.max_hops > MAX_DEPTH {
117                    let requested = edge.max_hops;
118                    edge.max_hops = MAX_DEPTH;
119                    warnings.push(format!(
120                        "Query depth capped at {MAX_DEPTH} hops (requested {requested})"
121                    ));
122                }
123            }
124        }
125    }
126
127    // Build variable → kind map so condition validation is context-aware.
128    // `kind` and `relation` only get taxonomy enforcement on the correct
129    // variable type (node vs edge). On the other type, they're treated as
130    // ordinary JSON property keys.
131    let mut var_kinds: std::collections::HashMap<&str, VarKind> = std::collections::HashMap::new();
132    for element in &query.pattern.elements {
133        match element {
134            PatternElement::Node(n) => {
135                if let Some(v) = n.variable.as_deref() {
136                    var_kinds.insert(v, VarKind::Node);
137                }
138            }
139            PatternElement::Edge(e) => {
140                if let Some(v) = e.variable.as_deref() {
141                    var_kinds.insert(v, VarKind::Edge);
142                }
143            }
144        }
145    }
146
147    for cond in query.where_clause.iter_mut() {
148        let is_edge = var_kinds
149            .get(cond.variable.as_str())
150            .copied()
151            .unwrap_or(VarKind::Node)
152            == VarKind::Edge;
153        validate_condition(cond, is_edge)?;
154    }
155
156    Ok(warnings)
157}
158
/// Which kind of pattern element a query variable is bound to.
///
/// Used during `WHERE` validation to decide whether a condition targets a
/// node or an edge.
#[derive(Copy, Clone, Eq, PartialEq)]
enum VarKind {
    /// The variable names a node pattern.
    Node,
    /// The variable names an edge pattern.
    Edge,
}
164
165fn validate_condition(cond: &mut Condition, is_edge: bool) -> Result<(), QueryError> {
166    match cond.property.as_str() {
167        "namespace" => Err(QueryError::Validation(
168            "namespace is set by CompileOptions, not query text".into(),
169        )),
170        "kind" if !is_edge => Ok(()),
171        "relation" if is_edge => {
172            if let ConditionValue::String(ref mut s) = cond.value {
173                let parsed = EdgeRelation::from_str(s)
174                    .map_err(|err| QueryError::Validation(err.to_string()))?;
175                *s = parsed.as_str().to_string();
176            }
177            Ok(())
178        }
179        _ => Ok(()),
180    }
181}
182
#[cfg(test)]
mod tests {
    use super::*;
    use crate::parsers::{gql, sparql};

    /// Parse `text` as GQL, validate it, and hand back the normalised AST.
    fn validated_gql(text: &str) -> GqlQuery {
        let mut query = gql::parse(text).unwrap();
        validate(&mut query).unwrap();
        query
    }

    /// Parse `text` as GQL and return the error that validation reports.
    fn gql_rejection(text: &str) -> QueryError {
        let mut query = gql::parse(text).unwrap();
        validate(&mut query).unwrap_err()
    }

    /// Parse `text` as SPARQL and return the error that validation reports.
    fn sparql_rejection(text: &str) -> QueryError {
        let mut query = sparql::parse(text).unwrap();
        validate(&mut query).unwrap_err()
    }

    /// Assert that `err` is the `Unsupported` variant.
    fn assert_unsupported(err: QueryError) {
        assert!(
            matches!(err, QueryError::Unsupported(_)),
            "expected Unsupported, got {err:?}"
        );
    }

    /// Assert that the rendered error message contains `needle`.
    fn assert_mentions(err: &QueryError, needle: &str) {
        let msg = err.to_string();
        assert!(msg.contains(needle), "msg: {msg}");
    }

    /// Extract the string value of the first `WHERE` condition.
    fn first_where_string(query: &GqlQuery) -> String {
        match &query.where_clause[0].value {
            ConditionValue::String(s) => s.clone(),
            _ => panic!("expected string"),
        }
    }

    #[test]
    fn node_kind_passes_through_unchanged() {
        // The query layer does not normalise entity kinds; they are opaque strings.
        let query = validated_gql("MATCH (a:paper)-[:introduced_by]->(b:concept) RETURN a");
        let mut kinds = Vec::new();
        for node in query.pattern.nodes() {
            kinds.push(node.kind.as_deref().unwrap_or(""));
        }
        assert_eq!(kinds, vec!["paper", "concept"]);
    }

    #[test]
    fn normalises_relation_case_and_hyphens() {
        let query = validated_gql("MATCH (a)-[:Introduced_By]->(b) RETURN a");
        let mut relations = Vec::new();
        for edge in query.pattern.edges() {
            relations.extend(edge.relations.iter().cloned());
        }
        assert_eq!(relations, vec!["introduced_by".to_string()]);
    }

    #[test]
    fn rejects_unknown_relation() {
        let err = gql_rejection("MATCH (a)-[:not_a_relation]->(b) RETURN a");
        assert_mentions(&err, "not_a_relation");
    }

    #[test]
    fn unknown_kind_passes_through() {
        // Any kind string is accepted here; kinds are checked elsewhere.
        let _ = validated_gql("MATCH (a:gizmo)-[:extends]->(b) RETURN a");
    }

    #[test]
    fn clamps_depth_above_max() {
        let query = validated_gql("MATCH (a)-[:extends*1..50]->(b) RETURN b");
        let clamped = query.pattern.edges().next().unwrap();
        assert_eq!(clamped.max_hops, MAX_DEPTH);
        assert!(clamped.min_hops <= clamped.max_hops);
    }

    #[test]
    fn warns_when_clamping_depth_above_max() {
        let mut query = gql::parse("MATCH (a)-[:extends*1..50]->(b) RETURN b").unwrap();
        let warnings = validate_with_warnings(&mut query).unwrap();
        let clamped = query.pattern.edges().next().unwrap();
        assert_eq!(clamped.max_hops, MAX_DEPTH);
        let mentions_cap = warnings
            .iter()
            .any(|w| w.contains("Query depth capped at 10"));
        assert!(mentions_cap, "warnings: {warnings:?}");
    }

    #[test]
    fn multi_relation_all_normalised() {
        let query = validated_gql("MATCH (a)-[:Extends|VARIANT_OF]->(b) RETURN a");
        let first_edge = query.pattern.edges().next().unwrap();
        let expected = vec!["extends".to_string(), "variant_of".to_string()];
        assert_eq!(first_edge.relations, expected);
    }

    #[test]
    fn rejects_namespace_in_where() {
        let err = gql_rejection(
            "MATCH (a:concept)-[:extends]->(b) WHERE a.namespace = 'other' RETURN a",
        );
        assert_mentions(&err, "namespace");
    }

    #[test]
    fn rejects_namespace_in_node_properties() {
        let err =
            gql_rejection("MATCH (a:concept {namespace: 'other'})-[:extends]->(b) RETURN a");
        assert_mentions(&err, "namespace");
    }

    #[test]
    fn rejects_unknown_relation_in_where() {
        let err =
            gql_rejection("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'related_to' RETURN a");
        assert_mentions(&err, "related_to");
    }

    #[test]
    fn unknown_kind_in_where_passes_through() {
        // A kind the query layer has never heard of is still accepted verbatim.
        let query = validated_gql("MATCH (a)-[:extends]->(b) WHERE a.kind = 'gizmo' RETURN a");
        assert_eq!(first_where_string(&query), "gizmo");
    }

    #[test]
    fn kind_in_where_passes_through_unchanged() {
        // 'paper' must stay 'paper' — no rewriting to some other kind name.
        let query = validated_gql("MATCH (a)-[:extends]->(b) WHERE a.kind = 'paper' RETURN a");
        assert_eq!(first_where_string(&query), "paper");
    }

    #[test]
    fn normalises_relation_alias_in_where() {
        let query = validated_gql(
            "MATCH (a)-[e:extends]->(b) WHERE e.relation = 'Introduced_By' RETURN a",
        );
        assert_eq!(first_where_string(&query), "introduced_by");
    }

    #[test]
    fn rejects_zero_hop_range_gql_wide() {
        assert_unsupported(gql_rejection("MATCH (a)-[:extends*0..3]->(b) RETURN b"));
    }

    #[test]
    fn rejects_zero_hop_range_gql_narrow() {
        // With *0..1 the maximum is a single hop, so this is not a
        // variable-length traversal — but zero-hop rows still cannot be
        // produced, so validation must reject it whichever compile path
        // would have been taken.
        assert_unsupported(gql_rejection("MATCH (a)-[:extends*0..1]->(b) RETURN b"));
    }

    #[test]
    fn rejects_zero_hop_sparql_explicit_range() {
        assert_unsupported(sparql_rejection(
            "SELECT ?a ?b WHERE { ?a :extends{0,3} ?b . }",
        ));
    }

    #[test]
    fn rejects_repeated_node_var_cycle_gql() {
        assert_unsupported(gql_rejection(
            "MATCH (a)-[:extends]->(b)-[:variant_of]->(a) RETURN a",
        ));
    }

    #[test]
    fn rejects_repeated_node_var_self_reach_variable_length() {
        assert_unsupported(gql_rejection("MATCH (a)-[:extends*1..3]->(a) RETURN a"));
    }

    #[test]
    fn rejects_repeated_node_var_cycle_sparql() {
        assert_unsupported(sparql_rejection(
            "SELECT ?a WHERE { ?a :extends ?b . ?b :variant_of ?a . }",
        ));
    }

    #[test]
    fn rejects_repeated_edge_var() {
        assert_unsupported(gql_rejection(
            "MATCH (a)-[e:extends]->(b)-[e:variant_of]->(c) RETURN c",
        ));
    }

    #[test]
    fn rejects_inverted_range() {
        // An inverted range such as *3..1 has to fail outright; quietly
        // turning it into *1..1 would change what the query means.
        let err = gql_rejection("MATCH (a)-[:extends*3..1]->(b) RETURN b");
        assert!(
            matches!(err, QueryError::Validation(_)),
            "expected Validation error, got {err:?}"
        );
    }

    #[test]
    fn rejects_min_hops_above_depth_cap() {
        // With min=50 and max=100 the lower bound is already past MAX_DEPTH,
        // so nothing inside the cap could ever satisfy the range.
        assert_unsupported(gql_rejection("MATCH (a)-[:extends*50..100]->(b) RETURN b"));
    }

    #[test]
    fn clamps_max_but_keeps_satisfiable_min() {
        // For *2..50 the minimum of 2 is reachable; only the maximum is clamped.
        let query = validated_gql("MATCH (a)-[:extends*2..50]->(b) RETURN b");
        let clamped = query.pattern.edges().next().unwrap();
        assert_eq!(clamped.min_hops, 2);
        assert_eq!(clamped.max_hops, MAX_DEPTH);
    }

    #[test]
    fn node_property_named_relation_allowed() {
        // On a node variable, `relation` is just another JSON property and
        // must not be checked against the edge relation taxonomy.
        let query =
            validated_gql("MATCH (a)-[:extends]->(b) WHERE a.relation = 'external' RETURN a");
        assert_eq!(first_where_string(&query), "external");
    }

    #[test]
    fn edge_relation_still_validated() {
        // On an edge variable, `relation` is still held to the EdgeRelation
        // taxonomy.
        let err =
            gql_rejection("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'not_real' RETURN a");
        assert_mentions(&err, "not_real");
    }
}