Skip to main content

khive_query/
validate.rs

1//! AST validation per ADR-008 §Validation Rules.
2//!
3//! `validate` normalises an AST in place and rejects queries that violate the
4//! closed edge ontology or attempt to subvert namespace scoping:
5//!
6//! 1. **Edge relations** must parse to one of the 13 canonical [`EdgeRelation`]
7//!    variants (ADR-002). Aliases and case differences are normalised to the
8//!    canonical snake_case form stored in the database. Applies to edge
9//!    patterns *and* `WHERE e.relation = '…'` constraints.
10//! 2. **Node kinds** pass through unchanged — the query layer is pack-agnostic
11//!    (ADR-025). Kind validation is the responsibility of the service boundary,
12//!    not the query compiler.
13//! 3. **Namespace scoping is a trusted parameter only.** Queries must not name
14//!    `namespace` in node property maps or `WHERE` conditions — the only valid
15//!    source of namespace filtering is `CompileOptions::scopes`. This matches
16//!    ADR-008 §Validation: "never trust query strings to set namespaces."
17//! 4. **Traversal depth** is capped at [`MAX_DEPTH`] (10 hops). Requests above
18//!    the cap are clamped, not rejected — this matches the cap the compiler
19//!    applies when generating recursive CTEs.
20
21use std::collections::HashSet;
22use std::str::FromStr;
23
24use khive_types::EdgeRelation;
25
26use crate::ast::{Condition, ConditionValue, GqlQuery, PatternElement};
27use crate::error::QueryError;
28
29/// Maximum traversal depth allowed by the query layer (ADR-008 §Validation).
30pub const MAX_DEPTH: usize = 10;
31
32/// Validate and normalise an AST in place.
33///
34/// Canonicalizes edge relation strings to their snake_case form (closed set).
35/// Node kind strings pass through unchanged (pack-agnostic).
36pub fn validate(query: &mut GqlQuery) -> Result<(), QueryError> {
37    // Pattern variables are bindings — the same variable name appearing twice
38    // would mean "same node/edge" and require alias-equality predicates in
39    // SQL. Until that is implemented, reject repeated bindings explicitly so
40    // cycles and self-reachability don't silently compile to wrong results.
41    let mut seen_node_vars: HashSet<&str> = HashSet::new();
42    let mut seen_edge_vars: HashSet<&str> = HashSet::new();
43    for element in &query.pattern.elements {
44        match element {
45            PatternElement::Node(node) => {
46                if let Some(var) = node.variable.as_deref() {
47                    if !seen_node_vars.insert(var) {
48                        return Err(QueryError::Unsupported(format!(
49                            "repeated node variable '{var}' (cycle / self-reachability \
50                             requires alias-equality predicates not yet implemented)"
51                        )));
52                    }
53                }
54            }
55            PatternElement::Edge(edge) => {
56                if let Some(var) = edge.variable.as_deref() {
57                    if !seen_edge_vars.insert(var) {
58                        return Err(QueryError::Unsupported(format!(
59                            "repeated edge variable '{var}' not supported"
60                        )));
61                    }
62                }
63            }
64        }
65    }
66
67    for element in &mut query.pattern.elements {
68        match element {
69            PatternElement::Node(node) => {
70                if node.properties.contains_key("namespace") {
71                    return Err(QueryError::Validation(
72                        "namespace is set by CompileOptions, not query text".into(),
73                    ));
74                }
75            }
76            PatternElement::Edge(edge) => {
77                for relation in edge.relations.iter_mut() {
78                    let parsed = EdgeRelation::from_str(relation)
79                        .map_err(|err| QueryError::Validation(err.to_string()))?;
80                    *relation = parsed.as_str().to_string();
81                }
82                if edge.min_hops == 0 {
83                    return Err(QueryError::Unsupported(
84                        "zero-hop ranges (min_hops = 0) not yet supported; \
85                         use a minimum of 1 hop"
86                            .into(),
87                    ));
88                }
89                // Reject inverted ranges before any clamping — silently
90                // rewriting *3..1 to *1..1 changes query semantics.
91                if edge.min_hops > edge.max_hops {
92                    return Err(QueryError::Validation(format!(
93                        "invalid hop range: min {} > max {}",
94                        edge.min_hops, edge.max_hops
95                    )));
96                }
97                // If the minimum already exceeds our depth cap, the query
98                // can never produce results — reject rather than silently
99                // returning an empty set from a clamped range.
100                if edge.min_hops > MAX_DEPTH {
101                    return Err(QueryError::Unsupported(format!(
102                        "minimum hop count {} exceeds depth cap {}",
103                        edge.min_hops, MAX_DEPTH
104                    )));
105                }
106                // Clamp max_hops to the depth cap — the lower bound is
107                // still satisfiable, so this only narrows the search.
108                if edge.max_hops > MAX_DEPTH {
109                    edge.max_hops = MAX_DEPTH;
110                }
111            }
112        }
113    }
114
115    // Build variable → kind map so condition validation is context-aware.
116    // `kind` and `relation` only get taxonomy enforcement on the correct
117    // variable type (node vs edge). On the other type, they're treated as
118    // ordinary JSON property keys.
119    let mut var_kinds: std::collections::HashMap<&str, VarKind> = std::collections::HashMap::new();
120    for element in &query.pattern.elements {
121        match element {
122            PatternElement::Node(n) => {
123                if let Some(v) = n.variable.as_deref() {
124                    var_kinds.insert(v, VarKind::Node);
125                }
126            }
127            PatternElement::Edge(e) => {
128                if let Some(v) = e.variable.as_deref() {
129                    var_kinds.insert(v, VarKind::Edge);
130                }
131            }
132        }
133    }
134
135    for cond in query.where_clause.iter_mut() {
136        let is_edge = var_kinds
137            .get(cond.variable.as_str())
138            .copied()
139            .unwrap_or(VarKind::Node)
140            == VarKind::Edge;
141        validate_condition(cond, is_edge)?;
142    }
143
144    Ok(())
145}
146
147#[derive(Clone, Copy, PartialEq, Eq)]
148enum VarKind {
149    Node,
150    Edge,
151}
152
153fn validate_condition(cond: &mut Condition, is_edge: bool) -> Result<(), QueryError> {
154    match cond.property.as_str() {
155        "namespace" => Err(QueryError::Validation(
156            "namespace is set by CompileOptions, not query text".into(),
157        )),
158        "kind" if !is_edge => Ok(()),
159        "relation" if is_edge => {
160            if let ConditionValue::String(ref mut s) = cond.value {
161                let parsed = EdgeRelation::from_str(s)
162                    .map_err(|err| QueryError::Validation(err.to_string()))?;
163                *s = parsed.as_str().to_string();
164            }
165            Ok(())
166        }
167        _ => Ok(()),
168    }
169}
170
171#[cfg(test)]
172mod tests {
173    use super::*;
174    use crate::parsers::gql;
175
176    #[test]
177    fn node_kind_passes_through_unchanged() {
178        // Entity kinds are pack-agnostic strings — no normalization at the query layer.
179        let mut q = gql::parse("MATCH (a:paper)-[:introduced_by]->(b:concept) RETURN a").unwrap();
180        validate(&mut q).unwrap();
181        let kinds: Vec<_> = q
182            .pattern
183            .nodes()
184            .map(|n| n.kind.as_deref().unwrap_or(""))
185            .collect();
186        assert_eq!(kinds, vec!["paper", "concept"]);
187    }
188
189    #[test]
190    fn normalises_relation_case_and_hyphens() {
191        let mut q = gql::parse("MATCH (a)-[:Introduced_By]->(b) RETURN a").unwrap();
192        validate(&mut q).unwrap();
193        let rels: Vec<_> = q
194            .pattern
195            .edges()
196            .flat_map(|e| e.relations.iter().cloned())
197            .collect();
198        assert_eq!(rels, vec!["introduced_by".to_string()]);
199    }
200
201    #[test]
202    fn rejects_unknown_relation() {
203        let mut q = gql::parse("MATCH (a)-[:not_a_relation]->(b) RETURN a").unwrap();
204        let err = validate(&mut q).unwrap_err();
205        let msg = err.to_string();
206        assert!(msg.contains("not_a_relation"), "msg: {msg}");
207    }
208
209    #[test]
210    fn unknown_kind_passes_through() {
211        // Entity kinds are pack-agnostic strings — any string is accepted at the query layer.
212        let mut q = gql::parse("MATCH (a:gizmo)-[:extends]->(b) RETURN a").unwrap();
213        validate(&mut q).unwrap();
214    }
215
216    #[test]
217    fn clamps_depth_above_max() {
218        let mut q = gql::parse("MATCH (a)-[:extends*1..50]->(b) RETURN b").unwrap();
219        validate(&mut q).unwrap();
220        let edge = q.pattern.edges().next().unwrap();
221        assert_eq!(edge.max_hops, MAX_DEPTH);
222        assert!(edge.min_hops <= edge.max_hops);
223    }
224
225    #[test]
226    fn multi_relation_all_normalised() {
227        let mut q = gql::parse("MATCH (a)-[:Extends|VARIANT_OF]->(b) RETURN a").unwrap();
228        validate(&mut q).unwrap();
229        let edge = q.pattern.edges().next().unwrap();
230        assert_eq!(
231            edge.relations,
232            vec!["extends".to_string(), "variant_of".to_string()]
233        );
234    }
235
236    #[test]
237    fn rejects_namespace_in_where() {
238        let mut q =
239            gql::parse("MATCH (a:concept)-[:extends]->(b) WHERE a.namespace = 'other' RETURN a")
240                .unwrap();
241        let err = validate(&mut q).unwrap_err();
242        assert!(err.to_string().contains("namespace"), "msg: {err}");
243    }
244
245    #[test]
246    fn rejects_namespace_in_node_properties() {
247        let mut q =
248            gql::parse("MATCH (a:concept {namespace: 'other'})-[:extends]->(b) RETURN a").unwrap();
249        let err = validate(&mut q).unwrap_err();
250        assert!(err.to_string().contains("namespace"), "msg: {err}");
251    }
252
253    #[test]
254    fn rejects_unknown_relation_in_where() {
255        let mut q =
256            gql::parse("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'related_to' RETURN a")
257                .unwrap();
258        let err = validate(&mut q).unwrap_err();
259        assert!(err.to_string().contains("related_to"), "msg: {err}");
260    }
261
262    #[test]
263    fn unknown_kind_in_where_passes_through() {
264        // Entity kinds are pack-agnostic strings — any kind string is accepted.
265        let mut q =
266            gql::parse("MATCH (a)-[:extends]->(b) WHERE a.kind = 'gizmo' RETURN a").unwrap();
267        validate(&mut q).unwrap();
268        let val = match &q.where_clause[0].value {
269            ConditionValue::String(s) => s.clone(),
270            _ => panic!("expected string"),
271        };
272        assert_eq!(val, "gizmo");
273    }
274
275    #[test]
276    fn kind_in_where_passes_through_unchanged() {
277        // Pack-agnostic: 'paper' is not normalized to 'document'; strings pass through as-is.
278        let mut q =
279            gql::parse("MATCH (a)-[:extends]->(b) WHERE a.kind = 'paper' RETURN a").unwrap();
280        validate(&mut q).unwrap();
281        let val = match &q.where_clause[0].value {
282            ConditionValue::String(s) => s.clone(),
283            _ => panic!("expected string"),
284        };
285        assert_eq!(val, "paper");
286    }
287
288    #[test]
289    fn normalises_relation_alias_in_where() {
290        let mut q =
291            gql::parse("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'Introduced_By' RETURN a")
292                .unwrap();
293        validate(&mut q).unwrap();
294        let val = match &q.where_clause[0].value {
295            ConditionValue::String(s) => s.clone(),
296            _ => panic!("expected string"),
297        };
298        assert_eq!(val, "introduced_by");
299    }
300
301    #[test]
302    fn rejects_zero_hop_range_gql_wide() {
303        let mut q = gql::parse("MATCH (a)-[:extends*0..3]->(b) RETURN b").unwrap();
304        let err = validate(&mut q).unwrap_err();
305        assert!(
306            matches!(err, QueryError::Unsupported(_)),
307            "expected Unsupported, got {err:?}"
308        );
309    }
310
311    #[test]
312    fn rejects_zero_hop_range_gql_narrow() {
313        // *0..1 has max_hops=1 so has_variable_length() is false, but the
314        // fixed-length compiler also can't produce zero-hop rows — reject at
315        // validation regardless of compile path.
316        let mut q = gql::parse("MATCH (a)-[:extends*0..1]->(b) RETURN b").unwrap();
317        let err = validate(&mut q).unwrap_err();
318        assert!(
319            matches!(err, QueryError::Unsupported(_)),
320            "expected Unsupported, got {err:?}"
321        );
322    }
323
324    #[test]
325    fn rejects_zero_hop_sparql_explicit_range() {
326        use crate::parsers::sparql;
327        let mut q = sparql::parse("SELECT ?a ?b WHERE { ?a :extends{0,3} ?b . }").unwrap();
328        let err = validate(&mut q).unwrap_err();
329        assert!(
330            matches!(err, QueryError::Unsupported(_)),
331            "expected Unsupported, got {err:?}"
332        );
333    }
334
335    #[test]
336    fn rejects_repeated_node_var_cycle_gql() {
337        let mut q = gql::parse("MATCH (a)-[:extends]->(b)-[:variant_of]->(a) RETURN a").unwrap();
338        let err = validate(&mut q).unwrap_err();
339        assert!(
340            matches!(err, QueryError::Unsupported(_)),
341            "expected Unsupported, got {err:?}"
342        );
343    }
344
345    #[test]
346    fn rejects_repeated_node_var_self_reach_variable_length() {
347        let mut q = gql::parse("MATCH (a)-[:extends*1..3]->(a) RETURN a").unwrap();
348        let err = validate(&mut q).unwrap_err();
349        assert!(
350            matches!(err, QueryError::Unsupported(_)),
351            "expected Unsupported, got {err:?}"
352        );
353    }
354
355    #[test]
356    fn rejects_repeated_node_var_cycle_sparql() {
357        use crate::parsers::sparql;
358        let mut q =
359            sparql::parse("SELECT ?a WHERE { ?a :extends ?b . ?b :variant_of ?a . }").unwrap();
360        let err = validate(&mut q).unwrap_err();
361        assert!(
362            matches!(err, QueryError::Unsupported(_)),
363            "expected Unsupported, got {err:?}"
364        );
365    }
366
367    #[test]
368    fn rejects_repeated_edge_var() {
369        let mut q = gql::parse("MATCH (a)-[e:extends]->(b)-[e:variant_of]->(c) RETURN c").unwrap();
370        let err = validate(&mut q).unwrap_err();
371        assert!(
372            matches!(err, QueryError::Unsupported(_)),
373            "expected Unsupported, got {err:?}"
374        );
375    }
376
377    #[test]
378    fn rejects_inverted_range() {
379        // *3..1 is an inverted range — must error, not silently rewrite to *1..1.
380        let mut q = gql::parse("MATCH (a)-[:extends*3..1]->(b) RETURN b").unwrap();
381        let err = validate(&mut q).unwrap_err();
382        assert!(
383            matches!(err, QueryError::Validation(_)),
384            "expected Validation error, got {err:?}"
385        );
386    }
387
388    #[test]
389    fn rejects_min_hops_above_depth_cap() {
390        // min=50, max=100 — the lower bound exceeds MAX_DEPTH so the query
391        // can never produce results within our cap.
392        let mut q = gql::parse("MATCH (a)-[:extends*50..100]->(b) RETURN b").unwrap();
393        let err = validate(&mut q).unwrap_err();
394        assert!(
395            matches!(err, QueryError::Unsupported(_)),
396            "expected Unsupported, got {err:?}"
397        );
398    }
399
400    #[test]
401    fn clamps_max_but_keeps_satisfiable_min() {
402        // *2..50 — min 2 is satisfiable, max gets clamped to MAX_DEPTH.
403        let mut q = gql::parse("MATCH (a)-[:extends*2..50]->(b) RETURN b").unwrap();
404        validate(&mut q).unwrap();
405        let edge = q.pattern.edges().next().unwrap();
406        assert_eq!(edge.min_hops, 2);
407        assert_eq!(edge.max_hops, MAX_DEPTH);
408    }
409
410    #[test]
411    fn node_property_named_relation_allowed() {
412        // `relation` on a node variable is a free-form JSON property, not the
413        // edge relation column — taxonomy enforcement should not apply.
414        let mut q =
415            gql::parse("MATCH (a)-[:extends]->(b) WHERE a.relation = 'external' RETURN a").unwrap();
416        validate(&mut q).unwrap();
417        let val = match &q.where_clause[0].value {
418            ConditionValue::String(s) => s.clone(),
419            _ => panic!("expected string"),
420        };
421        assert_eq!(val, "external");
422    }
423
424    #[test]
425    fn edge_relation_still_validated() {
426        // `relation` on an edge variable must still go through EdgeRelation
427        // taxonomy validation.
428        let mut q = gql::parse("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'not_real' RETURN a")
429            .unwrap();
430        let err = validate(&mut q).unwrap_err();
431        assert!(err.to_string().contains("not_real"), "msg: {err}");
432    }
433}