Skip to main content

khive_query/
validate.rs

1//! AST validation per ADR-008 §Validation Rules.
2//!
3//! `validate` normalises an AST in place and rejects queries that violate the
4//! closed taxonomies or attempt to subvert namespace scoping:
5//!
6//! 1. **Edge relations** must parse to one of the 13 canonical [`EdgeRelation`]
7//!    variants (ADR-002). Aliases and case differences are normalised to the
8//!    canonical snake_case form stored in the database. Applies to edge
9//!    patterns *and* `WHERE e.relation = '…'` constraints.
10//! 2. **Node kinds** must parse to one of the 6 [`EntityKind`] variants
11//!    (ADR-001). Common aliases (`paper` → `document`, `benchmark` → `dataset`)
12//!    are normalised. Applies to node labels *and* `WHERE a.kind = '…'`
13//!    constraints.
14//! 3. **Namespace scoping is a trusted parameter only.** Queries must not name
15//!    `namespace` in node property maps or `WHERE` conditions — the only valid
16//!    source of namespace filtering is `CompileOptions::scopes`. This matches
17//!    ADR-008 §Validation: "never trust query strings to set namespaces."
18//! 4. **Traversal depth** is capped at [`MAX_DEPTH`] (10 hops). Requests above
19//!    the cap are clamped, not rejected — this matches the cap the compiler
20//!    applies when generating recursive CTEs.
21
22use std::collections::HashSet;
23use std::str::FromStr;
24
25use khive_types::{EdgeRelation, EntityKind};
26
27use crate::ast::{Condition, ConditionValue, GqlQuery, PatternElement};
28use crate::error::QueryError;
29
30/// Maximum traversal depth allowed by the query layer (ADR-008 §Validation).
31pub const MAX_DEPTH: usize = 10;
32
33/// Validate and normalise an AST in place.
34///
35/// On success, every kind / relation string in the AST is replaced with its
36/// canonical lowercase form so the compiler can emit literal SQL parameters
37/// that match the values written by `khive-db`.
38pub fn validate(query: &mut GqlQuery) -> Result<(), QueryError> {
39    // Pattern variables are bindings — the same variable name appearing twice
40    // would mean "same node/edge" and require alias-equality predicates in
41    // SQL. Until that is implemented, reject repeated bindings explicitly so
42    // cycles and self-reachability don't silently compile to wrong results.
43    let mut seen_node_vars: HashSet<&str> = HashSet::new();
44    let mut seen_edge_vars: HashSet<&str> = HashSet::new();
45    for element in &query.pattern.elements {
46        match element {
47            PatternElement::Node(node) => {
48                if let Some(var) = node.variable.as_deref() {
49                    if !seen_node_vars.insert(var) {
50                        return Err(QueryError::Unsupported(format!(
51                            "repeated node variable '{var}' (cycle / self-reachability \
52                             requires alias-equality predicates not yet implemented)"
53                        )));
54                    }
55                }
56            }
57            PatternElement::Edge(edge) => {
58                if let Some(var) = edge.variable.as_deref() {
59                    if !seen_edge_vars.insert(var) {
60                        return Err(QueryError::Unsupported(format!(
61                            "repeated edge variable '{var}' not supported"
62                        )));
63                    }
64                }
65            }
66        }
67    }
68
69    for element in &mut query.pattern.elements {
70        match element {
71            PatternElement::Node(node) => {
72                if let Some(kind) = node.kind.as_mut() {
73                    let parsed = EntityKind::from_str(kind).map_err(QueryError::Validation)?;
74                    *kind = parsed.name().to_string();
75                }
76                if node.properties.contains_key("namespace") {
77                    return Err(QueryError::Validation(
78                        "namespace is set by CompileOptions, not query text".into(),
79                    ));
80                }
81            }
82            PatternElement::Edge(edge) => {
83                for relation in edge.relations.iter_mut() {
84                    let parsed = EdgeRelation::from_str(relation)
85                        .map_err(|err| QueryError::Validation(err.to_string()))?;
86                    *relation = parsed.as_str().to_string();
87                }
88                if edge.min_hops == 0 {
89                    return Err(QueryError::Unsupported(
90                        "zero-hop ranges (min_hops = 0) not yet supported; \
91                         use a minimum of 1 hop"
92                            .into(),
93                    ));
94                }
95                // Reject inverted ranges before any clamping — silently
96                // rewriting *3..1 to *1..1 changes query semantics.
97                if edge.min_hops > edge.max_hops {
98                    return Err(QueryError::Validation(format!(
99                        "invalid hop range: min {} > max {}",
100                        edge.min_hops, edge.max_hops
101                    )));
102                }
103                // If the minimum already exceeds our depth cap, the query
104                // can never produce results — reject rather than silently
105                // returning an empty set from a clamped range.
106                if edge.min_hops > MAX_DEPTH {
107                    return Err(QueryError::Unsupported(format!(
108                        "minimum hop count {} exceeds depth cap {}",
109                        edge.min_hops, MAX_DEPTH
110                    )));
111                }
112                // Clamp max_hops to the depth cap — the lower bound is
113                // still satisfiable, so this only narrows the search.
114                if edge.max_hops > MAX_DEPTH {
115                    edge.max_hops = MAX_DEPTH;
116                }
117            }
118        }
119    }
120
121    // Build variable → kind map so condition validation is context-aware.
122    // `kind` and `relation` only get taxonomy enforcement on the correct
123    // variable type (node vs edge). On the other type, they're treated as
124    // ordinary JSON property keys.
125    let mut var_kinds: std::collections::HashMap<&str, VarKind> = std::collections::HashMap::new();
126    for element in &query.pattern.elements {
127        match element {
128            PatternElement::Node(n) => {
129                if let Some(v) = n.variable.as_deref() {
130                    var_kinds.insert(v, VarKind::Node);
131                }
132            }
133            PatternElement::Edge(e) => {
134                if let Some(v) = e.variable.as_deref() {
135                    var_kinds.insert(v, VarKind::Edge);
136                }
137            }
138        }
139    }
140
141    for cond in query.where_clause.iter_mut() {
142        let is_edge = var_kinds
143            .get(cond.variable.as_str())
144            .copied()
145            .unwrap_or(VarKind::Node)
146            == VarKind::Edge;
147        validate_condition(cond, is_edge)?;
148    }
149
150    Ok(())
151}
152
153#[derive(Clone, Copy, PartialEq, Eq)]
154enum VarKind {
155    Node,
156    Edge,
157}
158
159fn validate_condition(cond: &mut Condition, is_edge: bool) -> Result<(), QueryError> {
160    match cond.property.as_str() {
161        "namespace" => Err(QueryError::Validation(
162            "namespace is set by CompileOptions, not query text".into(),
163        )),
164        "kind" if !is_edge => {
165            if let ConditionValue::String(ref mut s) = cond.value {
166                let parsed = EntityKind::from_str(s).map_err(QueryError::Validation)?;
167                *s = parsed.name().to_string();
168            }
169            Ok(())
170        }
171        "relation" if is_edge => {
172            if let ConditionValue::String(ref mut s) = cond.value {
173                let parsed = EdgeRelation::from_str(s)
174                    .map_err(|err| QueryError::Validation(err.to_string()))?;
175                *s = parsed.as_str().to_string();
176            }
177            Ok(())
178        }
179        _ => Ok(()),
180    }
181}
182
183#[cfg(test)]
184mod tests {
185    use super::*;
186    use crate::parsers::gql;
187
188    #[test]
189    fn normalises_entity_kind_aliases() {
190        let mut q = gql::parse("MATCH (a:paper)-[:introduced_by]->(b:concept) RETURN a").unwrap();
191        validate(&mut q).unwrap();
192        let kinds: Vec<_> = q
193            .pattern
194            .nodes()
195            .map(|n| n.kind.as_deref().unwrap_or(""))
196            .collect();
197        assert_eq!(kinds, vec!["document", "concept"]);
198    }
199
200    #[test]
201    fn normalises_relation_case_and_hyphens() {
202        let mut q = gql::parse("MATCH (a)-[:Introduced_By]->(b) RETURN a").unwrap();
203        validate(&mut q).unwrap();
204        let rels: Vec<_> = q
205            .pattern
206            .edges()
207            .flat_map(|e| e.relations.iter().cloned())
208            .collect();
209        assert_eq!(rels, vec!["introduced_by".to_string()]);
210    }
211
212    #[test]
213    fn rejects_unknown_relation() {
214        let mut q = gql::parse("MATCH (a)-[:not_a_relation]->(b) RETURN a").unwrap();
215        let err = validate(&mut q).unwrap_err();
216        let msg = err.to_string();
217        assert!(msg.contains("not_a_relation"), "msg: {msg}");
218    }
219
220    #[test]
221    fn rejects_unknown_kind() {
222        let mut q = gql::parse("MATCH (a:gizmo)-[:extends]->(b) RETURN a").unwrap();
223        let err = validate(&mut q).unwrap_err();
224        let msg = err.to_string();
225        assert!(msg.contains("gizmo"), "msg: {msg}");
226    }
227
228    #[test]
229    fn clamps_depth_above_max() {
230        let mut q = gql::parse("MATCH (a)-[:extends*1..50]->(b) RETURN b").unwrap();
231        validate(&mut q).unwrap();
232        let edge = q.pattern.edges().next().unwrap();
233        assert_eq!(edge.max_hops, MAX_DEPTH);
234        assert!(edge.min_hops <= edge.max_hops);
235    }
236
237    #[test]
238    fn multi_relation_all_normalised() {
239        let mut q = gql::parse("MATCH (a)-[:Extends|VARIANT_OF]->(b) RETURN a").unwrap();
240        validate(&mut q).unwrap();
241        let edge = q.pattern.edges().next().unwrap();
242        assert_eq!(
243            edge.relations,
244            vec!["extends".to_string(), "variant_of".to_string()]
245        );
246    }
247
248    #[test]
249    fn rejects_namespace_in_where() {
250        let mut q =
251            gql::parse("MATCH (a:concept)-[:extends]->(b) WHERE a.namespace = 'other' RETURN a")
252                .unwrap();
253        let err = validate(&mut q).unwrap_err();
254        assert!(err.to_string().contains("namespace"), "msg: {err}");
255    }
256
257    #[test]
258    fn rejects_namespace_in_node_properties() {
259        let mut q =
260            gql::parse("MATCH (a:concept {namespace: 'other'})-[:extends]->(b) RETURN a").unwrap();
261        let err = validate(&mut q).unwrap_err();
262        assert!(err.to_string().contains("namespace"), "msg: {err}");
263    }
264
265    #[test]
266    fn rejects_unknown_relation_in_where() {
267        let mut q =
268            gql::parse("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'related_to' RETURN a")
269                .unwrap();
270        let err = validate(&mut q).unwrap_err();
271        assert!(err.to_string().contains("related_to"), "msg: {err}");
272    }
273
274    #[test]
275    fn rejects_unknown_kind_in_where() {
276        let mut q =
277            gql::parse("MATCH (a)-[:extends]->(b) WHERE a.kind = 'gizmo' RETURN a").unwrap();
278        let err = validate(&mut q).unwrap_err();
279        assert!(err.to_string().contains("gizmo"), "msg: {err}");
280    }
281
282    #[test]
283    fn normalises_kind_alias_in_where() {
284        let mut q =
285            gql::parse("MATCH (a)-[:extends]->(b) WHERE a.kind = 'paper' RETURN a").unwrap();
286        validate(&mut q).unwrap();
287        let val = match &q.where_clause[0].value {
288            ConditionValue::String(s) => s.clone(),
289            _ => panic!("expected string"),
290        };
291        assert_eq!(val, "document");
292    }
293
294    #[test]
295    fn normalises_relation_alias_in_where() {
296        let mut q =
297            gql::parse("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'Introduced_By' RETURN a")
298                .unwrap();
299        validate(&mut q).unwrap();
300        let val = match &q.where_clause[0].value {
301            ConditionValue::String(s) => s.clone(),
302            _ => panic!("expected string"),
303        };
304        assert_eq!(val, "introduced_by");
305    }
306
307    #[test]
308    fn rejects_zero_hop_range_gql_wide() {
309        let mut q = gql::parse("MATCH (a)-[:extends*0..3]->(b) RETURN b").unwrap();
310        let err = validate(&mut q).unwrap_err();
311        assert!(
312            matches!(err, QueryError::Unsupported(_)),
313            "expected Unsupported, got {err:?}"
314        );
315    }
316
317    #[test]
318    fn rejects_zero_hop_range_gql_narrow() {
319        // *0..1 has max_hops=1 so has_variable_length() is false, but the
320        // fixed-length compiler also can't produce zero-hop rows — reject at
321        // validation regardless of compile path.
322        let mut q = gql::parse("MATCH (a)-[:extends*0..1]->(b) RETURN b").unwrap();
323        let err = validate(&mut q).unwrap_err();
324        assert!(
325            matches!(err, QueryError::Unsupported(_)),
326            "expected Unsupported, got {err:?}"
327        );
328    }
329
330    #[test]
331    fn rejects_zero_hop_sparql_explicit_range() {
332        use crate::parsers::sparql;
333        let mut q = sparql::parse("SELECT ?a ?b WHERE { ?a :extends{0,3} ?b . }").unwrap();
334        let err = validate(&mut q).unwrap_err();
335        assert!(
336            matches!(err, QueryError::Unsupported(_)),
337            "expected Unsupported, got {err:?}"
338        );
339    }
340
341    #[test]
342    fn rejects_repeated_node_var_cycle_gql() {
343        let mut q = gql::parse("MATCH (a)-[:extends]->(b)-[:variant_of]->(a) RETURN a").unwrap();
344        let err = validate(&mut q).unwrap_err();
345        assert!(
346            matches!(err, QueryError::Unsupported(_)),
347            "expected Unsupported, got {err:?}"
348        );
349    }
350
351    #[test]
352    fn rejects_repeated_node_var_self_reach_variable_length() {
353        let mut q = gql::parse("MATCH (a)-[:extends*1..3]->(a) RETURN a").unwrap();
354        let err = validate(&mut q).unwrap_err();
355        assert!(
356            matches!(err, QueryError::Unsupported(_)),
357            "expected Unsupported, got {err:?}"
358        );
359    }
360
361    #[test]
362    fn rejects_repeated_node_var_cycle_sparql() {
363        use crate::parsers::sparql;
364        let mut q =
365            sparql::parse("SELECT ?a WHERE { ?a :extends ?b . ?b :variant_of ?a . }").unwrap();
366        let err = validate(&mut q).unwrap_err();
367        assert!(
368            matches!(err, QueryError::Unsupported(_)),
369            "expected Unsupported, got {err:?}"
370        );
371    }
372
373    #[test]
374    fn rejects_repeated_edge_var() {
375        let mut q = gql::parse("MATCH (a)-[e:extends]->(b)-[e:variant_of]->(c) RETURN c").unwrap();
376        let err = validate(&mut q).unwrap_err();
377        assert!(
378            matches!(err, QueryError::Unsupported(_)),
379            "expected Unsupported, got {err:?}"
380        );
381    }
382
383    #[test]
384    fn rejects_inverted_range() {
385        // *3..1 is an inverted range — must error, not silently rewrite to *1..1.
386        let mut q = gql::parse("MATCH (a)-[:extends*3..1]->(b) RETURN b").unwrap();
387        let err = validate(&mut q).unwrap_err();
388        assert!(
389            matches!(err, QueryError::Validation(_)),
390            "expected Validation error, got {err:?}"
391        );
392    }
393
394    #[test]
395    fn rejects_min_hops_above_depth_cap() {
396        // min=50, max=100 — the lower bound exceeds MAX_DEPTH so the query
397        // can never produce results within our cap.
398        let mut q = gql::parse("MATCH (a)-[:extends*50..100]->(b) RETURN b").unwrap();
399        let err = validate(&mut q).unwrap_err();
400        assert!(
401            matches!(err, QueryError::Unsupported(_)),
402            "expected Unsupported, got {err:?}"
403        );
404    }
405
406    #[test]
407    fn clamps_max_but_keeps_satisfiable_min() {
408        // *2..50 — min 2 is satisfiable, max gets clamped to MAX_DEPTH.
409        let mut q = gql::parse("MATCH (a)-[:extends*2..50]->(b) RETURN b").unwrap();
410        validate(&mut q).unwrap();
411        let edge = q.pattern.edges().next().unwrap();
412        assert_eq!(edge.min_hops, 2);
413        assert_eq!(edge.max_hops, MAX_DEPTH);
414    }
415
416    #[test]
417    fn node_property_named_relation_allowed() {
418        // `relation` on a node variable is a free-form JSON property, not the
419        // edge relation column — taxonomy enforcement should not apply.
420        let mut q =
421            gql::parse("MATCH (a)-[:extends]->(b) WHERE a.relation = 'external' RETURN a").unwrap();
422        validate(&mut q).unwrap();
423        let val = match &q.where_clause[0].value {
424            ConditionValue::String(s) => s.clone(),
425            _ => panic!("expected string"),
426        };
427        assert_eq!(val, "external");
428    }
429
430    #[test]
431    fn edge_relation_still_validated() {
432        // `relation` on an edge variable must still go through EdgeRelation
433        // taxonomy validation.
434        let mut q = gql::parse("MATCH (a)-[e:extends]->(b) WHERE e.relation = 'not_real' RETURN a")
435            .unwrap();
436        let err = validate(&mut q).unwrap_err();
437        assert!(err.to_string().contains("not_real"), "msg: {err}");
438    }
439}