// khive_query/validate.rs

//! AST validation: pattern shape, variable bindings, relation normalization,
//! namespace guard, depth cap.
2
3use std::collections::HashSet;
4use std::str::FromStr;
5
6use khive_types::EdgeRelation;
7
8use crate::ast::{Condition, ConditionValue, GqlQuery, PatternElement};
9use crate::error::QueryError;
10
/// The complete set of accepted synthetic `observed_as_*` relation names.
///
/// Edge relations that start with `observed_as_` bypass `EdgeRelation`
/// parsing during validation, so this list is the only gate for them: a
/// string with that prefix that is not listed here is rejected.
const SYNTHETIC_RELATIONS: &[&str] = &[
    "observed_as_candidate",
    "observed_as_selected",
    "observed_as_target",
    "observed_as_signal",
];
18
/// Upper bound on the hop count of a single edge pattern.
///
/// Validation rejects any edge whose `min_hops` or `max_hops` is above this
/// value; ranges are never silently clamped to it.
pub const MAX_DEPTH: usize = 10;
21
22/// Validate and normalise an AST in place.
23pub fn validate(query: &mut GqlQuery) -> Result<(), QueryError> {
24    validate_with_warnings(query).map(|_| ())
25}
26
27/// Validate that a pattern alternates Node/Edge/Node correctly.
28pub fn validate_pattern_shape(elements: &[PatternElement]) -> Result<(), QueryError> {
29    if elements.is_empty() {
30        // Empty pattern: caught separately by the compiler as "empty pattern".
31        return Ok(());
32    }
33    if elements.len().is_multiple_of(2) {
34        return Err(QueryError::Validation(
35            "pattern must alternate Node, Edge, Node, … (even element count is invalid)".into(),
36        ));
37    }
38    for (i, element) in elements.iter().enumerate() {
39        match (i % 2, element) {
40            (0, PatternElement::Node(_)) => {}
41            (1, PatternElement::Edge(_)) => {}
42            _ => {
43                return Err(QueryError::Validation(
44                    "pattern must alternate Node, Edge, Node, … (wrong element type at position)"
45                        .into(),
46                ))
47            }
48        }
49    }
50    Ok(())
51}
52
53/// Validate and normalise an AST in place, returning any warnings generated.
54pub fn validate_with_warnings(query: &mut GqlQuery) -> Result<Vec<String>, QueryError> {
55    let warnings: Vec<String> = Vec::new();
56
57    // Structural shape check: must alternate Node/Edge/Node.
58    validate_pattern_shape(&query.pattern.elements)?;
59
60    // Pattern variables are bindings — the same variable name appearing twice
61    // would mean "same node/edge" and require alias-equality predicates in
62    // SQL. Until that is implemented, reject repeated bindings explicitly so
63    // cycles and self-reachability don't silently compile to wrong results.
64    let mut seen_node_vars: HashSet<&str> = HashSet::new();
65    let mut seen_edge_vars: HashSet<&str> = HashSet::new();
66    for element in &query.pattern.elements {
67        match element {
68            PatternElement::Node(node) => {
69                if let Some(var) = node.variable.as_deref() {
70                    if !seen_node_vars.insert(var) {
71                        return Err(QueryError::Unsupported(format!(
72                            "repeated node variable '{var}' (cycle / self-reachability \
73                             requires alias-equality predicates not yet implemented)"
74                        )));
75                    }
76                }
77            }
78            PatternElement::Edge(edge) => {
79                if let Some(var) = edge.variable.as_deref() {
80                    if !seen_edge_vars.insert(var) {
81                        return Err(QueryError::Unsupported(format!(
82                            "repeated edge variable '{var}' not supported"
83                        )));
84                    }
85                }
86            }
87        }
88    }
89
90    for element in &mut query.pattern.elements {
91        match element {
92            PatternElement::Node(node) => {
93                if node.properties.contains_key("namespace") {
94                    return Err(QueryError::Validation(
95                        "namespace is set by CompileOptions, not query text".into(),
96                    ));
97                }
98            }
99            PatternElement::Edge(edge) => {
100                for relation in edge.relations.iter_mut() {
101                    // Synthetic observed_as_* relations do not exist in the
102                    // closed EdgeRelation enum — skip taxonomy validation and
103                    // leave the string unchanged.  The SQL compiler handles them
104                    // via the event_observations join path.
105                    // Only the four known synthetic relations are valid; an unknown
106                    // observed_as_* string must be rejected (closes the bypass that
107                    // allowed arbitrary observed_as_bogus strings to compile as
108                    // canonical graph_edges queries).
109                    if relation.starts_with("observed_as_") {
110                        if !SYNTHETIC_RELATIONS.contains(&relation.as_str()) {
111                            return Err(QueryError::Validation(format!(
112                                "unknown synthetic relation '{relation}'; valid synthetic relations: {}",
113                                SYNTHETIC_RELATIONS.join(", ")
114                            )));
115                        }
116                        continue;
117                    }
118                    let parsed = EdgeRelation::from_str(relation)
119                        .map_err(|err| QueryError::Validation(err.to_string()))?;
120                    *relation = parsed.as_str().to_string();
121                }
122                if edge.min_hops == 0 {
123                    return Err(QueryError::Unsupported(
124                        "zero-hop ranges (min_hops = 0) not yet supported; \
125                         use a minimum of 1 hop"
126                            .into(),
127                    ));
128                }
129                // Reject inverted ranges before any clamping — silently
130                // rewriting *3..1 to *1..1 changes query semantics.
131                if edge.min_hops > edge.max_hops {
132                    return Err(QueryError::Validation(format!(
133                        "invalid hop range: min {} > max {}",
134                        edge.min_hops, edge.max_hops
135                    )));
136                }
137                // If the minimum already exceeds our depth cap, the query
138                // can never produce results — reject rather than silently
139                // returning an empty set from a clamped range.
140                if edge.min_hops > MAX_DEPTH {
141                    return Err(QueryError::Unsupported(format!(
142                        "minimum hop count {} exceeds depth cap {}",
143                        edge.min_hops, MAX_DEPTH
144                    )));
145                }
146                // Reject max_hops above the depth cap.
147                if edge.max_hops > MAX_DEPTH {
148                    return Err(QueryError::InvalidInput(format!(
149                        "max_hops {} exceeds the depth cap of {}; reduce the range or use a smaller bound",
150                        edge.max_hops, MAX_DEPTH
151                    )));
152                }
153            }
154        }
155    }
156
157    // Build variable → kind map so condition validation is context-aware.
158    // `kind` and `relation` only get taxonomy enforcement on the correct
159    // variable type (node vs edge). On the other type, they're treated as
160    // ordinary JSON property keys.
161    let mut var_kinds: std::collections::HashMap<&str, VarKind> = std::collections::HashMap::new();
162    for element in &query.pattern.elements {
163        match element {
164            PatternElement::Node(n) => {
165                if let Some(v) = n.variable.as_deref() {
166                    var_kinds.insert(v, VarKind::Node);
167                }
168            }
169            PatternElement::Edge(e) => {
170                if let Some(v) = e.variable.as_deref() {
171                    var_kinds.insert(v, VarKind::Edge);
172                }
173            }
174        }
175    }
176
177    // Walk all leaf conditions in the WHERE expression tree.
178    let mut validate_err: Option<QueryError> = None;
179    query.where_clause.for_each_condition_mut(&mut |cond| {
180        if validate_err.is_some() {
181            return;
182        }
183        let is_edge = var_kinds
184            .get(cond.variable.as_str())
185            .copied()
186            .unwrap_or(VarKind::Node)
187            == VarKind::Edge;
188        if let Err(e) = validate_condition(cond, is_edge) {
189            validate_err = Some(e);
190        }
191    });
192    if let Some(e) = validate_err {
193        return Err(e);
194    }
195
196    Ok(warnings)
197}
198
/// Which kind of pattern element a variable name is bound to.
///
/// Used while validating WHERE conditions to decide whether a condition
/// targets a node or an edge.
#[derive(Clone, Copy, PartialEq, Eq)]
enum VarKind {
    /// The variable names a node pattern.
    Node,
    /// The variable names an edge pattern.
    Edge,
}
204
205fn validate_condition(cond: &mut Condition, is_edge: bool) -> Result<(), QueryError> {
206    match cond.property.as_str() {
207        "namespace" => Err(QueryError::Validation(
208            "namespace is set by CompileOptions, not query text".into(),
209        )),
210        "kind" if !is_edge => Ok(()),
211        "relation" if is_edge => {
212            if let ConditionValue::String(ref mut s) = cond.value {
213                let parsed = EdgeRelation::from_str(s)
214                    .map_err(|err| QueryError::Validation(err.to_string()))?;
215                *s = parsed.as_str().to_string();
216            }
217            Ok(())
218        }
219        _ => Ok(()),
220    }
221}
222
223#[cfg(test)]
224#[path = "validate_tests.rs"]
225mod tests;