Skip to main content

hyperi_rustlib/expression/
profile.rs

1// Project:   hyperi-rustlib
2// File:      src/expression/profile.rs
3// Purpose:   DFE expression profile -- allowed/restricted CEL functions
4// Language:  Rust
5//
6// License:   BUSL-1.1
7// Copyright: (c) 2026 HYPERI PTY LIMITED
8
9//! DFE expression profile -- allowed and restricted CEL functions.
10//!
11//! The DFE profile restricts CEL to a high-performance subset suitable
12//! for per-record evaluation at ingest/query time. Functions with
13//! unbounded or unpredictable cost are blocked by default but can be
14//! unlocked per-category via [`ProfileConfig`].
15
/// Functions every DFE expression may call, regardless of [`ProfileConfig`].
///
/// Entries are grouped by purpose; the order of the slice is part of the
/// public constant and is kept stable.
pub const ALLOWED_FUNCTIONS: &[&str] = &[
    // Substring / prefix / suffix tests on strings (bounded cost).
    "contains", "startsWith", "endsWith",
    // Length of a string or collection.
    "size",
    // Field-presence test.
    "has",
    // Conversions between primitive types.
    "int", "uint", "double", "string", "bool",
];
33
// Restricted function categories. Every category below is blocked by default
// and is unlocked by the matching `allow_*` flag on `ProfileConfig`.

/// Regex functions; unlocked by `ProfileConfig::allow_regex`.
pub const RESTRICTED_REGEX: &[&str] = &["matches"];

/// Collection-iteration macros; unlocked by `ProfileConfig::allow_iteration`.
pub const RESTRICTED_ITERATION: &[&str] = &[
    "map",
    "filter",
    "exists",
    "all",
    "exists_one",
];

/// Time constructors; unlocked by `ProfileConfig::allow_time`.
pub const RESTRICTED_TIME: &[&str] = &[
    "timestamp",
    "duration",
];
40
/// Every restricted function across all categories, i.e. what is blocked
/// when no category has been unlocked.
///
/// Listed category by category: regex, then iteration, then time.
pub const DISALLOWED_FUNCTIONS: &[&str] = &[
    // Regex
    "matches",
    // Iteration
    "map", "filter", "exists", "all", "exists_one",
    // Time
    "timestamp", "duration",
];
52
53/// Configuration for the DFE expression profile.
54///
55/// Each flag unlocks a category of restricted functions. All default
56/// to `false` (blocked). Set explicitly in application config to opt in.
57#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)]
58pub struct ProfileConfig {
59    /// Allow `matches()` (regex). Unbounded cost per record -- use only
60    /// when `contains()`/`startsWith()`/`endsWith()` are insufficient.
61    pub allow_regex: bool,
62    /// Allow `map()`, `filter()`, `exists()`, `all()`, `exists_one()`.
63    /// O(n) per collection element -- cost proportional to data size.
64    pub allow_iteration: bool,
65    /// Allow `timestamp()`, `duration()`. Excluded because ClickHouse
66    /// handles time natively -- rarely needed in CEL expressions.
67    pub allow_time: bool,
68}
69
70impl ProfileConfig {
71    /// Returns the set of functions that are currently blocked.
72    #[must_use]
73    pub fn blocked_functions(&self) -> Vec<&'static str> {
74        let mut blocked = Vec::new();
75        if !self.allow_regex {
76            blocked.extend_from_slice(RESTRICTED_REGEX);
77        }
78        if !self.allow_iteration {
79            blocked.extend_from_slice(RESTRICTED_ITERATION);
80        }
81        if !self.allow_time {
82            blocked.extend_from_slice(RESTRICTED_TIME);
83        }
84        blocked
85    }
86}
87
/// Names the profile scanner never reports even when they are followed by
/// `(`: CEL literals/keywords plus the always-allowed built-ins.
const SKIP_NAMES: &[&str] = &[
    // Literals and operators
    "true",
    "false",
    "null",
    "in",
    // Presence test
    "has",
    // Type conversions
    "int",
    "uint",
    "double",
    "string",
    "bool",
];
93
94/// Scan an expression for restricted function calls using default config.
95///
96/// Returns a list of error strings (empty if all function calls are
97/// within the DFE profile). Equivalent to `check_profile_with_config`
98/// with [`ProfileConfig::default()`] (all restrictions active).
99#[must_use]
100pub fn check_profile(expr: &str) -> Vec<String> {
101    check_profile_with_config(expr, &ProfileConfig::default())
102}
103
104/// Scan an expression for restricted function calls.
105///
106/// The scanner skips string literals to avoid false positives on
107/// function names that appear inside quoted values.
108///
109/// Returns a list of error strings (empty if compliant).
110#[must_use]
111pub fn check_profile_with_config(expr: &str, config: &ProfileConfig) -> Vec<String> {
112    let blocked = config.blocked_functions();
113    if blocked.is_empty() {
114        return Vec::new();
115    }
116
117    let mut errors = Vec::new();
118    let bytes = expr.as_bytes();
119    let len = bytes.len();
120    let mut i = 0;
121
122    while i < len {
123        // Skip string literals (double-quoted and single-quoted)
124        if bytes[i] == b'"' || bytes[i] == b'\'' {
125            i = skip_string_literal(bytes, i);
126            continue;
127        }
128
129        // Skip non-identifier characters
130        if !is_ident_start(bytes[i]) {
131            i += 1;
132            continue;
133        }
134
135        // Read identifier
136        let start = i;
137        while i < len && is_ident_char(bytes[i]) {
138            i += 1;
139        }
140        let name = &expr[start..i];
141
142        // Skip whitespace between identifier and potential `(`
143        let mut peek = i;
144        while peek < len && bytes[peek] == b' ' {
145            peek += 1;
146        }
147
148        // Check if followed by `(`
149        if peek < len && bytes[peek] == b'(' {
150            if SKIP_NAMES.contains(&name) {
151                continue;
152            }
153
154            if blocked.contains(&name) {
155                let reason = restriction_reason(name);
156                errors.push(format!(
157                    "Function '{name}()' is not allowed in the DFE expression profile. {reason}"
158                ));
159            }
160        }
161    }
162
163    errors
164}
165
/// Skip past a CEL string literal and return the index just after it.
///
/// `start` must point to the opening quote character (`"` or `'`); indexing
/// panics if `start` is out of bounds.
///
/// Recognised literal forms:
///
/// - regular (`"..."`, `'...'`): a backslash escapes the byte after it, so an
///   escaped quote does not end the literal;
/// - triple-quoted (`"""..."""`, `'''...'''`): only three consecutive quote
///   characters end the literal, so lone quotes inside it are content;
/// - raw (an `r`/`R` prefix, optionally after a `b`/`B` bytes prefix,
///   directly before the opening quote): backslashes are ordinary bytes, so
///   the literal ends at the first closing delimiter.
///
/// Returns the index after the closing delimiter, or `bytes.len()` when the
/// literal is unterminated.
fn skip_string_literal(bytes: &[u8], start: usize) -> usize {
    let len = bytes.len();
    let quote = bytes[start];
    let is_quote_at = |idx: usize| bytes.get(idx) == Some(&quote);

    let triple = is_quote_at(start + 1) && is_quote_at(start + 2);
    let raw = has_raw_prefix(bytes, start);

    let mut i = start + if triple { 3 } else { 1 };
    while i < len {
        let b = bytes[i];
        if b == b'\\' && !raw {
            // Skip the escaped byte as well. This may step past the end of
            // the input, in which case the loop exits and `len` is returned.
            i += 2;
            continue;
        }
        if b == quote {
            if !triple {
                return i + 1;
            }
            if is_quote_at(i + 1) && is_quote_at(i + 2) {
                return i + 3;
            }
        }
        i += 1;
    }
    // Unterminated string -- return end of input
    len
}

/// Report whether the quote at `start` opens a raw string literal.
///
/// True when the byte before the quote is `r`/`R` and that prefix is a token
/// of its own: it sits at the start of the input, follows a non-identifier
/// byte, or follows a stand-alone `b`/`B` bytes prefix. An `r` that merely
/// ends a longer identifier does not count.
fn has_raw_prefix(bytes: &[u8], start: usize) -> bool {
    let is_ident = |b: u8| b.is_ascii_alphanumeric() || b == b'_';
    let before = |back: usize| start.checked_sub(back).map(|idx| bytes[idx]);

    if !matches!(before(1), Some(b'r' | b'R')) {
        return false;
    }
    match before(2) {
        None => true,
        Some(b'b' | b'B') => before(3).map_or(true, |b| !is_ident(b)),
        Some(b) => !is_ident(b),
    }
}
187
/// Explain why `name` is restricted and which config flag unlocks it.
///
/// Names outside the known restricted categories get a generic message.
fn restriction_reason(name: &str) -> &'static str {
    const REGEX_REASON: &str = "Regex has unbounded cost per record. Use contains()/startsWith()/endsWith() instead, or set allow_regex: true in expression config.";
    const ITERATION_REASON: &str = "Per-element iteration has O(n) cost proportional to collection size. Set allow_iteration: true in expression config to permit.";
    const TIME_REASON: &str = "Time functions excluded -- ClickHouse handles time natively. Set allow_time: true in expression config to permit.";
    const GENERIC_REASON: &str = "Restricted by DFE expression profile.";

    if name == "matches" {
        REGEX_REASON
    } else if ["map", "filter", "exists", "all", "exists_one"].contains(&name) {
        ITERATION_REASON
    } else if ["timestamp", "duration"].contains(&name) {
        TIME_REASON
    } else {
        GENERIC_REASON
    }
}
202
/// True when `b` may begin an identifier: an ASCII letter or `_`.
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
206
/// True when `b` may continue an identifier: an ASCII letter, digit, or `_`.
fn is_ident_char(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
210
#[cfg(test)]
mod tests {
    use super::*;

    // ── Assertion helpers ───────────────────────────────────────

    /// Asserts that `expr` yields no errors under the default profile.
    fn assert_clean(expr: &str) {
        assert!(check_profile(expr).is_empty());
    }

    /// Asserts that `expr` yields no errors under `config`.
    fn assert_clean_with(expr: &str, config: &ProfileConfig) {
        assert!(check_profile_with_config(expr, config).is_empty());
    }

    /// Asserts that `errors` holds exactly one message and that the message
    /// mentions every entry of `needles`.
    fn assert_single_error(errors: &[String], needles: &[&str]) {
        assert_eq!(errors.len(), 1);
        for &needle in needles {
            assert!(errors[0].contains(needle));
        }
    }

    /// Config with only the regex category unlocked.
    fn regex_unlocked() -> ProfileConfig {
        ProfileConfig {
            allow_regex: true,
            ..Default::default()
        }
    }

    // ── Default config (all restricted) ─────────────────────────

    #[test]
    fn allowed_function_passes() {
        assert_clean(r#"msg.contains("error")"#);
    }

    #[test]
    fn starts_with_passes() {
        assert_clean(r#"path.startsWith("/api/")"#);
    }

    #[test]
    fn ends_with_passes() {
        assert_clean(r#"file.endsWith(".log")"#);
    }

    #[test]
    fn matches_blocked_by_default() {
        let found = check_profile(r#"name.matches("^web-[0-9]+$")"#);
        assert_single_error(&found, &["matches()", "allow_regex"]);
    }

    #[test]
    fn disallowed_map_rejected() {
        let found = check_profile("[1,2,3].map(x, x * 2)");
        assert_single_error(&found, &["map()"]);
    }

    #[test]
    fn disallowed_filter_rejected() {
        let found = check_profile("[1,2,3].filter(x, x > 1)");
        assert_single_error(&found, &["filter()"]);
    }

    #[test]
    fn disallowed_timestamp_rejected() {
        let found = check_profile(r#"timestamp("2024-01-01T00:00:00Z")"#);
        assert_single_error(&found, &["timestamp()"]);
    }

    #[test]
    fn disallowed_duration_rejected() {
        let found = check_profile(r#"duration("1h")"#);
        assert_single_error(&found, &["duration()"]);
    }

    #[test]
    fn keywords_skipped() {
        for expr in ["has(user.name)", "int(x) > 10", "bool(y)"] {
            assert_clean(expr);
        }
    }

    #[test]
    fn plain_comparison_passes() {
        assert_clean(r#"severity == "critical""#);
    }

    #[test]
    fn compound_expression_passes() {
        assert_clean(r#"severity == "critical" && amount > 10000"#);
    }

    // ── String literal false-positive prevention ────────────────

    #[test]
    fn function_name_inside_string_not_flagged() {
        // The word "filter" is quoted data here, not a call.
        assert_clean(r#"msg.contains("filter")"#);
    }

    #[test]
    fn function_name_inside_string_with_parens_not_flagged() {
        // Even a call-shaped "map(" is ignored while inside a string.
        assert_clean(r#"msg.contains("map(x)")"#);
    }

    #[test]
    fn matches_inside_string_not_flagged() {
        assert_clean(r#"msg.contains("matches")"#);
    }

    #[test]
    fn timestamp_inside_string_not_flagged() {
        assert_clean(r#"label == "timestamp""#);
    }

    #[test]
    fn escaped_quote_inside_string_handled() {
        // The escaped quote in "filter\"(" must not terminate the literal.
        assert_clean(r#"msg.contains("filter\"(")"#);
    }

    #[test]
    fn single_quoted_string_handled() {
        assert_clean("msg.contains('filter')");
    }

    #[test]
    fn real_call_after_string_still_caught() {
        // A genuine map() call following a harmless string literal.
        let found = check_profile(r#""ok" + items.map(x, x)"#);
        assert_single_error(&found, &["map()"]);
    }

    // ── Config overrides ────────────────────────────────────────

    #[test]
    fn matches_allowed_with_regex_config() {
        assert_clean_with(r#"name.matches("^web-[0-9]+$")"#, &regex_unlocked());
    }

    #[test]
    fn map_still_blocked_with_regex_config() {
        let found = check_profile_with_config("[1,2].map(x, x)", &regex_unlocked());
        assert_single_error(&found, &["map()"]);
    }

    #[test]
    fn iteration_allowed_with_config() {
        let iteration_unlocked = ProfileConfig {
            allow_iteration: true,
            ..Default::default()
        };
        for expr in [
            "[1,2].map(x, x * 2)",
            "[1,2].filter(x, x > 1)",
            "[1,2].exists(x, x > 1)",
        ] {
            assert_clean_with(expr, &iteration_unlocked);
        }
    }

    #[test]
    fn time_allowed_with_config() {
        let time_unlocked = ProfileConfig {
            allow_time: true,
            ..Default::default()
        };
        assert_clean_with(r#"timestamp("2024-01-01T00:00:00Z")"#, &time_unlocked);
        assert_clean_with(r#"duration("1h")"#, &time_unlocked);
    }

    #[test]
    fn all_restrictions_lifted() {
        let everything_unlocked = ProfileConfig {
            allow_regex: true,
            allow_iteration: true,
            allow_time: true,
        };
        assert!(everything_unlocked.blocked_functions().is_empty());
        assert_clean_with(
            r#"name.matches("x") && [1].map(x, x)"#,
            &everything_unlocked,
        );
    }

    // ── Edge cases ──────────────────────────────────────────────

    #[test]
    fn identifier_not_followed_by_paren_is_fine() {
        // Here "filter" is a plain variable, not a call.
        assert_clean("filter > 10");
    }

    #[test]
    fn identifier_with_space_before_paren() {
        let found = check_profile("[1,2].map (x, x)");
        assert_single_error(&found, &["map()"]);
    }

    #[test]
    fn empty_expression() {
        assert_clean("");
    }

    #[test]
    fn whitespace_only() {
        assert_clean("   ");
    }

    #[test]
    fn multiple_violations_reported() {
        let found = check_profile("[1].map(x, x).filter(y, y > 0)");
        assert_eq!(found.len(), 2);
    }
}