Skip to main content

moltendb_core/
validation.rs

1// ─── validation.rs ────────────────────────────────────────────────────────────
2// This file implements input validation for all incoming HTTP requests.
3//
4// Why validate?
5//   Without validation, a malicious client could send:
6//     - Collection names like "../etc/passwd" (path traversal)
7//     - Deeply nested JSON like { a: { b: { c: ... } } } (stack overflow)
8//     - Payloads of hundreds of megabytes (memory exhaustion)
9//     - Reserved names like "admin" or "system" (privilege escalation)
10//     - Thousands of keys in one request (CPU exhaustion)
11//
12// All validation happens in validate_request(), which is called at the top
13// of every process_* function in handlers.rs before any database work is done.
14//
15// Validation rules:
16//   Collection names: 1–64 chars, alphanumeric + _ and - only, not reserved.
17//   Key names:        1–256 chars, alphanumeric + _ - . only.
18//   Field names:      1–128 chars, alphanumeric + _ - . only (dot = nested path).
19//   JSON depth:       max 32 levels of nesting.
20//   Payload size:     max 10 MB.
21//   Batch size:       max 1000 keys per request.
22// ─────────────────────────────────────────────────────────────────────────────
23
24// Regex = compiled regular expression for pattern matching.
25use regex::Regex;
26use serde_json::Value;
27// LazyLock = initialise a static value lazily on first access (thread-safe).
// Used here to compile each regex once, on first use, instead of on every request.
29use std::sync::LazyLock;
30
31// ─── Compiled regexes ─────────────────────────────────────────────────────────
32// Regexes are expensive to compile — we compile them once and reuse them.
33// LazyLock ensures the regex is compiled on the first call and cached forever.
34
35/// Valid collection names: 1–64 alphanumeric characters, underscores, or hyphens.
36/// Rejects path separators (/ \), dots, spaces, and special characters.
37static COLLECTION_NAME_REGEX: LazyLock<Regex> = LazyLock::new(|| {
38    Regex::new(r"^[a-zA-Z0-9_-]{1,64}$").unwrap()
39});
40
41/// Valid document keys: 1–256 alphanumeric characters, underscores, hyphens, or dots.
42/// Dots are allowed in keys (e.g. "user.123") but not in collection names.
43static KEY_NAME_REGEX: LazyLock<Regex> = LazyLock::new(|| {
44    Regex::new(r"^[a-zA-Z0-9_.-]{1,256}$").unwrap()
45});
46
47/// Valid field names: 1–128 alphanumeric characters, underscores, hyphens, or dots.
48/// Dots are used for nested field access (e.g. "meta.logins").
49static FIELD_NAME_REGEX: LazyLock<Regex> = LazyLock::new(|| {
50    Regex::new(r"^[a-zA-Z0-9_.-]{1,128}$").unwrap()
51});
52
53// ─── ValidationError enum ─────────────────────────────────────────────────────
54
/// All possible validation failures. Each variant carries enough context to
/// produce a helpful error message for the client.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so that errors
/// can be compared directly (e.g. with `assert_eq!` in tests) and duplicated
/// when they need to be both logged and returned.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationError {
    /// The collection name contains invalid characters or is a reserved name.
    /// Carries the offending name (possibly with an explanatory suffix).
    InvalidCollectionName(String),
    /// The document key is empty or contains invalid characters.
    /// Carries the offending key.
    InvalidKeyName(String),
    /// A field name in a projection, WHERE clause, or join is empty, too long,
    /// or contains invalid characters. Carries the offending field name.
    InvalidFieldName(String),
    /// The collection name exceeds 64 characters.
    CollectionNameTooLong,
    /// A document key exceeds 256 characters.
    KeyNameTooLong,
    /// The entire request payload exceeds the configured size limit.
    PayloadTooLarge,
    /// The JSON value is nested more deeply than the allowed maximum.
    InvalidJsonDepth,
    /// A single request contains more keys than the allowed maximum.
    TooManyKeys,
    /// The request payload contains a property that is not recognised for this endpoint.
    /// Carries the unrecognised property name.
    UnknownProperty(String),
}
78
79/// Implement Display so ValidationError can be returned as a JSON error message.
80impl std::fmt::Display for ValidationError {
81    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
82        match self {
83            ValidationError::InvalidCollectionName(name) => {
84                write!(f, "Invalid collection name: '{}'. Must be alphanumeric with _, - only (1-64 chars)", name)
85            }
86            ValidationError::InvalidKeyName(name) => {
87                write!(f, "Invalid key name: '{}'. Must be alphanumeric with _, -, . only (1-256 chars)", name)
88            }
89            ValidationError::InvalidFieldName(name) => {
90                write!(f, "Invalid field name: '{}'. Must be alphanumeric with _, -, . only (1-128 chars)", name)
91            }
92            ValidationError::CollectionNameTooLong => {
93                write!(f, "Collection name too long (max 64 characters)")
94            }
95            ValidationError::KeyNameTooLong => {
96                write!(f, "Key name too long (max 256 characters)")
97            }
98            ValidationError::PayloadTooLarge => {
99                write!(f, "Payload too large (max 10MB)")
100            }
101            ValidationError::InvalidJsonDepth => {
102                write!(f, "JSON nesting too deep (max 32 levels)")
103            }
104            ValidationError::TooManyKeys => {
105                write!(f, "Too many keys in single request (max 1000)")
106            }
107            ValidationError::UnknownProperty(name) => {
108                write!(f, "Unknown property: '{}'. Check the API docs for the list of supported properties for this endpoint", name)
109            }
110        }
111    }
112}
113
/// Mark ValidationError as a standard Rust error type.
///
/// The impl body is empty: the trait's default methods are used, and the
/// required `Debug` and `Display` implementations are provided on the type.
/// Implementing the trait is what allows a `ValidationError` to be propagated
/// with the `?` operator from functions returning
/// `Result<_, Box<dyn std::error::Error>>`.
impl std::error::Error for ValidationError {}
118
119// ─── Individual validators ────────────────────────────────────────────────────
120
121/// Validate a collection name.
122///
123/// Rules:
124///   - Must not be empty.
125///   - Must be 1–64 characters.
126///   - Must match [a-zA-Z0-9_-] (no dots, slashes, spaces, or special chars).
127///   - Must not be a reserved name (admin, system, config, internal, __proto__).
128pub fn validate_collection_name(name: &str) -> Result<(), ValidationError> {
129    if name.is_empty() {
130        return Err(ValidationError::InvalidCollectionName(name.to_string()));
131    }
132
133    if name.len() > 64 {
134        return Err(ValidationError::CollectionNameTooLong);
135    }
136
137    // Check against the compiled regex.
138    if !COLLECTION_NAME_REGEX.is_match(name) {
139        return Err(ValidationError::InvalidCollectionName(name.to_string()));
140    }
141
142    // Block reserved names that could be confused with system collections
143    // or used for privilege escalation.
144    if matches!(name, "admin" | "system" | "config" | "internal" | "__proto__") {
145        return Err(ValidationError::InvalidCollectionName(
146            format!("{} (reserved name)", name)
147        ));
148    }
149
150    Ok(())
151}
152
153/// Validate a document key name.
154///
155/// Rules:
156///   - Must not be empty.
157///   - Must be 1–256 characters.
158///   - Must match [a-zA-Z0-9_.-] (dots allowed for structured keys like "user.123").
159pub fn validate_key_name(key: &str) -> Result<(), ValidationError> {
160    if key.is_empty() {
161        return Err(ValidationError::InvalidKeyName(key.to_string()));
162    }
163
164    if key.len() > 256 {
165        return Err(ValidationError::KeyNameTooLong);
166    }
167
168    if !KEY_NAME_REGEX.is_match(key) {
169        return Err(ValidationError::InvalidKeyName(key.to_string()));
170    }
171
172    Ok(())
173}
174
175/// Validate a field name used in projections, WHERE clauses, or joins.
176///
177/// Rules:
178///   - Must not be empty.
179///   - Must be 1–128 characters.
180///   - Must match [a-zA-Z0-9_.-] (dots allowed for nested paths like "meta.logins").
181///   - Dot-separated parts must not be empty (rejects "a..b" or ".field").
182pub fn validate_field_name(field: &str) -> Result<(), ValidationError> {
183    if field.is_empty() {
184        return Err(ValidationError::InvalidFieldName(field.to_string()));
185    }
186
187    if field.len() > 128 {
188        return Err(ValidationError::InvalidFieldName(
189            format!("{} (too long)", field)
190        ));
191    }
192
193    if !FIELD_NAME_REGEX.is_match(field) {
194        return Err(ValidationError::InvalidFieldName(field.to_string()));
195    }
196
197    // Validate each dot-separated part individually.
198    // This catches "a..b" (empty middle part) or ".field" (empty first part).
199    for part in field.split('.') {
200        if part.is_empty() {
201            return Err(ValidationError::InvalidFieldName(field.to_string()));
202        }
203    }
204
205    Ok(())
206}
207
208/// Validate that a JSON value is not nested more than `max_depth` levels deep.
209///
210/// Deeply nested JSON can cause stack overflows during recursive processing.
211/// The limit of 32 levels is generous for real data but blocks malicious inputs
212/// like { a: { b: { c: ... } } } with hundreds of levels.
213///
214/// Uses a recursive inner function — the outer function is the public API.
215pub fn validate_json_depth(value: &Value, max_depth: usize) -> Result<(), ValidationError> {
216    /// Inner recursive function that tracks the current depth.
217    fn check_depth(value: &Value, current: usize, max: usize) -> Result<(), ValidationError> {
218        // If we've exceeded the maximum depth, reject immediately.
219        if current > max {
220            return Err(ValidationError::InvalidJsonDepth);
221        }
222
223        match value {
224            // For objects and arrays, recurse into each child with depth + 1.
225            Value::Object(map) => {
226                for v in map.values() {
227                    check_depth(v, current + 1, max)?;
228                }
229            }
230            Value::Array(arr) => {
231                for v in arr {
232                    check_depth(v, current + 1, max)?;
233                }
234            }
235            // Scalar values (string, number, bool, null) don't add depth.
236            _ => {}
237        }
238
239        Ok(())
240    }
241
242    // Start the recursion at depth 0.
243    check_depth(value, 0, max_depth)
244}
245
246/// Validate that the serialized payload does not exceed `max_size_bytes`.
247///
248/// Serializing to a string to measure size is slightly wasteful, but it's
249/// the most accurate way to measure the actual byte count of the JSON.
250pub fn validate_payload_size(payload: &Value, max_size_bytes: usize) -> Result<(), ValidationError> {
251    // Serialize to a string to get the byte count.
252    // unwrap_or_default() returns an empty string if serialization fails —
253    // in that case the size check passes (the real error will surface later).
254    let serialized = serde_json::to_string(payload).unwrap_or_default();
255    if serialized.len() > max_size_bytes {
256        return Err(ValidationError::PayloadTooLarge);
257    }
258    Ok(())
259}
260
261/// Validate that a batch operation doesn't contain more than `max_keys` keys.
262///
263/// Large batches can cause CPU spikes and memory pressure. The limit of 1000
264/// keys per request is generous for normal use but blocks accidental or
265/// malicious bulk operations.
266pub fn validate_key_count(count: usize, max_keys: usize) -> Result<(), ValidationError> {
267    if count > max_keys {
268        return Err(ValidationError::TooManyKeys);
269    }
270    Ok(())
271}
272
273/// Check that every top-level key in the payload is in the `allowed` list.
274///
275/// This prevents clients from sending unrecognised properties that would be
276/// silently ignored, which can mask typos (e.g. `"filed"` instead of `"fields"`).
277/// Only top-level keys are checked — nested document data is not validated here.
278pub fn validate_allowed_properties(payload: &Value, allowed: &[&str]) -> Result<(), ValidationError> {
279    if let Some(obj) = payload.as_object() {
280        for key in obj.keys() {
281            if !allowed.contains(&key.as_str()) {
282                return Err(ValidationError::UnknownProperty(key.clone()));
283            }
284        }
285    }
286    Ok(())
287}
288
289/// Run all validation checks on an incoming request payload.
290///
291/// This is the single entry point called by every process_* function in
292/// handlers.rs. It checks everything in one pass:
293///   1. Payload size (10 MB limit)
294///   2. JSON nesting depth (32 levels max)
295///   3. Collection name validity
296///   4. Key name validity (single key, batch keys, data map keys)
297///   5. Field name validity (projections, joins, WHERE clause)
298///
299/// Returns Ok(()) if all checks pass, or the first ValidationError found.
300pub fn validate_request(payload: &Value, max_body_size: usize) -> Result<(), ValidationError> {
301    // Check 1: Payload size — reject before doing any other work.
302    validate_payload_size(payload, max_body_size)?;
303
304    // Check 2: JSON depth — prevent stack overflows in recursive processing.
305    validate_json_depth(payload, 32)?;
306
307    // Check 3: Collection name — must be safe to use as a storage key.
308    if let Some(collection) = payload.get("collection").and_then(|v| v.as_str()) {
309        validate_collection_name(collection)?;
310    }
311
312    // Check 4a: Single or batch key lookup/delete.
313    if let Some(keys) = payload.get("keys") {
314        match keys {
315            Value::String(key) => validate_key_name(key)?,
316            Value::Array(arr) => {
317                // Reject if too many keys in one request.
318                validate_key_count(arr.len(), 1000)?;
319                for key in arr {
320                    if let Some(key_str) = key.as_str() {
321                        validate_key_name(key_str)?;
322                    }
323                }
324            }
325            _ => {}
326        }
327    }
328
329    // Check 4b: Data map keys in insert/update operations.
330    if let Some(data) = payload.get("data") {
331        if let Value::Object(map) = data {
332            validate_key_count(map.len(), 1000)?;
333            for key in map.keys() {
334                validate_key_name(key)?;
335            }
336        }
337    }
338
339    // Check 5a: Field names in projection (fields: ["name", "meta.logins"]).
340    if let Some(fields) = payload.get("fields").and_then(|v| v.as_array()) {
341        for field in fields {
342            if let Some(field_str) = field.as_str() {
343                validate_field_name(field_str)?;
344            }
345        }
346    }
347
348    // Check 5b: Join specifications — validate collection, alias, foreign_key, fields.
349    if let Some(joins) = payload.get("joins").and_then(|v| v.as_array()) {
350        for join in joins {
351            if let Some(join_collection) = join.get("collection").and_then(|v| v.as_str()) {
352                validate_collection_name(join_collection)?;
353            }
354            if let Some(alias) = join.get("alias").and_then(|v| v.as_str()) {
355                validate_key_name(alias)?;
356            }
357            if let Some(foreign_key) = join.get("foreign_key").and_then(|v| v.as_str()) {
358                validate_field_name(foreign_key)?;
359            }
360            if let Some(join_fields) = join.get("fields").and_then(|v| v.as_array()) {
361                for field in join_fields {
362                    if let Some(field_str) = field.as_str() {
363                        validate_field_name(field_str)?;
364                    }
365                }
366            }
367        }
368    }
369
370    // Check 5c: WHERE clause field names (top-level keys only).
371    // Operator keys like $or, $and, $gt are skipped (they start with '$').
372    if let Some(where_clause) = payload.get("where").and_then(|v| v.as_object()) {
373        for key in where_clause.keys() {
374            if !key.starts_with('$') {
375                validate_field_name(key)?;
376            }
377        }
378    }
379
380    Ok(())
381}
382
383// ─── Tests ────────────────────────────────────────────────────────────────────
384// These tests run with `cargo test` and verify the validation logic.
385
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Well-formed collection names are accepted.
    #[test]
    fn test_valid_collection_names() {
        for name in ["users", "user_data", "data-2024", "test123"] {
            assert!(validate_collection_name(name).is_ok());
        }
    }

    /// Malformed or reserved collection names are rejected.
    #[test]
    fn test_invalid_collection_names() {
        let rejected = [
            "",              // empty
            "user$data",     // invalid char
            "../etc/passwd", // path traversal
            "admin",         // reserved name
        ];
        for name in rejected {
            assert!(validate_collection_name(name).is_err());
        }
    }

    /// Shallow JSON passes the depth check; deeply nested JSON fails it.
    #[test]
    fn test_json_depth() {
        let shallow = json!({"a": {"b": "c"}});
        assert!(validate_json_depth(&shallow, 10).is_ok());

        // Wrap an empty object in 50 layers of {"nested": ...}; this must
        // fail against a limit of 32.
        let deep = (0..50).fold(json!({}), |inner, _| json!({ "nested": inner }));
        assert!(validate_json_depth(&deep, 32).is_err());
    }
}