moltendb_core/validation.rs
1// ─── validation.rs ────────────────────────────────────────────────────────────
2// This file implements input validation for all incoming HTTP requests.
3//
4// Why validate?
5// Without validation, a malicious client could send:
6// - Collection names like "../etc/passwd" (path traversal)
7// - Deeply nested JSON like { a: { b: { c: ... } } } (stack overflow)
8// - Payloads of hundreds of megabytes (memory exhaustion)
9// - Reserved names like "admin" or "system" (privilege escalation)
10// - Thousands of keys in one request (CPU exhaustion)
11//
12// All validation happens in validate_request(), which is called at the top
13// of every process_* function in handlers.rs before any database work is done.
14//
15// Validation rules:
16// Collection names: 1–64 chars, alphanumeric + _ and - only, not reserved.
17// Key names: 1–256 chars, alphanumeric + _ - . only.
18// Field names: 1–128 chars, alphanumeric + _ - . only (dot = nested path).
19// JSON depth: max 32 levels of nesting.
20// Payload size: max 10 MB.
21// Batch size: max 1000 keys per request.
22// ─────────────────────────────────────────────────────────────────────────────
23
24// Regex = compiled regular expression for pattern matching.
25use regex::Regex;
26use serde_json::Value;
27// LazyLock = initialise a static value lazily on first access (thread-safe).
// Used here to compile each regex once, on first use, instead of on every request.
29use std::sync::LazyLock;
30
31// ─── Compiled regexes ─────────────────────────────────────────────────────────
32// Regexes are expensive to compile — we compile them once and reuse them.
33// LazyLock ensures the regex is compiled on the first call and cached forever.
34
35/// Valid collection names: 1–64 alphanumeric characters, underscores, or hyphens.
36/// Rejects path separators (/ \), dots, spaces, and special characters.
37static COLLECTION_NAME_REGEX: LazyLock<Regex> = LazyLock::new(|| {
38 Regex::new(r"^[a-zA-Z0-9_-]{1,64}$").unwrap()
39});
40
41/// Valid document keys: 1–256 alphanumeric characters, underscores, hyphens, or dots.
42/// Dots are allowed in keys (e.g. "user.123") but not in collection names.
43static KEY_NAME_REGEX: LazyLock<Regex> = LazyLock::new(|| {
44 Regex::new(r"^[a-zA-Z0-9_.-]{1,256}$").unwrap()
45});
46
47/// Valid field names: 1–128 alphanumeric characters, underscores, hyphens, or dots.
48/// Dots are used for nested field access (e.g. "meta.logins").
49static FIELD_NAME_REGEX: LazyLock<Regex> = LazyLock::new(|| {
50 Regex::new(r"^[a-zA-Z0-9_.-]{1,128}$").unwrap()
51});
52
53// ─── ValidationError enum ─────────────────────────────────────────────────────
54
/// Every way request validation can fail.
///
/// Variants that refer to a specific offending name carry it as a `String` so
/// the `Display` output can show the client what was rejected.
///
/// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can compare
/// and duplicate errors (e.g. `assert_eq!(result, Err(ValidationError::TooManyKeys))`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationError {
    /// The collection name contains invalid characters or is a reserved name.
    InvalidCollectionName(String),
    /// The document key is empty or contains invalid characters.
    InvalidKeyName(String),
    /// A field name in a projection, WHERE clause, or join is invalid.
    InvalidFieldName(String),
    /// The collection name exceeds 64 characters.
    CollectionNameTooLong,
    /// A document key exceeds 256 characters.
    KeyNameTooLong,
    /// The entire request payload exceeds the configured size limit.
    PayloadTooLarge,
    /// The JSON value is nested more deeply than allowed.
    InvalidJsonDepth,
    /// A single request contains more keys than allowed.
    TooManyKeys,
    /// The request payload contains a property that is not recognised for this endpoint.
    UnknownProperty(String),
}

/// Human-readable messages, suitable for returning to the client.
impl std::fmt::Display for ValidationError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Variants carrying the offending name interpolate it into the message.
            Self::InvalidCollectionName(name) => write!(
                f,
                "Invalid collection name: '{name}'. Must be alphanumeric with _, - only (1-64 chars)"
            ),
            Self::InvalidKeyName(name) => write!(
                f,
                "Invalid key name: '{name}'. Must be alphanumeric with _, -, . only (1-256 chars)"
            ),
            Self::InvalidFieldName(name) => write!(
                f,
                "Invalid field name: '{name}'. Must be alphanumeric with _, -, . only (1-128 chars)"
            ),
            Self::UnknownProperty(name) => write!(
                f,
                "Unknown property: '{name}'. Check the API docs for the list of supported properties for this endpoint"
            ),
            // The remaining variants have fixed messages.
            Self::CollectionNameTooLong => {
                f.write_str("Collection name too long (max 64 characters)")
            }
            Self::KeyNameTooLong => f.write_str("Key name too long (max 256 characters)"),
            Self::PayloadTooLarge => f.write_str("Payload too large (max 10MB)"),
            Self::InvalidJsonDepth => f.write_str("JSON nesting too deep (max 32 levels)"),
            Self::TooManyKeys => f.write_str("Too many keys in single request (max 1000)"),
        }
    }
}

/// Marks `ValidationError` as a standard error type, so it can be boxed as
/// `Box<dyn std::error::Error>` and propagated with `?` from functions that
/// return `Result<_, Box<dyn std::error::Error>>`.
impl std::error::Error for ValidationError {}
118
119// ─── Individual validators ────────────────────────────────────────────────────
120
121/// Validate a collection name.
122///
123/// Rules:
124/// - Must not be empty.
125/// - Must be 1–64 characters.
126/// - Must match [a-zA-Z0-9_-] (no dots, slashes, spaces, or special chars).
127/// - Must not be a reserved name (admin, system, config, internal, __proto__).
128pub fn validate_collection_name(name: &str) -> Result<(), ValidationError> {
129 if name.is_empty() {
130 return Err(ValidationError::InvalidCollectionName(name.to_string()));
131 }
132
133 if name.len() > 64 {
134 return Err(ValidationError::CollectionNameTooLong);
135 }
136
137 // Check against the compiled regex.
138 if !COLLECTION_NAME_REGEX.is_match(name) {
139 return Err(ValidationError::InvalidCollectionName(name.to_string()));
140 }
141
142 // Block reserved names that could be confused with system collections
143 // or used for privilege escalation.
144 if matches!(name, "admin" | "system" | "config" | "internal" | "__proto__") {
145 return Err(ValidationError::InvalidCollectionName(
146 format!("{} (reserved name)", name)
147 ));
148 }
149
150 Ok(())
151}
152
153/// Validate a document key name.
154///
155/// Rules:
156/// - Must not be empty.
157/// - Must be 1–256 characters.
158/// - Must match [a-zA-Z0-9_.-] (dots allowed for structured keys like "user.123").
159pub fn validate_key_name(key: &str) -> Result<(), ValidationError> {
160 if key.is_empty() {
161 return Err(ValidationError::InvalidKeyName(key.to_string()));
162 }
163
164 if key.len() > 256 {
165 return Err(ValidationError::KeyNameTooLong);
166 }
167
168 if !KEY_NAME_REGEX.is_match(key) {
169 return Err(ValidationError::InvalidKeyName(key.to_string()));
170 }
171
172 Ok(())
173}
174
175/// Validate a field name used in projections, WHERE clauses, or joins.
176///
177/// Rules:
178/// - Must not be empty.
179/// - Must be 1–128 characters.
180/// - Must match [a-zA-Z0-9_.-] (dots allowed for nested paths like "meta.logins").
181/// - Dot-separated parts must not be empty (rejects "a..b" or ".field").
182pub fn validate_field_name(field: &str) -> Result<(), ValidationError> {
183 if field.is_empty() {
184 return Err(ValidationError::InvalidFieldName(field.to_string()));
185 }
186
187 if field.len() > 128 {
188 return Err(ValidationError::InvalidFieldName(
189 format!("{} (too long)", field)
190 ));
191 }
192
193 if !FIELD_NAME_REGEX.is_match(field) {
194 return Err(ValidationError::InvalidFieldName(field.to_string()));
195 }
196
197 // Validate each dot-separated part individually.
198 // This catches "a..b" (empty middle part) or ".field" (empty first part).
199 for part in field.split('.') {
200 if part.is_empty() {
201 return Err(ValidationError::InvalidFieldName(field.to_string()));
202 }
203 }
204
205 Ok(())
206}
207
208/// Validate that a JSON value is not nested more than `max_depth` levels deep.
209///
210/// Deeply nested JSON can cause stack overflows during recursive processing.
211/// The limit of 32 levels is generous for real data but blocks malicious inputs
212/// like { a: { b: { c: ... } } } with hundreds of levels.
213///
214/// Uses a recursive inner function — the outer function is the public API.
215pub fn validate_json_depth(value: &Value, max_depth: usize) -> Result<(), ValidationError> {
216 /// Inner recursive function that tracks the current depth.
217 fn check_depth(value: &Value, current: usize, max: usize) -> Result<(), ValidationError> {
218 // If we've exceeded the maximum depth, reject immediately.
219 if current > max {
220 return Err(ValidationError::InvalidJsonDepth);
221 }
222
223 match value {
224 // For objects and arrays, recurse into each child with depth + 1.
225 Value::Object(map) => {
226 for v in map.values() {
227 check_depth(v, current + 1, max)?;
228 }
229 }
230 Value::Array(arr) => {
231 for v in arr {
232 check_depth(v, current + 1, max)?;
233 }
234 }
235 // Scalar values (string, number, bool, null) don't add depth.
236 _ => {}
237 }
238
239 Ok(())
240 }
241
242 // Start the recursion at depth 0.
243 check_depth(value, 0, max_depth)
244}
245
246/// Validate that the serialized payload does not exceed `max_size_bytes`.
247///
248/// Serializing to a string to measure size is slightly wasteful, but it's
249/// the most accurate way to measure the actual byte count of the JSON.
250pub fn validate_payload_size(payload: &Value, max_size_bytes: usize) -> Result<(), ValidationError> {
251 // Serialize to a string to get the byte count.
252 // unwrap_or_default() returns an empty string if serialization fails —
253 // in that case the size check passes (the real error will surface later).
254 let serialized = serde_json::to_string(payload).unwrap_or_default();
255 if serialized.len() > max_size_bytes {
256 return Err(ValidationError::PayloadTooLarge);
257 }
258 Ok(())
259}
260
261/// Validate that a batch operation doesn't contain more than `max_keys` keys.
262///
263/// Large batches can cause CPU spikes and memory pressure. The limit of 1000
264/// keys per request is generous for normal use but blocks accidental or
265/// malicious bulk operations.
266pub fn validate_key_count(count: usize, max_keys: usize) -> Result<(), ValidationError> {
267 if count > max_keys {
268 return Err(ValidationError::TooManyKeys);
269 }
270 Ok(())
271}
272
273/// Check that every top-level key in the payload is in the `allowed` list.
274///
275/// This prevents clients from sending unrecognised properties that would be
276/// silently ignored, which can mask typos (e.g. `"filed"` instead of `"fields"`).
277/// Only top-level keys are checked — nested document data is not validated here.
278pub fn validate_allowed_properties(payload: &Value, allowed: &[&str]) -> Result<(), ValidationError> {
279 if let Some(obj) = payload.as_object() {
280 for key in obj.keys() {
281 if !allowed.contains(&key.as_str()) {
282 return Err(ValidationError::UnknownProperty(key.clone()));
283 }
284 }
285 }
286 Ok(())
287}
288
289/// Run all validation checks on an incoming request payload.
290///
291/// This is the single entry point called by every process_* function in
292/// handlers.rs. It checks everything in one pass:
293/// 1. Payload size (10 MB limit)
294/// 2. JSON nesting depth (32 levels max)
295/// 3. Collection name validity
296/// 4. Key name validity (single key, batch keys, data map keys)
297/// 5. Field name validity (projections, joins, WHERE clause)
298///
299/// Returns Ok(()) if all checks pass, or the first ValidationError found.
300pub fn validate_request(payload: &Value, max_body_size: usize) -> Result<(), ValidationError> {
301 // Check 1: Payload size — reject before doing any other work.
302 validate_payload_size(payload, max_body_size)?;
303
304 // Check 2: JSON depth — prevent stack overflows in recursive processing.
305 validate_json_depth(payload, 32)?;
306
307 // Check 3: Collection name — must be safe to use as a storage key.
308 if let Some(collection) = payload.get("collection").and_then(|v| v.as_str()) {
309 validate_collection_name(collection)?;
310 }
311
312 // Check 4a: Single or batch key lookup/delete.
313 if let Some(keys) = payload.get("keys") {
314 match keys {
315 Value::String(key) => validate_key_name(key)?,
316 Value::Array(arr) => {
317 // Reject if too many keys in one request.
318 validate_key_count(arr.len(), 1000)?;
319 for key in arr {
320 if let Some(key_str) = key.as_str() {
321 validate_key_name(key_str)?;
322 }
323 }
324 }
325 _ => {}
326 }
327 }
328
329 // Check 4b: Data map keys in insert/update operations.
330 if let Some(data) = payload.get("data") {
331 if let Value::Object(map) = data {
332 validate_key_count(map.len(), 1000)?;
333 for key in map.keys() {
334 validate_key_name(key)?;
335 }
336 }
337 }
338
339 // Check 5a: Field names in projection (fields: ["name", "meta.logins"]).
340 if let Some(fields) = payload.get("fields").and_then(|v| v.as_array()) {
341 for field in fields {
342 if let Some(field_str) = field.as_str() {
343 validate_field_name(field_str)?;
344 }
345 }
346 }
347
348 // Check 5b: Join specifications — validate collection, alias, foreign_key, fields.
349 if let Some(joins) = payload.get("joins").and_then(|v| v.as_array()) {
350 for join in joins {
351 if let Some(join_collection) = join.get("collection").and_then(|v| v.as_str()) {
352 validate_collection_name(join_collection)?;
353 }
354 if let Some(alias) = join.get("alias").and_then(|v| v.as_str()) {
355 validate_key_name(alias)?;
356 }
357 if let Some(foreign_key) = join.get("foreign_key").and_then(|v| v.as_str()) {
358 validate_field_name(foreign_key)?;
359 }
360 if let Some(join_fields) = join.get("fields").and_then(|v| v.as_array()) {
361 for field in join_fields {
362 if let Some(field_str) = field.as_str() {
363 validate_field_name(field_str)?;
364 }
365 }
366 }
367 }
368 }
369
370 // Check 5c: WHERE clause field names (top-level keys only).
371 // Operator keys like $or, $and, $gt are skipped (they start with '$').
372 if let Some(where_clause) = payload.get("where").and_then(|v| v.as_object()) {
373 for key in where_clause.keys() {
374 if !key.starts_with('$') {
375 validate_field_name(key)?;
376 }
377 }
378 }
379
380 Ok(())
381}
382
383// ─── Tests ────────────────────────────────────────────────────────────────────
384// These tests run with `cargo test` and verify the validation logic.
385
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Well-formed collection names are accepted.
    #[test]
    fn test_valid_collection_names() {
        for name in ["users", "user_data", "data-2024", "test123"] {
            assert!(
                validate_collection_name(name).is_ok(),
                "expected {name:?} to be accepted"
            );
        }
    }

    /// Malformed or reserved collection names are rejected.
    #[test]
    fn test_invalid_collection_names() {
        let rejected = [
            "",              // empty
            "user$data",     // invalid character
            "../etc/passwd", // path traversal
            "admin",         // reserved name
        ];
        for name in rejected {
            assert!(
                validate_collection_name(name).is_err(),
                "expected {name:?} to be rejected"
            );
        }
    }

    /// Shallow JSON passes the depth check; excessively nested JSON fails it.
    #[test]
    fn test_json_depth() {
        let shallow = json!({"a": {"b": "c"}});
        assert!(validate_json_depth(&shallow, 10).is_ok());

        // Wrap an empty object in 50 further levels of {"nested": ...},
        // which is well beyond the limit of 32 used below.
        let deep = (0..50).fold(json!({}), |inner, _| json!({ "nested": inner }));
        assert!(validate_json_depth(&deep, 32).is_err());
    }
}