// ingest/config.rs
//! Configuration types for the ingest pipeline.
//!
//! This module defines [`IngestConfig`] and [`MetadataPolicy`], which control how
//! raw ingest requests are interpreted, defaulted, and constrained at runtime.
//! These types are intended to be cheap to clone and easy to serialize from
//! external configuration formats such as JSON, TOML, or YAML.
//!
//! # Quick Start
//!
//! ```rust
//! use ingest::IngestConfig;
//!
//! // Use defaults for development
//! let config = IngestConfig::default();
//!
//! // Validate before use
//! config.validate().expect("Invalid configuration");
//! ```
//!
//! # Production Configuration
//!
//! ```rust
//! use ingest::{IngestConfig, MetadataPolicy, RequiredField};
//! use uuid::Uuid;
//!
//! let config = IngestConfig {
//!     version: 1,
//!     default_tenant_id: "production".to_string(),
//!     doc_id_namespace: Uuid::new_v5(
//!         &Uuid::NAMESPACE_DNS,
//!         b"myapp.example.com"
//!     ),
//!     strip_control_chars: true,
//!     metadata_policy: MetadataPolicy {
//!         required_fields: vec![
//!             RequiredField::TenantId,
//!             RequiredField::DocId,
//!         ],
//!         max_attribute_bytes: Some(1024 * 1024), // 1 MB
//!         reject_future_timestamps: true,
//!     },
//!     max_payload_bytes: Some(100 * 1024 * 1024), // 100 MB
//!     max_normalized_bytes: Some(50 * 1024 * 1024), // 50 MB
//! };
//!
//! // Always validate at startup
//! if let Err(e) = config.validate() {
//!     eprintln!("Configuration error: {}", e);
//!     std::process::exit(1);
//! }
//! ```
52use serde::{Deserialize, Serialize};
53use thiserror::Error;
54use uuid::Uuid;
55
56/// Runtime configuration for ingest behavior.
57///
58/// `IngestConfig` controls all aspects of the ingest pipeline including validation,
59/// normalization, size limits, and ID generation. It is designed to be cheap to clone
60/// and serializable for configuration management.
61///
62/// # Fields
63///
64/// - `version`: Semantic version for tracking configuration changes
65/// - `default_tenant_id`: Fallback tenant when metadata doesn't specify one
66/// - `doc_id_namespace`: UUID namespace for deterministic document ID generation
67/// - `strip_control_chars`: Whether to remove control characters from metadata
68/// - `metadata_policy`: Fine-grained metadata validation rules
69/// - `max_payload_bytes`: Maximum raw payload size (optional)
70/// - `max_normalized_bytes`: Maximum normalized text size (optional)
71///
72/// # Serialization
73///
74/// This struct supports JSON, TOML, and YAML serialization:
75///
76/// ```json
77/// {
78/// "version": 1,
79/// "default_tenant_id": "default",
80/// "strip_control_chars": true,
81/// "max_payload_bytes": 52428800,
82/// "max_normalized_bytes": 10485760,
83/// "metadata_policy": {
84/// "required_fields": ["TenantId", "DocId"],
85/// "max_attribute_bytes": 1048576,
86/// "reject_future_timestamps": true
87/// }
88/// }
89/// ```
90///
91/// # Examples
92///
93/// ## Default Configuration
94///
95/// ```rust
96/// use ingest::IngestConfig;
97/// use uuid::Uuid;
98///
99/// let config = IngestConfig::default();
100///
101/// assert_eq!(config.version, 1);
102/// assert_eq!(config.default_tenant_id, "default");
103/// assert_eq!(config.strip_control_chars, true);
104/// assert!(config.max_payload_bytes.is_none());
105/// assert!(config.max_normalized_bytes.is_none());
106/// ```
107///
108/// ## Custom Configuration
109///
110/// ```rust
111/// use ingest::{IngestConfig, MetadataPolicy, RequiredField};
112/// use uuid::Uuid;
113///
114/// let config = IngestConfig {
115/// version: 2,
116/// default_tenant_id: "my-app".to_string(),
117/// doc_id_namespace: Uuid::new_v5(
118/// &Uuid::NAMESPACE_DNS,
119/// b"my-app.example.com"
120/// ),
121/// strip_control_chars: true,
122/// metadata_policy: MetadataPolicy {
123/// required_fields: vec![RequiredField::TenantId],
124/// max_attribute_bytes: Some(65536),
125/// reject_future_timestamps: true,
126/// },
127/// max_payload_bytes: Some(10 * 1024 * 1024),
128/// max_normalized_bytes: Some(5 * 1024 * 1024),
129/// };
130///
131/// assert!(config.validate().is_ok());
132/// ```
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct IngestConfig {
135 /// Semantic version of the ingest configuration.
136 ///
137 /// This version number helps track configuration changes and can be used
138 /// for schema migration or feature flagging. Increment this when making
139 /// breaking changes to ingest behavior.
140 ///
141 /// Default: `1`
142 pub version: u32,
143
144 /// Default tenant ID to use when metadata doesn't specify one.
145 ///
146 /// This ensures every canonical record has a tenant identifier, enabling
147 /// multi-tenant isolation even when callers omit the tenant field.
148 ///
149 /// Default: `"default"`
150 pub default_tenant_id: String,
151
152 /// Namespace UUID for deterministic document ID generation.
153 ///
154 /// When `doc_id` is not provided in metadata, a UUIDv5 is derived using:
155 /// `UUIDv5(doc_id_namespace, tenant_id + "\0" + record_id)`
156 ///
157 /// Using a consistent namespace ensures that:
158 /// - The same content always gets the same ID (deterministic)
159 /// - Different applications don't collide (namespace isolation)
160 /// - Re-ingesting content is idempotent
161 ///
162 /// Default: [`Uuid::NAMESPACE_OID`]
163 pub doc_id_namespace: Uuid,
164
165 /// Whether to strip ASCII control characters from metadata strings.
166 ///
167 /// When `true`, control characters (0x00-0x1F and 0x7F) are removed from:
168 /// - `tenant_id`
169 /// - `doc_id`
170 /// - `original_source`
171 /// - `id` (record ID)
172 ///
173 /// This prevents log injection attacks and ensures metadata is safe for
174 /// downstream systems. It is strongly recommended to keep this enabled.
175 ///
176 /// Default: `true`
177 pub strip_control_chars: bool,
178
179 /// Additional metadata validation policies.
180 ///
181 /// Controls which fields are required, attribute size limits, and timestamp
182 /// validation rules.
183 ///
184 /// Default: [`MetadataPolicy::default()`]
185 #[serde(default)]
186 pub metadata_policy: MetadataPolicy,
187
188 /// Maximum raw payload byte length allowed.
189 ///
190 /// If set, payloads exceeding this limit are rejected with
191 /// `IngestError::PayloadTooLarge` before any processing.
192 ///
193 /// This check is performed on the raw payload size before normalization
194 /// (whitespace collapsing, UTF-8 decoding, etc.).
195 ///
196 /// # Size Recommendations
197 ///
198 /// - Small text: 1-10 MB
199 /// - Documents: 50-100 MB
200 /// - Large files: 500 MB - 1 GB (if memory allows)
201 ///
202 /// Default: `None` (unlimited)
203 #[serde(default)]
204 pub max_payload_bytes: Option<usize>,
205
206 /// Maximum normalized payload byte length allowed.
207 ///
208 /// If set, text payloads exceeding this limit after whitespace normalization
209 /// are rejected with `IngestError::PayloadTooLarge`.
210 ///
211 /// This is useful for enforcing limits on processed content size, which
212 /// may differ from raw size due to whitespace collapsing.
213 ///
214 /// # Constraint
215 ///
216 /// Must be less than or equal to `max_payload_bytes` (validated by
217 /// [`IngestConfig::validate()`]).
218 ///
219 /// Default: `None` (unlimited)
220 #[serde(default)]
221 pub max_normalized_bytes: Option<usize>,
222}
223
224/// Controls which metadata fields must be present and how optional blobs are constrained.
225///
226/// `MetadataPolicy` provides fine-grained control over metadata validation,
227/// allowing you to enforce business rules such as required fields, size limits,
228/// and timestamp constraints.
229///
230/// # Examples
231///
232/// ## Strict Policy
233///
234/// ```rust
235/// use ingest::{MetadataPolicy, RequiredField};
236///
237/// let strict_policy = MetadataPolicy {
238/// required_fields: vec![
239/// RequiredField::TenantId,
240/// RequiredField::DocId,
241/// RequiredField::ReceivedAt,
242/// RequiredField::OriginalSource,
243/// ],
244/// max_attribute_bytes: Some(1024),
245/// reject_future_timestamps: true,
246/// };
247/// ```
248///
249/// ## Lenient Policy
250///
251/// ```rust
252/// use ingest::MetadataPolicy;
253///
254/// let lenient_policy = MetadataPolicy::default();
255/// // All fields optional, no size limits, future timestamps allowed
256/// ```
257#[derive(Debug, Clone, Serialize, Deserialize, Default)]
258#[serde(default)]
259pub struct MetadataPolicy {
260 /// Metadata fields that must be provided by the caller (after sanitization).
261 ///
262 /// If a required field is missing or empty after control character stripping,
263 /// ingest fails with `IngestError::InvalidMetadata`.
264 ///
265 /// # Example
266 ///
267 /// ```rust
268 /// use ingest::{MetadataPolicy, RequiredField};
269 ///
270 /// let policy = MetadataPolicy {
271 /// required_fields: vec![RequiredField::TenantId, RequiredField::DocId],
272 /// ..Default::default()
273 /// };
274 /// ```
275 ///
276 /// Default: empty vector (no required fields)
277 pub required_fields: Vec<RequiredField>,
278
279 /// Maximum serialized byte length allowed for `metadata.attributes`.
280 ///
281 /// If set, the JSON-serialized size of the attributes field must not exceed
282 /// this limit. This protects downstream systems from very large metadata blobs.
283 ///
284 /// # Example
285 ///
286 /// ```rust
287 /// use ingest::MetadataPolicy;
288 ///
289 /// let policy = MetadataPolicy {
290 /// max_attribute_bytes: Some(1024 * 1024), // 1 MB
291 /// ..Default::default()
292 /// };
293 /// ```
294 ///
295 /// Default: `None` (unlimited)
296 pub max_attribute_bytes: Option<usize>,
297
298 /// Reject ingests with timestamps that lie in the future.
299 ///
300 /// When `true`, if `received_at` is strictly greater than the current time,
301 /// ingest fails with `IngestError::InvalidMetadata` containing "future".
302 ///
303 /// This is useful for detecting clock skew or preventing future-dated content
304 /// from entering the system.
305 ///
306 /// Default: `false`
307 pub reject_future_timestamps: bool,
308}
309
310/// Metadata identifiers that can be enforced via [`MetadataPolicy`].
311///
312/// This enum defines the metadata fields that can be marked as required.
313/// It is marked `#[non_exhaustive]` to allow future additions without
314/// breaking existing code.
315///
316/// # Required Fields
317///
318/// - `TenantId`: Tenant identifier for multi-tenant isolation
319/// - `DocId`: Document identifier (caller must provide, no derivation)
320/// - `ReceivedAt`: Timestamp when content was received
321/// - `OriginalSource`: Human-readable source reference
322///
323/// # Examples
324///
325/// ```rust
326/// use ingest::{MetadataPolicy, RequiredField};
327///
328/// let policy = MetadataPolicy {
329/// required_fields: vec![
330/// RequiredField::TenantId,
331/// RequiredField::DocId,
332/// ],
333/// ..Default::default()
334/// };
335/// ```
336#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
337#[non_exhaustive]
338pub enum RequiredField {
339 /// Require the `tenant_id` field to be present in metadata.
340 ///
341 /// When required, callers must explicitly provide a non-empty tenant ID.
342 /// The `default_tenant_id` fallback is not used.
343 TenantId,
344
345 /// Require the `doc_id` field to be present in metadata.
346 ///
347 /// When required, callers must explicitly provide a document ID.
348 /// No UUIDv5 derivation is performed.
349 DocId,
350
351 /// Require the `received_at` timestamp to be present in metadata.
352 ///
353 /// When required, callers must provide a timestamp. The default
354 /// (current time) is not applied.
355 ReceivedAt,
356
357 /// Require the `original_source` field to be present in metadata.
358 ///
359 /// When required, callers must provide a source reference.
360 OriginalSource,
361}
362
363/// Errors that can occur when validating an [`IngestConfig`].
364///
365/// These are configuration-time issues and are intended to be surfaced during
366/// service start-up rather than at request time. They indicate misconfiguration
367/// that should be fixed before handling live traffic.
368///
369/// # Examples
370///
371/// ```rust
372/// use ingest::{IngestConfig, ConfigError};
373///
374/// let bad_config = IngestConfig {
375/// max_payload_bytes: Some(100),
376/// max_normalized_bytes: Some(200), // Invalid: exceeds raw limit
377/// ..Default::default()
378/// };
379///
380/// match bad_config.validate() {
381/// Err(ConfigError::NormalizedExceedsPayload { normalized, payload }) => {
382/// println!("Config error: normalized ({}) > payload ({})",
383/// normalized, payload);
384/// }
385/// Ok(()) => println!("Config is valid"),
386/// }
387/// ```
388#[derive(Debug, Error, Clone, PartialEq, Eq)]
389#[non_exhaustive]
390pub enum ConfigError {
391 /// The configured `max_normalized_bytes` is larger than `max_payload_bytes`.
392 ///
393 /// This violates the expectation that normalized text should always be
394 /// bounded by the raw payload size limit and usually indicates a
395 /// misconfiguration.
396 ///
397 /// # Example
398 ///
399 /// This error occurs when:
400 /// ```rust,ignore
401 /// max_payload_bytes: Some(100),
402 /// max_normalized_bytes: Some(200), // ERROR: exceeds raw limit
403 /// ```
404 #[error(
405 "max_normalized_bytes ({normalized}) exceeds max_payload_bytes ({payload}); \
406 normalized payload must not exceed the raw payload limit"
407 )]
408 NormalizedExceedsPayload {
409 /// Configured upper bound for normalized text payloads, in bytes.
410 normalized: usize,
411 /// Configured upper bound for raw payloads, in bytes.
412 payload: usize,
413 },
414}
415
416impl Default for IngestConfig {
417 /// Creates a default `IngestConfig` suitable for development.
418 ///
419 /// # Defaults
420 ///
421 /// - `version`: 1
422 /// - `default_tenant_id`: "default"
423 /// - `doc_id_namespace`: `Uuid::NAMESPACE_OID`
424 /// - `strip_control_chars`: true
425 /// - `metadata_policy`: default (no required fields, no limits)
426 /// - `max_payload_bytes`: None (unlimited)
427 /// - `max_normalized_bytes`: None (unlimited)
428 ///
429 /// # Example
430 ///
431 /// ```rust
432 /// use ingest::IngestConfig;
433 ///
434 /// let config = IngestConfig::default();
435 /// assert_eq!(config.version, 1);
436 /// assert_eq!(config.default_tenant_id, "default");
437 /// assert!(config.strip_control_chars);
438 /// ```
439 fn default() -> Self {
440 Self {
441 version: 1,
442 default_tenant_id: "default".into(),
443 doc_id_namespace: Uuid::NAMESPACE_OID,
444 strip_control_chars: true,
445 metadata_policy: MetadataPolicy::default(),
446 max_payload_bytes: None,
447 max_normalized_bytes: None,
448 }
449 }
450}
451
452impl IngestConfig {
453 /// Validates internal consistency of this configuration.
454 ///
455 /// This method checks for logical errors in the configuration that would
456 /// cause runtime issues. It is inexpensive and should be called at process
457 /// start-up to catch misconfigurations before handling live ingest traffic.
458 ///
459 /// # Validation Rules
460 ///
461 /// 1. `max_normalized_bytes` must be ≤ `max_payload_bytes` (if both are set)
462 ///
463 /// # Returns
464 ///
465 /// - `Ok(())` if configuration is valid
466 /// - `Err(ConfigError)` describing the validation failure
467 ///
468 /// # Performance
469 ///
470 /// This method performs only in-memory checks with O(1) complexity.
471 /// No I/O is performed.
472 ///
473 /// # Examples
474 ///
475 /// ## Valid Configuration
476 ///
477 /// ```rust
478 /// use ingest::IngestConfig;
479 ///
480 /// let config = IngestConfig::default();
481 /// assert!(config.validate().is_ok());
482 /// ```
483 ///
484 /// ## Invalid Configuration
485 ///
486 /// ```rust
487 /// use ingest::IngestConfig;
488 ///
489 /// let invalid_config = IngestConfig {
490 /// max_payload_bytes: Some(100),
491 /// max_normalized_bytes: Some(200), // Invalid!
492 /// ..Default::default()
493 /// };
494 ///
495 /// assert!(invalid_config.validate().is_err());
496 /// ```
497 ///
498 /// ## Production Usage
499 ///
500 /// ```rust
501 /// use ingest::IngestConfig;
502 ///
503 /// fn main() -> Result<(), Box<dyn std::error::Error>> {
504 /// let config = load_config()?;
505 /// config.validate()?;
506 /// // Continue with valid config...
507 /// Ok(())
508 /// }
509 ///
510 /// fn load_config() -> anyhow::Result<IngestConfig> {
511 /// // Load from file, env vars, etc.
512 /// Ok(IngestConfig::default())
513 /// }
514 /// ```
515 pub fn validate(&self) -> Result<(), ConfigError> {
516 if let (Some(normalized), Some(payload)) =
517 (self.max_normalized_bytes, self.max_payload_bytes)
518 {
519 if normalized > payload {
520 return Err(ConfigError::NormalizedExceedsPayload {
521 normalized,
522 payload,
523 });
524 }
525 }
526
527 Ok(())
528 }
529}