Skip to main content

ingest/
config.rs

1//! Configuration types for the ingest pipeline.
2//!
3//! This module defines [`IngestConfig`] and [`MetadataPolicy`], which control how
4//! raw ingest requests are interpreted, defaulted, and constrained at runtime.
5//! These types are intended to be cheap to clone and easy to serialize from
6//! external configuration formats such as JSON, TOML, or YAML.
7//!
8//! # Quick Start
9//!
10//! ```rust
11//! use ingest::IngestConfig;
12//!
13//! // Use defaults for development
14//! let config = IngestConfig::default();
15//!
16//! // Validate before use
17//! config.validate().expect("Invalid configuration");
18//! ```
19//!
20//! # Production Configuration
21//!
22//! ```rust
23//! use ingest::{IngestConfig, MetadataPolicy, RequiredField};
24//! use uuid::Uuid;
25//!
26//! let config = IngestConfig {
27//!     version: 1,
28//!     default_tenant_id: "production".to_string(),
29//!     doc_id_namespace: Uuid::new_v5(
30//!         &Uuid::NAMESPACE_DNS,
31//!         b"myapp.example.com"
32//!     ),
33//!     strip_control_chars: true,
34//!     metadata_policy: MetadataPolicy {
35//!         required_fields: vec![
36//!             RequiredField::TenantId,
37//!             RequiredField::DocId,
38//!         ],
39//!         max_attribute_bytes: Some(1024 * 1024), // 1 MB
40//!         reject_future_timestamps: true,
41//!     },
42//!     max_payload_bytes: Some(100 * 1024 * 1024),     // 100 MB
43//!     max_normalized_bytes: Some(50 * 1024 * 1024),   // 50 MB
44//! };
45//!
46//! // Always validate at startup
47//! if let Err(e) = config.validate() {
48//!     eprintln!("Configuration error: {}", e);
49//!     std::process::exit(1);
50//! }
51//! ```
52use serde::{Deserialize, Serialize};
53use thiserror::Error;
54use uuid::Uuid;
55
56/// Runtime configuration for ingest behavior.
57///
58/// `IngestConfig` controls all aspects of the ingest pipeline including validation,
59/// normalization, size limits, and ID generation. It is designed to be cheap to clone
60/// and serializable for configuration management.
61///
62/// # Fields
63///
64/// - `version`: Semantic version for tracking configuration changes
65/// - `default_tenant_id`: Fallback tenant when metadata doesn't specify one
66/// - `doc_id_namespace`: UUID namespace for deterministic document ID generation
67/// - `strip_control_chars`: Whether to remove control characters from metadata
68/// - `metadata_policy`: Fine-grained metadata validation rules
69/// - `max_payload_bytes`: Maximum raw payload size (optional)
70/// - `max_normalized_bytes`: Maximum normalized text size (optional)
71///
72/// # Serialization
73///
74/// This struct supports JSON, TOML, and YAML serialization:
75///
76/// ```json
77/// {
78///   "version": 1,
79///   "default_tenant_id": "default",
80///   "strip_control_chars": true,
81///   "max_payload_bytes": 52428800,
82///   "max_normalized_bytes": 10485760,
83///   "metadata_policy": {
84///     "required_fields": ["TenantId", "DocId"],
85///     "max_attribute_bytes": 1048576,
86///     "reject_future_timestamps": true
87///   }
88/// }
89/// ```
90///
91/// # Examples
92///
93/// ## Default Configuration
94///
95/// ```rust
96/// use ingest::IngestConfig;
97/// use uuid::Uuid;
98///
99/// let config = IngestConfig::default();
100///
101/// assert_eq!(config.version, 1);
102/// assert_eq!(config.default_tenant_id, "default");
103/// assert_eq!(config.strip_control_chars, true);
104/// assert!(config.max_payload_bytes.is_none());
105/// assert!(config.max_normalized_bytes.is_none());
106/// ```
107///
108/// ## Custom Configuration
109///
110/// ```rust
111/// use ingest::{IngestConfig, MetadataPolicy, RequiredField};
112/// use uuid::Uuid;
113///
114/// let config = IngestConfig {
115///     version: 2,
116///     default_tenant_id: "my-app".to_string(),
117///     doc_id_namespace: Uuid::new_v5(
118///         &Uuid::NAMESPACE_DNS,
119///         b"my-app.example.com"
120///     ),
121///     strip_control_chars: true,
122///     metadata_policy: MetadataPolicy {
123///         required_fields: vec![RequiredField::TenantId],
124///         max_attribute_bytes: Some(65536),
125///         reject_future_timestamps: true,
126///     },
127///     max_payload_bytes: Some(10 * 1024 * 1024),
128///     max_normalized_bytes: Some(5 * 1024 * 1024),
129/// };
130///
131/// assert!(config.validate().is_ok());
132/// ```
133#[derive(Debug, Clone, Serialize, Deserialize)]
134pub struct IngestConfig {
135    /// Semantic version of the ingest configuration.
136    ///
137    /// This version number helps track configuration changes and can be used
138    /// for schema migration or feature flagging. Increment this when making
139    /// breaking changes to ingest behavior.
140    ///
141    /// Default: `1`
142    pub version: u32,
143
144    /// Default tenant ID to use when metadata doesn't specify one.
145    ///
146    /// This ensures every canonical record has a tenant identifier, enabling
147    /// multi-tenant isolation even when callers omit the tenant field.
148    ///
149    /// Default: `"default"`
150    pub default_tenant_id: String,
151
152    /// Namespace UUID for deterministic document ID generation.
153    ///
154    /// When `doc_id` is not provided in metadata, a UUIDv5 is derived using:
155    /// `UUIDv5(doc_id_namespace, tenant_id + "\0" + record_id)`
156    ///
157    /// Using a consistent namespace ensures that:
158    /// - The same content always gets the same ID (deterministic)
159    /// - Different applications don't collide (namespace isolation)
160    /// - Re-ingesting content is idempotent
161    ///
162    /// Default: [`Uuid::NAMESPACE_OID`]
163    pub doc_id_namespace: Uuid,
164
165    /// Whether to strip ASCII control characters from metadata strings.
166    ///
167    /// When `true`, control characters (0x00-0x1F and 0x7F) are removed from:
168    /// - `tenant_id`
169    /// - `doc_id`
170    /// - `original_source`
171    /// - `id` (record ID)
172    ///
173    /// This prevents log injection attacks and ensures metadata is safe for
174    /// downstream systems. It is strongly recommended to keep this enabled.
175    ///
176    /// Default: `true`
177    pub strip_control_chars: bool,
178
179    /// Additional metadata validation policies.
180    ///
181    /// Controls which fields are required, attribute size limits, and timestamp
182    /// validation rules.
183    ///
184    /// Default: [`MetadataPolicy::default()`]
185    #[serde(default)]
186    pub metadata_policy: MetadataPolicy,
187
188    /// Maximum raw payload byte length allowed.
189    ///
190    /// If set, payloads exceeding this limit are rejected with
191    /// `IngestError::PayloadTooLarge` before any processing.
192    ///
193    /// This check is performed on the raw payload size before normalization
194    /// (whitespace collapsing, UTF-8 decoding, etc.).
195    ///
196    /// # Size Recommendations
197    ///
198    /// - Small text: 1-10 MB
199    /// - Documents: 50-100 MB
200    /// - Large files: 500 MB - 1 GB (if memory allows)
201    ///
202    /// Default: `None` (unlimited)
203    #[serde(default)]
204    pub max_payload_bytes: Option<usize>,
205
206    /// Maximum normalized payload byte length allowed.
207    ///
208    /// If set, text payloads exceeding this limit after whitespace normalization
209    /// are rejected with `IngestError::PayloadTooLarge`.
210    ///
211    /// This is useful for enforcing limits on processed content size, which
212    /// may differ from raw size due to whitespace collapsing.
213    ///
214    /// # Constraint
215    ///
216    /// Must be less than or equal to `max_payload_bytes` (validated by
217    /// [`IngestConfig::validate()`]).
218    ///
219    /// Default: `None` (unlimited)
220    #[serde(default)]
221    pub max_normalized_bytes: Option<usize>,
222}
223
224/// Controls which metadata fields must be present and how optional blobs are constrained.
225///
226/// `MetadataPolicy` provides fine-grained control over metadata validation,
227/// allowing you to enforce business rules such as required fields, size limits,
228/// and timestamp constraints.
229///
230/// # Examples
231///
232/// ## Strict Policy
233///
234/// ```rust
235/// use ingest::{MetadataPolicy, RequiredField};
236///
237/// let strict_policy = MetadataPolicy {
238///     required_fields: vec![
239///         RequiredField::TenantId,
240///         RequiredField::DocId,
241///         RequiredField::ReceivedAt,
242///         RequiredField::OriginalSource,
243///     ],
244///     max_attribute_bytes: Some(1024),
245///     reject_future_timestamps: true,
246/// };
247/// ```
248///
249/// ## Lenient Policy
250///
251/// ```rust
252/// use ingest::MetadataPolicy;
253///
254/// let lenient_policy = MetadataPolicy::default();
255/// // All fields optional, no size limits, future timestamps allowed
256/// ```
257#[derive(Debug, Clone, Serialize, Deserialize, Default)]
258#[serde(default)]
259pub struct MetadataPolicy {
260    /// Metadata fields that must be provided by the caller (after sanitization).
261    ///
262    /// If a required field is missing or empty after control character stripping,
263    /// ingest fails with `IngestError::InvalidMetadata`.
264    ///
265    /// # Example
266    ///
267    /// ```rust
268    /// use ingest::{MetadataPolicy, RequiredField};
269    ///
270    /// let policy = MetadataPolicy {
271    ///     required_fields: vec![RequiredField::TenantId, RequiredField::DocId],
272    ///     ..Default::default()
273    /// };
274    /// ```
275    ///
276    /// Default: empty vector (no required fields)
277    pub required_fields: Vec<RequiredField>,
278
279    /// Maximum serialized byte length allowed for `metadata.attributes`.
280    ///
281    /// If set, the JSON-serialized size of the attributes field must not exceed
282    /// this limit. This protects downstream systems from very large metadata blobs.
283    ///
284    /// # Example
285    ///
286    /// ```rust
287    /// use ingest::MetadataPolicy;
288    ///
289    /// let policy = MetadataPolicy {
290    ///     max_attribute_bytes: Some(1024 * 1024), // 1 MB
291    ///     ..Default::default()
292    /// };
293    /// ```
294    ///
295    /// Default: `None` (unlimited)
296    pub max_attribute_bytes: Option<usize>,
297
298    /// Reject ingests with timestamps that lie in the future.
299    ///
300    /// When `true`, if `received_at` is strictly greater than the current time,
301    /// ingest fails with `IngestError::InvalidMetadata` containing "future".
302    ///
303    /// This is useful for detecting clock skew or preventing future-dated content
304    /// from entering the system.
305    ///
306    /// Default: `false`
307    pub reject_future_timestamps: bool,
308}
309
310/// Metadata identifiers that can be enforced via [`MetadataPolicy`].
311///
312/// This enum defines the metadata fields that can be marked as required.
313/// It is marked `#[non_exhaustive]` to allow future additions without
314/// breaking existing code.
315///
316/// # Required Fields
317///
318/// - `TenantId`: Tenant identifier for multi-tenant isolation
319/// - `DocId`: Document identifier (caller must provide, no derivation)
320/// - `ReceivedAt`: Timestamp when content was received
321/// - `OriginalSource`: Human-readable source reference
322///
323/// # Examples
324///
325/// ```rust
326/// use ingest::{MetadataPolicy, RequiredField};
327///
328/// let policy = MetadataPolicy {
329///     required_fields: vec![
330///         RequiredField::TenantId,
331///         RequiredField::DocId,
332///     ],
333///     ..Default::default()
334/// };
335/// ```
336#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
337#[non_exhaustive]
338pub enum RequiredField {
339    /// Require the `tenant_id` field to be present in metadata.
340    ///
341    /// When required, callers must explicitly provide a non-empty tenant ID.
342    /// The `default_tenant_id` fallback is not used.
343    TenantId,
344
345    /// Require the `doc_id` field to be present in metadata.
346    ///
347    /// When required, callers must explicitly provide a document ID.
348    /// No UUIDv5 derivation is performed.
349    DocId,
350
351    /// Require the `received_at` timestamp to be present in metadata.
352    ///
353    /// When required, callers must provide a timestamp. The default
354    /// (current time) is not applied.
355    ReceivedAt,
356
357    /// Require the `original_source` field to be present in metadata.
358    ///
359    /// When required, callers must provide a source reference.
360    OriginalSource,
361}
362
363/// Errors that can occur when validating an [`IngestConfig`].
364///
365/// These are configuration-time issues and are intended to be surfaced during
366/// service start-up rather than at request time. They indicate misconfiguration
367/// that should be fixed before handling live traffic.
368///
369/// # Examples
370///
371/// ```rust
372/// use ingest::{IngestConfig, ConfigError};
373///
374/// let bad_config = IngestConfig {
375///     max_payload_bytes: Some(100),
376///     max_normalized_bytes: Some(200), // Invalid: exceeds raw limit
377///     ..Default::default()
378/// };
379///
380/// match bad_config.validate() {
381///     Err(ConfigError::NormalizedExceedsPayload { normalized, payload }) => {
382///         println!("Config error: normalized ({}) > payload ({})",
383///                  normalized, payload);
384///     }
385///     Ok(()) => println!("Config is valid"),
386/// }
387/// ```
388#[derive(Debug, Error, Clone, PartialEq, Eq)]
389#[non_exhaustive]
390pub enum ConfigError {
391    /// The configured `max_normalized_bytes` is larger than `max_payload_bytes`.
392    ///
393    /// This violates the expectation that normalized text should always be
394    /// bounded by the raw payload size limit and usually indicates a
395    /// misconfiguration.
396    ///
397    /// # Example
398    ///
399    /// This error occurs when:
400    /// ```rust,ignore
401    /// max_payload_bytes: Some(100),
402    /// max_normalized_bytes: Some(200), // ERROR: exceeds raw limit
403    /// ```
404    #[error(
405        "max_normalized_bytes ({normalized}) exceeds max_payload_bytes ({payload}); \
406         normalized payload must not exceed the raw payload limit"
407    )]
408    NormalizedExceedsPayload {
409        /// Configured upper bound for normalized text payloads, in bytes.
410        normalized: usize,
411        /// Configured upper bound for raw payloads, in bytes.
412        payload: usize,
413    },
414}
415
416impl Default for IngestConfig {
417    /// Creates a default `IngestConfig` suitable for development.
418    ///
419    /// # Defaults
420    ///
421    /// - `version`: 1
422    /// - `default_tenant_id`: "default"
423    /// - `doc_id_namespace`: `Uuid::NAMESPACE_OID`
424    /// - `strip_control_chars`: true
425    /// - `metadata_policy`: default (no required fields, no limits)
426    /// - `max_payload_bytes`: None (unlimited)
427    /// - `max_normalized_bytes`: None (unlimited)
428    ///
429    /// # Example
430    ///
431    /// ```rust
432    /// use ingest::IngestConfig;
433    ///
434    /// let config = IngestConfig::default();
435    /// assert_eq!(config.version, 1);
436    /// assert_eq!(config.default_tenant_id, "default");
437    /// assert!(config.strip_control_chars);
438    /// ```
439    fn default() -> Self {
440        Self {
441            version: 1,
442            default_tenant_id: "default".into(),
443            doc_id_namespace: Uuid::NAMESPACE_OID,
444            strip_control_chars: true,
445            metadata_policy: MetadataPolicy::default(),
446            max_payload_bytes: None,
447            max_normalized_bytes: None,
448        }
449    }
450}
451
452impl IngestConfig {
453    /// Validates internal consistency of this configuration.
454    ///
455    /// This method checks for logical errors in the configuration that would
456    /// cause runtime issues. It is inexpensive and should be called at process
457    /// start-up to catch misconfigurations before handling live ingest traffic.
458    ///
459    /// # Validation Rules
460    ///
461    /// 1. `max_normalized_bytes` must be ≤ `max_payload_bytes` (if both are set)
462    ///
463    /// # Returns
464    ///
465    /// - `Ok(())` if configuration is valid
466    /// - `Err(ConfigError)` describing the validation failure
467    ///
468    /// # Performance
469    ///
470    /// This method performs only in-memory checks with O(1) complexity.
471    /// No I/O is performed.
472    ///
473    /// # Examples
474    ///
475    /// ## Valid Configuration
476    ///
477    /// ```rust
478    /// use ingest::IngestConfig;
479    ///
480    /// let config = IngestConfig::default();
481    /// assert!(config.validate().is_ok());
482    /// ```
483    ///
484    /// ## Invalid Configuration
485    ///
486    /// ```rust
487    /// use ingest::IngestConfig;
488    ///
489    /// let invalid_config = IngestConfig {
490    ///     max_payload_bytes: Some(100),
491    ///     max_normalized_bytes: Some(200), // Invalid!
492    ///     ..Default::default()
493    /// };
494    ///
495    /// assert!(invalid_config.validate().is_err());
496    /// ```
497    ///
498    /// ## Production Usage
499    ///
500    /// ```rust
501    /// use ingest::IngestConfig;
502    ///
503    /// fn main() -> Result<(), Box<dyn std::error::Error>> {
504    ///     let config = load_config()?;
505    ///     config.validate()?;
506    ///     // Continue with valid config...
507    ///     Ok(())
508    /// }
509    ///
510    /// fn load_config() -> anyhow::Result<IngestConfig> {
511    ///     // Load from file, env vars, etc.
512    ///     Ok(IngestConfig::default())
513    /// }
514    /// ```
515    pub fn validate(&self) -> Result<(), ConfigError> {
516        if let (Some(normalized), Some(payload)) =
517            (self.max_normalized_bytes, self.max_payload_bytes)
518        {
519            if normalized > payload {
520                return Err(ConfigError::NormalizedExceedsPayload {
521                    normalized,
522                    payload,
523                });
524            }
525        }
526
527        Ok(())
528    }
529}