// ingest/types.rs

//! Core data model types for the ingest crate.
//!
//! These types represent the shape of ingest requests and the normalized
//! records that flow to downstream pipeline stages. They are designed to be:
//!
//! - **Serializable**: Support for JSON and binary formats via serde
//! - **Cloneable**: Cheap to clone for pipeline processing
//! - **Comparable**: Support equality checks for testing
//! - **Extensible**: Marked `#[non_exhaustive]` where appropriate
//!
//! # Type Hierarchy
//!
//! ```text
//! RawIngestRecord
//! ├── id: String
//! ├── source: IngestSource
//! ├── metadata: IngestMetadata
//! │   ├── tenant_id: Option<String>
//! │   ├── doc_id: Option<String>
//! │   ├── received_at: Option<DateTime<Utc>>
//! │   ├── original_source: Option<String>
//! │   └── attributes: Option<Value>
//! └── payload: Option<IngestPayload>
//!     ├── Text(String)
//!     ├── TextBytes(Vec<u8>)
//!     └── Binary(Vec<u8>)
//!
//!         ↓ ingest()
//!
//! CanonicalIngestRecord
//! ├── id: String (sanitized)
//! ├── tenant_id: String (default applied)
//! ├── doc_id: String (derived or provided)
//! ├── received_at: DateTime<Utc> (default applied)
//! ├── original_source: Option<String> (sanitized)
//! ├── source: IngestSource
//! ├── normalized_payload: Option<CanonicalPayload>
//! │   ├── Text(String) (whitespace normalized)
//! │   └── Binary(Vec<u8>) (preserved)
//! └── attributes: Option<Value>
//! ```
//!
//! # Examples
//!
//! ## Creating a Raw Record
//!
//! ```rust
//! use ingest::{
//!     RawIngestRecord, IngestMetadata, IngestSource,
//!     IngestPayload
//! };
//! use chrono::Utc;
//!
//! let record = RawIngestRecord {
//!     id: "doc-001".to_string(),
//!     source: IngestSource::RawText,
//!     metadata: IngestMetadata {
//!         tenant_id: Some("acme-corp".to_string()),
//!         doc_id: Some("report-q4-2024".to_string()),
//!         received_at: Some(Utc::now()),
//!         original_source: None,
//!         attributes: None,
//!     },
//!     payload: Some(IngestPayload::Text(
//!         "Quarterly report content...".to_string()
//!     )),
//! };
//! ```
//!
//! ## Working with Canonical Records
//!
//! ```rust
//! use ingest::{CanonicalIngestRecord, CanonicalPayload};
//!
//! fn process_text(record: &CanonicalIngestRecord) -> Option<String> {
//!     match &record.normalized_payload {
//!         Some(CanonicalPayload::Text(text)) => Some(text.clone()),
//!         Some(CanonicalPayload::Binary(_)) => {
//!             println!("Skipping binary payload");
//!             None
//!         }
//!         None => {
//!             println!("No payload");
//!             None
//!         }
//!         // `CanonicalPayload` is `#[non_exhaustive]`, so code outside the
//!         // `ingest` crate must include a catch-all arm for variants that
//!         // may be added in the future.
//!         Some(_) => None,
//!     }
//! }
//! ```
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
91
92/// Source kinds accepted at ingest time.
93///
94/// `IngestSource` identifies where content originated, which affects validation
95/// rules (e.g., whether a payload is required) and downstream processing.
96///
97/// # Source Types
98///
99/// - `RawText`: Plain text supplied directly (requires text payload)
100/// - `Url(String)`: Content from a URL (requires text payload)
101/// - `File { filename, content_type }`: Uploaded file (requires payload)
102/// - `Api`: Generic API call (payload optional)
103///
104/// # Payload Requirements
105///
106/// | Source | Payload Required | Text Required |
107/// |--------|-----------------|---------------|
108/// | `RawText` | Yes | Yes |
109/// | `Url` | Yes | Yes |
110/// | `File` | Yes | No |
111/// | `Api` | No | No |
112///
113/// # Examples
114///
115/// ```rust
116/// use ingest::IngestSource;
117///
118/// // Raw text input
119/// let source = IngestSource::RawText;
120///
121/// // URL-sourced content
122/// let source = IngestSource::Url("https://example.com/page".to_string());
123///
124/// // File upload
125/// let source = IngestSource::File {
126///     filename: "document.pdf".to_string(),
127///     content_type: Some("application/pdf".to_string()),
128/// };
129///
130/// // Generic API call
131/// let source = IngestSource::Api;
132/// ```
133#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
134#[non_exhaustive]
135pub enum IngestSource {
136    /// Plain text supplied directly in the request body.
137    ///
138    /// This source requires a text payload. The content will be whitespace-normalized
139    /// during ingest.
140    ///
141    /// # Example
142    ///
143    /// ```rust
144    /// use ingest::{IngestSource, IngestPayload, RawIngestRecord, IngestMetadata};
145    ///
146    /// let record = RawIngestRecord {
147    ///     id: "text-001".to_string(),
148    ///     source: IngestSource::RawText,
149    ///     metadata: IngestMetadata {
150    ///         tenant_id: Some("tenant".to_string()),
151    ///         doc_id: Some("doc".to_string()),
152    ///         received_at: None,
153    ///         original_source: None,
154    ///         attributes: None,
155    ///     },
156    ///     payload: Some(IngestPayload::Text("Hello world".to_string())),
157    /// };
158    /// ```
159    RawText,
160
161    /// Content logically associated with a URL.
162    ///
163    /// This source requires a text payload and is typically used for content
164    /// crawled from web pages.
165    ///
166    /// # Example
167    ///
168    /// ```rust
169    /// use ingest::IngestSource;
170    ///
171    /// let source = IngestSource::Url(
172    ///     "https://example.com/article".to_string()
173    /// );
174    /// ```
175    Url(String),
176
177    /// An uploaded file with associated metadata.
178    ///
179    /// This source requires a payload (text or binary) and captures file metadata
180    /// for downstream processing.
181    ///
182    /// # Fields
183    ///
184    /// - `filename`: The original filename
185    /// - `content_type`: Optional MIME type (e.g., "application/pdf")
186    ///
187    /// # Example
188    ///
189    /// ```rust
190    /// use ingest::{IngestSource, IngestPayload, RawIngestRecord, IngestMetadata};
191    ///
192    /// let record = RawIngestRecord {
193    ///     id: "file-001".to_string(),
194    ///     source: IngestSource::File {
195    ///         filename: "report.pdf".to_string(),
196    ///         content_type: Some("application/pdf".to_string()),
197    ///     },
198    ///     metadata: IngestMetadata {
199    ///         tenant_id: Some("tenant".to_string()),
200    ///         doc_id: Some("doc-123".to_string()),
201    ///         received_at: None,
202    ///         original_source: Some("uploads/report.pdf".to_string()),
203    ///         attributes: None,
204    ///     },
205    ///     payload: Some(IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47])), // PNG header
206    /// };
207    /// ```
208    File {
209        /// The original filename of the uploaded file.
210        filename: String,
211        /// Optional MIME type of the file (e.g., "application/pdf", "image/png").
212        content_type: Option<String>,
213    },
214
215    /// Catch-all for ingests originating from an API call.
216    ///
217    /// Unlike other sources, `Api` does not require a payload, making it suitable
218    /// for metadata-only events or API calls without content.
219    ///
220    /// # Example
221    ///
222    /// ```rust
223    /// use ingest::{IngestSource, RawIngestRecord, IngestMetadata};
224    ///
225    /// let record = RawIngestRecord {
226    ///     id: "api-001".to_string(),
227    ///     source: IngestSource::Api,
228    ///     metadata: IngestMetadata {
229    ///         tenant_id: Some("tenant".to_string()),
230    ///         doc_id: Some("doc".to_string()),
231    ///         received_at: None,
232    ///         original_source: None,
233    ///         attributes: Some(serde_json::json!({"event": "user_action"})),
234    ///     },
235    ///     payload: None, // Optional for Api source
236    /// };
237    /// ```
238    Api,
239}
240
241/// Metadata associated with an ingest request.
242///
243/// `IngestMetadata` carries contextual information about the content being ingested.
244/// All fields are optional and will be defaulted during normalization if not provided.
245///
246/// # Field Defaults
247///
248/// | Field | Default Behavior |
249/// |-------|------------------|
250/// | `tenant_id` | Falls back to `IngestConfig::default_tenant_id` |
251/// | `doc_id` | Derived via UUIDv5 if not provided |
252/// | `received_at` | Set to current UTC time |
253/// | `original_source` | Remains `None` if not provided |
254/// | `attributes` | Remains `None` if not provided |
255///
256/// # Examples
257///
258/// ## Minimal Metadata
259///
260/// ```rust
261/// use ingest::IngestMetadata;
262///
263/// let metadata = IngestMetadata {
264///     tenant_id: None,
265///     doc_id: None,
266///     received_at: None,
267///     original_source: None,
268///     attributes: None,
269/// };
270/// // All fields will be defaulted during ingest
271/// ```
272///
273/// ## Full Metadata
274///
275/// ```rust
276/// use ingest::IngestMetadata;
277/// use chrono::Utc;
278/// use serde_json::json;
279///
280/// let metadata = IngestMetadata {
281///     tenant_id: Some("acme-corp".to_string()),
282///     doc_id: Some("report-q4-2024".to_string()),
283///     received_at: Some(Utc::now()),
284///     original_source: Some("https://docs.example.com/reports/q4".to_string()),
285///     attributes: Some(json!({
286///         "department": "Engineering",
287///         "classification": "internal",
288///         "tags": ["quarterly", "2024"]
289///     })),
290/// };
291/// ```
292#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
293pub struct IngestMetadata {
294    /// Optional tenant identifier for multi-tenant isolation.
295    ///
296    /// When `None` or empty after sanitization, falls back to
297    /// `IngestConfig::default_tenant_id`.
298    ///
299    /// # Example
300    ///
301    /// ```rust
302    /// use ingest::IngestMetadata;
303    ///
304    /// let metadata = IngestMetadata {
305    ///     tenant_id: Some("tenant-123".to_string()),
306    ///     doc_id: None,
307    ///     received_at: None,
308    ///     original_source: None,
309    ///     attributes: None,
310    /// };
311    /// ```
312    pub tenant_id: Option<String>,
313
314    /// Optional document identifier.
315    ///
316    /// When `None` or empty after sanitization, a deterministic UUIDv5 is generated
317    /// using `IngestConfig::doc_id_namespace`:
318    /// `UUIDv5(namespace, tenant_id + "\0" + record_id)`
319    ///
320    /// # Example
321    ///
322    /// ```rust
323    /// use ingest::IngestMetadata;
324    ///
325    /// let metadata = IngestMetadata {
326    ///     tenant_id: None,
327    ///     doc_id: Some("doc-abc-123".to_string()),
328    ///     received_at: None,
329    ///     original_source: None,
330    ///     attributes: None,
331    /// };
332    /// ```
333    pub doc_id: Option<String>,
334
335    /// Optional timestamp when the content was received.
336    ///
337    /// When `None`, defaults to the current UTC time at ingest.
338    /// Can be validated against future time if
339    /// [`MetadataPolicy::reject_future_timestamps`](crate::MetadataPolicy::reject_future_timestamps)
340    /// is enabled.
341    ///
342    /// # Example
343    ///
344    /// ```rust
345    /// use ingest::IngestMetadata;
346    /// use chrono::Utc;
347    ///
348    /// let metadata = IngestMetadata {
349    ///     tenant_id: None,
350    ///     doc_id: None,
351    ///     received_at: Some(Utc::now()),
352    ///     original_source: None,
353    ///     attributes: None,
354    /// };
355    /// ```
356    pub received_at: Option<DateTime<Utc>>,
357
358    /// Optional original source identifier (e.g., URL or external ID).
359    ///
360    /// This is a human-readable reference to where the content originated.
361    /// Control characters are stripped during sanitization.
362    ///
363    /// # Example
364    ///
365    /// ```rust
366    /// use ingest::IngestMetadata;
367    ///
368    /// let metadata = IngestMetadata {
369    ///     tenant_id: None,
370    ///     doc_id: None,
371    ///     received_at: None,
372    ///     original_source: Some("https://example.com/source".to_string()),
373    ///     attributes: None,
374    /// };
375    /// ```
376    pub original_source: Option<String>,
377
378    /// Arbitrary JSON attributes for extensibility.
379    ///
380    /// This field can store any JSON-serializable data for application-specific
381    /// use cases. Size is limited by
382    /// [`MetadataPolicy::max_attribute_bytes`](crate::MetadataPolicy::max_attribute_bytes)
383    /// when configured.
384    ///
385    /// # Example
386    ///
387    /// ```rust
388    /// use ingest::IngestMetadata;
389    /// use serde_json::json;
390    ///
391    /// let metadata = IngestMetadata {
392    ///     tenant_id: None,
393    ///     doc_id: None,
394    ///     received_at: None,
395    ///     original_source: None,
396    ///     attributes: Some(json!({
397    ///         "category": "report",
398    ///         "priority": "high",
399    ///         "metadata": {
400    ///             "author": "Jane Smith",
401    ///             "department": "Engineering"
402    ///         }
403    ///     })),
404    /// };
405    /// ```
406    pub attributes: Option<serde_json::Value>,
407}
408
409/// The inbound record for ingest.
410///
411/// `RawIngestRecord` is the primary input type for the ingest pipeline. It contains
412/// all information needed to process content: identification, source metadata, and
413/// optional payload.
414///
415/// # Lifecycle
416///
417/// 1. Create `RawIngestRecord` with raw data
418/// 2. Call [`ingest()`](crate::ingest) to normalize
419/// 3. Receive [`CanonicalIngestRecord`] for downstream processing
420///
421/// # Examples
422///
423/// ## Text Content
424///
425/// ```rust
426/// use ingest::{RawIngestRecord, IngestMetadata, IngestSource, IngestPayload};
427/// use chrono::Utc;
428///
429/// let record = RawIngestRecord {
430///     id: "text-001".to_string(),
431///     source: IngestSource::RawText,
432///     metadata: IngestMetadata {
433///         tenant_id: Some("tenant".to_string()),
434///         doc_id: Some("doc".to_string()),
435///         received_at: Some(Utc::now()),
436///         original_source: None,
437///         attributes: None,
438///     },
439///     payload: Some(IngestPayload::Text(
440///         "  Content with   extra spaces  ".to_string()
441///     )),
442/// };
443/// ```
444///
445/// ## Binary File
446///
447/// ```rust
448/// use ingest::{RawIngestRecord, IngestMetadata, IngestSource, IngestPayload};
449///
450/// let record = RawIngestRecord {
451///     id: "file-001".to_string(),
452///     source: IngestSource::File {
453///         filename: "image.png".to_string(),
454///         content_type: Some("image/png".to_string()),
455///     },
456///     metadata: IngestMetadata {
457///         tenant_id: Some("tenant".to_string()),
458///         doc_id: Some("doc-123".to_string()),
459///         received_at: None,
460///         original_source: Some("uploads/image.png".to_string()),
461///         attributes: None,
462///     },
463///     payload: Some(IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47])),
464/// };
465/// ```
466#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
467pub struct RawIngestRecord {
468    /// Unique identifier for this ingest operation.
469    ///
470    /// This ID is used for:
471    /// - Tracing and log correlation
472    /// - Deterministic document ID derivation (when `doc_id` not provided)
473    /// - Deduplication and idempotency
474    ///
475    /// Should be unique per ingest request. If not provided, a UUID should be
476    /// generated by the caller.
477    ///
478    /// # Example
479    ///
480    /// ```rust
481    /// use ingest::RawIngestRecord;
482    ///
483    /// let record = RawIngestRecord {
484    ///     id: "ingest-550e8400-e29b-41d4-a716-446655440000".to_string(),
485    ///     ..Default::default()
486    /// };
487    /// ```
488    pub id: String,
489
490    /// Source of the content.
491    ///
492    /// Indicates where the content came from and affects validation rules.
493    /// See [`IngestSource`] for details.
494    pub source: IngestSource,
495
496    /// Metadata associated with the record.
497    ///
498    /// Contains contextual information like tenant, timestamps, and custom attributes.
499    /// See [`IngestMetadata`] for details.
500    pub metadata: IngestMetadata,
501
502    /// Raw payload content.
503    ///
504    /// The actual content being ingested. May be `None` for metadata-only events
505    /// (e.g., `IngestSource::Api`).
506    ///
507    /// See [`IngestPayload`] for the different payload types.
508    pub payload: Option<IngestPayload>,
509}
510
511impl Default for RawIngestRecord {
512    fn default() -> Self {
513        Self {
514            id: String::new(),
515            source: IngestSource::Api,
516            metadata: IngestMetadata {
517                tenant_id: None,
518                doc_id: None,
519                received_at: None,
520                original_source: None,
521                attributes: None,
522            },
523            payload: None,
524        }
525    }
526}
527
528/// Raw payload content provided during ingest.
529///
530/// `IngestPayload` supports multi-modal content ingestion, allowing the same
531/// pipeline to handle text and binary data uniformly.
532///
533/// # Payload Types
534///
535/// - `Text(String)`: Clean UTF-8 text (will be whitespace-normalized)
536/// - `TextBytes(Vec<u8>)`: Raw bytes expected to be valid UTF-8 (will be validated + normalized)
537/// - `Binary(Vec<u8>)`: Arbitrary binary data (passed through unchanged)
538///
539/// # Processing
540///
541/// | Variant | Validation | Normalization | Size Limits |
542/// |---------|-----------|---------------|-------------|
543/// | `Text` | None | Whitespace collapsed | Both limits |
544/// | `TextBytes` | UTF-8 | Whitespace collapsed | Both limits |
545/// | `Binary` | Non-empty | None | Raw limit only |
546///
547/// # Examples
548///
549/// ```rust
550/// use ingest::IngestPayload;
551///
552/// // Text payload
553/// let text = IngestPayload::Text("Hello world".to_string());
554///
555/// // Text from bytes (validates UTF-8)
556/// let text_bytes = IngestPayload::TextBytes(b"Hello world".to_vec());
557///
558/// // Binary payload (preserved as-is)
559/// let binary = IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]); // PNG magic
560/// ```
561#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
562#[non_exhaustive]
563pub enum IngestPayload {
564    /// UTF-8 text payload for normalization and canonicalization.
565    ///
566    /// This is the preferred variant for text content. The text will have
567    /// whitespace collapsed during ingest.
568    ///
569    /// # Example
570    ///
571    /// ```rust
572    /// use ingest::IngestPayload;
573    ///
574    /// let payload = IngestPayload::Text(
575    ///     "  Content with   extra whitespace  ".to_string()
576    /// );
577    /// // After ingest: "Content with extra whitespace"
578    /// ```
579    Text(String),
580
581    /// Raw UTF-8 bytes that will be decoded during ingest.
582    ///
583    /// Use this variant when you have bytes that should be valid UTF-8 but
584    /// need validation. Invalid UTF-8 will result in
585    /// [`IngestError::InvalidUtf8`](crate::IngestError::InvalidUtf8).
586    ///
587    /// # Example
588    ///
589    /// ```rust
590    /// use ingest::IngestPayload;
591    ///
592    /// let payload = IngestPayload::TextBytes(
593    ///     b"Hello from bytes".to_vec()
594    /// );
595    /// ```
596    ///
597    /// # Error
598    ///
599    /// ```rust
600    /// use ingest::{IngestPayload, ingest, IngestError};
601    /// use ingest::{RawIngestRecord, IngestMetadata, IngestSource, IngestConfig};
602    ///
603    /// let record = RawIngestRecord {
604    ///     id: "test".to_string(),
605    ///     source: IngestSource::RawText,
606    ///     metadata: IngestMetadata {
607    ///         tenant_id: Some("t".to_string()),
608    ///         doc_id: Some("d".to_string()),
609    ///         received_at: None,
610    ///         original_source: None,
611    ///         attributes: None,
612    ///     },
613    ///     payload: Some(IngestPayload::TextBytes(vec![0xFF, 0xFE])), // Invalid UTF-8
614    /// };
615    ///
616    /// // This will fail with InvalidUtf8
617    /// // let result = ingest(record, &IngestConfig::default());
618    /// ```
619    TextBytes(Vec<u8>),
620
621    /// Arbitrary binary payload for downstream processing.
622    ///
623    /// Binary payloads are passed through unmodified (except for emptiness check).
624    /// They are suitable for images, PDFs, audio files, and other non-text content.
625    ///
626    /// # Example
627    ///
628    /// ```rust
629    /// use ingest::IngestPayload;
630    ///
631    /// // PNG file header
632    /// let payload = IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]);
633    /// ```
634    ///
635    /// # Validation
636    ///
637    /// Empty binary payloads (zero bytes) are rejected with
638    /// [`IngestError::EmptyBinaryPayload`](crate::IngestError::EmptyBinaryPayload).
639    Binary(Vec<u8>),
640}
641
642/// Normalized record produced by ingest.
643///
644/// `CanonicalIngestRecord` is the output of the ingest pipeline. It represents
645/// a cleaned, validated, and deterministic version of the input that downstream
646/// stages can rely on.
647///
648/// # Guarantees
649///
650/// - All required fields are present (tenant_id, doc_id, received_at)
651/// - Metadata is sanitized (control characters stripped)
652/// - Payload is normalized (text whitespace collapsed, binary preserved)
653/// - Document ID is stable (derived deterministically if not provided)
654///
655/// # Examples
656///
657/// ```rust
658/// use ingest::{ingest, IngestConfig, RawIngestRecord, CanonicalPayload};
659/// use ingest::{IngestMetadata, IngestSource, IngestPayload};
660///
661/// let config = IngestConfig::default();
662/// let record = RawIngestRecord {
663///     id: "test-001".to_string(),
664///     source: IngestSource::RawText,
665///     metadata: IngestMetadata {
666///         tenant_id: Some("tenant".to_string()),
667///         doc_id: None, // Will be derived
668///         received_at: None, // Will default to now
669///         original_source: None,
670///         attributes: None,
671///     },
672///     payload: Some(IngestPayload::Text("  Hello   world  ".to_string())),
673/// };
674///
675/// let canonical = ingest(record, &config).unwrap();
676///
677/// // All fields are guaranteed present
678/// assert!(!canonical.tenant_id.is_empty());
679/// assert!(!canonical.doc_id.is_empty());
680///
681/// // Text is normalized
682/// match &canonical.normalized_payload {
683///     Some(CanonicalPayload::Text(text)) => {
684///         assert_eq!(text, "Hello world");
685///     }
686///     _ => panic!("Expected text payload"),
687/// }
688/// ```
689#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
690pub struct CanonicalIngestRecord {
691    /// Unique identifier for this ingest operation (mirrors [`RawIngestRecord::id`]).
692    ///
693    /// This is the sanitized version of the original ID (control characters stripped).
694    pub id: String,
695
696    /// Tenant identifier for multi-tenant isolation.
697    ///
698    /// This is the effective tenant ID after applying defaults:
699    /// - If provided and non-empty: the sanitized provided value
700    /// - Otherwise: `IngestConfig::default_tenant_id`
701    pub tenant_id: String,
702
703    /// Document identifier.
704    ///
705    /// This is the effective document ID after derivation:
706    /// - If provided and non-empty: the sanitized provided value
707    /// - Otherwise: UUIDv5 derived from tenant + record ID
708    pub doc_id: String,
709
710    /// Timestamp when the record was received.
711    ///
712    /// This is the effective timestamp after applying defaults:
713    /// - If provided: the sanitized provided value
714    /// - Otherwise: current UTC time at ingest
715    pub received_at: DateTime<Utc>,
716
717    /// Original source information if provided.
718    ///
719    /// Sanitized version of [`IngestMetadata::original_source`] with control
720    /// characters stripped. `None` if not provided.
721    pub original_source: Option<String>,
722
723    /// Source of the content (mirrors [`RawIngestRecord::source`]).
724    pub source: IngestSource,
725
726    /// Normalized payload ready for downstream stages.
727    ///
728    /// - For text: whitespace collapsed, size limits enforced
729    /// - For binary: preserved unchanged, non-empty check performed
730    /// - `None` if no payload was provided
731    pub normalized_payload: Option<CanonicalPayload>,
732
733    /// Attributes JSON preserved for downstream use.
734    ///
735    /// This is the sanitized and size-checked version of
736    /// [`IngestMetadata::attributes`]. `None` if not provided.
737    pub attributes: Option<serde_json::Value>,
738}
739
740impl CanonicalIngestRecord {
741    /// Returns true if this record has a text payload.
742    ///
743    /// # Example
744    ///
745    /// ```rust
746    /// use ingest::{CanonicalIngestRecord, CanonicalPayload};
747    ///
748    /// let record = CanonicalIngestRecord {
749    ///     id: "test".to_string(),
750    ///     tenant_id: "tenant".to_string(),
751    ///     doc_id: "doc".to_string(),
752    ///     received_at: chrono::Utc::now(),
753    ///     original_source: None,
754    ///     source: ingest::IngestSource::RawText,
755    ///     normalized_payload: Some(CanonicalPayload::Text("hello".to_string())),
756    ///     attributes: None,
757    /// };
758    ///
759    /// assert!(record.has_text_payload());
760    /// ```
761    pub fn has_text_payload(&self) -> bool {
762        matches!(self.normalized_payload, Some(CanonicalPayload::Text(_)))
763    }
764
765    /// Returns true if this record has a binary payload.
766    ///
767    /// # Example
768    ///
769    /// ```rust
770    /// use ingest::{CanonicalIngestRecord, CanonicalPayload};
771    ///
772    /// let record = CanonicalIngestRecord {
773    ///     id: "test".to_string(),
774    ///     tenant_id: "tenant".to_string(),
775    ///     doc_id: "doc".to_string(),
776    ///     received_at: chrono::Utc::now(),
777    ///     original_source: None,
778    ///     source: ingest::IngestSource::File {
779    ///         filename: "test.bin".to_string(),
780    ///         content_type: None,
781    ///     },
782    ///     normalized_payload: Some(CanonicalPayload::Binary(vec![1, 2, 3])),
783    ///     attributes: None,
784    /// };
785    ///
786    /// assert!(record.has_binary_payload());
787    /// ```
788    pub fn has_binary_payload(&self) -> bool {
789        matches!(self.normalized_payload, Some(CanonicalPayload::Binary(_)))
790    }
791
792    /// Returns the text payload if present, otherwise None.
793    ///
794    /// # Example
795    ///
796    /// ```rust
797    /// use ingest::{CanonicalIngestRecord, CanonicalPayload};
798    ///
799    /// let record = CanonicalIngestRecord {
800    ///     id: "test".to_string(),
801    ///     tenant_id: "tenant".to_string(),
802    ///     doc_id: "doc".to_string(),
803    ///     received_at: chrono::Utc::now(),
804    ///     original_source: None,
805    ///     source: ingest::IngestSource::RawText,
806    ///     normalized_payload: Some(CanonicalPayload::Text("hello world".to_string())),
807    ///     attributes: None,
808    /// };
809    ///
810    /// assert_eq!(record.text_payload(), Some("hello world"));
811    /// ```
812    pub fn text_payload(&self) -> Option<&str> {
813        match &self.normalized_payload {
814            Some(CanonicalPayload::Text(text)) => Some(text),
815            _ => None,
816        }
817    }
818
819    /// Returns the binary payload if present, otherwise None.
820    pub fn binary_payload(&self) -> Option<&[u8]> {
821        match &self.normalized_payload {
822            Some(CanonicalPayload::Binary(bytes)) => Some(bytes),
823            _ => None,
824        }
825    }
826}
827
828/// Normalized payload ready for downstream stages.
829///
830/// `CanonicalPayload` represents the final, processed form of ingest payload.
831/// Text payloads have whitespace normalized, while binary payloads pass through
832/// unchanged.
833///
834/// # Variants
835///
836/// - `Text(String)`: Normalized UTF-8 text with collapsed whitespace
837/// - `Binary(Vec<u8>)`: Binary payload preserved exactly
838///
839/// # Examples
840///
841/// ```rust
842/// use ingest::CanonicalPayload;
843///
844/// // Normalized text
845/// let text = CanonicalPayload::Text("Hello world".to_string());
846///
847/// // Preserved binary
848/// let binary = CanonicalPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]);
849/// ```
850#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
851#[non_exhaustive]
852pub enum CanonicalPayload {
853    /// Normalized UTF-8 text payload.
854    ///
855    /// Text has been through whitespace normalization (multiple spaces/tabs/newlines
856    /// collapsed to single spaces, leading/trailing whitespace trimmed).
857    ///
858    /// # Example
859    ///
860    /// ```rust
861    /// use ingest::CanonicalPayload;
862    ///
863    /// // This represents text that was "  Hello   world  " before normalization
864    /// let payload = CanonicalPayload::Text("Hello world".to_string());
865    /// ```
866    Text(String),
867
868    /// Binary payload preserved for downstream perceptual/semantic stages.
869    ///
870    /// Binary data (images, PDFs, audio, etc.) passes through ingest unchanged
871    /// except for the non-empty validation.
872    ///
873    /// # Example
874    ///
875    /// ```rust
876    /// use ingest::CanonicalPayload;
877    ///
878    /// let payload = CanonicalPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]);
879    /// ```
880    Binary(Vec<u8>),
881}
882
883impl CanonicalPayload {
884    /// Returns the length of the payload in bytes.
885    ///
886    /// # Example
887    ///
888    /// ```rust
889    /// use ingest::CanonicalPayload;
890    ///
891    /// let text = CanonicalPayload::Text("Hello".to_string());
892    /// assert_eq!(text.len(), 5);
893    ///
894    /// let binary = CanonicalPayload::Binary(vec![1, 2, 3, 4]);
895    /// assert_eq!(binary.len(), 4);
896    /// ```
897    pub fn len(&self) -> usize {
898        match self {
899            CanonicalPayload::Text(s) => s.len(),
900            CanonicalPayload::Binary(b) => b.len(),
901        }
902    }
903
904    /// Returns true if the payload is empty.
905    ///
906    /// Note: Empty payloads should never reach this stage (they are rejected
907    /// during ingest), but this method is provided for completeness.
908    pub fn is_empty(&self) -> bool {
909        self.len() == 0
910    }
911
912    /// Returns true if this is a text payload.
913    pub fn is_text(&self) -> bool {
914        matches!(self, CanonicalPayload::Text(_))
915    }
916
917    /// Returns true if this is a binary payload.
918    pub fn is_binary(&self) -> bool {
919        matches!(self, CanonicalPayload::Binary(_))
920    }
921}