ingest/types.rs
1//! Core data model types for the ingest crate.
2//!
3//! These types represent the shape of ingest requests and the normalized
4//! records that flow to downstream pipeline stages. They are designed to be:
5//!
//! - **Serializable**: Serialize to and deserialize from JSON and binary formats via serde
//! - **Cloneable**: Every type implements `Clone` so records can be duplicated between pipeline stages (payload buffers are copied)
8//! - **Comparable**: Support equality checks for testing
9//! - **Extensible**: Marked `#[non_exhaustive]` where appropriate
10//!
11//! # Type Hierarchy
12//!
13//! ```text
//! RawIngestRecord
//! ├── id: String
//! ├── source: IngestSource
//! ├── metadata: IngestMetadata
//! │   ├── tenant_id: Option<String>
//! │   ├── doc_id: Option<String>
//! │   ├── received_at: Option<DateTime<Utc>>
//! │   ├── original_source: Option<String>
//! │   └── attributes: Option<Value>
//! └── payload: Option<IngestPayload>
//!     ├── Text(String)
//!     ├── TextBytes(Vec<u8>)
//!     └── Binary(Vec<u8>)
//!
//!         ↓ ingest()
//!
//! CanonicalIngestRecord
//! ├── id: String (sanitized)
//! ├── tenant_id: String (default applied)
//! ├── doc_id: String (derived or provided)
//! ├── received_at: DateTime<Utc> (default applied)
//! ├── original_source: Option<String> (sanitized)
//! ├── source: IngestSource
//! ├── normalized_payload: Option<CanonicalPayload>
//! │   ├── Text(String) (whitespace normalized)
//! │   └── Binary(Vec<u8>) (preserved)
//! └── attributes: Option<Value>
41//! ```
42//!
43//! # Examples
44//!
45//! ## Creating a Raw Record
46//!
47//! ```rust
48//! use ingest::{
49//! RawIngestRecord, IngestMetadata, IngestSource,
50//! IngestPayload
51//! };
52//! use chrono::Utc;
53//!
54//! let record = RawIngestRecord {
55//! id: "doc-001".to_string(),
56//! source: IngestSource::RawText,
57//! metadata: IngestMetadata {
58//! tenant_id: Some("acme-corp".to_string()),
59//! doc_id: Some("report-q4-2024".to_string()),
60//! received_at: Some(Utc::now()),
61//! original_source: None,
62//! attributes: None,
63//! },
64//! payload: Some(IngestPayload::Text(
65//! "Quarterly report content...".to_string()
66//! )),
67//! };
68//! ```
69//!
70//! ## Working with Canonical Records
71//!
72//! ```rust
73//! use ingest::{CanonicalIngestRecord, CanonicalPayload};
74//!
//! fn process_text(record: &CanonicalIngestRecord) -> Option<String> {
//!     match &record.normalized_payload {
//!         Some(CanonicalPayload::Text(text)) => Some(text.clone()),
//!         Some(CanonicalPayload::Binary(_)) => {
//!             println!("Skipping binary payload");
//!             None
//!         }
//!         // `CanonicalPayload` is `#[non_exhaustive]`, so code outside this
//!         // crate must also handle variants that may be added later.
//!         Some(_) => None,
//!         None => {
//!             println!("No payload");
//!             None
//!         }
//!     }
//! }
88//! ```
89use chrono::{DateTime, Utc};
90use serde::{Deserialize, Serialize};
91
92/// Source kinds accepted at ingest time.
93///
94/// `IngestSource` identifies where content originated, which affects validation
95/// rules (e.g., whether a payload is required) and downstream processing.
96///
97/// # Source Types
98///
99/// - `RawText`: Plain text supplied directly (requires text payload)
100/// - `Url(String)`: Content from a URL (requires text payload)
101/// - `File { filename, content_type }`: Uploaded file (requires payload)
102/// - `Api`: Generic API call (payload optional)
103///
104/// # Payload Requirements
105///
106/// | Source | Payload Required | Text Required |
107/// |--------|-----------------|---------------|
108/// | `RawText` | Yes | Yes |
109/// | `Url` | Yes | Yes |
110/// | `File` | Yes | No |
111/// | `Api` | No | No |
112///
113/// # Examples
114///
115/// ```rust
116/// use ingest::IngestSource;
117///
118/// // Raw text input
119/// let source = IngestSource::RawText;
120///
121/// // URL-sourced content
122/// let source = IngestSource::Url("https://example.com/page".to_string());
123///
124/// // File upload
125/// let source = IngestSource::File {
126/// filename: "document.pdf".to_string(),
127/// content_type: Some("application/pdf".to_string()),
128/// };
129///
130/// // Generic API call
131/// let source = IngestSource::Api;
132/// ```
133#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
134#[non_exhaustive]
135pub enum IngestSource {
136 /// Plain text supplied directly in the request body.
137 ///
138 /// This source requires a text payload. The content will be whitespace-normalized
139 /// during ingest.
140 ///
141 /// # Example
142 ///
143 /// ```rust
144 /// use ingest::{IngestSource, IngestPayload, RawIngestRecord, IngestMetadata};
145 ///
146 /// let record = RawIngestRecord {
147 /// id: "text-001".to_string(),
148 /// source: IngestSource::RawText,
149 /// metadata: IngestMetadata {
150 /// tenant_id: Some("tenant".to_string()),
151 /// doc_id: Some("doc".to_string()),
152 /// received_at: None,
153 /// original_source: None,
154 /// attributes: None,
155 /// },
156 /// payload: Some(IngestPayload::Text("Hello world".to_string())),
157 /// };
158 /// ```
159 RawText,
160
161 /// Content logically associated with a URL.
162 ///
163 /// This source requires a text payload and is typically used for content
164 /// crawled from web pages.
165 ///
166 /// # Example
167 ///
168 /// ```rust
169 /// use ingest::IngestSource;
170 ///
171 /// let source = IngestSource::Url(
172 /// "https://example.com/article".to_string()
173 /// );
174 /// ```
175 Url(String),
176
177 /// An uploaded file with associated metadata.
178 ///
179 /// This source requires a payload (text or binary) and captures file metadata
180 /// for downstream processing.
181 ///
182 /// # Fields
183 ///
184 /// - `filename`: The original filename
185 /// - `content_type`: Optional MIME type (e.g., "application/pdf")
186 ///
187 /// # Example
188 ///
189 /// ```rust
190 /// use ingest::{IngestSource, IngestPayload, RawIngestRecord, IngestMetadata};
191 ///
192 /// let record = RawIngestRecord {
193 /// id: "file-001".to_string(),
194 /// source: IngestSource::File {
195 /// filename: "report.pdf".to_string(),
196 /// content_type: Some("application/pdf".to_string()),
197 /// },
198 /// metadata: IngestMetadata {
199 /// tenant_id: Some("tenant".to_string()),
200 /// doc_id: Some("doc-123".to_string()),
201 /// received_at: None,
202 /// original_source: Some("uploads/report.pdf".to_string()),
203 /// attributes: None,
204 /// },
205 /// payload: Some(IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47])), // PNG header
206 /// };
207 /// ```
208 File {
209 /// The original filename of the uploaded file.
210 filename: String,
211 /// Optional MIME type of the file (e.g., "application/pdf", "image/png").
212 content_type: Option<String>,
213 },
214
215 /// Catch-all for ingests originating from an API call.
216 ///
217 /// Unlike other sources, `Api` does not require a payload, making it suitable
218 /// for metadata-only events or API calls without content.
219 ///
220 /// # Example
221 ///
222 /// ```rust
223 /// use ingest::{IngestSource, RawIngestRecord, IngestMetadata};
224 ///
225 /// let record = RawIngestRecord {
226 /// id: "api-001".to_string(),
227 /// source: IngestSource::Api,
228 /// metadata: IngestMetadata {
229 /// tenant_id: Some("tenant".to_string()),
230 /// doc_id: Some("doc".to_string()),
231 /// received_at: None,
232 /// original_source: None,
233 /// attributes: Some(serde_json::json!({"event": "user_action"})),
234 /// },
235 /// payload: None, // Optional for Api source
236 /// };
237 /// ```
238 Api,
239}
240
241/// Metadata associated with an ingest request.
242///
243/// `IngestMetadata` carries contextual information about the content being ingested.
244/// All fields are optional and will be defaulted during normalization if not provided.
245///
246/// # Field Defaults
247///
248/// | Field | Default Behavior |
249/// |-------|------------------|
250/// | `tenant_id` | Falls back to `IngestConfig::default_tenant_id` |
251/// | `doc_id` | Derived via UUIDv5 if not provided |
252/// | `received_at` | Set to current UTC time |
253/// | `original_source` | Remains `None` if not provided |
254/// | `attributes` | Remains `None` if not provided |
255///
256/// # Examples
257///
258/// ## Minimal Metadata
259///
260/// ```rust
261/// use ingest::IngestMetadata;
262///
263/// let metadata = IngestMetadata {
264/// tenant_id: None,
265/// doc_id: None,
266/// received_at: None,
267/// original_source: None,
268/// attributes: None,
269/// };
270/// // All fields will be defaulted during ingest
271/// ```
272///
273/// ## Full Metadata
274///
275/// ```rust
276/// use ingest::IngestMetadata;
277/// use chrono::Utc;
278/// use serde_json::json;
279///
280/// let metadata = IngestMetadata {
281/// tenant_id: Some("acme-corp".to_string()),
282/// doc_id: Some("report-q4-2024".to_string()),
283/// received_at: Some(Utc::now()),
284/// original_source: Some("https://docs.example.com/reports/q4".to_string()),
285/// attributes: Some(json!({
286/// "department": "Engineering",
287/// "classification": "internal",
288/// "tags": ["quarterly", "2024"]
289/// })),
290/// };
291/// ```
292#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
293pub struct IngestMetadata {
294 /// Optional tenant identifier for multi-tenant isolation.
295 ///
296 /// When `None` or empty after sanitization, falls back to
297 /// `IngestConfig::default_tenant_id`.
298 ///
299 /// # Example
300 ///
301 /// ```rust
302 /// use ingest::IngestMetadata;
303 ///
304 /// let metadata = IngestMetadata {
305 /// tenant_id: Some("tenant-123".to_string()),
306 /// doc_id: None,
307 /// received_at: None,
308 /// original_source: None,
309 /// attributes: None,
310 /// };
311 /// ```
312 pub tenant_id: Option<String>,
313
314 /// Optional document identifier.
315 ///
316 /// When `None` or empty after sanitization, a deterministic UUIDv5 is generated
317 /// using `IngestConfig::doc_id_namespace`:
318 /// `UUIDv5(namespace, tenant_id + "\0" + record_id)`
319 ///
320 /// # Example
321 ///
322 /// ```rust
323 /// use ingest::IngestMetadata;
324 ///
325 /// let metadata = IngestMetadata {
326 /// tenant_id: None,
327 /// doc_id: Some("doc-abc-123".to_string()),
328 /// received_at: None,
329 /// original_source: None,
330 /// attributes: None,
331 /// };
332 /// ```
333 pub doc_id: Option<String>,
334
335 /// Optional timestamp when the content was received.
336 ///
337 /// When `None`, defaults to the current UTC time at ingest.
338 /// Can be validated against future time if
339 /// [`MetadataPolicy::reject_future_timestamps`](crate::MetadataPolicy::reject_future_timestamps)
340 /// is enabled.
341 ///
342 /// # Example
343 ///
344 /// ```rust
345 /// use ingest::IngestMetadata;
346 /// use chrono::Utc;
347 ///
348 /// let metadata = IngestMetadata {
349 /// tenant_id: None,
350 /// doc_id: None,
351 /// received_at: Some(Utc::now()),
352 /// original_source: None,
353 /// attributes: None,
354 /// };
355 /// ```
356 pub received_at: Option<DateTime<Utc>>,
357
358 /// Optional original source identifier (e.g., URL or external ID).
359 ///
360 /// This is a human-readable reference to where the content originated.
361 /// Control characters are stripped during sanitization.
362 ///
363 /// # Example
364 ///
365 /// ```rust
366 /// use ingest::IngestMetadata;
367 ///
368 /// let metadata = IngestMetadata {
369 /// tenant_id: None,
370 /// doc_id: None,
371 /// received_at: None,
372 /// original_source: Some("https://example.com/source".to_string()),
373 /// attributes: None,
374 /// };
375 /// ```
376 pub original_source: Option<String>,
377
378 /// Arbitrary JSON attributes for extensibility.
379 ///
380 /// This field can store any JSON-serializable data for application-specific
381 /// use cases. Size is limited by
382 /// [`MetadataPolicy::max_attribute_bytes`](crate::MetadataPolicy::max_attribute_bytes)
383 /// when configured.
384 ///
385 /// # Example
386 ///
387 /// ```rust
388 /// use ingest::IngestMetadata;
389 /// use serde_json::json;
390 ///
391 /// let metadata = IngestMetadata {
392 /// tenant_id: None,
393 /// doc_id: None,
394 /// received_at: None,
395 /// original_source: None,
396 /// attributes: Some(json!({
397 /// "category": "report",
398 /// "priority": "high",
399 /// "metadata": {
400 /// "author": "Jane Smith",
401 /// "department": "Engineering"
402 /// }
403 /// })),
404 /// };
405 /// ```
406 pub attributes: Option<serde_json::Value>,
407}
408
409/// The inbound record for ingest.
410///
411/// `RawIngestRecord` is the primary input type for the ingest pipeline. It contains
412/// all information needed to process content: identification, source metadata, and
413/// optional payload.
414///
415/// # Lifecycle
416///
417/// 1. Create `RawIngestRecord` with raw data
418/// 2. Call [`ingest()`](crate::ingest) to normalize
419/// 3. Receive [`CanonicalIngestRecord`] for downstream processing
420///
421/// # Examples
422///
423/// ## Text Content
424///
425/// ```rust
426/// use ingest::{RawIngestRecord, IngestMetadata, IngestSource, IngestPayload};
427/// use chrono::Utc;
428///
429/// let record = RawIngestRecord {
430/// id: "text-001".to_string(),
431/// source: IngestSource::RawText,
432/// metadata: IngestMetadata {
433/// tenant_id: Some("tenant".to_string()),
434/// doc_id: Some("doc".to_string()),
435/// received_at: Some(Utc::now()),
436/// original_source: None,
437/// attributes: None,
438/// },
439/// payload: Some(IngestPayload::Text(
440/// " Content with extra spaces ".to_string()
441/// )),
442/// };
443/// ```
444///
445/// ## Binary File
446///
447/// ```rust
448/// use ingest::{RawIngestRecord, IngestMetadata, IngestSource, IngestPayload};
449///
450/// let record = RawIngestRecord {
451/// id: "file-001".to_string(),
452/// source: IngestSource::File {
453/// filename: "image.png".to_string(),
454/// content_type: Some("image/png".to_string()),
455/// },
456/// metadata: IngestMetadata {
457/// tenant_id: Some("tenant".to_string()),
458/// doc_id: Some("doc-123".to_string()),
459/// received_at: None,
460/// original_source: Some("uploads/image.png".to_string()),
461/// attributes: None,
462/// },
463/// payload: Some(IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47])),
464/// };
465/// ```
466#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
467pub struct RawIngestRecord {
468 /// Unique identifier for this ingest operation.
469 ///
470 /// This ID is used for:
471 /// - Tracing and log correlation
472 /// - Deterministic document ID derivation (when `doc_id` not provided)
473 /// - Deduplication and idempotency
474 ///
475 /// Should be unique per ingest request. If not provided, a UUID should be
476 /// generated by the caller.
477 ///
478 /// # Example
479 ///
480 /// ```rust
481 /// use ingest::RawIngestRecord;
482 ///
483 /// let record = RawIngestRecord {
484 /// id: "ingest-550e8400-e29b-41d4-a716-446655440000".to_string(),
485 /// ..Default::default()
486 /// };
487 /// ```
488 pub id: String,
489
490 /// Source of the content.
491 ///
492 /// Indicates where the content came from and affects validation rules.
493 /// See [`IngestSource`] for details.
494 pub source: IngestSource,
495
496 /// Metadata associated with the record.
497 ///
498 /// Contains contextual information like tenant, timestamps, and custom attributes.
499 /// See [`IngestMetadata`] for details.
500 pub metadata: IngestMetadata,
501
502 /// Raw payload content.
503 ///
504 /// The actual content being ingested. May be `None` for metadata-only events
505 /// (e.g., `IngestSource::Api`).
506 ///
507 /// See [`IngestPayload`] for the different payload types.
508 pub payload: Option<IngestPayload>,
509}
510
511impl Default for RawIngestRecord {
512 fn default() -> Self {
513 Self {
514 id: String::new(),
515 source: IngestSource::Api,
516 metadata: IngestMetadata {
517 tenant_id: None,
518 doc_id: None,
519 received_at: None,
520 original_source: None,
521 attributes: None,
522 },
523 payload: None,
524 }
525 }
526}
527
528/// Raw payload content provided during ingest.
529///
530/// `IngestPayload` supports multi-modal content ingestion, allowing the same
531/// pipeline to handle text and binary data uniformly.
532///
533/// # Payload Types
534///
535/// - `Text(String)`: Clean UTF-8 text (will be whitespace-normalized)
536/// - `TextBytes(Vec<u8>)`: Raw bytes expected to be valid UTF-8 (will be validated + normalized)
537/// - `Binary(Vec<u8>)`: Arbitrary binary data (passed through unchanged)
538///
539/// # Processing
540///
541/// | Variant | Validation | Normalization | Size Limits |
542/// |---------|-----------|---------------|-------------|
543/// | `Text` | None | Whitespace collapsed | Both limits |
544/// | `TextBytes` | UTF-8 | Whitespace collapsed | Both limits |
545/// | `Binary` | Non-empty | None | Raw limit only |
546///
547/// # Examples
548///
549/// ```rust
550/// use ingest::IngestPayload;
551///
552/// // Text payload
553/// let text = IngestPayload::Text("Hello world".to_string());
554///
555/// // Text from bytes (validates UTF-8)
556/// let text_bytes = IngestPayload::TextBytes(b"Hello world".to_vec());
557///
558/// // Binary payload (preserved as-is)
559/// let binary = IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]); // PNG magic
560/// ```
561#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
562#[non_exhaustive]
563pub enum IngestPayload {
564 /// UTF-8 text payload for normalization and canonicalization.
565 ///
566 /// This is the preferred variant for text content. The text will have
567 /// whitespace collapsed during ingest.
568 ///
569 /// # Example
570 ///
571 /// ```rust
572 /// use ingest::IngestPayload;
573 ///
574 /// let payload = IngestPayload::Text(
575 /// " Content with extra whitespace ".to_string()
576 /// );
577 /// // After ingest: "Content with extra whitespace"
578 /// ```
579 Text(String),
580
581 /// Raw UTF-8 bytes that will be decoded during ingest.
582 ///
583 /// Use this variant when you have bytes that should be valid UTF-8 but
584 /// need validation. Invalid UTF-8 will result in
585 /// [`IngestError::InvalidUtf8`](crate::IngestError::InvalidUtf8).
586 ///
587 /// # Example
588 ///
589 /// ```rust
590 /// use ingest::IngestPayload;
591 ///
592 /// let payload = IngestPayload::TextBytes(
593 /// b"Hello from bytes".to_vec()
594 /// );
595 /// ```
596 ///
597 /// # Error
598 ///
599 /// ```rust
600 /// use ingest::{IngestPayload, ingest, IngestError};
601 /// use ingest::{RawIngestRecord, IngestMetadata, IngestSource, IngestConfig};
602 ///
603 /// let record = RawIngestRecord {
604 /// id: "test".to_string(),
605 /// source: IngestSource::RawText,
606 /// metadata: IngestMetadata {
607 /// tenant_id: Some("t".to_string()),
608 /// doc_id: Some("d".to_string()),
609 /// received_at: None,
610 /// original_source: None,
611 /// attributes: None,
612 /// },
613 /// payload: Some(IngestPayload::TextBytes(vec![0xFF, 0xFE])), // Invalid UTF-8
614 /// };
615 ///
616 /// // This will fail with InvalidUtf8
617 /// // let result = ingest(record, &IngestConfig::default());
618 /// ```
619 TextBytes(Vec<u8>),
620
621 /// Arbitrary binary payload for downstream processing.
622 ///
623 /// Binary payloads are passed through unmodified (except for emptiness check).
624 /// They are suitable for images, PDFs, audio files, and other non-text content.
625 ///
626 /// # Example
627 ///
628 /// ```rust
629 /// use ingest::IngestPayload;
630 ///
631 /// // PNG file header
632 /// let payload = IngestPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]);
633 /// ```
634 ///
635 /// # Validation
636 ///
637 /// Empty binary payloads (zero bytes) are rejected with
638 /// [`IngestError::EmptyBinaryPayload`](crate::IngestError::EmptyBinaryPayload).
639 Binary(Vec<u8>),
640}
641
642/// Normalized record produced by ingest.
643///
644/// `CanonicalIngestRecord` is the output of the ingest pipeline. It represents
645/// a cleaned, validated, and deterministic version of the input that downstream
646/// stages can rely on.
647///
648/// # Guarantees
649///
650/// - All required fields are present (tenant_id, doc_id, received_at)
651/// - Metadata is sanitized (control characters stripped)
652/// - Payload is normalized (text whitespace collapsed, binary preserved)
653/// - Document ID is stable (derived deterministically if not provided)
654///
655/// # Examples
656///
657/// ```rust
658/// use ingest::{ingest, IngestConfig, RawIngestRecord, CanonicalPayload};
659/// use ingest::{IngestMetadata, IngestSource, IngestPayload};
660///
661/// let config = IngestConfig::default();
662/// let record = RawIngestRecord {
663/// id: "test-001".to_string(),
664/// source: IngestSource::RawText,
665/// metadata: IngestMetadata {
666/// tenant_id: Some("tenant".to_string()),
667/// doc_id: None, // Will be derived
668/// received_at: None, // Will default to now
669/// original_source: None,
670/// attributes: None,
671/// },
672/// payload: Some(IngestPayload::Text(" Hello world ".to_string())),
673/// };
674///
675/// let canonical = ingest(record, &config).unwrap();
676///
677/// // All fields are guaranteed present
678/// assert!(!canonical.tenant_id.is_empty());
679/// assert!(!canonical.doc_id.is_empty());
680///
681/// // Text is normalized
682/// match &canonical.normalized_payload {
683/// Some(CanonicalPayload::Text(text)) => {
684/// assert_eq!(text, "Hello world");
685/// }
686/// _ => panic!("Expected text payload"),
687/// }
688/// ```
689#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
690pub struct CanonicalIngestRecord {
691 /// Unique identifier for this ingest operation (mirrors [`RawIngestRecord::id`]).
692 ///
693 /// This is the sanitized version of the original ID (control characters stripped).
694 pub id: String,
695
696 /// Tenant identifier for multi-tenant isolation.
697 ///
698 /// This is the effective tenant ID after applying defaults:
699 /// - If provided and non-empty: the sanitized provided value
700 /// - Otherwise: `IngestConfig::default_tenant_id`
701 pub tenant_id: String,
702
703 /// Document identifier.
704 ///
705 /// This is the effective document ID after derivation:
706 /// - If provided and non-empty: the sanitized provided value
707 /// - Otherwise: UUIDv5 derived from tenant + record ID
708 pub doc_id: String,
709
710 /// Timestamp when the record was received.
711 ///
712 /// This is the effective timestamp after applying defaults:
713 /// - If provided: the sanitized provided value
714 /// - Otherwise: current UTC time at ingest
715 pub received_at: DateTime<Utc>,
716
717 /// Original source information if provided.
718 ///
719 /// Sanitized version of [`IngestMetadata::original_source`] with control
720 /// characters stripped. `None` if not provided.
721 pub original_source: Option<String>,
722
723 /// Source of the content (mirrors [`RawIngestRecord::source`]).
724 pub source: IngestSource,
725
726 /// Normalized payload ready for downstream stages.
727 ///
728 /// - For text: whitespace collapsed, size limits enforced
729 /// - For binary: preserved unchanged, non-empty check performed
730 /// - `None` if no payload was provided
731 pub normalized_payload: Option<CanonicalPayload>,
732
733 /// Attributes JSON preserved for downstream use.
734 ///
735 /// This is the sanitized and size-checked version of
736 /// [`IngestMetadata::attributes`]. `None` if not provided.
737 pub attributes: Option<serde_json::Value>,
738}
739
740impl CanonicalIngestRecord {
741 /// Returns true if this record has a text payload.
742 ///
743 /// # Example
744 ///
745 /// ```rust
746 /// use ingest::{CanonicalIngestRecord, CanonicalPayload};
747 ///
748 /// let record = CanonicalIngestRecord {
749 /// id: "test".to_string(),
750 /// tenant_id: "tenant".to_string(),
751 /// doc_id: "doc".to_string(),
752 /// received_at: chrono::Utc::now(),
753 /// original_source: None,
754 /// source: ingest::IngestSource::RawText,
755 /// normalized_payload: Some(CanonicalPayload::Text("hello".to_string())),
756 /// attributes: None,
757 /// };
758 ///
759 /// assert!(record.has_text_payload());
760 /// ```
761 pub fn has_text_payload(&self) -> bool {
762 matches!(self.normalized_payload, Some(CanonicalPayload::Text(_)))
763 }
764
765 /// Returns true if this record has a binary payload.
766 ///
767 /// # Example
768 ///
769 /// ```rust
770 /// use ingest::{CanonicalIngestRecord, CanonicalPayload};
771 ///
772 /// let record = CanonicalIngestRecord {
773 /// id: "test".to_string(),
774 /// tenant_id: "tenant".to_string(),
775 /// doc_id: "doc".to_string(),
776 /// received_at: chrono::Utc::now(),
777 /// original_source: None,
778 /// source: ingest::IngestSource::File {
779 /// filename: "test.bin".to_string(),
780 /// content_type: None,
781 /// },
782 /// normalized_payload: Some(CanonicalPayload::Binary(vec![1, 2, 3])),
783 /// attributes: None,
784 /// };
785 ///
786 /// assert!(record.has_binary_payload());
787 /// ```
788 pub fn has_binary_payload(&self) -> bool {
789 matches!(self.normalized_payload, Some(CanonicalPayload::Binary(_)))
790 }
791
792 /// Returns the text payload if present, otherwise None.
793 ///
794 /// # Example
795 ///
796 /// ```rust
797 /// use ingest::{CanonicalIngestRecord, CanonicalPayload};
798 ///
799 /// let record = CanonicalIngestRecord {
800 /// id: "test".to_string(),
801 /// tenant_id: "tenant".to_string(),
802 /// doc_id: "doc".to_string(),
803 /// received_at: chrono::Utc::now(),
804 /// original_source: None,
805 /// source: ingest::IngestSource::RawText,
806 /// normalized_payload: Some(CanonicalPayload::Text("hello world".to_string())),
807 /// attributes: None,
808 /// };
809 ///
810 /// assert_eq!(record.text_payload(), Some("hello world"));
811 /// ```
812 pub fn text_payload(&self) -> Option<&str> {
813 match &self.normalized_payload {
814 Some(CanonicalPayload::Text(text)) => Some(text),
815 _ => None,
816 }
817 }
818
819 /// Returns the binary payload if present, otherwise None.
820 pub fn binary_payload(&self) -> Option<&[u8]> {
821 match &self.normalized_payload {
822 Some(CanonicalPayload::Binary(bytes)) => Some(bytes),
823 _ => None,
824 }
825 }
826}
827
828/// Normalized payload ready for downstream stages.
829///
830/// `CanonicalPayload` represents the final, processed form of ingest payload.
831/// Text payloads have whitespace normalized, while binary payloads pass through
832/// unchanged.
833///
834/// # Variants
835///
836/// - `Text(String)`: Normalized UTF-8 text with collapsed whitespace
837/// - `Binary(Vec<u8>)`: Binary payload preserved exactly
838///
839/// # Examples
840///
841/// ```rust
842/// use ingest::CanonicalPayload;
843///
844/// // Normalized text
845/// let text = CanonicalPayload::Text("Hello world".to_string());
846///
847/// // Preserved binary
848/// let binary = CanonicalPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]);
849/// ```
850#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
851#[non_exhaustive]
852pub enum CanonicalPayload {
853 /// Normalized UTF-8 text payload.
854 ///
855 /// Text has been through whitespace normalization (multiple spaces/tabs/newlines
856 /// collapsed to single spaces, leading/trailing whitespace trimmed).
857 ///
858 /// # Example
859 ///
860 /// ```rust
861 /// use ingest::CanonicalPayload;
862 ///
863 /// // This represents text that was " Hello world " before normalization
864 /// let payload = CanonicalPayload::Text("Hello world".to_string());
865 /// ```
866 Text(String),
867
868 /// Binary payload preserved for downstream perceptual/semantic stages.
869 ///
870 /// Binary data (images, PDFs, audio, etc.) passes through ingest unchanged
871 /// except for the non-empty validation.
872 ///
873 /// # Example
874 ///
875 /// ```rust
876 /// use ingest::CanonicalPayload;
877 ///
878 /// let payload = CanonicalPayload::Binary(vec![0x89, 0x50, 0x4E, 0x47]);
879 /// ```
880 Binary(Vec<u8>),
881}
882
883impl CanonicalPayload {
884 /// Returns the length of the payload in bytes.
885 ///
886 /// # Example
887 ///
888 /// ```rust
889 /// use ingest::CanonicalPayload;
890 ///
891 /// let text = CanonicalPayload::Text("Hello".to_string());
892 /// assert_eq!(text.len(), 5);
893 ///
894 /// let binary = CanonicalPayload::Binary(vec![1, 2, 3, 4]);
895 /// assert_eq!(binary.len(), 4);
896 /// ```
897 pub fn len(&self) -> usize {
898 match self {
899 CanonicalPayload::Text(s) => s.len(),
900 CanonicalPayload::Binary(b) => b.len(),
901 }
902 }
903
904 /// Returns true if the payload is empty.
905 ///
906 /// Note: Empty payloads should never reach this stage (they are rejected
907 /// during ingest), but this method is provided for completeness.
908 pub fn is_empty(&self) -> bool {
909 self.len() == 0
910 }
911
912 /// Returns true if this is a text payload.
913 pub fn is_text(&self) -> bool {
914 matches!(self, CanonicalPayload::Text(_))
915 }
916
917 /// Returns true if this is a binary payload.
918 pub fn is_binary(&self) -> bool {
919 matches!(self, CanonicalPayload::Binary(_))
920 }
921}