Skip to main content

ingest/
payload.rs

//! Payload validation and normalization utilities.
//!
//! This module contains helpers for enforcing payload presence/shape policies
//! and transforming raw payloads into [`CanonicalPayload`] values suitable for
//! downstream processing.
//!
//! # Responsibilities
//!
//! - **Requirement Validation**: Check if source type mandates a payload
//! - **Type Validation**: Ensure text sources get text payloads
//! - **Content Validation**: UTF-8 validation, emptiness checks
//! - **Normalization**: Whitespace collapsing for text
//! - **Size Enforcement**: Apply payload size limits
//!
//! # Payload Flow
//!
//! ```text
//! IngestPayload (raw)
//!        │
//!        ▼
//! ┌─────────────────────────────┐
//! │ 1. Check requirements       │
//! │    - Source mandates?       │
//! │    - Text source = text?    │
//! ├─────────────────────────────┤
//! │ 2. Decode/validate          │
//! │    - UTF-8 validation       │
//! │    - Binary checks          │
//! ├─────────────────────────────┤
//! │ 3. Normalize                │
//! │    - Collapse whitespace    │
//! │    - Size limits            │
//! └─────────────────────────────┘
//!        │
//!        ▼
//! CanonicalPayload (normalized)
//! ```
//!
//! # Examples
//!
//! ```rust
//! use ingest::{
//!     validate_payload_requirements, normalize_payload_option,
//!     IngestPayload, IngestSource, IngestConfig
//! };
//!
//! let source = IngestSource::RawText;
//! let payload = Some(IngestPayload::Text("Hello world".to_string()));
//!
//! // Validate requirements
//! validate_payload_requirements(&source, &payload).unwrap();
//!
//! // Normalize
//! let config = IngestConfig::default();
//! let canonical = normalize_payload_option(&source, payload, &config).unwrap();
//! ```
57use crate::config::IngestConfig;
58use crate::error::IngestError;
59use crate::types::{CanonicalPayload, IngestPayload, IngestSource};
60
61/// Checks if the source requires a payload.
62///
63/// This function validates that sources which mandate payloads (like `RawText`
64/// and `File`) actually have one provided. This is an early validation step
65/// before any processing occurs.
66///
67/// # Source Requirements
68///
69/// | Source | Payload Required |
70/// |--------|-----------------|
71/// | `RawText` | Yes |
72/// | `Url` | No (but typically has one) |
73/// | `File` | Yes |
74/// | `Api` | No |
75///
76/// # Arguments
77///
78/// * `source` - The ingest source type
79/// * `payload` - The optional payload
80///
81/// # Returns
82///
83/// - `Ok(())` - Payload requirements satisfied
84/// - `Err(IngestError::MissingPayload)` - Required payload is missing
85///
86/// # Examples
87///
88/// ```rust
89/// use ingest::{validate_payload_requirements, IngestPayload, IngestSource};
90///
91/// // RawText requires payload
92/// let source = IngestSource::RawText;
93/// let result = validate_payload_requirements(&source, &Some(IngestPayload::Text("test".to_string())));
94/// assert!(result.is_ok());
95///
96/// // Missing required payload
97/// let result = validate_payload_requirements(&source, &None);
98/// assert!(result.is_err());
99///
100/// // Api doesn't require payload
101/// let source = IngestSource::Api;
102/// let result = validate_payload_requirements(&source, &None);
103/// assert!(result.is_ok());
104/// ```
105pub fn validate_payload_requirements(
106    source: &IngestSource,
107    payload: &Option<IngestPayload>,
108) -> Result<(), IngestError> {
109    let has_payload = payload.is_some();
110    if source_requires_payload(source) && !has_payload {
111        return Err(IngestError::MissingPayload);
112    }
113    Ok(())
114}
115
116/// Determines if a source type requires a payload.
117///
118/// This internal function defines which source types mandate payload presence.
119///
120/// # Arguments
121///
122/// * `source` - The ingest source type
123///
124/// # Returns
125///
126/// `true` if the source requires a payload, `false` otherwise
127fn source_requires_payload(source: &IngestSource) -> bool {
128    matches!(source, IngestSource::RawText | IngestSource::File { .. })
129}
130
131/// Determines if a source type requires a text payload.
132///
133/// Text-based sources (like `RawText` and `Url`) should receive text payloads
134/// rather than binary data.
135///
136/// # Arguments
137///
138/// * `source` - The ingest source type
139///
140/// # Returns
141///
142/// `true` if the source requires text content, `false` otherwise
143fn source_requires_text_payload(source: &IngestSource) -> bool {
144    matches!(source, IngestSource::RawText | IngestSource::Url(_))
145}
146
147/// Normalizes the payload based on its type.
148///
149/// This is the main entry point for payload processing. It:
150/// 1. Handles `None` payloads (returns `None`)
151/// 2. Normalizes the payload value
152/// 3. Validates text source requirements
153/// 4. Returns the canonical payload
154///
155/// # Arguments
156///
157/// * `source` - The ingest source (for type validation)
158/// * `payload` - The optional raw payload
159/// * `cfg` - Configuration for normalization
160///
161/// # Returns
162///
163/// - `Ok(Some(CanonicalPayload))` - Successfully normalized payload
164/// - `Ok(None)` - No payload provided
165/// - `Err(IngestError)` - Validation or normalization failure
166///
167/// # Errors
168///
169/// - [`IngestError::InvalidMetadata`] - Text source received binary payload
170/// - All errors from `normalize_payload_value`
171///
172/// # Examples
173///
174/// ```rust,ignore
175/// use ingest::{normalize_payload_option, IngestPayload, IngestSource, IngestConfig};
176///
177/// let config = IngestConfig::default();
178///
179/// // Text normalization
180/// let result = normalize_payload_option(
181///     &IngestSource::RawText,
182///     Some(IngestPayload::Text("  Hello   world  ".to_string())),
183///     &config
184/// ).unwrap();
185///
186/// // Binary preservation
187/// let result = normalize_payload_option(
188///     &IngestSource::File { filename: "test.bin".to_string(), content_type: None },
189///     Some(IngestPayload::Binary(vec![1, 2, 3])),
190///     &config
191/// ).unwrap();
192/// ```
193pub fn normalize_payload_option(
194    source: &IngestSource,
195    payload: Option<IngestPayload>,
196    cfg: &IngestConfig,
197) -> Result<Option<CanonicalPayload>, IngestError> {
198    let payload = match payload {
199        Some(value) => value,
200        None => return Ok(None),
201    };
202
203    let canonical = normalize_payload_value(payload, cfg)?;
204    // Some sources only make sense with a text payload.
205    if source_requires_text_payload(source) && !matches!(canonical, CanonicalPayload::Text(_)) {
206        return Err(IngestError::InvalidMetadata(
207            "text-based source requires text payload".into(),
208        ));
209    }
210    Ok(Some(canonical))
211}
212
213/// Normalizes the payload value itself.
214///
215/// This function processes the actual payload content based on its type:
216/// - `Text`: Validates and normalizes whitespace
217/// - `TextBytes`: Validates UTF-8, then treats as Text
218/// - `Binary`: Validates non-empty, passes through unchanged
219///
220/// # Arguments
221///
222/// * `payload` - The raw payload value
223/// * `cfg` - Configuration for normalization
224///
225/// # Returns
226///
227/// - `Ok(CanonicalPayload)` - Successfully normalized payload
228/// - `Err(IngestError)` - Validation or normalization failure
229///
230/// # Errors
231///
232/// - [`IngestError::InvalidUtf8`] - TextBytes contains invalid UTF-8
233/// - [`IngestError::EmptyBinaryPayload`] - Binary payload is empty
234/// - [`IngestError::InvalidMetadata`] - Binary contains suspicious patterns
235/// - [`IngestError::PayloadTooLarge`] - Size limit exceeded
236/// - [`IngestError::EmptyNormalizedText`] - Text empty after normalization
237fn normalize_payload_value(
238    payload: IngestPayload,
239    cfg: &IngestConfig,
240) -> Result<CanonicalPayload, IngestError> {
241    match payload {
242        IngestPayload::Text(text) => normalize_text_payload(text, cfg),
243        IngestPayload::TextBytes(bytes) => {
244            let text = String::from_utf8(bytes)
245                .map_err(|err| IngestError::InvalidUtf8(err.to_string()))?;
246            normalize_text_payload(text, cfg)
247        }
248        IngestPayload::Binary(bytes) => {
249            if bytes.is_empty() {
250                Err(IngestError::EmptyBinaryPayload)
251            } else {
252                // Scan for suspicious patterns in binary data
253                if bytes.len() > 1024 {
254                    let suspicious_patterns = [b'\x00', b'\xFF', b'\xFE'];
255                    let pattern_count = bytes
256                        .iter()
257                        .filter(|&&b| suspicious_patterns.contains(&b))
258                        .count();
259                    if pattern_count > bytes.len() / 4 {
260                        return Err(IngestError::InvalidMetadata(
261                            "binary payload contains suspicious patterns".into(),
262                        ));
263                    }
264                }
265
266                Ok(CanonicalPayload::Binary(bytes))
267            }
268        }
269    }
270}
271
272/// Validates text content for potential issues before normalization.
273///
274/// This function performs sanity checks on text content:
275/// - Null byte detection
276/// - Excessive control characters
277/// - Empty content check
278///
279/// # Arguments
280///
281/// * `text` - The text to validate
282/// * `cfg` - Configuration (controls control character checking)
283///
284/// # Returns
285///
286/// - `Ok(())` - Text is valid
287/// - `Err(IngestError)` - Validation failure
288///
289/// # Errors
290///
291/// - [`IngestError::InvalidMetadata`] - Null bytes or too many control characters
292/// - [`IngestError::EmptyNormalizedText`] - Text is empty/whitespace only
293fn validate_text_content(text: &str, cfg: &IngestConfig) -> Result<(), IngestError> {
294    // Check for null bytes
295    if text.contains('\0') {
296        return Err(IngestError::InvalidMetadata(
297            "text contains null bytes".into(),
298        ));
299    }
300
301    // Check for excessive control characters
302    let control_count = text
303        .chars()
304        .filter(|c| c.is_control() && *c != '\t' && *c != '\n' && *c != '\r')
305        .count();
306    if cfg.strip_control_chars && control_count > text.len() / 10 {
307        return Err(IngestError::InvalidMetadata(
308            "text contains too many control characters".into(),
309        ));
310    }
311
312    // Check minimum content length
313    if text.trim().is_empty() {
314        return Err(IngestError::EmptyNormalizedText);
315    }
316
317    Ok(())
318}
319
320/// Normalizes a text payload by collapsing whitespace.
321///
322/// This function performs the full text normalization pipeline:
323/// 1. Validates content (null bytes, control chars, emptiness)
324/// 2. Collapses whitespace using [`normalize_payload`](crate::normalize_payload)
325/// 3. Enforces size limits
326/// 4. Checks for empty result
327///
328/// # Arguments
329///
330/// * `text` - The raw text to normalize
331/// * `cfg` - Configuration for normalization and size limits
332///
333/// # Returns
334///
335/// - `Ok(CanonicalPayload::Text)` - Successfully normalized text
336/// - `Err(IngestError)` - Validation or normalization failure
337///
338/// # Errors
339///
340/// - All errors from [`validate_text_content`]
341/// - [`IngestError::PayloadTooLarge`] - Normalized text exceeds limit
342/// - [`IngestError::EmptyNormalizedText`] - Result is empty
343fn normalize_text_payload(
344    text: String,
345    cfg: &IngestConfig,
346) -> Result<CanonicalPayload, IngestError> {
347    // Validate content first
348    validate_text_content(&text, cfg)?;
349
350    let normalized = crate::normalize_payload(&text);
351    if let Some(limit) = cfg.max_normalized_bytes {
352        if normalized.len() > limit {
353            return Err(IngestError::PayloadTooLarge(format!(
354                "normalized payload size {} exceeds limit of {}",
355                normalized.len(),
356                limit
357            )));
358        }
359    }
360
361    if normalized.is_empty() {
362        Err(IngestError::EmptyNormalizedText)
363    } else {
364        Ok(CanonicalPayload::Text(normalized))
365    }
366}
367
368/// Returns a string representation of the payload kind for logging.
369///
370/// This is a utility function for structured logging to categorize payloads
371/// without exposing actual content.
372///
373/// # Arguments
374///
375/// * `payload` - Optional reference to canonical payload
376///
377/// # Returns
378///
379/// String describing the payload type: `"text"`, `"binary"`, or `"none"`
380///
381/// # Examples
382///
383/// ```rust
384/// use ingest::{payload_kind, CanonicalPayload};
385///
386/// let text = Some(CanonicalPayload::Text("hello".to_string()));
387/// assert_eq!(payload_kind(text.as_ref()), "text");
388///
389/// let binary = Some(CanonicalPayload::Binary(vec![1, 2, 3]));
390/// assert_eq!(payload_kind(binary.as_ref()), "binary");
391///
392/// assert_eq!(payload_kind(None), "none");
393/// ```
394pub fn payload_kind(payload: Option<&CanonicalPayload>) -> &'static str {
395    match payload {
396        Some(CanonicalPayload::Text(_)) => "text",
397        Some(CanonicalPayload::Binary(_)) => "binary",
398        None => "none",
399    }
400}
401
402/// Returns the length of the payload for logging.
403///
404/// This is a utility function for structured logging to record payload sizes
405/// without exposing actual content.
406///
407/// # Arguments
408///
409/// * `payload` - Optional reference to canonical payload
410///
411/// # Returns
412///
413/// Size in bytes, or 0 if no payload
414///
415/// # Examples
416///
417/// ```rust
418/// use ingest::{payload_length, CanonicalPayload};
419///
420/// let text = Some(CanonicalPayload::Text("hello".to_string()));
421/// assert_eq!(payload_length(text.as_ref()), 5);
422///
423/// let binary = Some(CanonicalPayload::Binary(vec![1, 2, 3, 4]));
424/// assert_eq!(payload_length(binary.as_ref()), 4);
425///
426/// assert_eq!(payload_length(None), 0);
427/// ```
428pub fn payload_length(payload: Option<&CanonicalPayload>) -> usize {
429    match payload {
430        Some(CanonicalPayload::Text(text)) => text.len(),
431        Some(CanonicalPayload::Binary(bytes)) => bytes.len(),
432        None => 0,
433    }
434}