// atelier_data 0.0.15
//
// Data Artifacts and I/O for the atelier-rs engine
//! Data validation utilities for timestamp sequences.
//!
//! Provides checks for monotonicity, deduplication, and gap detection
//! on sorted nanosecond timestamp vectors before interarrival computation.

use crate::errors::TemporalError;
use serde::{Deserialize, Serialize};

/// Description of one oversized interval between two adjacent timestamps.
///
/// Values of this type are produced by `detect_gaps`, which emits one entry
/// for every adjacent pair whose difference exceeds the caller's threshold.
/// The type derives `Serialize`/`Deserialize`, so it can be written to and
/// read back from any serde-supported format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GapInfo {
    /// Position of the earlier element of the pair in the timestamp slice;
    /// the gap lies between `[index]` and `[index + 1]`.
    pub index: usize,
    /// Timestamp at which the gap begins, i.e. the value at `[index]` (nanoseconds).
    pub start_ns: u64,
    /// Timestamp at which the gap ends, i.e. the value at `[index + 1]` (nanoseconds).
    pub end_ns: u64,
    /// Length of the gap in nanoseconds; `detect_gaps` fills this with
    /// `end_ns - start_ns` (saturating at zero).
    pub gap_ns: u64,
}

/// Check that every timestamp is strictly greater than its predecessor.
///
/// Slices with fewer than two elements contain no adjacent pair and always
/// pass. Equal neighbours are treated as a violation, because the ordering
/// required here is strict.
///
/// # Errors
///
/// Returns `TemporalError::NonMonotonic` describing the first adjacent pair
/// that is not strictly increasing: `index` is the position of the later
/// element of the pair, `prev` and `curr` are the two offending values.
///
/// # Examples
///
/// ```
/// use atelier_data::temporal::validations::validate_monotonic;
///
/// let good = vec![1_u64, 2, 3, 4, 5];
/// assert!(validate_monotonic(&good).is_ok());
///
/// let bad = vec![1_u64, 3, 2, 4];
/// assert!(validate_monotonic(&bad).is_err());
/// ```
pub fn validate_monotonic(timestamps: &[u64]) -> Result<(), TemporalError> {
    // Scan adjacent pairs and stop at the first one that fails to increase.
    let first_violation = timestamps
        .windows(2)
        .position(|pair| pair[1] <= pair[0]);

    match first_violation {
        // `pos` is the index of the earlier element; the error reports the later one.
        Some(pos) => Err(TemporalError::NonMonotonic {
            index: pos + 1,
            prev: timestamps[pos],
            curr: timestamps[pos + 1],
        }),
        None => Ok(()),
    }
}

/// Collapse runs of identical adjacent timestamps down to a single entry, in place.
///
/// Only neighbouring equal values are merged (the same semantics as
/// `Vec::dedup()`), so the vector must already be sorted if every repeated
/// value is to be eliminated. The first element of each run is the one kept.
///
/// # Returns
///
/// How many elements were dropped from the vector.
///
/// # Examples
///
/// ```
/// use atelier_data::temporal::validations::deduplicate;
///
/// let mut ts = vec![1_u64, 1, 2, 3, 3, 3, 4];
/// let removed = deduplicate(&mut ts);
/// assert_eq!(removed, 3);
/// assert_eq!(ts, vec![1, 2, 3, 4]);
/// ```
pub fn deduplicate(timestamps: &mut Vec<u64>) -> usize {
    let len_before = timestamps.len();
    // Drop `candidate` whenever it equals the element retained just before it.
    timestamps.dedup_by(|candidate, kept| *candidate == *kept);
    len_before - timestamps.len()
}

/// Report every adjacent pair of timestamps separated by more than a threshold.
///
/// A "gap" is a pair for which `t[i+1] - t[i] > threshold_ns`. Such gaps
/// typically indicate disconnections in the data feed (e.g. exchange
/// downtime, network drops) that would corrupt interarrival statistics.
///
/// The difference is computed with saturating subtraction, so a pair that is
/// out of order yields a difference of zero and is never reported.
///
/// # Arguments
///
/// * `timestamps` - Sorted nanosecond timestamps.
/// * `threshold_ns` - Gap duration that must be strictly exceeded for a pair
///   to be reported; a difference exactly equal to the threshold is not a gap.
///
/// # Returns
///
/// One `GapInfo` per qualifying pair, in order of appearance. `index` refers
/// to the earlier element of the pair. The result is empty when the slice
/// has fewer than two elements.
///
/// # Examples
///
/// ```
/// use atelier_data::temporal::validations::detect_gaps;
///
/// let ts = vec![0_u64, 100, 200, 10_000, 10_100];
/// let gaps = detect_gaps(&ts, 1_000);
/// assert_eq!(gaps.len(), 1);
/// assert_eq!(gaps[0].index, 2);
/// assert_eq!(gaps[0].gap_ns, 9_800);
/// ```
pub fn detect_gaps(timestamps: &[u64], threshold_ns: u64) -> Vec<GapInfo> {
    timestamps
        .windows(2)
        .enumerate()
        .filter_map(|(index, pair)| {
            let (start_ns, end_ns) = (pair[0], pair[1]);
            // Saturate so an out-of-order pair counts as a zero-length interval.
            let gap_ns = end_ns.saturating_sub(start_ns);
            if gap_ns > threshold_ns {
                Some(GapInfo {
                    index,
                    start_ns,
                    end_ns,
                    gap_ns,
                })
            } else {
                None
            }
        })
        .collect()
}