// timeseries_table_core/coverage/layout.rs

//! Coverage on-disk layout helpers.
//!
//! These helpers define:
//! - how coverage ids are validated
//! - how coverage sidecar paths are constructed (relative to the table root)
//! - how coverage ids are deterministically derived for per-segment and table
//!   snapshot coverage
//!
//! Note: these functions return *relative* paths (under a table root). Callers
//! should join them with the table root / storage backend before doing IO.
10
11use std::path::PathBuf;
12
13use snafu::Snafu;
14
15use crate::metadata::table_metadata::TimeBucket;
16
/// Top-level directory, relative to the table root, that holds all coverage sidecars.
pub const COVERAGE_ROOT_DIR: &str = "_coverage";
/// Relative directory that holds per-segment coverage sidecars.
pub const SEGMENT_COVERAGE_DIR: &str = "_coverage/segments";
/// Relative directory that holds table snapshot coverage sidecars.
pub const TABLE_SNAPSHOT_DIR: &str = "_coverage/table";
/// Extension of coverage sidecar files, without the leading dot.
pub const COVERAGE_EXT: &str = "roar";
25
26/// Errors that can occur during coverage layout operations.
27#[derive(Debug, Snafu)]
28pub enum CoverageLayoutError {
29    /// Returned when an invalid coverage ID is provided.
30    #[snafu(display("Invalid coverage id: {coverage_id}"))]
31    InvalidCoverageId {
32        /// The invalid coverage ID.
33        coverage_id: String,
34    },
35}
36
37/// Validates that a coverage ID meets security and format requirements.
38///
39/// A valid coverage ID must:
40/// - Not be empty and not exceed 128 characters
41/// - Not contain path separators (`/`, `\\`) or `..` sequences
42/// - Only contain ASCII alphanumeric characters, dots, underscores, and hyphens
43pub fn validate_coverage_id(coverage_id: &str) -> Result<(), CoverageLayoutError> {
44    if coverage_id.is_empty() || coverage_id.len() > 128 {
45        return Err(CoverageLayoutError::InvalidCoverageId {
46            coverage_id: coverage_id.to_string(),
47        });
48    }
49
50    // Require at least one alphanumeric
51    if !coverage_id.chars().any(|c| c.is_ascii_alphanumeric()) {
52        return Err(CoverageLayoutError::InvalidCoverageId {
53            coverage_id: coverage_id.to_string(),
54        });
55    }
56
57    // Reject leading dot
58    if coverage_id.starts_with('.') {
59        return Err(CoverageLayoutError::InvalidCoverageId {
60            coverage_id: coverage_id.to_string(),
61        });
62    }
63
64    // Reject any path separator and any ".." component-ish content.
65    if coverage_id.contains('/') || coverage_id.contains('\\') || coverage_id.contains("..") {
66        return Err(CoverageLayoutError::InvalidCoverageId {
67            coverage_id: coverage_id.to_string(),
68        });
69    }
70
71    // Restrict to a conservative ASCII allowlist.
72    let ok = coverage_id
73        .chars()
74        .all(|c| c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '-'));
75
76    if !ok {
77        return Err(CoverageLayoutError::InvalidCoverageId {
78            coverage_id: coverage_id.to_string(),
79        });
80    }
81
82    Ok(())
83}
84
85/// Relative path: `_coverage/segments/<coverage_id>.roar`
86pub fn segment_coverage_path(coverage_id: &str) -> Result<PathBuf, CoverageLayoutError> {
87    validate_coverage_id(coverage_id)?;
88    let mut p = PathBuf::from(COVERAGE_ROOT_DIR);
89    p.push("segments");
90    p.push(format!("{coverage_id}.{COVERAGE_EXT}"));
91    Ok(p)
92}
93
94/// Relative path: `_coverage/table/<version>-<snapshot_id>.roar`
95pub fn table_snapshot_path(
96    version: u64,
97    snapshot_id: &str,
98) -> Result<PathBuf, CoverageLayoutError> {
99    validate_coverage_id(snapshot_id)?;
100    let mut p = PathBuf::from(COVERAGE_ROOT_DIR);
101    p.push("table");
102    p.push(format!("{version}-{snapshot_id}.{COVERAGE_EXT}"));
103    Ok(p)
104}
105
106fn coverage_id_v1(
107    domain_prefix: &[u8],
108    output_prefix: &str,
109    bucket_spec: &TimeBucket,
110    time_column: &str,
111    coverage_bytes: &[u8],
112) -> String {
113    let mut h = blake3::Hasher::new();
114
115    // domain separation
116    h.update(domain_prefix);
117    h.update(b"\0");
118
119    // stable encoding for TimeBucket (avoid Display/to_string)
120    match bucket_spec {
121        TimeBucket::Seconds(n) => {
122            h.update(b"S");
123            h.update(&n.to_le_bytes());
124        }
125        TimeBucket::Minutes(n) => {
126            h.update(b"M");
127            h.update(&n.to_le_bytes());
128        }
129        TimeBucket::Hours(n) => {
130            h.update(b"H");
131            h.update(&n.to_le_bytes());
132        }
133        TimeBucket::Days(n) => {
134            h.update(b"D");
135            h.update(&n.to_le_bytes());
136        }
137    }
138
139    h.update(b"\0");
140    h.update(time_column.as_bytes());
141    h.update(b"\0");
142    h.update(coverage_bytes);
143
144    let hex = h.finalize().to_hex();
145    format!("{output_prefix}-{}", &hex[..32])
146}
147
148/// Deterministically derive a safe coverage id for a segment coverage sidecar.
149pub fn segment_coverage_id_v1(
150    bucket_spec: &TimeBucket,
151    time_column: &str,
152    coverage_bytes: &[u8],
153) -> String {
154    coverage_id_v1(
155        b"segcov-v1",
156        "segcov",
157        bucket_spec,
158        time_column,
159        coverage_bytes,
160    )
161}
162
163/// Deterministically derive a safe coverage id for a table snapshot sidecar.
164pub fn table_coverage_id_v1(
165    bucket_spec: &TimeBucket,
166    time_column: &str,
167    coverage_bytes: &[u8],
168) -> String {
169    coverage_id_v1(
170        b"tblcov-v1",
171        "tblcov",
172        bucket_spec,
173        time_column,
174        coverage_bytes,
175    )
176}
177
/// Unit tests for coverage id validation, sidecar path construction, and
/// deterministic id derivation.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn validate_coverage_id_accepts_valid_ids() {
        // 128 bytes is the inclusive upper bound.
        let long = "a".repeat(128);
        let valid_ids = ["abc", "A_B-1.2", long.as_str()];

        for id in valid_ids {
            validate_coverage_id(id).expect("valid id should pass");
        }
    }

    #[test]
    fn validate_coverage_id_rejects_empty_or_too_long() {
        let too_long = "x".repeat(129);
        assert!(validate_coverage_id("").is_err());
        assert!(validate_coverage_id(&too_long).is_err());
    }

    #[test]
    fn validate_coverage_id_rejects_path_components() {
        for id in ["a/b", "a\\b", "a..b", "..", "../etc"] {
            assert!(validate_coverage_id(id).is_err(), "id `{id}` should fail");
        }
    }

    #[test]
    fn validate_coverage_id_rejects_disallowed_chars() {
        for id in ["space id", "id*", "id@", "id$", "id:"] {
            assert!(validate_coverage_id(id).is_err(), "id `{id}` should fail");
        }
    }

    #[test]
    fn validate_coverage_id_rejects_non_ascii() {
        for id in ["caf\u{e9}", "id\u{2603}"] {
            assert!(validate_coverage_id(id).is_err(), "id `{id}` should fail");
        }
    }

    #[test]
    fn validate_coverage_id_rejects_leading_dot() {
        for id in [".hidden", ".a", "."] {
            assert!(validate_coverage_id(id).is_err(), "id `{id}` should fail");
        }
    }

    #[test]
    fn validate_coverage_id_requires_an_alphanumeric() {
        // Every character here is on the allowlist, but none is alphanumeric.
        for id in ["_", "-", "_-_", "---"] {
            assert!(validate_coverage_id(id).is_err(), "id `{id}` should fail");
        }
    }

    #[test]
    fn segment_coverage_path_formats_and_validates() {
        let id = "seg-001";
        let path = segment_coverage_path(id).expect("valid id");
        assert_eq!(path, PathBuf::from("_coverage/segments/seg-001.roar"));

        // Ensure validation runs
        assert!(segment_coverage_path("bad/id").is_err());
    }

    #[test]
    fn table_snapshot_path_formats() {
        let path = table_snapshot_path(42, "snap-001").expect("valid snapshot id");
        assert_eq!(path, PathBuf::from("_coverage/table/42-snap-001.roar"));
    }

    #[test]
    fn table_snapshot_path_validates_snapshot_id() {
        assert!(table_snapshot_path(1, "bad/id").is_err());
        assert!(table_snapshot_path(1, "").is_err());
        assert!(table_snapshot_path(1, "../escape").is_err());
    }

    #[test]
    fn segment_coverage_id_is_deterministic_and_valid() {
        let bucket = TimeBucket::Minutes(1);
        let time_col = "ts";
        let bytes = b"bitmap-bytes";

        let id1 = segment_coverage_id_v1(&bucket, time_col, bytes);
        let id2 = segment_coverage_id_v1(&bucket, time_col, bytes);

        assert_eq!(id1, id2, "same inputs must produce stable id");
        assert!(id1.starts_with("segcov-"));
        assert_eq!(id1.len(), "segcov-".len() + 32, "prefix + 32 hex chars");
        validate_coverage_id(&id1).expect("derived id should be valid");
    }

    #[test]
    fn segment_coverage_id_changes_with_inputs() {
        let bytes = b"bytes";

        let base = segment_coverage_id_v1(&TimeBucket::Seconds(5), "ts", bytes);
        let different_bucket = segment_coverage_id_v1(&TimeBucket::Hours(5), "ts", bytes);
        let different_column = segment_coverage_id_v1(&TimeBucket::Seconds(5), "event_time", bytes);
        let different_bytes = segment_coverage_id_v1(&TimeBucket::Seconds(5), "ts", b"other");

        assert_ne!(base, different_bucket, "bucket spec should affect id");
        assert_ne!(base, different_column, "time column should affect id");
        assert_ne!(base, different_bytes, "coverage bytes should affect id");
    }

    #[test]
    fn table_coverage_id_is_deterministic_and_valid() {
        let bucket = TimeBucket::Hours(1);
        let time_col = "ts";
        let bytes = b"table-bitmap";

        let id1 = table_coverage_id_v1(&bucket, time_col, bytes);
        let id2 = table_coverage_id_v1(&bucket, time_col, bytes);

        assert_eq!(id1, id2, "same inputs must produce stable id");
        assert!(id1.starts_with("tblcov-"));
        assert_eq!(id1.len(), "tblcov-".len() + 32, "prefix + 32 hex chars");
        validate_coverage_id(&id1).expect("derived id should be valid");
    }

    #[test]
    fn table_coverage_id_changes_with_inputs() {
        let bytes = b"bytes";

        let base = table_coverage_id_v1(&TimeBucket::Minutes(15), "ts", bytes);
        let different_bucket = table_coverage_id_v1(&TimeBucket::Days(1), "ts", bytes);
        let different_column = table_coverage_id_v1(&TimeBucket::Minutes(15), "event_time", bytes);
        let different_bytes = table_coverage_id_v1(&TimeBucket::Minutes(15), "ts", b"other");

        assert_ne!(base, different_bucket, "bucket spec should affect id");
        assert_ne!(base, different_column, "time column should affect id");
        assert_ne!(base, different_bytes, "coverage bytes should affect id");
    }

    #[test]
    fn segment_and_table_ids_are_domain_separated() {
        let bucket = TimeBucket::Minutes(1);
        let seg = segment_coverage_id_v1(&bucket, "ts", b"same-bytes");
        let tbl = table_coverage_id_v1(&bucket, "ts", b"same-bytes");

        let seg_digest = seg.strip_prefix("segcov-").expect("segment prefix");
        let tbl_digest = tbl.strip_prefix("tblcov-").expect("table prefix");

        // The hashed domain tag differs, so the digests differ, not only the prefixes.
        assert_ne!(seg_digest, tbl_digest, "domain tag should affect digest");
    }
}