Skip to main content

embeddenator_testkit/
fixtures.rs

1//! Test data fixtures and dataset generation
2//!
3//! Provides utilities for creating test datasets:
4//! - Various data patterns (zeros, sequential, random, text, etc.)
5//! - File generation with controlled sizes
6//! - Realistic test data scenarios
7
8use std::fs;
9use std::path::Path;
10
/// Byte patterns that the fixture generators in this module can produce.
///
/// Each variant names a deterministic fill rule, so data generated from a
/// pattern can be regenerated later and compared byte for byte.
///
/// `Hash` is derived alongside the comparison traits so a pattern can be used
/// directly as a `HashMap`/`HashSet` key (for example, to table-drive tests).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TestDataPattern {
    /// Every byte is `0x00`.
    Zeros,
    /// Every byte is `0xFF`.
    Ones,
    /// Bytes count upward and wrap after 255 (0, 1, 2, ..., 255, 0, 1, ...).
    Sequential,
    /// Pseudo-random-looking bytes computed deterministically from the offset.
    Random,
    /// A short English sentence repeated end to end, so it compresses well.
    Compressible,
    /// Repeating ASCII letters, digits, space and newline.
    Text,
}
27
28/// Create test data with specified pattern
29///
30/// # Arguments
31/// * `size_mb` - Size in megabytes
32/// * `pattern` - Data pattern to generate
33///
34/// # Returns
35/// Vector of bytes with the specified pattern
36pub fn create_test_data(size_mb: usize, pattern: TestDataPattern) -> Vec<u8> {
37    let size_bytes = size_mb * 1024 * 1024;
38
39    match pattern {
40        TestDataPattern::Zeros => vec![0u8; size_bytes],
41        TestDataPattern::Ones => vec![0xFF; size_bytes],
42        TestDataPattern::Sequential => (0..size_bytes).map(|i| (i % 256) as u8).collect(),
43        TestDataPattern::Random => {
44            // Simple deterministic "random" pattern using LCG
45            (0..size_bytes)
46                .map(|i| ((i.wrapping_mul(2654435761)) % 256) as u8)
47                .collect()
48        }
49        TestDataPattern::Compressible => {
50            // Repeating pattern that compresses well
51            let pattern = b"The quick brown fox jumps over the lazy dog. ";
52            (0..size_bytes)
53                .map(|i| pattern[i % pattern.len()])
54                .collect()
55        }
56        TestDataPattern::Text => {
57            // ASCII text pattern
58            let chars = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 \n";
59            (0..size_bytes).map(|i| chars[i % chars.len()]).collect()
60        }
61    }
62}
63
64/// Verify data matches expected pattern (with sampling for large data)
65///
66/// # Arguments
67/// * `data` - Data to verify
68/// * `expected_pattern` - Expected pattern
69/// * `sample_points` - Number of points to sample
70pub fn verify_data_sampled(data: &[u8], expected_pattern: TestDataPattern, sample_points: usize) {
71    let len = data.len();
72    let stride = len / sample_points;
73
74    for i in 0..sample_points {
75        let pos = i * stride;
76        if pos >= len {
77            break;
78        }
79        let expected = match expected_pattern {
80            TestDataPattern::Zeros => 0u8,
81            TestDataPattern::Ones => 0xFF,
82            TestDataPattern::Sequential => (pos % 256) as u8,
83            TestDataPattern::Random => ((pos.wrapping_mul(2654435761)) % 256) as u8,
84            TestDataPattern::Compressible => {
85                let pattern = b"The quick brown fox jumps over the lazy dog. ";
86                pattern[pos % pattern.len()]
87            }
88            TestDataPattern::Text => {
89                let chars = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 \n";
90                chars[pos % chars.len()]
91            }
92        };
93        assert_eq!(
94            data[pos], expected,
95            "Mismatch at position {} (sample {}): expected {}, got {}",
96            pos, i, expected, data[pos]
97        );
98    }
99}
100
101/// Create a test dataset directory with multiple files
102///
103/// # Arguments
104/// * `base_path` - Base directory for dataset
105/// * `size_mb` - Total size in megabytes
106/// * `pattern` - Data pattern to use
107///
108/// # Returns
109/// Number of files created
110pub fn create_test_dataset(base_path: &Path, size_mb: usize, pattern: TestDataPattern) -> usize {
111    fs::create_dir_all(base_path).expect("Failed to create dataset directory");
112
113    let target_bytes = size_mb * 1024 * 1024;
114    let mut written = 0;
115    let mut file_count = 0;
116
117    // Create files of varying sizes (1KB to 1MB)
118    while written < target_bytes {
119        let file_size = match file_count % 5 {
120            0 => 1024,        // 1KB
121            1 => 10 * 1024,   // 10KB
122            2 => 100 * 1024,  // 100KB
123            3 => 500 * 1024,  // 500KB
124            _ => 1024 * 1024, // 1MB
125        };
126
127        let actual_size = file_size.min(target_bytes - written);
128        let filename = format!("file_{:04}.bin", file_count);
129        let filepath = base_path.join(&filename);
130
131        let data = create_test_data_bytes(actual_size, pattern);
132        fs::write(&filepath, data).expect("Failed to write test file");
133
134        written += actual_size;
135        file_count += 1;
136    }
137
138    file_count
139}
140
141/// Create test data with exact byte count (helper)
142fn create_test_data_bytes(size_bytes: usize, pattern: TestDataPattern) -> Vec<u8> {
143    match pattern {
144        TestDataPattern::Zeros => vec![0u8; size_bytes],
145        TestDataPattern::Ones => vec![0xFF; size_bytes],
146        TestDataPattern::Sequential => (0..size_bytes).map(|i| (i % 256) as u8).collect(),
147        TestDataPattern::Random => (0..size_bytes)
148            .map(|i| ((i.wrapping_mul(2654435761)) % 256) as u8)
149            .collect(),
150        TestDataPattern::Compressible => {
151            let pattern = b"The quick brown fox jumps over the lazy dog. ";
152            (0..size_bytes)
153                .map(|i| pattern[i % pattern.len()])
154                .collect()
155        }
156        TestDataPattern::Text => {
157            let chars = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789 \n";
158            (0..size_bytes).map(|i| chars[i % chars.len()]).collect()
159        }
160    }
161}
162
163/// Write a file of specified size with pattern
164pub fn write_file_of_size(
165    path: &Path,
166    size_bytes: usize,
167    pattern: TestDataPattern,
168) -> std::io::Result<()> {
169    let data = create_test_data_bytes(size_bytes, pattern);
170    fs::write(path, data)
171}
172
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Bytes in one megabyte, as used by the size-in-MB functions under test.
    const MIB: usize = 1024 * 1024;

    /// Constant-byte patterns fill the whole requested megabyte.
    #[test]
    fn test_create_test_data() {
        let zeros = create_test_data(1, TestDataPattern::Zeros);
        assert_eq!(zeros.len(), MIB);
        assert!(!zeros.iter().any(|&byte| byte != 0));

        let ones = create_test_data(1, TestDataPattern::Ones);
        assert!(!ones.iter().any(|&byte| byte != 0xFF));
    }

    /// The sequential pattern counts 0..=255 and then starts over.
    #[test]
    fn test_sequential_pattern() {
        let data = create_test_data_bytes(512, TestDataPattern::Sequential);
        assert_eq!(data.len(), 512);

        let (first_cycle, second_cycle) = data.split_at(256);
        for (offset, &byte) in first_cycle.iter().enumerate() {
            assert_eq!(byte, offset as u8);
        }
        // Should wrap around
        for (offset, &byte) in second_cycle.iter().enumerate() {
            let index = offset + 256;
            assert_eq!(byte, (index % 256) as u8);
        }
    }

    /// The compressible pattern begins with one full copy of the sentence.
    #[test]
    fn test_compressible_pattern() {
        let sentence = b"The quick brown fox jumps over the lazy dog. ";
        let data = create_test_data_bytes(100, TestDataPattern::Compressible);

        // Check first occurrence
        assert!(data.starts_with(sentence));
    }

    /// Sampling intact data must complete without panicking.
    #[test]
    fn test_verify_data_sampled() {
        let intact = create_test_data_bytes(10000, TestDataPattern::Sequential);
        verify_data_sampled(&intact, TestDataPattern::Sequential, 100);
    }

    /// A corrupted byte that lies on a sampled offset must be reported.
    #[test]
    #[should_panic(expected = "Mismatch at position")]
    fn test_verify_data_sampled_mismatch() {
        let mut corrupted = create_test_data_bytes(1000, TestDataPattern::Sequential);
        corrupted[500] = 0xFF; // Corrupt data
        verify_data_sampled(&corrupted, TestDataPattern::Sequential, 100);
    }

    /// A dataset is created on disk and its total size is close to the request.
    #[test]
    fn test_create_test_dataset() {
        let temp_dir = TempDir::new().unwrap();
        let dataset_path = temp_dir.path().join("dataset");

        let file_count = create_test_dataset(&dataset_path, 5, TestDataPattern::Random);

        assert!(file_count > 0);
        assert!(dataset_path.exists());

        // Sum the on-disk size of every entry in the dataset directory.
        let total_size: u64 = fs::read_dir(&dataset_path)
            .unwrap()
            .map(|entry| entry.unwrap().metadata().unwrap().len())
            .sum();

        let expected_size: u64 = 5 * 1024 * 1024;
        let tolerance: u64 = 1024 * 1024; // Within 1MB
        assert!(total_size >= expected_size - tolerance);
        assert!(total_size <= expected_size + tolerance);
    }

    /// The written file exists and has exactly the requested length.
    #[test]
    fn test_write_file_of_size() {
        let temp_dir = TempDir::new().unwrap();
        let filepath = temp_dir.path().join("test.bin");

        write_file_of_size(&filepath, 4096, TestDataPattern::Random).unwrap();

        assert!(filepath.exists());
        let on_disk_len = fs::metadata(&filepath).unwrap().len();
        assert_eq!(on_disk_len, 4096);
    }
}
259}