Skip to main content

ustar_test_utils/
test_data_download_utils.rs

1//! Test data download utilities for ustar parser.
2//!
//! This module exists because crates.io size limits prevent the complete
//! project, including its test data, from being published there. If test
//! data files are missing when `cargo test` runs, they are downloaded
//! automatically from the GitHub repository.
7//!
8//! This module provides functionality to:
9//! - Automatically discover test directories with checksums.sha1 files
10//! - Verify test data integrity using SHA-1 checksums  
11//! - Download missing test data files from GitHub when needed
12//! - Ensure test data is available before running tests
13//! - Support for disabling downloads via --features no-large-tests
14
15use sha1::{Digest, Sha1};
16use std::fs;
17use std::io::Cursor;
18use std::path::Path;
19use std::sync::OnceLock;
20
/// Errors produced while locating, downloading, or verifying test data.
#[derive(Debug)]
pub enum TestDataError {
    /// An underlying I/O operation failed. Also produced by `?` on a
    /// `std::io::Error` through the `From` impl below.
    Io(std::io::Error),
    /// A file's computed SHA-1 digest differs from the recorded digest.
    ChecksumMismatch {
        /// File name as listed in the checksum file.
        file: String,
        /// Digest recorded in the checksum file.
        expected: String,
        /// Digest computed from the file on disk.
        actual: String,
    },
    /// The checksum file is missing or contains a line that cannot be parsed.
    InvalidChecksumFile(String),
    /// A test data directory was not found. The payload is a free-form
    /// description (a path or a longer message).
    DirectoryNotFound(String),
}

impl std::fmt::Display for TestDataError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Io(e) => write!(f, "IO error: {}", e),
            Self::ChecksumMismatch {
                file,
                expected,
                actual,
            } => write!(
                f,
                "Checksum mismatch for {}: expected {}, got {}",
                file, expected, actual
            ),
            Self::InvalidChecksumFile(path) => {
                write!(f, "Invalid or missing checksum file: {}", path)
            }
            Self::DirectoryNotFound(path) => {
                write!(f, "Test data directory not found: {}", path)
            }
        }
    }
}

impl std::error::Error for TestDataError {
    /// Expose the wrapped I/O error so callers walking the error chain
    /// can reach the root cause; the other variants carry no inner error.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            Self::Io(e) => Some(e),
            Self::ChecksumMismatch { .. }
            | Self::InvalidChecksumFile(_)
            | Self::DirectoryNotFound(_) => None,
        }
    }
}

impl From<std::io::Error> for TestDataError {
    /// Wrap an I/O error so that `?` can be used on `std::io::Result` values.
    fn from(error: std::io::Error) -> Self {
        Self::Io(error)
    }
}
70
71/// Calculate SHA-1 hash of a file
72fn calculate_file_sha1<P: AsRef<Path>>(file_path: P) -> Result<String, TestDataError> {
73    let mut file = fs::File::open(file_path)?;
74    let mut hasher = Sha1::new();
75    std::io::copy(&mut file, &mut hasher)?;
76    let result = hasher.finalize();
77    Ok(format!("{:x}", result))
78}
79
/// Parse a single line from a `checksums.sha1` file.
///
/// Accepts both line shapes written by `sha1sum`:
/// - text mode: `"<hash>  <filename>"` (two spaces)
/// - binary mode: `"<hash> *<filename>"` (space followed by `*`)
///
/// Returns `Some((hash, filename))` on success. Returns `None` when the
/// separator is missing or when either the hash or the filename is empty.
/// The filename may itself contain spaces.
fn parse_checksum_line(line: &str) -> Option<(String, String)> {
    // The hash never contains whitespace, so it ends at the first space.
    let (hash, rest) = line.split_once(' ')?;

    // The character after that space is the mode marker: ' ' or '*'.
    let filename = rest
        .strip_prefix(' ')
        .or_else(|| rest.strip_prefix('*'))?;

    if hash.is_empty() || filename.is_empty() {
        return None;
    }

    Some((hash.to_string(), filename.to_string()))
}
90
91/// Verify checksums for all files in a test data directory
92///
93/// Reads the `checksums.sha1` file and verifies each file listed.
94/// Uses the Rust `sha1` crate for reliable cross-platform verification.
95pub fn verify_test_data_checksums<P: AsRef<Path>>(test_data_dir: P) -> Result<(), TestDataError> {
96    let test_data_dir = test_data_dir.as_ref();
97
98    if !test_data_dir.exists() {
99        return Err(TestDataError::DirectoryNotFound(
100            test_data_dir.display().to_string(),
101        ));
102    }
103
104    let checksum_file = test_data_dir.join("checksums.sha1");
105    if !checksum_file.exists() {
106        return Err(TestDataError::InvalidChecksumFile(
107            checksum_file.display().to_string(),
108        ));
109    }
110
111    let checksum_content = fs::read_to_string(&checksum_file)?;
112
113    for (line_num, line) in checksum_content.lines().enumerate() {
114        let line = line.trim();
115        if line.is_empty() {
116            continue;
117        }
118
119        let (expected_hash, filename) = parse_checksum_line(line).ok_or_else(|| {
120            TestDataError::InvalidChecksumFile(format!(
121                "Invalid checksum format at line {} in {}",
122                line_num + 1,
123                checksum_file.display()
124            ))
125        })?;
126
127        let file_path = test_data_dir.join(&filename);
128        if !file_path.exists() {
129            return Err(TestDataError::DirectoryNotFound(format!(
130                "Referenced file not found: {}",
131                filename
132            )));
133        }
134
135        let actual_hash = calculate_file_sha1(&file_path)?;
136        if expected_hash != actual_hash {
137            return Err(TestDataError::ChecksumMismatch {
138                file: filename,
139                expected: expected_hash,
140                actual: actual_hash,
141            });
142        }
143    }
144
145    Ok(())
146}
147
148/// Get the list of expected files from checksums.sha1
149fn get_expected_files<P: AsRef<Path>>(test_data_dir: P) -> Result<Vec<String>, TestDataError> {
150    let checksum_file = test_data_dir.as_ref().join("checksums.sha1");
151    let content = fs::read_to_string(&checksum_file)?;
152
153    let mut files = Vec::new();
154    for line in content.lines() {
155        let line = line.trim();
156        if line.is_empty() {
157            continue;
158        }
159
160        if let Some((_, filename)) = parse_checksum_line(line) {
161            files.push(filename);
162        }
163    }
164
165    Ok(files)
166}
167
168/// Check which files are missing from a test data directory
169fn get_missing_files<P: AsRef<Path>>(test_data_dir: P) -> Result<Vec<String>, TestDataError> {
170    let test_data_dir = test_data_dir.as_ref();
171    let expected_files = get_expected_files(test_data_dir)?;
172
173    let missing_files: Vec<String> = expected_files
174        .into_iter()
175        .filter(|filename| !test_data_dir.join(filename).exists())
176        .collect();
177
178    Ok(missing_files)
179}
180
181/// Discover all test data directories that have checksums.sha1 files
182fn discover_test_data_directories<P: AsRef<Path>>(
183    base_dir: P,
184) -> Result<Vec<std::path::PathBuf>, TestDataError> {
185    let base_dir = base_dir.as_ref();
186
187    if !base_dir.exists() {
188        return Ok(Vec::new());
189    }
190
191    let mut directories = Vec::new();
192
193    // Walk through immediate subdirectories
194    let read_dir = fs::read_dir(base_dir).map_err(TestDataError::Io)?;
195
196    for entry in read_dir {
197        let entry = entry.map_err(TestDataError::Io)?;
198        let path = entry.path();
199
200        if path.is_dir() {
201            let checksum_file = path.join("checksums.sha1");
202            if checksum_file.exists() {
203                directories.push(path);
204            }
205        }
206    }
207
208    Ok(directories)
209}
210
211/// Ensure test data is available and verified
212///
213/// This function:
214/// 1. Dynamically discovers test data directories with checksums.sha1 files
215/// 2. Checks all discovered directories for missing files
216/// 3. Downloads missing data if needed (unless disabled)
217/// 4. Verifies checksums of all files
218///
219/// Can be called with either a specific directory or a base directory to scan
220pub fn ensure_test_data_available<P: AsRef<Path>>(path: P) -> Result<(), TestDataError> {
221    let path = path.as_ref();
222
223    // Determine if this is a specific directory or base directory to scan
224    let (_base_dir, specific_dirs) = if path.join("checksums.sha1").exists() {
225        // This is a specific test data directory
226        (path.parent().unwrap_or(path), vec![path.to_path_buf()])
227    } else {
228        // This is a base directory - discover all test data directories
229        let discovered = discover_test_data_directories(path)?;
230        (path, discovered)
231    };
232
233    if specific_dirs.is_empty() {
234        return Ok(()); // No test data directories found, nothing to do
235    }
236
237    // Check all directories for missing files
238    let mut all_missing_files = Vec::new();
239    let mut dirs_with_missing = Vec::new();
240
241    for dir in &specific_dirs {
242        let missing_files = get_missing_files(dir)?;
243        if !missing_files.is_empty() {
244            println!(
245                "Missing files in {}: {}",
246                dir.file_name().unwrap_or_default().to_string_lossy(),
247                missing_files.join(", ")
248            );
249            all_missing_files.extend(missing_files.iter().cloned());
250            dirs_with_missing.push(dir.clone());
251        }
252    }
253
254    // If any files are missing, attempt download
255    if !all_missing_files.is_empty() {
256        if !cfg!(feature = "no-large-tests") {
257            println!("Attempting to download test data from GitHub...");
258
259            match download_test_data_from_github() {
260                Ok(_) => {
261                    println!("Test data download completed successfully!");
262
263                    // Re-check all directories that had missing files
264                    for dir in &dirs_with_missing {
265                        let still_missing = get_missing_files(dir)?;
266                        if !still_missing.is_empty() {
267                            return Err(TestDataError::DirectoryNotFound(format!(
268                                "Download completed but still missing files in {}: {}",
269                                dir.display(),
270                                still_missing.join(", ")
271                            )));
272                        }
273                    }
274                }
275                Err(e) => {
276                    eprintln!("Error: Failed to download test data: {}", e);
277                    eprintln!("To skip large tests, run: cargo test --features no-large-tests");
278                    eprintln!("To download manually:");
279                    eprintln!("  git clone --depth=1 https://github.com/varioustoxins/ustar.git temp_ustar");
280                    eprintln!("  cp -r temp_ustar/ustar-parser/tests/test_data/* <your-project>/tests/test_data/");
281                    eprintln!("  rm -rf temp_ustar");
282
283                    return Err(TestDataError::DirectoryNotFound(format!(
284                        "Missing test data files and download failed: {}",
285                        e
286                    )));
287                }
288            }
289        } else {
290            return Err(TestDataError::DirectoryNotFound(format!(
291                "Missing test data files: {}. Download disabled by --features no-large-tests.",
292                all_missing_files.join(", ")
293            )));
294        }
295    }
296
297    // Verify checksums of all discovered directories
298    for dir in &specific_dirs {
299        verify_test_data_checksums(dir)?;
300    }
301
302    Ok(())
303}
304
305static DOWNLOAD_RESULT: OnceLock<Result<(), String>> = OnceLock::new();
306
307/// Download missing test data from GitHub repository
308/// Uses sparse checkout to only download test data directories
309fn download_test_data_from_github() -> Result<(), Box<dyn std::error::Error>> {
310    // Ensure download only happens once per test run using OnceLock
311    let result = DOWNLOAD_RESULT.get_or_init(|| perform_download().map_err(|e| e.to_string()));
312
313    // Return the cached result
314    result.clone().map_err(|e| e.into())
315}
316
317/// Perform the actual test data download from GitHub
318fn perform_download() -> Result<(), Box<dyn std::error::Error>> {
319    // Use tokio runtime for async operations
320    let rt = tokio::runtime::Runtime::new()?;
321    rt.block_on(download_github_archive())
322}
323
324/// Download and extract test data from GitHub ZIP archive
325async fn download_github_archive() -> Result<(), Box<dyn std::error::Error>> {
326    let archive_url = "https://github.com/varioustoxins/ustar/archive/refs/heads/main.zip";
327
328    println!("Downloading repository archive from GitHub...");
329
330    // Download the ZIP archive directly to memory (no temporary files needed)
331    let response = reqwest::get(archive_url).await?;
332    if !response.status().is_success() {
333        return Err(format!("Failed to download archive: HTTP {}", response.status()).into());
334    }
335
336    let zip_bytes = response.bytes().await?;
337    println!(
338        "Downloaded {} bytes, extracting test data...",
339        zip_bytes.len()
340    );
341
342    // Extract test data from ZIP
343    let cursor = Cursor::new(zip_bytes);
344    let mut archive = zip::ZipArchive::new(cursor)?;
345
346    // Determine current package root directory for target
347    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| ".".to_string());
348    let target_base = format!("{}/tests/test_data", manifest_dir);
349
350    // Create target directory if it doesn't exist
351    fs::create_dir_all(&target_base)?;
352
353    let mut extracted_files = 0;
354    let mut extracted_dirs = std::collections::HashSet::new();
355
356    // Extract files from the archive
357    for i in 0..archive.len() {
358        let mut file = archive.by_index(i)?;
359        let file_path = file.name();
360
361        // Only extract files from ustar-main/ustar-parser/tests/test_data/
362        if let Some(relative_path) = extract_test_data_path(file_path) {
363            let target_path = format!("{}/{}", target_base, relative_path);
364
365            // Track which directories we're extracting
366            if let Some(dir) = relative_path.split('/').next() {
367                if extracted_dirs.insert(dir.to_string()) {
368                    println!("Extracting {}...", dir);
369                }
370            }
371
372            // Create parent directories
373            if let Some(parent) = Path::new(&target_path).parent() {
374                fs::create_dir_all(parent)?;
375            }
376
377            // Extract the file
378            let mut target_file = fs::File::create(&target_path)?;
379            std::io::copy(&mut file, &mut target_file)?;
380            extracted_files += 1;
381        }
382    }
383
384    println!(
385        "Extracted {} files from {} directories",
386        extracted_files,
387        extracted_dirs.len()
388    );
389    Ok(())
390}
391
/// Extract the test data relative path from a ZIP archive entry name.
///
/// Converts `"ustar-main/ustar-parser/tests/test_data/nef_spec/file.nef"`
/// to `"nef_spec/file.nef"`.
///
/// Returns `None` when:
/// - the name does not contain `/tests/test_data/`,
/// - nothing follows that marker, or the name ends with `/` (a directory
///   entry),
/// - any path segment after the marker is empty or `..`.
///
/// The last rule exists because archive entry names are untrusted input:
/// a name such as `x/tests/test_data/../../evil` would otherwise be joined
/// onto the extraction directory and escape it. Backslashes are treated as
/// separators as well, so `..\` sequences are rejected too.
fn extract_test_data_path(archive_path: &str) -> Option<String> {
    const MARKER: &str = "/tests/test_data/";

    // Everything after the first occurrence of the marker.
    let (_, relative) = archive_path.split_once(MARKER)?;

    if relative.is_empty() || relative.ends_with('/') {
        return None;
    }

    let stays_inside_target = relative
        .split(|c| c == '/' || c == '\\')
        .all(|segment| !segment.is_empty() && segment != "..");

    if stays_inside_target {
        Some(relative.to_string())
    } else {
        None
    }
}
405
/// Return the temporary directory reported by `std::env::temp_dir`.
///
/// Not called anywhere at present, because the archive is downloaded
/// straight into memory; retained for a possible on-disk download path.
#[allow(dead_code)]
fn get_temp_dir() -> std::path::PathBuf {
    use std::env::temp_dir;

    temp_dir()
}
412
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// SHA-1 digest of the ASCII text `Hello, world!`.
    const HELLO_WORLD_SHA1: &str = "943a702d06f34599aee1f8da8ef9f7296031d699";

    /// Create a temporary directory containing a `checksums.sha1` file
    /// with the given contents.
    fn dir_with_checksums(listing: &str) -> TempDir {
        let dir = TempDir::new().unwrap();
        fs::write(dir.path().join("checksums.sha1"), listing).unwrap();
        dir
    }

    #[test]
    fn test_get_expected_files() {
        let dir = dir_with_checksums("abc123  file1.txt\ndef456  file2.txt\n");

        let listed = get_expected_files(dir.path()).unwrap();

        assert_eq!(listed, vec!["file1.txt", "file2.txt"]);
    }

    #[test]
    fn test_missing_files_detection() {
        let dir = dir_with_checksums("abc123  file1.txt\ndef456  file2.txt\n");

        // Only the first listed file exists on disk.
        fs::write(dir.path().join("file1.txt"), "content").unwrap();

        let absent = get_missing_files(dir.path()).unwrap();

        assert_eq!(absent, vec!["file2.txt"]);
    }

    #[test]
    fn test_sha1_verification_success() {
        let dir = dir_with_checksums(&format!("{}  test.txt\n", HELLO_WORLD_SHA1));
        fs::write(dir.path().join("test.txt"), "Hello, world!").unwrap();

        // The recorded digest matches the file contents.
        assert!(verify_test_data_checksums(dir.path()).is_ok());
    }

    #[test]
    fn test_sha1_verification_failure() {
        let wrong_hash = "0000000000000000000000000000000000000000";
        let dir = dir_with_checksums(&format!("{}  test.txt\n", wrong_hash));
        fs::write(dir.path().join("test.txt"), "Hello, world!").unwrap();

        // The recorded digest is deliberately wrong, so verification fails
        // and reports both digests.
        let Err(TestDataError::ChecksumMismatch {
            file,
            expected,
            actual,
        }) = verify_test_data_checksums(dir.path())
        else {
            panic!("Expected ChecksumMismatch error");
        };

        assert_eq!(file, "test.txt");
        assert_eq!(expected, wrong_hash);
        assert_eq!(actual, HELLO_WORLD_SHA1);
    }

    #[test]
    fn test_parse_checksum_line() {
        let parsed = parse_checksum_line("abc123  file.txt");
        assert_eq!(
            parsed,
            Some(("abc123".to_string(), "file.txt".to_string()))
        );

        assert!(parse_checksum_line("invalid line").is_none());
    }

    #[test]
    fn test_discover_test_data_directories() {
        let base = TempDir::new().unwrap();

        // Three subdirectories; only dir1 and dir3 get a checksums.sha1.
        for name in ["dir1", "dir2", "dir3"] {
            fs::create_dir_all(base.path().join(name)).unwrap();
        }
        fs::write(
            base.path().join("dir1").join("checksums.sha1"),
            "abc123  file1.txt\n",
        )
        .unwrap();
        fs::write(
            base.path().join("dir3").join("checksums.sha1"),
            "def456  file3.txt\n",
        )
        .unwrap();

        // dir2 holds an unrelated file and no checksum file.
        fs::write(
            base.path().join("dir2").join("some_other_file.txt"),
            "content",
        )
        .unwrap();

        let discovered = discover_test_data_directories(base.path()).unwrap();
        assert_eq!(discovered.len(), 2);

        let has_dir = |wanted: &str| {
            discovered
                .iter()
                .any(|p| p.file_name().unwrap().to_string_lossy() == wanted)
        };

        assert!(has_dir("dir1"));
        assert!(has_dir("dir3"));
        assert!(!has_dir("dir2"));
    }
}