// ref_solver/utils/validation.rs

//! Centralized validation and helper functions.

use crate::web::format_detection::FileFormat;
use std::collections::HashSet;

/// Maximum number of contigs allowed in a single file (DoS protection).
pub const MAX_CONTIGS: usize = 100_000;

/// Security-related constants for input validation.
///
/// Maximum accepted filename length.
pub const MAX_FILENAME_LENGTH: usize = 255;
/// Minimum number of bytes a file's content must contain to be accepted.
pub const MIN_FILE_CONTENT_SIZE: usize = 1;

/// Validate that a string is a valid MD5 checksum (32 hex characters).
///
/// Both uppercase and lowercase hex digits are accepted.
///
/// # Examples
///
/// ```
/// use ref_solver::utils::validation::is_valid_md5;
///
/// assert!(is_valid_md5("6aef897c3d6ff0c78aff06ac189178dd"));
/// assert!(!is_valid_md5("not-an-md5"));
/// assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178d")); // 31 chars
/// ```
#[must_use]
pub fn is_valid_md5(s: &str) -> bool {
    // Every byte of a non-ASCII character fails the hex-digit test, so checking
    // bytes gives the same answer as checking chars.
    s.len() == 32 && s.bytes().all(|b| b.is_ascii_hexdigit())
}

29/// Normalize an MD5 string to lowercase.
30/// Returns None if the input is not a valid MD5.
31#[must_use]
32pub fn normalize_md5(s: &str) -> Option<String> {
33    if is_valid_md5(s) {
34        Some(s.to_lowercase())
35    } else {
36        None
37    }
38}
39
40/// Compute the GA4GH sha512t24u digest for a sequence.
41///
42/// Algorithm: SHA-512 the sequence bytes, truncate to the first 24 bytes,
43/// then base64url-encode without padding, producing a 32-character string.
44///
45/// The input sequence should already be uppercased.
46///
47/// # Examples
48///
49/// ```
50/// use ref_solver::utils::validation::compute_sha512t24u;
51///
52/// let digest = compute_sha512t24u(b"ACGT");
53/// assert_eq!(digest.len(), 32);
54/// ```
55#[must_use]
56pub fn compute_sha512t24u(sequence: &[u8]) -> String {
57    refget_digest::sha512t24u(sequence)
58}
59
/// Validate that a string is a valid sha512t24u digest (32 chars, base64url alphabet).
///
/// The accepted alphabet is ASCII letters, ASCII digits, `-` and `_`.
///
/// # Examples
///
/// ```
/// use ref_solver::utils::validation::is_valid_sha512t24u;
///
/// assert!(is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2"));
/// assert!(!is_valid_sha512t24u("too-short"));
/// ```
#[must_use]
pub fn is_valid_sha512t24u(s: &str) -> bool {
    s.len() == 32
        && s.bytes()
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_'))
}

77/// Compute a signature hash from a set of MD5 checksums.
78///
79/// The signature is computed by:
80/// 1. Sorting the MD5s alphabetically
81/// 2. Joining them with commas
82/// 3. Computing MD5 of the concatenated string
83///
84/// This provides a deterministic identifier for a set of contigs.
85#[must_use]
86#[allow(clippy::implicit_hasher)] // Default hasher is fine for this use case
87pub fn compute_signature(md5s: &HashSet<String>) -> String {
88    if md5s.is_empty() {
89        return String::new();
90    }
91
92    let mut sorted: Vec<&str> = md5s.iter().map(std::string::String::as_str).collect();
93    sorted.sort_unstable();
94    let concatenated = sorted.join(",");
95    let digest = md5::compute(concatenated.as_bytes());
96    format!("{digest:x}")
97}
98
99/// Check if adding another contig would exceed the maximum allowed.
100///
101/// Call this with the current count BEFORE adding a new contig.
102/// Returns an error message if adding would exceed the limit, None if safe to add.
103///
104/// # Example
105/// ```ignore
106/// if check_contig_limit(contigs.len()).is_some() {
107///     return Err(...);
108/// }
109/// contigs.push(new_contig); // Safe to add
110/// ```
111#[must_use]
112pub fn check_contig_limit(count: usize) -> Option<String> {
113    if count >= MAX_CONTIGS {
114        Some(format!(
115            "Too many contigs: adding another would exceed maximum of {MAX_CONTIGS}"
116        ))
117    } else {
118        None
119    }
120}
121
122/// Security validation error types
123#[derive(Debug, thiserror::Error)]
124pub enum ValidationError {
125    #[error("Filename too long: exceeds {MAX_FILENAME_LENGTH} characters")]
126    FilenameTooLong,
127    #[error("Invalid filename: contains path traversal or invalid characters")]
128    InvalidFilename,
129    #[error("Empty filename provided")]
130    EmptyFilename,
131    #[error("File content appears malformed or invalid")]
132    InvalidFileContent,
133    #[error("File format validation failed")]
134    FormatValidationFailed,
135}
136
137/// Secure filename validation to prevent directory traversal and other attacks
138///
139/// Validates and sanitizes filenames by:
140/// - Checking length limits
141/// - Preventing directory traversal (../, ..\\)
142/// - Removing potentially dangerous characters
143/// - Ensuring filename is not empty after sanitization
144///
145/// # Errors
146///
147/// Returns `ValidationError::EmptyFilename` if the filename is empty,
148/// `ValidationError::FilenameTooLong` if it exceeds the limit, or
149/// `ValidationError::InvalidFilename` if it contains invalid characters.
150pub fn validate_filename(filename: &str) -> Result<String, ValidationError> {
151    // Check if filename is empty
152    if filename.trim().is_empty() {
153        return Err(ValidationError::EmptyFilename);
154    }
155
156    // Check length limit
157    if filename.len() > MAX_FILENAME_LENGTH {
158        return Err(ValidationError::FilenameTooLong);
159    }
160
161    // Prevent directory traversal attacks
162    if filename.contains("..") || filename.contains('/') || filename.contains('\\') {
163        return Err(ValidationError::InvalidFilename);
164    }
165
166    // Check for null bytes and other dangerous characters
167    if filename.contains('\0') || filename.chars().any(|c| ('\x01'..='\x1F').contains(&c)) {
168        return Err(ValidationError::InvalidFilename);
169    }
170
171    // Sanitize filename by keeping only safe characters
172    let sanitized = filename
173        .chars()
174        .filter(|c| c.is_ascii_alphanumeric() || *c == '.' || *c == '-' || *c == '_' || *c == ' ')
175        .collect::<String>();
176
177    // Ensure sanitized filename is not empty
178    if sanitized.trim().is_empty() {
179        return Err(ValidationError::InvalidFilename);
180    }
181
182    // Prevent hidden files (starting with .) unless it's a known extension
183    if sanitized.starts_with('.') && !has_known_extension(&sanitized) {
184        return Err(ValidationError::InvalidFilename);
185    }
186
187    Ok(sanitized)
188}
189
/// Check if filename has a known safe extension
///
/// The comparison is case-insensitive and only looks at the end of the name,
/// so a bare extension such as `.sam` also counts as a match.
fn has_known_extension(filename: &str) -> bool {
    const SAFE_EXTENSIONS: [&str; 10] = [
        ".sam",
        ".bam",
        ".cram",
        ".dict",
        ".vcf",
        ".txt",
        ".tsv",
        ".csv",
        ".gz",
        ".assembly_report.txt",
    ];

    // Lowercase once up front instead of re-allocating for every extension.
    let lowered = filename.to_lowercase();
    SAFE_EXTENSIONS.iter().any(|ext| lowered.ends_with(ext))
}

210/// Validate file content using magic numbers for known binary formats
211///
212/// Performs format validation by checking magic numbers (file signatures)
213/// to prevent format confusion attacks and ensure file integrity
214#[must_use]
215pub fn validate_file_format(content: &[u8], expected_format: FileFormat) -> bool {
216    if content.is_empty() {
217        return false;
218    }
219
220    match expected_format {
221        FileFormat::Bam => {
222            // BAM files start with "BAM\x01"
223            content.len() >= 4 && content.starts_with(b"BAM\x01")
224        }
225        FileFormat::Cram => {
226            // CRAM files start with "CRAM"
227            content.len() >= 4 && content.starts_with(b"CRAM")
228        }
229        FileFormat::Vcf => {
230            // VCF files should start with "##fileformat=VCF"
231            let content_str = std::str::from_utf8(content).unwrap_or("");
232            content_str.starts_with("##fileformat=VCF")
233        }
234        FileFormat::Sam => {
235            // SAM files are text-based, check for header indicators
236            let content_str = std::str::from_utf8(content).unwrap_or("");
237            content_str.contains("@SQ")
238                || content_str.contains("@HD")
239                || content_str.contains("SN:")
240                || content_str.contains("LN:")
241        }
242        FileFormat::Dict => {
243            // Picard dictionary files have @HD and @SQ headers
244            let content_str = std::str::from_utf8(content).unwrap_or("");
245            content_str.contains("@HD") && content_str.contains("@SQ")
246        }
247        FileFormat::NcbiReport => {
248            // NCBI assembly reports have specific column headers
249            let content_str = std::str::from_utf8(content).unwrap_or("");
250            content_str.contains("Sequence-Name") || content_str.contains("Sequence-Role")
251        }
252        FileFormat::Tsv => {
253            // TSV files should have tab-separated content
254            let content_str = std::str::from_utf8(content).unwrap_or("");
255            content_str.contains('\t')
256                && (content_str.to_lowercase().contains("length")
257                    || content_str.to_lowercase().contains("sequence")
258                    || content_str.to_lowercase().contains("size"))
259        }
260        FileFormat::Fai => {
261            // FAI files have 5 tab-separated columns per line
262            let content_str = std::str::from_utf8(content).unwrap_or("");
263            content_str.lines().take(5).any(|line| {
264                let fields: Vec<&str> = line.split('\t').collect();
265                fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
266            })
267        }
268        FileFormat::Fasta => {
269            // FASTA files start with '>' or are gzip compressed (0x1f 0x8b)
270            content.starts_with(b">")
271                || (content.len() >= 2 && content[0] == 0x1f && content[1] == 0x8b)
272        }
273        FileFormat::Auto => {
274            // Auto-detection always passes initial validation
275            true
276        }
277    }
278}
279
280/// Validate that file content is not malicious or malformed
281///
282/// Basic security checks for file content integrity:
283/// - Minimum size requirements
284/// - Binary content detection for text formats
285/// - Basic malformation checks
286///
287/// # Errors
288///
289/// Returns `ValidationError::InvalidFileContent` if the content is too small,
290/// contains unexpected binary data for text formats, or fails UTF-8 validation.
291pub fn validate_file_content(content: &[u8], expected_text: bool) -> Result<(), ValidationError> {
292    // Check minimum content size
293    if content.len() < MIN_FILE_CONTENT_SIZE {
294        return Err(ValidationError::InvalidFileContent);
295    }
296
297    // If we expect text content, validate it's not binary
298    if expected_text {
299        // Check for excessive non-printable characters
300        let non_printable_count = content
301            .iter()
302            .filter(|&&b| b < 9 || (b > 13 && b < 32) || b > 126)
303            .count();
304
305        // Allow up to 5% non-printable characters for text files
306        if content.len() > 100 && non_printable_count > content.len() / 20 {
307            return Err(ValidationError::InvalidFileContent);
308        }
309
310        // Basic UTF-8 validation for text content
311        if std::str::from_utf8(content).is_err() {
312            return Err(ValidationError::InvalidFileContent);
313        }
314    }
315
316    Ok(())
317}
318
319/// Comprehensive input validation combining filename and content checks
320///
321/// Performs complete security validation for file uploads:
322/// - Filename sanitization and security checks
323/// - File format validation via magic numbers
324/// - Content integrity validation
325///
326/// # Errors
327///
328/// Returns a `ValidationError` if filename validation fails, the file format
329/// doesn't match the expected format, or content validation fails.
330pub fn validate_upload(
331    filename: Option<&str>,
332    content: &[u8],
333    expected_format: FileFormat,
334) -> Result<Option<String>, ValidationError> {
335    // Validate filename if provided
336    let validated_filename = if let Some(name) = filename {
337        Some(validate_filename(name)?)
338    } else {
339        None
340    };
341
342    // Validate content integrity
343    let is_text_format = matches!(
344        expected_format,
345        FileFormat::Sam
346            | FileFormat::Dict
347            | FileFormat::Vcf
348            | FileFormat::NcbiReport
349            | FileFormat::Tsv
350            | FileFormat::Auto
351    );
352
353    validate_file_content(content, is_text_format)?;
354
355    // Validate file format - even for auto-detection, check for obvious mismatches
356    if expected_format == FileFormat::Auto {
357        // For auto-detection, at least verify it's not a malformed binary file
358        // Check if it looks like a known binary format but is malformed
359        if content.len() >= 4 {
360            let starts_with_bam = content.starts_with(b"BAM");
361            let starts_with_cram = content.starts_with(b"CRAM");
362
363            // If it looks like it should be BAM/CRAM but isn't valid, reject it
364            if starts_with_bam && !validate_file_format(content, FileFormat::Bam) {
365                return Err(ValidationError::FormatValidationFailed);
366            }
367            if starts_with_cram && !validate_file_format(content, FileFormat::Cram) {
368                return Err(ValidationError::FormatValidationFailed);
369            }
370        }
371    } else if !validate_file_format(content, expected_format) {
372        return Err(ValidationError::FormatValidationFailed);
373    }
374
375    Ok(validated_filename)
376}
377
/// Unit tests for the validation helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_valid_md5() {
        assert!(is_valid_md5("6aef897c3d6ff0c78aff06ac189178dd"));
        assert!(is_valid_md5("AABBCCDD11223344556677889900AABB")); // uppercase ok
        assert!(!is_valid_md5("not-an-md5"));
        assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178d")); // 31 chars
        assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178ddd")); // 33 chars
        assert!(!is_valid_md5("")); // empty
        assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178dg")); // invalid char
    }

    #[test]
    fn test_compute_sha512t24u() {
        // "ACGT" -> known sha512t24u digest
        let digest = compute_sha512t24u(b"ACGT");
        assert_eq!(digest.len(), 32);
        assert!(is_valid_sha512t24u(&digest));

        // Deterministic: same input -> same output
        assert_eq!(digest, compute_sha512t24u(b"ACGT"));

        // Different input -> different output
        assert_ne!(digest, compute_sha512t24u(b"TGCA"));

        // Verify against known value (SHA-512 of "ACGT", truncated to 24 bytes, base64url no-pad)
        assert_eq!(digest, "aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2");
    }

    #[test]
    fn test_is_valid_sha512t24u() {
        assert!(is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2"));
        assert!(!is_valid_sha512t24u("too-short"));
        assert!(!is_valid_sha512t24u(""));
        // 33 chars - too long
        assert!(!is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2X"));
        // Invalid character (space)
        assert!(!is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw "));
    }

    #[test]
    fn test_normalize_md5() {
        assert_eq!(
            normalize_md5("6AEF897C3D6FF0C78AFF06AC189178DD"),
            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
        );
        assert_eq!(normalize_md5("invalid"), None);
    }

    #[test]
    fn test_compute_signature() {
        let mut md5s = HashSet::new();
        md5s.insert("aaaa".repeat(8)); // fake MD5
        md5s.insert("bbbb".repeat(8));

        let sig = compute_signature(&md5s);
        assert_eq!(sig.len(), 32);

        // Same input should give same output
        let sig2 = compute_signature(&md5s);
        assert_eq!(sig, sig2);

        // Empty set gives empty string
        let empty: HashSet<String> = HashSet::new();
        assert_eq!(compute_signature(&empty), "");
    }

    #[test]
    fn test_check_contig_limit() {
        assert!(check_contig_limit(100).is_none());
        assert!(check_contig_limit(MAX_CONTIGS - 1).is_none());
        assert!(check_contig_limit(MAX_CONTIGS).is_some());
        assert!(check_contig_limit(MAX_CONTIGS + 1).is_some());
    }

    // Security validation tests
    #[test]
    fn test_validate_filename_safe() {
        assert!(validate_filename("test.sam").is_ok());
        assert!(validate_filename("my-file.bam").is_ok());
        assert!(validate_filename("data_file.txt").is_ok());
        assert!(validate_filename("sample 123.vcf").is_ok());
    }

    #[test]
    fn test_validate_filename_dangerous() {
        // Directory traversal attempts
        assert!(validate_filename("../etc/passwd").is_err());
        assert!(validate_filename("..\\windows\\system32").is_err());
        assert!(validate_filename("test/../../secret").is_err());

        // Null bytes and control characters
        assert!(validate_filename("test\0.txt").is_err());
        assert!(validate_filename("test\x01.txt").is_err());

        // Too long filename
        let long_name = "a".repeat(300);
        assert!(validate_filename(&long_name).is_err());

        // Empty or whitespace-only
        assert!(validate_filename("").is_err());
        assert!(validate_filename("   ").is_err());

        // Hidden files without known extensions
        assert!(validate_filename(".hidden").is_err());
    }

    #[test]
    fn test_validate_filename_sanitization() {
        // Should remove dangerous characters but keep safe ones
        let result = validate_filename("test@#$%file.txt").unwrap();
        assert_eq!(result, "testfile.txt");

        // Should preserve safe characters
        let result = validate_filename("my-file_123.sam").unwrap();
        assert_eq!(result, "my-file_123.sam");
    }

    #[test]
    fn test_validate_file_format_bam() {
        let bam_content = b"BAM\x01test_content";
        assert!(validate_file_format(bam_content, FileFormat::Bam));

        let invalid_bam = b"NOTBAM\x01";
        assert!(!validate_file_format(invalid_bam, FileFormat::Bam));
    }

    #[test]
    fn test_validate_file_format_cram() {
        let cram_content = b"CRAMtest_content";
        assert!(validate_file_format(cram_content, FileFormat::Cram));

        let invalid_cram = b"NOTCRAM";
        assert!(!validate_file_format(invalid_cram, FileFormat::Cram));
    }

    #[test]
    fn test_validate_file_format_vcf() {
        let vcf_content = b"##fileformat=VCFv4.2\n##contig=<ID=chr1>";
        assert!(validate_file_format(vcf_content, FileFormat::Vcf));

        let invalid_vcf = b"@SQ\tSN:chr1\tLN:123";
        assert!(!validate_file_format(invalid_vcf, FileFormat::Vcf));
    }

    #[test]
    fn test_validate_file_format_sam() {
        let sam_content = b"@SQ\tSN:chr1\tLN:123456";
        assert!(validate_file_format(sam_content, FileFormat::Sam));

        let sam_content2 = b"@HD\tVN:1.0\tSO:coordinate";
        assert!(validate_file_format(sam_content2, FileFormat::Sam));
    }

    #[test]
    fn test_validate_file_content_text() {
        let valid_text = b"@SQ\tSN:chr1\tLN:123456\n@SQ\tSN:chr2\tLN:654321";
        assert!(validate_file_content(valid_text, true).is_ok());

        // Too much binary data for text format
        let binary_data = vec![0u8; 1000];
        assert!(validate_file_content(&binary_data, true).is_err());

        // Empty content
        assert!(validate_file_content(b"", true).is_err());
    }

    #[test]
    fn test_validate_file_content_binary() {
        let binary_data = vec![0xABu8; 100];
        assert!(validate_file_content(&binary_data, false).is_ok());

        // Empty content still invalid for binary
        assert!(validate_file_content(b"", false).is_err());
    }

    #[test]
    fn test_validate_upload_complete() {
        let sam_content = b"@SQ\tSN:chr1\tLN:123456";

        // Valid upload with filename
        let result = validate_upload(Some("test.sam"), sam_content, FileFormat::Sam);
        assert!(result.is_ok());
        assert_eq!(result.unwrap().unwrap(), "test.sam");

        // Valid upload without filename
        let result = validate_upload(None, sam_content, FileFormat::Sam);
        assert!(result.is_ok());
        assert!(result.unwrap().is_none());

        // Invalid filename
        let result = validate_upload(Some("../etc/passwd"), sam_content, FileFormat::Sam);
        assert!(result.is_err());

        // Format mismatch
        let bam_content = b"BAM\x01test";
        let result = validate_upload(Some("test.sam"), bam_content, FileFormat::Sam);
        assert!(result.is_err());
    }

    #[test]
    fn test_has_known_extension() {
        assert!(has_known_extension(".sam"));
        assert!(has_known_extension(".bam"));
        assert!(has_known_extension(".vcf.gz"));
        assert!(has_known_extension("test.assembly_report.txt"));

        assert!(!has_known_extension(".exe"));
        assert!(!has_known_extension(".hidden"));
        assert!(!has_known_extension(".config"));
    }
}