Skip to main content

ref_solver/utils/
validation.rs

1//! Centralized validation and helper functions.
2
3use crate::web::format_detection::FileFormat;
4use std::collections::HashSet;
5
/// Maximum number of contigs allowed in a single file (DoS protection).
///
/// `check_contig_limit` reports an error once the current count has reached
/// this value.
pub const MAX_CONTIGS: usize = 100_000;

/// Security-related constants for input validation.
///
/// Upper bound on a filename; `validate_filename` compares it against
/// `str::len`, i.e. the length in UTF-8 bytes.
pub const MAX_FILENAME_LENGTH: usize = 255;
/// Smallest accepted file size in bytes; `validate_file_content` rejects
/// anything shorter, so empty content always fails.
pub const MIN_FILE_CONTENT_SIZE: usize = 1;
12
/// Report whether `s` has the shape of an MD5 checksum: exactly 32 hex digits.
///
/// Lowercase and uppercase hex digits are both accepted.
///
/// # Examples
///
/// ```
/// use ref_solver::utils::validation::is_valid_md5;
///
/// assert!(is_valid_md5("6aef897c3d6ff0c78aff06ac189178dd"));
/// assert!(!is_valid_md5("not-an-md5"));
/// assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178d")); // 31 chars
/// ```
#[must_use]
pub fn is_valid_md5(s: &str) -> bool {
    /// Number of hex digits in a textual MD5 digest.
    const MD5_HEX_LEN: usize = 32;

    // Hex digits are single-byte ASCII, so once every byte is a hex digit the
    // byte length equals the digit count; any non-ASCII byte fails the check.
    s.len() == MD5_HEX_LEN && s.bytes().all(|byte| byte.is_ascii_hexdigit())
}
28
29/// Normalize an MD5 string to lowercase.
30/// Returns None if the input is not a valid MD5.
31#[must_use]
32pub fn normalize_md5(s: &str) -> Option<String> {
33    if is_valid_md5(s) {
34        Some(s.to_lowercase())
35    } else {
36        None
37    }
38}
39
40/// Compute a signature hash from a set of MD5 checksums.
41///
42/// The signature is computed by:
43/// 1. Sorting the MD5s alphabetically
44/// 2. Joining them with commas
45/// 3. Computing MD5 of the concatenated string
46///
47/// This provides a deterministic identifier for a set of contigs.
48#[must_use]
49#[allow(clippy::implicit_hasher)] // Default hasher is fine for this use case
50pub fn compute_signature(md5s: &HashSet<String>) -> String {
51    if md5s.is_empty() {
52        return String::new();
53    }
54
55    let mut sorted: Vec<&str> = md5s.iter().map(std::string::String::as_str).collect();
56    sorted.sort_unstable();
57    let concatenated = sorted.join(",");
58    let digest = md5::compute(concatenated.as_bytes());
59    format!("{digest:x}")
60}
61
62/// Check if adding another contig would exceed the maximum allowed.
63///
64/// Call this with the current count BEFORE adding a new contig.
65/// Returns an error message if adding would exceed the limit, None if safe to add.
66///
67/// # Example
68/// ```ignore
69/// if check_contig_limit(contigs.len()).is_some() {
70///     return Err(...);
71/// }
72/// contigs.push(new_contig); // Safe to add
73/// ```
74#[must_use]
75pub fn check_contig_limit(count: usize) -> Option<String> {
76    if count >= MAX_CONTIGS {
77        Some(format!(
78            "Too many contigs: adding another would exceed maximum of {MAX_CONTIGS}"
79        ))
80    } else {
81        None
82    }
83}
84
85/// Security validation error types
86#[derive(Debug, thiserror::Error)]
87pub enum ValidationError {
88    #[error("Filename too long: exceeds {MAX_FILENAME_LENGTH} characters")]
89    FilenameTooLong,
90    #[error("Invalid filename: contains path traversal or invalid characters")]
91    InvalidFilename,
92    #[error("Empty filename provided")]
93    EmptyFilename,
94    #[error("File content appears malformed or invalid")]
95    InvalidFileContent,
96    #[error("File format validation failed")]
97    FormatValidationFailed,
98}
99
100/// Secure filename validation to prevent directory traversal and other attacks
101///
102/// Validates and sanitizes filenames by:
103/// - Checking length limits
104/// - Preventing directory traversal (../, ..\\)
105/// - Removing potentially dangerous characters
106/// - Ensuring filename is not empty after sanitization
107///
108/// # Errors
109///
110/// Returns `ValidationError::EmptyFilename` if the filename is empty,
111/// `ValidationError::FilenameTooLong` if it exceeds the limit, or
112/// `ValidationError::InvalidFilename` if it contains invalid characters.
113pub fn validate_filename(filename: &str) -> Result<String, ValidationError> {
114    // Check if filename is empty
115    if filename.trim().is_empty() {
116        return Err(ValidationError::EmptyFilename);
117    }
118
119    // Check length limit
120    if filename.len() > MAX_FILENAME_LENGTH {
121        return Err(ValidationError::FilenameTooLong);
122    }
123
124    // Prevent directory traversal attacks
125    if filename.contains("..") || filename.contains('/') || filename.contains('\\') {
126        return Err(ValidationError::InvalidFilename);
127    }
128
129    // Check for null bytes and other dangerous characters
130    if filename.contains('\0') || filename.chars().any(|c| ('\x01'..='\x1F').contains(&c)) {
131        return Err(ValidationError::InvalidFilename);
132    }
133
134    // Sanitize filename by keeping only safe characters
135    let sanitized = filename
136        .chars()
137        .filter(|c| c.is_ascii_alphanumeric() || *c == '.' || *c == '-' || *c == '_' || *c == ' ')
138        .collect::<String>();
139
140    // Ensure sanitized filename is not empty
141    if sanitized.trim().is_empty() {
142        return Err(ValidationError::InvalidFilename);
143    }
144
145    // Prevent hidden files (starting with .) unless it's a known extension
146    if sanitized.starts_with('.') && !has_known_extension(&sanitized) {
147        return Err(ValidationError::InvalidFilename);
148    }
149
150    Ok(sanitized)
151}
152
/// Tell whether `filename` ends, ignoring case, with one of the extensions
/// this module treats as safe.
fn has_known_extension(filename: &str) -> bool {
    /// Extensions considered safe, all lowercase.
    const SAFE_EXTENSIONS: [&str; 10] = [
        ".sam",
        ".bam",
        ".cram",
        ".dict",
        ".vcf",
        ".txt",
        ".tsv",
        ".csv",
        ".gz",
        ".assembly_report.txt",
    ];

    // Lowercase once, then compare against every candidate suffix.
    let lowered = filename.to_lowercase();
    SAFE_EXTENSIONS.iter().any(|&ext| lowered.ends_with(ext))
}
172
173/// Validate file content using magic numbers for known binary formats
174///
175/// Performs format validation by checking magic numbers (file signatures)
176/// to prevent format confusion attacks and ensure file integrity
177#[must_use]
178pub fn validate_file_format(content: &[u8], expected_format: FileFormat) -> bool {
179    if content.is_empty() {
180        return false;
181    }
182
183    match expected_format {
184        FileFormat::Bam => {
185            // BAM files start with "BAM\x01"
186            content.len() >= 4 && content.starts_with(b"BAM\x01")
187        }
188        FileFormat::Cram => {
189            // CRAM files start with "CRAM"
190            content.len() >= 4 && content.starts_with(b"CRAM")
191        }
192        FileFormat::Vcf => {
193            // VCF files should start with "##fileformat=VCF"
194            let content_str = std::str::from_utf8(content).unwrap_or("");
195            content_str.starts_with("##fileformat=VCF")
196        }
197        FileFormat::Sam => {
198            // SAM files are text-based, check for header indicators
199            let content_str = std::str::from_utf8(content).unwrap_or("");
200            content_str.contains("@SQ")
201                || content_str.contains("@HD")
202                || content_str.contains("SN:")
203                || content_str.contains("LN:")
204        }
205        FileFormat::Dict => {
206            // Picard dictionary files have @HD and @SQ headers
207            let content_str = std::str::from_utf8(content).unwrap_or("");
208            content_str.contains("@HD") && content_str.contains("@SQ")
209        }
210        FileFormat::NcbiReport => {
211            // NCBI assembly reports have specific column headers
212            let content_str = std::str::from_utf8(content).unwrap_or("");
213            content_str.contains("Sequence-Name") || content_str.contains("Sequence-Role")
214        }
215        FileFormat::Tsv => {
216            // TSV files should have tab-separated content
217            let content_str = std::str::from_utf8(content).unwrap_or("");
218            content_str.contains('\t')
219                && (content_str.to_lowercase().contains("length")
220                    || content_str.to_lowercase().contains("sequence")
221                    || content_str.to_lowercase().contains("size"))
222        }
223        FileFormat::Fai => {
224            // FAI files have 5 tab-separated columns per line
225            let content_str = std::str::from_utf8(content).unwrap_or("");
226            content_str.lines().take(5).any(|line| {
227                let fields: Vec<&str> = line.split('\t').collect();
228                fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
229            })
230        }
231        FileFormat::Fasta => {
232            // FASTA files start with '>' or are gzip compressed (0x1f 0x8b)
233            content.starts_with(b">")
234                || (content.len() >= 2 && content[0] == 0x1f && content[1] == 0x8b)
235        }
236        FileFormat::Auto => {
237            // Auto-detection always passes initial validation
238            true
239        }
240    }
241}
242
243/// Validate that file content is not malicious or malformed
244///
245/// Basic security checks for file content integrity:
246/// - Minimum size requirements
247/// - Binary content detection for text formats
248/// - Basic malformation checks
249///
250/// # Errors
251///
252/// Returns `ValidationError::InvalidFileContent` if the content is too small,
253/// contains unexpected binary data for text formats, or fails UTF-8 validation.
254pub fn validate_file_content(content: &[u8], expected_text: bool) -> Result<(), ValidationError> {
255    // Check minimum content size
256    if content.len() < MIN_FILE_CONTENT_SIZE {
257        return Err(ValidationError::InvalidFileContent);
258    }
259
260    // If we expect text content, validate it's not binary
261    if expected_text {
262        // Check for excessive non-printable characters
263        let non_printable_count = content
264            .iter()
265            .filter(|&&b| b < 9 || (b > 13 && b < 32) || b > 126)
266            .count();
267
268        // Allow up to 5% non-printable characters for text files
269        if content.len() > 100 && non_printable_count > content.len() / 20 {
270            return Err(ValidationError::InvalidFileContent);
271        }
272
273        // Basic UTF-8 validation for text content
274        if std::str::from_utf8(content).is_err() {
275            return Err(ValidationError::InvalidFileContent);
276        }
277    }
278
279    Ok(())
280}
281
282/// Comprehensive input validation combining filename and content checks
283///
284/// Performs complete security validation for file uploads:
285/// - Filename sanitization and security checks
286/// - File format validation via magic numbers
287/// - Content integrity validation
288///
289/// # Errors
290///
291/// Returns a `ValidationError` if filename validation fails, the file format
292/// doesn't match the expected format, or content validation fails.
293pub fn validate_upload(
294    filename: Option<&str>,
295    content: &[u8],
296    expected_format: FileFormat,
297) -> Result<Option<String>, ValidationError> {
298    // Validate filename if provided
299    let validated_filename = if let Some(name) = filename {
300        Some(validate_filename(name)?)
301    } else {
302        None
303    };
304
305    // Validate content integrity
306    let is_text_format = matches!(
307        expected_format,
308        FileFormat::Sam
309            | FileFormat::Dict
310            | FileFormat::Vcf
311            | FileFormat::NcbiReport
312            | FileFormat::Tsv
313            | FileFormat::Auto
314    );
315
316    validate_file_content(content, is_text_format)?;
317
318    // Validate file format - even for auto-detection, check for obvious mismatches
319    if expected_format == FileFormat::Auto {
320        // For auto-detection, at least verify it's not a malformed binary file
321        // Check if it looks like a known binary format but is malformed
322        if content.len() >= 4 {
323            let starts_with_bam = content.starts_with(b"BAM");
324            let starts_with_cram = content.starts_with(b"CRAM");
325
326            // If it looks like it should be BAM/CRAM but isn't valid, reject it
327            if starts_with_bam && !validate_file_format(content, FileFormat::Bam) {
328                return Err(ValidationError::FormatValidationFailed);
329            }
330            if starts_with_cram && !validate_file_format(content, FileFormat::Cram) {
331                return Err(ValidationError::FormatValidationFailed);
332            }
333        }
334    } else if !validate_file_format(content, expected_format) {
335        return Err(ValidationError::FormatValidationFailed);
336    }
337
338    Ok(validated_filename)
339}
340
#[cfg(test)]
mod tests {
    use super::*;

    /// Well-formed lowercase checksum shared by the MD5 tests.
    const SAMPLE_MD5: &str = "6aef897c3d6ff0c78aff06ac189178dd";

    #[test]
    fn test_is_valid_md5() {
        // Uppercase hex is accepted as well.
        for good in &[SAMPLE_MD5, "AABBCCDD11223344556677889900AABB"] {
            assert!(is_valid_md5(good), "should accept {good:?}");
        }

        let rejected = [
            "not-an-md5",
            "6aef897c3d6ff0c78aff06ac189178d",   // 31 chars
            "6aef897c3d6ff0c78aff06ac189178ddd", // 33 chars
            "",                                  // empty
            "6aef897c3d6ff0c78aff06ac189178dg",  // invalid char
        ];
        for bad in &rejected {
            assert!(!is_valid_md5(bad), "should reject {bad:?}");
        }
    }

    #[test]
    fn test_normalize_md5() {
        let upper = SAMPLE_MD5.to_uppercase();
        assert_eq!(normalize_md5(&upper).as_deref(), Some(SAMPLE_MD5));
        assert_eq!(normalize_md5("invalid"), None);
    }

    #[test]
    fn test_compute_signature() {
        // Two fake MD5s: "aaaa…" and "bbbb…", 32 characters each.
        let md5s: HashSet<String> = ["aaaa", "bbbb"]
            .iter()
            .map(|unit| unit.repeat(8))
            .collect();

        let first = compute_signature(&md5s);
        assert_eq!(first.len(), 32);

        // Same input should give same output
        assert_eq!(first, compute_signature(&md5s));

        // Empty set gives empty string
        let empty: HashSet<String> = HashSet::new();
        assert_eq!(compute_signature(&empty), "");
    }

    #[test]
    fn test_check_contig_limit() {
        for below in &[100, MAX_CONTIGS - 1] {
            assert!(check_contig_limit(*below).is_none(), "{below} is under the limit");
        }
        for reached in &[MAX_CONTIGS, MAX_CONTIGS + 1] {
            assert!(check_contig_limit(*reached).is_some(), "{reached} hits the limit");
        }
    }

    // Security validation tests
    #[test]
    fn test_validate_filename_safe() {
        for name in &["test.sam", "my-file.bam", "data_file.txt", "sample 123.vcf"] {
            assert!(validate_filename(name).is_ok(), "should accept {name:?}");
        }
    }

    #[test]
    fn test_validate_filename_dangerous() {
        let long_name = "a".repeat(300);
        let rejected = [
            // Directory traversal attempts
            "../etc/passwd",
            "..\\windows\\system32",
            "test/../../secret",
            // Null bytes and control characters
            "test\0.txt",
            "test\x01.txt",
            // Too long filename
            long_name.as_str(),
            // Empty or whitespace-only
            "",
            "   ",
            // Hidden files without known extensions
            ".hidden",
        ];
        for name in &rejected {
            assert!(validate_filename(name).is_err(), "should reject {name:?}");
        }
    }

    #[test]
    fn test_validate_filename_sanitization() {
        let cases = [
            // Should remove dangerous characters but keep safe ones
            ("test@#$%file.txt", "testfile.txt"),
            // Should preserve safe characters
            ("my-file_123.sam", "my-file_123.sam"),
        ];
        for (input, expected) in &cases {
            assert_eq!(validate_filename(input).unwrap(), *expected);
        }
    }

    #[test]
    fn test_validate_file_format_bam() {
        assert!(validate_file_format(b"BAM\x01test_content", FileFormat::Bam));
        assert!(!validate_file_format(b"NOTBAM\x01", FileFormat::Bam));
    }

    #[test]
    fn test_validate_file_format_cram() {
        assert!(validate_file_format(b"CRAMtest_content", FileFormat::Cram));
        assert!(!validate_file_format(b"NOTCRAM", FileFormat::Cram));
    }

    #[test]
    fn test_validate_file_format_vcf() {
        assert!(validate_file_format(
            b"##fileformat=VCFv4.2\n##contig=<ID=chr1>",
            FileFormat::Vcf
        ));
        assert!(!validate_file_format(b"@SQ\tSN:chr1\tLN:123", FileFormat::Vcf));
    }

    #[test]
    fn test_validate_file_format_sam() {
        let headers: [&[u8]; 2] = [b"@SQ\tSN:chr1\tLN:123456", b"@HD\tVN:1.0\tSO:coordinate"];
        for header in &headers {
            assert!(validate_file_format(header, FileFormat::Sam));
        }
    }

    #[test]
    fn test_validate_file_content_text() {
        let valid_text = b"@SQ\tSN:chr1\tLN:123456\n@SQ\tSN:chr2\tLN:654321";
        assert!(validate_file_content(valid_text, true).is_ok());

        // Too much binary data for text format
        assert!(validate_file_content(&[0u8; 1000], true).is_err());

        // Empty content
        assert!(validate_file_content(b"", true).is_err());
    }

    #[test]
    fn test_validate_file_content_binary() {
        assert!(validate_file_content(&[0xABu8; 100], false).is_ok());

        // Empty content still invalid for binary
        assert!(validate_file_content(b"", false).is_err());
    }

    #[test]
    fn test_validate_upload_complete() {
        let sam_content: &[u8] = b"@SQ\tSN:chr1\tLN:123456";

        // Valid upload with filename
        let named = validate_upload(Some("test.sam"), sam_content, FileFormat::Sam);
        assert_eq!(
            named.expect("valid upload with filename").as_deref(),
            Some("test.sam")
        );

        // Valid upload without filename
        let unnamed = validate_upload(None, sam_content, FileFormat::Sam);
        assert!(matches!(unnamed, Ok(None)));

        // Invalid filename
        assert!(validate_upload(Some("../etc/passwd"), sam_content, FileFormat::Sam).is_err());

        // Format mismatch
        assert!(validate_upload(Some("test.sam"), b"BAM\x01test", FileFormat::Sam).is_err());
    }

    #[test]
    fn test_has_known_extension() {
        for known in &[".sam", ".bam", ".vcf.gz", "test.assembly_report.txt"] {
            assert!(has_known_extension(known), "should know {known:?}");
        }
        for unknown in &[".exe", ".hidden", ".config"] {
            assert!(!has_known_extension(unknown), "should not know {unknown:?}");
        }
    }
}
527}