ref_solver/utils/
validation.rs1use crate::web::format_detection::FileFormat;
4use std::collections::HashSet;
5
/// Upper bound on the number of contigs; [`check_contig_limit`] reports an
/// error once a count reaches this value.
pub const MAX_CONTIGS: usize = 100_000;

/// Maximum filename length accepted by [`validate_filename`], compared against
/// the filename's byte length.
pub const MAX_FILENAME_LENGTH: usize = 255;
/// Minimum number of bytes [`validate_file_content`] requires before it
/// accepts a payload.
pub const MIN_FILE_CONTENT_SIZE: usize = 1;
12
/// Returns `true` when `s` consists of exactly 32 ASCII hexadecimal digits.
///
/// Both upper- and lowercase digits are accepted; use [`normalize_md5`] to
/// obtain the canonical lowercase form.
#[must_use]
pub fn is_valid_md5(s: &str) -> bool {
    let raw = s.as_bytes();
    raw.len() == 32 && raw.iter().all(u8::is_ascii_hexdigit)
}
28
29#[must_use]
32pub fn normalize_md5(s: &str) -> Option<String> {
33 if is_valid_md5(s) {
34 Some(s.to_lowercase())
35 } else {
36 None
37 }
38}
39
/// Computes the sha512t24u digest of `sequence` by delegating to
/// `refget_digest::sha512t24u`.
///
/// The unit tests in this file expect a 32-character result that satisfies
/// [`is_valid_sha512t24u`].
#[must_use]
pub fn compute_sha512t24u(sequence: &[u8]) -> String {
    refget_digest::sha512t24u(sequence)
}
59
/// Returns `true` when `s` has the shape of a sha512t24u digest: exactly 32
/// bytes, each an ASCII letter, an ASCII digit, `-`, or `_`.
#[must_use]
pub fn is_valid_sha512t24u(s: &str) -> bool {
    if s.len() != 32 {
        return false;
    }
    s.bytes()
        .all(|byte| matches!(byte, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_'))
}
76
77#[must_use]
86#[allow(clippy::implicit_hasher)] pub fn compute_signature(md5s: &HashSet<String>) -> String {
88 if md5s.is_empty() {
89 return String::new();
90 }
91
92 let mut sorted: Vec<&str> = md5s.iter().map(std::string::String::as_str).collect();
93 sorted.sort_unstable();
94 let concatenated = sorted.join(",");
95 let digest = md5::compute(concatenated.as_bytes());
96 format!("{digest:x}")
97}
98
99#[must_use]
112pub fn check_contig_limit(count: usize) -> Option<String> {
113 if count >= MAX_CONTIGS {
114 Some(format!(
115 "Too many contigs: adding another would exceed maximum of {MAX_CONTIGS}"
116 ))
117 } else {
118 None
119 }
120}
121
/// Errors produced by the filename, content, and upload validators in this
/// module.
#[derive(Debug, thiserror::Error)]
pub enum ValidationError {
    /// The filename's byte length exceeds [`MAX_FILENAME_LENGTH`].
    #[error("Filename too long: exceeds {MAX_FILENAME_LENGTH} characters")]
    FilenameTooLong,
    /// The filename contains a path-traversal sequence, a path separator, or a
    /// control character, or nothing acceptable remains after sanitization.
    #[error("Invalid filename: contains path traversal or invalid characters")]
    InvalidFilename,
    /// The filename is empty or consists only of whitespace.
    #[error("Empty filename provided")]
    EmptyFilename,
    /// The content is too short, or was expected to be text but is not.
    #[error("File content appears malformed or invalid")]
    InvalidFileContent,
    /// The content does not match the expected file format.
    #[error("File format validation failed")]
    FormatValidationFailed,
}
136
137pub fn validate_filename(filename: &str) -> Result<String, ValidationError> {
151 if filename.trim().is_empty() {
153 return Err(ValidationError::EmptyFilename);
154 }
155
156 if filename.len() > MAX_FILENAME_LENGTH {
158 return Err(ValidationError::FilenameTooLong);
159 }
160
161 if filename.contains("..") || filename.contains('/') || filename.contains('\\') {
163 return Err(ValidationError::InvalidFilename);
164 }
165
166 if filename.contains('\0') || filename.chars().any(|c| ('\x01'..='\x1F').contains(&c)) {
168 return Err(ValidationError::InvalidFilename);
169 }
170
171 let sanitized = filename
173 .chars()
174 .filter(|c| c.is_ascii_alphanumeric() || *c == '.' || *c == '-' || *c == '_' || *c == ' ')
175 .collect::<String>();
176
177 if sanitized.trim().is_empty() {
179 return Err(ValidationError::InvalidFilename);
180 }
181
182 if sanitized.starts_with('.') && !has_known_extension(&sanitized) {
184 return Err(ValidationError::InvalidFilename);
185 }
186
187 Ok(sanitized)
188}
189
/// Returns `true` when `filename` ends, case-insensitively, with one of the
/// extensions this module treats as safe.
///
/// [`validate_filename`] uses this to allow dot-prefixed names such as `.sam`
/// while still rejecting other hidden files.
fn has_known_extension(filename: &str) -> bool {
    // `.assembly_report.txt` is already covered by `.txt`; it is kept so the
    // list still names the NCBI assembly report explicitly.
    const SAFE_EXTENSIONS: &[&str] = &[
        ".sam",
        ".bam",
        ".cram",
        ".dict",
        ".vcf",
        ".txt",
        ".tsv",
        ".csv",
        ".gz",
        ".assembly_report.txt",
    ];

    // Lowercase once up front rather than once per candidate extension.
    let lowered = filename.to_lowercase();
    SAFE_EXTENSIONS.iter().any(|ext| lowered.ends_with(ext))
}
209
210#[must_use]
215pub fn validate_file_format(content: &[u8], expected_format: FileFormat) -> bool {
216 if content.is_empty() {
217 return false;
218 }
219
220 match expected_format {
221 FileFormat::Bam => {
222 content.len() >= 4 && content.starts_with(b"BAM\x01")
224 }
225 FileFormat::Cram => {
226 content.len() >= 4 && content.starts_with(b"CRAM")
228 }
229 FileFormat::Vcf => {
230 let content_str = std::str::from_utf8(content).unwrap_or("");
232 content_str.starts_with("##fileformat=VCF")
233 }
234 FileFormat::Sam => {
235 let content_str = std::str::from_utf8(content).unwrap_or("");
237 content_str.contains("@SQ")
238 || content_str.contains("@HD")
239 || content_str.contains("SN:")
240 || content_str.contains("LN:")
241 }
242 FileFormat::Dict => {
243 let content_str = std::str::from_utf8(content).unwrap_or("");
245 content_str.contains("@HD") && content_str.contains("@SQ")
246 }
247 FileFormat::NcbiReport => {
248 let content_str = std::str::from_utf8(content).unwrap_or("");
250 content_str.contains("Sequence-Name") || content_str.contains("Sequence-Role")
251 }
252 FileFormat::Tsv => {
253 let content_str = std::str::from_utf8(content).unwrap_or("");
255 content_str.contains('\t')
256 && (content_str.to_lowercase().contains("length")
257 || content_str.to_lowercase().contains("sequence")
258 || content_str.to_lowercase().contains("size"))
259 }
260 FileFormat::Fai => {
261 let content_str = std::str::from_utf8(content).unwrap_or("");
263 content_str.lines().take(5).any(|line| {
264 let fields: Vec<&str> = line.split('\t').collect();
265 fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
266 })
267 }
268 FileFormat::Fasta => {
269 content.starts_with(b">")
271 || (content.len() >= 2 && content[0] == 0x1f && content[1] == 0x8b)
272 }
273 FileFormat::Auto => {
274 true
276 }
277 }
278}
279
280pub fn validate_file_content(content: &[u8], expected_text: bool) -> Result<(), ValidationError> {
292 if content.len() < MIN_FILE_CONTENT_SIZE {
294 return Err(ValidationError::InvalidFileContent);
295 }
296
297 if expected_text {
299 let non_printable_count = content
301 .iter()
302 .filter(|&&b| b < 9 || (b > 13 && b < 32) || b > 126)
303 .count();
304
305 if content.len() > 100 && non_printable_count > content.len() / 20 {
307 return Err(ValidationError::InvalidFileContent);
308 }
309
310 if std::str::from_utf8(content).is_err() {
312 return Err(ValidationError::InvalidFileContent);
313 }
314 }
315
316 Ok(())
317}
318
319pub fn validate_upload(
331 filename: Option<&str>,
332 content: &[u8],
333 expected_format: FileFormat,
334) -> Result<Option<String>, ValidationError> {
335 let validated_filename = if let Some(name) = filename {
337 Some(validate_filename(name)?)
338 } else {
339 None
340 };
341
342 let is_text_format = matches!(
344 expected_format,
345 FileFormat::Sam
346 | FileFormat::Dict
347 | FileFormat::Vcf
348 | FileFormat::NcbiReport
349 | FileFormat::Tsv
350 | FileFormat::Auto
351 );
352
353 validate_file_content(content, is_text_format)?;
354
355 if expected_format == FileFormat::Auto {
357 if content.len() >= 4 {
360 let starts_with_bam = content.starts_with(b"BAM");
361 let starts_with_cram = content.starts_with(b"CRAM");
362
363 if starts_with_bam && !validate_file_format(content, FileFormat::Bam) {
365 return Err(ValidationError::FormatValidationFailed);
366 }
367 if starts_with_cram && !validate_file_format(content, FileFormat::Cram) {
368 return Err(ValidationError::FormatValidationFailed);
369 }
370 }
371 } else if !validate_file_format(content, expected_format) {
372 return Err(ValidationError::FormatValidationFailed);
373 }
374
375 Ok(validated_filename)
376}
377
// Unit tests for the validation helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_valid_md5() {
        assert!(is_valid_md5("6aef897c3d6ff0c78aff06ac189178dd"));
        // Uppercase hex digits are accepted.
        assert!(is_valid_md5("AABBCCDD11223344556677889900AABB"));
        assert!(!is_valid_md5("not-an-md5"));
        // One character too short.
        assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178d"));
        // One character too long.
        assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178ddd"));
        // Empty input.
        assert!(!is_valid_md5(""));
        // Correct length, but 'g' is not a hex digit.
        assert!(!is_valid_md5("6aef897c3d6ff0c78aff06ac189178dg"));
    }

    #[test]
    fn test_compute_sha512t24u() {
        let digest = compute_sha512t24u(b"ACGT");
        // The digest has the expected length and alphabet.
        assert_eq!(digest.len(), 32);
        assert!(is_valid_sha512t24u(&digest));

        // The same input gives the same digest.
        assert_eq!(digest, compute_sha512t24u(b"ACGT"));

        // A different input gives a different digest.
        assert_ne!(digest, compute_sha512t24u(b"TGCA"));

        // Pinned value for "ACGT".
        assert_eq!(digest, "aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2");
    }

    #[test]
    fn test_is_valid_sha512t24u() {
        assert!(is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2"));
        assert!(!is_valid_sha512t24u("too-short"));
        assert!(!is_valid_sha512t24u(""));
        // 33 characters: one too many.
        assert!(!is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2X"));
        // Correct length, but a space is not in the allowed alphabet.
        assert!(!is_valid_sha512t24u("aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw "));
    }

    #[test]
    fn test_normalize_md5() {
        assert_eq!(
            normalize_md5("6AEF897C3D6FF0C78AFF06AC189178DD"),
            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
        );
        assert_eq!(normalize_md5("invalid"), None);
    }

    #[test]
    fn test_compute_signature() {
        let mut md5s = HashSet::new();
        // Two 32-character placeholder entries.
        md5s.insert("aaaa".repeat(8));
        md5s.insert("bbbb".repeat(8));

        let sig = compute_signature(&md5s);
        assert_eq!(sig.len(), 32);

        // Recomputing over the same set gives the same signature.
        let sig2 = compute_signature(&md5s);
        assert_eq!(sig, sig2);

        // An empty set produces an empty signature.
        let empty: HashSet<String> = HashSet::new();
        assert_eq!(compute_signature(&empty), "");
    }

    #[test]
    fn test_check_contig_limit() {
        assert!(check_contig_limit(100).is_none());
        assert!(check_contig_limit(MAX_CONTIGS - 1).is_none());
        // Reaching the limit is already an error.
        assert!(check_contig_limit(MAX_CONTIGS).is_some());
        assert!(check_contig_limit(MAX_CONTIGS + 1).is_some());
    }

    #[test]
    fn test_validate_filename_safe() {
        assert!(validate_filename("test.sam").is_ok());
        assert!(validate_filename("my-file.bam").is_ok());
        assert!(validate_filename("data_file.txt").is_ok());
        assert!(validate_filename("sample 123.vcf").is_ok());
    }

    #[test]
    fn test_validate_filename_dangerous() {
        // Path traversal and path separators.
        assert!(validate_filename("../etc/passwd").is_err());
        assert!(validate_filename("..\\windows\\system32").is_err());
        assert!(validate_filename("test/../../secret").is_err());

        // NUL and other control characters.
        assert!(validate_filename("test\0.txt").is_err());
        assert!(validate_filename("test\x01.txt").is_err());

        // Longer than MAX_FILENAME_LENGTH.
        let long_name = "a".repeat(300);
        assert!(validate_filename(&long_name).is_err());

        // Empty and whitespace-only names.
        assert!(validate_filename("").is_err());
        assert!(validate_filename("   ").is_err());

        // Hidden file without a known extension.
        assert!(validate_filename(".hidden").is_err());
    }

    #[test]
    fn test_validate_filename_sanitization() {
        // Disallowed characters are stripped, not rejected.
        let result = validate_filename("test@#$%file.txt").unwrap();
        assert_eq!(result, "testfile.txt");

        // An already-clean name is returned unchanged.
        let result = validate_filename("my-file_123.sam").unwrap();
        assert_eq!(result, "my-file_123.sam");
    }

    #[test]
    fn test_validate_file_format_bam() {
        let bam_content = b"BAM\x01test_content";
        assert!(validate_file_format(bam_content, FileFormat::Bam));

        let invalid_bam = b"NOTBAM\x01";
        assert!(!validate_file_format(invalid_bam, FileFormat::Bam));
    }

    #[test]
    fn test_validate_file_format_cram() {
        let cram_content = b"CRAMtest_content";
        assert!(validate_file_format(cram_content, FileFormat::Cram));

        let invalid_cram = b"NOTCRAM";
        assert!(!validate_file_format(invalid_cram, FileFormat::Cram));
    }

    #[test]
    fn test_validate_file_format_vcf() {
        let vcf_content = b"##fileformat=VCFv4.2\n##contig=<ID=chr1>";
        assert!(validate_file_format(vcf_content, FileFormat::Vcf));

        let invalid_vcf = b"@SQ\tSN:chr1\tLN:123";
        assert!(!validate_file_format(invalid_vcf, FileFormat::Vcf));
    }

    #[test]
    fn test_validate_file_format_sam() {
        let sam_content = b"@SQ\tSN:chr1\tLN:123456";
        assert!(validate_file_format(sam_content, FileFormat::Sam));

        let sam_content2 = b"@HD\tVN:1.0\tSO:coordinate";
        assert!(validate_file_format(sam_content2, FileFormat::Sam));
    }

    #[test]
    fn test_validate_file_content_text() {
        let valid_text = b"@SQ\tSN:chr1\tLN:123456\n@SQ\tSN:chr2\tLN:654321";
        assert!(validate_file_content(valid_text, true).is_ok());

        // 1000 NUL bytes must not pass as text.
        let binary_data = vec![0u8; 1000];
        assert!(validate_file_content(&binary_data, true).is_err());

        // Empty content is rejected.
        assert!(validate_file_content(b"", true).is_err());
    }

    #[test]
    fn test_validate_file_content_binary() {
        let binary_data = vec![0xABu8; 100];
        assert!(validate_file_content(&binary_data, false).is_ok());

        // Empty content is rejected for binary input too.
        assert!(validate_file_content(b"", false).is_err());
    }

    #[test]
    fn test_validate_upload_complete() {
        let sam_content = b"@SQ\tSN:chr1\tLN:123456";

        // Valid filename and matching content.
        let result = validate_upload(Some("test.sam"), sam_content, FileFormat::Sam);
        assert!(result.is_ok());
        assert_eq!(result.unwrap().unwrap(), "test.sam");

        // No filename: the content is still validated and `None` comes back.
        let result = validate_upload(None, sam_content, FileFormat::Sam);
        assert!(result.is_ok());
        assert!(result.unwrap().is_none());

        // A dangerous filename fails the whole upload.
        let result = validate_upload(Some("../etc/passwd"), sam_content, FileFormat::Sam);
        assert!(result.is_err());

        // BAM-looking content submitted as SAM is rejected.
        let bam_content = b"BAM\x01test";
        let result = validate_upload(Some("test.sam"), bam_content, FileFormat::Sam);
        assert!(result.is_err());
    }

    #[test]
    fn test_has_known_extension() {
        assert!(has_known_extension(".sam"));
        assert!(has_known_extension(".bam"));
        assert!(has_known_extension(".vcf.gz"));
        assert!(has_known_extension("test.assembly_report.txt"));

        assert!(!has_known_extension(".exe"));
        assert!(!has_known_extension(".hidden"));
        assert!(!has_known_extension(".config"));
    }
}