ref_solver/utils/
validation.rs1use crate::web::format_detection::FileFormat;
4use std::collections::HashSet;
5
/// Contig-count ceiling enforced by `check_contig_limit`: a count at or above
/// this value is reported as too many.
pub const MAX_CONTIGS: usize = 100_000;

/// Longest filename accepted by `validate_filename`.
///
/// The limit is compared against `str::len`, so it is measured in bytes.
pub const MAX_FILENAME_LENGTH: usize = 255;

/// Smallest content length, in bytes, accepted by `validate_file_content`.
pub const MIN_FILE_CONTENT_SIZE: usize = 1;
12
/// Reports whether `s` has the shape of an MD5 hex digest.
///
/// A string qualifies when it is exactly 32 bytes long and every byte is an
/// ASCII hexadecimal digit; upper- and lower-case digits are both accepted.
#[must_use]
pub fn is_valid_md5(s: &str) -> bool {
    // A non-ASCII character always contributes bytes >= 0x80, none of which
    // are hex digits, so checking bytes is equivalent to checking chars.
    let digest = s.as_bytes();
    digest.len() == 32 && digest.iter().all(u8::is_ascii_hexdigit)
}
28
29#[must_use]
32pub fn normalize_md5(s: &str) -> Option<String> {
33 if is_valid_md5(s) {
34 Some(s.to_lowercase())
35 } else {
36 None
37 }
38}
39
40#[must_use]
49#[allow(clippy::implicit_hasher)] pub fn compute_signature(md5s: &HashSet<String>) -> String {
51 if md5s.is_empty() {
52 return String::new();
53 }
54
55 let mut sorted: Vec<&str> = md5s.iter().map(std::string::String::as_str).collect();
56 sorted.sort_unstable();
57 let concatenated = sorted.join(",");
58 let digest = md5::compute(concatenated.as_bytes());
59 format!("{digest:x}")
60}
61
62#[must_use]
75pub fn check_contig_limit(count: usize) -> Option<String> {
76 if count >= MAX_CONTIGS {
77 Some(format!(
78 "Too many contigs: adding another would exceed maximum of {MAX_CONTIGS}"
79 ))
80 } else {
81 None
82 }
83}
84
85#[derive(Debug, thiserror::Error)]
87pub enum ValidationError {
88 #[error("Filename too long: exceeds {MAX_FILENAME_LENGTH} characters")]
89 FilenameTooLong,
90 #[error("Invalid filename: contains path traversal or invalid characters")]
91 InvalidFilename,
92 #[error("Empty filename provided")]
93 EmptyFilename,
94 #[error("File content appears malformed or invalid")]
95 InvalidFileContent,
96 #[error("File format validation failed")]
97 FormatValidationFailed,
98}
99
100pub fn validate_filename(filename: &str) -> Result<String, ValidationError> {
114 if filename.trim().is_empty() {
116 return Err(ValidationError::EmptyFilename);
117 }
118
119 if filename.len() > MAX_FILENAME_LENGTH {
121 return Err(ValidationError::FilenameTooLong);
122 }
123
124 if filename.contains("..") || filename.contains('/') || filename.contains('\\') {
126 return Err(ValidationError::InvalidFilename);
127 }
128
129 if filename.contains('\0') || filename.chars().any(|c| ('\x01'..='\x1F').contains(&c)) {
131 return Err(ValidationError::InvalidFilename);
132 }
133
134 let sanitized = filename
136 .chars()
137 .filter(|c| c.is_ascii_alphanumeric() || *c == '.' || *c == '-' || *c == '_' || *c == ' ')
138 .collect::<String>();
139
140 if sanitized.trim().is_empty() {
142 return Err(ValidationError::InvalidFilename);
143 }
144
145 if sanitized.starts_with('.') && !has_known_extension(&sanitized) {
147 return Err(ValidationError::InvalidFilename);
148 }
149
150 Ok(sanitized)
151}
152
/// Reports whether `filename` ends, case-insensitively, in one of the
/// extensions this module recognises.
fn has_known_extension(filename: &str) -> bool {
    const KNOWN_EXTENSIONS: &[&str] = &[
        ".sam",
        ".bam",
        ".cram",
        ".dict",
        ".vcf",
        ".txt",
        ".tsv",
        ".csv",
        ".gz",
        ".assembly_report.txt",
    ];

    // Lower-case once up front instead of allocating a fresh copy for every
    // extension that is tried.
    let lowered = filename.to_lowercase();
    KNOWN_EXTENSIONS.iter().any(|ext| lowered.ends_with(*ext))
}
172
173#[must_use]
178pub fn validate_file_format(content: &[u8], expected_format: FileFormat) -> bool {
179 if content.is_empty() {
180 return false;
181 }
182
183 match expected_format {
184 FileFormat::Bam => {
185 content.len() >= 4 && content.starts_with(b"BAM\x01")
187 }
188 FileFormat::Cram => {
189 content.len() >= 4 && content.starts_with(b"CRAM")
191 }
192 FileFormat::Vcf => {
193 let content_str = std::str::from_utf8(content).unwrap_or("");
195 content_str.starts_with("##fileformat=VCF")
196 }
197 FileFormat::Sam => {
198 let content_str = std::str::from_utf8(content).unwrap_or("");
200 content_str.contains("@SQ")
201 || content_str.contains("@HD")
202 || content_str.contains("SN:")
203 || content_str.contains("LN:")
204 }
205 FileFormat::Dict => {
206 let content_str = std::str::from_utf8(content).unwrap_or("");
208 content_str.contains("@HD") && content_str.contains("@SQ")
209 }
210 FileFormat::NcbiReport => {
211 let content_str = std::str::from_utf8(content).unwrap_or("");
213 content_str.contains("Sequence-Name") || content_str.contains("Sequence-Role")
214 }
215 FileFormat::Tsv => {
216 let content_str = std::str::from_utf8(content).unwrap_or("");
218 content_str.contains('\t')
219 && (content_str.to_lowercase().contains("length")
220 || content_str.to_lowercase().contains("sequence")
221 || content_str.to_lowercase().contains("size"))
222 }
223 FileFormat::Fai => {
224 let content_str = std::str::from_utf8(content).unwrap_or("");
226 content_str.lines().take(5).any(|line| {
227 let fields: Vec<&str> = line.split('\t').collect();
228 fields.len() == 5 && fields[1..].iter().all(|f| f.parse::<u64>().is_ok())
229 })
230 }
231 FileFormat::Fasta => {
232 content.starts_with(b">")
234 || (content.len() >= 2 && content[0] == 0x1f && content[1] == 0x8b)
235 }
236 FileFormat::Auto => {
237 true
239 }
240 }
241}
242
243pub fn validate_file_content(content: &[u8], expected_text: bool) -> Result<(), ValidationError> {
255 if content.len() < MIN_FILE_CONTENT_SIZE {
257 return Err(ValidationError::InvalidFileContent);
258 }
259
260 if expected_text {
262 let non_printable_count = content
264 .iter()
265 .filter(|&&b| b < 9 || (b > 13 && b < 32) || b > 126)
266 .count();
267
268 if content.len() > 100 && non_printable_count > content.len() / 20 {
270 return Err(ValidationError::InvalidFileContent);
271 }
272
273 if std::str::from_utf8(content).is_err() {
275 return Err(ValidationError::InvalidFileContent);
276 }
277 }
278
279 Ok(())
280}
281
282pub fn validate_upload(
294 filename: Option<&str>,
295 content: &[u8],
296 expected_format: FileFormat,
297) -> Result<Option<String>, ValidationError> {
298 let validated_filename = if let Some(name) = filename {
300 Some(validate_filename(name)?)
301 } else {
302 None
303 };
304
305 let is_text_format = matches!(
307 expected_format,
308 FileFormat::Sam
309 | FileFormat::Dict
310 | FileFormat::Vcf
311 | FileFormat::NcbiReport
312 | FileFormat::Tsv
313 | FileFormat::Auto
314 );
315
316 validate_file_content(content, is_text_format)?;
317
318 if expected_format == FileFormat::Auto {
320 if content.len() >= 4 {
323 let starts_with_bam = content.starts_with(b"BAM");
324 let starts_with_cram = content.starts_with(b"CRAM");
325
326 if starts_with_bam && !validate_file_format(content, FileFormat::Bam) {
328 return Err(ValidationError::FormatValidationFailed);
329 }
330 if starts_with_cram && !validate_file_format(content, FileFormat::Cram) {
331 return Err(ValidationError::FormatValidationFailed);
332 }
333 }
334 } else if !validate_file_format(content, expected_format) {
335 return Err(ValidationError::FormatValidationFailed);
336 }
337
338 Ok(validated_filename)
339}
340
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_valid_md5() {
        for good in [
            "6aef897c3d6ff0c78aff06ac189178dd",
            "AABBCCDD11223344556677889900AABB",
        ] {
            assert!(is_valid_md5(good), "should accept {good:?}");
        }

        // Wrong alphabet, one short, one long, empty, non-hex final digit.
        for bad in [
            "not-an-md5",
            "6aef897c3d6ff0c78aff06ac189178d",
            "6aef897c3d6ff0c78aff06ac189178ddd",
            "",
            "6aef897c3d6ff0c78aff06ac189178dg",
        ] {
            assert!(!is_valid_md5(bad), "should reject {bad:?}");
        }
    }

    #[test]
    fn test_normalize_md5() {
        let normalized = normalize_md5("6AEF897C3D6FF0C78AFF06AC189178DD");
        assert_eq!(
            normalized,
            Some("6aef897c3d6ff0c78aff06ac189178dd".to_string())
        );

        assert_eq!(normalize_md5("invalid"), None);
    }

    #[test]
    fn test_compute_signature() {
        let md5s: HashSet<String> = ["aaaa", "bbbb"]
            .iter()
            .map(|unit| unit.repeat(8))
            .collect();

        let first = compute_signature(&md5s);
        assert_eq!(first.len(), 32);

        // Hashing the same set twice must give the same signature.
        let second = compute_signature(&md5s);
        assert_eq!(first, second);

        let nothing: HashSet<String> = HashSet::new();
        assert_eq!(compute_signature(&nothing), "");
    }

    #[test]
    fn test_check_contig_limit() {
        for below in [100, MAX_CONTIGS - 1] {
            assert!(check_contig_limit(below).is_none(), "{below} is under the limit");
        }
        for at_or_above in [MAX_CONTIGS, MAX_CONTIGS + 1] {
            assert!(
                check_contig_limit(at_or_above).is_some(),
                "{at_or_above} is at or over the limit"
            );
        }
    }

    #[test]
    fn test_validate_filename_safe() {
        for name in ["test.sam", "my-file.bam", "data_file.txt", "sample 123.vcf"] {
            assert!(validate_filename(name).is_ok(), "should accept {name:?}");
        }
    }

    #[test]
    fn test_validate_filename_dangerous() {
        let long_name = "a".repeat(300);

        let rejected = [
            // Path traversal.
            "../etc/passwd",
            "..\\windows\\system32",
            "test/../../secret",
            // Control characters.
            "test\0.txt",
            "test\x01.txt",
            // Over the length limit.
            long_name.as_str(),
            // Blank names.
            "",
            " ",
            // Dot-file without a known extension.
            ".hidden",
        ];

        for name in rejected {
            assert!(validate_filename(name).is_err(), "should reject {name:?}");
        }
    }

    #[test]
    fn test_validate_filename_sanitization() {
        let cases = [
            ("test@#$%file.txt", "testfile.txt"),
            ("my-file_123.sam", "my-file_123.sam"),
        ];

        for (raw, expected) in cases {
            let cleaned = validate_filename(raw).unwrap();
            assert_eq!(cleaned, expected);
        }
    }

    #[test]
    fn test_validate_file_format_bam() {
        assert!(validate_file_format(
            b"BAM\x01test_content",
            FileFormat::Bam
        ));
        assert!(!validate_file_format(b"NOTBAM\x01", FileFormat::Bam));
    }

    #[test]
    fn test_validate_file_format_cram() {
        assert!(validate_file_format(b"CRAMtest_content", FileFormat::Cram));
        assert!(!validate_file_format(b"NOTCRAM", FileFormat::Cram));
    }

    #[test]
    fn test_validate_file_format_vcf() {
        assert!(validate_file_format(
            b"##fileformat=VCFv4.2\n##contig=<ID=chr1>",
            FileFormat::Vcf
        ));
        assert!(!validate_file_format(
            b"@SQ\tSN:chr1\tLN:123",
            FileFormat::Vcf
        ));
    }

    #[test]
    fn test_validate_file_format_sam() {
        let headers: [&[u8]; 2] = [b"@SQ\tSN:chr1\tLN:123456", b"@HD\tVN:1.0\tSO:coordinate"];

        for header in headers {
            assert!(validate_file_format(header, FileFormat::Sam));
        }
    }

    #[test]
    fn test_validate_file_content_text() {
        let header_text = b"@SQ\tSN:chr1\tLN:123456\n@SQ\tSN:chr2\tLN:654321";
        assert!(validate_file_content(header_text, true).is_ok());

        let zeroes = vec![0u8; 1000];
        assert!(validate_file_content(&zeroes, true).is_err());

        assert!(validate_file_content(b"", true).is_err());
    }

    #[test]
    fn test_validate_file_content_binary() {
        let payload = vec![0xABu8; 100];
        assert!(validate_file_content(&payload, false).is_ok());

        assert!(validate_file_content(b"", false).is_err());
    }

    #[test]
    fn test_validate_upload_complete() {
        let sam_content = b"@SQ\tSN:chr1\tLN:123456";

        // Valid name and content: the sanitized name comes back.
        let named = validate_upload(Some("test.sam"), sam_content, FileFormat::Sam);
        assert!(named.is_ok());
        assert_eq!(named.unwrap().unwrap(), "test.sam");

        // No name supplied: still valid, but nothing to return.
        let unnamed = validate_upload(None, sam_content, FileFormat::Sam);
        assert!(unnamed.is_ok());
        assert!(unnamed.unwrap().is_none());

        // A bad filename fails the whole upload.
        let traversal = validate_upload(Some("../etc/passwd"), sam_content, FileFormat::Sam);
        assert!(traversal.is_err());

        // Content that does not match the declared format is rejected.
        let bam_content = b"BAM\x01test";
        let mismatched = validate_upload(Some("test.sam"), bam_content, FileFormat::Sam);
        assert!(mismatched.is_err());
    }

    #[test]
    fn test_has_known_extension() {
        for known in [".sam", ".bam", ".vcf.gz", "test.assembly_report.txt"] {
            assert!(has_known_extension(known), "should recognise {known:?}");
        }
        for unknown in [".exe", ".hidden", ".config"] {
            assert!(!has_known_extension(unknown), "should not recognise {unknown:?}");
        }
    }
}